From 675aa84403f98776a7d463e1cc5f9bd41cdbab92 Mon Sep 17 00:00:00 2001 From: rodri Date: Sat, 25 Nov 2023 10:34:41 +0000 Subject: cleaned things up and improved the organization a bit. --- avx.h | 20 +++++++ bench/main.c | 179 ++++++++++++++++++++++++++++++++++++++++++++++++++++++----- bench/mkfile | 23 ++++++++ dppd.s | 30 +++++----- main.c | 94 ++++++++++++++----------------- mkfile | 5 +- nanosec.c | 109 ------------------------------------ sse.h | 11 ++++ 8 files changed, 283 insertions(+), 188 deletions(-) create mode 100644 bench/mkfile delete mode 100644 nanosec.c diff --git a/avx.h b/avx.h index d2c6d08..3c7129c 100644 --- a/avx.h +++ b/avx.h @@ -45,6 +45,26 @@ #define VMOVAPD_256rr(s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_256,VEX_p_66); \ VOP(0x28, 0x3, (d), (s)) +/* VMOVDQA */ +#define VMOVDQA_128mr(off, s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_66); \ + VOPi(0x6F, 0x1, (d), (s), (off)) +#define VMOVDQA_128rm(s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_66); \ + VOP(0x7F, 0x3, (s), (d)) +#define VMOVDQA_256mr(off, s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_256,VEX_p_66); \ + VOPi(0x6F, 0x1, (d), (s), (off)) +#define VMOVDQA_256rm(s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_256,VEX_p_66); \ + VOP(0x7F, 0x3, (s), (d)) + +/* VMODQU */ +#define VMOVDQU_128mr(off, s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_F3); \ + VOPi(0x6F, 0x1, (d), (s), (off)) +#define VMOVDQU_128rm(s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_F3); \ + VOP(0x7F, 0x3, (s), (d)) +#define VMOVDQU_256mr(off, s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_256,VEX_p_F3); \ + VOPi(0x6F, 0x1, (d), (s), (off)) +#define VMOVDQU_256rm(s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_256,VEX_p_F3); \ + VOP(0x7F, 0x3, (s), (d)) + /* VADDPD */ #define VADDPD_128mr(off, s0, s1, d) VEX3(0,0,0,VEX_m_0F,0,(s0),VEX_L_128,VEX_p_66); \ VOPi(0x58, 0x1, (d), (s1), (off)) diff --git a/bench/main.c b/bench/main.c index 9d00719..811c471 100644 --- a/bench/main.c +++ b/bench/main.c @@ -2,13 +2,61 @@ #include #include #include -#include "b.h" +#include "../bench9/b.h" -double dppd(Point2, Point2); -double dppda(Point2, Point2); -double dppd3(Point3, Point3); -double dppd3a(Point3, Point3); -Point3 xvec3(Point3, Point3); +double min(double, double); +double dotvec2_sse4(Point2, Point2); +double dotvec2_avx(Point2, Point2); +double dotvec3_sse4(Point3, Point3); +double dotvec3_avx(Point3, Point3); +Point2 Pt2b(double, double, double); +Point3 crossvec3_sse(Point3, Point3); +double hsubpd(double, double); +double fma(double, double, double); +Point2 addpt2_avx(Point2, Point2); + +double +fmin(double a, double b) +{ + return an > 0 || b1->n > 0){ + a = truerand()*frand(); + b = truerand()*frand(); + + benchin(b0); + for(i = 0; i < 1e6; i++) + fmin(a, b); + benchout(b0); + + benchin(b1); + for(i = 0; i < 1e6; i++) + min(a, b); + benchout(b1); + } + + benchprintgr(&g, fd); + benchfreegr(&g); +} static void bdotvec2(int fd) @@ -20,7 +68,7 @@ bdotvec2(int fd) benchinitgr(&g, "2d dot product"); b0 = benchadd(&g, "dotvec2"); - b1 = benchadd(&g, "dotvec2_simd"); + b1 = benchadd(&g, "dotvec2_sse4"); b2 = benchadd(&g, "dotvec2_avx"); while(b0->n > 0 || b1->n > 0){ @@ -34,12 +82,12 @@ bdotvec2(int fd) benchin(b1); for(i = 0; i < 1e6; i++) - dppd(a, b); + dotvec2_sse4(a, b); benchout(b1); benchin(b2); for(i = 0; i < 1e6; i++) - dppda(a, b); + dotvec2_avx(a, b); benchout(b2); } @@ -57,7 +105,7 @@ bdotvec3(int fd) benchinitgr(&g, "3d dot product"); b0 = benchadd(&g, "dotvec3"); - b1 = benchadd(&g, "dotvec3_simd"); + b1 = benchadd(&g, "dotvec3_sse4"); b2 = benchadd(&g, "dotvec3_avx"); while(b0->n > 0 || b1->n > 0){ @@ -71,12 +119,12 @@ bdotvec3(int fd) benchin(b1); for(i = 0; i < 1e6; i++) - dppd3(a, b); + dotvec3_sse4(a, b); benchout(b1); benchin(b2); for(i = 0; i < 1e6; i++) - dppd3a(a, b); + dotvec3_avx(a, b); benchout(b2); } @@ -94,7 +142,7 @@ bcrossvec3(int fd) benchinitgr(&g, "3d cross product"); b0 = benchadd(&g, "crossvec3"); - b1 = benchadd(&g, "crossvec3_simd"); + b1 = benchadd(&g, "crossvec3_sse"); while(b0->n > 0 || b1->n > 0){ a = Vec3(truerand()*frand(), truerand()*frand(), truerand()*frand()); @@ -107,7 +155,102 @@ bcrossvec3(int fd) benchin(b1); for(i = 0; i < 1e6; i++) - xvec3(a, b); + crossvec3_sse(a, b); + benchout(b1); + } + + benchprintgr(&g, fd); + benchfreegr(&g); +} + +static void +bPt2(int fd) +{ + Bgr g; + B *b0, *b1; + double x, y, w; + int i; + + benchinitgr(&g, "Pt2"); + b0 = benchadd(&g, "Pt2"); + b1 = benchadd(&g, "Pt2b"); + + while(b0->n > 0 || b1->n > 0){ + x = truerand()*frand(); + y = truerand()*frand(); + w = truerand()*frand(); + + benchin(b0); + for(i = 0; i < 1e6; i++) + Pt2(x, y, w); + benchout(b0); + + benchin(b1); + for(i = 0; i < 1e6; i++) + Pt2b(x, y, w); + benchout(b1); + } + + benchprintgr(&g, fd); + benchfreegr(&g); +} + +static void +bfma(int fd) +{ + Bgr g; + B *b0, *b1; + double a, b, c; + int i; + + benchinitgr(&g, "multiply + add"); + b0 = benchadd(&g, "madd"); + b1 = benchadd(&g, "fma_avx"); + + while(b0->n > 0 || b1->n > 0){ + a = truerand()*frand(); + b = truerand()*frand(); + c = truerand()*frand(); + + benchin(b0); + for(i = 0; i < 1e6; i++) + madd(a, b, c); + benchout(b0); + + benchin(b1); + for(i = 0; i < 1e6; i++) + fma(a, b, c); + benchout(b1); + } + + benchprintgr(&g, fd); + benchfreegr(&g); +} + +static void +baddpt2(int fd) +{ + Bgr g; + B *b0, *b1; + Point2 a, b; + int i; + + benchinitgr(&g, "2d point sum"); + b0 = benchadd(&g, "addpt2"); + b1 = benchadd(&g, "addpt2_avx"); + + while(b0->n > 0 || b1->n > 0){ + a = Pt2(truerand()*frand(), truerand()*frand(), truerand()*frand()); + b = Pt2(truerand()*frand(), truerand()*frand(), truerand()*frand()); + + benchin(b0); + for(i = 0; i < 1e6; i++) + addpt2(a, b); + benchout(b0); + + benchin(b1); + for(i = 0; i < 1e6; i++) + addpt2_avx(a, b); benchout(b1); } @@ -124,11 +267,19 @@ threadmain(int argc, char **argv) if(benchwire(0) != 0) fprint(2, "failed to wire: %r\n"); + bmin(1); + bseparator(1); bdotvec2(1); bseparator(1); bdotvec3(1); bseparator(1); bcrossvec3(1); + bseparator(1); + bPt2(1); + bseparator(1); + bfma(1); + bseparator(1); + baddpt2(1); threadexitsall(nil); } diff --git a/bench/mkfile b/bench/mkfile new file mode 100644 index 0000000..e649008 --- /dev/null +++ b/bench/mkfile @@ -0,0 +1,23 @@ + #include -uvlong nanosec(void); double min(double, double); -double dppd(Point2, Point2); -double dppda(Point2, Point2); -double dppd3(Point3, Point3); -double dppd3a(Point3, Point3); +double dotvec2_sse4(Point2, Point2); +double dotvec2_avx(Point2, Point2); +double dotvec3_sse4(Point3, Point3); +double dotvec3_avx(Point3, Point3); Point2 Pt2b(double, double, double); -Point3 xvec3(Point3, Point3); +Point3 crossvec3_sse(Point3, Point3); double hsubpd(double, double); double fma(double, double, double); +Point2 addpt2_avx(Point2, Point2); double fmin(double a, double b) @@ -19,13 +19,18 @@ fmin(double a, double b) return a -#include -#include - -/* - * This code is a mixture of cpuid(1) and the nanosec() found in vmx, - * in order to force the use of nsec(2) in case we are running in a - * virtualized environment where the clock is mis-bhyve-ing. - */ - -typedef struct Res { - ulong ax, bx, cx, dx; -} Res; - -static uchar _cpuid[] = { - 0x5E, /* POP SI (PC) */ - 0x5D, /* POP BP (Res&) */ - 0x58, /* POP AX */ - 0x59, /* POP CX */ - - 0x51, /* PUSH CX */ - 0x50, /* PUSH AX */ - 0x55, /* PUSH BP */ - 0x56, /* PUSH SI */ - - 0x31, 0xDB, /* XOR BX, BX */ - 0x31, 0xD2, /* XOR DX, DX */ - - 0x0F, 0xA2, /* CPUID */ - - 0x89, 0x45, 0x00, /* MOV AX, 0(BP) */ - 0x89, 0x5d, 0x04, /* MOV BX, 4(BP) */ - 0x89, 0x4d, 0x08, /* MOV CX, 8(BP) */ - 0x89, 0x55, 0x0C, /* MOV DX, 12(BP) */ - 0xC3, /* RET */ -}; - -static Res (*cpuid)(ulong ax, ulong cx) = (Res(*)(ulong, ulong)) _cpuid; - -/* - * nsec() is wallclock and can be adjusted by timesync - * so need to use cycles() instead, but fall back to - * nsec() in case we can't - */ -uvlong -nanosec(void) -{ - static uvlong fasthz, xstart; - char buf[13], path[128]; - ulong w; - uvlong x, div; - int fd; - Res r; - - if(fasthz == ~0ULL) - return nsec() - xstart; - - if(fasthz == 0){ - /* first long in a.out header */ - snprint(path, sizeof path, "/proc/%d/text", getpid()); - fd = open(path, OREAD); - if(fd < 0) - goto Wallclock; - if(read(fd, buf, 4) != 4){ - close(fd); - goto Wallclock; - } - close(fd); - - w = ((ulong *) buf)[0]; - - switch(w){ - default: - goto Wallclock; - case 0x978a0000: /* amd64 */ - /* patch out POP BP -> POP AX */ - _cpuid[1] = 0x58; - case 0xeb010000: /* 386 */ - break; - } - segflush(_cpuid, sizeof(_cpuid)); - - r = cpuid(0x40000000, 0); - ((ulong *) buf)[0] = r.bx; - ((ulong *) buf)[1] = r.cx; - ((ulong *) buf)[2] = r.dx; - buf[12] = 0; - - if(strstr(buf, "bhyve") != nil) - goto Wallclock; - - if(_tos->cyclefreq){ - fasthz = _tos->cyclefreq; - cycles(&xstart); - } else { -Wallclock: - fasthz = ~0ULL; - xstart = nsec(); - } - return 0; - } - cycles(&x); - x -= xstart; - - /* this is ugly */ - for(div = 1000000000ULL; x < 0x1999999999999999ULL && div > 1 ; div /= 10ULL, x *= 10ULL); - - return x / (fasthz / div); -} diff --git a/sse.h b/sse.h index 968ecd7..cd98f86 100644 --- a/sse.h +++ b/sse.h @@ -6,7 +6,18 @@ BYTE $(((m)<<6)|((ro)<<3)|(rm)) #define OP4i(o, m, ro, rm, i) OP4((o), (m), (ro), (rm)); \ BYTE $(i) +#define F3OP(o, m, ro, rm) WORD $0x0FF3; BYTE $(o); \ + BYTE $(((m)<<6)|((ro)<<3)|(rm)) +#define F3OPi(o, m, ro, rm, i) F3OP((o), (m), (ro), (rm)); \ + BYTE $(i) + +/* MOVDQA */ +#define MOVDQA_mr(off, s, d) OPi(0x6F, 0x1, (d), (s), (off)) +#define MOVDQA_rm(off, s, d) OPi(0x7F, 0x1, (s), (d), (off)) +/* MODQU */ +#define MOVDQU_mr(off, s, d) F3OPi(0x6F, 0x1, (d), (s), (off)) +#define MOVDQU_rm(off, s, d) F3OPi(0x7F, 0x1, (s), (d), (off)) /* MOVLPD */ //opcode = 660F12 -- cgit v1.2.3