diff options
-rw-r--r-- | avx.h | 20 | ||||
-rw-r--r-- | bench/main.c | 179 | ||||
-rw-r--r-- | bench/mkfile | 23 | ||||
-rw-r--r-- | dppd.s | 30 | ||||
-rw-r--r-- | main.c | 94 | ||||
-rw-r--r-- | mkfile | 5 | ||||
-rw-r--r-- | nanosec.c | 109 | ||||
-rw-r--r-- | sse.h | 11 |
8 files changed, 283 insertions, 188 deletions
@@ -45,6 +45,26 @@ #define VMOVAPD_256rr(s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_256,VEX_p_66); \ VOP(0x28, 0x3, (d), (s)) +/* VMOVDQA */ +#define VMOVDQA_128mr(off, s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_66); \ + VOPi(0x6F, 0x1, (d), (s), (off)) +#define VMOVDQA_128rm(s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_66); \ + VOP(0x7F, 0x3, (s), (d)) +#define VMOVDQA_256mr(off, s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_256,VEX_p_66); \ + VOPi(0x6F, 0x1, (d), (s), (off)) +#define VMOVDQA_256rm(s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_256,VEX_p_66); \ + VOP(0x7F, 0x3, (s), (d)) + +/* VMOVDQU */ +#define VMOVDQU_128mr(off, s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_F3); \ + VOPi(0x6F, 0x1, (d), (s), (off)) +#define VMOVDQU_128rm(s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_F3); \ + VOP(0x7F, 0x3, (s), (d)) +#define VMOVDQU_256mr(off, s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_256,VEX_p_F3); \ + VOPi(0x6F, 0x1, (d), (s), (off)) +#define VMOVDQU_256rm(s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_256,VEX_p_F3); \ + VOP(0x7F, 0x3, (s), (d)) + /* VADDPD */ #define VADDPD_128mr(off, s0, s1, d) VEX3(0,0,0,VEX_m_0F,0,(s0),VEX_L_128,VEX_p_66); \ VOPi(0x58, 0x1, (d), (s1), (off)) diff --git a/bench/main.c b/bench/main.c index 9d00719..811c471 100644 --- a/bench/main.c +++ b/bench/main.c @@ -2,13 +2,61 @@ #include <libc.h> #include <thread.h> #include <geometry.h> -#include "b.h" +#include "../bench9/b.h" -double dppd(Point2, Point2); -double dppda(Point2, Point2); -double dppd3(Point3, Point3); -double dppd3a(Point3, Point3); -Point3 xvec3(Point3, Point3); +double min(double, double); +double dotvec2_sse4(Point2, Point2); +double dotvec2_avx(Point2, Point2); +double dotvec3_sse4(Point3, Point3); +double dotvec3_avx(Point3, Point3); +Point2 Pt2b(double, double, double); +Point3 crossvec3_sse(Point3, Point3); +double hsubpd(double, double); +double fma(double, double, double); +Point2 addpt2_avx(Point2, Point2); + +double +fmin(double a, double b) +{ + return a<b? 
a: b; +} + +double +madd(double a, double b, double c) +{ + return a + b*c; +} + +static void +bmin(int fd) +{ + Bgr g; + B *b0, *b1; + double a, b; + int i; + + benchinitgr(&g, "min"); + b0 = benchadd(&g, "fmin"); + b1 = benchadd(&g, "fmin_sse"); + + while(b0->n > 0 || b1->n > 0){ + a = truerand()*frand(); + b = truerand()*frand(); + + benchin(b0); + for(i = 0; i < 1e6; i++) + fmin(a, b); + benchout(b0); + + benchin(b1); + for(i = 0; i < 1e6; i++) + min(a, b); + benchout(b1); + } + + benchprintgr(&g, fd); + benchfreegr(&g); +} static void bdotvec2(int fd) @@ -20,7 +68,7 @@ bdotvec2(int fd) benchinitgr(&g, "2d dot product"); b0 = benchadd(&g, "dotvec2"); - b1 = benchadd(&g, "dotvec2_simd"); + b1 = benchadd(&g, "dotvec2_sse4"); b2 = benchadd(&g, "dotvec2_avx"); while(b0->n > 0 || b1->n > 0){ @@ -34,12 +82,12 @@ bdotvec2(int fd) benchin(b1); for(i = 0; i < 1e6; i++) - dppd(a, b); + dotvec2_sse4(a, b); benchout(b1); benchin(b2); for(i = 0; i < 1e6; i++) - dppda(a, b); + dotvec2_avx(a, b); benchout(b2); } @@ -57,7 +105,7 @@ bdotvec3(int fd) benchinitgr(&g, "3d dot product"); b0 = benchadd(&g, "dotvec3"); - b1 = benchadd(&g, "dotvec3_simd"); + b1 = benchadd(&g, "dotvec3_sse4"); b2 = benchadd(&g, "dotvec3_avx"); while(b0->n > 0 || b1->n > 0){ @@ -71,12 +119,12 @@ bdotvec3(int fd) benchin(b1); for(i = 0; i < 1e6; i++) - dppd3(a, b); + dotvec3_sse4(a, b); benchout(b1); benchin(b2); for(i = 0; i < 1e6; i++) - dppd3a(a, b); + dotvec3_avx(a, b); benchout(b2); } @@ -94,7 +142,7 @@ bcrossvec3(int fd) benchinitgr(&g, "3d cross product"); b0 = benchadd(&g, "crossvec3"); - b1 = benchadd(&g, "crossvec3_simd"); + b1 = benchadd(&g, "crossvec3_sse"); while(b0->n > 0 || b1->n > 0){ a = Vec3(truerand()*frand(), truerand()*frand(), truerand()*frand()); @@ -107,7 +155,102 @@ bcrossvec3(int fd) benchin(b1); for(i = 0; i < 1e6; i++) - xvec3(a, b); + crossvec3_sse(a, b); + benchout(b1); + } + + benchprintgr(&g, fd); + benchfreegr(&g); +} + +static void +bPt2(int fd) +{ + Bgr g; + B *b0, *b1; 
+ double x, y, w; + int i; + + benchinitgr(&g, "Pt2"); + b0 = benchadd(&g, "Pt2"); + b1 = benchadd(&g, "Pt2b"); + + while(b0->n > 0 || b1->n > 0){ + x = truerand()*frand(); + y = truerand()*frand(); + w = truerand()*frand(); + + benchin(b0); + for(i = 0; i < 1e6; i++) + Pt2(x, y, w); + benchout(b0); + + benchin(b1); + for(i = 0; i < 1e6; i++) + Pt2b(x, y, w); + benchout(b1); + } + + benchprintgr(&g, fd); + benchfreegr(&g); +} + +static void +bfma(int fd) +{ + Bgr g; + B *b0, *b1; + double a, b, c; + int i; + + benchinitgr(&g, "multiply + add"); + b0 = benchadd(&g, "madd"); + b1 = benchadd(&g, "fma_avx"); + + while(b0->n > 0 || b1->n > 0){ + a = truerand()*frand(); + b = truerand()*frand(); + c = truerand()*frand(); + + benchin(b0); + for(i = 0; i < 1e6; i++) + madd(a, b, c); + benchout(b0); + + benchin(b1); + for(i = 0; i < 1e6; i++) + fma(a, b, c); + benchout(b1); + } + + benchprintgr(&g, fd); + benchfreegr(&g); +} + +static void +baddpt2(int fd) +{ + Bgr g; + B *b0, *b1; + Point2 a, b; + int i; + + benchinitgr(&g, "2d point sum"); + b0 = benchadd(&g, "addpt2"); + b1 = benchadd(&g, "addpt2_avx"); + + while(b0->n > 0 || b1->n > 0){ + a = Pt2(truerand()*frand(), truerand()*frand(), truerand()*frand()); + b = Pt2(truerand()*frand(), truerand()*frand(), truerand()*frand()); + + benchin(b0); + for(i = 0; i < 1e6; i++) + addpt2(a, b); + benchout(b0); + + benchin(b1); + for(i = 0; i < 1e6; i++) + addpt2_avx(a, b); benchout(b1); } @@ -124,11 +267,19 @@ threadmain(int argc, char **argv) if(benchwire(0) != 0) fprint(2, "failed to wire: %r\n"); + bmin(1); + bseparator(1); bdotvec2(1); bseparator(1); bdotvec3(1); bseparator(1); bcrossvec3(1); + bseparator(1); + bPt2(1); + bseparator(1); + bfma(1); + bseparator(1); + baddpt2(1); threadexitsall(nil); } diff --git a/bench/mkfile b/bench/mkfile new file mode 100644 index 0000000..e649008 --- /dev/null +++ b/bench/mkfile @@ -0,0 +1,23 @@ +</$objtype/mkfile + +TARG=bench9 +BIN=/$objtype/bin +arch=`{echo __^$objtype^__} 
+CFLAGS=$CFLAGS -D$arch -p + +HFILES=\ + ../bench9/b.h\ + ../regs.h\ + ../sse.h\ + ../avx.h\ + +OFILES=\ + ../bench9/b.$O\ + ../bench9/b_$objtype.$O\ + ../min.$O\ + ../dppd.$O\ + main.$O\ + +default:V: all + +</sys/src/cmd/mkone @@ -5,24 +5,21 @@ DATA one(SB)/8,$1.0 GLOBL one(SB), $8 -TEXT dppd(SB), 1, $0 +TEXT dotvec2_sse4(SB), 1, $0 MOVQ SP, AX - MOVLPD(8, rAX, rX0) /* MOVLPD a+0(FP), X0 */ - MOVHPD(16, rAX, rX0) /* MOVHPD a+8(FP), X0 */ - MOVLPD(32, rAX, rX1) /* MOVLPD b+24(FP), X1 */ - MOVHPD(40, rAX, rX1) /* MOVHPD b+32(FP), X1*/ + MOVDQU_mr(8, rAX, rX0) /* MOVDQU a+0(FP), X0 */ + MOVDQU_mr(32, rAX, rX1) /* MOVDQU b+24(FP), X1 */ DPPD(rX1, rX0) /* DPPD $0x31, X1, X0 */ RET -TEXT dppda(SB), 1, $0 +TEXT dotvec2_avx(SB), 1, $0 MOVQ SP, AX VMOVUPD_128mr(8, rAX, rX0) /* VMOVUPD a+0(FP), X0 */ VMOVUPD_128mr(32, rAX, rX1) /* VMOVUPD b+24(FP), X1 */ VDPPD(rX1, rX0, rX0) /* VDPPD $0x31, X1, X0, X0 */ - VZEROUPPER RET -TEXT dppd3(SB), 1, $0 +TEXT dotvec3_sse4(SB), 1, $0 MOVQ SP, AX MOVLPD(8, rAX, rX0) /* MOVLPD a+0(FP), X0 */ MOVHPD(16, rAX, rX0) /* MOVHPD a+8(FP), X0 */ @@ -35,7 +32,7 @@ TEXT dppd3(SB), 1, $0 DPPD(rX1, rX0) /* DPPD $0x31, X1, X0 */ RET -TEXT dppd3a(SB), 1, $0 +TEXT dotvec3_avx(SB), 1, $0 MOVQ SP, AX VMOVUPD_128mr(8, rAX, rX0) /* VMOVUPD a+0(FP), X0 */ VMOVUPD_128mr(40, rAX, rX1) /* VMOVUPD b+32(FP), X1 */ @@ -43,7 +40,6 @@ TEXT dppd3a(SB), 1, $0 MOVSD a+16(FP), X1 MOVSD b+48(FP), X2 VFMADD231SD(rX1, rX2, rX0) - VZEROUPPER RET TEXT Pt2b(SB), 1, $0 @@ -63,7 +59,7 @@ TEXT hsubpd(SB), 1, $0 HSUBPD(rX0, rX0) /* HSUBPD X0, X0 */ RET -TEXT xvec3(SB), 1, $0 +TEXT crossvec3_sse(SB), 1, $0 MOVQ SP, AX ADDQ $8, AX MOVLPD(40, rAX, rX0) /* MOVLPD b+32(FP), X0 */ @@ -91,7 +87,7 @@ TEXT xvec3(SB), 1, $0 MOVSD X0, 24(DI) RET -TEXT xvec3a(SB), 1, $0 +TEXT crossvec3_avx(SB), 1, $0 MOVQ SP, AX ADDQ $8, AX @@ -101,5 +97,13 @@ TEXT fma(SB), 1, $0 MOVSD b+8(FP), X1 MOVSD c+16(FP), X2 VFMADD231SD(rX1, rX2, rX0) - VZEROUPPER + RET + +TEXT addpt2_avx(SB), 1, $0 + MOVQ SP, AX + 
ADDQ $8, AX + VMOVDQU_256mr(8, rAX, rX0) + VMOVDQU_256mr(32, rAX, rX1) + VADDPD_256rr(rX1, rX0, rX0) + VMOVDQU_256rm(rX0, rAX) RET @@ -2,16 +2,16 @@ #include <libc.h> #include <geometry.h> -uvlong nanosec(void); double min(double, double); -double dppd(Point2, Point2); -double dppda(Point2, Point2); -double dppd3(Point3, Point3); -double dppd3a(Point3, Point3); +double dotvec2_sse4(Point2, Point2); +double dotvec2_avx(Point2, Point2); +double dotvec3_sse4(Point3, Point3); +double dotvec3_avx(Point3, Point3); Point2 Pt2b(double, double, double); -Point3 xvec3(Point3, Point3); +Point3 crossvec3_sse(Point3, Point3); double hsubpd(double, double); double fma(double, double, double); +Point2 addpt2_avx(Point2, Point2); double fmin(double a, double b) @@ -19,13 +19,18 @@ fmin(double a, double b) return a<b? a: b; } +double +madd(double a, double b, double c) +{ + return a + b*c; +} + void main(int argc, char *argv[]) { - uvlong t0, t1; double a, b, r; - Point2 p0, p1; - Point3 p0t, p1t, pr; + Point2 p0, p1, pr; + Point3 p0t, p1t, prt; GEOMfmtinstall(); ARGBEGIN{default:sysfatal("shit");}ARGEND @@ -34,75 +39,62 @@ main(int argc, char *argv[]) a = strtod(argv[0], nil); b = strtod(argv[1], nil); - t0 = nanosec(); r = fmin(a, b); - t1 = nanosec(); - print("fmin(%g, %g) = %g\ttook %lludns\n", a, b, r, t1-t0); - t0 = nanosec(); + print("fmin(%g, %g) = %g\n", a, b, r); r = min(a, b); - t1 = nanosec(); - print("min(%g, %g) = %g\ttook %lludns\n", a, b, r, t1-t0); + print("min(%g, %g) = %g\n", a, b, r); print("\n"); p0 = Pt2b(a, 1, 1); p1 = Pt2b(b, 3, 1); - t0 = nanosec(); - r = dppd(p0, p1); - t1 = nanosec(); - print("dppd(%v, %v) = %g\ttook %lludns\n", p0, p1, r, t1-t0); - t0 = nanosec(); + r = dotvec2_sse4(p0, p1); + print("dotvec2_sse4(%v, %v) = %g\n", p0, p1, r); r = dotvec2(p0, p1); - t1 = nanosec(); - print("dotvec2(%v, %v) = %g\ttook %lludns\n", p0, p1, r, t1-t0); - t0 = nanosec(); - r = dppda(p0, p1); - t1 = nanosec(); - print("dppda(%v, %v) = %g\ttook %lludns\n", p0, p1, 
r, t1-t0); + print("dotvec2(%v, %v) = %g\n", p0, p1, r); + r = dotvec2_avx(p0, p1); + print("dotvec2_avx(%v, %v) = %g\n", p0, p1, r); print("\n"); p0t = Pt3(a, 1, 9, 1); p1t = Pt3(b, 3, 4, 1); - t0 = nanosec(); - r = dppd3(p0t, p1t); - t1 = nanosec(); - print("dppd3(%V, %V) = %g\ttook %lludns\n", p0t, p1t, r, t1-t0); - t0 = nanosec(); + r = dotvec3_sse4(p0t, p1t); + print("dotvec3_sse4(%V, %V) = %g\n", p0t, p1t, r); r = dotvec3(p0t, p1t); - t1 = nanosec(); - print("dotvec3(%V, %V) = %g\ttook %lludns\n", p0t, p1t, r, t1-t0); - t0 = nanosec(); - r = dppd3a(p0t, p1t); - t1 = nanosec(); - print("dppd3a(%V, %V) = %g\ttook %lludns\n", p0t, p1t, r, t1-t0); + print("dotvec3(%V, %V) = %g\n", p0t, p1t, r); + r = dotvec3_avx(p0t, p1t); + print("dotvec3_avx(%V, %V) = %g\n", p0t, p1t, r); print("\n"); - t0 = nanosec(); r = hsubpd(a, b); - t1 = nanosec(); - print("hsubpd(%g, %g) = %g\ttook %lludns\n", a, b, r, t1-t0); + print("hsubpd(%g, %g) = %g\n", a, b, r); print("\n"); p0t = Pt3(a, 1, 9, 1); p1t = Pt3(b, 3, 4, 1); - t0 = nanosec(); - pr = xvec3(p0t, p1t); - t1 = nanosec(); - print("xvec3(%V, %V) = %V\ttook %lludns\n", p0t, p1t, pr, t1-t0); - t0 = nanosec(); - pr = crossvec3(p0t, p1t); - t1 = nanosec(); - print("crossvec3(%V, %V) = %V\ttook %lludns\n", p0t, p1t, pr, t1-t0); + prt = crossvec3_sse(p0t, p1t); + print("crossvec3_sse(%V, %V) = %V\n", p0t, p1t, prt); + prt = crossvec3(p0t, p1t); + print("crossvec3(%V, %V) = %V\n", p0t, p1t, prt); print("\n"); - t0 = nanosec(); + r = madd(a, b, 21); + print("madd(%g, %g, 21) = %g\n", a, b, r); r = fma(a, b, 21); - t1 = nanosec(); - print("fma(%g, %g, 21) = %g\ttook %lludns\n", a, b, r, t1-t0); + print("fma(%g, %g, 21) = %g\n", a, b, r); + + print("\n"); + + p0 = Pt2b(a, 1, 1); + p1 = Pt2b(b, 3, 1); + pr = addpt2(p0, p1); + print("addpt2(%v, %v) = %v\n", p0, p1, pr); + pr = addpt2_avx(p0, p1); + print("addpt2_avx(%v, %v) = %v\n", p0, p1, pr); exits(nil); } @@ -6,7 +6,6 @@ OFILES=\ main.$O\ min.$O\ dppd.$O\ - nanosec.$O\ HFILES=\ 
regs.h\ @@ -14,3 +13,7 @@ HFILES=\ avx.h\ </sys/src/cmd/mkone + +pulldeps:VQ: + git/clone git://shithub.us/sigrid/bench9 || \ + git/clone https://git.sr.ht/~ft/bench9 diff --git a/nanosec.c b/nanosec.c deleted file mode 100644 index f82d47a..0000000 --- a/nanosec.c +++ /dev/null @@ -1,109 +0,0 @@ -#include <u.h> -#include <libc.h> -#include <tos.h> - -/* - * This code is a mixture of cpuid(1) and the nanosec() found in vmx, - * in order to force the use of nsec(2) in case we are running in a - * virtualized environment where the clock is mis-bhyve-ing. - */ - -typedef struct Res { - ulong ax, bx, cx, dx; -} Res; - -static uchar _cpuid[] = { - 0x5E, /* POP SI (PC) */ - 0x5D, /* POP BP (Res&) */ - 0x58, /* POP AX */ - 0x59, /* POP CX */ - - 0x51, /* PUSH CX */ - 0x50, /* PUSH AX */ - 0x55, /* PUSH BP */ - 0x56, /* PUSH SI */ - - 0x31, 0xDB, /* XOR BX, BX */ - 0x31, 0xD2, /* XOR DX, DX */ - - 0x0F, 0xA2, /* CPUID */ - - 0x89, 0x45, 0x00, /* MOV AX, 0(BP) */ - 0x89, 0x5d, 0x04, /* MOV BX, 4(BP) */ - 0x89, 0x4d, 0x08, /* MOV CX, 8(BP) */ - 0x89, 0x55, 0x0C, /* MOV DX, 12(BP) */ - 0xC3, /* RET */ -}; - -static Res (*cpuid)(ulong ax, ulong cx) = (Res(*)(ulong, ulong)) _cpuid; - -/* - * nsec() is wallclock and can be adjusted by timesync - * so need to use cycles() instead, but fall back to - * nsec() in case we can't - */ -uvlong -nanosec(void) -{ - static uvlong fasthz, xstart; - char buf[13], path[128]; - ulong w; - uvlong x, div; - int fd; - Res r; - - if(fasthz == ~0ULL) - return nsec() - xstart; - - if(fasthz == 0){ - /* first long in a.out header */ - snprint(path, sizeof path, "/proc/%d/text", getpid()); - fd = open(path, OREAD); - if(fd < 0) - goto Wallclock; - if(read(fd, buf, 4) != 4){ - close(fd); - goto Wallclock; - } - close(fd); - - w = ((ulong *) buf)[0]; - - switch(w){ - default: - goto Wallclock; - case 0x978a0000: /* amd64 */ - /* patch out POP BP -> POP AX */ - _cpuid[1] = 0x58; - case 0xeb010000: /* 386 */ - break; - } - segflush(_cpuid, 
sizeof(_cpuid)); - - r = cpuid(0x40000000, 0); - ((ulong *) buf)[0] = r.bx; - ((ulong *) buf)[1] = r.cx; - ((ulong *) buf)[2] = r.dx; - buf[12] = 0; - - if(strstr(buf, "bhyve") != nil) - goto Wallclock; - - if(_tos->cyclefreq){ - fasthz = _tos->cyclefreq; - cycles(&xstart); - } else { -Wallclock: - fasthz = ~0ULL; - xstart = nsec(); - } - return 0; - } - cycles(&x); - x -= xstart; - - /* this is ugly */ - for(div = 1000000000ULL; x < 0x1999999999999999ULL && div > 1 ; div /= 10ULL, x *= 10ULL); - - return x / (fasthz / div); -} @@ -6,7 +6,18 @@ BYTE $(((m)<<6)|((ro)<<3)|(rm)) #define OP4i(o, m, ro, rm, i) OP4((o), (m), (ro), (rm)); \ BYTE $(i) +#define F3OP(o, m, ro, rm) WORD $0x0FF3; BYTE $(o); \ + BYTE $(((m)<<6)|((ro)<<3)|(rm)) +#define F3OPi(o, m, ro, rm, i) F3OP((o), (m), (ro), (rm)); \ + BYTE $(i) + +/* MOVDQA */ +#define MOVDQA_mr(off, s, d) OPi(0x6F, 0x1, (d), (s), (off)) +#define MOVDQA_rm(off, s, d) OPi(0x7F, 0x1, (s), (d), (off)) +/* MOVDQU */ +#define MOVDQU_mr(off, s, d) F3OPi(0x6F, 0x1, (d), (s), (off)) +#define MOVDQU_rm(off, s, d) F3OPi(0x7F, 0x1, (s), (d), (off)) /* MOVLPD */ //opcode = 660F12 |