diff options
author | rodri <rgl@antares-labs.eu> | 2023-11-24 15:39:06 +0000 |
---|---|---|
committer | rodri <rgl@antares-labs.eu> | 2023-11-24 15:39:06 +0000 |
commit | d850c3b7f47e58556c160f9d03ea20aa52452020 (patch) | |
tree | 20471edd63ca1c23349a1c8e340155fabedf5b27 | |
parent | 9404d16a4263a87559af64bfb18c91ccebaa601d (diff) | |
download | amd64-simd-d850c3b7f47e58556c160f9d03ea20aa52452020.tar.gz amd64-simd-d850c3b7f47e58556c160f9d03ea20aa52452020.tar.bz2 amd64-simd-d850c3b7f47e58556c160f9d03ea20aa52452020.zip |
add more avx instructions and a bench9 benchmark file.
-rw-r--r-- | bench/main.c | 134 | ||||
-rw-r--r-- | dppd.s | 17 | ||||
-rw-r--r-- | main.c | 20 | ||||
-rw-r--r-- | sse.h | 11 |
4 files changed, 182 insertions, 0 deletions
diff --git a/bench/main.c b/bench/main.c
new file mode 100644
index 0000000..9d00719
--- /dev/null
+++ b/bench/main.c
@@ -0,0 +1,134 @@
+#include <u.h>
+#include <libc.h>
+#include <thread.h>
+#include <geometry.h>
+#include "b.h"
+
+double dppd(Point2, Point2);
+double dppda(Point2, Point2);
+double dppd3(Point3, Point3);
+double dppd3a(Point3, Point3);
+Point3 xvec3(Point3, Point3);
+
+static void
+bdotvec2(int fd)
+{
+	Bgr g;
+	B *b0, *b1, *b2;
+	Point2 a, b;
+	int i;
+
+	benchinitgr(&g, "2d dot product");
+	b0 = benchadd(&g, "dotvec2");
+	b1 = benchadd(&g, "dotvec2_simd");
+	b2 = benchadd(&g, "dotvec2_avx");
+
+	while(b0->n > 0 || b1->n > 0){
+		a = Vec2(truerand()*frand(), truerand()*frand());
+		b = Vec2(truerand()*frand(), truerand()*frand());
+
+		benchin(b0);
+		for(i = 0; i < 1e6; i++)
+			dotvec2(a, b);
+		benchout(b0);
+
+		benchin(b1);
+		for(i = 0; i < 1e6; i++)
+			dppd(a, b);
+		benchout(b1);
+
+		benchin(b2);
+		for(i = 0; i < 1e6; i++)
+			dppda(a, b);
+		benchout(b2);
+	}
+
+	benchprintgr(&g, fd);
+	benchfreegr(&g);
+}
+
+static void
+bdotvec3(int fd)
+{
+	Bgr g;
+	B *b0, *b1, *b2;
+	Point3 a, b;
+	int i;
+
+	benchinitgr(&g, "3d dot product");
+	b0 = benchadd(&g, "dotvec3");
+	b1 = benchadd(&g, "dotvec3_simd");
+	b2 = benchadd(&g, "dotvec3_avx");
+
+	while(b0->n > 0 || b1->n > 0){
+		a = Vec3(truerand()*frand(), truerand()*frand(), truerand()*frand());
+		b = Vec3(truerand()*frand(), truerand()*frand(), truerand()*frand());
+
+		benchin(b0);
+		for(i = 0; i < 1e6; i++)
+			dotvec3(a, b);
+		benchout(b0);
+
+		benchin(b1);
+		for(i = 0; i < 1e6; i++)
+			dppd3(a, b);
+		benchout(b1);
+
+		benchin(b2);
+		for(i = 0; i < 1e6; i++)
+			dppd3a(a, b);
+		benchout(b2);
+	}
+
+	benchprintgr(&g, fd);
+	benchfreegr(&g);
+}
+
+static void
+bcrossvec3(int fd)
+{
+	Bgr g;
+	B *b0, *b1;
+	Point3 a, b;
+	int i;
+
+	benchinitgr(&g, "3d cross product");
+	b0 = benchadd(&g, "crossvec3");
+	b1 = benchadd(&g, "crossvec3_simd");
+
+	while(b0->n > 0 || b1->n > 0){
+		a = Vec3(truerand()*frand(), truerand()*frand(), truerand()*frand());
+		b = Vec3(truerand()*frand(), truerand()*frand(), truerand()*frand());
+
+		benchin(b0);
+		for(i = 0; i < 1e6; i++)
+			crossvec3(a, b);
+		benchout(b0);
+
+		benchin(b1);
+		for(i = 0; i < 1e6; i++)
+			xvec3(a, b);
+		benchout(b1);
+	}
+
+	benchprintgr(&g, fd);
+	benchfreegr(&g);
+}
+
+void
+threadmain(int argc, char **argv)
+{
+	ARGBEGIN{
+	}ARGEND
+
+	if(benchwire(0) != 0)
+		fprint(2, "failed to wire: %r\n");
+
+	bdotvec2(1);
+	bseparator(1);
+	bdotvec3(1);
+	bseparator(1);
+	bcrossvec3(1);
+
+	threadexitsall(nil);
+}
@@ -12,6 +12,13 @@ TEXT dppd(SB), 1, $0
 	DPPD(rX1, rX0)	/* DPPD $0x31, X1, X0 */
 	RET
 
+TEXT dppda(SB), 1, $0
+	MOVQ SP, AX
+	VMOVUPD_128mr(8, rAX, rX0)	/* VMOVUPD a+0(FP), X0 */
+	VMOVUPD_128mr(32, rAX, rX1)	/* VMOVUPD b+24(FP), X1 */
+	VDPPD(rX1, rX0, rX0)	/* VDPPD $0x31, X1, X0, X0 */
+	RET
+
 TEXT dppd3(SB), 1, $0
 	MOVQ SP, AX
 	MOVLPD(8, rAX, rX0)	/* MOVLPD a+0(FP), X0 */
@@ -25,6 +32,16 @@ TEXT dppd3(SB), 1, $0
 	DPPD(rX1, rX0)	/* DPPD $0x31, X1, X0 */
 	RET
 
+TEXT dppd3a(SB), 1, $0
+	MOVQ SP, AX
+	VMOVUPD_128mr(8, rAX, rX0)	/* VMOVUPD a+0(FP), X0 */
+	VMOVUPD_128mr(40, rAX, rX1)	/* VMOVUPD b+32(FP), X1 */
+	VDPPD(rX1, rX0, rX0)	/* VDPPD $0x31, X1, X0, X0 */
+	MOVSD a+16(FP), X1
+	MOVSD b+48(FP), X2
+	VFMADD231SD(rX1, rX2, rX0)
+	RET
+
 TEXT Pt2b(SB), 1, $0
 	MOVQ BP, DI
 	MOVSD x+8(FP), X0
@@ -5,7 +5,9 @@
 uvlong nanosec(void);
 double min(double, double);
 double dppd(Point2, Point2);
+double dppda(Point2, Point2);
 double dppd3(Point3, Point3);
+double dppd3a(Point3, Point3);
 Point2 Pt2b(double, double, double);
 Point3 xvec3(Point3, Point3);
 double hsubpd(double, double);
@@ -41,6 +43,8 @@ main(int argc, char *argv[])
 	t1 = nanosec();
 	print("min(%g, %g) = %g\ttook %lludns\n", a, b, r, t1-t0);
 
+	print("\n");
+
 	p0 = Pt2b(a, 1, 1);
 	p1 = Pt2b(b, 3, 1);
 	t0 = nanosec();
@@ -51,6 +55,12 @@ main(int argc, char *argv[])
 	r = dotvec2(p0, p1);
 	t1 = nanosec();
 	print("dotvec2(%v, %v) = %g\ttook %lludns\n", p0, p1, r, t1-t0);
+	t0 = nanosec();
+	r = dppda(p0, p1);
+	t1 = nanosec();
+	print("dppda(%v, %v) = %g\ttook %lludns\n", p0, p1, r, t1-t0);
+
+	print("\n");
 
 	p0t = Pt3(a, 1, 9, 1);
 	p1t = Pt3(b, 3, 4, 1);
@@ -62,12 +72,20 @@ main(int argc, char *argv[])
 	r = dotvec3(p0t, p1t);
 	t1 = nanosec();
 	print("dotvec3(%V, %V) = %g\ttook %lludns\n", p0t, p1t, r, t1-t0);
+	t0 = nanosec();
+	r = dppd3a(p0t, p1t);
+	t1 = nanosec();
+	print("dppd3a(%V, %V) = %g\ttook %lludns\n", p0t, p1t, r, t1-t0);
+
+	print("\n");
 
 	t0 = nanosec();
 	r = hsubpd(a, b);
 	t1 = nanosec();
 	print("hsubpd(%g, %g) = %g\ttook %lludns\n", a, b, r, t1-t0);
 
+	print("\n");
+
 	p0t = Pt3(a, 1, 9, 1);
 	p1t = Pt3(b, 3, 4, 1);
 	t0 = nanosec();
@@ -79,6 +97,8 @@ main(int argc, char *argv[])
 	t1 = nanosec();
 	print("crossvec3(%V, %V) = %V\ttook %lludns\n", p0t, p1t, pr, t1-t0);
 
+	print("\n");
+
 	t0 = nanosec();
 	r = fma(a, b, 21);
 	t1 = nanosec();
@@ -41,6 +41,8 @@
 	BYTE $(((~r)<<7)|((~v)<<3)|((l)<<2)|(p))
 #define VOP(o, m, ro, rm) BYTE $(o); \
 	BYTE $(((m)<<6)|((ro)<<3)|(rm))
+#define VOPi(o, m, ro, rm, i) VOP((o), (m), (ro), (rm)); \
+	BYTE $(i)
 
 /* MOVLPD */
 //opcode = 660F12
@@ -65,6 +67,15 @@
 //imm8 = 0011 0001
 #define DPPD(s, d) OP4i(0x413A, 0x3, (d), (s), 0x31)
 
+/* VMOVAPD */
+#define VMOVUPD_128mr(off, s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_66); \
+	VOPi(0x10, 0x1, (d), (s), (off))
+#define VMOVAPD_128rr(s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_66); \
+	VOP(0x28, 0x3, (d), (s))
+/* VDPPD */
+#define VDPPD(s0, s1, d) VEX3(0,0,0,VEX_m_0F3A,0,(s0),VEX_L_128,VEX_p_66); \
+	VOPi(0x41, 0x3, (d), (s1), 0x31)
+
 /* VFMADD231SD (128 bit) */
 #define VFMADD231SD(s0, s1, d) VEX3(0,0,0,VEX_m_0F38,1,(s0),VEX_L_128,VEX_p_66); \
 	VOP(0xB9, 0x3, (d), (s1)) |