aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author rodri <rgl@antares-labs.eu> 2023-11-24 15:39:06 +0000
committer rodri <rgl@antares-labs.eu> 2023-11-24 15:39:06 +0000
commit d850c3b7f47e58556c160f9d03ea20aa52452020 (patch)
tree 20471edd63ca1c23349a1c8e340155fabedf5b27
parent 9404d16a4263a87559af64bfb18c91ccebaa601d (diff)
download amd64-simd-d850c3b7f47e58556c160f9d03ea20aa52452020.tar.gz
amd64-simd-d850c3b7f47e58556c160f9d03ea20aa52452020.tar.bz2
amd64-simd-d850c3b7f47e58556c160f9d03ea20aa52452020.zip
add more avx instructions and a bench9 benchmark file.
-rw-r--r-- bench/main.c 134
-rw-r--r-- dppd.s 17
-rw-r--r-- main.c 20
-rw-r--r-- sse.h 11
4 files changed, 182 insertions, 0 deletions
diff --git a/bench/main.c b/bench/main.c
new file mode 100644
index 0000000..9d00719
--- /dev/null
+++ b/bench/main.c
@@ -0,0 +1,134 @@
+#include <u.h>
+#include <libc.h>
+#include <thread.h>
+#include <geometry.h>
+#include "b.h"
+
+double dppd(Point2, Point2);
+double dppda(Point2, Point2);
+double dppd3(Point3, Point3);
+double dppd3a(Point3, Point3);
+Point3 xvec3(Point3, Point3);
+
+/*
+ * bdotvec2 runs the "2d dot product" benchmark group: the reference
+ * dotvec2 against the dppd ("dotvec2_simd") and dppda ("dotvec2_avx")
+ * assembly routines, then prints the group report to fd.
+ */
+static void
+bdotvec2(int fd)
+{
+	Bgr g;
+	B *b0, *b1, *b2;
+	Point2 a, b;
+	int i;
+
+	benchinitgr(&g, "2d dot product");
+	b0 = benchadd(&g, "dotvec2");
+	b1 = benchadd(&g, "dotvec2_simd");
+	b2 = benchadd(&g, "dotvec2_avx");
+
+	/*
+	 * Loop until every benchmark of the group is done.  b2 must be
+	 * part of the condition as well, otherwise the avx benchmark could
+	 * be cut short once the other two finish.
+	 * NOTE(review): assumes B.n is the number of runs still pending,
+	 * as the countdown-style test suggests; confirm against b.h.
+	 */
+	while(b0->n > 0 || b1->n > 0 || b2->n > 0){
+		/* fresh random operands for each round, shared by all three */
+		a = Vec2(truerand()*frand(), truerand()*frand());
+		b = Vec2(truerand()*frand(), truerand()*frand());
+
+		/* each sample brackets 1e6 calls between benchin and benchout */
+		benchin(b0);
+		for(i = 0; i < 1e6; i++)
+			dotvec2(a, b);
+		benchout(b0);
+
+		benchin(b1);
+		for(i = 0; i < 1e6; i++)
+			dppd(a, b);
+		benchout(b1);
+
+		benchin(b2);
+		for(i = 0; i < 1e6; i++)
+			dppda(a, b);
+		benchout(b2);
+	}
+
+	benchprintgr(&g, fd);
+	benchfreegr(&g);
+}
+
+/*
+ * bdotvec3 runs the "3d dot product" benchmark group: the reference
+ * dotvec3 against the dppd3 ("dotvec3_simd") and dppd3a ("dotvec3_avx")
+ * assembly routines, then prints the group report to fd.
+ */
+static void
+bdotvec3(int fd)
+{
+	Bgr g;
+	B *b0, *b1, *b2;
+	Point3 a, b;
+	int i;
+
+	benchinitgr(&g, "3d dot product");
+	b0 = benchadd(&g, "dotvec3");
+	b1 = benchadd(&g, "dotvec3_simd");
+	b2 = benchadd(&g, "dotvec3_avx");
+
+	/*
+	 * Loop until every benchmark of the group is done.  b2 must be
+	 * part of the condition as well, otherwise the avx benchmark could
+	 * be cut short once the other two finish.
+	 * NOTE(review): assumes B.n is the number of runs still pending,
+	 * as the countdown-style test suggests; confirm against b.h.
+	 */
+	while(b0->n > 0 || b1->n > 0 || b2->n > 0){
+		/* fresh random operands for each round, shared by all three */
+		a = Vec3(truerand()*frand(), truerand()*frand(), truerand()*frand());
+		b = Vec3(truerand()*frand(), truerand()*frand(), truerand()*frand());
+
+		/* each sample brackets 1e6 calls between benchin and benchout */
+		benchin(b0);
+		for(i = 0; i < 1e6; i++)
+			dotvec3(a, b);
+		benchout(b0);
+
+		benchin(b1);
+		for(i = 0; i < 1e6; i++)
+			dppd3(a, b);
+		benchout(b1);
+
+		benchin(b2);
+		for(i = 0; i < 1e6; i++)
+			dppd3a(a, b);
+		benchout(b2);
+	}
+
+	benchprintgr(&g, fd);
+	benchfreegr(&g);
+}
+
+/*
+ * bcrossvec3 runs the "3d cross product" benchmark group, timing the
+ * reference crossvec3 against the xvec3 assembly routine, and prints
+ * the group report to fd.
+ */
+static void
+bcrossvec3(int fd)
+{
+	Bgr grp;
+	B *ref, *simd;
+	Point3 u, v;
+	int k;
+
+	benchinitgr(&grp, "3d cross product");
+	ref = benchadd(&grp, "crossvec3");
+	simd = benchadd(&grp, "crossvec3_simd");
+
+	for(;;){
+		/* stop once neither benchmark has anything left to run */
+		if(ref->n <= 0 && simd->n <= 0)
+			break;
+
+		/* both routines are fed the same pair of random vectors */
+		u = Vec3(truerand()*frand(), truerand()*frand(), truerand()*frand());
+		v = Vec3(truerand()*frand(), truerand()*frand(), truerand()*frand());
+
+		benchin(ref);
+		for(k = 0; k < 1e6; k++)
+			crossvec3(u, v);
+		benchout(ref);
+
+		benchin(simd);
+		for(k = 0; k < 1e6; k++)
+			xvec3(u, v);
+		benchout(simd);
+	}
+
+	benchprintgr(&grp, fd);
+	benchfreegr(&grp);
+}
+
+/*
+ * threadmain is the benchmark program's entry point.  It takes no
+ * options, runs the three benchmark groups in order and writes their
+ * reports, with separators in between, to file descriptor 1.
+ */
+void
+threadmain(int argc, char **argv)
+{
+	int out = 1;	/* every report goes to standard output */
+
+	ARGBEGIN{
+	}ARGEND
+
+	/* a wiring failure is reported on fd 2 but is not fatal */
+	if(benchwire(0) != 0)
+		fprint(2, "failed to wire: %r\n");
+
+	bdotvec2(out);
+	bseparator(out);
+	bdotvec3(out);
+	bseparator(out);
+	bcrossvec3(out);
+
+	threadexitsall(nil);
+}
diff --git a/dppd.s b/dppd.s
index 907a437..7239d9f 100644
--- a/dppd.s
+++ b/dppd.s
@@ -12,6 +12,13 @@ TEXT dppd(SB), 1, $0
DPPD(rX1, rX0) /* DPPD $0x31, X1, X0 */
RET
+/*
+ * double dppda(Point2 a, Point2 b)
+ * AVX counterpart of dppd: loads 16 bytes from the start of each
+ * argument and combines them with VDPPD $0x31 (multiply both double
+ * pairs, write the sum to the low quadword).  The result is left in X0.
+ */
+TEXT dppda(SB), 1, $0
+ MOVQ SP, AX /* address the arguments through AX; NOTE(review): presumably because the macro's modrm byte cannot encode SP as a base without a SIB byte */
+ VMOVUPD_128mr(8, rAX, rX0) /* VMOVUPD a+0(FP), X0 */
+ VMOVUPD_128mr(32, rAX, rX1) /* VMOVUPD b+24(FP), X1 */
+ VDPPD(rX1, rX0, rX0) /* VDPPD $0x31, X1, X0, X0 */
+ RET
+
TEXT dppd3(SB), 1, $0
MOVQ SP, AX
MOVLPD(8, rAX, rX0) /* MOVLPD a+0(FP), X0 */
@@ -25,6 +32,16 @@ TEXT dppd3(SB), 1, $0
DPPD(rX1, rX0) /* DPPD $0x31, X1, X0 */
RET
+/*
+ * double dppd3a(Point3 a, Point3 b)
+ * AVX counterpart of dppd3: VDPPD $0x31 forms the dot product of the
+ * first two doubles of each argument, then a fused multiply-add folds
+ * in the product of the third doubles (a+16, b+48).  The result is
+ * left in X0.
+ */
+TEXT dppd3a(SB), 1, $0
+ MOVQ SP, AX /* address the arguments through AX; NOTE(review): presumably because the macro's modrm byte cannot encode SP as a base without a SIB byte */
+ VMOVUPD_128mr(8, rAX, rX0) /* VMOVUPD a+0(FP), X0 */
+ VMOVUPD_128mr(40, rAX, rX1) /* VMOVUPD b+32(FP), X1 */
+ VDPPD(rX1, rX0, rX0) /* VDPPD $0x31, X1, X0, X0 */
+ MOVSD a+16(FP), X1 /* third double of a */
+ MOVSD b+48(FP), X2 /* third double of b */
+ VFMADD231SD(rX1, rX2, rX0) /* VFMADD231SD X1, X2, X0: X0 = X1*X2 + X0 */
+ RET
+
TEXT Pt2b(SB), 1, $0
MOVQ BP, DI
MOVSD x+8(FP), X0
diff --git a/main.c b/main.c
index fe51889..f14a7ed 100644
--- a/main.c
+++ b/main.c
@@ -5,7 +5,9 @@
uvlong nanosec(void);
double min(double, double);
double dppd(Point2, Point2);
+double dppda(Point2, Point2);
double dppd3(Point3, Point3);
+double dppd3a(Point3, Point3);
Point2 Pt2b(double, double, double);
Point3 xvec3(Point3, Point3);
double hsubpd(double, double);
@@ -41,6 +43,8 @@ main(int argc, char *argv[])
t1 = nanosec();
print("min(%g, %g) = %g\ttook %lludns\n", a, b, r, t1-t0);
+ print("\n");
+
p0 = Pt2b(a, 1, 1);
p1 = Pt2b(b, 3, 1);
t0 = nanosec();
@@ -51,6 +55,12 @@ main(int argc, char *argv[])
r = dotvec2(p0, p1);
t1 = nanosec();
print("dotvec2(%v, %v) = %g\ttook %lludns\n", p0, p1, r, t1-t0);
+ t0 = nanosec();
+ r = dppda(p0, p1);
+ t1 = nanosec();
+ print("dppda(%v, %v) = %g\ttook %lludns\n", p0, p1, r, t1-t0);
+
+ print("\n");
p0t = Pt3(a, 1, 9, 1);
p1t = Pt3(b, 3, 4, 1);
@@ -62,12 +72,20 @@ main(int argc, char *argv[])
r = dotvec3(p0t, p1t);
t1 = nanosec();
print("dotvec3(%V, %V) = %g\ttook %lludns\n", p0t, p1t, r, t1-t0);
+ t0 = nanosec();
+ r = dppd3a(p0t, p1t);
+ t1 = nanosec();
+ print("dppd3a(%V, %V) = %g\ttook %lludns\n", p0t, p1t, r, t1-t0);
+
+ print("\n");
t0 = nanosec();
r = hsubpd(a, b);
t1 = nanosec();
print("hsubpd(%g, %g) = %g\ttook %lludns\n", a, b, r, t1-t0);
+ print("\n");
+
p0t = Pt3(a, 1, 9, 1);
p1t = Pt3(b, 3, 4, 1);
t0 = nanosec();
@@ -79,6 +97,8 @@ main(int argc, char *argv[])
t1 = nanosec();
print("crossvec3(%V, %V) = %V\ttook %lludns\n", p0t, p1t, pr, t1-t0);
+ print("\n");
+
t0 = nanosec();
r = fma(a, b, 21);
t1 = nanosec();
diff --git a/sse.h b/sse.h
index ffad543..6a03c9e 100644
--- a/sse.h
+++ b/sse.h
@@ -41,6 +41,8 @@
BYTE $(((~r)<<7)|((~v)<<3)|((l)<<2)|(p))
#define VOP(o, m, ro, rm) BYTE $(o); \
BYTE $(((m)<<6)|((ro)<<3)|(rm))
+/* VOP followed by one extra byte i: a disp8 (VMOVUPD_128mr) or an imm8 (VDPPD) */
+#define VOPi(o, m, ro, rm, i) VOP((o), (m), (ro), (rm)); \
+ BYTE $(i)
/* MOVLPD */
//opcode = 660F12
@@ -65,6 +67,15 @@
//imm8 = 0011 0001
#define DPPD(s, d) OP4i(0x413A, 0x3, (d), (s), 0x31)
+/*
+ * VMOVUPD, VMOVAPD (128 bit)
+ * VMOVUPD_128mr(off, s, d): load 16 bytes from off(s) into XMM register d.
+ *	modrm mod is 01, so off is emitted as a single displacement byte.
+ * VMOVAPD_128rr(s, d): copy XMM register s into XMM register d.
+ */
+#define VMOVUPD_128mr(off, s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_66); \
+ VOPi(0x10, 0x1, (d), (s), (off))
+#define VMOVAPD_128rr(s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_66); \
+ VOP(0x28, 0x3, (d), (s))
+/*
+ * VDPPD (128 bit), register operands only
+ * VDPPD(s0, s1, d): d = dot product of s0 and s1; the imm8 is fixed at
+ * 0x31, the same selector the DPPD macro uses.
+ */
+#define VDPPD(s0, s1, d) VEX3(0,0,0,VEX_m_0F3A,0,(s0),VEX_L_128,VEX_p_66); \
+ VOPi(0x41, 0x3, (d), (s1), 0x31)
+
+
/* VFMADD231SD (128 bit) */
#define VFMADD231SD(s0, s1, d) VEX3(0,0,0,VEX_m_0F38,1,(s0),VEX_L_128,VEX_p_66); \
VOP(0xB9, 0x3, (d), (s1))