about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: rodri <rgl@antares-labs.eu> 2023-11-29 21:19:16 +0000
committer: rodri <rgl@antares-labs.eu> 2023-11-29 21:19:16 +0000
commit: a0b600a89c2e6e636579fe727235d036c08c7a9d (patch)
tree: 48be9afc10ab59f68aa4670d7b8b929e4d569d5f
parent: 092bcb0cb43b4a1ca351a3085c512bf6afa89989 (diff)
download: amd64-simd-a0b600a89c2e6e636579fe727235d036c08c7a9d.tar.gz
amd64-simd-a0b600a89c2e6e636579fe727235d036c08c7a9d.tar.bz2
amd64-simd-a0b600a89c2e6e636579fe727235d036c08c7a9d.zip
remove instructions recently added to 9front. implemented some tests.
-rw-r--r-- avx.h 2
-rw-r--r-- bench/main.c 17
-rw-r--r-- dppd.s 32
-rw-r--r-- main.c 34
-rw-r--r-- sse.h 23
5 files changed, 73 insertions, 35 deletions
diff --git a/avx.h b/avx.h
index 5ae12ec..76a7e4b 100644
--- a/avx.h
+++ b/avx.h
@@ -55,7 +55,7 @@
#define VMOVDQA_256rm(s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_256,VEX_p_66); \
VOP(0x7F, 0x3, (s), (d))
-/* VMODQU */
+/* VMOVDQU */
#define VMOVDQU_128mr(off, s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_F3); \
VOPi(0x6F, 0x1, (d), (s), (off))
#define VMOVDQU_128rm(s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_F3); \
diff --git a/bench/main.c b/bench/main.c
index 060a3d3..6f4886f 100644
--- a/bench/main.c
+++ b/bench/main.c
@@ -5,6 +5,7 @@
#include "../bench9/b.h"
double min(double, double);
+double dotvec2_sse(Point2, Point2);
double dotvec2_sse4(Point2, Point2);
double dotvec2_avx(Point2, Point2);
double dotvec3_sse4(Point3, Point3);
@@ -64,14 +65,15 @@ static void
bdotvec2(int fd)
{
Bgr g;
- B *b0, *b1, *b2;
+ B *b0, *b1, *b2, *b3;
Point2 a, b;
int i;
benchinitgr(&g, "2d dot product");
b0 = benchadd(&g, "dotvec2");
- b1 = benchadd(&g, "dotvec2_sse4");
- b2 = benchadd(&g, "dotvec2_avx");
+ b1 = benchadd(&g, "dotvec2_sse");
+ b2 = benchadd(&g, "dotvec2_sse4");
+ b3 = benchadd(&g, "dotvec2_avx");
while(b0->n > 0 || b1->n > 0){
a = Vec2(truerand()*frand(), truerand()*frand());
@@ -84,13 +86,18 @@ bdotvec2(int fd)
benchin(b1);
for(i = 0; i < 1e6; i++)
- dotvec2_sse4(a, b);
+ dotvec2_sse(a, b);
benchout(b1);
benchin(b2);
for(i = 0; i < 1e6; i++)
- dotvec2_avx(a, b);
+ dotvec2_sse4(a, b);
benchout(b2);
+
+ benchin(b3);
+ for(i = 0; i < 1e6; i++)
+ dotvec2_avx(a, b);
+ benchout(b3);
}
benchprintgr(&g, fd);
diff --git a/dppd.s b/dppd.s
index 55ee3d9..d480ddb 100644
--- a/dppd.s
+++ b/dppd.s
@@ -5,10 +5,30 @@
DATA one(SB)/8,$1.0
GLOBL one(SB), $8
+TEXT round(SB), 1, $0
+ MOVSD a+0(FP), X0
+ ROUNDSD $0x4, X0, X0
+ RET
+
+TEXT addsub_sse(SB), 1, $0
+ MOVQ b+8(FP), DX
+ MOVUPD 0(BP), X1
+ MOVUPD 0(DX), X0
+ ADDSUBPD X1, X0
+ MOVUPD X0, 0(DX)
+ RET
+
+TEXT dotvec2_sse(SB), 1, $0
+ MOVUPD a+0(FP), X0
+ MOVUPD b+24(FP), X1
+ MULPD X1, X0
+ HADDPD X0, X0
+ RET
+
TEXT dotvec2_sse4(SB), 1, $0
MOVUPD a+0(FP), X0
MOVUPD b+24(FP), X1
- DPPD(rX1, rX0) /* DPPD $0x31, X1, X0 */
+ DPPD $0x31, X1, X0
RET
TEXT dotvec2_avx(SB), 1, $0
@@ -24,7 +44,7 @@ TEXT dotvec2_avx(SB), 1, $0
TEXT dotvec3_sse4(SB), 1, $0
MOVUPD a+0(FP), X0
MOVUPD b+32(FP), X1
- DPPD(rX1, rX0) /* DPPD $0x31, X1, X0 */
+ DPPD $0x31, X1, X0
MOVSD a+16(FP), X1
MULSD b+48(FP), X1
ADDSD X1, X0
@@ -56,7 +76,7 @@ TEXT Pt2b(SB), 1, $0
TEXT hsubpd(SB), 1, $0
MOVLPD a+0(FP), X0
MOVHPD b+8(FP), X0
- HSUBPD(rX0, rX0) /* HSUBPD X0, X0 */
+ HSUBPD X0, X0
RET
TEXT crossvec3_sse(SB), 1, $0
@@ -68,15 +88,15 @@ TEXT crossvec3_sse(SB), 1, $0
MOVHPD a+24(FP), X2 /* X2 := [a.z][b.z] */
MOVAPD X1, X3
MULPD X2, X3
- HSUBPD(rX3, rX3) /* x */
+ HSUBPD X3, X3 /* x */
MOVAPD X2, X4
SHUFPD $0x1, X4, X4
MULPD X0, X4
- HSUBPD(rX4, rX4) /* y */
+ HSUBPD X4, X4 /* y */
MOVAPD X0, X5
MULPD X1, X5
SHUFPD $0x1, X5, X5
- HSUBPD(rX5, rX5) /* z */
+ HSUBPD X5, X5 /* z */
MOVQ BP, DI
MOVSD X3, 0(DI)
MOVSD X4, 8(DI)
diff --git a/main.c b/main.c
index 51a2f89..1c22cd8 100644
--- a/main.c
+++ b/main.c
@@ -3,6 +3,7 @@
#include <geometry.h>
double min(double, double);
+double dotvec2_sse(Point2, Point2);
double dotvec2_sse4(Point2, Point2);
double dotvec2_avx(Point2, Point2);
double dotvec3_sse4(Point3, Point3);
@@ -14,6 +15,15 @@ double fma(double, double, double);
Point2 addpt2_sse(Point2, Point2);
Point2 addpt2_avx(Point2, Point2);
Point3 addpt3_avx(Point3, Point3);
+void addsub_sse(double*,double*);
+double round(double);
+
+void
+addsub(double *a, double *b)
+{
+ b[0] = b[0]-a[0];
+ b[1] = b[1]+a[1];
+}
double
fmin(double a, double b)
@@ -31,6 +41,7 @@ void
main(int argc, char *argv[])
{
double a, b, r;
+ double va[2], vb[2];
Point2 p0, p1, pr;
Point3 p0t, p1t, prt;
@@ -56,6 +67,9 @@ main(int argc, char *argv[])
r = dotvec2(p0, p1);
print("dotvec2(%v, %v) = %g\n", p0, p1, r);
r = 0;
+ r = dotvec2_sse(p0, p1);
+ print("dotvec2_sse(%v, %v) = %g\n", p0, p1, r);
+ r = 0;
r = dotvec2_sse4(p0, p1);
print("dotvec2_sse4(%v, %v) = %g\n", p0, p1, r);
r = 0;
@@ -127,5 +141,25 @@ main(int argc, char *argv[])
prt = addpt3_avx(p0t, p1t);
print("addpt3_avx(%V, %V) = %V\n", p0t, p1t, prt);
+ print("\n");
+
+ va[0] = va[1] = a;
+ vb[0] = vb[1] = b;
+ print("addsub([%g %g], [%g %g]) = ", va[0], va[1], vb[0], vb[1]);
+ addsub(va, vb);
+ print("[%g %g]\n", vb[0], vb[1]);
+
+ va[0] = va[1] = a;
+ vb[0] = vb[1] = b;
+ print("addsub_sse([%g %g], [%g %g]) = ", va[0], va[1], vb[0], vb[1]);
+ addsub_sse(va, vb);
+ print("[%g %g]\n", vb[0], vb[1]);
+
+ print("\n");
+
+ r = 0;
+ r = round(a);
+ print("round(%g) = %g\n", a, r);
+
exits(nil);
}
diff --git a/sse.h b/sse.h
index 0a711d5..3d7a52f 100644
--- a/sse.h
+++ b/sse.h
@@ -18,26 +18,3 @@
/* MODQU */
#define MOVDQU_mr(off, s, d) F3OPi(0x6F, 0x1, (d), (s), (off))
#define MOVDQU_rm(off, s, d) F3OPi(0x7F, 0x1, (s), (d), (off))
-
-/* MOVLPD */
-//opcode = 660F12
-//modrm = 01 000 000 [AX → X0] / 01 001 000 [AX → X1]
-//disp8 = 8 / 32
-//#define MOVLPD(off, s, d) OPi(0x12, 0x1, (d), (s), (off))
-
-/* MOVHPD */
-//opcode = 660F16
-//modrm = 01 000 000 [AX → X0] / 01 001 000 [AX → X1]
-//disp8 = 16 / 40
-//#define MOVHPD(off, s, d) OPi(0x16, 0x1, (d), (s), (off))
-
-/* HSUBPD */
-//opcode = 660F7D = 01100110 00001111 01111101
-//modrm = 11 000 000 [X0 → X0]
-#define HSUBPD(s, d) OP(0x7D, 0x3, (d), (s))
-
-/* DPPD */
-//opcode = 660F3A41 = 01100110 00001111 00111010 01000001
-//modrm = 11 000 001 [X1 → X0]
-//imm8 = 0011 0001
-#define DPPD(s, d) OP4i(0x413A, 0x3, (d), (s), 0x31)