aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorrodri <rgl@antares-labs.eu>2023-12-01 21:58:15 +0000
committerrodri <rgl@antares-labs.eu>2023-12-01 21:58:15 +0000
commitcdfd06439c4c4354e9fdc4f4124149a8d7abdfe5 (patch)
tree7a943ff0655e8df9571a097c6beafa2a024f08a4
parenta0b600a89c2e6e636579fe727235d036c08c7a9d (diff)
downloadamd64-simd-cdfd06439c4c4354e9fdc4f4124149a8d7abdfe5.tar.gz
amd64-simd-cdfd06439c4c4354e9fdc4f4124149a8d7abdfe5.tar.bz2
amd64-simd-cdfd06439c4c4354e9fdc4f4124149a8d7abdfe5.zip
implement memory aligned versions of some functions.
-rw-r--r--bench/main.c57
-rw-r--r--dppd.s33
-rw-r--r--main.c51
3 files changed, 139 insertions, 2 deletions
diff --git a/bench/main.c b/bench/main.c
index 6f4886f..514fb7c 100644
--- a/bench/main.c
+++ b/bench/main.c
@@ -8,8 +8,12 @@ double min(double, double);
double dotvec2_sse(Point2, Point2);
double dotvec2_sse4(Point2, Point2);
double dotvec2_avx(Point2, Point2);
+double dotvec2_sse_a(Point2*, Point2*);
+double dotvec2_sse4_a(Point2*, Point2*);
+double dotvec2_avx_a(Point2*, Point2*);
double dotvec3_sse4(Point3, Point3);
double dotvec3_avx(Point3, Point3);
+double dotvec3_sse4_a(Point3*, Point3*);
Point2 Pt2b(double, double, double);
Point3 crossvec3_sse(Point3, Point3);
double hsubpd(double, double);
@@ -18,6 +22,21 @@ Point2 addpt2_sse(Point2, Point2);
Point2 addpt2_avx(Point2, Point2);
Point3 addpt3_avx(Point3, Point3);
+/*
+ * amalloc returns a pointer to at least n bytes whose address is a
+ * multiple of a. a must be a power of two greater than one.
+ *
+ * The block is obtained by asking malloc for a-1 extra bytes and
+ * rounding the address up, so the result is generally not the pointer
+ * malloc returned: it must not be passed to free, and the underlying
+ * allocation is never released.
+ *
+ * Calls sysfatal if malloc fails.
+ */
+void *
+amalloc(ulong n, ulong a)
+{
+ void *p;
+
+ /* the rounding mask below is only correct for powers of two; being even (6, 12, ...) is not enough */
+ assert(a > 1 && (a&(a-1)) == 0);
+
+ a--; /* a is now the mask of low bits that must be zero */
+ p = malloc(n+a);
+ if(p == nil)
+ sysfatal("malloc: %r");
+ /* widen the mask before inverting it, so that the high pointer bits survive when ulong is narrower than uintptr */
+ p = (void*)(((uintptr)p + a)&~(uintptr)a);
+ return p;
+}
+
double
fmin(double a, double b)
{
@@ -65,8 +84,9 @@ static void
bdotvec2(int fd)
{
Bgr g;
- B *b0, *b1, *b2, *b3;
+ B *b0, *b1, *b2, *b3, *b4, *b5, *b6;
Point2 a, b;
+ Point2 *aa, *bb;
int i;
benchinitgr(&g, "2d dot product");
@@ -74,10 +94,17 @@ bdotvec2(int fd)
b1 = benchadd(&g, "dotvec2_sse");
b2 = benchadd(&g, "dotvec2_sse4");
b3 = benchadd(&g, "dotvec2_avx");
+ b4 = benchadd(&g, "dotvec2_sse_a");
+ b5 = benchadd(&g, "dotvec2_sse4_a");
+ b6 = benchadd(&g, "dotvec2_avx_a");
while(b0->n > 0 || b1->n > 0){
a = Vec2(truerand()*frand(), truerand()*frand());
b = Vec2(truerand()*frand(), truerand()*frand());
+ aa = amalloc(sizeof(Point2), 16);
+ bb = amalloc(sizeof(Point2), 16);
+ *aa = a;
+ *bb = b;
benchin(b0);
for(i = 0; i < 1e6; i++)
@@ -98,6 +125,21 @@ bdotvec2(int fd)
for(i = 0; i < 1e6; i++)
dotvec2_avx(a, b);
benchout(b3);
+
+ benchin(b4);
+ for(i = 0; i < 1e6; i++)
+ dotvec2_sse_a(aa, bb);
+ benchout(b4);
+
+ benchin(b5);
+ for(i = 0; i < 1e6; i++)
+ dotvec2_sse4_a(aa, bb);
+ benchout(b5);
+
+ benchin(b6);
+ for(i = 0; i < 1e6; i++)
+ dotvec2_avx_a(aa, bb);
+ benchout(b6);
}
benchprintgr(&g, fd);
@@ -108,18 +150,24 @@ static void
bdotvec3(int fd)
{
Bgr g;
- B *b0, *b1, *b2;
+ B *b0, *b1, *b2, *b3;
Point3 a, b;
+ Point3 *aa, *bb;
int i;
benchinitgr(&g, "3d dot product");
b0 = benchadd(&g, "dotvec3");
b1 = benchadd(&g, "dotvec3_sse4");
b2 = benchadd(&g, "dotvec3_avx");
+ b3 = benchadd(&g, "dotvec3_sse4_a");
while(b0->n > 0 || b1->n > 0){
a = Vec3(truerand()*frand(), truerand()*frand(), truerand()*frand());
b = Vec3(truerand()*frand(), truerand()*frand(), truerand()*frand());
+ aa = amalloc(sizeof(Point3), 16);
+ bb = amalloc(sizeof(Point3), 16);
+ *aa = a;
+ *bb = b;
benchin(b0);
for(i = 0; i < 1e6; i++)
@@ -135,6 +183,11 @@ bdotvec3(int fd)
for(i = 0; i < 1e6; i++)
dotvec3_avx(a, b);
benchout(b2);
+
+ benchin(b3);
+ for(i = 0; i < 1e6; i++)
+ dotvec3_sse4_a(aa, bb);
+ benchout(b3);
}
benchprintgr(&g, fd);
diff --git a/dppd.s b/dppd.s
index d480ddb..4c07876 100644
--- a/dppd.s
+++ b/dppd.s
@@ -41,6 +41,29 @@ TEXT dotvec2_avx(SB), 1, $0
VZEROUPPER
RET
+/*
+ * double dotvec2_sse_a(Point2 *a, Point2 *b)
+ *
+ * Dot product of the first two doubles of *a and *b using MULPD + HADDPD.
+ * Both pointers must be 16-byte aligned because the loads use MOVAPD.
+ * NOTE(review): assumes the first argument (a) arrives in BP and the
+ * double result is returned in X0, per the Plan 9 amd64 calling
+ * convention -- confirm.
+ */
+TEXT dotvec2_sse_a(SB), 1, $0
+ MOVQ b+8(FP), DX /* DX = b, taken from the stack frame */
+ MOVAPD 0(DX), X1 /* X1 = first two doubles of *b (aligned load) */
+ MOVAPD 0(BP), X0 /* X0 = first two doubles of *a (aligned load) */
+ MULPD X1, X0 /* lane-wise product */
+ HADDPD X0, X0 /* low lane = sum of the two products */
+ RET
+
+/*
+ * double dotvec2_sse4_a(Point2 *a, Point2 *b)
+ *
+ * Dot product of the first two doubles of *a and *b using the SSE4.1
+ * DPPD instruction. Both pointers must be 16-byte aligned because the
+ * loads use MOVAPD.
+ * NOTE(review): assumes the first argument (a) arrives in BP and the
+ * double result is returned in X0, per the Plan 9 amd64 calling
+ * convention -- confirm.
+ */
+TEXT dotvec2_sse4_a(SB), 1, $0
+ MOVQ b+8(FP), DX /* DX = b, taken from the stack frame */
+ MOVAPD 0(DX), X1 /* X1 = first two doubles of *b (aligned load) */
+ MOVAPD 0(BP), X0 /* X0 = first two doubles of *a (aligned load) */
+ DPPD $0x31, X1, X0 /* 0x31: multiply both lanes, store the sum in the low lane of X0 */
+ RET
+
+/*
+ * double dotvec2_avx_a(Point2 *a, Point2 *b)
+ *
+ * AVX-encoded variant of the aligned 2D dot product.
+ * NOTE(review): VMOVAPD_128mr and VDPPD are macros defined outside this
+ * hunk; the comments below assume VMOVAPD_128mr(off, base, dst) emits a
+ * 128-bit aligned load from off(base) into dst -- confirm against the
+ * macro definitions. Also assumes a arrives in BP and the result is
+ * returned in X0 (Plan 9 amd64 calling convention).
+ */
+TEXT dotvec2_avx_a(SB), 1, $0
+ MOVQ b+8(FP), DX /* DX = b, taken from the stack frame */
+ VMOVAPD_128mr(0, rDX, rX0) /* presumably X0 = first two doubles of *b */
+ VMOVAPD_128mr(0, rBP, rX1) /* presumably X1 = first two doubles of *a */
+ VDPPD(rX1, rX0, rX0) /* VDPPD $0x31, X1, X0, X0 */
+ VZEROUPPER
+ RET
+
TEXT dotvec3_sse4(SB), 1, $0
MOVUPD a+0(FP), X0
MOVUPD b+32(FP), X1
@@ -63,6 +86,16 @@ TEXT dotvec3_avx(SB), 1, $0
VZEROUPPER
RET
+/*
+ * double dotvec3_sse4_a(Point3 *a, Point3 *b)
+ *
+ * Dot product of the first three doubles of *a and *b: DPPD handles the
+ * first pair, the third component is multiplied and added as a scalar.
+ * Both pointers must be 16-byte aligned because the loads use MOVAPD.
+ * NOTE(review): assumes the double at offset 16 is the z component, that
+ * a arrives in BP and that the result is returned in X0 (Plan 9 amd64
+ * calling convention) -- confirm.
+ */
+TEXT dotvec3_sse4_a(SB), 1, $0
+ MOVQ b+8(FP), DX /* DX = b, taken from the stack frame */
+ MOVAPD 0(DX), X0 /* X0 = first two doubles of *b (aligned load) */
+ MOVAPD 0(BP), X1 /* X1 = first two doubles of *a (aligned load) */
+ DPPD $0x31, X1, X0 /* low lane of X0 = sum of the two lane products */
+ MOVSD 16(DX), X1 /* X1 = third double of *b */
+ MULSD 16(BP), X1 /* times third double of *a */
+ ADDSD X1, X0 /* add the third product to the partial sum */
+ RET
+
TEXT Pt2b(SB), 1, $0
MOVQ BP, DI
MOVSD x+8(FP), X0
diff --git a/main.c b/main.c
index 1c22cd8..ac27a80 100644
--- a/main.c
+++ b/main.c
@@ -6,8 +6,12 @@ double min(double, double);
double dotvec2_sse(Point2, Point2);
double dotvec2_sse4(Point2, Point2);
double dotvec2_avx(Point2, Point2);
+double dotvec2_sse_a(Point2*, Point2*);
+double dotvec2_sse4_a(Point2*, Point2*);
+double dotvec2_avx_a(Point2*, Point2*);
double dotvec3_sse4(Point3, Point3);
double dotvec3_avx(Point3, Point3);
+double dotvec3_sse4_a(Point3*, Point3*);
Point2 Pt2b(double, double, double);
Point3 crossvec3_sse(Point3, Point3);
double hsubpd(double, double);
@@ -18,6 +22,21 @@ Point3 addpt3_avx(Point3, Point3);
void addsub_sse(double*,double*);
double round(double);
+/*
+ * amalloc returns a pointer to at least n bytes whose address is a
+ * multiple of a. a must be a power of two greater than one.
+ *
+ * The block is obtained by asking malloc for a-1 extra bytes and
+ * rounding the address up, so the result is generally not the pointer
+ * malloc returned: it must not be passed to free, and the underlying
+ * allocation is never released.
+ *
+ * Calls sysfatal if malloc fails.
+ */
+void *
+amalloc(ulong n, ulong a)
+{
+ void *p;
+
+ /* the rounding mask below is only correct for powers of two; being even (6, 12, ...) is not enough */
+ assert(a > 1 && (a&(a-1)) == 0);
+
+ a--; /* a is now the mask of low bits that must be zero */
+ p = malloc(n+a);
+ if(p == nil)
+ sysfatal("malloc: %r");
+ /* widen the mask before inverting it, so that the high pointer bits survive when ulong is narrower than uintptr */
+ p = (void*)(((uintptr)p + a)&~(uintptr)a);
+ return p;
+}
+
void
addsub(double *a, double *b)
{
@@ -44,6 +63,8 @@ main(int argc, char *argv[])
double va[2], vb[2];
Point2 p0, p1, pr;
Point3 p0t, p1t, prt;
+ Point2 *ap0, *ap1, *apr;
+ Point3 *ap0t, *ap1t, *aprt;
GEOMfmtinstall();
ARGBEGIN{default:sysfatal("shit");}ARGEND
@@ -52,6 +73,14 @@ main(int argc, char *argv[])
a = strtod(argv[0], nil);
b = strtod(argv[1], nil);
+ ap0 = amalloc(sizeof(Point2), 16);
+ ap1 = amalloc(sizeof(Point2), 16);
+ apr = amalloc(sizeof(Point2), 16);
+
+ ap0t = amalloc(sizeof(Point3), 16);
+ ap1t = amalloc(sizeof(Point3), 16);
+ aprt = amalloc(sizeof(Point3), 16);
+
r = 0;
r = fmin(a, b);
print("fmin(%g, %g) = %g\n", a, b, r);
@@ -78,6 +107,20 @@ main(int argc, char *argv[])
print("\n");
+ *ap0 = Pt2b(a, 1, 1);
+ *ap1 = Pt2b(b, 3, 1);
+ r = 0;
+ r = dotvec2_sse_a(ap0, ap1);
+ print("dotvec2_sse_a(%v, %v) = %g\n", *ap0, *ap1, r);
+ r = 0;
+ r = dotvec2_sse4_a(ap0, ap1);
+ print("dotvec2_sse4_a(%v, %v) = %g\n", *ap0, *ap1, r);
+ r = 0;
+ r = dotvec2_avx_a(ap0, ap1);
+ print("dotvec2_avx_a(%v, %v) = %g\n", *ap0, *ap1, r);
+
+ print("\n");
+
p0t = Pt3(a, 1, 9, 1);
p1t = Pt3(b, 3, 4, 1);
r = 0;
@@ -92,6 +135,14 @@ main(int argc, char *argv[])
print("\n");
+ *ap0t = Pt3(a, 1, 9, 1);
+ *ap1t = Pt3(b, 3, 4, 1);
+ r = 0;
+ r = dotvec3_sse4_a(ap0t, ap1t);
+ print("dotvec3_sse4_a(%V, %V) = %g\n", *ap0t, *ap1t, r);
+
+ print("\n");
+
r = 0;
r = hsubpd(a, b);
print("hsubpd(%g, %g) = %g\n", a, b, r);