From cdfd06439c4c4354e9fdc4f4124149a8d7abdfe5 Mon Sep 17 00:00:00 2001
From: rodri
Date: Fri, 1 Dec 2023 21:58:15 +0000
Subject: implement memory aligned versions of some functions.

---
 bench/main.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 dppd.s       | 33 +++++++++++++++++++++++++++++++++
 main.c       | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 139 insertions(+), 2 deletions(-)

diff --git a/bench/main.c b/bench/main.c
index 6f4886f..514fb7c 100644
--- a/bench/main.c
+++ b/bench/main.c
@@ -8,8 +8,12 @@ double min(double, double);
 double dotvec2_sse(Point2, Point2);
 double dotvec2_sse4(Point2, Point2);
 double dotvec2_avx(Point2, Point2);
+double dotvec2_sse_a(Point2*, Point2*);
+double dotvec2_sse4_a(Point2*, Point2*);
+double dotvec2_avx_a(Point2*, Point2*);
 double dotvec3_sse4(Point3, Point3);
 double dotvec3_avx(Point3, Point3);
+double dotvec3_sse4_a(Point3*, Point3*);
 Point2 Pt2b(double, double, double);
 Point3 crossvec3_sse(Point3, Point3);
 double hsubpd(double, double);
@@ -18,6 +22,21 @@ Point2 addpt2_sse(Point2, Point2);
 Point2 addpt2_avx(Point2, Point2);
 Point3 addpt3_avx(Point3, Point3);
 
+void *
+amalloc(ulong n, ulong a)	/* n usable bytes at an address that is a multiple of a */
+{
+	void *p;
+
+	assert(a > 1 && (a&(a-1)) == 0);	/* power of two: the mask below is wrong for e.g. a = 6 */
+
+	a--;	/* a is now the low-bits mask */
+	p = malloc(n+a);	/* a spare bytes leave room to round up */
+	if(p == nil)
+		sysfatal("malloc: %r");
+	p = (void*)(((uintptr)p + a)&~(uintptr)a);	/* widen before ~ so high pointer bits survive; p is no longer free()able */
+	return p;
+}
+
 double
 fmin(double a, double b)
 {
@@ -65,8 +84,9 @@ static void
 bdotvec2(int fd)
 {
 	Bgr g;
-	B *b0, *b1, *b2, *b3;
+	B *b0, *b1, *b2, *b3, *b4, *b5, *b6;
 	Point2 a, b;
+	Point2 *aa, *bb;
 	int i;
 
 	benchinitgr(&g, "2d dot product");
@@ -74,10 +94,17 @@ bdotvec2(int fd)
 	b1 = benchadd(&g, "dotvec2_sse");
 	b2 = benchadd(&g, "dotvec2_sse4");
 	b3 = benchadd(&g, "dotvec2_avx");
+	b4 = benchadd(&g, "dotvec2_sse_a");
+	b5 = benchadd(&g, "dotvec2_sse4_a");
+	b6 = benchadd(&g, "dotvec2_avx_a");
+	aa = amalloc(sizeof(Point2), 16);	/* once, outside the loop: amalloc blocks cannot be freed */
+	bb = amalloc(sizeof(Point2), 16);
 
 	while(b0->n > 0 || b1->n > 0){
 		a = Vec2(truerand()*frand(), truerand()*frand());
 		b = Vec2(truerand()*frand(), truerand()*frand());
+		*aa = a;	/* aligned copies of this round's operands */
+		*bb = b;
 
 		benchin(b0);
 		for(i = 0; i < 1e6; i++)
@@ -98,6 +125,21 @@ bdotvec2(int fd)
 		for(i = 0; i < 1e6; i++)
 			dotvec2_avx(a, b);
 		benchout(b3);
+
+		benchin(b4);
+		for(i = 0; i < 1e6; i++)
+			dotvec2_sse_a(aa, bb);
+		benchout(b4);
+
+		benchin(b5);
+		for(i = 0; i < 1e6; i++)
+			dotvec2_sse4_a(aa, bb);
+		benchout(b5);
+
+		benchin(b6);
+		for(i = 0; i < 1e6; i++)
+			dotvec2_avx_a(aa, bb);
+		benchout(b6);
 	}
 
 	benchprintgr(&g, fd);
@@ -108,18 +150,24 @@ static void
 bdotvec3(int fd)
 {
 	Bgr g;
-	B *b0, *b1, *b2;
+	B *b0, *b1, *b2, *b3;
 	Point3 a, b;
+	Point3 *aa, *bb;
 	int i;
 
 	benchinitgr(&g, "3d dot product");
 	b0 = benchadd(&g, "dotvec3");
 	b1 = benchadd(&g, "dotvec3_sse4");
 	b2 = benchadd(&g, "dotvec3_avx");
+	b3 = benchadd(&g, "dotvec3_sse4_a");
+	aa = amalloc(sizeof(Point3), 16);	/* once, outside the loop: amalloc blocks cannot be freed */
+	bb = amalloc(sizeof(Point3), 16);
 
 	while(b0->n > 0 || b1->n > 0){
 		a = Vec3(truerand()*frand(), truerand()*frand(), truerand()*frand());
 		b = Vec3(truerand()*frand(), truerand()*frand(), truerand()*frand());
+		*aa = a;	/* aligned copies of this round's operands */
+		*bb = b;
 
 		benchin(b0);
 		for(i = 0; i < 1e6; i++)
@@ -135,6 +183,11 @@ bdotvec3(int fd)
 		for(i = 0; i < 1e6; i++)
 			dotvec3_avx(a, b);
 		benchout(b2);
+
+		benchin(b3);
+		for(i = 0; i < 1e6; i++)
+			dotvec3_sse4_a(aa, bb);
+		benchout(b3);
 	}
 
 	benchprintgr(&g, fd);
diff --git a/dppd.s b/dppd.s
index d480ddb..4c07876 100644
--- a/dppd.s
+++ b/dppd.s
@@ -41,6 +41,29 @@ TEXT dotvec2_avx(SB), 1, $0
 	VZEROUPPER
 	RET
 
+TEXT dotvec2_sse_a(SB), 1, $0
+	MOVQ b+8(FP), DX
+	MOVAPD 0(DX), X1	/* aligned load of *b: needs a 16-byte boundary */
+	MOVAPD 0(BP), X0	/* aligned load through BP, the other operand */
+	MULPD X1, X0
+	HADDPD X0, X0	/* add the two lane products */
+	RET
+
+TEXT dotvec2_sse4_a(SB), 1, $0
+	MOVQ b+8(FP), DX
+	MOVAPD 0(DX), X1	/* aligned load of *b: needs a 16-byte boundary */
+	MOVAPD 0(BP), X0	/* aligned load through BP, the other operand */
+	DPPD $0x31, X1, X0	/* multiply both lanes, sum into lane 0 */
+	RET
+
+TEXT dotvec2_avx_a(SB), 1, $0
+	MOVQ b+8(FP), DX
+	VMOVAPD_128mr(0, rDX, rX0)
+	VMOVAPD_128mr(0, rBP, rX1)
+	VDPPD(rX1, rX0, rX0)		/* VDPPD $0x31, X1, X0, X0 */
+	VZEROUPPER
+	RET
+
 TEXT dotvec3_sse4(SB), 1, $0
 	MOVUPD a+0(FP), X0
 	MOVUPD b+32(FP), X1
@@ -63,6 +86,16 @@ TEXT dotvec3_avx(SB), 1, $0
 	VZEROUPPER
 	RET
 
+TEXT dotvec3_sse4_a(SB), 1, $0
+	MOVQ b+8(FP), DX
+	MOVAPD 0(DX), X0	/* first two doubles of *b, aligned load */
+	MOVAPD 0(BP), X1	/* first two doubles of the BP operand, aligned load */
+	DPPD $0x31, X1, X0	/* multiply both lanes, sum into lane 0 */
+	MOVSD 16(DX), X1	/* third double of *b (presumably z) */
+	MULSD 16(BP), X1
+	ADDSD X1, X0	/* add the third product to the 2d dot */
+	RET
+
 TEXT Pt2b(SB), 1, $0
 	MOVQ BP, DI
 	MOVSD x+8(FP), X0
diff --git a/main.c b/main.c
index 1c22cd8..ac27a80 100644
--- a/main.c
+++ b/main.c
@@ -6,8 +6,12 @@ double min(double, double);
 double dotvec2_sse(Point2, Point2);
 double dotvec2_sse4(Point2, Point2);
 double dotvec2_avx(Point2, Point2);
+double dotvec2_sse_a(Point2*, Point2*);
+double dotvec2_sse4_a(Point2*, Point2*);
+double dotvec2_avx_a(Point2*, Point2*);
 double dotvec3_sse4(Point3, Point3);
 double dotvec3_avx(Point3, Point3);
+double dotvec3_sse4_a(Point3*, Point3*);
 Point2 Pt2b(double, double, double);
 Point3 crossvec3_sse(Point3, Point3);
 double hsubpd(double, double);
@@ -18,6 +22,21 @@ Point3 addpt3_avx(Point3, Point3);
 void addsub_sse(double*,double*);
 double round(double);
 
+void *
+amalloc(ulong n, ulong a)	/* n usable bytes at an address that is a multiple of a */
+{
+	void *p;
+
+	assert(a > 1 && (a&(a-1)) == 0);	/* power of two: the mask below is wrong for e.g. a = 6 */
+
+	a--;	/* a is now the low-bits mask */
+	p = malloc(n+a);	/* a spare bytes leave room to round up */
+	if(p == nil)
+		sysfatal("malloc: %r");
+	p = (void*)(((uintptr)p + a)&~(uintptr)a);	/* widen before ~ so high pointer bits survive; p is no longer free()able */
+	return p;
+}
+
 void
 addsub(double *a, double *b)
 {
@@ -44,6 +63,8 @@ main(int argc, char *argv[])
 	double va[2], vb[2];
 	Point2 p0, p1, pr;
 	Point3 p0t, p1t, prt;
+	Point2 *ap0, *ap1, *apr;
+	Point3 *ap0t, *ap1t, *aprt;
 
 	GEOMfmtinstall();
 	ARGBEGIN{default:sysfatal("shit");}ARGEND
@@ -52,6 +73,14 @@ main(int argc, char *argv[])
 	a = strtod(argv[0], nil);
 	b = strtod(argv[1], nil);
 
+	ap0 = amalloc(sizeof(Point2), 16);	/* 16-byte aligned operands for the *_a routines */
+	ap1 = amalloc(sizeof(Point2), 16);
+	apr = amalloc(sizeof(Point2), 16);
+
+	ap0t = amalloc(sizeof(Point3), 16);
+	ap1t = amalloc(sizeof(Point3), 16);
+	aprt = amalloc(sizeof(Point3), 16);
+
 	r = 0;
 	r = fmin(a, b);
 	print("fmin(%g, %g) = %g\n", a, b, r);
@@ -78,6 +107,20 @@ main(int argc, char *argv[])
 
 	print("\n");
 
+	*ap0 = Pt2b(a, 1, 1);
+	*ap1 = Pt2b(b, 3, 1);
+	r = 0;
+	r = dotvec2_sse_a(ap0, ap1);
+	print("dotvec2_sse_a(%v, %v) = %g\n", *ap0, *ap1, r);
+	r = 0;
+	r = dotvec2_sse4_a(ap0, ap1);
+	print("dotvec2_sse4_a(%v, %v) = %g\n", *ap0, *ap1, r);
+	r = 0;
+	r = dotvec2_avx_a(ap0, ap1);
+	print("dotvec2_avx_a(%v, %v) = %g\n", *ap0, *ap1, r);
+
+	print("\n");
+
 	p0t = Pt3(a, 1, 9, 1);
 	p1t = Pt3(b, 3, 4, 1);
 	r = 0;
@@ -92,6 +135,14 @@ main(int argc, char *argv[])
 
 	print("\n");
 
+	*ap0t = Pt3(a, 1, 9, 1);
+	*ap1t = Pt3(b, 3, 4, 1);
+	r = 0;
+	r = dotvec3_sse4_a(ap0t, ap1t);
+	print("dotvec3_sse4_a(%V, %V) = %g\n", *ap0t, *ap1t, r);
+
+	print("\n");
+
 	r = 0;
 	r = hsubpd(a, b);
 	print("hsubpd(%g, %g) = %g\n", a, b, r);
-- 
cgit v1.2.3