diff options
-rw-r--r-- | avx.h | 16 | ||||
-rw-r--r-- | dppd.s | 85 | ||||
-rw-r--r-- | main.c | 33 | ||||
-rw-r--r-- | sse.h | 4 |
4 files changed, 87 insertions, 51 deletions
@@ -26,14 +26,14 @@ #define VZEROALL VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_256,VEX_p_NO); BYTE $0x77 /* VMOVUPD */ -#define VMOVUPD_128mr(off, s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_66); \ - VOPi(0x10, 0x1, (d), (s), (off)) -#define VMOVUPD_128rr(s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_66); \ - VOP(0x10, 0x3, (d), (s)) -#define VMOVUPD_256mr(off, s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_256,VEX_p_66); \ - VOPi(0x10, 0x1, (d), (s), (off)) -#define VMOVUPD_256rr(s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_256,VEX_p_66); \ - VOP(0x10, 0x3, (d), (s)) +#define VMOVUPD_128mr(s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_66); \ + VOP(0x10, 0x0, (d), (s)) +#define VMOVUPD_128rm(s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_66); \ + VOP(0x11, 0x0, (s), (d)) +#define VMOVUPD_256mr(s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_256,VEX_p_66); \ + VOP(0x10, 0x0, (d), (s)) +#define VMOVUPD_256rm(s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_256,VEX_p_66); \ + VOP(0x11, 0x0, (s), (d)) /* VMOVAPD */ #define VMOVAPD_128mr(off, s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_66); \ @@ -6,40 +6,41 @@ DATA one(SB)/8,$1.0 GLOBL one(SB), $8 TEXT dotvec2_sse4(SB), 1, $0 - MOVQ SP, AX - MOVDQU_mr(8, rAX, rX0) /* MOVDQU a+0(FP), X0 */ - MOVDQU_mr(32, rAX, rX1) /* MOVDQU b+24(FP), X1 */ + MOVUPD a+0(FP), X0 + MOVUPD b+24(FP), X1 DPPD(rX1, rX0) /* DPPD $0x31, X1, X0 */ RET TEXT dotvec2_avx(SB), 1, $0 MOVQ SP, AX - VMOVUPD_128mr(8, rAX, rX0) /* VMOVUPD a+0(FP), X0 */ - VMOVUPD_128mr(32, rAX, rX1) /* VMOVUPD b+24(FP), X1 */ + ADDQ $8, AX + VMOVUPD_128mr(rAX, rX0) + ADDQ $24, AX + VMOVUPD_128mr(rAX, rX1) VDPPD(rX1, rX0, rX0) /* VDPPD $0x31, X1, X0, X0 */ + VZEROUPPER RET TEXT dotvec3_sse4(SB), 1, $0 - MOVQ SP, AX - MOVLPD(8, rAX, rX0) /* MOVLPD a+0(FP), X0 */ - MOVHPD(16, rAX, rX0) /* MOVHPD a+8(FP), X0 */ - MOVLPD(40, rAX, rX1) /* MOVLPD b+32(FP), X1 */ - MOVHPD(48, rAX, rX1) /* MOVHPD b+40(FP), X1 */ - DPPD(rX1, rX0) /* DPPD $0x31, X1, X0 */ - MOVSD one(SB), X1 - MOVHPD(24, rAX, rX0) /* MOVHPD a+16(FP), X0 */ - MOVHPD(56, rAX, rX1) /* MOVHPD b+48(FP), X1 */ + MOVUPD a+0(FP), X0 + MOVUPD b+32(FP), X1 DPPD(rX1, rX0) /* DPPD $0x31, X1, X0 */ + MOVSD a+16(FP), X1 + MULSD b+48(FP), X1 + ADDSD X1, X0 RET TEXT dotvec3_avx(SB), 1, $0 MOVQ SP, AX - VMOVUPD_128mr(8, rAX, rX0) /* VMOVUPD a+0(FP), X0 */ - VMOVUPD_128mr(40, rAX, rX1) /* VMOVUPD b+32(FP), X1 */ + ADDQ $8, AX + VMOVUPD_128mr(rAX, rX0) + ADDQ $32, AX + VMOVUPD_128mr(rAX, rX1) VDPPD(rX1, rX0, rX0) /* VDPPD $0x31, X1, X0, X0 */ MOVSD a+16(FP), X1 MOVSD b+48(FP), X2 VFMADD231SD(rX1, rX2, rX0) + VZEROUPPER RET TEXT Pt2b(SB), 1, $0 @@ -53,21 +54,18 @@ TEXT Pt2b(SB), 1, $0 RET TEXT hsubpd(SB), 1, $0 - MOVQ SP, AX - MOVLPD(8, rAX, rX0) /* MOVLPD a+0(FP), X0 */ - MOVHPD(16, rAX, rX0) /* MOVHPD b+8(FP), X0 */ + MOVLPD a+0(FP), X0 + MOVHPD b+8(FP), X0 HSUBPD(rX0, rX0) /* HSUBPD X0, X0 */ RET TEXT crossvec3_sse(SB), 1, $0 - MOVQ SP, AX - ADDQ $8, AX - MOVLPD(40, rAX, rX0) /* MOVLPD b+32(FP), X0 */ - MOVHPD(8, rAX, rX0) /* MOVHPD a+0(FP), X0 */ - MOVLPD(16, rAX, rX1) /* MOVLPD a+8(FP), X1 */ - MOVHPD(48, rAX, rX1) /* MOVHPD b+40(FP), X1 */ - MOVLPD(56, rAX, rX2) /* MOVLPD b+48(FP), X2 */ - MOVHPD(24, rAX, rX2) /* MOVHPD a+16(FP), X2 */ + MOVLPD b+40(FP), X0 + MOVHPD a+8(FP), X0 /* X0 := [a.x][b.x] */ + MOVLPD a+16(FP), X1 + MOVHPD b+48(FP), X1 /* X1 := [b.y][a.y] */ + MOVLPD b+56(FP), X2 + MOVHPD a+24(FP), X2 /* X2 := [a.z][b.z] */ MOVAPD X1, X3 MULPD X2, X3 HSUBPD(rX3, rX3) /* x */ @@ -99,21 +97,38 @@ TEXT fma(SB), 1, $0 VFMADD231SD(rX1, rX2, rX0) RET +TEXT addpt2_sse(SB), 1, $0 + MOVUPD a+8(FP), X0 + MOVUPD b+32(FP), X1 + ADDPD X1, X0 + MOVSD a+24(FP), X2 + ADDSD b+48(FP), X2 + MOVQ BP, DI + MOVUPD X0, (DI) + MOVSD X2, 16(DI) + RET + /* TODO: write only 24 bytes */ TEXT addpt2_avx(SB), 1, $0 MOVQ SP, AX - ADDQ $8, AX - VMOVDQU_256mr(8, rAX, rX0) - VMOVDQU_256mr(32, rAX, rX1) + ADDQ $16, AX + VMOVUPD_256mr(rAX, rX0) + ADDQ $24, AX + VMOVUPD_256mr(rAX, rX1) VADDPD_256rr(rX1, rX0, rX0) - VMOVDQU_256rm(rX0, rAX) + MOVQ BP, DI + VMOVUPD_256rm(rX0, rDI) + VZEROUPPER RET TEXT addpt3_avx(SB), 1, $0 MOVQ SP, AX - ADDQ $8, AX - VMOVDQU_256mr(8, rAX, rX0) - VMOVDQU_256mr(40, rAX, rX1) + ADDQ $16, AX + VMOVUPD_256mr(rAX, rX0) + ADDQ $32, AX + VMOVUPD_256mr(rAX, rX1) VADDPD_256rr(rX1, rX0, rX0) - VMOVDQU_256rm(rX0, rAX) + MOVQ BP, DI + VMOVUPD_256rm(rX0, rDI) + VZEROUPPER RET @@ -11,6 +11,7 @@ Point2 Pt2b(double, double, double); Point3 crossvec3_sse(Point3, Point3); double hsubpd(double, double); double fma(double, double, double); +Point2 addpt2_sse(Point2, Point2); Point2 addpt2_avx(Point2, Point2); Point3 addpt3_avx(Point3, Point3); @@ -40,8 +41,10 @@ main(int argc, char *argv[]) a = strtod(argv[0], nil); b = strtod(argv[1], nil); + r = 0; r = fmin(a, b); print("fmin(%g, %g) = %g\n", a, b, r); + r = 0; r = min(a, b); print("min(%g, %g) = %g\n", a, b, r); @@ -49,10 +52,13 @@ main(int argc, char *argv[]) p0 = Pt2b(a, 1, 1); p1 = Pt2b(b, 3, 1); - r = dotvec2_sse4(p0, p1); - print("dotvec2_sse4(%v, %v) = %g\n", p0, p1, r); + r = 0; r = dotvec2(p0, p1); print("dotvec2(%v, %v) = %g\n", p0, p1, r); + r = 0; + r = dotvec2_sse4(p0, p1); + print("dotvec2_sse4(%v, %v) = %g\n", p0, p1, r); + r = 0; r = dotvec2_avx(p0, p1); print("dotvec2_avx(%v, %v) = %g\n", p0, p1, r); @@ -60,15 +66,19 @@ main(int argc, char *argv[]) p0t = Pt3(a, 1, 9, 1); p1t = Pt3(b, 3, 4, 1); - r = dotvec3_sse4(p0t, p1t); - print("dotvec3_sse4(%V, %V) = %g\n", p0t, p1t, r); + r = 0; r = dotvec3(p0t, p1t); print("dotvec3(%V, %V) = %g\n", p0t, p1t, r); + r = 0; + r = dotvec3_sse4(p0t, p1t); + print("dotvec3_sse4(%V, %V) = %g\n", p0t, p1t, r); + r = 0; r = dotvec3_avx(p0t, p1t); print("dotvec3_avx(%V, %V) = %g\n", p0t, p1t, r); print("\n"); + r = 0; r = hsubpd(a, b); print("hsubpd(%g, %g) = %g\n", a, b, r); @@ -76,15 +86,19 @@ main(int argc, char *argv[]) p0t = Pt3(a, 1, 9, 1); p1t = Pt3(b, 3, 4, 1); - prt = crossvec3_sse(p0t, p1t); - print("crossvec3_sse(%V, %V) = %V\n", p0t, p1t, prt); + prt = Vec3(0,0,0); prt = crossvec3(p0t, p1t); print("crossvec3(%V, %V) = %V\n", p0t, p1t, prt); + prt = Vec3(0,0,0); + prt = crossvec3_sse(p0t, p1t); + print("crossvec3_sse(%V, %V) = %V\n", p0t, p1t, prt); print("\n"); + r = 0; r = madd(a, b, 21); print("madd(%g, %g, 21) = %g\n", a, b, r); + r = 0; r = fma(a, b, 21); print("fma(%g, %g, 21) = %g\n", a, b, r); @@ -92,8 +106,13 @@ main(int argc, char *argv[]) p0 = Pt2b(a, 1, 1); p1 = Pt2b(b, 3, 1); + pr = Vec2(0,0); pr = addpt2(p0, p1); print("addpt2(%v, %v) = %v\n", p0, p1, pr); + pr = Vec2(0,0); + pr = addpt2_sse(p0, p1); + print("addpt2_sse(%v, %v) = %v\n", p0, p1, pr); + pr = Vec2(0,0); pr = addpt2_avx(p0, p1); print("addpt2_avx(%v, %v) = %v\n", p0, p1, pr); @@ -101,8 +120,10 @@ main(int argc, char *argv[]) p0t = Pt3(a, 1, 1, b); p1t = Pt3(b, 3, 1, a); + prt = Vec3(0,0,0); prt = addpt3(p0t, p1t); print("addpt3(%V, %V) = %V\n", p0t, p1t, prt); + prt = Vec3(0,0,0); prt = addpt3_avx(p0t, p1t); print("addpt3_avx(%V, %V) = %V\n", p0t, p1t, prt); @@ -23,13 +23,13 @@ //opcode = 660F12 //modrm = 01 000 000 [AX → X0] / 01 001 000 [AX → X1] //disp8 = 8 / 32 -#define MOVLPD(off, s, d) OPi(0x12, 0x1, (d), (s), (off)) +//#define MOVLPD(off, s, d) OPi(0x12, 0x1, (d), (s), (off)) /* MOVHPD */ //opcode = 660F16 //modrm = 01 000 000 [AX → X0] / 01 001 000 [AX → X1] //disp8 = 16 / 40 -#define MOVHPD(off, s, d) OPi(0x16, 0x1, (d), (s), (off)) +//#define MOVHPD(off, s, d) OPi(0x16, 0x1, (d), (s), (off)) /* HSUBPD */ //opcode = 660F7D = 01100110 00001111 01111101 |