diff options
-rw-r--r-- | dppd.s | 25 | ||||
-rw-r--r-- | main.c | 6 | ||||
-rw-r--r-- | sse.h | 42 |
3 files changed, 47 insertions, 26 deletions
--- a/dppd.s
+++ b/dppd.s
@@ -7,8 +7,8 @@ TEXT dppd(SB), 1, $0
 	MOVQ SP, AX
 	MOVLPD(8, rAX, rX0)	/* MOVLPD a+0(FP), X0 */
 	MOVHPD(16, rAX, rX0)	/* MOVHPD a+8(FP), X0 */
-	MOVLPD(32, rAX, rX1)	/* MOVLPD b+0(FP), X1 */
-	MOVHPD(40, rAX, rX1)	/* MOVHPD b+8(FP), X1*/
+	MOVLPD(32, rAX, rX1)	/* MOVLPD b+24(FP), X1 */
+	MOVHPD(40, rAX, rX1)	/* MOVHPD b+32(FP), X1*/
 	DPPD(rX1, rX0)	/* DPPD $0x31, X1, X0 */
 	RET
 
@@ -16,12 +16,12 @@ TEXT dppd3(SB), 1, $0
 	MOVQ SP, AX
 	MOVLPD(8, rAX, rX0)	/* MOVLPD a+0(FP), X0 */
 	MOVHPD(16, rAX, rX0)	/* MOVHPD a+8(FP), X0 */
-	MOVLPD(40, rAX, rX1)	/* MOVLPD b+0(FP), X1 */
-	MOVHPD(48, rAX, rX1)	/* MOVHPD b+8(FP), X1 */
+	MOVLPD(40, rAX, rX1)	/* MOVLPD b+32(FP), X1 */
+	MOVHPD(48, rAX, rX1)	/* MOVHPD b+40(FP), X1 */
 	DPPD(rX1, rX0)	/* DPPD $0x31, X1, X0 */
 	MOVSD one(SB), X1
 	MOVHPD(24, rAX, rX0)	/* MOVHPD a+16(FP), X0 */
-	MOVHPD(56, rAX, rX1)	/* MOVHPD b+16(FP), X1 */
+	MOVHPD(56, rAX, rX1)	/* MOVHPD b+48(FP), X1 */
 	DPPD(rX1, rX0)	/* DPPD $0x31, X1, X0 */
 	RET
 
@@ -38,18 +38,18 @@ TEXT Pt2b(SB), 1, $0
 TEXT hsubpd(SB), 1, $0
 	MOVQ SP, AX
 	MOVLPD(8, rAX, rX0)	/* MOVLPD a+0(FP), X0 */
-	MOVHPD(16, rAX, rX0)	/* MOVHPD b+0(FP), X0 */
+	MOVHPD(16, rAX, rX0)	/* MOVHPD b+8(FP), X0 */
 	HSUBPD(rX0, rX0)	/* HSUBPD X0, X0 */
 	RET
 
 TEXT xvec3(SB), 1, $0
 	MOVQ SP, AX
 	ADDQ $8, AX
-	MOVLPD(40, rAX, rX0)	/* MOVLPD b+0(FP), X0 */
+	MOVLPD(40, rAX, rX0)	/* MOVLPD b+32(FP), X0 */
 	MOVHPD(8, rAX, rX0)	/* MOVHPD a+0(FP), X0 */
 	MOVLPD(16, rAX, rX1)	/* MOVLPD a+8(FP), X1 */
-	MOVHPD(48, rAX, rX1)	/* MOVHPD b+8(FP), X1 */
-	MOVLPD(56, rAX, rX2)	/* MOVLPD b+16(FP), X2 */
+	MOVHPD(48, rAX, rX1)	/* MOVHPD b+40(FP), X1 */
+	MOVLPD(56, rAX, rX2)	/* MOVLPD b+48(FP), X2 */
 	MOVHPD(24, rAX, rX2)	/* MOVHPD a+16(FP), X2 */
 	MOVAPD X1, X3
 	MULPD X2, X3
@@ -69,3 +69,10 @@ TEXT xvec3(SB), 1, $0
 	XORPD X0, X0
 	MOVSD X0, 24(DI)
 	RET
+
+TEXT fma(SB), 1, $0
+	MOVSD a+0(FP), X0
+	MOVSD b+8(FP), X1
+	MOVSD c+16(FP), X2
+	VFMADD231SD(rX1, rX2, rX0)	/* X0 = X1*X2 + X0: returns a + b*c (C99 fma is a*b + c) */
+	RET
--- a/main.c
+++ b/main.c
@@ -9,6 +9,7 @@ double dppd3(Point3, Point3);
 Point2 Pt2b(double, double, double);
 Point3 xvec3(Point3, Point3);
 double hsubpd(double, double);
+double fma(double, double, double);
 
 double
 fmin(double a, double b)
@@ -78,5 +79,10 @@ main(int argc, char *argv[])
 	t1 = nanosec();
 	print("crossvec3(%V, %V) = %V\ttook %lludns\n", p0t, p1t, pr, t1-t0);
 
+	t0 = nanosec();
+	r = fma(a, b, 21);
+	t1 = nanosec();
+	print("fma(%g, %g, 21) = %g\ttook %lludns\n", a, b, r, t1-t0);
+
 	exits(nil);
 }
--- a/sse.h
+++ b/sse.h
@@ -15,15 +15,33 @@
 #define rX5 5
 #define rX6 6
 
+#define VEX_m_0F	(1)
+#define VEX_m_0F38	(2)
+#define VEX_m_0F3A	(3)
+#define VEX_L_128	(0)
+#define VEX_L_256	(1)
+#define VEX_p_NO	(0)
+#define VEX_p_66	(1)
+#define VEX_p_F3	(2)
+#define VEX_p_F2	(3)
+
 #define OP(o, m, ro, rm)	WORD $0x0F66; BYTE $(o);	\
 			BYTE $(((m)<<6)|((ro)<<3)|(rm))
 #define OPi(o, m, ro, rm, i)	OP((o), (m), (ro), (rm));	\
 			BYTE $(i)
-#define OP4(o, m, ro, rm)	LONG $0x(o)0F66;	\
+#define OP4(o, m, ro, rm)	WORD $0x0F66; WORD $(o);	\
 			BYTE $(((m)<<6)|((ro)<<3)|(rm))
 #define OP4i(o, m, ro, rm, i)	OP4((o), (m), (ro), (rm));	\
 			BYTE $(i)
 
+#define VEX3(r, x, b, m, w, v, l, p)	BYTE $0xC4;	\
+			BYTE $((((~(r))&1)<<7)|(((~(x))&1)<<6)|(((~(b))&1)<<5)|(m));	\
+			BYTE $(((w)<<7)|(((~(v))&0xF)<<3)|((l)<<2)|(p))
+#define VEX2(r, v, l, p)	BYTE $0xC5;	\
+			BYTE $((((~(r))&1)<<7)|(((~(v))&0xF)<<3)|((l)<<2)|(p))
+#define VOP(o, m, ro, rm)	BYTE $(o);	\
+			BYTE $(((m)<<6)|((ro)<<3)|(rm))
+
 /* MOVLPD */
 //opcode = 660F12
 //modrm = 01 000 000 [AX → X0] / 01 001 000 [AX → X1]
@@ -47,20 +65,10 @@
 //imm8 = 0011 0001
 #define DPPD(s, d)	OP4i(0x413A, 0x3, (d), (s), 0x31)
 
+/* VFMADD231SD (128 bit): d = s0*s1 + d on the low double; s0 goes in VEX.vvvv, s1 in ModRM.rm */
+#define VFMADD231SD(s0, s1, d)	VEX3(0,0,0,VEX_m_0F38,1,(s0),VEX_L_128,VEX_p_66);	\
+			VOP(0xB9, 0x3, (d), (s1))
 
-#define VEX_m_0F	(1)
-#define VEX_m_0F38	(2)
-#define VEX_m_0F3A	(3)
-#define VEX_L_128	(0)
-#define VEX_L_256	(1)
-#define VEX_p_NO	(0)
-#define VEX_p_66	(1)
-#define VEX_p_F3	(2)
-#define VEX_p_F2	(3)
-
-#define VEX2(r, x, b, m, w, v, l, p)	BYTE $0xC5;	\
-			BYTE $(((~r)<<7)|((~x)<<6)|((~b)<<5)|(m));	\
-			BYTE $(((w)<<7)|((~v)<<3)|((l)<<2)|(p));
-
-#define VEX3(r, b, l, p)	BYTE $0xC4;	\
-			BYTE $(((~r)<<7)|((~v)<<3)|((l)<<2)|(p));
+/* VFMADD231PD (128 bit): d = s0*s1 + d on both packed doubles; s0 goes in VEX.vvvv, s1 in ModRM.rm */
+#define VFMADD231PD(s0, s1, d)	VEX3(0,0,0,VEX_m_0F38,1,(s0),VEX_L_128,VEX_p_66);	\
+			VOP(0xB8, 0x3, (d), (s1))