aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorrodri <rgl@antares-labs.eu>2023-11-24 12:08:15 +0000
committerrodri <rgl@antares-labs.eu>2023-11-24 12:08:15 +0000
commit9404d16a4263a87559af64bfb18c91ccebaa601d (patch)
tree0a17daf278b3656042a35e1227b665bc87dbf992
parentd8ab83e060bf9bb6b1d51915d63578a6dc8cacaf (diff)
downloadamd64-simd-9404d16a4263a87559af64bfb18c91ccebaa601d.tar.gz
amd64-simd-9404d16a4263a87559af64bfb18c91ccebaa601d.tar.bz2
amd64-simd-9404d16a4263a87559af64bfb18c91ccebaa601d.zip
fix the 9 asm notes. add VFMA231[SP]D instructions.
-rw-r--r--dppd.s25
-rw-r--r--main.c6
-rw-r--r--sse.h42
3 files changed, 47 insertions, 26 deletions
diff --git a/dppd.s b/dppd.s
index 2b1833c..907a437 100644
--- a/dppd.s
+++ b/dppd.s
@@ -7,8 +7,8 @@ TEXT dppd(SB), 1, $0
MOVQ SP, AX
MOVLPD(8, rAX, rX0) /* MOVLPD a+0(FP), X0 */
MOVHPD(16, rAX, rX0) /* MOVHPD a+8(FP), X0 */
- MOVLPD(32, rAX, rX1) /* MOVLPD b+0(FP), X1 */
- MOVHPD(40, rAX, rX1) /* MOVHPD b+8(FP), X1*/
+ MOVLPD(32, rAX, rX1) /* MOVLPD b+24(FP), X1 */
+ MOVHPD(40, rAX, rX1) /* MOVHPD b+32(FP), X1*/
DPPD(rX1, rX0) /* DPPD $0x31, X1, X0 */
RET
@@ -16,12 +16,12 @@ TEXT dppd3(SB), 1, $0
MOVQ SP, AX
MOVLPD(8, rAX, rX0) /* MOVLPD a+0(FP), X0 */
MOVHPD(16, rAX, rX0) /* MOVHPD a+8(FP), X0 */
- MOVLPD(40, rAX, rX1) /* MOVLPD b+0(FP), X1 */
- MOVHPD(48, rAX, rX1) /* MOVHPD b+8(FP), X1 */
+ MOVLPD(40, rAX, rX1) /* MOVLPD b+32(FP), X1 */
+ MOVHPD(48, rAX, rX1) /* MOVHPD b+40(FP), X1 */
DPPD(rX1, rX0) /* DPPD $0x31, X1, X0 */
MOVSD one(SB), X1
MOVHPD(24, rAX, rX0) /* MOVHPD a+16(FP), X0 */
- MOVHPD(56, rAX, rX1) /* MOVHPD b+16(FP), X1 */
+ MOVHPD(56, rAX, rX1) /* MOVHPD b+48(FP), X1 */
DPPD(rX1, rX0) /* DPPD $0x31, X1, X0 */
RET
@@ -38,18 +38,18 @@ TEXT Pt2b(SB), 1, $0
TEXT hsubpd(SB), 1, $0
MOVQ SP, AX
MOVLPD(8, rAX, rX0) /* MOVLPD a+0(FP), X0 */
- MOVHPD(16, rAX, rX0) /* MOVHPD b+0(FP), X0 */
+ MOVHPD(16, rAX, rX0) /* MOVHPD b+8(FP), X0 */
HSUBPD(rX0, rX0) /* HSUBPD X0, X0 */
RET
TEXT xvec3(SB), 1, $0
MOVQ SP, AX
ADDQ $8, AX
- MOVLPD(40, rAX, rX0) /* MOVLPD b+0(FP), X0 */
+ MOVLPD(40, rAX, rX0) /* MOVLPD b+32(FP), X0 */
MOVHPD(8, rAX, rX0) /* MOVHPD a+0(FP), X0 */
MOVLPD(16, rAX, rX1) /* MOVLPD a+8(FP), X1 */
- MOVHPD(48, rAX, rX1) /* MOVHPD b+8(FP), X1 */
- MOVLPD(56, rAX, rX2) /* MOVLPD b+16(FP), X2 */
+ MOVHPD(48, rAX, rX1) /* MOVHPD b+40(FP), X1 */
+ MOVLPD(56, rAX, rX2) /* MOVLPD b+48(FP), X2 */
MOVHPD(24, rAX, rX2) /* MOVHPD a+16(FP), X2 */
MOVAPD X1, X3
MULPD X2, X3
@@ -69,3 +69,10 @@ TEXT xvec3(SB), 1, $0
XORPD X0, X0
MOVSD X0, 24(DI)
RET
+
+TEXT fma(SB), 1, $0 /* double fma(double a, double b, double c): leaves b*c + a in X0 */
+ MOVSD a+0(FP), X0 /* X0 = a: the addend, overwritten with the result */
+ MOVSD b+8(FP), X1 /* X1 = b: first multiplicand */
+ MOVSD c+16(FP), X2 /* X2 = c: second multiplicand */
+ VFMADD231SD(rX1, rX2, rX0) /* X0 = X1*X2 + X0; NOTE(review): C99 fma(x,y,z) is x*y + z, here a is the addend - confirm intended */
+ RET
diff --git a/main.c b/main.c
index f30da53..fe51889 100644
--- a/main.c
+++ b/main.c
@@ -9,6 +9,7 @@ double dppd3(Point3, Point3);
Point2 Pt2b(double, double, double);
Point3 xvec3(Point3, Point3);
double hsubpd(double, double);
+double fma(double, double, double);
double
fmin(double a, double b)
@@ -78,5 +79,10 @@ main(int argc, char *argv[])
t1 = nanosec();
print("crossvec3(%V, %V) = %V\ttook %lludns\n", p0t, p1t, pr, t1-t0);
+ t0 = nanosec();
+ r = fma(a, b, 21);
+ t1 = nanosec();
+ print("fma(%g, %g, 21) = %g\ttook %lludns\n", a, b, r, t1-t0);
+
exits(nil);
}
diff --git a/sse.h b/sse.h
index b2ba3f4..ffad543 100644
--- a/sse.h
+++ b/sse.h
@@ -15,15 +15,33 @@
#define rX5 5
#define rX6 6
+#define VEX_m_0F (1)
+#define VEX_m_0F38 (2)
+#define VEX_m_0F3A (3)
+#define VEX_L_128 (0)
+#define VEX_L_256 (1)
+#define VEX_p_NO (0)
+#define VEX_p_66 (1)
+#define VEX_p_F3 (2)
+#define VEX_p_F2 (3)
+
#define OP(o, m, ro, rm) WORD $0x0F66; BYTE $(o); \
BYTE $(((m)<<6)|((ro)<<3)|(rm))
#define OPi(o, m, ro, rm, i) OP((o), (m), (ro), (rm)); \
BYTE $(i)
-#define OP4(o, m, ro, rm) LONG $0x(o)0F66; \
+#define OP4(o, m, ro, rm) WORD $0x0F66; WORD $(o); \
BYTE $(((m)<<6)|((ro)<<3)|(rm))
#define OP4i(o, m, ro, rm, i) OP4((o), (m), (ro), (rm)); \
BYTE $(i)
+/*
+ * VEX3: 3-byte VEX prefix (0xC4).
+ *	r, x, b	register-extension bits, given uninverted
+ *	m	opcode map (VEX_m_*)
+ *	w	W bit
+ *	v	extra source register number, given uninverted
+ *	l	vector length (VEX_L_*)
+ *	p	implied legacy prefix (VEX_p_*)
+ * The prefix stores r, x, b and v as their one's complement. Each
+ * complement is masked to its field width before shifting; without the
+ * mask the sign-extended ~value spills into the higher fields (e.g. any
+ * v would force the W bit on).
+ */
+#define VEX3(r, x, b, m, w, v, l, p) BYTE $0xC4; \
+ BYTE $((((~(r))&1)<<7)|(((~(x))&1)<<6)|(((~(b))&1)<<5)|(m)); \
+ BYTE $(((w)<<7)|(((~(v))&0xF)<<3)|((l)<<2)|(p))
+/*
+ * VEX2: 2-byte VEX prefix (0xC5): inverted r, inverted v, l, p.
+ * The second parameter is the register placed in the vvvv field.
+ */
+#define VEX2(r, v, l, p) BYTE $0xC5; \
+ BYTE $((((~(r))&1)<<7)|(((~(v))&0xF)<<3)|((l)<<2)|(p))
+/* VOP: opcode byte o followed by a ModRM byte built from mod m, reg ro and r/m rm. */
+#define VOP(o, m, ro, rm) BYTE $(o); \
+ BYTE $(((m)<<6)|((ro)<<3)|(rm))
+
/* MOVLPD */
//opcode = 660F12
//modrm = 01 000 000 [AX → X0] / 01 001 000 [AX → X1]
@@ -47,20 +65,10 @@
//imm8 = 0011 0001
#define DPPD(s, d) OP4i(0x413A, 0x3, (d), (s), 0x31)
+/* VFMADD231SD (128 bit): scalar double d = s0*s1 + d; encoded VEX.66.0F38.W1 B9 /r with s0 in VEX.vvvv, s1 in ModRM r/m, d in ModRM reg (register-direct, mod=3) */
+#define VFMADD231SD(s0, s1, d) VEX3(0,0,0,VEX_m_0F38,1,(s0),VEX_L_128,VEX_p_66); \
+ VOP(0xB9, 0x3, (d), (s1))
-#define VEX_m_0F (1)
-#define VEX_m_0F38 (2)
-#define VEX_m_0F3A (3)
-#define VEX_L_128 (0)
-#define VEX_L_256 (1)
-#define VEX_p_NO (0)
-#define VEX_p_66 (1)
-#define VEX_p_F3 (2)
-#define VEX_p_F2 (3)
-
-#define VEX2(r, x, b, m, w, v, l, p) BYTE $0xC5; \
- BYTE $(((~r)<<7)|((~x)<<6)|((~b)<<5)|(m)); \
- BYTE $(((w)<<7)|((~v)<<3)|((l)<<2)|(p));
-
-#define VEX3(r, b, l, p) BYTE $0xC4; \
- BYTE $(((~r)<<7)|((~v)<<3)|((l)<<2)|(p));
+/* VFMADD231PD (128 bit): packed double d = s0*s1 + d per element; encoded VEX.128.66.0F38.W1 B8 /r with s0 in VEX.vvvv, s1 in ModRM r/m, d in ModRM reg (register-direct, mod=3) */
+#define VFMADD231PD(s0, s1, d) VEX3(0,0,0,VEX_m_0F38,1,(s0),VEX_L_128,VEX_p_66); \
+ VOP(0xB8, 0x3, (d), (s1))