aboutsummaryrefslogtreecommitdiff
path: root/dppd.s
diff options
context:
space:
mode:
Diffstat (limited to 'dppd.s')
-rw-r--r--dppd.s85
1 files changed, 50 insertions, 35 deletions
diff --git a/dppd.s b/dppd.s
index db805ff..55ee3d9 100644
--- a/dppd.s
+++ b/dppd.s
@@ -6,40 +6,41 @@ DATA one(SB)/8,$1.0
GLOBL one(SB), $8
TEXT dotvec2_sse4(SB), 1, $0
- MOVQ SP, AX
- MOVDQU_mr(8, rAX, rX0) /* MOVDQU a+0(FP), X0 */
- MOVDQU_mr(32, rAX, rX1) /* MOVDQU b+24(FP), X1 */
+ MOVUPD a+0(FP), X0
+ MOVUPD b+24(FP), X1
DPPD(rX1, rX0) /* DPPD $0x31, X1, X0 */
RET
TEXT dotvec2_avx(SB), 1, $0
MOVQ SP, AX
- VMOVUPD_128mr(8, rAX, rX0) /* VMOVUPD a+0(FP), X0 */
- VMOVUPD_128mr(32, rAX, rX1) /* VMOVUPD b+24(FP), X1 */
+ ADDQ $8, AX
+ VMOVUPD_128mr(rAX, rX0)
+ ADDQ $24, AX
+ VMOVUPD_128mr(rAX, rX1)
VDPPD(rX1, rX0, rX0) /* VDPPD $0x31, X1, X0, X0 */
+ VZEROUPPER
RET
TEXT dotvec3_sse4(SB), 1, $0
- MOVQ SP, AX
- MOVLPD(8, rAX, rX0) /* MOVLPD a+0(FP), X0 */
- MOVHPD(16, rAX, rX0) /* MOVHPD a+8(FP), X0 */
- MOVLPD(40, rAX, rX1) /* MOVLPD b+32(FP), X1 */
- MOVHPD(48, rAX, rX1) /* MOVHPD b+40(FP), X1 */
- DPPD(rX1, rX0) /* DPPD $0x31, X1, X0 */
- MOVSD one(SB), X1
- MOVHPD(24, rAX, rX0) /* MOVHPD a+16(FP), X0 */
- MOVHPD(56, rAX, rX1) /* MOVHPD b+48(FP), X1 */
+ MOVUPD a+0(FP), X0
+ MOVUPD b+32(FP), X1
DPPD(rX1, rX0) /* DPPD $0x31, X1, X0 */
+ MOVSD a+16(FP), X1
+ MULSD b+48(FP), X1
+ ADDSD X1, X0
RET
TEXT dotvec3_avx(SB), 1, $0
MOVQ SP, AX
- VMOVUPD_128mr(8, rAX, rX0) /* VMOVUPD a+0(FP), X0 */
- VMOVUPD_128mr(40, rAX, rX1) /* VMOVUPD b+32(FP), X1 */
+ ADDQ $8, AX
+ VMOVUPD_128mr(rAX, rX0)
+ ADDQ $32, AX
+ VMOVUPD_128mr(rAX, rX1)
VDPPD(rX1, rX0, rX0) /* VDPPD $0x31, X1, X0, X0 */
MOVSD a+16(FP), X1
MOVSD b+48(FP), X2
VFMADD231SD(rX1, rX2, rX0)
+ VZEROUPPER
RET
TEXT Pt2b(SB), 1, $0
@@ -53,21 +54,18 @@ TEXT Pt2b(SB), 1, $0
RET
TEXT hsubpd(SB), 1, $0
- MOVQ SP, AX
- MOVLPD(8, rAX, rX0) /* MOVLPD a+0(FP), X0 */
- MOVHPD(16, rAX, rX0) /* MOVHPD b+8(FP), X0 */
+ MOVLPD a+0(FP), X0
+ MOVHPD b+8(FP), X0
HSUBPD(rX0, rX0) /* HSUBPD X0, X0 */
RET
TEXT crossvec3_sse(SB), 1, $0
- MOVQ SP, AX
- ADDQ $8, AX
- MOVLPD(40, rAX, rX0) /* MOVLPD b+32(FP), X0 */
- MOVHPD(8, rAX, rX0) /* MOVHPD a+0(FP), X0 */
- MOVLPD(16, rAX, rX1) /* MOVLPD a+8(FP), X1 */
- MOVHPD(48, rAX, rX1) /* MOVHPD b+40(FP), X1 */
- MOVLPD(56, rAX, rX2) /* MOVLPD b+48(FP), X2 */
- MOVHPD(24, rAX, rX2) /* MOVHPD a+16(FP), X2 */
+ MOVLPD b+40(FP), X0
+ MOVHPD a+8(FP), X0 /* X0 := [a.x][b.x] */
+ MOVLPD a+16(FP), X1
+ MOVHPD b+48(FP), X1 /* X1 := [b.y][a.y] */
+ MOVLPD b+56(FP), X2
+ MOVHPD a+24(FP), X2 /* X2 := [a.z][b.z] */
MOVAPD X1, X3
MULPD X2, X3
HSUBPD(rX3, rX3) /* x */
@@ -99,21 +97,38 @@ TEXT fma(SB), 1, $0
VFMADD231SD(rX1, rX2, rX0)
RET
+TEXT addpt2_sse(SB), 1, $0
+ MOVUPD a+8(FP), X0
+ MOVUPD b+32(FP), X1
+ ADDPD X1, X0
+ MOVSD a+24(FP), X2
+ ADDSD b+48(FP), X2
+ MOVQ BP, DI
+ MOVUPD X0, (DI)
+ MOVSD X2, 16(DI)
+ RET
+
/* TODO: write only 24 bytes */
TEXT addpt2_avx(SB), 1, $0
MOVQ SP, AX
- ADDQ $8, AX
- VMOVDQU_256mr(8, rAX, rX0)
- VMOVDQU_256mr(32, rAX, rX1)
+ ADDQ $16, AX
+ VMOVUPD_256mr(rAX, rX0)
+ ADDQ $24, AX
+ VMOVUPD_256mr(rAX, rX1)
VADDPD_256rr(rX1, rX0, rX0)
- VMOVDQU_256rm(rX0, rAX)
+ MOVQ BP, DI
+ VMOVUPD_256rm(rX0, rDI)
+ VZEROUPPER
RET
TEXT addpt3_avx(SB), 1, $0
MOVQ SP, AX
- ADDQ $8, AX
- VMOVDQU_256mr(8, rAX, rX0)
- VMOVDQU_256mr(40, rAX, rX1)
+ ADDQ $16, AX
+ VMOVUPD_256mr(rAX, rX0)
+ ADDQ $32, AX
+ VMOVUPD_256mr(rAX, rX1)
VADDPD_256rr(rX1, rX0, rX0)
- VMOVDQU_256rm(rX0, rAX)
+ MOVQ BP, DI
+ VMOVUPD_256rm(rX0, rDI)
+ VZEROUPPER
RET