Diffstat (limited to 'dppd.s')
 dppd.s | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------------------
 1 file changed, 50 insertions(+), 35 deletions(-)
--- a/dppd.s
+++ b/dppd.s
@@ -6,40 +6,41 @@ DATA one(SB)/8,$1.0
 GLOBL one(SB), $8
 
 TEXT dotvec2_sse4(SB), 1, $0
-	MOVQ SP, AX
-	MOVDQU_mr(8, rAX, rX0)		/* MOVDQU a+0(FP), X0 */
-	MOVDQU_mr(32, rAX, rX1)		/* MOVDQU b+24(FP), X1 */
+	MOVUPD a+0(FP), X0
+	MOVUPD b+24(FP), X1
 	DPPD(rX1, rX0)			/* DPPD $0x31, X1, X0 */
 	RET
 
TEXT dotvec2_avx(SB), 1, $0
 	MOVQ SP, AX
-	VMOVUPD_128mr(8, rAX, rX0)	/* VMOVUPD a+0(FP), X0 */
-	VMOVUPD_128mr(32, rAX, rX1)	/* VMOVUPD b+24(FP), X1 */
+	ADDQ $8, AX
+	VMOVUPD_128mr(rAX, rX0)
+	ADDQ $24, AX
+	VMOVUPD_128mr(rAX, rX1)
 	VDPPD(rX1, rX0, rX0)		/* VDPPD $0x31, X1, X0, X0 */
+	VZEROUPPER
 	RET
 
 TEXT dotvec3_sse4(SB), 1, $0
-	MOVQ SP, AX
-	MOVLPD(8, rAX, rX0)		/* MOVLPD a+0(FP), X0 */
-	MOVHPD(16, rAX, rX0)		/* MOVHPD a+8(FP), X0 */
-	MOVLPD(40, rAX, rX1)		/* MOVLPD b+32(FP), X1 */
-	MOVHPD(48, rAX, rX1)		/* MOVHPD b+40(FP), X1 */
-	DPPD(rX1, rX0)			/* DPPD $0x31, X1, X0 */
-	MOVSD one(SB), X1
-	MOVHPD(24, rAX, rX0)		/* MOVHPD a+16(FP), X0 */
-	MOVHPD(56, rAX, rX1)		/* MOVHPD b+48(FP), X1 */
+	MOVUPD a+0(FP), X0
+	MOVUPD b+32(FP), X1
 	DPPD(rX1, rX0)			/* DPPD $0x31, X1, X0 */
+	MOVSD a+16(FP), X1
+	MULSD b+48(FP), X1
+	ADDSD X1, X0
 	RET
 
 TEXT dotvec3_avx(SB), 1, $0
 	MOVQ SP, AX
-	VMOVUPD_128mr(8, rAX, rX0)	/* VMOVUPD a+0(FP), X0 */
-	VMOVUPD_128mr(40, rAX, rX1)	/* VMOVUPD b+32(FP), X1 */
+	ADDQ $8, AX
+	VMOVUPD_128mr(rAX, rX0)
+	ADDQ $32, AX
+	VMOVUPD_128mr(rAX, rX1)
 	VDPPD(rX1, rX0, rX0)		/* VDPPD $0x31, X1, X0, X0 */
 	MOVSD a+16(FP), X1
 	MOVSD b+48(FP), X2
 	VFMADD231SD(rX1, rX2, rX0)
+	VZEROUPPER
 	RET
 
 TEXT Pt2b(SB), 1, $0
@@ -53,21 +54,18 @@ TEXT Pt2b(SB), 1, $0
 	RET
 
 TEXT hsubpd(SB), 1, $0
-	MOVQ SP, AX
-	MOVLPD(8, rAX, rX0)		/* MOVLPD a+0(FP), X0 */
-	MOVHPD(16, rAX, rX0)		/* MOVHPD b+8(FP), X0 */
+	MOVLPD a+0(FP), X0
+	MOVHPD b+8(FP), X0
 	HSUBPD(rX0, rX0)		/* HSUBPD X0, X0 */
 	RET
 
 TEXT crossvec3_sse(SB), 1, $0
-	MOVQ SP, AX
-	ADDQ $8, AX
-	MOVLPD(40, rAX, rX0)	/* MOVLPD b+32(FP), X0 */
-	MOVHPD(8, rAX, rX0)	/* MOVHPD a+0(FP), X0 */
-	MOVLPD(16, rAX, rX1)	/* MOVLPD a+8(FP), X1 */
-	MOVHPD(48, rAX, rX1)	/* MOVHPD b+40(FP), X1 */
-	MOVLPD(56, rAX, rX2)	/* MOVLPD b+48(FP), X2 */
-	MOVHPD(24, rAX, rX2)	/* MOVHPD a+16(FP), X2 */
+	MOVLPD b+40(FP), X0
+	MOVHPD a+8(FP), X0	/* X0 := [a.x][b.x] */
+	MOVLPD a+16(FP), X1
+	MOVHPD b+48(FP), X1	/* X1 := [b.y][a.y] */
+	MOVLPD b+56(FP), X2
+	MOVHPD a+24(FP), X2	/* X2 := [a.z][b.z] */
 	MOVAPD X1, X3
 	MULPD X2, X3
 	HSUBPD(rX3, rX3)	/* x */
@@ -99,21 +97,38 @@ TEXT fma(SB), 1, $0
 	VFMADD231SD(rX1, rX2, rX0)
 	RET
 
+TEXT addpt2_sse(SB), 1, $0
+	MOVUPD a+8(FP), X0
+	MOVUPD b+32(FP), X1
+	ADDPD X1, X0
+	MOVSD a+24(FP), X2
+	ADDSD b+48(FP), X2
+	MOVQ BP, DI
+	MOVUPD X0, (DI)
+	MOVSD X2, 16(DI)
+	RET
+
 /* TODO: write only 24 bytes */
 TEXT addpt2_avx(SB), 1, $0
 	MOVQ SP, AX
-	ADDQ $8, AX
-	VMOVDQU_256mr(8, rAX, rX0)
-	VMOVDQU_256mr(32, rAX, rX1)
+	ADDQ $16, AX
+	VMOVUPD_256mr(rAX, rX0)
+	ADDQ $24, AX
+	VMOVUPD_256mr(rAX, rX1)
 	VADDPD_256rr(rX1, rX0, rX0)
-	VMOVDQU_256rm(rX0, rAX)
+	MOVQ BP, DI
+	VMOVUPD_256rm(rX0, rDI)
+	VZEROUPPER
 	RET
 
 TEXT addpt3_avx(SB), 1, $0
 	MOVQ SP, AX
-	ADDQ $8, AX
-	VMOVDQU_256mr(8, rAX, rX0)
-	VMOVDQU_256mr(40, rAX, rX1)
+	ADDQ $16, AX
+	VMOVUPD_256mr(rAX, rX0)
+	ADDQ $32, AX
+	VMOVUPD_256mr(rAX, rX1)
 	VADDPD_256rr(rX1, rX0, rX0)
-	VMOVDQU_256rm(rX0, rAX)
+	MOVQ BP, DI
+	VMOVUPD_256rm(rX0, rDI)
+	VZEROUPPER
 	RET
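
Note on the dot-product kernels: the immediate $0x31 in DPPD/VDPPD selects
both 64-bit lanes for the multiply (mask bits 5:4) and stores the summed
result in the low lane only, zeroing the high lane (mask bits 1:0), so X0
ends up holding a.x*b.x + a.y*b.y. The VZEROUPPER added before RET in the
AVX variants clears the upper YMM state, presumably to avoid AVX-to-SSE
transition penalties in later SSE code. A rough C equivalent of the new
dotvec2/dotvec3 paths, using SSE4.1 intrinsics (function names and argument
layout here are illustrative, not part of the patch):

	#include <smmintrin.h>	/* SSE4.1: _mm_dp_pd */

	/* DPPD $0x31: multiply both lanes, sum, keep result in the low lane */
	double
	dotvec2(double *a, double *b)
	{
		__m128d d;

		d = _mm_dp_pd(_mm_loadu_pd(a), _mm_loadu_pd(b), 0x31);
		return _mm_cvtsd_f64(d);
	}

	/* dotvec3: same kernel plus the scalar MOVSD/MULSD/ADDSD tail for z */
	double
	dotvec3(double *a, double *b)
	{
		return dotvec2(a, b) + a[2]*b[2];
	}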
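In crossvec3_sse, each component of the cross product a × b is two packed
multiplies followed by a horizontal subtract: per the register comments
above ([high][low] notation), X3 = X1*X2 holds [b.y*a.z][a.y*b.z], and
HSUBPD (low lane := low - high) yields the x component a.y*b.z - a.z*b.y.
A sketch of that x component with SSE3 intrinsics (names illustrative; a
and b assumed to be arrays of three doubles):

	#include <pmmintrin.h>	/* SSE3: _mm_hsub_pd */

	double
	crossx(double *a, double *b)
	{
		__m128d p;

		/* pack [hi=a.z*b.y | lo=a.y*b.z]; _mm_set_pd takes (hi, lo) */
		p = _mm_mul_pd(_mm_set_pd(a[2], a[1]), _mm_set_pd(b[1], b[2]));
		p = _mm_hsub_pd(p, p);	/* low lane := lo - hi */
		return _mm_cvtsd_f64(p);
	}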