| author | rodri <rgl@antares-labs.eu> | 2023-11-25 10:34:41 +0000 |
|---|---|---|
| committer | rodri <rgl@antares-labs.eu> | 2023-11-25 10:34:41 +0000 |
| commit | 675aa84403f98776a7d463e1cc5f9bd41cdbab92 (patch) | |
| tree | 7c0f2fbb1814c5a9f8975307da8e79b0c0165d89 /dppd.s | |
| parent | cc3307440e698d58843a5273519f4988c01937f1 (diff) | |
cleaned things up and improved the organization a bit.
Diffstat (limited to 'dppd.s')
-rw-r--r-- | dppd.s | 30
1 file changed, 17 insertions, 13 deletions
```diff
@@ -5,24 +5,21 @@ DATA one(SB)/8,$1.0
 GLOBL one(SB), $8
 
-TEXT dppd(SB), 1, $0
+TEXT dotvec2_sse4(SB), 1, $0
 	MOVQ SP, AX
-	MOVLPD(8, rAX, rX0)		/* MOVLPD a+0(FP), X0 */
-	MOVHPD(16, rAX, rX0)		/* MOVHPD a+8(FP), X0 */
-	MOVLPD(32, rAX, rX1)		/* MOVLPD b+24(FP), X1 */
-	MOVHPD(40, rAX, rX1)		/* MOVHPD b+32(FP), X1 */
+	MOVDQU_mr(8, rAX, rX0)		/* MOVDQU a+0(FP), X0 */
+	MOVDQU_mr(32, rAX, rX1)		/* MOVDQU b+24(FP), X1 */
 	DPPD(rX1, rX0)			/* DPPD $0x31, X1, X0 */
 	RET
 
-TEXT dppda(SB), 1, $0
+TEXT dotvec2_avx(SB), 1, $0
 	MOVQ SP, AX
 	VMOVUPD_128mr(8, rAX, rX0)	/* VMOVUPD a+0(FP), X0 */
 	VMOVUPD_128mr(32, rAX, rX1)	/* VMOVUPD b+24(FP), X1 */
 	VDPPD(rX1, rX0, rX0)		/* VDPPD $0x31, X1, X0, X0 */
-	VZEROUPPER
 	RET
 
-TEXT dppd3(SB), 1, $0
+TEXT dotvec3_sse4(SB), 1, $0
 	MOVQ SP, AX
 	MOVLPD(8, rAX, rX0)		/* MOVLPD a+0(FP), X0 */
 	MOVHPD(16, rAX, rX0)		/* MOVHPD a+8(FP), X0 */
@@ -35,7 +32,7 @@ TEXT dppd3(SB), 1, $0
 	DPPD(rX1, rX0)			/* DPPD $0x31, X1, X0 */
 	RET
 
-TEXT dppd3a(SB), 1, $0
+TEXT dotvec3_avx(SB), 1, $0
 	MOVQ SP, AX
 	VMOVUPD_128mr(8, rAX, rX0)	/* VMOVUPD a+0(FP), X0 */
 	VMOVUPD_128mr(40, rAX, rX1)	/* VMOVUPD b+32(FP), X1 */
@@ -43,7 +40,6 @@ TEXT dppd3a(SB), 1, $0
 	MOVSD a+16(FP), X1
 	MOVSD b+48(FP), X2
 	VFMADD231SD(rX1, rX2, rX0)
-	VZEROUPPER
 	RET
 
 TEXT Pt2b(SB), 1, $0
@@ -63,7 +59,7 @@ TEXT hsubpd(SB), 1, $0
 	HSUBPD(rX0, rX0)		/* HSUBPD X0, X0 */
 	RET
 
-TEXT xvec3(SB), 1, $0
+TEXT crossvec3_sse(SB), 1, $0
 	MOVQ SP, AX
 	ADDQ $8, AX
 	MOVLPD(40, rAX, rX0)		/* MOVLPD b+32(FP), X0 */
@@ -91,7 +87,7 @@ TEXT xvec3(SB), 1, $0
 	MOVSD X0, 24(DI)
 	RET
 
-TEXT xvec3a(SB), 1, $0
+TEXT crossvec3_avx(SB), 1, $0
 	MOVQ SP, AX
 	ADDQ $8, AX
 
@@ -101,5 +97,13 @@ TEXT fma(SB), 1, $0
 	MOVSD b+8(FP), X1
 	MOVSD c+16(FP), X2
 	VFMADD231SD(rX1, rX2, rX0)
-	VZEROUPPER
+	RET
+
+TEXT addpt2_avx(SB), 1, $0
+	MOVQ SP, AX
+	ADDQ $8, AX
+	VMOVDQU_256mr(8, rAX, rX0)
+	VMOVDQU_256mr(32, rAX, rX1)
+	VADDPD_256rr(rX1, rX0, rX0)
+	VMOVDQU_256rm(rX0, rAX)
 	RET
```
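A note on what the renamed routines compute, for readers who don't speak SSE4.1/AVX: `DPPD $0x31` multiplies both double-precision lanes of its operands and writes the sum to the low lane of the destination, i.e. a two-element dot product, and the `dotvec3_*` variants fold in the third component with a scalar `VFMADD231SD`. The sketch below is a scalar C reference for the semantics only; the `Vec2`/`Vec3` names are illustrative stand-ins, not types from this repository.

```c
/* Scalar reference for the dot-product routines above.
 * Vec2/Vec3 are illustrative stand-ins, not this repo's types. */
typedef struct { double x, y; } Vec2;
typedef struct { double x, y, z; } Vec3;

/* dotvec2_sse4/dotvec2_avx: DPPD $0x31 multiplies both double
 * lanes of X0 and X1 and stores the sum in the low lane of X0. */
double
dotvec2(Vec2 a, Vec2 b)
{
	return a.x*b.x + a.y*b.y;
}

/* dotvec3_sse4/dotvec3_avx: the same two-lane DPPD, then
 * VFMADD231SD adds the z product: X0 += X1 * X2. */
double
dotvec3(Vec3 a, Vec3 b)
{
	return a.x*b.x + a.y*b.y + a.z*b.z;
}
```

Dropping the `VZEROUPPER`s is consistent with these routines being pure VEX.128 code: VEX-encoded 128-bit instructions already zero the upper YMM bits, so there is no dirty upper state to clean before returning. The new `addpt2_avx` loads both arguments with 256-bit `VMOVDQU`, adds all four packed doubles with a single `VADDPD`, and stores the result back through AX.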