From cdfd06439c4c4354e9fdc4f4124149a8d7abdfe5 Mon Sep 17 00:00:00 2001
From: rodri
Date: Fri, 1 Dec 2023 21:58:15 +0000
Subject: implement memory aligned versions of some functions.

---
 dppd.s | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

(limited to 'dppd.s')

diff --git a/dppd.s b/dppd.s
index d480ddb..4c07876 100644
--- a/dppd.s
+++ b/dppd.s
@@ -41,6 +41,29 @@ TEXT dotvec2_avx(SB), 1, $0
 	VZEROUPPER
 	RET
 
+TEXT dotvec2_sse_a(SB), 1, $0	/* dot product of two 2-double vectors read through pointers; MOVAPD faults unless both addresses are 16-byte aligned. BP presumably holds the first pointer argument (a) -- confirm against the C prototype */
+	MOVQ b+8(FP), DX	/* DX = pointer argument b */
+	MOVAPD 0(DX), X1	/* X1 = the two doubles at b (aligned load) */
+	MOVAPD 0(BP), X0	/* X0 = the two doubles at BP (aligned load) */
+	MULPD X1, X0	/* X0 = lane-wise products */
+	HADDPD X0, X0	/* both lanes of X0 = sum of the two products */
+	RET	/* result is left in the low double of X0 */
+
+TEXT dotvec2_sse4_a(SB), 1, $0	/* same computation as dotvec2_sse_a, using DPPD instead of MULPD+HADDPD; same 16-byte alignment requirement. BP presumably holds a -- confirm */
+	MOVQ b+8(FP), DX	/* DX = pointer argument b */
+	MOVAPD 0(DX), X1	/* X1 = the two doubles at b (aligned load) */
+	MOVAPD 0(BP), X0	/* X0 = the two doubles at BP (aligned load) */
+	DPPD $0x31, X1, X0	/* 0x31: high nibble 3 multiplies both lanes, low nibble 1 writes the sum to the low lane only (high lane zeroed) */
+	RET	/* result is left in the low double of X0 */
+
+TEXT dotvec2_avx_a(SB), 1, $0	/* variant of dotvec2_sse4_a built from the VMOVAPD_128mr/VDPPD macros, which are defined outside this hunk. BP presumably holds a -- confirm */
+	MOVQ b+8(FP), DX	/* DX = pointer argument b */
+	VMOVAPD_128mr(0, rDX, rX0)	/* presumably an aligned 128-bit load of 0(DX) into X0 -- confirm against the macro definition */
+	VMOVAPD_128mr(0, rBP, rX1)	/* presumably an aligned 128-bit load of 0(BP) into X1 -- confirm against the macro definition */
+	VDPPD(rX1, rX0, rX0) /* VDPPD $0x31, X1, X0, X0 */
+	VZEROUPPER	/* zero the upper YMM bits before returning, as the other *_avx routines in this file do */
+	RET	/* result is left in the low double of X0 */
+
 TEXT dotvec3_sse4(SB), 1, $0
 	MOVUPD a+0(FP), X0
 	MOVUPD b+32(FP), X1
@@ -63,6 +86,16 @@ TEXT dotvec3_avx(SB), 1, $0
 	VZEROUPPER
 	RET
 
+TEXT dotvec3_sse4_a(SB), 1, $0	/* dot product of two 3-double vectors read through pointers: DPPD on the first two components, scalar multiply-add for the third. Only the 16-byte MOVAPD loads require alignment. BP presumably holds a -- confirm */
+	MOVQ b+8(FP), DX	/* DX = pointer argument b */
+	MOVAPD 0(DX), X0	/* X0 = first two doubles at b (aligned load) */
+	MOVAPD 0(BP), X1	/* X1 = first two doubles at BP (aligned load) */
+	DPPD $0x31, X1, X0	/* low lane of X0 = sum of the first two products, high lane zeroed */
+	MOVSD 16(DX), X1	/* X1 low = third double at b (byte offset 16) */
+	MULSD 16(BP), X1	/* X1 low *= third double at BP */
+	ADDSD X1, X0	/* add the third product to the partial sum */
+	RET	/* result is left in the low double of X0 */
+
 TEXT Pt2b(SB), 1, $0
 	MOVQ BP, DI
 	MOVSD x+8(FP), X0
-- 
cgit v1.2.3