From cc3307440e698d58843a5273519f4988c01937f1 Mon Sep 17 00:00:00 2001
From: rodri
Date: Fri, 24 Nov 2023 22:13:49 +0000
Subject: add more avx instructions and place VZEROUPPERs.

---
Review notes: xvec3a now ends in an explicit RET (the stub previously had
none and fell through into fma). The VFMADD231SD_256 comment now records
that VEX.L is ignored for the scalar FMA form. The index blob hashes below
predate these two edits. Leading whitespace was reconstructed as tabs.

 avx.h  | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 dppd.s |  9 +++++++++
 2 files changed, 76 insertions(+), 3 deletions(-)

diff --git a/avx.h b/avx.h
index ef0b2c3..d2c6d08 100644
--- a/avx.h
+++ b/avx.h
@@ -22,19 +22,83 @@
 /* VZEROUPPER */
 #define VZEROUPPER VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_NO); BYTE $0x77
 
-/* VMOVAPD */
+/* VZEROALL */
+#define VZEROALL VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_256,VEX_p_NO); BYTE $0x77
+
+/* VMOVUPD */
 #define VMOVUPD_128mr(off, s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_66); \
 	VOPi(0x10, 0x1, (d), (s), (off))
+#define VMOVUPD_128rr(s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_66); \
+	VOP(0x10, 0x3, (d), (s))
+#define VMOVUPD_256mr(off, s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_256,VEX_p_66); \
+	VOPi(0x10, 0x1, (d), (s), (off))
+#define VMOVUPD_256rr(s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_256,VEX_p_66); \
+	VOP(0x10, 0x3, (d), (s))
+
+/* VMOVAPD */
+#define VMOVAPD_128mr(off, s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_66); \
+	VOPi(0x28, 0x1, (d), (s), (off))
 #define VMOVAPD_128rr(s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_66); \
 	VOP(0x28, 0x3, (d), (s))
+#define VMOVAPD_256mr(off, s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_256,VEX_p_66); \
+	VOPi(0x28, 0x1, (d), (s), (off))
+#define VMOVAPD_256rr(s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_256,VEX_p_66); \
+	VOP(0x28, 0x3, (d), (s))
+
+/* VADDPD */
+#define VADDPD_128mr(off, s0, s1, d) VEX3(0,0,0,VEX_m_0F,0,(s0),VEX_L_128,VEX_p_66); \
+	VOPi(0x58, 0x1, (d), (s1), (off))
+#define VADDPD_128rr(s0, s1, d) VEX3(0,0,0,VEX_m_0F,0,(s0),VEX_L_128,VEX_p_66); \
+	VOP(0x58, 0x3, (d), (s1))
+#define VADDPD_256mr(off, s0, s1, d) VEX3(0,0,0,VEX_m_0F,0,(s0),VEX_L_256,VEX_p_66); \
+	VOPi(0x58, 0x1, (d), (s1), (off))
+#define VADDPD_256rr(s0, s1, d) VEX3(0,0,0,VEX_m_0F,0,(s0),VEX_L_256,VEX_p_66); \
+	VOP(0x58, 0x3, (d), (s1))
+
+/* VSUBPD */
+#define VSUBPD_128mr(off, s0, s1, d) VEX3(0,0,0,VEX_m_0F,0,(s0),VEX_L_128,VEX_p_66); \
+	VOPi(0x5C, 0x1, (d), (s1), (off))
+#define VSUBPD_128rr(s0, s1, d) VEX3(0,0,0,VEX_m_0F,0,(s0),VEX_L_128,VEX_p_66); \
+	VOP(0x5C, 0x3, (d), (s1))
+#define VSUBPD_256mr(off, s0, s1, d) VEX3(0,0,0,VEX_m_0F,0,(s0),VEX_L_256,VEX_p_66); \
+	VOPi(0x5C, 0x1, (d), (s1), (off))
+#define VSUBPD_256rr(s0, s1, d) VEX3(0,0,0,VEX_m_0F,0,(s0),VEX_L_256,VEX_p_66); \
+	VOP(0x5C, 0x3, (d), (s1))
+
+/* VHADDPD */
+#define VHADDPD_128mr(off, s0, s1, d) VEX3(0,0,0,VEX_m_0F,0,(s0),VEX_L_128,VEX_p_66); \
+	VOPi(0x7C, 0x1, (d), (s1), (off))
+#define VHADDPD_128rr(s0, s1, d) VEX3(0,0,0,VEX_m_0F,0,(s0),VEX_L_128,VEX_p_66); \
+	VOP(0x7C, 0x3, (d), (s1))
+#define VHADDPD_256mr(off, s0, s1, d) VEX3(0,0,0,VEX_m_0F,0,(s0),VEX_L_256,VEX_p_66); \
+	VOPi(0x7C, 0x1, (d), (s1), (off))
+#define VHADDPD_256rr(s0, s1, d) VEX3(0,0,0,VEX_m_0F,0,(s0),VEX_L_256,VEX_p_66); \
+	VOP(0x7C, 0x3, (d), (s1))
+
+/* VHSUBPD */
+#define VHSUBPD_128mr(off, s0, s1, d) VEX3(0,0,0,VEX_m_0F,0,(s0),VEX_L_128,VEX_p_66); \
+	VOPi(0x7D, 0x1, (d), (s1), (off))
+#define VHSUBPD_128rr(s0, s1, d) VEX3(0,0,0,VEX_m_0F,0,(s0),VEX_L_128,VEX_p_66); \
+	VOP(0x7D, 0x3, (d), (s1))
+#define VHSUBPD_256mr(off, s0, s1, d) VEX3(0,0,0,VEX_m_0F,0,(s0),VEX_L_256,VEX_p_66); \
+	VOPi(0x7D, 0x1, (d), (s1), (off))
+#define VHSUBPD_256rr(s0, s1, d) VEX3(0,0,0,VEX_m_0F,0,(s0),VEX_L_256,VEX_p_66); \
+	VOP(0x7D, 0x3, (d), (s1))
+
 /* VDPPD */
 #define VDPPD(s0, s1, d) VEX3(0,0,0,VEX_m_0F3A,0,(s0),VEX_L_128,VEX_p_66); \
 	VOPi(0x41, 0x3, (d), (s1), 0x31)
 
 /* VFMADD231SD (128 bit) */
-#define VFMADD231SD(s0, s1, d) VEX3(0,0,0,VEX_m_0F38,1,(s0),VEX_L_128,VEX_p_66); \
+#define VFMADD231SD(s0, s1, d) VEX3(0,0,0,VEX_m_0F38,1,(s0),VEX_L_128,VEX_p_66); \
+	VOP(0xB9, 0x3, (d), (s1))
+/* VFMADD231SD with VEX.L=1: scalar op, VEX.L is ignored (LIG), so same operation as VFMADD231SD */
+#define VFMADD231SD_256(s0, s1, d) VEX3(0,0,0,VEX_m_0F38,1,(s0),VEX_L_256,VEX_p_66); \
 	VOP(0xB9, 0x3, (d), (s1))
 
 /* VFMADD231PD (128 bit) */
-#define VFMADD231PD(s0, s1, d) VEX3(0,0,0,VEX_m_0F38,1,(s0),VEX_L_128,VEX_p_66); \
+#define VFMADD231PD(s0, s1, d) VEX3(0,0,0,VEX_m_0F38,1,(s0),VEX_L_128,VEX_p_66); \
+	VOP(0xB8, 0x3, (d), (s1))
+/* VFMADD231PD (256 bit) */
+#define VFMADD231PD_256(s0, s1, d) VEX3(0,0,0,VEX_m_0F38,1,(s0),VEX_L_256,VEX_p_66); \
 	VOP(0xB8, 0x3, (d), (s1))
diff --git a/dppd.s b/dppd.s
index e738dde..de938b8 100644
--- a/dppd.s
+++ b/dppd.s
@@ -19,6 +19,7 @@ TEXT dppda(SB), 1, $0
 	VMOVUPD_128mr(8, rAX, rX0) /* VMOVUPD a+0(FP), X0 */
 	VMOVUPD_128mr(32, rAX, rX1) /* VMOVUPD b+24(FP), X1 */
 	VDPPD(rX1, rX0, rX0) /* VDPPD $0x31, X1, X0, X0 */
+	VZEROUPPER
 	RET
 
 TEXT dppd3(SB), 1, $0
@@ -42,6 +43,7 @@ TEXT dppd3a(SB), 1, $0
 	MOVSD a+16(FP), X1
 	MOVSD b+48(FP), X2
 	VFMADD231SD(rX1, rX2, rX0)
+	VZEROUPPER
 	RET
 
 TEXT Pt2b(SB), 1, $0
@@ -89,9 +91,16 @@ TEXT xvec3(SB), 1, $0
 	MOVSD X0, 24(DI)
 	RET
 
+TEXT xvec3a(SB), 1, $0
+	MOVQ SP, AX
+	ADDQ $8, AX
+	RET /* stub: return explicitly instead of falling through into fma */
+
+
 TEXT fma(SB), 1, $0
 	MOVSD a+0(FP), X0
 	MOVSD b+8(FP), X1
 	MOVSD c+16(FP), X2
 	VFMADD231SD(rX1, rX2, rX0)
+	VZEROUPPER
 	RET
-- 
cgit v1.2.3