/*
 * dppd.s
 * (source: repository path root/dppd.s, blob d480ddb564cce2e82c0f20b8aac0d8ded90c3c3f)
 */
#include "regs.h"
#include "sse.h"
#include "avx.h"

/* one: 8-byte global holding the floating-point constant 1.0 */
DATA one(SB)/8,$1.0
GLOBL one(SB), $8

/*
 * round: loads the 8-byte argument a into X0, applies ROUNDSD to it
 * and leaves the result in X0.
 * Immediate 0x4 sets the "use MXCSR" bit, so the rounding mode is the
 * one currently programmed in MXCSR rather than one encoded in the
 * instruction.
 */
TEXT round(SB), 1, $0
	MOVSD a+0(FP), X0
	ROUNDSD $0x4, X0, X0	/* round to integral value, mode from MXCSR */
	RET

/*
 * addsub_sse: in-place ADDSUBPD on a pair of doubles.
 * BP is used as a pointer to the first operand (presumably the first
 * pointer argument, which Plan 9 amd64 passes in BP -- confirm against
 * the C prototype); b+8(FP) is a pointer to the second operand, which
 * is also where the result is written:
 *	b[0] = b[0] - a[0]
 *	b[1] = b[1] + a[1]
 */
TEXT addsub_sse(SB), 1, $0
	MOVUPD (BP), X1		/* X1 = the two doubles BP points at */
	MOVQ b+8(FP), DX	/* DX = b */
	MOVUPD (DX), X0		/* X0 = the two doubles b points at */
	ADDSUBPD X1, X0		/* low lane subtracts, high lane adds */
	MOVUPD X0, (DX)		/* overwrite *b with the result */
	RET

/*
 * dotvec2_sse: two-component dot product using SSE3 HADDPD.
 * Takes the first two doubles at a+0(FP) and at b+24(FP), multiplies
 * them lane by lane and sums the two products horizontally.
 * The sum ends up in both lanes of X0.
 */
TEXT dotvec2_sse(SB), 1, $0
	MOVUPD b+24(FP), X1	/* X1 = first two doubles of b */
	MOVUPD a+0(FP), X0	/* X0 = first two doubles of a */
	MULPD X1, X0		/* lane-wise products */
	HADDPD X0, X0		/* low + high in every lane */
	RET

/*
 * dotvec2_sse4: two-component dot product using SSE4.1 DPPD.
 * Operands are the first two doubles at a+0(FP) and at b+24(FP).
 * Mask 0x31: the high nibble (3) multiplies both lanes, the low
 * nibble (1) writes the sum to the low lane only and zeroes the
 * high lane.  Result is in the low lane of X0.
 */
TEXT dotvec2_sse4(SB), 1, $0
	MOVUPD b+24(FP), X1	/* X1 = first two doubles of b */
	MOVUPD a+0(FP), X0	/* X0 = first two doubles of a */
	DPPD $0x31, X1, X0	/* X0.lo = a[0]*b[0] + a[1]*b[1] */
	RET

/*
 * dotvec2_avx: counterpart of dotvec2_sse4 built from the V* macros
 * supplied by the included headers.
 * Loads 16 bytes from 8(SP) and 16 bytes from 32(SP), runs VDPPD on
 * them and leaves the result in X0.
 * NOTE(review): with a $0 frame these should be the slots that
 * dotvec2_sse addresses as a+0(FP) and b+24(FP) -- confirm.
 * NOTE(review): macro operand order is inferred from the macro names
 * and the inline comment; check avx.h.
 */
TEXT dotvec2_avx(SB), 1, $0
	MOVQ SP, AX
	ADDQ $8, AX			/* AX = SP+8: first operand */
	VMOVUPD_128mr(rAX, rX0)
	ADDQ $24, AX			/* AX = SP+32: second operand */
	VMOVUPD_128mr(rAX, rX1)
	VDPPD(rX1, rX0, rX0)		/* VDPPD $0x31, X1, X0, X0 */
	VZEROUPPER			/* clear upper YMM halves before returning */
	RET

/*
 * dotvec3_sse4: three-component dot product.
 * The first two components (16 bytes at a+0(FP) and at b+32(FP)) are
 * handled by one DPPD; the third (a+16(FP), b+48(FP)) is multiplied
 * with scalar instructions and added on.  Result is in the low lane
 * of X0.
 */
TEXT dotvec3_sse4(SB), 1, $0
	MOVUPD a+0(FP), X0	/* X0 = a[0], a[1] */
	MOVUPD b+32(FP), X1	/* X1 = b[0], b[1] */
	DPPD $0x31, X1, X0	/* X0.lo = a[0]*b[0] + a[1]*b[1] */
	MOVSD a+16(FP), X1	/* X1 is reused: X1.lo = a[2] */
	MULSD b+48(FP), X1	/* X1.lo = a[2]*b[2] */
	ADDSD X1, X0		/* add the third product */
	RET

/*
 * dotvec3_avx: three-component dot product built from the V* macros
 * supplied by the included headers.
 * Loads 16 bytes from 8(SP) and 16 bytes from 40(SP) for VDPPD, then
 * folds in the doubles at a+16(FP) and b+48(FP) with VFMADD231SD.
 * Result is left in X0.
 * NOTE(review): with a $0 frame, 8(SP) and 40(SP) should be the slots
 * dotvec3_sse4 addresses as a+0(FP) and b+32(FP) -- confirm.
 * NOTE(review): macro operand order is inferred from names; check avx.h.
 */
TEXT dotvec3_avx(SB), 1, $0
	MOVQ SP, AX
	ADDQ $8, AX			/* AX = SP+8: first operand */
	VMOVUPD_128mr(rAX, rX0)
	ADDQ $32, AX			/* AX = SP+40: second operand */
	VMOVUPD_128mr(rAX, rX1)
	VDPPD(rX1, rX0, rX0)		/* VDPPD $0x31, X1, X0, X0 */
	MOVSD a+16(FP), X1		/* third component of a */
	MOVSD b+48(FP), X2		/* third component of b */
	VFMADD231SD(rX1, rX2, rX0)	/* presumably X0 += X1*X2 (FMA "231" form) */
	VZEROUPPER			/* clear upper YMM halves before returning */
	RET

/*
 * Pt2b: stores the three 8-byte arguments x, y and w into three
 * consecutive doubles at the address held in BP.
 * NOTE(review): BP presumably carries the pointer to the caller's
 * result storage -- confirm against the C prototype.
 */
TEXT Pt2b(SB), 1, $0
	MOVQ BP, DI		/* DI = destination pointer */
	MOVSD x+8(FP), X0
	MOVSD X0, 0(DI)		/* dst[0] = x */
	MOVSD y+16(FP), X0
	MOVSD X0, 8(DI)		/* dst[1] = y */
	MOVSD w+24(FP), X0
	MOVSD X0, 16(DI)	/* dst[2] = w */
	RET

/*
 * hsubpd: packs a into the low lane and b into the high lane of X0,
 * then subtracts horizontally, leaving a - b in both lanes of X0.
 */
TEXT hsubpd(SB), 1, $0
	MOVHPD b+8(FP), X0	/* X0.hi = b */
	MOVLPD a+0(FP), X0	/* X0.lo = a */
	HSUBPD X0, X0		/* every lane = lo - hi = a - b */
	RET

/*
 * crossvec3_sse: cross product a x b written to the address in BP.
 * a starts at a+8(FP), b at b+40(FP); each supplies three doubles
 * (called x, y, z in the comments below).  Four doubles are written:
 * the three products followed by 0.0.
 * Register comments use [high lane][low lane] order.  HSUBPD R, R
 * leaves (low - high) in the low lane, so each pair is arranged with
 * the positive term in the low lane.
 * NOTE(review): BP presumably carries the pointer to the caller's
 * result storage -- confirm against the C prototype.
 */
TEXT crossvec3_sse(SB), 1, $0
	MOVLPD b+40(FP), X0
	MOVHPD a+8(FP), X0	/* X0 := [a.x][b.x] */
	MOVLPD a+16(FP), X1
	MOVHPD b+48(FP), X1	/* X1 := [b.y][a.y] */
	MOVLPD b+56(FP), X2
	MOVHPD a+24(FP), X2	/* X2 := [a.z][b.z] */
	MOVAPD X1, X3
	MULPD X2, X3		/* X3 := [b.y*a.z][a.y*b.z] */
	HSUBPD X3, X3		/* x = a.y*b.z - a.z*b.y */
	MOVAPD X2, X4
	SHUFPD $0x1, X4, X4	/* swap lanes: X4 := [b.z][a.z] */
	MULPD X0, X4		/* X4 := [a.x*b.z][b.x*a.z] */
	HSUBPD X4, X4		/* y = a.z*b.x - a.x*b.z */
	MOVAPD X0, X5
	MULPD X1, X5		/* X5 := [a.x*b.y][b.x*a.y] */
	SHUFPD $0x1, X5, X5	/* swap lanes: X5 := [b.x*a.y][a.x*b.y] */
	HSUBPD X5, X5		/* z = a.x*b.y - a.y*b.x */
	MOVQ BP, DI		/* DI = destination pointer */
	MOVSD X3, 0(DI)
	MOVSD X4, 8(DI)
	MOVSD X5, 16(DI)
	XORPD X0, X0		/* X0 = 0.0 */
	MOVSD X0, 24(DI)	/* fourth double of the result is zero */
	RET

/*
 * crossvec3_avx: UNFINISHED stub for an AVX cross product.
 * Only the start of the argument-pointer setup has been written; no
 * result is computed or stored yet, so callers must not rely on its
 * output.
 * The explicit RET is required: without it control would run off the
 * end of this routine into whichever routine is assembled next.
 * TODO: implement the AVX body (crossvec3_sse has a working SSE
 * version of the computation).
 */
TEXT crossvec3_avx(SB), 1, $0
	MOVQ SP, AX
	ADDQ $8, AX		/* AX = SP+8 */
	/* TODO: AVX implementation goes here */
	RET

/*
 * fma: scalar fused multiply-add on three 8-byte arguments.
 * Loads a, b and c into X0, X1 and X2 and invokes the VFMADD231SD
 * macro from the included headers; the result is left in X0.
 * NOTE(review): with the "231" form and this operand order the result
 * is presumably a + b*c -- confirm against the macro definition.
 */
TEXT fma(SB), 1, $0
	MOVSD a+0(FP), X0	/* accumulator */
	MOVSD b+8(FP), X1	/* first factor */
	MOVSD c+16(FP), X2	/* second factor */
	VFMADD231SD(rX1, rX2, rX0)
	RET

/*
 * addpt2_sse: component-wise sum of two three-double values.
 * a starts at a+8(FP), b at b+32(FP).  The first two doubles are
 * added with one packed ADDPD, the third with a scalar ADDSD.  The
 * three sums (24 bytes) are written to the address held in BP.
 * NOTE(review): BP presumably carries the pointer to the caller's
 * result storage -- confirm against the C prototype.
 */
TEXT addpt2_sse(SB), 1, $0
	MOVQ BP, DI		/* DI = destination pointer */
	MOVUPD a+8(FP), X0	/* X0 = a[0], a[1] */
	MOVUPD b+32(FP), X1	/* X1 = b[0], b[1] */
	MOVSD a+24(FP), X2	/* X2.lo = a[2] */
	ADDPD X1, X0		/* first two sums */
	ADDSD b+48(FP), X2	/* third sum */
	MOVUPD X0, 0(DI)
	MOVSD X2, 16(DI)
	RET

/*
 * addpt2_avx: component-wise sum of two three-double values, AVX flavour.
 * a starts at a+8(FP) (= 16(SP) with the $0 frame), b at b+32(FP).
 * The three sums are written to the address held in BP.
 *
 * Exactly 24 bytes are read from each operand and exactly 24 bytes are
 * written.  A full 256-bit load/store would read 8 bytes past the end
 * of b and write 8 bytes past the end of the 24-byte result, so the
 * first two doubles go through the 128-bit load macro and the vector
 * add, and the third double is handled with scalar SSE after
 * VZEROUPPER.
 * NOTE(review): BP presumably carries the pointer to the caller's
 * result storage -- confirm against the C prototype.
 */
TEXT addpt2_avx(SB), 1, $0
	MOVQ SP, AX
	ADDQ $16, AX			/* AX = &a */
	VMOVUPD_128mr(rAX, rX0)		/* X0 = a[0], a[1] */
	ADDQ $24, AX			/* AX = &b */
	VMOVUPD_128mr(rAX, rX1)		/* X1 = b[0], b[1] */
	VADDPD_256rr(rX1, rX0, rX0)	/* low two lanes = first two sums; upper lanes unused */
	VZEROUPPER			/* low 128 bits survive; safe to use legacy SSE below */
	MOVSD a+24(FP), X2		/* X2.lo = a[2] */
	ADDSD b+48(FP), X2		/* third sum */
	MOVQ BP, DI			/* DI = destination pointer */
	MOVUPD X0, (DI)			/* 16 bytes: first two sums */
	MOVSD X2, 16(DI)		/* 8 bytes: third sum */
	RET

/*
 * addpt3_avx: component-wise sum of two four-double (32-byte) values
 * using the 256-bit V* macros from the included headers.
 * Loads 32 bytes from 16(SP) and 32 bytes from 48(SP), adds them and
 * stores the 32-byte result at the address held in BP.
 * NOTE(review): with a $0 frame, 16(SP) and 48(SP) should correspond
 * to a+8(FP) and b+40(FP), and BP presumably carries the pointer to
 * the caller's result storage -- confirm against the C prototype.
 * NOTE(review): macro operand order is inferred from names; check avx.h.
 */
TEXT addpt3_avx(SB), 1, $0
	MOVQ SP, AX
	ADDQ $16, AX			/* AX = SP+16: first operand */
	VMOVUPD_256mr(rAX, rX0)
	ADDQ $32, AX			/* AX = SP+48: second operand */
	VMOVUPD_256mr(rAX, rX1)
	VADDPD_256rr(rX1, rX0, rX0)	/* four lane-wise sums */
	MOVQ BP, DI			/* DI = destination pointer */
	VMOVUPD_256rm(rX0, rDI)		/* store all 32 bytes */
	VZEROUPPER			/* clear upper YMM halves before returning */
	RET