author     rodri <rgl@antares-labs.eu>   2023-11-25 10:34:41 +0000
committer  rodri <rgl@antares-labs.eu>   2023-11-25 10:34:41 +0000
commit     675aa84403f98776a7d463e1cc5f9bd41cdbab92 (patch)
tree       7c0f2fbb1814c5a9f8975307da8e79b0c0165d89
parent     cc3307440e698d58843a5273519f4988c01937f1 (diff)
download   amd64-simd-675aa84403f98776a7d463e1cc5f9bd41cdbab92.tar.gz
           amd64-simd-675aa84403f98776a7d463e1cc5f9bd41cdbab92.tar.bz2
           amd64-simd-675aa84403f98776a7d463e1cc5f9bd41cdbab92.zip
cleaned things up and improved the organization a bit.
-rw-r--r--   avx.h           20
-rw-r--r--   bench/main.c   179
-rw-r--r--   bench/mkfile    23
-rw-r--r--   dppd.s          30
-rw-r--r--   main.c          94
-rw-r--r--   mkfile           5
-rw-r--r--   nanosec.c      109
-rw-r--r--   sse.h           11
8 files changed, 283 insertions, 188 deletions
diff --git a/avx.h b/avx.h
index d2c6d08..3c7129c 100644
--- a/avx.h
+++ b/avx.h
@@ -45,6 +45,26 @@
#define VMOVAPD_256rr(s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_256,VEX_p_66); \
VOP(0x28, 0x3, (d), (s))
+/* VMOVDQA */
+#define VMOVDQA_128mr(off, s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_66); \
+ VOPi(0x6F, 0x1, (d), (s), (off))
+#define VMOVDQA_128rm(s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_66); \
+ VOP(0x7F, 0x3, (s), (d))
+#define VMOVDQA_256mr(off, s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_256,VEX_p_66); \
+ VOPi(0x6F, 0x1, (d), (s), (off))
+#define VMOVDQA_256rm(s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_256,VEX_p_66); \
+ VOP(0x7F, 0x3, (s), (d))
+
+/* VMOVDQU */
+#define VMOVDQU_128mr(off, s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_F3); \
+ VOPi(0x6F, 0x1, (d), (s), (off))
+#define VMOVDQU_128rm(s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_F3); \
+ VOP(0x7F, 0x3, (s), (d))
+#define VMOVDQU_256mr(off, s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_256,VEX_p_F3); \
+ VOPi(0x6F, 0x1, (d), (s), (off))
+#define VMOVDQU_256rm(s, d) VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_256,VEX_p_F3); \
+ VOP(0x7F, 0x3, (s), (d))
+
/* VADDPD */
#define VADDPD_128mr(off, s0, s1, d) VEX3(0,0,0,VEX_m_0F,0,(s0),VEX_L_128,VEX_p_66); \
VOPi(0x58, 0x1, (d), (s1), (off))
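
For reference, the Intel forms these VMOVDQA/VMOVDQU macros hand-assemble (the Plan 9 assembler has no VEX-encoded mnemonics of its own) are, per the Intel SDM:

	VMOVDQA xmm1, xmm2/m128    VEX.128.66.0F.WIG 6F /r
	VMOVDQA xmm2/m128, xmm1    VEX.128.66.0F.WIG 7F /r
	VMOVDQU xmm1, xmm2/m128    VEX.128.F3.0F.WIG 6F /r
	VMOVDQU xmm2/m128, xmm1    VEX.128.F3.0F.WIG 7F /r

with the 256-bit (ymm) forms differing only in VEX.256. That matches the macro arguments above: VEX_p_66 vs VEX_p_F3 selects the aligned vs unaligned variant, VEX_L_128 vs VEX_L_256 the width, and opcode 6F vs 7F the load vs store direction. The addpt2_avx routine added to dppd.s below is the commit's user of the 256-bit unaligned forms.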
diff --git a/bench/main.c b/bench/main.c
index 9d00719..811c471 100644
--- a/bench/main.c
+++ b/bench/main.c
@@ -2,13 +2,61 @@
#include <libc.h>
#include <thread.h>
#include <geometry.h>
-#include "b.h"
+#include "../bench9/b.h"
-double dppd(Point2, Point2);
-double dppda(Point2, Point2);
-double dppd3(Point3, Point3);
-double dppd3a(Point3, Point3);
-Point3 xvec3(Point3, Point3);
+double min(double, double);
+double dotvec2_sse4(Point2, Point2);
+double dotvec2_avx(Point2, Point2);
+double dotvec3_sse4(Point3, Point3);
+double dotvec3_avx(Point3, Point3);
+Point2 Pt2b(double, double, double);
+Point3 crossvec3_sse(Point3, Point3);
+double hsubpd(double, double);
+double fma(double, double, double);
+Point2 addpt2_avx(Point2, Point2);
+
+double
+fmin(double a, double b)
+{
+ return a<b? a: b;
+}
+
+double
+madd(double a, double b, double c)
+{
+ return a + b*c;
+}
+
+static void
+bmin(int fd)
+{
+ Bgr g;
+ B *b0, *b1;
+ double a, b;
+ int i;
+
+ benchinitgr(&g, "min");
+ b0 = benchadd(&g, "fmin");
+ b1 = benchadd(&g, "fmin_sse");
+
+ while(b0->n > 0 || b1->n > 0){
+ a = truerand()*frand();
+ b = truerand()*frand();
+
+ benchin(b0);
+ for(i = 0; i < 1e6; i++)
+ fmin(a, b);
+ benchout(b0);
+
+ benchin(b1);
+ for(i = 0; i < 1e6; i++)
+ min(a, b);
+ benchout(b1);
+ }
+
+ benchprintgr(&g, fd);
+ benchfreegr(&g);
+}
static void
bdotvec2(int fd)
@@ -20,7 +68,7 @@ bdotvec2(int fd)
benchinitgr(&g, "2d dot product");
b0 = benchadd(&g, "dotvec2");
- b1 = benchadd(&g, "dotvec2_simd");
+ b1 = benchadd(&g, "dotvec2_sse4");
b2 = benchadd(&g, "dotvec2_avx");
while(b0->n > 0 || b1->n > 0){
@@ -34,12 +82,12 @@ bdotvec2(int fd)
benchin(b1);
for(i = 0; i < 1e6; i++)
- dppd(a, b);
+ dotvec2_sse4(a, b);
benchout(b1);
benchin(b2);
for(i = 0; i < 1e6; i++)
- dppda(a, b);
+ dotvec2_avx(a, b);
benchout(b2);
}
@@ -57,7 +105,7 @@ bdotvec3(int fd)
benchinitgr(&g, "3d dot product");
b0 = benchadd(&g, "dotvec3");
- b1 = benchadd(&g, "dotvec3_simd");
+ b1 = benchadd(&g, "dotvec3_sse4");
b2 = benchadd(&g, "dotvec3_avx");
while(b0->n > 0 || b1->n > 0){
@@ -71,12 +119,12 @@ bdotvec3(int fd)
benchin(b1);
for(i = 0; i < 1e6; i++)
- dppd3(a, b);
+ dotvec3_sse4(a, b);
benchout(b1);
benchin(b2);
for(i = 0; i < 1e6; i++)
- dppd3a(a, b);
+ dotvec3_avx(a, b);
benchout(b2);
}
@@ -94,7 +142,7 @@ bcrossvec3(int fd)
benchinitgr(&g, "3d cross product");
b0 = benchadd(&g, "crossvec3");
- b1 = benchadd(&g, "crossvec3_simd");
+ b1 = benchadd(&g, "crossvec3_sse");
while(b0->n > 0 || b1->n > 0){
a = Vec3(truerand()*frand(), truerand()*frand(), truerand()*frand());
@@ -107,7 +155,102 @@ bcrossvec3(int fd)
benchin(b1);
for(i = 0; i < 1e6; i++)
- xvec3(a, b);
+ crossvec3_sse(a, b);
+ benchout(b1);
+ }
+
+ benchprintgr(&g, fd);
+ benchfreegr(&g);
+}
+
+static void
+bPt2(int fd)
+{
+ Bgr g;
+ B *b0, *b1;
+ double x, y, w;
+ int i;
+
+ benchinitgr(&g, "Pt2");
+ b0 = benchadd(&g, "Pt2");
+ b1 = benchadd(&g, "Pt2b");
+
+ while(b0->n > 0 || b1->n > 0){
+ x = truerand()*frand();
+ y = truerand()*frand();
+ w = truerand()*frand();
+
+ benchin(b0);
+ for(i = 0; i < 1e6; i++)
+ Pt2(x, y, w);
+ benchout(b0);
+
+ benchin(b1);
+ for(i = 0; i < 1e6; i++)
+ Pt2b(x, y, w);
+ benchout(b1);
+ }
+
+ benchprintgr(&g, fd);
+ benchfreegr(&g);
+}
+
+static void
+bfma(int fd)
+{
+ Bgr g;
+ B *b0, *b1;
+ double a, b, c;
+ int i;
+
+ benchinitgr(&g, "multiply + add");
+ b0 = benchadd(&g, "madd");
+ b1 = benchadd(&g, "fma_avx");
+
+ while(b0->n > 0 || b1->n > 0){
+ a = truerand()*frand();
+ b = truerand()*frand();
+ c = truerand()*frand();
+
+ benchin(b0);
+ for(i = 0; i < 1e6; i++)
+ madd(a, b, c);
+ benchout(b0);
+
+ benchin(b1);
+ for(i = 0; i < 1e6; i++)
+ fma(a, b, c);
+ benchout(b1);
+ }
+
+ benchprintgr(&g, fd);
+ benchfreegr(&g);
+}
+
+static void
+baddpt2(int fd)
+{
+ Bgr g;
+ B *b0, *b1;
+ Point2 a, b;
+ int i;
+
+ benchinitgr(&g, "2d point sum");
+ b0 = benchadd(&g, "addpt2");
+ b1 = benchadd(&g, "addpt2_avx");
+
+ while(b0->n > 0 || b1->n > 0){
+ a = Pt2(truerand()*frand(), truerand()*frand(), truerand()*frand());
+ b = Pt2(truerand()*frand(), truerand()*frand(), truerand()*frand());
+
+ benchin(b0);
+ for(i = 0; i < 1e6; i++)
+ addpt2(a, b);
+ benchout(b0);
+
+ benchin(b1);
+ for(i = 0; i < 1e6; i++)
+ addpt2_avx(a, b);
benchout(b1);
}
@@ -124,11 +267,19 @@ threadmain(int argc, char **argv)
if(benchwire(0) != 0)
fprint(2, "failed to wire: %r\n");
+ bmin(1);
+ bseparator(1);
bdotvec2(1);
bseparator(1);
bdotvec3(1);
bseparator(1);
bcrossvec3(1);
+ bseparator(1);
+ bPt2(1);
+ bseparator(1);
+ bfma(1);
+ bseparator(1);
+ baddpt2(1);
threadexitsall(nil);
}
diff --git a/bench/mkfile b/bench/mkfile
new file mode 100644
index 0000000..e649008
--- /dev/null
+++ b/bench/mkfile
@@ -0,0 +1,23 @@
+</$objtype/mkfile
+
+TARG=bench9
+BIN=/$objtype/bin
+arch=`{echo __^$objtype^__}
+CFLAGS=$CFLAGS -D$arch -p
+
+HFILES=\
+ ../bench9/b.h\
+ ../regs.h\
+ ../sse.h\
+ ../avx.h\
+
+OFILES=\
+ ../bench9/b.$O\
+ ../bench9/b_$objtype.$O\
+ ../min.$O\
+ ../dppd.$O\
+ main.$O\
+
+default:V: all
+
+</sys/src/cmd/mkone
diff --git a/dppd.s b/dppd.s
index de938b8..b746117 100644
--- a/dppd.s
+++ b/dppd.s
@@ -5,24 +5,21 @@
DATA one(SB)/8,$1.0
GLOBL one(SB), $8
-TEXT dppd(SB), 1, $0
+TEXT dotvec2_sse4(SB), 1, $0
MOVQ SP, AX
- MOVLPD(8, rAX, rX0) /* MOVLPD a+0(FP), X0 */
- MOVHPD(16, rAX, rX0) /* MOVHPD a+8(FP), X0 */
- MOVLPD(32, rAX, rX1) /* MOVLPD b+24(FP), X1 */
- MOVHPD(40, rAX, rX1) /* MOVHPD b+32(FP), X1*/
+ MOVDQU_mr(8, rAX, rX0) /* MOVDQU a+0(FP), X0 */
+ MOVDQU_mr(32, rAX, rX1) /* MOVDQU b+24(FP), X1 */
DPPD(rX1, rX0) /* DPPD $0x31, X1, X0 */
RET
-TEXT dppda(SB), 1, $0
+TEXT dotvec2_avx(SB), 1, $0
MOVQ SP, AX
VMOVUPD_128mr(8, rAX, rX0) /* VMOVUPD a+0(FP), X0 */
VMOVUPD_128mr(32, rAX, rX1) /* VMOVUPD b+24(FP), X1 */
VDPPD(rX1, rX0, rX0) /* VDPPD $0x31, X1, X0, X0 */
- VZEROUPPER
RET
-TEXT dppd3(SB), 1, $0
+TEXT dotvec3_sse4(SB), 1, $0
MOVQ SP, AX
MOVLPD(8, rAX, rX0) /* MOVLPD a+0(FP), X0 */
MOVHPD(16, rAX, rX0) /* MOVHPD a+8(FP), X0 */
@@ -35,7 +32,7 @@ TEXT dppd3(SB), 1, $0
DPPD(rX1, rX0) /* DPPD $0x31, X1, X0 */
RET
-TEXT dppd3a(SB), 1, $0
+TEXT dotvec3_avx(SB), 1, $0
MOVQ SP, AX
VMOVUPD_128mr(8, rAX, rX0) /* VMOVUPD a+0(FP), X0 */
VMOVUPD_128mr(40, rAX, rX1) /* VMOVUPD b+32(FP), X1 */
@@ -43,7 +40,6 @@ TEXT dppd3a(SB), 1, $0
MOVSD a+16(FP), X1
MOVSD b+48(FP), X2
VFMADD231SD(rX1, rX2, rX0)
- VZEROUPPER
RET
TEXT Pt2b(SB), 1, $0
@@ -63,7 +59,7 @@ TEXT hsubpd(SB), 1, $0
HSUBPD(rX0, rX0) /* HSUBPD X0, X0 */
RET
-TEXT xvec3(SB), 1, $0
+TEXT crossvec3_sse(SB), 1, $0
MOVQ SP, AX
ADDQ $8, AX
MOVLPD(40, rAX, rX0) /* MOVLPD b+32(FP), X0 */
@@ -91,7 +87,7 @@ TEXT xvec3(SB), 1, $0
MOVSD X0, 24(DI)
RET
-TEXT xvec3a(SB), 1, $0
+TEXT crossvec3_avx(SB), 1, $0
MOVQ SP, AX
ADDQ $8, AX
@@ -101,5 +97,13 @@ TEXT fma(SB), 1, $0
MOVSD b+8(FP), X1
MOVSD c+16(FP), X2
VFMADD231SD(rX1, rX2, rX0)
- VZEROUPPER
+ RET
+
+TEXT addpt2_avx(SB), 1, $0
+ MOVQ SP, AX
+ ADDQ $8, AX
+ VMOVDQU_256mr(8, rAX, rX0)
+ VMOVDQU_256mr(32, rAX, rX1)
+ VADDPD_256rr(rX1, rX0, rX0)
+ VMOVDQU_256rm(rX0, rAX)
RET
diff --git a/main.c b/main.c
index f14a7ed..274daf4 100644
--- a/main.c
+++ b/main.c
@@ -2,16 +2,16 @@
#include <libc.h>
#include <geometry.h>
-uvlong nanosec(void);
double min(double, double);
-double dppd(Point2, Point2);
-double dppda(Point2, Point2);
-double dppd3(Point3, Point3);
-double dppd3a(Point3, Point3);
+double dotvec2_sse4(Point2, Point2);
+double dotvec2_avx(Point2, Point2);
+double dotvec3_sse4(Point3, Point3);
+double dotvec3_avx(Point3, Point3);
Point2 Pt2b(double, double, double);
-Point3 xvec3(Point3, Point3);
+Point3 crossvec3_sse(Point3, Point3);
double hsubpd(double, double);
double fma(double, double, double);
+Point2 addpt2_avx(Point2, Point2);
double
fmin(double a, double b)
@@ -19,13 +19,18 @@ fmin(double a, double b)
return a<b? a: b;
}
+double
+madd(double a, double b, double c)
+{
+ return a + b*c;
+}
+
void
main(int argc, char *argv[])
{
- uvlong t0, t1;
double a, b, r;
- Point2 p0, p1;
- Point3 p0t, p1t, pr;
+ Point2 p0, p1, pr;
+ Point3 p0t, p1t, prt;
GEOMfmtinstall();
ARGBEGIN{default:sysfatal("shit");}ARGEND
@@ -34,75 +39,62 @@ main(int argc, char *argv[])
a = strtod(argv[0], nil);
b = strtod(argv[1], nil);
- t0 = nanosec();
r = fmin(a, b);
- t1 = nanosec();
- print("fmin(%g, %g) = %g\ttook %lludns\n", a, b, r, t1-t0);
- t0 = nanosec();
+ print("fmin(%g, %g) = %g\n", a, b, r);
r = min(a, b);
- t1 = nanosec();
- print("min(%g, %g) = %g\ttook %lludns\n", a, b, r, t1-t0);
+ print("min(%g, %g) = %g\n", a, b, r);
print("\n");
p0 = Pt2b(a, 1, 1);
p1 = Pt2b(b, 3, 1);
- t0 = nanosec();
- r = dppd(p0, p1);
- t1 = nanosec();
- print("dppd(%v, %v) = %g\ttook %lludns\n", p0, p1, r, t1-t0);
- t0 = nanosec();
+ r = dotvec2_sse4(p0, p1);
+ print("dotvec2_sse4(%v, %v) = %g\n", p0, p1, r);
r = dotvec2(p0, p1);
- t1 = nanosec();
- print("dotvec2(%v, %v) = %g\ttook %lludns\n", p0, p1, r, t1-t0);
- t0 = nanosec();
- r = dppda(p0, p1);
- t1 = nanosec();
- print("dppda(%v, %v) = %g\ttook %lludns\n", p0, p1, r, t1-t0);
+ print("dotvec2(%v, %v) = %g\n", p0, p1, r);
+ r = dotvec2_avx(p0, p1);
+ print("dotvec2_avx(%v, %v) = %g\n", p0, p1, r);
print("\n");
p0t = Pt3(a, 1, 9, 1);
p1t = Pt3(b, 3, 4, 1);
- t0 = nanosec();
- r = dppd3(p0t, p1t);
- t1 = nanosec();
- print("dppd3(%V, %V) = %g\ttook %lludns\n", p0t, p1t, r, t1-t0);
- t0 = nanosec();
+ r = dotvec3_sse4(p0t, p1t);
+ print("dotvec3_sse4(%V, %V) = %g\n", p0t, p1t, r);
r = dotvec3(p0t, p1t);
- t1 = nanosec();
- print("dotvec3(%V, %V) = %g\ttook %lludns\n", p0t, p1t, r, t1-t0);
- t0 = nanosec();
- r = dppd3a(p0t, p1t);
- t1 = nanosec();
- print("dppd3a(%V, %V) = %g\ttook %lludns\n", p0t, p1t, r, t1-t0);
+ print("dotvec3(%V, %V) = %g\n", p0t, p1t, r);
+ r = dotvec3_avx(p0t, p1t);
+ print("dotvec3_avx(%V, %V) = %g\n", p0t, p1t, r);
print("\n");
- t0 = nanosec();
r = hsubpd(a, b);
- t1 = nanosec();
- print("hsubpd(%g, %g) = %g\ttook %lludns\n", a, b, r, t1-t0);
+ print("hsubpd(%g, %g) = %g\n", a, b, r);
print("\n");
p0t = Pt3(a, 1, 9, 1);
p1t = Pt3(b, 3, 4, 1);
- t0 = nanosec();
- pr = xvec3(p0t, p1t);
- t1 = nanosec();
- print("xvec3(%V, %V) = %V\ttook %lludns\n", p0t, p1t, pr, t1-t0);
- t0 = nanosec();
- pr = crossvec3(p0t, p1t);
- t1 = nanosec();
- print("crossvec3(%V, %V) = %V\ttook %lludns\n", p0t, p1t, pr, t1-t0);
+ prt = crossvec3_sse(p0t, p1t);
+ print("crossvec3_sse(%V, %V) = %V\n", p0t, p1t, prt);
+ prt = crossvec3(p0t, p1t);
+ print("crossvec3(%V, %V) = %V\n", p0t, p1t, prt);
print("\n");
- t0 = nanosec();
+ r = madd(a, b, 21);
+ print("madd(%g, %g, 21) = %g\n", a, b, r);
r = fma(a, b, 21);
- t1 = nanosec();
- print("fma(%g, %g, 21) = %g\ttook %lludns\n", a, b, r, t1-t0);
+ print("fma(%g, %g, 21) = %g\n", a, b, r);
+
+ print("\n");
+
+ p0 = Pt2b(a, 1, 1);
+ p1 = Pt2b(b, 3, 1);
+ pr = addpt2(p0, p1);
+ print("addpt2(%v, %v) = %v\n", p0, p1, pr);
+ pr = addpt2_avx(p0, p1);
+ print("addpt2_avx(%v, %v) = %v\n", p0, p1, pr);
exits(nil);
}
diff --git a/mkfile b/mkfile
index 7e94b16..52f9c48 100644
--- a/mkfile
+++ b/mkfile
@@ -6,7 +6,6 @@ OFILES=\
main.$O\
min.$O\
dppd.$O\
- nanosec.$O\
HFILES=\
regs.h\
@@ -14,3 +13,7 @@ HFILES=\
avx.h\
</sys/src/cmd/mkone
+
+pulldeps:VQ:
+ git/clone git://shithub.us/sigrid/bench9 || \
+ git/clone https://git.sr.ht/~ft/bench9
diff --git a/nanosec.c b/nanosec.c
deleted file mode 100644
index f82d47a..0000000
--- a/nanosec.c
+++ /dev/null
@@ -1,109 +0,0 @@
-#include <u.h>
-#include <libc.h>
-#include <tos.h>
-
-/*
- * This code is a mixture of cpuid(1) and the nanosec() found in vmx,
- * in order to force the use of nsec(2) in case we are running in a
- * virtualized environment where the clock is mis-bhyve-ing.
- */
-
-typedef struct Res {
- ulong ax, bx, cx, dx;
-} Res;
-
-static uchar _cpuid[] = {
- 0x5E, /* POP SI (PC) */
- 0x5D, /* POP BP (Res&) */
- 0x58, /* POP AX */
- 0x59, /* POP CX */
-
- 0x51, /* PUSH CX */
- 0x50, /* PUSH AX */
- 0x55, /* PUSH BP */
- 0x56, /* PUSH SI */
-
- 0x31, 0xDB, /* XOR BX, BX */
- 0x31, 0xD2, /* XOR DX, DX */
-
- 0x0F, 0xA2, /* CPUID */
-
- 0x89, 0x45, 0x00, /* MOV AX, 0(BP) */
- 0x89, 0x5d, 0x04, /* MOV BX, 4(BP) */
- 0x89, 0x4d, 0x08, /* MOV CX, 8(BP) */
- 0x89, 0x55, 0x0C, /* MOV DX, 12(BP) */
- 0xC3, /* RET */
-};
-
-static Res (*cpuid)(ulong ax, ulong cx) = (Res(*)(ulong, ulong)) _cpuid;
-
-/*
- * nsec() is wallclock and can be adjusted by timesync
- * so need to use cycles() instead, but fall back to
- * nsec() in case we can't
- */
-uvlong
-nanosec(void)
-{
- static uvlong fasthz, xstart;
- char buf[13], path[128];
- ulong w;
- uvlong x, div;
- int fd;
- Res r;
-
- if(fasthz == ~0ULL)
- return nsec() - xstart;
-
- if(fasthz == 0){
- /* first long in a.out header */
- snprint(path, sizeof path, "/proc/%d/text", getpid());
- fd = open(path, OREAD);
- if(fd < 0)
- goto Wallclock;
- if(read(fd, buf, 4) != 4){
- close(fd);
- goto Wallclock;
- }
- close(fd);
-
- w = ((ulong *) buf)[0];
-
- switch(w){
- default:
- goto Wallclock;
- case 0x978a0000: /* amd64 */
- /* patch out POP BP -> POP AX */
- _cpuid[1] = 0x58;
- case 0xeb010000: /* 386 */
- break;
- }
- segflush(_cpuid, sizeof(_cpuid));
-
- r = cpuid(0x40000000, 0);
- ((ulong *) buf)[0] = r.bx;
- ((ulong *) buf)[1] = r.cx;
- ((ulong *) buf)[2] = r.dx;
- buf[12] = 0;
-
- if(strstr(buf, "bhyve") != nil)
- goto Wallclock;
-
- if(_tos->cyclefreq){
- fasthz = _tos->cyclefreq;
- cycles(&xstart);
- } else {
-Wallclock:
- fasthz = ~0ULL;
- xstart = nsec();
- }
- return 0;
- }
- cycles(&x);
- x -= xstart;
-
- /* this is ugly */
- for(div = 1000000000ULL; x < 0x1999999999999999ULL && div > 1 ; div /= 10ULL, x *= 10ULL);
-
- return x / (fasthz / div);
-}
diff --git a/sse.h b/sse.h
index 968ecd7..cd98f86 100644
--- a/sse.h
+++ b/sse.h
@@ -6,7 +6,18 @@
BYTE $(((m)<<6)|((ro)<<3)|(rm))
#define OP4i(o, m, ro, rm, i) OP4((o), (m), (ro), (rm)); \
BYTE $(i)
+#define F3OP(o, m, ro, rm) WORD $0x0FF3; BYTE $(o); \
+ BYTE $(((m)<<6)|((ro)<<3)|(rm))
+#define F3OPi(o, m, ro, rm, i) F3OP((o), (m), (ro), (rm)); \
+ BYTE $(i)
+
+/* MOVDQA */
+#define MOVDQA_mr(off, s, d) OPi(0x6F, 0x1, (d), (s), (off))
+#define MOVDQA_rm(off, s, d) OPi(0x7F, 0x1, (s), (d), (off))
+/* MOVDQU */
+#define MOVDQU_mr(off, s, d) F3OPi(0x6F, 0x1, (d), (s), (off))
+#define MOVDQU_rm(off, s, d) F3OPi(0x7F, 0x1, (s), (d), (off))
/* MOVLPD */
//opcode = 660F12