diff options
-rw-r--r-- | dppd.s | 71 | ||||
-rw-r--r-- | main.c | 82 | ||||
-rw-r--r-- | min.s | 4 | ||||
-rw-r--r-- | mkfile | 14 | ||||
-rw-r--r-- | nanosec.c | 109 | ||||
-rw-r--r-- | sse.h | 50 |
6 files changed, 330 insertions, 0 deletions
@@ -0,0 +1,71 @@ +#include "sse.h" + +DATA one(SB)/8,$1.0 +GLOBL one(SB), $8 + +TEXT dppd(SB), 1, $0 + MOVQ SP, AX + MOVLPD(8, rAX, rX0) /* MOVLPD a+0(FP), X0 */ + MOVHPD(16, rAX, rX0) /* MOVHPD a+8(FP), X0 */ + MOVLPD(32, rAX, rX1) /* MOVLPD b+0(FP), X1 */ + MOVHPD(40, rAX, rX1) /* MOVHPD b+8(FP), X1*/ + DPPD(rX1, rX0) /* DPPD $0x31, X1, X0 */ + RET + +TEXT dppd3(SB), 1, $0 + MOVQ SP, AX + MOVLPD(8, rAX, rX0) /* MOVLPD a+0(FP), X0 */ + MOVHPD(16, rAX, rX0) /* MOVHPD a+8(FP), X0 */ + MOVLPD(40, rAX, rX1) /* MOVLPD b+0(FP), X1 */ + MOVHPD(48, rAX, rX1) /* MOVHPD b+8(FP), X1 */ + DPPD(rX1, rX0) /* DPPD $0x31, X1, X0 */ + MOVSD one(SB), X1 + MOVHPD(24, rAX, rX0) /* MOVHPD a+16(FP), X0 */ + MOVHPD(56, rAX, rX1) /* MOVHPD b+16(FP), X1 */ + DPPD(rX1, rX0) /* DPPD $0x31, X1, X0 */ + RET + +TEXT Pt2b(SB), 1, $0 + MOVQ BP, DI + MOVSD x+8(FP), X0 + MOVSD X0, 0(DI) + MOVSD y+16(FP), X0 + MOVSD X0, 8(DI) + MOVSD w+24(FP), X0 + MOVSD X0, 16(DI) + RET + +TEXT hsubpd(SB), 1, $0 + MOVQ SP, AX + MOVLPD(8, rAX, rX0) + MOVHPD(16, rAX, rX0) + HSUBPD(rX0, rX0) + RET + +TEXT xvec3(SB), 1, $0 + MOVQ SP, AX + ADDQ $8, AX + MOVLPD(40, rAX, rX0) + MOVHPD(8, rAX, rX0) + MOVLPD(16, rAX, rX1) + MOVHPD(48, rAX, rX1) + MOVLPD(56, rAX, rX2) + MOVHPD(24, rAX, rX2) + MOVAPD X1, X3 + MULPD X2, X3 + HSUBPD(rX3, rX3) /* x */ + MOVAPD X2, X4 + SHUFPD $0x1, X4, X4 + MULPD X0, X4 + HSUBPD(rX4, rX4) /* y */ + MOVAPD X0, X5 + MULPD X1, X5 + SHUFPD $0x1, X5, X5 + HSUBPD(rX5, rX5) /* z */ + MOVQ BP, DI + MOVSD X3, 0(DI) + MOVSD X4, 8(DI) + MOVSD X5, 16(DI) + XORPD X0, X0 + MOVSD X0, 24(DI) + RET @@ -0,0 +1,82 @@ +#include <u.h> +#include <libc.h> +#include <geometry.h> + +uvlong nanosec(void); +double min(double, double); +double dppd(Point2, Point2); +double dppd3(Point3, Point3); +Point2 Pt2b(double, double, double); +Point3 xvec3(Point3, Point3); +double hsubpd(double, double); + +double +fmin(double a, double b) +{ + return a<b? a: b; +} + +void +main(int argc, char *argv[]) +{ + uvlong t0, t1; + double a, b, r; + Point2 p0, p1; + Point3 p0t, p1t, pr; + + GEOMfmtinstall(); + ARGBEGIN{default:sysfatal("shit");}ARGEND + if(argc != 2) + sysfatal("shit"); + a = strtod(argv[0], nil); + b = strtod(argv[1], nil); + + t0 = nanosec(); + r = fmin(a, b); + t1 = nanosec(); + print("fmin(%g, %g) = %g\ttook %lludns\n", a, b, r, t1-t0); + t0 = nanosec(); + r = min(a, b); + t1 = nanosec(); + print("min(%g, %g) = %g\ttook %lludns\n", a, b, r, t1-t0); + + p0 = Pt2b(a, 1, 1); + p1 = Pt2b(b, 3, 1); + t0 = nanosec(); + r = dppd(p0, p1); + t1 = nanosec(); + print("dppd(%v, %v) = %g\ttook %lludns\n", p0, p1, r, t1-t0); + t0 = nanosec(); + r = dotvec2(p0, p1); + t1 = nanosec(); + print("dotvec2(%v, %v) = %g\ttook %lludns\n", p0, p1, r, t1-t0); + + p0t = Pt3(a, 1, 9, 1); + p1t = Pt3(b, 3, 4, 1); + t0 = nanosec(); + r = dppd3(p0t, p1t); + t1 = nanosec(); + print("dppd3(%V, %V) = %g\ttook %lludns\n", p0t, p1t, r, t1-t0); + t0 = nanosec(); + r = dotvec3(p0t, p1t); + t1 = nanosec(); + print("dotvec3(%V, %V) = %g\ttook %lludns\n", p0t, p1t, r, t1-t0); + + t0 = nanosec(); + r = hsubpd(a, b); + t1 = nanosec(); + print("hsubpd(%g, %g) = %g\ttook %lludns\n", a, b, r, t1-t0); + + p0t = Pt3(a, 1, 9, 1); + p1t = Pt3(b, 3, 4, 1); + t0 = nanosec(); + pr = xvec3(p0t, p1t); + t1 = nanosec(); + print("xvec3(%V, %V) = %V\ttook %lludns\n", p0t, p1t, pr, t1-t0); + t0 = nanosec(); + pr = crossvec3(p0t, p1t); + t1 = nanosec(); + print("crossvec3(%V, %V) = %V\ttook %lludns\n", p0t, p1t, pr, t1-t0); + + exits(nil); +} @@ -0,0 +1,4 @@ +TEXT min(SB), 1, $0 + MOVSD a+0(FP), X0 + MINSD b+8(FP), X0 + RET @@ -0,0 +1,14 @@ +</$objtype/mkfile + +BIN=/$objtype/bin +TARG=stuff +OFILES=\ + main.$O\ + min.$O\ + dppd.$O\ + nanosec.$O\ + +HFILES=\ + sse.h\ + +</sys/src/cmd/mkone diff --git a/nanosec.c b/nanosec.c new file mode 100644 index 0000000..f82d47a --- /dev/null +++ b/nanosec.c @@ -0,0 +1,109 @@ +#include <u.h> +#include <libc.h> +#include <tos.h> + +/* + * This code is a mixture of cpuid(1) and the nanosec() found in vmx, + * in order to force the use of nsec(2) in case we are running in a + * virtualized environment where the clock is mis-bhyve-ing. + */ + +typedef struct Res { + ulong ax, bx, cx, dx; +} Res; + +static uchar _cpuid[] = { + 0x5E, /* POP SI (PC) */ + 0x5D, /* POP BP (Res&) */ + 0x58, /* POP AX */ + 0x59, /* POP CX */ + + 0x51, /* PUSH CX */ + 0x50, /* PUSH AX */ + 0x55, /* PUSH BP */ + 0x56, /* PUSH SI */ + + 0x31, 0xDB, /* XOR BX, BX */ + 0x31, 0xD2, /* XOR DX, DX */ + + 0x0F, 0xA2, /* CPUID */ + + 0x89, 0x45, 0x00, /* MOV AX, 0(BP) */ + 0x89, 0x5d, 0x04, /* MOV BX, 4(BP) */ + 0x89, 0x4d, 0x08, /* MOV CX, 8(BP) */ + 0x89, 0x55, 0x0C, /* MOV DX, 12(BP) */ + 0xC3, /* RET */ +}; + +static Res (*cpuid)(ulong ax, ulong cx) = (Res(*)(ulong, ulong)) _cpuid; + +/* + * nsec() is wallclock and can be adjusted by timesync + * so need to use cycles() instead, but fall back to + * nsec() in case we can't + */ +uvlong +nanosec(void) +{ + static uvlong fasthz, xstart; + char buf[13], path[128]; + ulong w; + uvlong x, div; + int fd; + Res r; + + if(fasthz == ~0ULL) + return nsec() - xstart; + + if(fasthz == 0){ + /* first long in a.out header */ + snprint(path, sizeof path, "/proc/%d/text", getpid()); + fd = open(path, OREAD); + if(fd < 0) + goto Wallclock; + if(read(fd, buf, 4) != 4){ + close(fd); + goto Wallclock; + } + close(fd); + + w = ((ulong *) buf)[0]; + + switch(w){ + default: + goto Wallclock; + case 0x978a0000: /* amd64 */ + /* patch out POP BP -> POP AX */ + _cpuid[1] = 0x58; + case 0xeb010000: /* 386 */ + break; + } + segflush(_cpuid, sizeof(_cpuid)); + + r = cpuid(0x40000000, 0); + ((ulong *) buf)[0] = r.bx; + ((ulong *) buf)[1] = r.cx; + ((ulong *) buf)[2] = r.dx; + buf[12] = 0; + + if(strstr(buf, "bhyve") != nil) + goto Wallclock; + + if(_tos->cyclefreq){ + fasthz = _tos->cyclefreq; + cycles(&xstart); + } else { +Wallclock: + fasthz = ~0ULL; + xstart = nsec(); + } + return 0; + } + cycles(&x); + x -= xstart; + + /* this is ugly */ + for(div = 1000000000ULL; x < 0x1999999999999999ULL && div > 1 ; div /= 10ULL, x *= 10ULL); + + return x / (fasthz / div); +} @@ -0,0 +1,50 @@ +#define rAX 0 +#define rCX 1 +#define rDX 2 +#define rBX 3 +#define rSP 4 +#define rBP 5 +#define rSI 6 +#define rDI 7 + +#define rX0 0 +#define rX1 1 +#define rX2 2 +#define rX3 3 +#define rX4 4 +#define rX5 5 +#define rX6 6 + +#define OP(o, m, ro, rm) WORD $0x0F66; /* op + modr/m byte */ \ + BYTE $(o); \ + BYTE $(((m)<<6)|((ro)<<3)|(rm)) +#define OPi(o, m, ro, rm, i) OP((o), (m), (ro), (rm)); \ + BYTE $(i) +#define OP4(o, m, ro, rm) WORD $0x0F66; \ + WORD $(o); \ + BYTE $(((m)<<6)|((ro)<<3)|(rm)) +#define OP4i(o, m, ro, rm, i) OP4((o), (m), (ro), (rm)); \ + BYTE $(i) + +/* MOVLPD */ +//opcode = 660F12 +//modrm = 01 000 000 [AX → X0] / 01 001 000 [AX → X1] +//disp8 = 8 / 32 +#define MOVLPD(off, s, d) OPi(0x12, 0x1, (d), (s), (off)) + +/* MOVHPD */ +//opcode = 660F16 +//modrm = 01 000 000 [AX → X0] / 01 001 000 [AX → X1] +//disp8 = 16 / 40 +#define MOVHPD(off, s, d) OPi(0x16, 0x1, (d), (s), (off)) + +/* HSUBPD */ +//opcode = 660F7D = 01100110 00001111 01111101 +//modrm = 11 000 000 [X0 → X0] +#define HSUBPD(s, d) OP(0x7D, 0x3, (d), (s)) + +/* DPPD */ +//opcode = 660F3A41 = 01100110 00001111 00111010 01000001 +//modrm = 11 000 001 [X1 → X0] +//imm8 = 0011 0001 +#define DPPD(s, d) OP4i(0x413A, 0x3, (d), (s), 0x31) |