From 96ccc85d6a58b7ebfcb69d6d470ff209668f6187 Mon Sep 17 00:00:00 2001 From: rodri Date: Thu, 23 Nov 2023 11:43:50 +0000 Subject: initial SIMD trials of some libgeometry functions. --- dppd.s | 71 ++++++++++++++++++++++++++++++++++++++++ main.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++ min.s | 4 +++ mkfile | 14 ++++++++ nanosec.c | 109 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ sse.h | 50 ++++++++++++++++++++++++++++ 6 files changed, 330 insertions(+) create mode 100644 dppd.s create mode 100644 main.c create mode 100644 min.s create mode 100644 mkfile create mode 100644 nanosec.c create mode 100644 sse.h diff --git a/dppd.s b/dppd.s new file mode 100644 index 0000000..5d5a2e4 --- /dev/null +++ b/dppd.s @@ -0,0 +1,71 @@ +#include "sse.h" + +DATA one(SB)/8,$1.0 +GLOBL one(SB), $8 + +TEXT dppd(SB), 1, $0 + MOVQ SP, AX + MOVLPD(8, rAX, rX0) /* MOVLPD a+0(FP), X0 */ + MOVHPD(16, rAX, rX0) /* MOVHPD a+8(FP), X0 */ + MOVLPD(32, rAX, rX1) /* MOVLPD b+0(FP), X1 */ + MOVHPD(40, rAX, rX1) /* MOVHPD b+8(FP), X1*/ + DPPD(rX1, rX0) /* DPPD $0x31, X1, X0 */ + RET + +TEXT dppd3(SB), 1, $0 + MOVQ SP, AX + MOVLPD(8, rAX, rX0) /* MOVLPD a+0(FP), X0 */ + MOVHPD(16, rAX, rX0) /* MOVHPD a+8(FP), X0 */ + MOVLPD(40, rAX, rX1) /* MOVLPD b+0(FP), X1 */ + MOVHPD(48, rAX, rX1) /* MOVHPD b+8(FP), X1 */ + DPPD(rX1, rX0) /* DPPD $0x31, X1, X0 */ + MOVSD one(SB), X1 + MOVHPD(24, rAX, rX0) /* MOVHPD a+16(FP), X0 */ + MOVHPD(56, rAX, rX1) /* MOVHPD b+16(FP), X1 */ + DPPD(rX1, rX0) /* DPPD $0x31, X1, X0 */ + RET + +TEXT Pt2b(SB), 1, $0 + MOVQ BP, DI + MOVSD x+8(FP), X0 + MOVSD X0, 0(DI) + MOVSD y+16(FP), X0 + MOVSD X0, 8(DI) + MOVSD w+24(FP), X0 + MOVSD X0, 16(DI) + RET + +TEXT hsubpd(SB), 1, $0 + MOVQ SP, AX + MOVLPD(8, rAX, rX0) + MOVHPD(16, rAX, rX0) + HSUBPD(rX0, rX0) + RET + +TEXT xvec3(SB), 1, $0 + MOVQ SP, AX + ADDQ $8, AX + MOVLPD(40, rAX, rX0) + MOVHPD(8, rAX, rX0) + MOVLPD(16, rAX, rX1) + MOVHPD(48, rAX, rX1) + MOVLPD(56, rAX, rX2) + MOVHPD(24, rAX, rX2) + MOVAPD X1, X3 + MULPD X2, X3 + HSUBPD(rX3, rX3) /* x */ + MOVAPD X2, X4 + SHUFPD $0x1, X4, X4 + MULPD X0, X4 + HSUBPD(rX4, rX4) /* y */ + MOVAPD X0, X5 + MULPD X1, X5 + SHUFPD $0x1, X5, X5 + HSUBPD(rX5, rX5) /* z */ + MOVQ BP, DI + MOVSD X3, 0(DI) + MOVSD X4, 8(DI) + MOVSD X5, 16(DI) + XORPD X0, X0 + MOVSD X0, 24(DI) + RET diff --git a/main.c b/main.c new file mode 100644 index 0000000..f30da53 --- /dev/null +++ b/main.c @@ -0,0 +1,82 @@ +#include +#include +#include + +uvlong nanosec(void); +double min(double, double); +double dppd(Point2, Point2); +double dppd3(Point3, Point3); +Point2 Pt2b(double, double, double); +Point3 xvec3(Point3, Point3); +double hsubpd(double, double); + +double +fmin(double a, double b) +{ + return a +#include +#include + +/* + * This code is a mixture of cpuid(1) and the nanosec() found in vmx, + * in order to force the use of nsec(2) in case we are running in a + * virtualized environment where the clock is mis-bhyve-ing. + */ + +typedef struct Res { + ulong ax, bx, cx, dx; +} Res; + +static uchar _cpuid[] = { + 0x5E, /* POP SI (PC) */ + 0x5D, /* POP BP (Res&) */ + 0x58, /* POP AX */ + 0x59, /* POP CX */ + + 0x51, /* PUSH CX */ + 0x50, /* PUSH AX */ + 0x55, /* PUSH BP */ + 0x56, /* PUSH SI */ + + 0x31, 0xDB, /* XOR BX, BX */ + 0x31, 0xD2, /* XOR DX, DX */ + + 0x0F, 0xA2, /* CPUID */ + + 0x89, 0x45, 0x00, /* MOV AX, 0(BP) */ + 0x89, 0x5d, 0x04, /* MOV BX, 4(BP) */ + 0x89, 0x4d, 0x08, /* MOV CX, 8(BP) */ + 0x89, 0x55, 0x0C, /* MOV DX, 12(BP) */ + 0xC3, /* RET */ +}; + +static Res (*cpuid)(ulong ax, ulong cx) = (Res(*)(ulong, ulong)) _cpuid; + +/* + * nsec() is wallclock and can be adjusted by timesync + * so need to use cycles() instead, but fall back to + * nsec() in case we can't + */ +uvlong +nanosec(void) +{ + static uvlong fasthz, xstart; + char buf[13], path[128]; + ulong w; + uvlong x, div; + int fd; + Res r; + + if(fasthz == ~0ULL) + return nsec() - xstart; + + if(fasthz == 0){ + /* first long in a.out header */ + snprint(path, sizeof path, "/proc/%d/text", getpid()); + fd = open(path, OREAD); + if(fd < 0) + goto Wallclock; + if(read(fd, buf, 4) != 4){ + close(fd); + goto Wallclock; + } + close(fd); + + w = ((ulong *) buf)[0]; + + switch(w){ + default: + goto Wallclock; + case 0x978a0000: /* amd64 */ + /* patch out POP BP -> POP AX */ + _cpuid[1] = 0x58; + case 0xeb010000: /* 386 */ + break; + } + segflush(_cpuid, sizeof(_cpuid)); + + r = cpuid(0x40000000, 0); + ((ulong *) buf)[0] = r.bx; + ((ulong *) buf)[1] = r.cx; + ((ulong *) buf)[2] = r.dx; + buf[12] = 0; + + if(strstr(buf, "bhyve") != nil) + goto Wallclock; + + if(_tos->cyclefreq){ + fasthz = _tos->cyclefreq; + cycles(&xstart); + } else { +Wallclock: + fasthz = ~0ULL; + xstart = nsec(); + } + return 0; + } + cycles(&x); + x -= xstart; + + /* this is ugly */ + for(div = 1000000000ULL; x < 0x1999999999999999ULL && div > 1 ; div /= 10ULL, x *= 10ULL); + + return x / (fasthz / div); +} diff --git a/sse.h b/sse.h new file mode 100644 index 0000000..e8d92ea --- /dev/null +++ b/sse.h @@ -0,0 +1,50 @@ +#define rAX 0 +#define rCX 1 +#define rDX 2 +#define rBX 3 +#define rSP 4 +#define rBP 5 +#define rSI 6 +#define rDI 7 + +#define rX0 0 +#define rX1 1 +#define rX2 2 +#define rX3 3 +#define rX4 4 +#define rX5 5 +#define rX6 6 + +#define OP(o, m, ro, rm) WORD $0x0F66; /* op + modr/m byte */ \ + BYTE $(o); \ + BYTE $(((m)<<6)|((ro)<<3)|(rm)) +#define OPi(o, m, ro, rm, i) OP((o), (m), (ro), (rm)); \ + BYTE $(i) +#define OP4(o, m, ro, rm) WORD $0x0F66; \ + WORD $(o); \ + BYTE $(((m)<<6)|((ro)<<3)|(rm)) +#define OP4i(o, m, ro, rm, i) OP4((o), (m), (ro), (rm)); \ + BYTE $(i) + +/* MOVLPD */ +//opcode = 660F12 +//modrm = 01 000 000 [AX → X0] / 01 001 000 [AX → X1] +//disp8 = 8 / 32 +#define MOVLPD(off, s, d) OPi(0x12, 0x1, (d), (s), (off)) + +/* MOVHPD */ +//opcode = 660F16 +//modrm = 01 000 000 [AX → X0] / 01 001 000 [AX → X1] +//disp8 = 16 / 40 +#define MOVHPD(off, s, d) OPi(0x16, 0x1, (d), (s), (off)) + +/* HSUBPD */ +//opcode = 660F7D = 01100110 00001111 01111101 +//modrm = 11 000 000 [X0 → X0] +#define HSUBPD(s, d) OP(0x7D, 0x3, (d), (s)) + +/* DPPD */ +//opcode = 660F3A41 = 01100110 00001111 00111010 01000001 +//modrm = 11 000 001 [X1 → X0] +//imm8 = 0011 0001 +#define DPPD(s, d) OP4i(0x413A, 0x3, (d), (s), 0x31) -- cgit v1.2.3