From 2156d4d5ee57ba4f4e0e7f8d0bd0b55c21d3d0ad Mon Sep 17 00:00:00 2001 From: kk Date: Thu, 20 Mar 2025 00:14:33 +0800 Subject: [PATCH 1/3] remove using namepace --- README.md | 2 +- src/benchmark.zig | 2 +- src/main.zig | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 5c229b7..b5d1a5c 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ pub fn build(b: *std.Build) void { Now in your code you may import and use zmath: ```zig -const zm = @import("zmath"); +const zm = @import("zmath").zmath; pub fn main() !void { // diff --git a/src/benchmark.zig b/src/benchmark.zig index eb1aa8a..0907a4b 100644 --- a/src/benchmark.zig +++ b/src/benchmark.zig @@ -65,7 +65,7 @@ pub fn main() !void { const std = @import("std"); const time = std.time; const Timer = time.Timer; -const zm = @import("zmath"); +const zm = @import("zmath").zmath; var prng = std.Random.DefaultPrng.init(0); const random = prng.random(); diff --git a/src/main.zig b/src/main.zig index 3158a6c..a3458c1 100644 --- a/src/main.zig +++ b/src/main.zig @@ -7,10 +7,11 @@ // See util.zig for additional functionality. // //-------------------------------------------------------------------------------------------------- -pub usingnamespace @import("zmath.zig"); +pub const zmath = @import("zmath.zig"); pub const util = @import("util.zig"); // ensure transitive closure of test coverage comptime { _ = util; + _ = zmath; } From 48854200f74f5392bf10b3e84760b8394289bf1c Mon Sep 17 00:00:00 2001 From: kk Date: Mon, 24 Mar 2025 13:49:43 +0800 Subject: [PATCH 2/3] Move contetns of zmath to main.zig --- README.md | 2 +- src/benchmark.zig | 2 +- src/main.zig | 4574 ++++++++++++++++++++++++++++++++++++++++++++- src/util.zig | 2 +- src/zmath.zig | 4571 -------------------------------------------- 5 files changed, 4575 insertions(+), 4576 deletions(-) delete mode 100644 src/zmath.zig diff --git a/README.md b/README.md index b5d1a5c..5c229b7 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ pub fn build(b: *std.Build) void { Now in your code you may import and use zmath: ```zig -const zm = @import("zmath").zmath; +const zm = @import("zmath"); pub fn main() !void { // diff --git a/src/benchmark.zig b/src/benchmark.zig index 0907a4b..eb1aa8a 100644 --- a/src/benchmark.zig +++ b/src/benchmark.zig @@ -65,7 +65,7 @@ pub fn main() !void { const std = @import("std"); const time = std.time; const Timer = time.Timer; -const zm = @import("zmath").zmath; +const zm = @import("zmath"); var prng = std.Random.DefaultPrng.init(0); const random = prng.random(); diff --git a/src/main.zig b/src/main.zig index a3458c1..fa5a5cd 100644 --- a/src/main.zig +++ b/src/main.zig @@ -7,11 +7,4581 @@ // See util.zig for additional functionality. // //-------------------------------------------------------------------------------------------------- -pub const zmath = @import("zmath.zig"); pub const util = @import("util.zig"); // ensure transitive closure of test coverage comptime { _ = util; - _ = zmath; } + +// ============================================================================== +// +// SIMD math library for game developers +// https://github.com/michal-z/zig-gamedev/tree/main/libs/zmath +// +// Should work on all OSes supported by Zig. Works on x86_64 and ARM. +// Provides ~140 optimized routines and ~70 extensive tests. +// Can be used with any graphics API. +// +// zmath uses row-major matrices, row vectors (each row vector is stored in a SIMD register). +// Handedness is determined by which function version is used (Rh vs. Lh), +// otherwise the function works with either left-handed or right-handed view coordinates. +// +// const va = f32x4(1.0, 2.0, 3.0, 1.0); +// const vb = f32x4(-1.0, 1.0, -1.0, 1.0); +// const v0 = va + vb - f32x4(0.0, 1.0, 0.0, 1.0) * f32x4s(3.0); +// const v1 = cross3(va, vb) + f32x4(1.0, 1.0, 1.0, 1.0); +// const v2 = va + dot3(va, vb) / v1; // dotN() returns scalar replicated on all vector components +// +// const m = rotationX(math.pi * 0.25); +// const v = f32x4(...); +// const v0 = mul(v, m); // 'v' treated as a row vector +// const v1 = mul(m, v); // 'v' treated as a column vector +// const f = m[row][column]; +// +// const b = va < vb; +// if (all(b, 0)) { ... } // '0' means check all vector components; if all are 'true' +// if (all(b, 3)) { ... } // '3' means check first three vector components; if all first three are 'true' +// if (any(b, 0)) { ... } // '0' means check all vector components; if any is 'true' +// if (any(b, 3)) { ... } // '3' means check first three vector components; if any from first three is 'true' +// +// var v4 = load(mem[0..], F32x4, 0); +// var v8 = load(mem[100..], F32x8, 0); +// var v16 = load(mem[200..], F32x16, 0); +// +// var camera_position = [3]f32{ 1.0, 2.0, 3.0 }; +// var cam_pos = loadArr3(camera_position); +// ... +// storeArr3(&camera_position, cam_pos); +// +// v4 = sin(v4); // SIMDx4 +// v8 = cos(v8); // .x86_64 -> 2 x SIMDx4, .x86_64+avx+fma -> SIMDx8 +// v16 = atan(v16); // .x86_64 -> 4 x SIMDx4, .x86_64+avx+fma -> 2 x SIMDx8, .x86_64+avx512f -> SIMDx16 +// +// store(mem[0..], v4, 0); +// store(mem[100..], v8, 0); +// store(mem[200..], v16, 0); +// +// ------------------------------------------------------------------------------ +// 1. Initialization functions +// ------------------------------------------------------------------------------ +// +// f32x4(e0: f32, e1: f32, e2: f32, e3: f32) F32x4 +// f32x8(e0: f32, e1: f32, e2: f32, e3: f32, e4: f32, e5: f32, e6: f32, e7: f32) F32x8 +// f32x16(e0: f32, e1: f32, e2: f32, e3: f32, e4: f32, e5: f32, e6: f32, e7: f32, +// e8: f32, e9: f32, ea: f32, eb: f32, ec: f32, ed: f32, ee: f32, ef: f32) F32x16 +// +// f32x4s(e0: f32) F32x4 +// f32x8s(e0: f32) F32x8 +// f32x16s(e0: f32) F32x16 +// +// boolx4(e0: bool, e1: bool, e2: bool, e3: bool) Boolx4 +// boolx8(e0: bool, e1: bool, e2: bool, e3: bool, e4: bool, e5: bool, e6: bool, e7: bool) Boolx8 +// boolx16(e0: bool, e1: bool, e2: bool, e3: bool, e4: bool, e5: bool, e6: bool, e7: bool, +// e8: bool, e9: bool, ea: bool, eb: bool, ec: bool, ed: bool, ee: bool, ef: bool) Boolx16 +// +// load(mem: []const f32, comptime T: type, comptime len: u32) T +// store(mem: []f32, v: anytype, comptime len: u32) void +// +// loadArr2(arr: [2]f32) F32x4 +// loadArr2zw(arr: [2]f32, z: f32, w: f32) F32x4 +// loadArr3(arr: [3]f32) F32x4 +// loadArr3w(arr: [3]f32, w: f32) F32x4 +// loadArr4(arr: [4]f32) F32x4 +// +// storeArr2(arr: *[2]f32, v: F32x4) void +// storeArr3(arr: *[3]f32, v: F32x4) void +// storeArr4(arr: *[4]f32, v: F32x4) void +// +// arr3Ptr(ptr: anytype) *const [3]f32 +// arrNPtr(ptr: anytype) [*]const f32 +// +// splat(comptime T: type, value: f32) T +// splatInt(comptime T: type, value: u32) T +// +// ------------------------------------------------------------------------------ +// 2. Functions that work on all vector components (F32xN = F32x4 or F32x8 or F32x16) +// ------------------------------------------------------------------------------ +// +// all(vb: anytype, comptime len: u32) bool +// any(vb: anytype, comptime len: u32) bool +// +// isNearEqual(v0: F32xN, v1: F32xN, epsilon: F32xN) BoolxN +// isNan(v: F32xN) BoolxN +// isInf(v: F32xN) BoolxN +// isInBounds(v: F32xN, bounds: F32xN) BoolxN +// +// andInt(v0: F32xN, v1: F32xN) F32xN +// andNotInt(v0: F32xN, v1: F32xN) F32xN +// orInt(v0: F32xN, v1: F32xN) F32xN +// norInt(v0: F32xN, v1: F32xN) F32xN +// xorInt(v0: F32xN, v1: F32xN) F32xN +// +// minFast(v0: F32xN, v1: F32xN) F32xN +// maxFast(v0: F32xN, v1: F32xN) F32xN +// min(v0: F32xN, v1: F32xN) F32xN +// max(v0: F32xN, v1: F32xN) F32xN +// round(v: F32xN) F32xN +// floor(v: F32xN) F32xN +// trunc(v: F32xN) F32xN +// ceil(v: F32xN) F32xN +// clamp(v0: F32xN, v1: F32xN) F32xN +// clampFast(v0: F32xN, v1: F32xN) F32xN +// saturate(v: F32xN) F32xN +// saturateFast(v: F32xN) F32xN +// lerp(v0: F32xN, v1: F32xN, t: f32) F32xN +// lerpV(v0: F32xN, v1: F32xN, t: F32xN) F32xN +// lerpInverse(v0: F32xN, v1: F32xN, t: f32) F32xN +// lerpInverseV(v0: F32xN, v1: F32xN, t: F32xN) F32xN +// mapLinear(v: F32xN, min1: f32, max1: f32, min2: f32, max2: f32) F32xN +// mapLinearV(v: F32xN, min1: F32xN, max1: F32xN, min2: F32xN, max2: F32xN) F32xN +// sqrt(v: F32xN) F32xN +// abs(v: F32xN) F32xN +// mod(v0: F32xN, v1: F32xN) F32xN +// modAngle(v: F32xN) F32xN +// mulAdd(v0: F32xN, v1: F32xN, v2: F32xN) F32xN +// select(mask: BoolxN, v0: F32xN, v1: F32xN) +// sin(v: F32xN) F32xN +// cos(v: F32xN) F32xN +// sincos(v: F32xN) [2]F32xN +// asin(v: F32xN) F32xN +// acos(v: F32xN) F32xN +// atan(v: F32xN) F32xN +// atan2(vy: F32xN, vx: F32xN) F32xN +// cmulSoa(re0: F32xN, im0: F32xN, re1: F32xN, im1: F32xN) [2]F32xN +// +// ------------------------------------------------------------------------------ +// 3. 2D, 3D, 4D vector functions +// ------------------------------------------------------------------------------ +// +// swizzle(v: Vec, c, c, c, c) Vec (comptime c = .x | .y | .z | .w) +// dot2(v0: Vec, v1: Vec) F32x4 +// dot3(v0: Vec, v1: Vec) F32x4 +// dot4(v0: Vec, v1: Vec) F32x4 +// cross3(v0: Vec, v1: Vec) Vec +// lengthSq2(v: Vec) F32x4 +// lengthSq3(v: Vec) F32x4 +// lengthSq4(v: Vec) F32x4 +// length2(v: Vec) F32x4 +// length3(v: Vec) F32x4 +// length4(v: Vec) F32x4 +// normalize2(v: Vec) Vec +// normalize3(v: Vec) Vec +// normalize4(v: Vec) Vec +// +// vecToArr2(v: Vec) [2]f32 +// vecToArr3(v: Vec) [3]f32 +// vecToArr4(v: Vec) [4]f32 +// +// ------------------------------------------------------------------------------ +// 4. Matrix functions +// ------------------------------------------------------------------------------ +// +// identity() Mat +// mul(m0: Mat, m1: Mat) Mat +// mul(s: f32, m: Mat) Mat +// mul(m: Mat, s: f32) Mat +// mul(v: Vec, m: Mat) Vec +// mul(m: Mat, v: Vec) Vec +// transpose(m: Mat) Mat +// rotationX(angle: f32) Mat +// rotationY(angle: f32) Mat +// rotationZ(angle: f32) Mat +// translation(x: f32, y: f32, z: f32) Mat +// translationV(v: Vec) Mat +// scaling(x: f32, y: f32, z: f32) Mat +// scalingV(v: Vec) Mat +// lookToLh(eyepos: Vec, eyedir: Vec, updir: Vec) Mat +// lookAtLh(eyepos: Vec, focuspos: Vec, updir: Vec) Mat +// lookToRh(eyepos: Vec, eyedir: Vec, updir: Vec) Mat +// lookAtRh(eyepos: Vec, focuspos: Vec, updir: Vec) Mat +// perspectiveFovLh(fovy: f32, aspect: f32, near: f32, far: f32) Mat +// perspectiveFovRh(fovy: f32, aspect: f32, near: f32, far: f32) Mat +// perspectiveFovLhGl(fovy: f32, aspect: f32, near: f32, far: f32) Mat +// perspectiveFovRhGl(fovy: f32, aspect: f32, near: f32, far: f32) Mat +// orthographicLh(w: f32, h: f32, near: f32, far: f32) Mat +// orthographicRh(w: f32, h: f32, near: f32, far: f32) Mat +// orthographicLhGl(w: f32, h: f32, near: f32, far: f32) Mat +// orthographicRhGl(w: f32, h: f32, near: f32, far: f32) Mat +// orthographicOffCenterLh(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat +// orthographicOffCenterRh(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat +// orthographicOffCenterLhGl(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat +// orthographicOffCenterRhGl(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat +// determinant(m: Mat) F32x4 +// inverse(m: Mat) Mat +// inverseDet(m: Mat, det: ?*F32x4) Mat +// matToQuat(m: Mat) Quat +// matFromAxisAngle(axis: Vec, angle: f32) Mat +// matFromNormAxisAngle(axis: Vec, angle: f32) Mat +// matFromQuat(quat: Quat) Mat +// matFromRollPitchYaw(pitch: f32, yaw: f32, roll: f32) Mat +// matFromRollPitchYawV(angles: Vec) Mat +// matFromArr(arr: [16]f32) Mat +// +// loadMat(mem: []const f32) Mat +// loadMat43(mem: []const f32) Mat +// loadMat34(mem: []const f32) Mat +// storeMat(mem: []f32, m: Mat) void +// storeMat43(mem: []f32, m: Mat) void +// storeMat34(mem: []f32, m: Mat) void +// +// matToArr(m: Mat) [16]f32 +// matToArr43(m: Mat) [12]f32 +// matToArr34(m: Mat) [12]f32 +// +// ------------------------------------------------------------------------------ +// 5. Quaternion functions +// ------------------------------------------------------------------------------ +// +// qmul(q0: Quat, q1: Quat) Quat +// qidentity() Quat +// conjugate(quat: Quat) Quat +// inverse(q: Quat) Quat +// rotate(q: Quat, v: Vec) Vec +// slerp(q0: Quat, q1: Quat, t: f32) Quat +// slerpV(q0: Quat, q1: Quat, t: F32x4) Quat +// quatToMat(quat: Quat) Mat +// quatToAxisAngle(quat: Quat, axis: *Vec, angle: *f32) void +// quatFromMat(m: Mat) Quat +// quatFromAxisAngle(axis: Vec, angle: f32) Quat +// quatFromNormAxisAngle(axis: Vec, angle: f32) Quat +// quatFromRollPitchYaw(pitch: f32, yaw: f32, roll: f32) Quat +// quatFromRollPitchYawV(angles: Vec) Quat +// +// ------------------------------------------------------------------------------ +// 6. Color functions +// ------------------------------------------------------------------------------ +// +// adjustSaturation(color: F32x4, saturation: f32) F32x4 +// adjustContrast(color: F32x4, contrast: f32) F32x4 +// rgbToHsl(rgb: F32x4) F32x4 +// hslToRgb(hsl: F32x4) F32x4 +// rgbToHsv(rgb: F32x4) F32x4 +// hsvToRgb(hsv: F32x4) F32x4 +// rgbToSrgb(rgb: F32x4) F32x4 +// srgbToRgb(srgb: F32x4) F32x4 +// +// ------------------------------------------------------------------------------ +// X. Misc functions +// ------------------------------------------------------------------------------ +// +// linePointDistance(linept0: Vec, linept1: Vec, pt: Vec) F32x4 +// sin(v: f32) f32 +// cos(v: f32) f32 +// sincos(v: f32) [2]f32 +// asin(v: f32) f32 +// acos(v: f32) f32 +// +// fftInitUnityTable(unitytable: []F32x4) void +// fft(re: []F32x4, im: []F32x4, unitytable: []const F32x4) void +// ifft(re: []F32x4, im: []const F32x4, unitytable: []const F32x4) void +// +// ============================================================================== + +// Fundamental types +pub const F32x4 = @Vector(4, f32); +pub const F32x8 = @Vector(8, f32); +pub const F32x16 = @Vector(16, f32); +pub const Boolx4 = @Vector(4, bool); +pub const Boolx8 = @Vector(8, bool); +pub const Boolx16 = @Vector(16, bool); + +// "Higher-level" aliases +pub const Vec = F32x4; +pub const Mat = [4]F32x4; +pub const Quat = F32x4; + +const builtin = @import("builtin"); +const std = @import("std"); +const math = std.math; +const assert = std.debug.assert; +const expect = std.testing.expect; + +const cpu_arch = builtin.cpu.arch; +const has_avx = if (cpu_arch == .x86_64) std.Target.x86.featureSetHas(builtin.cpu.features, .avx) else false; +const has_avx512f = if (cpu_arch == .x86_64) std.Target.x86.featureSetHas(builtin.cpu.features, .avx512f) else false; +const has_fma = if (cpu_arch == .x86_64) std.Target.x86.featureSetHas(builtin.cpu.features, .fma) else false; +// ------------------------------------------------------------------------------ +// +// 1. Initialization functions +// +// ------------------------------------------------------------------------------ +pub inline fn f32x4(e0: f32, e1: f32, e2: f32, e3: f32) F32x4 { + return .{ e0, e1, e2, e3 }; +} +pub inline fn f32x8(e0: f32, e1: f32, e2: f32, e3: f32, e4: f32, e5: f32, e6: f32, e7: f32) F32x8 { + return .{ e0, e1, e2, e3, e4, e5, e6, e7 }; +} +// zig fmt: off +pub inline fn f32x16( + e0: f32, e1: f32, e2: f32, e3: f32, e4: f32, e5: f32, e6: f32, e7: f32, + e8: f32, e9: f32, ea: f32, eb: f32, ec: f32, ed: f32, ee: f32, ef: f32) F32x16 { + return .{ e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, ea, eb, ec, ed, ee, ef }; +} +// zig fmt: on + +pub inline fn f32x4s(e0: f32) F32x4 { + return splat(F32x4, e0); +} +pub inline fn f32x8s(e0: f32) F32x8 { + return splat(F32x8, e0); +} +pub inline fn f32x16s(e0: f32) F32x16 { + return splat(F32x16, e0); +} + +pub inline fn boolx4(e0: bool, e1: bool, e2: bool, e3: bool) Boolx4 { + return .{ e0, e1, e2, e3 }; +} +pub inline fn boolx8(e0: bool, e1: bool, e2: bool, e3: bool, e4: bool, e5: bool, e6: bool, e7: bool) Boolx8 { + return .{ e0, e1, e2, e3, e4, e5, e6, e7 }; +} +// zig fmt: off +pub inline fn boolx16( + e0: bool, e1: bool, e2: bool, e3: bool, e4: bool, e5: bool, e6: bool, e7: bool, + e8: bool, e9: bool, ea: bool, eb: bool, ec: bool, ed: bool, ee: bool, ef: bool) Boolx16 { + return .{ e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, ea, eb, ec, ed, ee, ef }; +} +// zig fmt: on + +pub inline fn veclen(comptime T: type) comptime_int { + return @typeInfo(T).vector.len; +} + +pub inline fn splat(comptime T: type, value: f32) T { + return @splat(value); +} +pub inline fn splatInt(comptime T: type, value: u32) T { + return @splat(@bitCast(value)); +} + +pub fn load(mem: []const f32, comptime T: type, comptime len: u32) T { + var v = splat(T, 0.0); + const loop_len = if (len == 0) veclen(T) else len; + comptime var i: u32 = 0; + inline while (i < loop_len) : (i += 1) { + v[i] = mem[i]; + } + return v; +} +test "zmath.load" { + const a = [7]f32{ 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0 }; + var ptr = &a; + var i: u32 = 0; + const v0 = load(a[i..], F32x4, 2); + try expectVecEqual(v0, F32x4{ 1.0, 2.0, 0.0, 0.0 }); + i += 2; + const v1 = load(a[i .. i + 2], F32x4, 2); + try expectVecEqual(v1, F32x4{ 3.0, 4.0, 0.0, 0.0 }); + const v2 = load(a[5..7], F32x4, 2); + try expectVecEqual(v2, F32x4{ 6.0, 7.0, 0.0, 0.0 }); + const v3 = load(ptr[1..], F32x4, 2); + try expectVecEqual(v3, F32x4{ 2.0, 3.0, 0.0, 0.0 }); + i += 1; + const v4 = load(ptr[i .. i + 2], F32x4, 2); + try expectVecEqual(v4, F32x4{ 4.0, 5.0, 0.0, 0.0 }); +} + +pub fn store(mem: []f32, v: anytype, comptime len: u32) void { + const T = @TypeOf(v); + const loop_len = if (len == 0) veclen(T) else len; + comptime var i: u32 = 0; + inline while (i < loop_len) : (i += 1) { + mem[i] = v[i]; + } +} +test "zmath.store" { + var a = [7]f32{ 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0 }; + const v = load(a[1..], F32x4, 3); + store(a[2..], v, 4); + try expect(a[0] == 1.0); + try expect(a[1] == 2.0); + try expect(a[2] == 2.0); + try expect(a[3] == 3.0); + try expect(a[4] == 4.0); + try expect(a[5] == 0.0); +} + +pub inline fn loadArr2(arr: [2]f32) F32x4 { + return f32x4(arr[0], arr[1], 0.0, 0.0); +} +pub inline fn loadArr2zw(arr: [2]f32, z: f32, w: f32) F32x4 { + return f32x4(arr[0], arr[1], z, w); +} +pub inline fn loadArr3(arr: [3]f32) F32x4 { + return f32x4(arr[0], arr[1], arr[2], 0.0); +} +pub inline fn loadArr3w(arr: [3]f32, w: f32) F32x4 { + return f32x4(arr[0], arr[1], arr[2], w); +} +pub inline fn loadArr4(arr: [4]f32) F32x4 { + return f32x4(arr[0], arr[1], arr[2], arr[3]); +} + +pub inline fn storeArr2(arr: *[2]f32, v: F32x4) void { + arr.* = .{ v[0], v[1] }; +} +pub inline fn storeArr3(arr: *[3]f32, v: F32x4) void { + arr.* = .{ v[0], v[1], v[2] }; +} +pub inline fn storeArr4(arr: *[4]f32, v: F32x4) void { + arr.* = .{ v[0], v[1], v[2], v[3] }; +} + +pub inline fn arr3Ptr(ptr: anytype) *const [3]f32 { + comptime assert(@typeInfo(@TypeOf(ptr)) == .pointer); + const T = std.meta.Child(@TypeOf(ptr)); + comptime assert(T == F32x4); + return @as(*const [3]f32, @ptrCast(ptr)); +} + +pub inline fn arrNPtr(ptr: anytype) [*]const f32 { + comptime assert(@typeInfo(@TypeOf(ptr)) == .pointer); + const T = std.meta.Child(@TypeOf(ptr)); + comptime assert(T == Mat or T == F32x4 or T == F32x8 or T == F32x16); + return @as([*]const f32, @ptrCast(ptr)); +} +test "zmath.arrNPtr" { + { + const mat = identity(); + const f32ptr = arrNPtr(&mat); + try expect(f32ptr[0] == 1.0); + try expect(f32ptr[5] == 1.0); + try expect(f32ptr[10] == 1.0); + try expect(f32ptr[15] == 1.0); + } + { + const v8 = f32x8s(1.0); + const f32ptr = arrNPtr(&v8); + try expect(f32ptr[1] == 1.0); + try expect(f32ptr[7] == 1.0); + } +} + +test "zmath.loadArr" { + { + const camera_position = [3]f32{ 1.0, 2.0, 3.0 }; + const simd_reg = loadArr3(camera_position); + try expectVecEqual(simd_reg, f32x4(1.0, 2.0, 3.0, 0.0)); + } + { + const camera_position = [3]f32{ 1.0, 2.0, 3.0 }; + const simd_reg = loadArr3w(camera_position, 1.0); + try expectVecEqual(simd_reg, f32x4(1.0, 2.0, 3.0, 1.0)); + } +} + +pub inline fn vecToArr2(v: Vec) [2]f32 { + return .{ v[0], v[1] }; +} +pub inline fn vecToArr3(v: Vec) [3]f32 { + return .{ v[0], v[1], v[2] }; +} +pub inline fn vecToArr4(v: Vec) [4]f32 { + return .{ v[0], v[1], v[2], v[3] }; +} +// ------------------------------------------------------------------------------ +// +// 2. Functions that work on all vector components (F32xN = F32x4 or F32x8 or F32x16) +// +// ------------------------------------------------------------------------------ +pub fn all(vb: anytype, comptime len: u32) bool { + const T = @TypeOf(vb); + if (len > veclen(T)) { + @compileError("zmath.all(): 'len' is greater than vector len of type " ++ @typeName(T)); + } + const loop_len = if (len == 0) veclen(T) else len; + const ab: [veclen(T)]bool = vb; + comptime var i: u32 = 0; + var result = true; + inline while (i < loop_len) : (i += 1) { + result = result and ab[i]; + } + return result; +} +test "zmath.all" { + try expect(all(boolx8(true, true, true, true, true, false, true, false), 5) == true); + try expect(all(boolx8(true, true, true, true, true, false, true, false), 6) == false); + try expect(all(boolx8(true, true, true, true, false, false, false, false), 4) == true); + try expect(all(boolx4(true, true, true, false), 3) == true); + try expect(all(boolx4(true, true, true, false), 1) == true); + try expect(all(boolx4(true, false, false, false), 1) == true); + try expect(all(boolx4(false, true, false, false), 1) == false); + try expect(all(boolx8(true, true, true, true, true, false, true, false), 0) == false); + try expect(all(boolx4(false, true, false, false), 0) == false); + try expect(all(boolx4(true, true, true, true), 0) == true); +} + +pub fn any(vb: anytype, comptime len: u32) bool { + const T = @TypeOf(vb); + if (len > veclen(T)) { + @compileError("zmath.any(): 'len' is greater than vector len of type " ++ @typeName(T)); + } + const loop_len = if (len == 0) veclen(T) else len; + const ab: [veclen(T)]bool = vb; + comptime var i: u32 = 0; + var result = false; + inline while (i < loop_len) : (i += 1) { + result = result or ab[i]; + } + return result; +} +test "zmath.any" { + try expect(any(boolx8(true, true, true, true, true, false, true, false), 0) == true); + try expect(any(boolx8(false, false, false, true, true, false, true, false), 3) == false); + try expect(any(boolx8(false, false, false, false, false, true, false, false), 4) == false); +} + +pub inline fn isNearEqual( + v0: anytype, + v1: anytype, + epsilon: anytype, +) @Vector(veclen(@TypeOf(v0)), bool) { + const T = @TypeOf(v0, v1, epsilon); + const delta = v0 - v1; + const temp = maxFast(delta, splat(T, 0.0) - delta); + return temp <= epsilon; +} +test "zmath.isNearEqual" { + { + const v0 = f32x4(1.0, 2.0, -3.0, 4.001); + const v1 = f32x4(1.0, 2.1, 3.0, 4.0); + const b = isNearEqual(v0, v1, splat(F32x4, 0.01)); + try expect(@reduce(.And, b == boolx4(true, false, false, true))); + } + { + const v0 = f32x8(1.0, 2.0, -3.0, 4.001, 1.001, 2.3, -0.0, 0.0); + const v1 = f32x8(1.0, 2.1, 3.0, 4.0, -1.001, 2.1, 0.0, 0.0); + const b = isNearEqual(v0, v1, splat(F32x8, 0.01)); + try expect(@reduce(.And, b == boolx8(true, false, false, true, false, false, true, true))); + } + try expect(all(isNearEqual( + splat(F32x4, math.inf(f32)), + splat(F32x4, math.inf(f32)), + splat(F32x4, 0.0001), + ), 0) == false); + try expect(all(isNearEqual( + splat(F32x4, -math.inf(f32)), + splat(F32x4, math.inf(f32)), + splat(F32x4, 0.0001), + ), 0) == false); + try expect(all(isNearEqual( + splat(F32x4, -math.inf(f32)), + splat(F32x4, -math.inf(f32)), + splat(F32x4, 0.0001), + ), 0) == false); + try expect(all(isNearEqual( + splat(F32x4, -math.nan(f32)), + splat(F32x4, math.inf(f32)), + splat(F32x4, 0.0001), + ), 0) == false); +} + +pub inline fn isNan( + v: anytype, +) @Vector(veclen(@TypeOf(v)), bool) { + return v != v; +} +test "zmath.isNan" { + { + const v0 = f32x4(math.inf(f32), math.nan(f32), math.nan(f32), 7.0); + const b = isNan(v0); + try expect(@reduce(.And, b == boolx4(false, true, true, false))); + } + { + const v0 = f32x8(0, math.nan(f32), 0, 0, math.inf(f32), math.nan(f32), math.snan(f32), 7.0); + const b = isNan(v0); + try expect(@reduce(.And, b == boolx8(false, true, false, false, false, true, true, false))); + } +} + +pub inline fn isInf( + v: anytype, +) @Vector(veclen(@TypeOf(v)), bool) { + const T = @TypeOf(v); + return abs(v) == splat(T, math.inf(f32)); +} +test "zmath.isInf" { + { + const v0 = f32x4(math.inf(f32), math.nan(f32), math.snan(f32), 7.0); + const b = isInf(v0); + try expect(@reduce(.And, b == boolx4(true, false, false, false))); + } + { + const v0 = f32x8(0, math.inf(f32), 0, 0, math.inf(f32), math.nan(f32), math.snan(f32), 7.0); + const b = isInf(v0); + try expect(@reduce(.And, b == boolx8(false, true, false, false, true, false, false, false))); + } +} + +pub inline fn isInBounds( + v: anytype, + bounds: anytype, +) @Vector(veclen(@TypeOf(v)), bool) { + const T = @TypeOf(v, bounds); + const Tu = @Vector(veclen(T), u1); + const Tr = @Vector(veclen(T), bool); + + // 2 x cmpleps, xorps, load, andps + const b0 = v <= bounds; + const b1 = (bounds * splat(T, -1.0)) <= v; + const b0u = @as(Tu, @bitCast(b0)); + const b1u = @as(Tu, @bitCast(b1)); + return @as(Tr, @bitCast(b0u & b1u)); +} +test "zmath.isInBounds" { + { + const v0 = f32x4(0.5, -2.0, -1.0, 1.9); + const v1 = f32x4(-1.6, -2.001, -1.0, 1.9); + const bounds = f32x4(1.0, 2.0, 1.0, 2.0); + const b0 = isInBounds(v0, bounds); + const b1 = isInBounds(v1, bounds); + try expect(@reduce(.And, b0 == boolx4(true, true, true, true))); + try expect(@reduce(.And, b1 == boolx4(false, false, true, true))); + } + { + const v0 = f32x8(2.0, 1.0, 2.0, 1.0, 0.5, -2.0, -1.0, 1.9); + const bounds = f32x8(1.0, 1.0, 1.0, math.inf(f32), 1.0, math.nan(f32), 1.0, 2.0); + const b0 = isInBounds(v0, bounds); + try expect(@reduce(.And, b0 == boolx8(false, true, false, true, true, false, true, true))); + } +} + +pub inline fn andInt(v0: anytype, v1: anytype) @TypeOf(v0, v1) { + const T = @TypeOf(v0, v1); + const Tu = @Vector(veclen(T), u32); + const v0u = @as(Tu, @bitCast(v0)); + const v1u = @as(Tu, @bitCast(v1)); + return @as(T, @bitCast(v0u & v1u)); // andps +} +test "zmath.andInt" { + { + const v0 = f32x4(0, @as(f32, @bitCast(~@as(u32, 0))), 0, @as(f32, @bitCast(~@as(u32, 0)))); + const v1 = f32x4(1.0, 2.0, 3.0, math.inf(f32)); + const v = andInt(v0, v1); + try expect(v[3] == math.inf(f32)); + try expectVecEqual(v, f32x4(0.0, 2.0, 0.0, math.inf(f32))); + } + { + const v0 = f32x8(0, 0, 0, 0, 0, @as(f32, @bitCast(~@as(u32, 0))), 0, @as(f32, @bitCast(~@as(u32, 0)))); + const v1 = f32x8(0, 0, 0, 0, 1.0, 2.0, 3.0, math.inf(f32)); + const v = andInt(v0, v1); + try expect(v[7] == math.inf(f32)); + try expectVecEqual(v, f32x8(0, 0, 0, 0, 0.0, 2.0, 0.0, math.inf(f32))); + } +} + +pub inline fn andNotInt(v0: anytype, v1: anytype) @TypeOf(v0, v1) { + const T = @TypeOf(v0, v1); + const Tu = @Vector(veclen(T), u32); + const v0u = @as(Tu, @bitCast(v0)); + const v1u = @as(Tu, @bitCast(v1)); + return @as(T, @bitCast(~v0u & v1u)); // andnps +} +test "zmath.andNotInt" { + { + const v0 = f32x4(1.0, 2.0, 3.0, 4.0); + const v1 = f32x4(0, @as(f32, @bitCast(~@as(u32, 0))), 0, @as(f32, @bitCast(~@as(u32, 0)))); + const v = andNotInt(v1, v0); + try expectVecEqual(v, f32x4(1.0, 0.0, 3.0, 0.0)); + } + { + const v0 = f32x8(0, 0, 0, 0, 1.0, 2.0, 3.0, 4.0); + const v1 = f32x8(0, 0, 0, 0, 0, @as(f32, @bitCast(~@as(u32, 0))), 0, @as(f32, @bitCast(~@as(u32, 0)))); + const v = andNotInt(v1, v0); + try expectVecEqual(v, f32x8(0, 0, 0, 0, 1.0, 0.0, 3.0, 0.0)); + } +} + +pub inline fn orInt(v0: anytype, v1: anytype) @TypeOf(v0, v1) { + const T = @TypeOf(v0, v1); + const Tu = @Vector(veclen(T), u32); + const v0u = @as(Tu, @bitCast(v0)); + const v1u = @as(Tu, @bitCast(v1)); + return @as(T, @bitCast(v0u | v1u)); // orps +} +test "zmath.orInt" { + { + const v0 = f32x4(0, @as(f32, @bitCast(~@as(u32, 0))), 0, 0); + const v1 = f32x4(1.0, 2.0, 3.0, 4.0); + const v = orInt(v0, v1); + try expect(v[0] == 1.0); + try expect(@as(u32, @bitCast(v[1])) == ~@as(u32, 0)); + try expect(v[2] == 3.0); + try expect(v[3] == 4.0); + } + { + const v0 = f32x8(0, 0, 0, 0, 0, @as(f32, @bitCast(~@as(u32, 0))), 0, 0); + const v1 = f32x8(0, 0, 0, 0, 1.0, 2.0, 3.0, 4.0); + const v = orInt(v0, v1); + try expect(v[4] == 1.0); + try expect(@as(u32, @bitCast(v[5])) == ~@as(u32, 0)); + try expect(v[6] == 3.0); + try expect(v[7] == 4.0); + } +} + +pub inline fn norInt(v0: anytype, v1: anytype) @TypeOf(v0, v1) { + const T = @TypeOf(v0, v1); + const Tu = @Vector(veclen(T), u32); + const v0u = @as(Tu, @bitCast(v0)); + const v1u = @as(Tu, @bitCast(v1)); + return @as(T, @bitCast(~(v0u | v1u))); // por, pcmpeqd, pxor +} + +pub inline fn xorInt(v0: anytype, v1: anytype) @TypeOf(v0, v1) { + const T = @TypeOf(v0, v1); + const Tu = @Vector(veclen(T), u32); + const v0u = @as(Tu, @bitCast(v0)); + const v1u = @as(Tu, @bitCast(v1)); + return @as(T, @bitCast(v0u ^ v1u)); // xorps +} +test "zmath.xorInt" { + { + const v0 = f32x4(1.0, @as(f32, @bitCast(~@as(u32, 0))), 0, 0); + const v1 = f32x4(1.0, 0, 0, 0); + const v = xorInt(v0, v1); + try expect(v[0] == 0.0); + try expect(@as(u32, @bitCast(v[1])) == ~@as(u32, 0)); + try expect(v[2] == 0.0); + try expect(v[3] == 0.0); + } + { + const v0 = f32x8(0, 0, 0, 0, 1.0, @as(f32, @bitCast(~@as(u32, 0))), 0, 0); + const v1 = f32x8(0, 0, 0, 0, 1.0, 0, 0, 0); + const v = xorInt(v0, v1); + try expect(v[4] == 0.0); + try expect(@as(u32, @bitCast(v[5])) == ~@as(u32, 0)); + try expect(v[6] == 0.0); + try expect(v[7] == 0.0); + } +} + +pub inline fn minFast(v0: anytype, v1: anytype) @TypeOf(v0, v1) { + return select(v0 < v1, v0, v1); // minps +} +test "zmath.minFast" { + { + const v0 = f32x4(1.0, 3.0, 2.0, 7.0); + const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32)); + const v = minFast(v0, v1); + try expectVecEqual(v, f32x4(1.0, 1.0, 2.0, 7.0)); + } + { + const v0 = f32x4(1.0, math.nan(f32), 5.0, math.snan(f32)); + const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32)); + const v = minFast(v0, v1); + try expect(v[0] == 1.0); + try expect(v[1] == 1.0); + try expect(!math.isNan(v[1])); + try expect(v[2] == 4.0); + try expect(v[3] == math.inf(f32)); + try expect(!math.isNan(v[3])); + } +} + +pub inline fn maxFast(v0: anytype, v1: anytype) @TypeOf(v0, v1) { + return select(v0 > v1, v0, v1); // maxps +} +test "zmath.maxFast" { + { + const v0 = f32x4(1.0, 3.0, 2.0, 7.0); + const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32)); + const v = maxFast(v0, v1); + try expectVecEqual(v, f32x4(2.0, 3.0, 4.0, math.inf(f32))); + } + { + const v0 = f32x4(1.0, math.nan(f32), 5.0, math.snan(f32)); + const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32)); + const v = maxFast(v0, v1); + try expect(v[0] == 2.0); + try expect(v[1] == 1.0); + try expect(v[2] == 5.0); + try expect(v[3] == math.inf(f32)); + try expect(!math.isNan(v[3])); + } +} + +pub inline fn min(v0: anytype, v1: anytype) @TypeOf(v0, v1) { + // This will handle inf & nan + return @min(v0, v1); // minps, cmpunordps, andps, andnps, orps +} +test "zmath.min" { + // Calling math.inf causes test to fail! + if (builtin.target.os.tag == .macos and builtin.target.cpu.arch == .aarch64) return error.SkipZigTest; + { + const v0 = f32x4(1.0, 3.0, 2.0, 7.0); + const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32)); + const v = min(v0, v1); + try expectVecEqual(v, f32x4(1.0, 1.0, 2.0, 7.0)); + } + { + const v0 = f32x8(0, 0, -2.0, 0, 1.0, 3.0, 2.0, 7.0); + const v1 = f32x8(0, 1.0, 0, 0, 2.0, 1.0, 4.0, math.inf(f32)); + const v = min(v0, v1); + try expectVecEqual(v, f32x8(0.0, 0.0, -2.0, 0.0, 1.0, 1.0, 2.0, 7.0)); + } + { + const v0 = f32x4(1.0, math.nan(f32), 5.0, math.snan(f32)); + const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32)); + const v = min(v0, v1); + try expect(v[0] == 1.0); + try expect(v[1] == 1.0); + try expect(!math.isNan(v[1])); + try expect(v[2] == 4.0); + try expect(v[3] == math.inf(f32)); + try expect(!math.isNan(v[3])); + } + + { + const v0 = f32x4(-math.inf(f32), math.inf(f32), math.inf(f32), math.snan(f32)); + const v1 = f32x4(math.snan(f32), -math.inf(f32), math.snan(f32), math.nan(f32)); + const v = min(v0, v1); + try expect(v[0] == -math.inf(f32)); + try expect(v[1] == -math.inf(f32)); + try expect(v[2] == math.inf(f32)); + try expect(!math.isNan(v[2])); + try expect(math.isNan(v[3])); + try expect(!math.isInf(v[3])); + } +} + +pub inline fn max(v0: anytype, v1: anytype) @TypeOf(v0, v1) { + // This will handle inf & nan + return @max(v0, v1); // maxps, cmpunordps, andps, andnps, orps +} +test "zmath.max" { + // Calling math.inf causes test to fail! + if (builtin.target.os.tag == .macos and builtin.target.cpu.arch == .aarch64) return error.SkipZigTest; + { + const v0 = f32x4(1.0, 3.0, 2.0, 7.0); + const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32)); + const v = max(v0, v1); + try expectVecEqual(v, f32x4(2.0, 3.0, 4.0, math.inf(f32))); + } + { + const v0 = f32x8(0, 0, -2.0, 0, 1.0, 3.0, 2.0, 7.0); + const v1 = f32x8(0, 1.0, 0, 0, 2.0, 1.0, 4.0, math.inf(f32)); + const v = max(v0, v1); + try expectVecEqual(v, f32x8(0.0, 1.0, 0.0, 0.0, 2.0, 3.0, 4.0, math.inf(f32))); + } + { + const v0 = f32x4(1.0, math.nan(f32), 5.0, math.snan(f32)); + const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32)); + const v = max(v0, v1); + try expect(v[0] == 2.0); + try expect(v[1] == 1.0); + try expect(v[2] == 5.0); + try expect(v[3] == math.inf(f32)); + try expect(!math.isNan(v[3])); + } + { + const v0 = f32x4(-math.inf(f32), math.inf(f32), math.inf(f32), math.snan(f32)); + const v1 = f32x4(math.snan(f32), -math.inf(f32), math.snan(f32), math.nan(f32)); + const v = max(v0, v1); + try expect(v[0] == -math.inf(f32)); + try expect(v[1] == math.inf(f32)); + try expect(v[2] == math.inf(f32)); + try expect(!math.isNan(v[2])); + try expect(math.isNan(v[3])); + try expect(!math.isInf(v[3])); + } +} + +pub fn round(v: anytype) @TypeOf(v) { + const T = @TypeOf(v); + if (cpu_arch == .x86_64 and has_avx) { + if (T == F32x4) { + return asm ("vroundps $0, %%xmm0, %%xmm0" + : [ret] "={xmm0}" (-> T), + : [v] "{xmm0}" (v), + ); + } else if (T == F32x8) { + return asm ("vroundps $0, %%ymm0, %%ymm0" + : [ret] "={ymm0}" (-> T), + : [v] "{ymm0}" (v), + ); + } else if (T == F32x16 and has_avx512f) { + return asm ("vrndscaleps $0, %%zmm0, %%zmm0" + : [ret] "={zmm0}" (-> T), + : [v] "{zmm0}" (v), + ); + } else if (T == F32x16 and !has_avx512f) { + const arr: [16]f32 = v; + var ymm0 = @as(F32x8, arr[0..8].*); + var ymm1 = @as(F32x8, arr[8..16].*); + ymm0 = asm ("vroundps $0, %%ymm0, %%ymm0" + : [ret] "={ymm0}" (-> F32x8), + : [v] "{ymm0}" (ymm0), + ); + ymm1 = asm ("vroundps $0, %%ymm1, %%ymm1" + : [ret] "={ymm1}" (-> F32x8), + : [v] "{ymm1}" (ymm1), + ); + return @shuffle(f32, ymm0, ymm1, [16]i32{ 0, 1, 2, 3, 4, 5, 6, 7, -1, -2, -3, -4, -5, -6, -7, -8 }); + } + } else { + const sign = andInt(v, splatNegativeZero(T)); + const magic = orInt(splatNoFraction(T), sign); + var r1 = v + magic; + r1 = r1 - magic; + const r2 = abs(v); + const mask = r2 <= splatNoFraction(T); + return select(mask, r1, v); + } +} +test "zmath.round" { + { + try expect(all(round(splat(F32x4, math.inf(f32))) == splat(F32x4, math.inf(f32)), 0)); + try expect(all(round(splat(F32x4, -math.inf(f32))) == splat(F32x4, -math.inf(f32)), 0)); + try expect(all(isNan(round(splat(F32x4, math.nan(f32)))), 0)); + try expect(all(isNan(round(splat(F32x4, -math.nan(f32)))), 0)); + try expect(all(isNan(round(splat(F32x4, math.snan(f32)))), 0)); + try expect(all(isNan(round(splat(F32x4, -math.snan(f32)))), 0)); + } + { + const v = round(f32x16(1.1, -1.1, -1.5, 1.5, 2.1, 2.8, 2.9, 4.1, 5.8, 6.1, 7.9, 8.9, 10.1, 11.2, 12.7, 13.1)); + try expectVecApproxEqAbs( + v, + f32x16(1.0, -1.0, -2.0, 2.0, 2.0, 3.0, 3.0, 4.0, 6.0, 6.0, 8.0, 9.0, 10.0, 11.0, 13.0, 13.0), + 0.0, + ); + } + var v = round(f32x4(1.1, -1.1, -1.5, 1.5)); + try expectVecEqual(v, f32x4(1.0, -1.0, -2.0, 2.0)); + + const v1 = f32x4(-10_000_000.1, -math.inf(f32), 10_000_001.5, math.inf(f32)); + v = round(v1); + try expect(v[3] == math.inf(f32)); + try expectVecEqual(v, f32x4(-10_000_000.1, -math.inf(f32), 10_000_001.5, math.inf(f32))); + + const v2 = f32x4(-math.snan(f32), math.snan(f32), math.nan(f32), -math.inf(f32)); + v = round(v2); + try expect(math.isNan(v2[0])); + try expect(math.isNan(v2[1])); + try expect(math.isNan(v2[2])); + try expect(v2[3] == -math.inf(f32)); + + const v3 = f32x4(1001.5, -201.499, -10000.99, -101.5); + v = round(v3); + try expectVecEqual(v, f32x4(1002.0, -201.0, -10001.0, -102.0)); + + const v4 = f32x4(-1_388_609.9, 1_388_609.5, 1_388_109.01, 2_388_609.5); + v = round(v4); + try expectVecEqual(v, f32x4(-1_388_610.0, 1_388_610.0, 1_388_109.0, 2_388_610.0)); + + var f: f32 = -100.0; + var i: u32 = 0; + while (i < 100) : (i += 1) { + const vr = round(splat(F32x4, f)); + const fr = @round(splat(F32x4, f)); + const vr8 = round(splat(F32x8, f)); + const fr8 = @round(splat(F32x8, f)); + const vr16 = round(splat(F32x16, f)); + const fr16 = @round(splat(F32x16, f)); + try expectVecEqual(vr, fr); + try expectVecEqual(vr8, fr8); + try expectVecEqual(vr16, fr16); + f += 0.12345 * @as(f32, @floatFromInt(i)); + } +} + +pub fn trunc(v: anytype) @TypeOf(v) { + const T = @TypeOf(v); + if (cpu_arch == .x86_64 and has_avx) { + if (T == F32x4) { + return asm ("vroundps $3, %%xmm0, %%xmm0" + : [ret] "={xmm0}" (-> T), + : [v] "{xmm0}" (v), + ); + } else if (T == F32x8) { + return asm ("vroundps $3, %%ymm0, %%ymm0" + : [ret] "={ymm0}" (-> T), + : [v] "{ymm0}" (v), + ); + } else if (T == F32x16 and has_avx512f) { + return asm ("vrndscaleps $3, %%zmm0, %%zmm0" + : [ret] "={zmm0}" (-> T), + : [v] "{zmm0}" (v), + ); + } else if (T == F32x16 and !has_avx512f) { + const arr: [16]f32 = v; + var ymm0 = @as(F32x8, arr[0..8].*); + var ymm1 = @as(F32x8, arr[8..16].*); + ymm0 = asm ("vroundps $3, %%ymm0, %%ymm0" + : [ret] "={ymm0}" (-> F32x8), + : [v] "{ymm0}" (ymm0), + ); + ymm1 = asm ("vroundps $3, %%ymm1, %%ymm1" + : [ret] "={ymm1}" (-> F32x8), + : [v] "{ymm1}" (ymm1), + ); + return @shuffle(f32, ymm0, ymm1, [16]i32{ 0, 1, 2, 3, 4, 5, 6, 7, -1, -2, -3, -4, -5, -6, -7, -8 }); + } + } else { + const mask = abs(v) < splatNoFraction(T); + const result = floatToIntAndBack(v); + return select(mask, result, v); + } +} +test "zmath.trunc" { + { + try expect(all(trunc(splat(F32x4, math.inf(f32))) == splat(F32x4, math.inf(f32)), 0)); + try expect(all(trunc(splat(F32x4, -math.inf(f32))) == splat(F32x4, -math.inf(f32)), 0)); + try expect(all(isNan(trunc(splat(F32x4, math.nan(f32)))), 0)); + try expect(all(isNan(trunc(splat(F32x4, -math.nan(f32)))), 0)); + try expect(all(isNan(trunc(splat(F32x4, math.snan(f32)))), 0)); + try expect(all(isNan(trunc(splat(F32x4, -math.snan(f32)))), 0)); + } + { + const v = trunc(f32x16(1.1, -1.1, -1.5, 1.5, 2.1, 2.8, 2.9, 4.1, 5.8, 6.1, 7.9, 8.9, 10.1, 11.2, 12.7, 13.1)); + try expectVecApproxEqAbs( + v, + f32x16(1.0, -1.0, -1.0, 1.0, 2.0, 2.0, 2.0, 4.0, 5.0, 6.0, 7.0, 8.0, 10.0, 11.0, 12.0, 13.0), + 0.0, + ); + } + var v = trunc(f32x4(1.1, -1.1, -1.5, 1.5)); + try expectVecEqual(v, f32x4(1.0, -1.0, -1.0, 1.0)); + + v = trunc(f32x4(-10_000_002.1, -math.inf(f32), 10_000_001.5, math.inf(f32))); + try expectVecEqual(v, f32x4(-10_000_002.1, -math.inf(f32), 10_000_001.5, math.inf(f32))); + + v = trunc(f32x4(-math.snan(f32), math.snan(f32), math.nan(f32), -math.inf(f32))); + try expect(math.isNan(v[0])); + try expect(math.isNan(v[1])); + try expect(math.isNan(v[2])); + try expect(v[3] == -math.inf(f32)); + + v = trunc(f32x4(1000.5001, -201.499, -10000.99, 100.750001)); + try expectVecEqual(v, f32x4(1000.0, -201.0, -10000.0, 100.0)); + + v = trunc(f32x4(-7_388_609.5, 7_388_609.1, 8_388_109.5, -8_388_509.5)); + try expectVecEqual(v, f32x4(-7_388_609.0, 7_388_609.0, 8_388_109.0, -8_388_509.0)); + + var f: f32 = -100.0; + var i: u32 = 0; + while (i < 100) : (i += 1) { + const vr = trunc(splat(F32x4, f)); + const fr = @trunc(splat(F32x4, f)); + const vr8 = trunc(splat(F32x8, f)); + const fr8 = @trunc(splat(F32x8, f)); + const vr16 = trunc(splat(F32x16, f)); + const fr16 = @trunc(splat(F32x16, f)); + try expectVecEqual(vr, fr); + try expectVecEqual(vr8, fr8); + try expectVecEqual(vr16, fr16); + f += 0.12345 * @as(f32, @floatFromInt(i)); + } +} + +pub fn floor(v: anytype) @TypeOf(v) { + const T = @TypeOf(v); + if (cpu_arch == .x86_64 and has_avx) { + if (T == F32x4) { + return asm ("vroundps $1, %%xmm0, %%xmm0" + : [ret] "={xmm0}" (-> T), + : [v] "{xmm0}" (v), + ); + } else if (T == F32x8) { + return asm ("vroundps $1, %%ymm0, %%ymm0" + : [ret] "={ymm0}" (-> T), + : [v] "{ymm0}" (v), + ); + } else if (T == F32x16 and has_avx512f) { + return asm ("vrndscaleps $1, %%zmm0, %%zmm0" + : [ret] "={zmm0}" (-> T), + : [v] "{zmm0}" (v), + ); + } else if (T == F32x16 and !has_avx512f) { + const arr: [16]f32 = v; + var ymm0 = @as(F32x8, arr[0..8].*); + var ymm1 = @as(F32x8, arr[8..16].*); + ymm0 = asm ("vroundps $1, %%ymm0, %%ymm0" + : [ret] "={ymm0}" (-> F32x8), + : [v] "{ymm0}" (ymm0), + ); + ymm1 = asm ("vroundps $1, %%ymm1, %%ymm1" + : [ret] "={ymm1}" (-> F32x8), + : [v] "{ymm1}" (ymm1), + ); + return @shuffle(f32, ymm0, ymm1, [16]i32{ 0, 1, 2, 3, 4, 5, 6, 7, -1, -2, -3, -4, -5, -6, -7, -8 }); + } + } else { + const mask = abs(v) < splatNoFraction(T); + var result = floatToIntAndBack(v); + const larger_mask = result > v; + const larger = select(larger_mask, splat(T, -1.0), splat(T, 0.0)); + result = result + larger; + return select(mask, result, v); + } +} +test "zmath.floor" { + { + try expect(all(floor(splat(F32x4, math.inf(f32))) == splat(F32x4, math.inf(f32)), 0)); + try expect(all(floor(splat(F32x4, -math.inf(f32))) == splat(F32x4, -math.inf(f32)), 0)); + try expect(all(isNan(floor(splat(F32x4, math.nan(f32)))), 0)); + try expect(all(isNan(floor(splat(F32x4, -math.nan(f32)))), 0)); + try expect(all(isNan(floor(splat(F32x4, math.snan(f32)))), 0)); + try expect(all(isNan(floor(splat(F32x4, -math.snan(f32)))), 0)); + } + { + const v = floor(f32x16(1.1, -1.1, -1.5, 1.5, 2.1, 2.8, 2.9, 4.1, 5.8, 6.1, 7.9, 8.9, 10.1, 11.2, 12.7, 13.1)); + try expectVecApproxEqAbs( + v, + f32x16(1.0, -2.0, -2.0, 1.0, 2.0, 2.0, 2.0, 4.0, 5.0, 6.0, 7.0, 8.0, 10.0, 11.0, 12.0, 13.0), + 0.0, + ); + } + var v = floor(f32x4(1.5, -1.5, -1.7, -2.1)); + try expectVecEqual(v, f32x4(1.0, -2.0, -2.0, -3.0)); + + v = floor(f32x4(-10_000_002.1, -math.inf(f32), 10_000_001.5, math.inf(f32))); + try expectVecEqual(v, f32x4(-10_000_002.1, -math.inf(f32), 10_000_001.5, math.inf(f32))); + + v = floor(f32x4(-math.snan(f32), math.snan(f32), math.nan(f32), -math.inf(f32))); + try expect(math.isNan(v[0])); + try expect(math.isNan(v[1])); + try expect(math.isNan(v[2])); + try expect(v[3] == -math.inf(f32)); + + v = floor(f32x4(1000.5001, -201.499, -10000.99, 100.75001)); + try expectVecEqual(v, f32x4(1000.0, -202.0, -10001.0, 100.0)); + + v = floor(f32x4(-7_388_609.5, 7_388_609.1, 8_388_109.5, -8_388_509.5)); + try expectVecEqual(v, f32x4(-7_388_610.0, 7_388_609.0, 8_388_109.0, -8_388_510.0)); + + var f: f32 = -100.0; + var i: u32 = 0; + while (i < 100) : (i += 1) { + const vr = floor(splat(F32x4, f)); + const fr = @floor(splat(F32x4, f)); + const vr8 = floor(splat(F32x8, f)); + const fr8 = @floor(splat(F32x8, f)); + const vr16 = floor(splat(F32x16, f)); + const fr16 = @floor(splat(F32x16, f)); + try expectVecEqual(vr, fr); + try expectVecEqual(vr8, fr8); + try expectVecEqual(vr16, fr16); + f += 0.12345 * @as(f32, @floatFromInt(i)); + } +} + +pub fn ceil(v: anytype) @TypeOf(v) { + const T = @TypeOf(v); + if (cpu_arch == .x86_64 and has_avx) { + if (T == F32x4) { + return asm ("vroundps $2, %%xmm0, %%xmm0" + : [ret] "={xmm0}" (-> T), + : [v] "{xmm0}" (v), + ); + } else if (T == F32x8) { + return asm ("vroundps $2, %%ymm0, %%ymm0" + : [ret] "={ymm0}" (-> T), + : [v] "{ymm0}" (v), + ); + } else if (T == F32x16 and has_avx512f) { + return asm ("vrndscaleps $2, %%zmm0, %%zmm0" + : [ret] "={zmm0}" (-> T), + : [v] "{zmm0}" (v), + ); + } else if (T == F32x16 and !has_avx512f) { + const arr: [16]f32 = v; + var ymm0 = @as(F32x8, arr[0..8].*); + var ymm1 = @as(F32x8, arr[8..16].*); + ymm0 = asm ("vroundps $2, %%ymm0, %%ymm0" + : [ret] "={ymm0}" (-> F32x8), + : [v] "{ymm0}" (ymm0), + ); + ymm1 = asm ("vroundps $2, %%ymm1, %%ymm1" + : [ret] "={ymm1}" (-> F32x8), + : [v] "{ymm1}" (ymm1), + ); + return @shuffle(f32, ymm0, ymm1, [16]i32{ 0, 1, 2, 3, 4, 5, 6, 7, -1, -2, -3, -4, -5, -6, -7, -8 }); + } + } else { + const mask = abs(v) < splatNoFraction(T); + var result = floatToIntAndBack(v); + const smaller_mask = result < v; + const smaller = select(smaller_mask, splat(T, -1.0), splat(T, 0.0)); + result = result - smaller; + return select(mask, result, v); + } +} +test "zmath.ceil" { + { + try expect(all(ceil(splat(F32x4, math.inf(f32))) == splat(F32x4, math.inf(f32)), 0)); + try expect(all(ceil(splat(F32x4, -math.inf(f32))) == splat(F32x4, -math.inf(f32)), 0)); + try expect(all(isNan(ceil(splat(F32x4, math.nan(f32)))), 0)); + try expect(all(isNan(ceil(splat(F32x4, -math.nan(f32)))), 0)); + try expect(all(isNan(ceil(splat(F32x4, math.snan(f32)))), 0)); + try expect(all(isNan(ceil(splat(F32x4, -math.snan(f32)))), 0)); + } + { + const v = ceil(f32x16(1.1, -1.1, -1.5, 1.5, 2.1, 2.8, 2.9, 4.1, 5.8, 6.1, 7.9, 8.9, 10.1, 11.2, 12.7, 13.1)); + try expectVecApproxEqAbs( + v, + f32x16(2.0, -1.0, -1.0, 2.0, 3.0, 3.0, 3.0, 5.0, 6.0, 7.0, 8.0, 9.0, 11.0, 12.0, 13.0, 14.0), + 0.0, + ); + } + var v = ceil(f32x4(1.5, -1.5, -1.7, -2.1)); + try expectVecEqual(v, f32x4(2.0, -1.0, -1.0, -2.0)); + + v = ceil(f32x4(-10_000_002.1, -math.inf(f32), 10_000_001.5, math.inf(f32))); + try expectVecEqual(v, f32x4(-10_000_002.1, -math.inf(f32), 10_000_001.5, math.inf(f32))); + + v = ceil(f32x4(-math.snan(f32), math.snan(f32), math.nan(f32), -math.inf(f32))); + try expect(math.isNan(v[0])); + try expect(math.isNan(v[1])); + try expect(math.isNan(v[2])); + try expect(v[3] == -math.inf(f32)); + + v = ceil(f32x4(1000.5001, -201.499, -10000.99, 100.75001)); + try expectVecEqual(v, f32x4(1001.0, -201.0, -10000.0, 101.0)); + + v = ceil(f32x4(-1_388_609.5, 1_388_609.1, 1_388_109.9, -1_388_509.9)); + try expectVecEqual(v, f32x4(-1_388_609.0, 1_388_610.0, 1_388_110.0, -1_388_509.0)); + + var f: f32 = -100.0; + var i: u32 = 0; + while (i < 100) : (i += 1) { + const vr = ceil(splat(F32x4, f)); + const fr = @ceil(splat(F32x4, f)); + const vr8 = ceil(splat(F32x8, f)); + const fr8 = @ceil(splat(F32x8, f)); + const vr16 = ceil(splat(F32x16, f)); + const fr16 = @ceil(splat(F32x16, f)); + try expectVecEqual(vr, fr); + try expectVecEqual(vr8, fr8); + try expectVecEqual(vr16, fr16); + f += 0.12345 * @as(f32, @floatFromInt(i)); + } +} + +pub inline fn clamp(v: anytype, vmin: anytype, vmax: anytype) @TypeOf(v, vmin, vmax) { + var result = max(vmin, v); + result = min(vmax, result); + return result; +} +test "zmath.clamp" { + // Calling math.inf causes test to fail! + if (builtin.target.os.tag == .macos and builtin.target.cpu.arch == .aarch64) return error.SkipZigTest; + { + const v0 = f32x4(-1.0, 0.2, 1.1, -0.3); + const v = clamp(v0, splat(F32x4, -0.5), splat(F32x4, 0.5)); + try expectVecApproxEqAbs(v, f32x4(-0.5, 0.2, 0.5, -0.3), 0.0001); + } + { + const v0 = f32x8(-2.0, 0.25, -0.25, 100.0, -1.0, 0.2, 1.1, -0.3); + const v = clamp(v0, splat(F32x8, -0.5), splat(F32x8, 0.5)); + try expectVecApproxEqAbs(v, f32x8(-0.5, 0.25, -0.25, 0.5, -0.5, 0.2, 0.5, -0.3), 0.0001); + } + { + const v0 = f32x4(-math.inf(f32), math.inf(f32), math.nan(f32), math.snan(f32)); + const v = clamp(v0, f32x4(-100.0, 0.0, -100.0, 0.0), f32x4(0.0, 100.0, 0.0, 100.0)); + try expectVecApproxEqAbs(v, f32x4(-100.0, 100.0, -100.0, 0.0), 0.0001); + } + { + const v0 = f32x4(math.inf(f32), math.inf(f32), -math.nan(f32), -math.snan(f32)); + const v = clamp(v0, splat(F32x4, -1.0), splat(F32x4, 1.0)); + try expectVecApproxEqAbs(v, f32x4(1.0, 1.0, -1.0, -1.0), 0.0001); + } +} + +pub inline fn clampFast(v: anytype, vmin: anytype, vmax: anytype) @TypeOf(v, vmin, vmax) { + var result = maxFast(vmin, v); + result = minFast(vmax, result); + return result; +} +test "zmath.clampFast" { + { + const v0 = f32x4(-1.0, 0.2, 1.1, -0.3); + const v = clampFast(v0, splat(F32x4, -0.5), splat(F32x4, 0.5)); + try expectVecApproxEqAbs(v, f32x4(-0.5, 0.2, 0.5, -0.3), 0.0001); + } +} + +pub inline fn saturate(v: anytype) @TypeOf(v) { + const T = @TypeOf(v); + var result = max(v, splat(T, 0.0)); + result = min(result, splat(T, 1.0)); + return result; +} +test "zmath.saturate" { + // Calling math.inf causes test to fail! + if (builtin.target.os.tag == .macos and builtin.target.cpu.arch == .aarch64) return error.SkipZigTest; + { + const v0 = f32x4(-1.0, 0.2, 1.1, -0.3); + const v = saturate(v0); + try expectVecApproxEqAbs(v, f32x4(0.0, 0.2, 1.0, 0.0), 0.0001); + } + { + const v0 = f32x8(0.0, 0.0, 2.0, -2.0, -1.0, 0.2, 1.1, -0.3); + const v = saturate(v0); + try expectVecApproxEqAbs(v, f32x8(0.0, 0.0, 1.0, 0.0, 0.0, 0.2, 1.0, 0.0), 0.0001); + } + { + const v0 = f32x4(-math.inf(f32), math.inf(f32), math.nan(f32), math.snan(f32)); + const v = saturate(v0); + try expectVecApproxEqAbs(v, f32x4(0.0, 1.0, 0.0, 0.0), 0.0001); + } + { + const v0 = f32x4(math.inf(f32), math.inf(f32), -math.nan(f32), -math.snan(f32)); + const v = saturate(v0); + try expectVecApproxEqAbs(v, f32x4(1.0, 1.0, 0.0, 0.0), 0.0001); + } +} + +pub inline fn saturateFast(v: anytype) @TypeOf(v) { + const T = @TypeOf(v); + var result = maxFast(v, splat(T, 0.0)); + result = minFast(result, splat(T, 1.0)); + return result; +} +test "zmath.saturateFast" { + { + const v0 = f32x4(-1.0, 0.2, 1.1, -0.3); + const v = saturateFast(v0); + try expectVecApproxEqAbs(v, f32x4(0.0, 0.2, 1.0, 0.0), 0.0001); + } + { + const v0 = f32x8(0.0, 0.0, 2.0, -2.0, -1.0, 0.2, 1.1, -0.3); + const v = saturateFast(v0); + try expectVecApproxEqAbs(v, f32x8(0.0, 0.0, 1.0, 0.0, 0.0, 0.2, 1.0, 0.0), 0.0001); + } + { + const v0 = f32x4(-math.inf(f32), math.inf(f32), math.nan(f32), math.snan(f32)); + const v = saturateFast(v0); + try expectVecApproxEqAbs(v, f32x4(0.0, 1.0, 0.0, 0.0), 0.0001); + } + { + const v0 = f32x4(math.inf(f32), math.inf(f32), -math.nan(f32), -math.snan(f32)); + const v = saturateFast(v0); + try expectVecApproxEqAbs(v, f32x4(1.0, 1.0, 0.0, 0.0), 0.0001); + } +} + +pub inline fn sqrt(v: anytype) @TypeOf(v) { + return @sqrt(v); // sqrtps +} + +pub inline fn abs(v: anytype) @TypeOf(v) { + return @abs(v); // load, andps +} + +pub inline fn select(mask: anytype, v0: anytype, v1: anytype) @TypeOf(v0, v1) { + return @select(f32, mask, v0, v1); +} + +pub inline fn lerp(v0: anytype, v1: anytype, t: f32) @TypeOf(v0, v1) { + const T = @TypeOf(v0, v1); + return v0 + (v1 - v0) * splat(T, t); // subps, shufps, addps, mulps +} + +pub inline fn lerpV(v0: anytype, v1: anytype, t: anytype) @TypeOf(v0, v1, t) { + return v0 + (v1 - v0) * t; // subps, addps, mulps +} + +pub inline fn lerpInverse(v0: anytype, v1: anytype, t: anytype) @TypeOf(v0, v1) { + const T = @TypeOf(v0, v1); + return (splat(T, t) - v0) / (v1 - v0); +} + +pub inline fn lerpInverseV(v0: anytype, v1: anytype, t: anytype) @TypeOf(v0, v1, t) { + return (t - v0) / (v1 - v0); +} +test "zmath.lerpInverse" { + try expect(math.approxEqAbs(f32, lerpInverseV(10.0, 100.0, 10.0), 0, 0.0005)); + try expect(math.approxEqAbs(f32, lerpInverseV(10.0, 100.0, 100.0), 1, 0.0005)); + try expect(math.approxEqAbs(f32, lerpInverseV(10.0, 100.0, 55.0), 0.5, 0.05)); + try expectVecApproxEqAbs(lerpInverse(f32x4(0, 0, 10, 10), f32x4(100, 200, 100, 100), 10.0), f32x4(0.1, 0.05, 0, 0), 0.0005); +} + +// Frame rate independent lerp (or "damp"), for approaching things over time. +// Reference: https://www.gamedeveloper.com/programming/improved-lerp-smoothing- +pub inline fn lerpOverTime(v0: anytype, v1: anytype, rate: anytype, dt: anytype) @TypeOf(v0, v1) { + const t = std.math.exp2(-rate * dt); + return lerp(v1, v0, t); +} + +pub inline fn lerpVOverTime(v0: anytype, v1: anytype, rate: anytype, dt: anytype) @TypeOf(v0, v1, rate, dt) { + const t = std.math.exp2(-rate * dt); + return lerpV(v1, v0, t); +} + +test "zmath.lerpOverTime" { + try expect(math.approxEqAbs(f32, lerpVOverTime(0.0, 1.0, 1.0, 1.0), 0.5, 0.0005)); + try expect(math.approxEqAbs(f32, lerpVOverTime(0.5, 1.0, 1.0, 1.0), 0.75, 0.0005)); + try expect(math.approxEqAbs(f32, lerpVOverTime(0.0, 1.0, 1.0, 0.0), 0.0, 0.0005)); + try expect(math.approxEqAbs(f32, lerpVOverTime(0.0, 1.0, 1.0, std.math.inf(f32)), 1.0, 0.0005)); + try expectVecApproxEqAbs(lerpOverTime(f32x4(0, 0, 10, 10), f32x4(100, 200, 100, 100), 1.0, 1.0), f32x4(50, 100, 55, 55), 0.0005); +} + +/// To transform a vector of values from one range to another. +pub inline fn mapLinear(v: anytype, min1: anytype, max1: anytype, min2: anytype, max2: anytype) @TypeOf(v) { + const T = @TypeOf(v); + const min1V = splat(T, min1); + const max1V = splat(T, max1); + const min2V = splat(T, min2); + const max2V = splat(T, max2); + const dV = max1V - min1V; + return min2V + (v - min1V) * (max2V - min2V) / dV; +} + +pub inline fn mapLinearV(v: anytype, min1: anytype, max1: anytype, min2: anytype, max2: anytype) @TypeOf(v, min1, max1, min2, max2) { + const d = max1 - min1; + return min2 + (v - min1) * (max2 - min2) / d; +} +test "zmath.mapLinear" { + try expect(math.approxEqAbs(f32, mapLinearV(0, 0, 1.2, 10, 100), 10, 0.0005)); + try expect(math.approxEqAbs(f32, mapLinearV(1.2, 0, 1.2, 10, 100), 100, 0.0005)); + try expect(math.approxEqAbs(f32, mapLinearV(0.6, 0, 1.2, 10, 100), 55, 0.0005)); + try expectVecApproxEqAbs(mapLinearV(splat(F32x4, 0), splat(F32x4, 0), splat(F32x4, 1.2), splat(F32x4, 10), splat(F32x4, 100)), splat(F32x4, 10), 0.0005); + try expectVecApproxEqAbs(mapLinear(f32x4(0, 0, 0.6, 1.2), 0, 1.2, 10, 100), f32x4(10, 10, 55, 100), 0.0005); +} + +pub const F32x4Component = enum { x, y, z, w }; + +pub inline fn swizzle( + v: F32x4, + comptime x: F32x4Component, + comptime y: F32x4Component, + comptime z: F32x4Component, + comptime w: F32x4Component, +) F32x4 { + return @shuffle(f32, v, undefined, [4]i32{ @intFromEnum(x), @intFromEnum(y), @intFromEnum(z), @intFromEnum(w) }); +} + +pub inline fn mod(v0: anytype, v1: anytype) @TypeOf(v0, v1) { + // vdivps, vroundps, vmulps, vsubps + return v0 - v1 * trunc(v0 / v1); +} +test "zmath.mod" { + try expectVecApproxEqAbs(mod(splat(F32x4, 3.1), splat(F32x4, 1.7)), splat(F32x4, 1.4), 0.0005); + try expectVecApproxEqAbs(mod(splat(F32x4, -3.0), splat(F32x4, 2.0)), splat(F32x4, -1.0), 0.0005); + try expectVecApproxEqAbs(mod(splat(F32x4, -3.0), splat(F32x4, -2.0)), splat(F32x4, -1.0), 0.0005); + try expectVecApproxEqAbs(mod(splat(F32x4, 3.0), splat(F32x4, -2.0)), splat(F32x4, 1.0), 0.0005); + try expect(all(isNan(mod(splat(F32x4, math.inf(f32)), splat(F32x4, 1.0))), 0)); + try expect(all(isNan(mod(splat(F32x4, -math.inf(f32)), splat(F32x4, 123.456))), 0)); + try expect(all(isNan(mod(splat(F32x4, math.nan(f32)), splat(F32x4, 123.456))), 0)); + try expect(all(isNan(mod(splat(F32x4, math.snan(f32)), splat(F32x4, 123.456))), 0)); + try expect(all(isNan(mod(splat(F32x4, -math.snan(f32)), splat(F32x4, 123.456))), 0)); + try expect(all(isNan(mod(splat(F32x4, 123.456), splat(F32x4, math.inf(f32)))), 0)); + try expect(all(isNan(mod(splat(F32x4, 123.456), splat(F32x4, -math.inf(f32)))), 0)); + try expect(all(isNan(mod(splat(F32x4, math.inf(f32)), splat(F32x4, math.inf(f32)))), 0)); + try expect(all(isNan(mod(splat(F32x4, 123.456), splat(F32x4, math.nan(f32)))), 0)); + try expect(all(isNan(mod(splat(F32x4, math.inf(f32)), splat(F32x4, math.nan(f32)))), 0)); +} + +pub fn modAngle(v: anytype) @TypeOf(v) { + const T = @TypeOf(v); + return switch (T) { + f32 => modAngle32(v), + F32x4, F32x8, F32x16 => modAngle32xN(v), + else => @compileError("zmath.modAngle() not implemented for " ++ @typeName(T)), + }; +} + +pub inline fn modAngle32xN(v: anytype) @TypeOf(v) { + const T = @TypeOf(v); + return v - splat(T, math.tau) * round(v * splat(T, 1.0 / math.tau)); // 2 x vmulps, 2 x load, vroundps, vaddps +} +test "zmath.modAngle" { + try expectVecApproxEqAbs(modAngle(splat(F32x4, math.tau)), splat(F32x4, 0.0), 0.0005); + try expectVecApproxEqAbs(modAngle(splat(F32x4, 0.0)), splat(F32x4, 0.0), 0.0005); + try expectVecApproxEqAbs(modAngle(splat(F32x4, math.pi)), splat(F32x4, math.pi), 0.0005); + try expectVecApproxEqAbs(modAngle(splat(F32x4, 11 * math.pi)), splat(F32x4, math.pi), 0.0005); + try expectVecApproxEqAbs(modAngle(splat(F32x4, 3.5 * math.pi)), splat(F32x4, -0.5 * math.pi), 0.0005); + try expectVecApproxEqAbs(modAngle(splat(F32x4, 2.5 * math.pi)), splat(F32x4, 0.5 * math.pi), 0.0005); +} + +pub inline fn mulAdd(v0: anytype, v1: anytype, v2: anytype) @TypeOf(v0, v1, v2) { + const T = @TypeOf(v0, v1, v2); + if (@import("zmath_options").enable_cross_platform_determinism) { + return v0 * v1 + v2; // Compiler will generate mul, add sequence (no fma even if the target supports it). + } else { + if (cpu_arch == .x86_64 and has_avx and has_fma) { + return @mulAdd(T, v0, v1, v2); + } else { + // NOTE(mziulek): On .x86_64 without HW fma instructions @mulAdd maps to really slow code! + return v0 * v1 + v2; + } + } +} + +fn sin32xN(v: anytype) @TypeOf(v) { + // 11-degree minimax approximation + const T = @TypeOf(v); + + var x = modAngle(v); + const sign = andInt(x, splatNegativeZero(T)); + const c = orInt(sign, splat(T, math.pi)); + const absx = andNotInt(sign, x); + const rflx = c - x; + const comp = absx <= splat(T, 0.5 * math.pi); + x = select(comp, x, rflx); + const x2 = x * x; + + var result = mulAdd(splat(T, -2.3889859e-08), x2, splat(T, 2.7525562e-06)); + result = mulAdd(result, x2, splat(T, -0.00019840874)); + result = mulAdd(result, x2, splat(T, 0.0083333310)); + result = mulAdd(result, x2, splat(T, -0.16666667)); + result = mulAdd(result, x2, splat(T, 1.0)); + return x * result; +} +test "zmath.sin" { + const epsilon = 0.0001; + + try expectVecApproxEqAbs(sin(splat(F32x4, 0.5 * math.pi)), splat(F32x4, 1.0), epsilon); + try expectVecApproxEqAbs(sin(splat(F32x4, 0.0)), splat(F32x4, 0.0), epsilon); + try expectVecApproxEqAbs(sin(splat(F32x4, -0.0)), splat(F32x4, -0.0), epsilon); + try expectVecApproxEqAbs(sin(splat(F32x4, 89.123)), splat(F32x4, 0.916166), epsilon); + try expectVecApproxEqAbs(sin(splat(F32x8, 89.123)), splat(F32x8, 0.916166), epsilon); + try expectVecApproxEqAbs(sin(splat(F32x16, 89.123)), splat(F32x16, 0.916166), epsilon); + try expect(all(isNan(sin(splat(F32x4, math.inf(f32)))), 0) == true); + try expect(all(isNan(sin(splat(F32x4, -math.inf(f32)))), 0) == true); + try expect(all(isNan(sin(splat(F32x4, math.nan(f32)))), 0) == true); + try expect(all(isNan(sin(splat(F32x4, math.snan(f32)))), 0) == true); + + var f: f32 = -100.0; + var i: u32 = 0; + while (i < 100) : (i += 1) { + const vr = sin(splat(F32x4, f)); + const fr = @sin(splat(F32x4, f)); + const vr8 = sin(splat(F32x8, f)); + const fr8 = @sin(splat(F32x8, f)); + const vr16 = sin(splat(F32x16, f)); + const fr16 = @sin(splat(F32x16, f)); + try expectVecApproxEqAbs(vr, fr, epsilon); + try expectVecApproxEqAbs(vr8, fr8, epsilon); + try expectVecApproxEqAbs(vr16, fr16, epsilon); + f += 0.12345 * @as(f32, @floatFromInt(i)); + } +} + +fn cos32xN(v: anytype) @TypeOf(v) { + // 10-degree minimax approximation + const T = @TypeOf(v); + + var x = modAngle(v); + var sign = andInt(x, splatNegativeZero(T)); + const c = orInt(sign, splat(T, math.pi)); + const absx = andNotInt(sign, x); + const rflx = c - x; + const comp = absx <= splat(T, 0.5 * math.pi); + x = select(comp, x, rflx); + sign = select(comp, splat(T, 1.0), splat(T, -1.0)); + const x2 = x * x; + + var result = mulAdd(splat(T, -2.6051615e-07), x2, splat(T, 2.4760495e-05)); + result = mulAdd(result, x2, splat(T, -0.0013888378)); + result = mulAdd(result, x2, splat(T, 0.041666638)); + result = mulAdd(result, x2, splat(T, -0.5)); + result = mulAdd(result, x2, splat(T, 1.0)); + return sign * result; +} +test "zmath.cos" { + const epsilon = 0.0001; + + try expectVecApproxEqAbs(cos(splat(F32x4, 0.5 * math.pi)), splat(F32x4, 0.0), epsilon); + try expectVecApproxEqAbs(cos(splat(F32x4, 0.0)), splat(F32x4, 1.0), epsilon); + try expectVecApproxEqAbs(cos(splat(F32x4, -0.0)), splat(F32x4, 1.0), epsilon); + try expect(all(isNan(cos(splat(F32x4, math.inf(f32)))), 0) == true); + try expect(all(isNan(cos(splat(F32x4, -math.inf(f32)))), 0) == true); + try expect(all(isNan(cos(splat(F32x4, math.nan(f32)))), 0) == true); + try expect(all(isNan(cos(splat(F32x4, math.snan(f32)))), 0) == true); + + var f: f32 = -100.0; + var i: u32 = 0; + while (i < 100) : (i += 1) { + const vr = cos(splat(F32x4, f)); + const fr = @cos(splat(F32x4, f)); + const vr8 = cos(splat(F32x8, f)); + const fr8 = @cos(splat(F32x8, f)); + const vr16 = cos(splat(F32x16, f)); + const fr16 = @cos(splat(F32x16, f)); + try expectVecApproxEqAbs(vr, fr, epsilon); + try expectVecApproxEqAbs(vr8, fr8, epsilon); + try expectVecApproxEqAbs(vr16, fr16, epsilon); + f += 0.12345 * @as(f32, @floatFromInt(i)); + } +} + +pub fn sin(v: anytype) @TypeOf(v) { + const T = @TypeOf(v); + return switch (T) { + f32 => sin32(v), + F32x4, F32x8, F32x16 => sin32xN(v), + else => @compileError("zmath.sin() not implemented for " ++ @typeName(T)), + }; +} + +pub fn cos(v: anytype) @TypeOf(v) { + const T = @TypeOf(v); + return switch (T) { + f32 => cos32(v), + F32x4, F32x8, F32x16 => cos32xN(v), + else => @compileError("zmath.cos() not implemented for " ++ @typeName(T)), + }; +} + +pub fn sincos(v: anytype) [2]@TypeOf(v) { + const T = @TypeOf(v); + return switch (T) { + f32 => sincos32(v), + F32x4, F32x8, F32x16 => sincos32xN(v), + else => @compileError("zmath.sincos() not implemented for " ++ @typeName(T)), + }; +} + +pub fn asin(v: anytype) @TypeOf(v) { + const T = @TypeOf(v); + return switch (T) { + f32 => asin32(v), + F32x4, F32x8, F32x16 => asin32xN(v), + else => @compileError("zmath.asin() not implemented for " ++ @typeName(T)), + }; +} + +pub fn acos(v: anytype) @TypeOf(v) { + const T = @TypeOf(v); + return switch (T) { + f32 => acos32(v), + F32x4, F32x8, F32x16 => acos32xN(v), + else => @compileError("zmath.acos() not implemented for " ++ @typeName(T)), + }; +} + +fn sincos32xN(v: anytype) [2]@TypeOf(v) { + const T = @TypeOf(v); + + var x = modAngle(v); + var sign = andInt(x, splatNegativeZero(T)); + const c = orInt(sign, splat(T, math.pi)); + const absx = andNotInt(sign, x); + const rflx = c - x; + const comp = absx <= splat(T, 0.5 * math.pi); + x = select(comp, x, rflx); + sign = select(comp, splat(T, 1.0), splat(T, -1.0)); + const x2 = x * x; + + var sresult = mulAdd(splat(T, -2.3889859e-08), x2, splat(T, 2.7525562e-06)); + sresult = mulAdd(sresult, x2, splat(T, -0.00019840874)); + sresult = mulAdd(sresult, x2, splat(T, 0.0083333310)); + sresult = mulAdd(sresult, x2, splat(T, -0.16666667)); + sresult = x * mulAdd(sresult, x2, splat(T, 1.0)); + + var cresult = mulAdd(splat(T, -2.6051615e-07), x2, splat(T, 2.4760495e-05)); + cresult = mulAdd(cresult, x2, splat(T, -0.0013888378)); + cresult = mulAdd(cresult, x2, splat(T, 0.041666638)); + cresult = mulAdd(cresult, x2, splat(T, -0.5)); + cresult = sign * mulAdd(cresult, x2, splat(T, 1.0)); + + return .{ sresult, cresult }; +} +test "zmath.sincos32xN" { + const epsilon = 0.0001; + + var f: f32 = -100.0; + var i: u32 = 0; + while (i < 100) : (i += 1) { + const sc = sincos(splat(F32x4, f)); + const sc8 = sincos(splat(F32x8, f)); + const sc16 = sincos(splat(F32x16, f)); + const s4 = @sin(splat(F32x4, f)); + const s8 = @sin(splat(F32x8, f)); + const s16 = @sin(splat(F32x16, f)); + const c4 = @cos(splat(F32x4, f)); + const c8 = @cos(splat(F32x8, f)); + const c16 = @cos(splat(F32x16, f)); + try expectVecApproxEqAbs(sc[0], s4, epsilon); + try expectVecApproxEqAbs(sc8[0], s8, epsilon); + try expectVecApproxEqAbs(sc16[0], s16, epsilon); + try expectVecApproxEqAbs(sc[1], c4, epsilon); + try expectVecApproxEqAbs(sc8[1], c8, epsilon); + try expectVecApproxEqAbs(sc16[1], c16, epsilon); + f += 0.12345 * @as(f32, @floatFromInt(i)); + } +} + +fn asin32xN(v: anytype) @TypeOf(v) { + // 7-degree minimax approximation + const T = @TypeOf(v); + + const x = abs(v); + const root = sqrt(maxFast(splat(T, 0.0), splat(T, 1.0) - x)); + + var t0 = mulAdd(splat(T, -0.0012624911), x, splat(T, 0.0066700901)); + t0 = mulAdd(t0, x, splat(T, -0.0170881256)); + t0 = mulAdd(t0, x, splat(T, 0.0308918810)); + t0 = mulAdd(t0, x, splat(T, -0.0501743046)); + t0 = mulAdd(t0, x, splat(T, 0.0889789874)); + t0 = mulAdd(t0, x, splat(T, -0.2145988016)); + t0 = root * mulAdd(t0, x, splat(T, 1.5707963050)); + + const t1 = splat(T, math.pi) - t0; + return splat(T, 0.5 * math.pi) - select(v >= splat(T, 0.0), t0, t1); +} + +fn acos32xN(v: anytype) @TypeOf(v) { + // 7-degree minimax approximation + const T = @TypeOf(v); + + const x = abs(v); + const root = sqrt(maxFast(splat(T, 0.0), splat(T, 1.0) - x)); + + var t0 = mulAdd(splat(T, -0.0012624911), x, splat(T, 0.0066700901)); + t0 = mulAdd(t0, x, splat(T, -0.0170881256)); + t0 = mulAdd(t0, x, splat(T, 0.0308918810)); + t0 = mulAdd(t0, x, splat(T, -0.0501743046)); + t0 = mulAdd(t0, x, splat(T, 0.0889789874)); + t0 = mulAdd(t0, x, splat(T, -0.2145988016)); + t0 = root * mulAdd(t0, x, splat(T, 1.5707963050)); + + const t1 = splat(T, math.pi) - t0; + return select(v >= splat(T, 0.0), t0, t1); +} + +pub fn atan(v: anytype) @TypeOf(v) { + // 17-degree minimax approximation + const T = @TypeOf(v); + + const vabs = abs(v); + const vinv = splat(T, 1.0) / v; + var sign = select(v > splat(T, 1.0), splat(T, 1.0), splat(T, -1.0)); + const comp = vabs <= splat(T, 1.0); + sign = select(comp, splat(T, 0.0), sign); + const x = select(comp, v, vinv); + const x2 = x * x; + + var result = mulAdd(splat(T, 0.0028662257), x2, splat(T, -0.0161657367)); + result = mulAdd(result, x2, splat(T, 0.0429096138)); + result = mulAdd(result, x2, splat(T, -0.0752896400)); + result = mulAdd(result, x2, splat(T, 0.1065626393)); + result = mulAdd(result, x2, splat(T, -0.1420889944)); + result = mulAdd(result, x2, splat(T, 0.1999355085)); + result = mulAdd(result, x2, splat(T, -0.3333314528)); + result = x * mulAdd(result, x2, splat(T, 1.0)); + + const result1 = sign * splat(T, 0.5 * math.pi) - result; + return select(sign == splat(T, 0.0), result, result1); +} +test "zmath.atan" { + const epsilon = 0.0001; + { + const v = f32x4(0.25, 0.5, 1.0, 1.25); + const e = f32x4(math.atan(v[0]), math.atan(v[1]), math.atan(v[2]), math.atan(v[3])); + try expectVecApproxEqAbs(e, atan(v), epsilon); + } + { + const v = f32x8(-0.25, 0.5, -1.0, 1.25, 100.0, -200.0, 300.0, 400.0); + // zig fmt: off + const e = f32x8( + math.atan(v[0]), math.atan(v[1]), math.atan(v[2]), math.atan(v[3]), + math.atan(v[4]), math.atan(v[5]), math.atan(v[6]), math.atan(v[7]), + ); + // zig fmt: on + try expectVecApproxEqAbs(e, atan(v), epsilon); + } + { + // zig fmt: off + const v = f32x16( + -0.25, 0.5, -1.0, 0.0, 0.1, -0.2, 30.0, 400.0, + -0.25, 0.5, -1.0, -0.0, -0.05, -0.125, 0.0625, 4000.0 + ); + const e = f32x16( + math.atan(v[0]), math.atan(v[1]), math.atan(v[2]), math.atan(v[3]), + math.atan(v[4]), math.atan(v[5]), math.atan(v[6]), math.atan(v[7]), + math.atan(v[8]), math.atan(v[9]), math.atan(v[10]), math.atan(v[11]), + math.atan(v[12]), math.atan(v[13]), math.atan(v[14]), math.atan(v[15]), + ); + // zig fmt: on + try expectVecApproxEqAbs(e, atan(v), epsilon); + } + { + try expectVecApproxEqAbs(atan(splat(F32x4, math.inf(f32))), splat(F32x4, 0.5 * math.pi), epsilon); + try expectVecApproxEqAbs(atan(splat(F32x4, -math.inf(f32))), splat(F32x4, -0.5 * math.pi), epsilon); + try expect(all(isNan(atan(splat(F32x4, math.nan(f32)))), 0) == true); + try expect(all(isNan(atan(splat(F32x4, -math.nan(f32)))), 0) == true); + } +} + +pub fn atan2(vy: anytype, vx: anytype) @TypeOf(vx, vy) { + const T = @TypeOf(vx, vy); + const Tu = @Vector(veclen(T), u32); + + const vx_is_positive = + (@as(Tu, @bitCast(vx)) & @as(Tu, @splat(0x8000_0000))) == @as(Tu, @splat(0)); + + const vy_sign = andInt(vy, splatNegativeZero(T)); + const c0_25pi = orInt(vy_sign, @as(T, @splat(0.25 * math.pi))); + const c0_50pi = orInt(vy_sign, @as(T, @splat(0.50 * math.pi))); + const c0_75pi = orInt(vy_sign, @as(T, @splat(0.75 * math.pi))); + const c1_00pi = orInt(vy_sign, @as(T, @splat(1.00 * math.pi))); + + var r1 = select(vx_is_positive, vy_sign, c1_00pi); + var r2 = select(vx == splat(T, 0.0), c0_50pi, splatInt(T, 0xffff_ffff)); + const r3 = select(vy == splat(T, 0.0), r1, r2); + const r4 = select(vx_is_positive, c0_25pi, c0_75pi); + const r5 = select(isInf(vx), r4, c0_50pi); + const result = select(isInf(vy), r5, r3); + const result_valid = @as(Tu, @bitCast(result)) == @as(Tu, @splat(0xffff_ffff)); + + const v = vy / vx; + const r0 = atan(v); + + r1 = select(vx_is_positive, splatNegativeZero(T), c1_00pi); + r2 = r0 + r1; + + return select(result_valid, r2, result); +} +test "zmath.atan2" { + // From DirectXMath XMVectorATan2(): + // + // Return the inverse tangent of Y / X in the range of -Pi to Pi with the following exceptions: + + // Y == 0 and X is Negative -> Pi with the sign of Y + // y == 0 and x is positive -> 0 with the sign of y + // Y != 0 and X == 0 -> Pi / 2 with the sign of Y + // Y != 0 and X is Negative -> atan(y/x) + (PI with the sign of Y) + // X == -Infinity and Finite Y -> Pi with the sign of Y + // X == +Infinity and Finite Y -> 0 with the sign of Y + // Y == Infinity and X is Finite -> Pi / 2 with the sign of Y + // Y == Infinity and X == -Infinity -> 3Pi / 4 with the sign of Y + // Y == Infinity and X == +Infinity -> Pi / 4 with the sign of Y + + const epsilon = 0.0001; + try expectVecApproxEqAbs(atan2(splat(F32x4, 0.0), splat(F32x4, -1.0)), splat(F32x4, math.pi), epsilon); + try expectVecApproxEqAbs(atan2(splat(F32x4, -0.0), splat(F32x4, -1.0)), splat(F32x4, -math.pi), epsilon); + try expectVecApproxEqAbs(atan2(splat(F32x4, 1.0), splat(F32x4, 0.0)), splat(F32x4, 0.5 * math.pi), epsilon); + try expectVecApproxEqAbs(atan2(splat(F32x4, -1.0), splat(F32x4, 0.0)), splat(F32x4, -0.5 * math.pi), epsilon); + try expectVecApproxEqAbs( + atan2(splat(F32x4, 1.0), splat(F32x4, -1.0)), + splat(F32x4, math.atan(@as(f32, -1.0)) + math.pi), + epsilon, + ); + try expectVecApproxEqAbs( + atan2(splat(F32x4, -10.0), splat(F32x4, -2.0)), + splat(F32x4, math.atan(@as(f32, 5.0)) - math.pi), + epsilon, + ); + try expectVecApproxEqAbs(atan2(splat(F32x4, 1.0), splat(F32x4, -math.inf(f32))), splat(F32x4, math.pi), epsilon); + try expectVecApproxEqAbs(atan2(splat(F32x4, -1.0), splat(F32x4, -math.inf(f32))), splat(F32x4, -math.pi), epsilon); + try expectVecApproxEqAbs(atan2(splat(F32x4, 1.0), splat(F32x4, math.inf(f32))), splat(F32x4, 0.0), epsilon); + try expectVecApproxEqAbs(atan2(splat(F32x4, -1.0), splat(F32x4, math.inf(f32))), splat(F32x4, -0.0), epsilon); + try expectVecApproxEqAbs( + atan2(splat(F32x4, math.inf(f32)), splat(F32x4, 2.0)), + splat(F32x4, 0.5 * math.pi), + epsilon, + ); + try expectVecApproxEqAbs( + atan2(splat(F32x4, -math.inf(f32)), splat(F32x4, 2.0)), + splat(F32x4, -0.5 * math.pi), + epsilon, + ); + try expectVecApproxEqAbs( + atan2(splat(F32x4, math.inf(f32)), splat(F32x4, -math.inf(f32))), + splat(F32x4, 0.75 * math.pi), + epsilon, + ); + try expectVecApproxEqAbs( + atan2(splat(F32x4, -math.inf(f32)), splat(F32x4, -math.inf(f32))), + splat(F32x4, -0.75 * math.pi), + epsilon, + ); + try expectVecApproxEqAbs( + atan2(splat(F32x4, math.inf(f32)), splat(F32x4, math.inf(f32))), + splat(F32x4, 0.25 * math.pi), + epsilon, + ); + try expectVecApproxEqAbs( + atan2(splat(F32x4, -math.inf(f32)), splat(F32x4, math.inf(f32))), + splat(F32x4, -0.25 * math.pi), + epsilon, + ); + try expectVecApproxEqAbs( + atan2( + f32x8(0.0, -math.inf(f32), -0.0, 2.0, math.inf(f32), math.inf(f32), 1.0, -math.inf(f32)), + f32x8(-2.0, math.inf(f32), 1.0, 0.0, 10.0, -math.inf(f32), 1.0, -math.inf(f32)), + ), + f32x8( + math.pi, + -0.25 * math.pi, + -0.0, + 0.5 * math.pi, + 0.5 * math.pi, + 0.75 * math.pi, + math.atan(@as(f32, 1.0)), + -0.75 * math.pi, + ), + epsilon, + ); + try expectVecApproxEqAbs(atan2(splat(F32x4, 0.0), splat(F32x4, 0.0)), splat(F32x4, 0.0), epsilon); + try expectVecApproxEqAbs(atan2(splat(F32x4, -0.0), splat(F32x4, 0.0)), splat(F32x4, 0.0), epsilon); + try expect(all(isNan(atan2(splat(F32x4, 1.0), splat(F32x4, math.nan(f32)))), 0) == true); + try expect(all(isNan(atan2(splat(F32x4, -1.0), splat(F32x4, math.nan(f32)))), 0) == true); + try expect(all(isNan(atan2(splat(F32x4, math.nan(f32)), splat(F32x4, -1.0))), 0) == true); + try expect(all(isNan(atan2(splat(F32x4, -math.nan(f32)), splat(F32x4, 1.0))), 0) == true); +} +// ------------------------------------------------------------------------------ +// +// 3. 2D, 3D, 4D vector functions +// +// ------------------------------------------------------------------------------ +pub inline fn dot2(v0: Vec, v1: Vec) F32x4 { + var xmm0 = v0 * v1; // | x0*x1 | y0*y1 | -- | -- | + const xmm1 = swizzle(xmm0, .y, .x, .x, .x); // | y0*y1 | -- | -- | -- | + xmm0 = f32x4(xmm0[0] + xmm1[0], xmm0[1], xmm0[2], xmm0[3]); // | x0*x1 + y0*y1 | -- | -- | -- | + return swizzle(xmm0, .x, .x, .x, .x); +} +test "zmath.dot2" { + const v0 = f32x4(-1.0, 2.0, 300.0, -2.0); + const v1 = f32x4(4.0, 5.0, 600.0, 2.0); + const v = dot2(v0, v1); + try expectVecApproxEqAbs(v, splat(F32x4, 6.0), 0.0001); +} + +pub inline fn dot3(v0: Vec, v1: Vec) F32x4 { + const dot = v0 * v1; + return f32x4s(dot[0] + dot[1] + dot[2]); +} +test "zmath.dot3" { + const v0 = f32x4(-1.0, 2.0, 3.0, 1.0); + const v1 = f32x4(4.0, 5.0, 6.0, 1.0); + const v = dot3(v0, v1); + try expectVecApproxEqAbs(v, splat(F32x4, 24.0), 0.0001); +} + +pub inline fn dot4(v0: Vec, v1: Vec) F32x4 { + var xmm0 = v0 * v1; // | x0*x1 | y0*y1 | z0*z1 | w0*w1 | + var xmm1 = swizzle(xmm0, .y, .x, .w, .x); // | y0*y1 | -- | w0*w1 | -- | + xmm1 = xmm0 + xmm1; // | x0*x1 + y0*y1 | -- | z0*z1 + w0*w1 | -- | + xmm0 = swizzle(xmm1, .z, .x, .x, .x); // | z0*z1 + w0*w1 | -- | -- | -- | + xmm0 = f32x4(xmm0[0] + xmm1[0], xmm0[1], xmm0[2], xmm0[2]); // addss + return swizzle(xmm0, .x, .x, .x, .x); +} +test "zmath.dot4" { + const v0 = f32x4(-1.0, 2.0, 3.0, -2.0); + const v1 = f32x4(4.0, 5.0, 6.0, 2.0); + const v = dot4(v0, v1); + try expectVecApproxEqAbs(v, splat(F32x4, 20.0), 0.0001); +} + +pub inline fn cross3(v0: Vec, v1: Vec) Vec { + var xmm0 = swizzle(v0, .y, .z, .x, .w); + var xmm1 = swizzle(v1, .z, .x, .y, .w); + var result = xmm0 * xmm1; + xmm0 = swizzle(xmm0, .y, .z, .x, .w); + xmm1 = swizzle(xmm1, .z, .x, .y, .w); + result = result - xmm0 * xmm1; + return andInt(result, f32x4_mask3); +} +test "zmath.cross3" { + { + const v0 = f32x4(1.0, 0.0, 0.0, 1.0); + const v1 = f32x4(0.0, 1.0, 0.0, 1.0); + const v = cross3(v0, v1); + try expectVecApproxEqAbs(v, f32x4(0.0, 0.0, 1.0, 0.0), 0.0001); + } + { + const v0 = f32x4(1.0, 0.0, 0.0, 1.0); + const v1 = f32x4(0.0, -1.0, 0.0, 1.0); + const v = cross3(v0, v1); + try expectVecApproxEqAbs(v, f32x4(0.0, 0.0, -1.0, 0.0), 0.0001); + } + { + const v0 = f32x4(-3.0, 0, -2.0, 1.0); + const v1 = f32x4(5.0, -1.0, 2.0, 1.0); + const v = cross3(v0, v1); + try expectVecApproxEqAbs(v, f32x4(-2.0, -4.0, 3.0, 0.0), 0.0001); + } +} + +pub inline fn lengthSq2(v: Vec) F32x4 { + return dot2(v, v); +} +pub inline fn lengthSq3(v: Vec) F32x4 { + return dot3(v, v); +} +pub inline fn lengthSq4(v: Vec) F32x4 { + return dot4(v, v); +} + +pub inline fn length2(v: Vec) F32x4 { + return sqrt(dot2(v, v)); +} +pub inline fn length3(v: Vec) F32x4 { + return sqrt(dot3(v, v)); +} +pub inline fn length4(v: Vec) F32x4 { + return sqrt(dot4(v, v)); +} +test "zmath.length3" { + { + const v = length3(f32x4(1.0, -2.0, 3.0, 1000.0)); + try expectVecApproxEqAbs(v, splat(F32x4, math.sqrt(14.0)), 0.001); + } + { + const v = length3(f32x4(1.0, math.nan(f32), math.nan(f32), 1000.0)); + try expect(all(isNan(v), 0)); + } + { + const v = length3(f32x4(1.0, math.inf(f32), 3.0, 1000.0)); + try expect(all(isInf(v), 0)); + } + { + const v = length3(f32x4(3.0, 2.0, 1.0, math.nan(f32))); + try expectVecApproxEqAbs(v, splat(F32x4, math.sqrt(14.0)), 0.001); + } +} + +pub inline fn normalize2(v: Vec) Vec { + return v * splat(F32x4, 1.0) / sqrt(dot2(v, v)); +} +pub inline fn normalize3(v: Vec) Vec { + return v * splat(F32x4, 1.0) / sqrt(dot3(v, v)); +} +pub inline fn normalize4(v: Vec) Vec { + return v * splat(F32x4, 1.0) / sqrt(dot4(v, v)); +} +test "zmath.normalize3" { + { + const v0 = f32x4(1.0, -2.0, 3.0, 1000.0); + const v = normalize3(v0); + try expectVecApproxEqAbs(v, v0 * splat(F32x4, 1.0 / math.sqrt(14.0)), 0.0005); + } + { + try expect(any(isNan(normalize3(f32x4(1.0, math.inf(f32), 1.0, 1.0))), 0)); + try expect(any(isNan(normalize3(f32x4(-math.inf(f32), math.inf(f32), 0.0, 0.0))), 0)); + try expect(any(isNan(normalize3(f32x4(-math.nan(f32), math.snan(f32), 0.0, 0.0))), 0)); + try expect(any(isNan(normalize3(f32x4(0, 0, 0, 0))), 0)); + } +} +test "zmath.normalize4" { + { + const v0 = f32x4(1.0, -2.0, 3.0, 10.0); + const v = normalize4(v0); + try expectVecApproxEqAbs(v, v0 * splat(F32x4, 1.0 / math.sqrt(114.0)), 0.0005); + } + { + try expect(any(isNan(normalize4(f32x4(1.0, math.inf(f32), 1.0, 1.0))), 0)); + try expect(any(isNan(normalize4(f32x4(-math.inf(f32), math.inf(f32), 0.0, 0.0))), 0)); + try expect(any(isNan(normalize4(f32x4(-math.nan(f32), math.snan(f32), 0.0, 0.0))), 0)); + try expect(any(isNan(normalize4(f32x4(0, 0, 0, 0))), 0)); + } +} + +fn vecMulMat(v: Vec, m: Mat) Vec { + const vx = @shuffle(f32, v, undefined, [4]i32{ 0, 0, 0, 0 }); + const vy = @shuffle(f32, v, undefined, [4]i32{ 1, 1, 1, 1 }); + const vz = @shuffle(f32, v, undefined, [4]i32{ 2, 2, 2, 2 }); + const vw = @shuffle(f32, v, undefined, [4]i32{ 3, 3, 3, 3 }); + return vx * m[0] + vy * m[1] + vz * m[2] + vw * m[3]; +} +fn matMulVec(m: Mat, v: Vec) Vec { + return .{ dot4(m[0], v)[0], dot4(m[1], v)[0], dot4(m[2], v)[0], dot4(m[3], v)[0] }; +} +test "zmath.vecMulMat" { + const m = Mat{ + f32x4(1.0, 0.0, 0.0, 0.0), + f32x4(0.0, 1.0, 0.0, 0.0), + f32x4(0.0, 0.0, 1.0, 0.0), + f32x4(2.0, 3.0, 4.0, 1.0), + }; + const vm = mul(f32x4(1.0, 2.0, 3.0, 1.0), m); + const mv = mul(m, f32x4(1.0, 2.0, 3.0, 1.0)); + const v = mul(transpose(m), f32x4(1.0, 2.0, 3.0, 1.0)); + try expectVecApproxEqAbs(vm, f32x4(3.0, 5.0, 7.0, 1.0), 0.0001); + try expectVecApproxEqAbs(mv, f32x4(1.0, 2.0, 3.0, 21.0), 0.0001); + try expectVecApproxEqAbs(v, f32x4(3.0, 5.0, 7.0, 1.0), 0.0001); +} +// ------------------------------------------------------------------------------ +// +// 4. Matrix functions +// +// ------------------------------------------------------------------------------ +pub fn identity() Mat { + const static = struct { + const identity = Mat{ + f32x4(1.0, 0.0, 0.0, 0.0), + f32x4(0.0, 1.0, 0.0, 0.0), + f32x4(0.0, 0.0, 1.0, 0.0), + f32x4(0.0, 0.0, 0.0, 1.0), + }; + }; + return static.identity; +} + +pub fn matFromArr(arr: [16]f32) Mat { + return Mat{ + f32x4(arr[0], arr[1], arr[2], arr[3]), + f32x4(arr[4], arr[5], arr[6], arr[7]), + f32x4(arr[8], arr[9], arr[10], arr[11]), + f32x4(arr[12], arr[13], arr[14], arr[15]), + }; +} + +fn mulRetType(comptime Ta: type, comptime Tb: type) type { + if (Ta == Mat and Tb == Mat) { + return Mat; + } else if ((Ta == f32 and Tb == Mat) or (Ta == Mat and Tb == f32)) { + return Mat; + } else if ((Ta == Vec and Tb == Mat) or (Ta == Mat and Tb == Vec)) { + return Vec; + } + @compileError("zmath.mul() not implemented for types: " ++ @typeName(Ta) ++ @typeName(Tb)); +} + +pub fn mul(a: anytype, b: anytype) mulRetType(@TypeOf(a), @TypeOf(b)) { + const Ta = @TypeOf(a); + const Tb = @TypeOf(b); + if (Ta == Mat and Tb == Mat) { + return mulMat(a, b); + } else if (Ta == f32 and Tb == Mat) { + const va = splat(F32x4, a); + return Mat{ va * b[0], va * b[1], va * b[2], va * b[3] }; + } else if (Ta == Mat and Tb == f32) { + const vb = splat(F32x4, b); + return Mat{ a[0] * vb, a[1] * vb, a[2] * vb, a[3] * vb }; + } else if (Ta == Vec and Tb == Mat) { + return vecMulMat(a, b); + } else if (Ta == Mat and Tb == Vec) { + return matMulVec(a, b); + } else { + @compileError("zmath.mul() not implemented for types: " ++ @typeName(Ta) ++ ", " ++ @typeName(Tb)); + } +} +test "zmath.mul" { + { + const m = Mat{ + f32x4(0.1, 0.2, 0.3, 0.4), + f32x4(0.5, 0.6, 0.7, 0.8), + f32x4(0.9, 1.0, 1.1, 1.2), + f32x4(1.3, 1.4, 1.5, 1.6), + }; + const ms = mul(@as(f32, 2.0), m); + try expectVecApproxEqAbs(ms[0], f32x4(0.2, 0.4, 0.6, 0.8), 0.0001); + try expectVecApproxEqAbs(ms[1], f32x4(1.0, 1.2, 1.4, 1.6), 0.0001); + try expectVecApproxEqAbs(ms[2], f32x4(1.8, 2.0, 2.2, 2.4), 0.0001); + try expectVecApproxEqAbs(ms[3], f32x4(2.6, 2.8, 3.0, 3.2), 0.0001); + } +} + +fn mulMat(m0: Mat, m1: Mat) Mat { + var result: Mat = undefined; + comptime var row: u32 = 0; + inline while (row < 4) : (row += 1) { + const vx = swizzle(m0[row], .x, .x, .x, .x); + const vy = swizzle(m0[row], .y, .y, .y, .y); + const vz = swizzle(m0[row], .z, .z, .z, .z); + const vw = swizzle(m0[row], .w, .w, .w, .w); + result[row] = mulAdd(vx, m1[0], vz * m1[2]) + mulAdd(vy, m1[1], vw * m1[3]); + } + return result; +} +test "zmath.matrix.mul" { + const a = Mat{ + f32x4(0.1, 0.2, 0.3, 0.4), + f32x4(0.5, 0.6, 0.7, 0.8), + f32x4(0.9, 1.0, 1.1, 1.2), + f32x4(1.3, 1.4, 1.5, 1.6), + }; + const b = Mat{ + f32x4(1.7, 1.8, 1.9, 2.0), + f32x4(2.1, 2.2, 2.3, 2.4), + f32x4(2.5, 2.6, 2.7, 2.8), + f32x4(2.9, 3.0, 3.1, 3.2), + }; + const c = mul(a, b); + try expectVecApproxEqAbs(c[0], f32x4(2.5, 2.6, 2.7, 2.8), 0.0001); + try expectVecApproxEqAbs(c[1], f32x4(6.18, 6.44, 6.7, 6.96), 0.0001); + try expectVecApproxEqAbs(c[2], f32x4(9.86, 10.28, 10.7, 11.12), 0.0001); + try expectVecApproxEqAbs(c[3], f32x4(13.54, 14.12, 14.7, 15.28), 0.0001); +} + +pub fn transpose(m: Mat) Mat { + const temp1 = @shuffle(f32, m[0], m[1], [4]i32{ 0, 1, ~@as(i32, 0), ~@as(i32, 1) }); + const temp3 = @shuffle(f32, m[0], m[1], [4]i32{ 2, 3, ~@as(i32, 2), ~@as(i32, 3) }); + const temp2 = @shuffle(f32, m[2], m[3], [4]i32{ 0, 1, ~@as(i32, 0), ~@as(i32, 1) }); + const temp4 = @shuffle(f32, m[2], m[3], [4]i32{ 2, 3, ~@as(i32, 2), ~@as(i32, 3) }); + return .{ + @shuffle(f32, temp1, temp2, [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) }), + @shuffle(f32, temp1, temp2, [4]i32{ 1, 3, ~@as(i32, 1), ~@as(i32, 3) }), + @shuffle(f32, temp3, temp4, [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) }), + @shuffle(f32, temp3, temp4, [4]i32{ 1, 3, ~@as(i32, 1), ~@as(i32, 3) }), + }; +} +test "zmath.matrix.transpose" { + const m = Mat{ + f32x4(1.0, 2.0, 3.0, 4.0), + f32x4(5.0, 6.0, 7.0, 8.0), + f32x4(9.0, 10.0, 11.0, 12.0), + f32x4(13.0, 14.0, 15.0, 16.0), + }; + const mt = transpose(m); + try expectVecApproxEqAbs(mt[0], f32x4(1.0, 5.0, 9.0, 13.0), 0.0001); + try expectVecApproxEqAbs(mt[1], f32x4(2.0, 6.0, 10.0, 14.0), 0.0001); + try expectVecApproxEqAbs(mt[2], f32x4(3.0, 7.0, 11.0, 15.0), 0.0001); + try expectVecApproxEqAbs(mt[3], f32x4(4.0, 8.0, 12.0, 16.0), 0.0001); +} + +pub fn rotationX(angle: f32) Mat { + const sc = sincos(angle); + return .{ + f32x4(1.0, 0.0, 0.0, 0.0), + f32x4(0.0, sc[1], sc[0], 0.0), + f32x4(0.0, -sc[0], sc[1], 0.0), + f32x4(0.0, 0.0, 0.0, 1.0), + }; +} + +pub fn rotationY(angle: f32) Mat { + const sc = sincos(angle); + return .{ + f32x4(sc[1], 0.0, -sc[0], 0.0), + f32x4(0.0, 1.0, 0.0, 0.0), + f32x4(sc[0], 0.0, sc[1], 0.0), + f32x4(0.0, 0.0, 0.0, 1.0), + }; +} + +pub fn rotationZ(angle: f32) Mat { + const sc = sincos(angle); + return .{ + f32x4(sc[1], sc[0], 0.0, 0.0), + f32x4(-sc[0], sc[1], 0.0, 0.0), + f32x4(0.0, 0.0, 1.0, 0.0), + f32x4(0.0, 0.0, 0.0, 1.0), + }; +} + +pub fn translation(x: f32, y: f32, z: f32) Mat { + return .{ + f32x4(1.0, 0.0, 0.0, 0.0), + f32x4(0.0, 1.0, 0.0, 0.0), + f32x4(0.0, 0.0, 1.0, 0.0), + f32x4(x, y, z, 1.0), + }; +} +pub fn translationV(v: Vec) Mat { + return translation(v[0], v[1], v[2]); +} + +pub fn scaling(x: f32, y: f32, z: f32) Mat { + return .{ + f32x4(x, 0.0, 0.0, 0.0), + f32x4(0.0, y, 0.0, 0.0), + f32x4(0.0, 0.0, z, 0.0), + f32x4(0.0, 0.0, 0.0, 1.0), + }; +} +pub fn scalingV(v: Vec) Mat { + return scaling(v[0], v[1], v[2]); +} + +pub fn lookToLh(eyepos: Vec, eyedir: Vec, updir: Vec) Mat { + const az = normalize3(eyedir); + const ax = normalize3(cross3(updir, az)); + const ay = normalize3(cross3(az, ax)); + return .{ + f32x4(ax[0], ay[0], az[0], 0), + f32x4(ax[1], ay[1], az[1], 0), + f32x4(ax[2], ay[2], az[2], 0), + f32x4(-dot3(ax, eyepos)[0], -dot3(ay, eyepos)[0], -dot3(az, eyepos)[0], 1.0), + }; +} +pub fn lookToRh(eyepos: Vec, eyedir: Vec, updir: Vec) Mat { + return lookToLh(eyepos, -eyedir, updir); +} +pub fn lookAtLh(eyepos: Vec, focuspos: Vec, updir: Vec) Mat { + return lookToLh(eyepos, focuspos - eyepos, updir); +} +pub fn lookAtRh(eyepos: Vec, focuspos: Vec, updir: Vec) Mat { + return lookToLh(eyepos, eyepos - focuspos, updir); +} +test "zmath.matrix.lookToLh" { + const m = lookToLh(f32x4(0.0, 0.0, -3.0, 1.0), f32x4(0.0, 0.0, 1.0, 0.0), f32x4(0.0, 1.0, 0.0, 0.0)); + try expectVecApproxEqAbs(m[0], f32x4(1.0, 0.0, 0.0, 0.0), 0.001); + try expectVecApproxEqAbs(m[1], f32x4(0.0, 1.0, 0.0, 0.0), 0.001); + try expectVecApproxEqAbs(m[2], f32x4(0.0, 0.0, 1.0, 0.0), 0.001); + try expectVecApproxEqAbs(m[3], f32x4(0.0, 0.0, 3.0, 1.0), 0.001); +} + +pub fn perspectiveFovLh(fovy: f32, aspect: f32, near: f32, far: f32) Mat { + const scfov = sincos(0.5 * fovy); + + assert(near > 0.0 and far > 0.0); + assert(!math.approxEqAbs(f32, scfov[0], 0.0, 0.001)); + assert(!math.approxEqAbs(f32, far, near, 0.001)); + assert(!math.approxEqAbs(f32, aspect, 0.0, 0.01)); + + const h = scfov[1] / scfov[0]; + const w = h / aspect; + const r = far / (far - near); + return .{ + f32x4(w, 0.0, 0.0, 0.0), + f32x4(0.0, h, 0.0, 0.0), + f32x4(0.0, 0.0, r, 1.0), + f32x4(0.0, 0.0, -r * near, 0.0), + }; +} +pub fn perspectiveFovRh(fovy: f32, aspect: f32, near: f32, far: f32) Mat { + const scfov = sincos(0.5 * fovy); + + assert(near > 0.0 and far > 0.0); + assert(!math.approxEqAbs(f32, scfov[0], 0.0, 0.001)); + assert(!math.approxEqAbs(f32, far, near, 0.001)); + assert(!math.approxEqAbs(f32, aspect, 0.0, 0.01)); + + const h = scfov[1] / scfov[0]; + const w = h / aspect; + const r = far / (near - far); + return .{ + f32x4(w, 0.0, 0.0, 0.0), + f32x4(0.0, h, 0.0, 0.0), + f32x4(0.0, 0.0, r, -1.0), + f32x4(0.0, 0.0, r * near, 0.0), + }; +} + +// Produces Z values in [-1.0, 1.0] range (OpenGL defaults) +pub fn perspectiveFovLhGl(fovy: f32, aspect: f32, near: f32, far: f32) Mat { + const scfov = sincos(0.5 * fovy); + + assert(near > 0.0 and far > 0.0); + assert(!math.approxEqAbs(f32, scfov[0], 0.0, 0.001)); + assert(!math.approxEqAbs(f32, far, near, 0.001)); + assert(!math.approxEqAbs(f32, aspect, 0.0, 0.01)); + + const h = scfov[1] / scfov[0]; + const w = h / aspect; + const r = far - near; + return .{ + f32x4(w, 0.0, 0.0, 0.0), + f32x4(0.0, h, 0.0, 0.0), + f32x4(0.0, 0.0, (near + far) / r, 1.0), + f32x4(0.0, 0.0, 2.0 * near * far / -r, 0.0), + }; +} + +// Produces Z values in [-1.0, 1.0] range (OpenGL defaults) +pub fn perspectiveFovRhGl(fovy: f32, aspect: f32, near: f32, far: f32) Mat { + const scfov = sincos(0.5 * fovy); + + assert(near > 0.0 and far > 0.0); + assert(!math.approxEqAbs(f32, scfov[0], 0.0, 0.001)); + assert(!math.approxEqAbs(f32, far, near, 0.001)); + assert(!math.approxEqAbs(f32, aspect, 0.0, 0.01)); + + const h = scfov[1] / scfov[0]; + const w = h / aspect; + const r = near - far; + return .{ + f32x4(w, 0.0, 0.0, 0.0), + f32x4(0.0, h, 0.0, 0.0), + f32x4(0.0, 0.0, (near + far) / r, -1.0), + f32x4(0.0, 0.0, 2.0 * near * far / r, 0.0), + }; +} + +pub fn orthographicLh(w: f32, h: f32, near: f32, far: f32) Mat { + assert(!math.approxEqAbs(f32, w, 0.0, 0.001)); + assert(!math.approxEqAbs(f32, h, 0.0, 0.001)); + assert(!math.approxEqAbs(f32, far, near, 0.001)); + + const r = 1 / (far - near); + return .{ + f32x4(2 / w, 0.0, 0.0, 0.0), + f32x4(0.0, 2 / h, 0.0, 0.0), + f32x4(0.0, 0.0, r, 0.0), + f32x4(0.0, 0.0, -r * near, 1.0), + }; +} + +pub fn orthographicRh(w: f32, h: f32, near: f32, far: f32) Mat { + assert(!math.approxEqAbs(f32, w, 0.0, 0.001)); + assert(!math.approxEqAbs(f32, h, 0.0, 0.001)); + assert(!math.approxEqAbs(f32, far, near, 0.001)); + + const r = 1 / (near - far); + return .{ + f32x4(2 / w, 0.0, 0.0, 0.0), + f32x4(0.0, 2 / h, 0.0, 0.0), + f32x4(0.0, 0.0, r, 0.0), + f32x4(0.0, 0.0, r * near, 1.0), + }; +} + +// Produces Z values in [-1.0, 1.0] range (OpenGL defaults) +pub fn orthographicLhGl(w: f32, h: f32, near: f32, far: f32) Mat { + assert(!math.approxEqAbs(f32, w, 0.0, 0.001)); + assert(!math.approxEqAbs(f32, h, 0.0, 0.001)); + assert(!math.approxEqAbs(f32, far, near, 0.001)); + + const r = far - near; + return .{ + f32x4(2 / w, 0.0, 0.0, 0.0), + f32x4(0.0, 2 / h, 0.0, 0.0), + f32x4(0.0, 0.0, 2 / r, 0.0), + f32x4(0.0, 0.0, (near + far) / -r, 1.0), + }; +} + +// Produces Z values in [-1.0, 1.0] range (OpenGL defaults) +pub fn orthographicRhGl(w: f32, h: f32, near: f32, far: f32) Mat { + assert(!math.approxEqAbs(f32, w, 0.0, 0.001)); + assert(!math.approxEqAbs(f32, h, 0.0, 0.001)); + assert(!math.approxEqAbs(f32, far, near, 0.001)); + + const r = near - far; + return .{ + f32x4(2 / w, 0.0, 0.0, 0.0), + f32x4(0.0, 2 / h, 0.0, 0.0), + f32x4(0.0, 0.0, 2 / r, 0.0), + f32x4(0.0, 0.0, (near + far) / r, 1.0), + }; +} + +pub fn orthographicOffCenterLh(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat { + assert(!math.approxEqAbs(f32, far, near, 0.001)); + + const r = 1 / (far - near); + return .{ + f32x4(2 / (right - left), 0.0, 0.0, 0.0), + f32x4(0.0, 2 / (top - bottom), 0.0, 0.0), + f32x4(0.0, 0.0, r, 0.0), + f32x4(-(right + left) / (right - left), -(top + bottom) / (top - bottom), -r * near, 1.0), + }; +} + +pub fn orthographicOffCenterRh(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat { + assert(!math.approxEqAbs(f32, far, near, 0.001)); + + const r = 1 / (near - far); + return .{ + f32x4(2 / (right - left), 0.0, 0.0, 0.0), + f32x4(0.0, 2 / (top - bottom), 0.0, 0.0), + f32x4(0.0, 0.0, r, 0.0), + f32x4(-(right + left) / (right - left), -(top + bottom) / (top - bottom), r * near, 1.0), + }; +} + +// Produces Z values in [-1.0, 1.0] range (OpenGL defaults) +pub fn orthographicOffCenterLhGl(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat { + assert(!math.approxEqAbs(f32, far, near, 0.001)); + + const r = far - near; + return .{ + f32x4(2 / (right - left), 0.0, 0.0, 0.0), + f32x4(0.0, 2 / (top - bottom), 0.0, 0.0), + f32x4(0.0, 0.0, 2 / r, 0.0), + f32x4(-(right + left) / (right - left), -(top + bottom) / (top - bottom), (near + far) / -r, 1.0), + }; +} + +// Produces Z values in [-1.0, 1.0] range (OpenGL defaults) +pub fn orthographicOffCenterRhGl(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat { + assert(!math.approxEqAbs(f32, far, near, 0.001)); + + const r = near - far; + return .{ + f32x4(2 / (right - left), 0.0, 0.0, 0.0), + f32x4(0.0, 2 / (top - bottom), 0.0, 0.0), + f32x4(0.0, 0.0, 2 / r, 0.0), + f32x4(-(right + left) / (right - left), -(top + bottom) / (top - bottom), (near + far) / r, 1.0), + }; +} + +pub fn determinant(m: Mat) F32x4 { + var v0 = swizzle(m[2], .y, .x, .x, .x); + var v1 = swizzle(m[3], .z, .z, .y, .y); + var v2 = swizzle(m[2], .y, .x, .x, .x); + var v3 = swizzle(m[3], .w, .w, .w, .z); + var v4 = swizzle(m[2], .z, .z, .y, .y); + var v5 = swizzle(m[3], .w, .w, .w, .z); + + var p0 = v0 * v1; + var p1 = v2 * v3; + var p2 = v4 * v5; + + v0 = swizzle(m[2], .z, .z, .y, .y); + v1 = swizzle(m[3], .y, .x, .x, .x); + v2 = swizzle(m[2], .w, .w, .w, .z); + v3 = swizzle(m[3], .y, .x, .x, .x); + v4 = swizzle(m[2], .w, .w, .w, .z); + v5 = swizzle(m[3], .z, .z, .y, .y); + + p0 = mulAdd(-v0, v1, p0); + p1 = mulAdd(-v2, v3, p1); + p2 = mulAdd(-v4, v5, p2); + + v0 = swizzle(m[1], .w, .w, .w, .z); + v1 = swizzle(m[1], .z, .z, .y, .y); + v2 = swizzle(m[1], .y, .x, .x, .x); + + const s = m[0] * f32x4(1.0, -1.0, 1.0, -1.0); + var r = v0 * p0; + r = mulAdd(-v1, p1, r); + r = mulAdd(v2, p2, r); + return dot4(s, r); +} +test "zmath.matrix.determinant" { + const m = Mat{ + f32x4(10.0, -9.0, -12.0, 1.0), + f32x4(7.0, -12.0, 11.0, 1.0), + f32x4(-10.0, 10.0, 3.0, 1.0), + f32x4(1.0, 2.0, 3.0, 4.0), + }; + try expectVecApproxEqAbs(determinant(m), splat(F32x4, 2939.0), 0.0001); +} + +pub fn inverse(a: anytype) @TypeOf(a) { + const T = @TypeOf(a); + return switch (T) { + Mat => inverseMat(a), + Quat => inverseQuat(a), + else => @compileError("zmath.inverse() not implemented for " ++ @typeName(T)), + }; +} + +fn inverseMat(m: Mat) Mat { + return inverseDet(m, null); +} + +pub fn inverseDet(m: Mat, out_det: ?*F32x4) Mat { + const mt = transpose(m); + var v0: [4]F32x4 = undefined; + var v1: [4]F32x4 = undefined; + + v0[0] = swizzle(mt[2], .x, .x, .y, .y); + v1[0] = swizzle(mt[3], .z, .w, .z, .w); + v0[1] = swizzle(mt[0], .x, .x, .y, .y); + v1[1] = swizzle(mt[1], .z, .w, .z, .w); + v0[2] = @shuffle(f32, mt[2], mt[0], [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) }); + v1[2] = @shuffle(f32, mt[3], mt[1], [4]i32{ 1, 3, ~@as(i32, 1), ~@as(i32, 3) }); + + var d0 = v0[0] * v1[0]; + var d1 = v0[1] * v1[1]; + var d2 = v0[2] * v1[2]; + + v0[0] = swizzle(mt[2], .z, .w, .z, .w); + v1[0] = swizzle(mt[3], .x, .x, .y, .y); + v0[1] = swizzle(mt[0], .z, .w, .z, .w); + v1[1] = swizzle(mt[1], .x, .x, .y, .y); + v0[2] = @shuffle(f32, mt[2], mt[0], [4]i32{ 1, 3, ~@as(i32, 1), ~@as(i32, 3) }); + v1[2] = @shuffle(f32, mt[3], mt[1], [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) }); + + d0 = mulAdd(-v0[0], v1[0], d0); + d1 = mulAdd(-v0[1], v1[1], d1); + d2 = mulAdd(-v0[2], v1[2], d2); + + v0[0] = swizzle(mt[1], .y, .z, .x, .y); + v1[0] = @shuffle(f32, d0, d2, [4]i32{ ~@as(i32, 1), 1, 3, 0 }); + v0[1] = swizzle(mt[0], .z, .x, .y, .x); + v1[1] = @shuffle(f32, d0, d2, [4]i32{ 3, ~@as(i32, 1), 1, 2 }); + v0[2] = swizzle(mt[3], .y, .z, .x, .y); + v1[2] = @shuffle(f32, d1, d2, [4]i32{ ~@as(i32, 3), 1, 3, 0 }); + v0[3] = swizzle(mt[2], .z, .x, .y, .x); + v1[3] = @shuffle(f32, d1, d2, [4]i32{ 3, ~@as(i32, 3), 1, 2 }); + + var c0 = v0[0] * v1[0]; + var c2 = v0[1] * v1[1]; + var c4 = v0[2] * v1[2]; + var c6 = v0[3] * v1[3]; + + v0[0] = swizzle(mt[1], .z, .w, .y, .z); + v1[0] = @shuffle(f32, d0, d2, [4]i32{ 3, 0, 1, ~@as(i32, 0) }); + v0[1] = swizzle(mt[0], .w, .z, .w, .y); + v1[1] = @shuffle(f32, d0, d2, [4]i32{ 2, 1, ~@as(i32, 0), 0 }); + v0[2] = swizzle(mt[3], .z, .w, .y, .z); + v1[2] = @shuffle(f32, d1, d2, [4]i32{ 3, 0, 1, ~@as(i32, 2) }); + v0[3] = swizzle(mt[2], .w, .z, .w, .y); + v1[3] = @shuffle(f32, d1, d2, [4]i32{ 2, 1, ~@as(i32, 2), 0 }); + + c0 = mulAdd(-v0[0], v1[0], c0); + c2 = mulAdd(-v0[1], v1[1], c2); + c4 = mulAdd(-v0[2], v1[2], c4); + c6 = mulAdd(-v0[3], v1[3], c6); + + v0[0] = swizzle(mt[1], .w, .x, .w, .x); + v1[0] = @shuffle(f32, d0, d2, [4]i32{ 2, ~@as(i32, 1), ~@as(i32, 0), 2 }); + v0[1] = swizzle(mt[0], .y, .w, .x, .z); + v1[1] = @shuffle(f32, d0, d2, [4]i32{ ~@as(i32, 1), 0, 3, ~@as(i32, 0) }); + v0[2] = swizzle(mt[3], .w, .x, .w, .x); + v1[2] = @shuffle(f32, d1, d2, [4]i32{ 2, ~@as(i32, 3), ~@as(i32, 2), 2 }); + v0[3] = swizzle(mt[2], .y, .w, .x, .z); + v1[3] = @shuffle(f32, d1, d2, [4]i32{ ~@as(i32, 3), 0, 3, ~@as(i32, 2) }); + + const c1 = mulAdd(-v0[0], v1[0], c0); + const c3 = mulAdd(v0[1], v1[1], c2); + const c5 = mulAdd(-v0[2], v1[2], c4); + const c7 = mulAdd(v0[3], v1[3], c6); + + c0 = mulAdd(v0[0], v1[0], c0); + c2 = mulAdd(-v0[1], v1[1], c2); + c4 = mulAdd(v0[2], v1[2], c4); + c6 = mulAdd(-v0[3], v1[3], c6); + + var mr = Mat{ + f32x4(c0[0], c1[1], c0[2], c1[3]), + f32x4(c2[0], c3[1], c2[2], c3[3]), + f32x4(c4[0], c5[1], c4[2], c5[3]), + f32x4(c6[0], c7[1], c6[2], c7[3]), + }; + + const det = dot4(mr[0], mt[0]); + if (out_det != null) { + out_det.?.* = det; + } + + if (math.approxEqAbs(f32, det[0], 0.0, math.floatEps(f32))) { + return .{ + f32x4(0.0, 0.0, 0.0, 0.0), + f32x4(0.0, 0.0, 0.0, 0.0), + f32x4(0.0, 0.0, 0.0, 0.0), + f32x4(0.0, 0.0, 0.0, 0.0), + }; + } + + const scale = splat(F32x4, 1.0) / det; + mr[0] *= scale; + mr[1] *= scale; + mr[2] *= scale; + mr[3] *= scale; + return mr; +} +test "zmath.matrix.inverse" { + const m = Mat{ + f32x4(10.0, -9.0, -12.0, 1.0), + f32x4(7.0, -12.0, 11.0, 1.0), + f32x4(-10.0, 10.0, 3.0, 1.0), + f32x4(1.0, 2.0, 3.0, 4.0), + }; + var det: F32x4 = undefined; + const mi = inverseDet(m, &det); + try expectVecApproxEqAbs(det, splat(F32x4, 2939.0), 0.0001); + + try expectVecApproxEqAbs(mi[0], f32x4(-0.170806, -0.13576, -0.349439, 0.164001), 0.0001); + try expectVecApproxEqAbs(mi[1], f32x4(-0.163661, -0.14801, -0.253147, 0.141204), 0.0001); + try expectVecApproxEqAbs(mi[2], f32x4(-0.0871045, 0.00646478, -0.0785982, 0.0398095), 0.0001); + try expectVecApproxEqAbs(mi[3], f32x4(0.18986, 0.103096, 0.272882, 0.10854), 0.0001); +} + +pub fn matFromNormAxisAngle(axis: Vec, angle: f32) Mat { + const sincos_angle = sincos(angle); + + const c2 = splat(F32x4, 1.0 - sincos_angle[1]); + const c1 = splat(F32x4, sincos_angle[1]); + const c0 = splat(F32x4, sincos_angle[0]); + + const n0 = swizzle(axis, .y, .z, .x, .w); + const n1 = swizzle(axis, .z, .x, .y, .w); + + var v0 = c2 * n0 * n1; + const r0 = c2 * axis * axis + c1; + const r1 = c0 * axis + v0; + var r2 = v0 - c0 * axis; + + v0 = andInt(r0, f32x4_mask3); + + var v1 = @shuffle(f32, r1, r2, [4]i32{ 0, 2, ~@as(i32, 1), ~@as(i32, 2) }); + v1 = swizzle(v1, .y, .z, .w, .x); + + var v2 = @shuffle(f32, r1, r2, [4]i32{ 1, 1, ~@as(i32, 0), ~@as(i32, 0) }); + v2 = swizzle(v2, .x, .z, .x, .z); + + r2 = @shuffle(f32, v0, v1, [4]i32{ 0, 3, ~@as(i32, 0), ~@as(i32, 1) }); + r2 = swizzle(r2, .x, .z, .w, .y); + + var m: Mat = undefined; + m[0] = r2; + + r2 = @shuffle(f32, v0, v1, [4]i32{ 1, 3, ~@as(i32, 2), ~@as(i32, 3) }); + r2 = swizzle(r2, .z, .x, .w, .y); + m[1] = r2; + + v2 = @shuffle(f32, v2, v0, [4]i32{ 0, 1, ~@as(i32, 2), ~@as(i32, 3) }); + m[2] = v2; + m[3] = f32x4(0.0, 0.0, 0.0, 1.0); + return m; +} +pub fn matFromAxisAngle(axis: Vec, angle: f32) Mat { + assert(!all(axis == splat(F32x4, 0.0), 3)); + assert(!all(isInf(axis), 3)); + const normal = normalize3(axis); + return matFromNormAxisAngle(normal, angle); +} +test "zmath.matrix.matFromAxisAngle" { + { + const m0 = matFromAxisAngle(f32x4(1.0, 0.0, 0.0, 0.0), math.pi * 0.25); + const m1 = rotationX(math.pi * 0.25); + try expectVecApproxEqAbs(m0[0], m1[0], 0.001); + try expectVecApproxEqAbs(m0[1], m1[1], 0.001); + try expectVecApproxEqAbs(m0[2], m1[2], 0.001); + try expectVecApproxEqAbs(m0[3], m1[3], 0.001); + } + { + const m0 = matFromAxisAngle(f32x4(0.0, 1.0, 0.0, 0.0), math.pi * 0.125); + const m1 = rotationY(math.pi * 0.125); + try expectVecApproxEqAbs(m0[0], m1[0], 0.001); + try expectVecApproxEqAbs(m0[1], m1[1], 0.001); + try expectVecApproxEqAbs(m0[2], m1[2], 0.001); + try expectVecApproxEqAbs(m0[3], m1[3], 0.001); + } + { + const m0 = matFromAxisAngle(f32x4(0.0, 0.0, 1.0, 0.0), math.pi * 0.333); + const m1 = rotationZ(math.pi * 0.333); + try expectVecApproxEqAbs(m0[0], m1[0], 0.001); + try expectVecApproxEqAbs(m0[1], m1[1], 0.001); + try expectVecApproxEqAbs(m0[2], m1[2], 0.001); + try expectVecApproxEqAbs(m0[3], m1[3], 0.001); + } +} + +pub fn matFromQuat(quat: Quat) Mat { + const q0 = quat + quat; + var q1 = quat * q0; + + var v0 = swizzle(q1, .y, .x, .x, .w); + v0 = andInt(v0, f32x4_mask3); + + var v1 = swizzle(q1, .z, .z, .y, .w); + v1 = andInt(v1, f32x4_mask3); + + const r0 = (f32x4(1.0, 1.0, 1.0, 0.0) - v0) - v1; + + v0 = swizzle(quat, .x, .x, .y, .w); + v1 = swizzle(q0, .z, .y, .z, .w); + v0 = v0 * v1; + + v1 = swizzle(quat, .w, .w, .w, .w); + const v2 = swizzle(q0, .y, .z, .x, .w); + v1 = v1 * v2; + + const r1 = v0 + v1; + const r2 = v0 - v1; + + v0 = @shuffle(f32, r1, r2, [4]i32{ 1, 2, ~@as(i32, 0), ~@as(i32, 1) }); + v0 = swizzle(v0, .x, .z, .w, .y); + v1 = @shuffle(f32, r1, r2, [4]i32{ 0, 0, ~@as(i32, 2), ~@as(i32, 2) }); + v1 = swizzle(v1, .x, .z, .x, .z); + + q1 = @shuffle(f32, r0, v0, [4]i32{ 0, 3, ~@as(i32, 0), ~@as(i32, 1) }); + q1 = swizzle(q1, .x, .z, .w, .y); + + var m: Mat = undefined; + m[0] = q1; + + q1 = @shuffle(f32, r0, v0, [4]i32{ 1, 3, ~@as(i32, 2), ~@as(i32, 3) }); + q1 = swizzle(q1, .z, .x, .w, .y); + m[1] = q1; + + q1 = @shuffle(f32, v1, r0, [4]i32{ 0, 1, ~@as(i32, 2), ~@as(i32, 3) }); + m[2] = q1; + m[3] = f32x4(0.0, 0.0, 0.0, 1.0); + return m; +} +test "zmath.matrix.matFromQuat" { + { + const m = matFromQuat(f32x4(0.0, 0.0, 0.0, 1.0)); + try expectVecApproxEqAbs(m[0], f32x4(1.0, 0.0, 0.0, 0.0), 0.0001); + try expectVecApproxEqAbs(m[1], f32x4(0.0, 1.0, 0.0, 0.0), 0.0001); + try expectVecApproxEqAbs(m[2], f32x4(0.0, 0.0, 1.0, 0.0), 0.0001); + try expectVecApproxEqAbs(m[3], f32x4(0.0, 0.0, 0.0, 1.0), 0.0001); + } +} + +pub fn matFromRollPitchYaw(pitch: f32, yaw: f32, roll: f32) Mat { + return matFromRollPitchYawV(f32x4(pitch, yaw, roll, 0.0)); +} +pub fn matFromRollPitchYawV(angles: Vec) Mat { + return matFromQuat(quatFromRollPitchYawV(angles)); +} + +pub fn matToQuat(m: Mat) Quat { + return quatFromMat(m); +} + +pub inline fn loadMat(mem: []const f32) Mat { + return .{ + load(mem[0..4], F32x4, 0), + load(mem[4..8], F32x4, 0), + load(mem[8..12], F32x4, 0), + load(mem[12..16], F32x4, 0), + }; +} +test "zmath.loadMat" { + const a = [18]f32{ + 1.0, 2.0, 3.0, 4.0, + 5.0, 6.0, 7.0, 8.0, + 9.0, 10.0, 11.0, 12.0, + 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, + }; + const m = loadMat(a[1..]); + try expectVecEqual(m[0], f32x4(2.0, 3.0, 4.0, 5.0)); + try expectVecEqual(m[1], f32x4(6.0, 7.0, 8.0, 9.0)); + try expectVecEqual(m[2], f32x4(10.0, 11.0, 12.0, 13.0)); + try expectVecEqual(m[3], f32x4(14.0, 15.0, 16.0, 17.0)); +} + +pub inline fn storeMat(mem: []f32, m: Mat) void { + store(mem[0..4], m[0], 0); + store(mem[4..8], m[1], 0); + store(mem[8..12], m[2], 0); + store(mem[12..16], m[3], 0); +} + +pub inline fn loadMat43(mem: []const f32) Mat { + return .{ + f32x4(mem[0], mem[1], mem[2], 0.0), + f32x4(mem[3], mem[4], mem[5], 0.0), + f32x4(mem[6], mem[7], mem[8], 0.0), + f32x4(mem[9], mem[10], mem[11], 1.0), + }; +} + +pub inline fn storeMat43(mem: []f32, m: Mat) void { + store(mem[0..3], m[0], 3); + store(mem[3..6], m[1], 3); + store(mem[6..9], m[2], 3); + store(mem[9..12], m[3], 3); +} + +pub inline fn loadMat34(mem: []const f32) Mat { + return .{ + load(mem[0..4], F32x4, 0), + load(mem[4..8], F32x4, 0), + load(mem[8..12], F32x4, 0), + f32x4(0.0, 0.0, 0.0, 1.0), + }; +} + +pub inline fn storeMat34(mem: []f32, m: Mat) void { + store(mem[0..4], m[0], 0); + store(mem[4..8], m[1], 0); + store(mem[8..12], m[2], 0); +} + +pub inline fn matToArr(m: Mat) [16]f32 { + var array: [16]f32 = undefined; + storeMat(array[0..], m); + return array; +} + +pub inline fn matToArr43(m: Mat) [12]f32 { + var array: [12]f32 = undefined; + storeMat43(array[0..], m); + return array; +} + +pub inline fn matToArr34(m: Mat) [12]f32 { + var array: [12]f32 = undefined; + storeMat34(array[0..], m); + return array; +} +// ------------------------------------------------------------------------------ +// +// 5. Quaternion functions +// +// ------------------------------------------------------------------------------ +pub fn qmul(q0: Quat, q1: Quat) Quat { + var result = swizzle(q1, .w, .w, .w, .w); + var q1x = swizzle(q1, .x, .x, .x, .x); + var q1y = swizzle(q1, .y, .y, .y, .y); + var q1z = swizzle(q1, .z, .z, .z, .z); + result = result * q0; + var q0_shuf = swizzle(q0, .w, .z, .y, .x); + q1x = q1x * q0_shuf; + q0_shuf = swizzle(q0_shuf, .y, .x, .w, .z); + result = mulAdd(q1x, f32x4(1.0, -1.0, 1.0, -1.0), result); + q1y = q1y * q0_shuf; + q0_shuf = swizzle(q0_shuf, .w, .z, .y, .x); + q1y = q1y * f32x4(1.0, 1.0, -1.0, -1.0); + q1z = q1z * q0_shuf; + q1y = mulAdd(q1z, f32x4(-1.0, 1.0, 1.0, -1.0), q1y); + return result + q1y; +} +test "zmath.quaternion.mul" { + { + const q0 = f32x4(2.0, 3.0, 4.0, 1.0); + const q1 = f32x4(3.0, 2.0, 1.0, 4.0); + try expectVecApproxEqAbs(qmul(q0, q1), f32x4(16.0, 4.0, 22.0, -12.0), 0.0001); + } +} + +pub fn quatToMat(quat: Quat) Mat { + return matFromQuat(quat); +} + +pub fn quatToAxisAngle(quat: Quat, axis: *Vec, angle: *f32) void { + axis.* = quat; + angle.* = 2.0 * acos(quat[3]); +} +test "zmath.quaternion.quatToAxisAngle" { + { + const q0 = quatFromNormAxisAngle(f32x4(1.0, 0.0, 0.0, 0.0), 0.25 * math.pi); + var axis: Vec = f32x4(4.0, 3.0, 2.0, 1.0); + var angle: f32 = 10.0; + quatToAxisAngle(q0, &axis, &angle); + try expect(math.approxEqAbs(f32, axis[0], @sin(@as(f32, 0.25) * math.pi * 0.5), 0.0001)); + try expect(axis[1] == 0.0); + try expect(axis[2] == 0.0); + try expect(math.approxEqAbs(f32, angle, 0.25 * math.pi, 0.0001)); + } +} + +pub fn quatFromMat(m: Mat) Quat { + const r0 = m[0]; + const r1 = m[1]; + const r2 = m[2]; + const r00 = swizzle(r0, .x, .x, .x, .x); + const r11 = swizzle(r1, .y, .y, .y, .y); + const r22 = swizzle(r2, .z, .z, .z, .z); + + const x2gey2 = (r11 - r00) <= splat(F32x4, 0.0); + const z2gew2 = (r11 + r00) <= splat(F32x4, 0.0); + const x2py2gez2pw2 = r22 <= splat(F32x4, 0.0); + + var t0 = mulAdd(r00, f32x4(1.0, -1.0, -1.0, 1.0), splat(F32x4, 1.0)); + var t1 = r11 * f32x4(-1.0, 1.0, -1.0, 1.0); + var t2 = mulAdd(r22, f32x4(-1.0, -1.0, 1.0, 1.0), t0); + const x2y2z2w2 = t1 + t2; + + t0 = @shuffle(f32, r0, r1, [4]i32{ 1, 2, ~@as(i32, 2), ~@as(i32, 1) }); + t1 = @shuffle(f32, r1, r2, [4]i32{ 0, 0, ~@as(i32, 0), ~@as(i32, 1) }); + t1 = swizzle(t1, .x, .z, .w, .y); + const xyxzyz = t0 + t1; + + t0 = @shuffle(f32, r2, r1, [4]i32{ 1, 0, ~@as(i32, 0), ~@as(i32, 0) }); + t1 = @shuffle(f32, r1, r0, [4]i32{ 2, 2, ~@as(i32, 2), ~@as(i32, 1) }); + t1 = swizzle(t1, .x, .z, .w, .y); + const xwywzw = (t0 - t1) * f32x4(-1.0, 1.0, -1.0, 1.0); + + t0 = @shuffle(f32, x2y2z2w2, xyxzyz, [4]i32{ 0, 1, ~@as(i32, 0), ~@as(i32, 0) }); + t1 = @shuffle(f32, x2y2z2w2, xwywzw, [4]i32{ 2, 3, ~@as(i32, 2), ~@as(i32, 0) }); + t2 = @shuffle(f32, xyxzyz, xwywzw, [4]i32{ 1, 2, ~@as(i32, 0), ~@as(i32, 1) }); + + const tensor0 = @shuffle(f32, t0, t2, [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) }); + const tensor1 = @shuffle(f32, t0, t2, [4]i32{ 2, 1, ~@as(i32, 1), ~@as(i32, 3) }); + const tensor2 = @shuffle(f32, t2, t1, [4]i32{ 0, 1, ~@as(i32, 0), ~@as(i32, 2) }); + const tensor3 = @shuffle(f32, t2, t1, [4]i32{ 2, 3, ~@as(i32, 2), ~@as(i32, 1) }); + + t0 = select(x2gey2, tensor0, tensor1); + t1 = select(z2gew2, tensor2, tensor3); + t2 = select(x2py2gez2pw2, t0, t1); + + return t2 / length4(t2); +} +test "zmath.quatFromMat" { + { + const q0 = quatFromAxisAngle(f32x4(1.0, 0.0, 0.0, 0.0), 0.25 * math.pi); + const q1 = quatFromMat(rotationX(0.25 * math.pi)); + try expectVecApproxEqAbs(q0, q1, 0.0001); + } + { + const q0 = quatFromAxisAngle(f32x4(1.0, 2.0, 0.5, 0.0), 0.25 * math.pi); + const q1 = quatFromMat(matFromAxisAngle(f32x4(1.0, 2.0, 0.5, 0.0), 0.25 * math.pi)); + try expectVecApproxEqAbs(q0, q1, 0.0001); + } + { + const q0 = quatFromRollPitchYaw(0.1 * math.pi, -0.2 * math.pi, 0.3 * math.pi); + const q1 = quatFromMat(matFromRollPitchYaw(0.1 * math.pi, -0.2 * math.pi, 0.3 * math.pi)); + try expectVecApproxEqAbs(q0, q1, 0.0001); + } +} + +pub fn quatFromNormAxisAngle(axis: Vec, angle: f32) Quat { + const n = f32x4(axis[0], axis[1], axis[2], 1.0); + const sc = sincos(0.5 * angle); + return n * f32x4(sc[0], sc[0], sc[0], sc[1]); +} +pub fn quatFromAxisAngle(axis: Vec, angle: f32) Quat { + assert(!all(axis == splat(F32x4, 0.0), 3)); + assert(!all(isInf(axis), 3)); + const normal = normalize3(axis); + return quatFromNormAxisAngle(normal, angle); +} +test "zmath.quaternion.quatFromNormAxisAngle" { + { + const q0 = quatFromAxisAngle(f32x4(1.0, 0.0, 0.0, 0.0), 0.25 * math.pi); + const q1 = quatFromAxisAngle(f32x4(0.0, 1.0, 0.0, 0.0), 0.125 * math.pi); + const m0 = rotationX(0.25 * math.pi); + const m1 = rotationY(0.125 * math.pi); + const mr0 = quatToMat(qmul(q0, q1)); + const mr1 = mul(m0, m1); + try expectVecApproxEqAbs(mr0[0], mr1[0], 0.0001); + try expectVecApproxEqAbs(mr0[1], mr1[1], 0.0001); + try expectVecApproxEqAbs(mr0[2], mr1[2], 0.0001); + try expectVecApproxEqAbs(mr0[3], mr1[3], 0.0001); + } + { + const m0 = quatToMat(quatFromAxisAngle(f32x4(1.0, 2.0, 0.5, 0.0), 0.25 * math.pi)); + const m1 = matFromAxisAngle(f32x4(1.0, 2.0, 0.5, 0.0), 0.25 * math.pi); + try expectVecApproxEqAbs(m0[0], m1[0], 0.0001); + try expectVecApproxEqAbs(m0[1], m1[1], 0.0001); + try expectVecApproxEqAbs(m0[2], m1[2], 0.0001); + try expectVecApproxEqAbs(m0[3], m1[3], 0.0001); + } +} + +pub inline fn qidentity() Quat { + return f32x4(@as(f32, 0.0), @as(f32, 0.0), @as(f32, 0.0), @as(f32, 1.0)); +} + +pub inline fn conjugate(quat: Quat) Quat { + return quat * f32x4(-1.0, -1.0, -1.0, 1.0); +} + +fn inverseQuat(quat: Quat) Quat { + const l = lengthSq4(quat); + const conj = conjugate(quat); + return select(l <= splat(F32x4, math.floatEps(f32)), splat(F32x4, 0.0), conj / l); +} +test "zmath.quaternion.inverseQuat" { + try expectVecApproxEqAbs( + inverse(f32x4(2.0, 3.0, 4.0, 1.0)), + f32x4(-1.0 / 15.0, -1.0 / 10.0, -2.0 / 15.0, 1.0 / 30.0), + 0.0001, + ); + try expectVecApproxEqAbs(inverse(qidentity()), qidentity(), 0.0001); +} + +// Algorithm from: https://github.com/g-truc/glm/blob/master/glm/detail/type_quat.inl +pub fn rotate(q: Quat, v: Vec) Vec { + const w = splat(F32x4, q[3]); + const axis = f32x4(q[0], q[1], q[2], 0.0); + const uv = cross3(axis, v); + return v + ((uv * w) + cross3(axis, uv)) * splat(F32x4, 2.0); +} +test "zmath.quaternion.rotate" { + const quat = quatFromRollPitchYaw(0.1 * math.pi, 0.2 * math.pi, 0.3 * math.pi); + const mat = matFromQuat(quat); + const forward = f32x4(0.0, 0.0, -1.0, 0.0); + const up = f32x4(0.0, 1.0, 0.0, 0.0); + const right = f32x4(1.0, 0.0, 0.0, 0.0); + try expectVecApproxEqAbs(rotate(quat, forward), mul(forward, mat), 0.0001); + try expectVecApproxEqAbs(rotate(quat, up), mul(up, mat), 0.0001); + try expectVecApproxEqAbs(rotate(quat, right), mul(right, mat), 0.0001); +} + +pub fn slerp(q0: Quat, q1: Quat, t: f32) Quat { + return slerpV(q0, q1, splat(F32x4, t)); +} +pub fn slerpV(q0: Quat, q1: Quat, t: F32x4) Quat { + var cos_omega = dot4(q0, q1); + const sign = select(cos_omega < splat(F32x4, 0.0), splat(F32x4, -1.0), splat(F32x4, 1.0)); + + cos_omega = cos_omega * sign; + const sin_omega = sqrt(splat(F32x4, 1.0) - cos_omega * cos_omega); + + const omega = atan2(sin_omega, cos_omega); + + var v01 = t; + v01 = xorInt(andInt(v01, f32x4_mask2), f32x4_sign_mask1); + v01 = f32x4(1.0, 0.0, 0.0, 0.0) + v01; + + var s0 = sin(v01 * omega) / sin_omega; + s0 = select(cos_omega < splat(F32x4, 1.0 - 0.00001), s0, v01); + + const s1 = swizzle(s0, .y, .y, .y, .y); + s0 = swizzle(s0, .x, .x, .x, .x); + + return q0 * s0 + sign * q1 * s1; +} +test "zmath.quaternion.slerp" { + const from = f32x4(0.0, 0.0, 0.0, 1.0); + const to = f32x4(0.5, 0.5, -0.5, 0.5); + const result = slerp(from, to, 0.5); + try expectVecApproxEqAbs(result, f32x4(0.28867513, 0.28867513, -0.28867513, 0.86602540), 0.0001); +} + +// Converts q back to euler angles, assuming a YXZ rotation order. +// See: http://www.euclideanspace.com/maths/geometry/rotations/conversions/quaternionToEuler +pub fn quatToRollPitchYaw(q: Quat) [3]f32 { + var angles: [3]f32 = undefined; + + const p = swizzle(q, .w, .y, .x, .z); + const sign = -1.0; + + const singularity = p[0] * p[2] + sign * p[1] * p[3]; + if (singularity > 0.499) { + angles[0] = math.pi * 0.5; + angles[1] = 2.0 * math.atan2(p[1], p[0]); + angles[2] = 0.0; + } else if (singularity < -0.499) { + angles[0] = -math.pi * 0.5; + angles[1] = 2.0 * math.atan2(p[1], p[0]); + angles[2] = 0.0; + } else { + const sq = p * p; + const y = splat(F32x4, 2.0) * f32x4(p[0] * p[1] - sign * p[2] * p[3], p[0] * p[3] - sign * p[1] * p[2], 0.0, 0.0); + const x = splat(F32x4, 1.0) - (splat(F32x4, 2.0) * f32x4(sq[1] + sq[2], sq[2] + sq[3], 0.0, 0.0)); + const res = atan2(y, x); + angles[0] = math.asin(2.0 * singularity); + angles[1] = res[0]; + angles[2] = res[1]; + } + + return angles; +} + +test "zmath.quaternion.quatToRollPitchYaw" { + { + const expected = f32x4(0.1 * math.pi, 0.2 * math.pi, 0.3 * math.pi, 0.0); + const quat = quatFromRollPitchYaw(expected[0], expected[1], expected[2]); + const result = quatToRollPitchYaw(quat); + try expectVecApproxEqAbs(loadArr3(result), expected, 0.0001); + } + + { + const expected = f32x4(0.3 * math.pi, 0.1 * math.pi, 0.2 * math.pi, 0.0); + const quat = quatFromRollPitchYaw(expected[0], expected[1], expected[2]); + const result = quatToRollPitchYaw(quat); + try expectVecApproxEqAbs(loadArr3(result), expected, 0.0001); + } + + // North pole singularity + { + const angle = f32x4(0.5 * math.pi, 0.2 * math.pi, 0.3 * math.pi, 0.0); + const expected = f32x4(0.5 * math.pi, -0.1 * math.pi, 0.0, 0.0); + const quat = quatFromRollPitchYaw(angle[0], angle[1], angle[2]); + const result = quatToRollPitchYaw(quat); + try expectVecApproxEqAbs(loadArr3(result), expected, 0.0001); + } + + // South pole singularity + { + const angle = f32x4(-0.5 * math.pi, 0.2 * math.pi, 0.3 * math.pi, 0.0); + const expected = f32x4(-0.5 * math.pi, 0.5 * math.pi, 0.0, 0.0); + const quat = quatFromRollPitchYaw(angle[0], angle[1], angle[2]); + const result = quatToRollPitchYaw(quat); + try expectVecApproxEqAbs(loadArr3(result), expected, 0.0001); + } +} + +pub fn quatFromRollPitchYaw(pitch: f32, yaw: f32, roll: f32) Quat { + return quatFromRollPitchYawV(f32x4(pitch, yaw, roll, 0.0)); +} +pub fn quatFromRollPitchYawV(angles: Vec) Quat { // | pitch | yaw | roll | 0 | + const sc = sincos(splat(Vec, 0.5) * angles); + const p0 = @shuffle(f32, sc[1], sc[0], [4]i32{ ~@as(i32, 0), 0, 0, 0 }); + const p1 = @shuffle(f32, sc[0], sc[1], [4]i32{ ~@as(i32, 0), 0, 0, 0 }); + const y0 = @shuffle(f32, sc[1], sc[0], [4]i32{ 1, ~@as(i32, 1), 1, 1 }); + const y1 = @shuffle(f32, sc[0], sc[1], [4]i32{ 1, ~@as(i32, 1), 1, 1 }); + const r0 = @shuffle(f32, sc[1], sc[0], [4]i32{ 2, 2, ~@as(i32, 2), 2 }); + const r1 = @shuffle(f32, sc[0], sc[1], [4]i32{ 2, 2, ~@as(i32, 2), 2 }); + const q1 = p1 * f32x4(1.0, -1.0, -1.0, 1.0) * y1; + const q0 = p0 * y0 * r0; + return mulAdd(q1, r1, q0); +} +test "zmath.quaternion.quatFromRollPitchYawV" { + { + const m0 = quatToMat(quatFromRollPitchYawV(f32x4(0.25 * math.pi, 0.0, 0.0, 0.0))); + const m1 = rotationX(0.25 * math.pi); + try expectVecApproxEqAbs(m0[0], m1[0], 0.0001); + try expectVecApproxEqAbs(m0[1], m1[1], 0.0001); + try expectVecApproxEqAbs(m0[2], m1[2], 0.0001); + try expectVecApproxEqAbs(m0[3], m1[3], 0.0001); + } + { + const m0 = quatToMat(quatFromRollPitchYaw(0.1 * math.pi, 0.2 * math.pi, 0.3 * math.pi)); + const m1 = mul( + rotationZ(0.3 * math.pi), + mul(rotationX(0.1 * math.pi), rotationY(0.2 * math.pi)), + ); + try expectVecApproxEqAbs(m0[0], m1[0], 0.0001); + try expectVecApproxEqAbs(m0[1], m1[1], 0.0001); + try expectVecApproxEqAbs(m0[2], m1[2], 0.0001); + try expectVecApproxEqAbs(m0[3], m1[3], 0.0001); + } +} +// ------------------------------------------------------------------------------ +// +// 6. Color functions +// +// ------------------------------------------------------------------------------ +pub fn adjustSaturation(color: F32x4, saturation: f32) F32x4 { + const luminance = dot3(f32x4(0.2125, 0.7154, 0.0721, 0.0), color); + var result = mulAdd(color - luminance, f32x4s(saturation), luminance); + result[3] = color[3]; + return result; +} + +pub fn adjustContrast(color: F32x4, contrast: f32) F32x4 { + var result = mulAdd(color - f32x4s(0.5), f32x4s(contrast), f32x4s(0.5)); + result[3] = color[3]; + return result; +} + +pub fn rgbToHsl(rgb: F32x4) F32x4 { + const r = swizzle(rgb, .x, .x, .x, .x); + const g = swizzle(rgb, .y, .y, .y, .y); + const b = swizzle(rgb, .z, .z, .z, .z); + + const minv = min(r, min(g, b)); + const maxv = max(r, max(g, b)); + + const l = (minv + maxv) * f32x4s(0.5); + const d = maxv - minv; + const la = select(boolx4(true, true, true, false), l, rgb); + + if (all(d < f32x4s(math.floatEps(f32)), 3)) { + return select(boolx4(true, true, false, false), f32x4s(0.0), la); + } else { + var s: F32x4 = undefined; + var h: F32x4 = undefined; + + const d2 = minv + maxv; + + if (all(l > f32x4s(0.5), 3)) { + s = d / (f32x4s(2.0) - d2); + } else { + s = d / d2; + } + + if (all(r == maxv, 3)) { + h = (g - b) / d; + } else if (all(g == maxv, 3)) { + h = f32x4s(2.0) + (b - r) / d; + } else { + h = f32x4s(4.0) + (r - g) / d; + } + + h /= f32x4s(6.0); + + if (all(h < f32x4s(0.0), 3)) { + h += f32x4s(1.0); + } + + const lha = select(boolx4(true, true, false, false), h, la); + return select(boolx4(true, false, true, true), lha, s); + } +} +test "zmath.color.rgbToHsl" { + try expectVecApproxEqAbs(rgbToHsl(f32x4(0.2, 0.4, 0.8, 1.0)), f32x4(0.6111, 0.6, 0.5, 1.0), 0.0001); + try expectVecApproxEqAbs(rgbToHsl(f32x4(1.0, 0.0, 0.0, 0.5)), f32x4(0.0, 1.0, 0.5, 0.5), 0.0001); + try expectVecApproxEqAbs(rgbToHsl(f32x4(0.0, 1.0, 0.0, 0.25)), f32x4(0.3333, 1.0, 0.5, 0.25), 0.0001); + try expectVecApproxEqAbs(rgbToHsl(f32x4(0.0, 0.0, 1.0, 1.0)), f32x4(0.6666, 1.0, 0.5, 1.0), 0.0001); + try expectVecApproxEqAbs(rgbToHsl(f32x4(0.0, 0.0, 0.0, 1.0)), f32x4(0.0, 0.0, 0.0, 1.0), 0.0001); + try expectVecApproxEqAbs(rgbToHsl(f32x4(1.0, 1.0, 1.0, 1.0)), f32x4(0.0, 0.0, 1.0, 1.0), 0.0001); +} + +fn hueToClr(p: F32x4, q: F32x4, h: F32x4) F32x4 { + var t = h; + + if (all(t < f32x4s(0.0), 3)) + t += f32x4s(1.0); + + if (all(t > f32x4s(1.0), 3)) + t -= f32x4s(1.0); + + if (all(t < f32x4s(1.0 / 6.0), 3)) + return mulAdd(q - p, f32x4s(6.0) * t, p); + + if (all(t < f32x4s(0.5), 3)) + return q; + + if (all(t < f32x4s(2.0 / 3.0), 3)) + return mulAdd(q - p, f32x4s(6.0) * (f32x4s(2.0 / 3.0) - t), p); + + return p; +} + +pub fn hslToRgb(hsl: F32x4) F32x4 { + const s = swizzle(hsl, .y, .y, .y, .y); + const l = swizzle(hsl, .z, .z, .z, .z); + + if (all(isNearEqual(s, f32x4s(0.0), f32x4s(math.floatEps(f32))), 3)) { + return select(boolx4(true, true, true, false), l, hsl); + } else { + const h = swizzle(hsl, .x, .x, .x, .x); + var q: F32x4 = undefined; + if (all(l < f32x4s(0.5), 3)) { + q = l * (f32x4s(1.0) + s); + } else { + q = (l + s) - (l * s); + } + + const p = f32x4s(2.0) * l - q; + + const r = hueToClr(p, q, h + f32x4s(1.0 / 3.0)); + const g = hueToClr(p, q, h); + const b = hueToClr(p, q, h - f32x4s(1.0 / 3.0)); + + const rg = select(boolx4(true, false, false, false), r, g); + const ba = select(boolx4(true, true, true, false), b, hsl); + return select(boolx4(true, true, false, false), rg, ba); + } +} +test "zmath.color.hslToRgb" { + try expectVecApproxEqAbs(f32x4(0.2, 0.4, 0.8, 1.0), hslToRgb(f32x4(0.6111, 0.6, 0.5, 1.0)), 0.0001); + try expectVecApproxEqAbs(f32x4(1.0, 0.0, 0.0, 0.5), hslToRgb(f32x4(0.0, 1.0, 0.5, 0.5)), 0.0001); + try expectVecApproxEqAbs(f32x4(0.0, 1.0, 0.0, 0.25), hslToRgb(f32x4(0.3333, 1.0, 0.5, 0.25)), 0.0005); + try expectVecApproxEqAbs(f32x4(0.0, 0.0, 1.0, 1.0), hslToRgb(f32x4(0.6666, 1.0, 0.5, 1.0)), 0.0005); + try expectVecApproxEqAbs(f32x4(0.0, 0.0, 0.0, 1.0), hslToRgb(f32x4(0.0, 0.0, 0.0, 1.0)), 0.0001); + try expectVecApproxEqAbs(f32x4(1.0, 1.0, 1.0, 1.0), hslToRgb(f32x4(0.0, 0.0, 1.0, 1.0)), 0.0001); + try expectVecApproxEqAbs(hslToRgb(rgbToHsl(f32x4(1.0, 1.0, 1.0, 1.0))), f32x4(1.0, 1.0, 1.0, 1.0), 0.0005); + try expectVecApproxEqAbs( + hslToRgb(rgbToHsl(f32x4(0.82198, 0.1839, 0.632, 1.0))), + f32x4(0.82198, 0.1839, 0.632, 1.0), + 0.0005, + ); + try expectVecApproxEqAbs( + rgbToHsl(hslToRgb(f32x4(0.82198, 0.1839, 0.632, 1.0))), + f32x4(0.82198, 0.1839, 0.632, 1.0), + 0.0005, + ); + try expectVecApproxEqAbs( + rgbToHsl(hslToRgb(f32x4(0.1839, 0.82198, 0.632, 1.0))), + f32x4(0.1839, 0.82198, 0.632, 1.0), + 0.0005, + ); + try expectVecApproxEqAbs( + hslToRgb(rgbToHsl(f32x4(0.1839, 0.632, 0.82198, 1.0))), + f32x4(0.1839, 0.632, 0.82198, 1.0), + 0.0005, + ); +} + +pub fn rgbToHsv(rgb: F32x4) F32x4 { + const r = swizzle(rgb, .x, .x, .x, .x); + const g = swizzle(rgb, .y, .y, .y, .y); + const b = swizzle(rgb, .z, .z, .z, .z); + + const minv = min(r, min(g, b)); + const v = max(r, max(g, b)); + const d = v - minv; + const s = if (all(isNearEqual(v, f32x4s(0.0), f32x4s(math.floatEps(f32))), 3)) f32x4s(0.0) else d / v; + + if (all(d < f32x4s(math.floatEps(f32)), 3)) { + const hv = select(boolx4(true, false, false, false), f32x4s(0.0), v); + const hva = select(boolx4(true, true, true, false), hv, rgb); + return select(boolx4(true, false, true, true), hva, s); + } else { + var h: F32x4 = undefined; + if (all(r == v, 3)) { + h = (g - b) / d; + if (all(g < b, 3)) + h += f32x4s(6.0); + } else if (all(g == v, 3)) { + h = f32x4s(2.0) + (b - r) / d; + } else { + h = f32x4s(4.0) + (r - g) / d; + } + + h /= f32x4s(6.0); + const hv = select(boolx4(true, false, false, false), h, v); + const hva = select(boolx4(true, true, true, false), hv, rgb); + return select(boolx4(true, false, true, true), hva, s); + } +} +test "zmath.color.rgbToHsv" { + try expectVecApproxEqAbs(rgbToHsv(f32x4(0.2, 0.4, 0.8, 1.0)), f32x4(0.6111, 0.75, 0.8, 1.0), 0.0001); + try expectVecApproxEqAbs(rgbToHsv(f32x4(0.4, 0.2, 0.8, 1.0)), f32x4(0.7222, 0.75, 0.8, 1.0), 0.0001); + try expectVecApproxEqAbs(rgbToHsv(f32x4(0.4, 0.8, 0.2, 1.0)), f32x4(0.2777, 0.75, 0.8, 1.0), 0.0001); + try expectVecApproxEqAbs(rgbToHsv(f32x4(1.0, 0.0, 0.0, 0.5)), f32x4(0.0, 1.0, 1.0, 0.5), 0.0001); + try expectVecApproxEqAbs(rgbToHsv(f32x4(0.0, 1.0, 0.0, 0.25)), f32x4(0.3333, 1.0, 1.0, 0.25), 0.0001); + try expectVecApproxEqAbs(rgbToHsv(f32x4(0.0, 0.0, 1.0, 1.0)), f32x4(0.6666, 1.0, 1.0, 1.0), 0.0001); + try expectVecApproxEqAbs(rgbToHsv(f32x4(0.0, 0.0, 0.0, 1.0)), f32x4(0.0, 0.0, 0.0, 1.0), 0.0001); + try expectVecApproxEqAbs(rgbToHsv(f32x4(1.0, 1.0, 1.0, 1.0)), f32x4(0.0, 0.0, 1.0, 1.0), 0.0001); +} + +pub fn hsvToRgb(hsv: F32x4) F32x4 { + const h = swizzle(hsv, .x, .x, .x, .x); + const s = swizzle(hsv, .y, .y, .y, .y); + const v = swizzle(hsv, .z, .z, .z, .z); + + const h6 = h * f32x4s(6.0); + const i = floor(h6); + const f = h6 - i; + + const p = v * (f32x4s(1.0) - s); + const q = v * (f32x4s(1.0) - f * s); + const t = v * (f32x4s(1.0) - (f32x4s(1.0) - f) * s); + + const ii = @as(i32, @intFromFloat(mod(i, f32x4s(6.0))[0])); + const rgb = switch (ii) { + 0 => blk: { + const vt = select(boolx4(true, false, false, false), v, t); + break :blk select(boolx4(true, true, false, false), vt, p); + }, + 1 => blk: { + const qv = select(boolx4(true, false, false, false), q, v); + break :blk select(boolx4(true, true, false, false), qv, p); + }, + 2 => blk: { + const pv = select(boolx4(true, false, false, false), p, v); + break :blk select(boolx4(true, true, false, false), pv, t); + }, + 3 => blk: { + const pq = select(boolx4(true, false, false, false), p, q); + break :blk select(boolx4(true, true, false, false), pq, v); + }, + 4 => blk: { + const tp = select(boolx4(true, false, false, false), t, p); + break :blk select(boolx4(true, true, false, false), tp, v); + }, + 5 => blk: { + const vp = select(boolx4(true, false, false, false), v, p); + break :blk select(boolx4(true, true, false, false), vp, q); + }, + else => unreachable, + }; + return select(boolx4(true, true, true, false), rgb, hsv); +} +test "zmath.color.hsvToRgb" { + const epsilon = 0.0005; + try expectVecApproxEqAbs(f32x4(0.2, 0.4, 0.8, 1.0), hsvToRgb(f32x4(0.6111, 0.75, 0.8, 1.0)), epsilon); + try expectVecApproxEqAbs(f32x4(0.4, 0.2, 0.8, 1.0), hsvToRgb(f32x4(0.7222, 0.75, 0.8, 1.0)), epsilon); + try expectVecApproxEqAbs(f32x4(0.4, 0.8, 0.2, 1.0), hsvToRgb(f32x4(0.2777, 0.75, 0.8, 1.0)), epsilon); + try expectVecApproxEqAbs(f32x4(1.0, 0.0, 0.0, 0.5), hsvToRgb(f32x4(0.0, 1.0, 1.0, 0.5)), epsilon); + try expectVecApproxEqAbs(f32x4(0.0, 1.0, 0.0, 0.25), hsvToRgb(f32x4(0.3333, 1.0, 1.0, 0.25)), epsilon); + try expectVecApproxEqAbs(f32x4(0.0, 0.0, 1.0, 1.0), hsvToRgb(f32x4(0.6666, 1.0, 1.0, 1.0)), epsilon); + try expectVecApproxEqAbs(f32x4(0.0, 0.0, 0.0, 1.0), hsvToRgb(f32x4(0.0, 0.0, 0.0, 1.0)), epsilon); + try expectVecApproxEqAbs(f32x4(1.0, 1.0, 1.0, 1.0), hsvToRgb(f32x4(0.0, 0.0, 1.0, 1.0)), epsilon); + try expectVecApproxEqAbs( + hsvToRgb(rgbToHsv(f32x4(0.1839, 0.632, 0.82198, 1.0))), + f32x4(0.1839, 0.632, 0.82198, 1.0), + epsilon, + ); + try expectVecApproxEqAbs( + hsvToRgb(rgbToHsv(f32x4(0.82198, 0.1839, 0.632, 1.0))), + f32x4(0.82198, 0.1839, 0.632, 1.0), + epsilon, + ); + try expectVecApproxEqAbs( + rgbToHsv(hsvToRgb(f32x4(0.82198, 0.1839, 0.632, 1.0))), + f32x4(0.82198, 0.1839, 0.632, 1.0), + epsilon, + ); + try expectVecApproxEqAbs( + rgbToHsv(hsvToRgb(f32x4(0.1839, 0.82198, 0.632, 1.0))), + f32x4(0.1839, 0.82198, 0.632, 1.0), + epsilon, + ); +} + +pub fn rgbToSrgb(rgb: F32x4) F32x4 { + const static = struct { + const cutoff = f32x4(0.0031308, 0.0031308, 0.0031308, 1.0); + const linear = f32x4(12.92, 12.92, 12.92, 1.0); + const scale = f32x4(1.055, 1.055, 1.055, 1.0); + const bias = f32x4(0.055, 0.055, 0.055, 1.0); + const rgamma = 1.0 / 2.4; + }; + var v = saturate(rgb); + const v0 = v * static.linear; + const v1 = static.scale * f32x4( + math.pow(f32, v[0], static.rgamma), + math.pow(f32, v[1], static.rgamma), + math.pow(f32, v[2], static.rgamma), + v[3], + ) - static.bias; + v = select(v < static.cutoff, v0, v1); + return select(boolx4(true, true, true, false), v, rgb); +} +test "zmath.color.rgbToSrgb" { + const epsilon = 0.001; + try expectVecApproxEqAbs(rgbToSrgb(f32x4(0.2, 0.4, 0.8, 1.0)), f32x4(0.484, 0.665, 0.906, 1.0), epsilon); +} + +pub fn srgbToRgb(srgb: F32x4) F32x4 { + const static = struct { + const cutoff = f32x4(0.04045, 0.04045, 0.04045, 1.0); + const rlinear = f32x4(1.0 / 12.92, 1.0 / 12.92, 1.0 / 12.92, 1.0); + const scale = f32x4(1.0 / 1.055, 1.0 / 1.055, 1.0 / 1.055, 1.0); + const bias = f32x4(0.055, 0.055, 0.055, 1.0); + const gamma = 2.4; + }; + var v = saturate(srgb); + const v0 = v * static.rlinear; + var v1 = static.scale * (v + static.bias); + v1 = f32x4( + math.pow(f32, v1[0], static.gamma), + math.pow(f32, v1[1], static.gamma), + math.pow(f32, v1[2], static.gamma), + v1[3], + ); + v = select(v > static.cutoff, v1, v0); + return select(boolx4(true, true, true, false), v, srgb); +} +test "zmath.color.srgbToRgb" { + const epsilon = 0.0007; + try expectVecApproxEqAbs(f32x4(0.2, 0.4, 0.8, 1.0), srgbToRgb(f32x4(0.484, 0.665, 0.906, 1.0)), epsilon); + try expectVecApproxEqAbs( + rgbToSrgb(srgbToRgb(f32x4(0.1839, 0.82198, 0.632, 1.0))), + f32x4(0.1839, 0.82198, 0.632, 1.0), + epsilon, + ); +} +// ------------------------------------------------------------------------------ +// +// X. Misc functions +// +// ------------------------------------------------------------------------------ +pub fn linePointDistance(linept0: Vec, linept1: Vec, pt: Vec) F32x4 { + const ptvec = pt - linept0; + const linevec = linept1 - linept0; + const scale = dot3(ptvec, linevec) / lengthSq3(linevec); + return length3(ptvec - linevec * scale); +} +test "zmath.linePointDistance" { + { + const linept0 = f32x4(-1.0, -2.0, -3.0, 1.0); + const linept1 = f32x4(1.0, 2.0, 3.0, 1.0); + const pt = f32x4(1.0, 1.0, 1.0, 1.0); + const v = linePointDistance(linept0, linept1, pt); + try expectVecApproxEqAbs(v, splat(F32x4, 0.654), 0.001); + } +} + +fn sin32(v: f32) f32 { + var y = v - math.tau * @round(v * 1.0 / math.tau); + + if (y > 0.5 * math.pi) { + y = math.pi - y; + } else if (y < -math.pi * 0.5) { + y = -math.pi - y; + } + const y2 = y * y; + + // 11-degree minimax approximation + var sinv = mulAdd(@as(f32, -2.3889859e-08), y2, 2.7525562e-06); + sinv = mulAdd(sinv, y2, -0.00019840874); + sinv = mulAdd(sinv, y2, 0.0083333310); + sinv = mulAdd(sinv, y2, -0.16666667); + return y * mulAdd(sinv, y2, 1.0); +} +fn cos32(v: f32) f32 { + var y = v - math.tau * @round(v * 1.0 / math.tau); + + const sign = blk: { + if (y > 0.5 * math.pi) { + y = math.pi - y; + break :blk @as(f32, -1.0); + } else if (y < -math.pi * 0.5) { + y = -math.pi - y; + break :blk @as(f32, -1.0); + } else { + break :blk @as(f32, 1.0); + } + }; + const y2 = y * y; + + // 10-degree minimax approximation + var cosv = mulAdd(@as(f32, -2.6051615e-07), y2, 2.4760495e-05); + cosv = mulAdd(cosv, y2, -0.0013888378); + cosv = mulAdd(cosv, y2, 0.041666638); + cosv = mulAdd(cosv, y2, -0.5); + return sign * mulAdd(cosv, y2, 1.0); +} +fn sincos32(v: f32) [2]f32 { + var y = v - math.tau * @round(v * 1.0 / math.tau); + + const sign = blk: { + if (y > 0.5 * math.pi) { + y = math.pi - y; + break :blk @as(f32, -1.0); + } else if (y < -math.pi * 0.5) { + y = -math.pi - y; + break :blk @as(f32, -1.0); + } else { + break :blk @as(f32, 1.0); + } + }; + const y2 = y * y; + + // 11-degree minimax approximation + var sinv = mulAdd(@as(f32, -2.3889859e-08), y2, 2.7525562e-06); + sinv = mulAdd(sinv, y2, -0.00019840874); + sinv = mulAdd(sinv, y2, 0.0083333310); + sinv = mulAdd(sinv, y2, -0.16666667); + sinv = y * mulAdd(sinv, y2, 1.0); + + // 10-degree minimax approximation + var cosv = mulAdd(@as(f32, -2.6051615e-07), y2, 2.4760495e-05); + cosv = mulAdd(cosv, y2, -0.0013888378); + cosv = mulAdd(cosv, y2, 0.041666638); + cosv = mulAdd(cosv, y2, -0.5); + cosv = sign * mulAdd(cosv, y2, 1.0); + + return .{ sinv, cosv }; +} +test "zmath.sincos32" { + const epsilon = 0.0001; + + try expect(math.isNan(sincos32(math.inf(f32))[0])); + try expect(math.isNan(sincos32(math.inf(f32))[1])); + try expect(math.isNan(sincos32(-math.inf(f32))[0])); + try expect(math.isNan(sincos32(-math.inf(f32))[1])); + try expect(math.isNan(sincos32(math.nan(f32))[0])); + try expect(math.isNan(sincos32(-math.nan(f32))[1])); + + try expect(math.isNan(sin32(math.inf(f32)))); + try expect(math.isNan(cos32(math.inf(f32)))); + try expect(math.isNan(sin32(-math.inf(f32)))); + try expect(math.isNan(cos32(-math.inf(f32)))); + try expect(math.isNan(sin32(math.nan(f32)))); + try expect(math.isNan(cos32(-math.nan(f32)))); + + var f: f32 = -100.0; + var i: u32 = 0; + while (i < 100) : (i += 1) { + const sc = sincos32(f); + const s0 = sin32(f); + const c0 = cos32(f); + const s = @sin(f); + const c = @cos(f); + try expect(math.approxEqAbs(f32, sc[0], s, epsilon)); + try expect(math.approxEqAbs(f32, sc[1], c, epsilon)); + try expect(math.approxEqAbs(f32, s0, s, epsilon)); + try expect(math.approxEqAbs(f32, c0, c, epsilon)); + f += 0.12345 * @as(f32, @floatFromInt(i)); + } +} + +fn asin32(v: f32) f32 { + const x = @abs(v); + var omx = 1.0 - x; + if (omx < 0.0) { + omx = 0.0; + } + const root = @sqrt(omx); + + // 7-degree minimax approximation + var result = mulAdd(@as(f32, -0.0012624911), x, 0.0066700901); + result = mulAdd(result, x, -0.0170881256); + result = mulAdd(result, x, 0.0308918810); + result = mulAdd(result, x, -0.0501743046); + result = mulAdd(result, x, 0.0889789874); + result = mulAdd(result, x, -0.2145988016); + result = root * mulAdd(result, x, 1.5707963050); + + return if (v >= 0.0) 0.5 * math.pi - result else result - 0.5 * math.pi; +} +test "zmath.asin32" { + const epsilon = 0.0001; + + try expect(math.approxEqAbs(f32, asin(@as(f32, -1.1)), -0.5 * math.pi, epsilon)); + try expect(math.approxEqAbs(f32, asin(@as(f32, 1.1)), 0.5 * math.pi, epsilon)); + try expect(math.approxEqAbs(f32, asin(@as(f32, -1000.1)), -0.5 * math.pi, epsilon)); + try expect(math.approxEqAbs(f32, asin(@as(f32, 100000.1)), 0.5 * math.pi, epsilon)); + try expect(math.isNan(asin(math.inf(f32)))); + try expect(math.isNan(asin(-math.inf(f32)))); + try expect(math.isNan(asin(math.nan(f32)))); + try expect(math.isNan(asin(-math.nan(f32)))); + + try expectVecApproxEqAbs(asin(splat(F32x8, -100.0)), splat(F32x8, -0.5 * math.pi), epsilon); + try expectVecApproxEqAbs(asin(splat(F32x16, 100.0)), splat(F32x16, 0.5 * math.pi), epsilon); + try expect(all(isNan(asin(splat(F32x4, math.inf(f32)))), 0) == true); + try expect(all(isNan(asin(splat(F32x4, -math.inf(f32)))), 0) == true); + try expect(all(isNan(asin(splat(F32x4, math.nan(f32)))), 0) == true); + try expect(all(isNan(asin(splat(F32x4, math.snan(f32)))), 0) == true); + + var f: f32 = -1.0; + var i: u32 = 0; + while (i < 8) : (i += 1) { + const r0 = asin32(f); + const r1 = math.asin(f); + const r4 = asin(splat(F32x4, f)); + const r8 = asin(splat(F32x8, f)); + const r16 = asin(splat(F32x16, f)); + try expect(math.approxEqAbs(f32, r0, r1, epsilon)); + try expectVecApproxEqAbs(r4, splat(F32x4, r1), epsilon); + try expectVecApproxEqAbs(r8, splat(F32x8, r1), epsilon); + try expectVecApproxEqAbs(r16, splat(F32x16, r1), epsilon); + f += 0.09 * @as(f32, @floatFromInt(i)); + } +} + +fn acos32(v: f32) f32 { + const x = @abs(v); + var omx = 1.0 - x; + if (omx < 0.0) { + omx = 0.0; + } + const root = @sqrt(omx); + + // 7-degree minimax approximation + var result = mulAdd(@as(f32, -0.0012624911), x, 0.0066700901); + result = mulAdd(result, x, -0.0170881256); + result = mulAdd(result, x, 0.0308918810); + result = mulAdd(result, x, -0.0501743046); + result = mulAdd(result, x, 0.0889789874); + result = mulAdd(result, x, -0.2145988016); + result = root * mulAdd(result, x, 1.5707963050); + + return if (v >= 0.0) result else math.pi - result; +} +test "zmath.acos32" { + const epsilon = 0.1; + + try expect(math.approxEqAbs(f32, acos(@as(f32, -1.1)), math.pi, epsilon)); + try expect(math.approxEqAbs(f32, acos(@as(f32, -10000.1)), math.pi, epsilon)); + try expect(math.approxEqAbs(f32, acos(@as(f32, 1.1)), 0.0, epsilon)); + try expect(math.approxEqAbs(f32, acos(@as(f32, 1000.1)), 0.0, epsilon)); + try expect(math.isNan(acos(math.inf(f32)))); + try expect(math.isNan(acos(-math.inf(f32)))); + try expect(math.isNan(acos(math.nan(f32)))); + try expect(math.isNan(acos(-math.nan(f32)))); + + try expectVecApproxEqAbs(acos(splat(F32x8, -100.0)), splat(F32x8, math.pi), epsilon); + try expectVecApproxEqAbs(acos(splat(F32x16, 100.0)), splat(F32x16, 0.0), epsilon); + try expect(all(isNan(acos(splat(F32x4, math.inf(f32)))), 0) == true); + try expect(all(isNan(acos(splat(F32x4, -math.inf(f32)))), 0) == true); + try expect(all(isNan(acos(splat(F32x4, math.nan(f32)))), 0) == true); + try expect(all(isNan(acos(splat(F32x4, math.snan(f32)))), 0) == true); + + var f: f32 = -1.0; + var i: u32 = 0; + while (i < 8) : (i += 1) { + const r0 = acos32(f); + const r1 = math.acos(f); + const r4 = acos(splat(F32x4, f)); + const r8 = acos(splat(F32x8, f)); + const r16 = acos(splat(F32x16, f)); + try expect(math.approxEqAbs(f32, r0, r1, epsilon)); + try expectVecApproxEqAbs(r4, splat(F32x4, r1), epsilon); + try expectVecApproxEqAbs(r8, splat(F32x8, r1), epsilon); + try expectVecApproxEqAbs(r16, splat(F32x16, r1), epsilon); + f += 0.09 * @as(f32, @floatFromInt(i)); + } +} + +pub fn modAngle32(in_angle: f32) f32 { + const angle = in_angle + math.pi; + var temp: f32 = @abs(angle); + temp = temp - (2.0 * math.pi * @as(f32, @floatFromInt(@as(i32, @intFromFloat(temp / math.pi))))); + temp = temp - math.pi; + if (angle < 0.0) { + temp = -temp; + } + return temp; +} + +pub fn cmulSoa(re0: anytype, im0: anytype, re1: anytype, im1: anytype) [2]@TypeOf(re0, im0, re1, im1) { + const re0_re1 = re0 * re1; + const re0_im1 = re0 * im1; + return .{ + mulAdd(-im0, im1, re0_re1), // re + mulAdd(re1, im0, re0_im1), // im + }; +} +// ------------------------------------------------------------------------------ +// +// FFT (implementation based on xdsp.h from DirectXMath) +// +// ------------------------------------------------------------------------------ +fn fftButterflyDit4_1(re0: *F32x4, im0: *F32x4) void { + const re0l = swizzle(re0.*, .x, .x, .y, .y); + const re0h = swizzle(re0.*, .z, .z, .w, .w); + + const im0l = swizzle(im0.*, .x, .x, .y, .y); + const im0h = swizzle(im0.*, .z, .z, .w, .w); + + const re_temp = mulAdd(re0h, f32x4(1.0, -1.0, 1.0, -1.0), re0l); + const im_temp = mulAdd(im0h, f32x4(1.0, -1.0, 1.0, -1.0), im0l); + + const re_shuf0 = @shuffle(f32, re_temp, im_temp, [4]i32{ 2, 3, ~@as(i32, 2), ~@as(i32, 3) }); + const re_shuf = swizzle(re_shuf0, .x, .w, .x, .w); + const im_shuf = swizzle(re_shuf0, .z, .y, .z, .y); + + const re_templ = swizzle(re_temp, .x, .y, .x, .y); + const im_templ = swizzle(im_temp, .x, .y, .x, .y); + + re0.* = mulAdd(re_shuf, f32x4(1.0, 1.0, -1.0, -1.0), re_templ); + im0.* = mulAdd(im_shuf, f32x4(1.0, -1.0, -1.0, 1.0), im_templ); +} + +fn fftButterflyDit4_4( + re0: *F32x4, + re1: *F32x4, + re2: *F32x4, + re3: *F32x4, + im0: *F32x4, + im1: *F32x4, + im2: *F32x4, + im3: *F32x4, + unity_table_re: []const F32x4, + unity_table_im: []const F32x4, + stride: u32, + last: bool, +) void { + const re_temp0 = re0.* + re2.*; + const im_temp0 = im0.* + im2.*; + + const re_temp2 = re1.* + re3.*; + const im_temp2 = im1.* + im3.*; + + const re_temp1 = re0.* - re2.*; + const im_temp1 = im0.* - im2.*; + + const re_temp3 = re1.* - re3.*; + const im_temp3 = im1.* - im3.*; + + var re_temp4 = re_temp0 + re_temp2; + var im_temp4 = im_temp0 + im_temp2; + + var re_temp5 = re_temp1 + im_temp3; + var im_temp5 = im_temp1 - re_temp3; + + var re_temp6 = re_temp0 - re_temp2; + var im_temp6 = im_temp0 - im_temp2; + + var re_temp7 = re_temp1 - im_temp3; + var im_temp7 = im_temp1 + re_temp3; + + { + const re_im = cmulSoa(re_temp5, im_temp5, unity_table_re[stride], unity_table_im[stride]); + re_temp5 = re_im[0]; + im_temp5 = re_im[1]; + } + { + const re_im = cmulSoa(re_temp6, im_temp6, unity_table_re[stride * 2], unity_table_im[stride * 2]); + re_temp6 = re_im[0]; + im_temp6 = re_im[1]; + } + { + const re_im = cmulSoa(re_temp7, im_temp7, unity_table_re[stride * 3], unity_table_im[stride * 3]); + re_temp7 = re_im[0]; + im_temp7 = re_im[1]; + } + + if (last) { + fftButterflyDit4_1(&re_temp4, &im_temp4); + fftButterflyDit4_1(&re_temp5, &im_temp5); + fftButterflyDit4_1(&re_temp6, &im_temp6); + fftButterflyDit4_1(&re_temp7, &im_temp7); + } + + re0.* = re_temp4; + im0.* = im_temp4; + + re1.* = re_temp5; + im1.* = im_temp5; + + re2.* = re_temp6; + im2.* = im_temp6; + + re3.* = re_temp7; + im3.* = im_temp7; +} + +fn fft4(re: []F32x4, im: []F32x4, count: u32) void { + assert(std.math.isPowerOfTwo(count)); + assert(re.len >= count); + assert(im.len >= count); + + var index: u32 = 0; + while (index < count) : (index += 1) { + fftButterflyDit4_1(&re[index], &im[index]); + } +} +test "zmath.fft4" { + const epsilon = 0.0001; + var re = [_]F32x4{f32x4(1.0, 2.0, 3.0, 4.0)}; + var im = [_]F32x4{f32x4s(0.0)}; + fft4(re[0..], im[0..], 1); + + var re_uns: [1]F32x4 = undefined; + var im_uns: [1]F32x4 = undefined; + fftUnswizzle(re[0..], re_uns[0..]); + fftUnswizzle(im[0..], im_uns[0..]); + + try expectVecApproxEqAbs(re_uns[0], f32x4(10.0, -2.0, -2.0, -2.0), epsilon); + try expectVecApproxEqAbs(im_uns[0], f32x4(0.0, 2.0, 0.0, -2.0), epsilon); +} + +fn fft8(re: []F32x4, im: []F32x4, count: u32) void { + assert(std.math.isPowerOfTwo(count)); + assert(re.len >= 2 * count); + assert(im.len >= 2 * count); + + var index: u32 = 0; + while (index < count) : (index += 1) { + var pre = re[index * 2 ..]; + var pim = im[index * 2 ..]; + + var odds_re = @shuffle(f32, pre[0], pre[1], [4]i32{ 1, 3, ~@as(i32, 1), ~@as(i32, 3) }); + var evens_re = @shuffle(f32, pre[0], pre[1], [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) }); + var odds_im = @shuffle(f32, pim[0], pim[1], [4]i32{ 1, 3, ~@as(i32, 1), ~@as(i32, 3) }); + var evens_im = @shuffle(f32, pim[0], pim[1], [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) }); + fftButterflyDit4_1(&odds_re, &odds_im); + fftButterflyDit4_1(&evens_re, &evens_im); + + { + const re_im = cmulSoa( + odds_re, + odds_im, + f32x4(1.0, 0.70710677, 0.0, -0.70710677), + f32x4(0.0, -0.70710677, -1.0, -0.70710677), + ); + pre[0] = evens_re + re_im[0]; + pim[0] = evens_im + re_im[1]; + } + { + const re_im = cmulSoa( + odds_re, + odds_im, + f32x4(-1.0, -0.70710677, 0.0, 0.70710677), + f32x4(0.0, 0.70710677, 1.0, 0.70710677), + ); + pre[1] = evens_re + re_im[0]; + pim[1] = evens_im + re_im[1]; + } + } +} +test "zmath.fft8" { + const epsilon = 0.0001; + var re = [_]F32x4{ f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0) }; + var im = [_]F32x4{ f32x4s(0.0), f32x4s(0.0) }; + fft8(re[0..], im[0..], 1); + + var re_uns: [2]F32x4 = undefined; + var im_uns: [2]F32x4 = undefined; + fftUnswizzle(re[0..], re_uns[0..]); + fftUnswizzle(im[0..], im_uns[0..]); + + try expectVecApproxEqAbs(re_uns[0], f32x4(36.0, -4.0, -4.0, -4.0), epsilon); + try expectVecApproxEqAbs(re_uns[1], f32x4(-4.0, -4.0, -4.0, -4.0), epsilon); + try expectVecApproxEqAbs(im_uns[0], f32x4(0.0, 9.656854, 4.0, 1.656854), epsilon); + try expectVecApproxEqAbs(im_uns[1], f32x4(0.0, -1.656854, -4.0, -9.656854), epsilon); +} + +fn fft16(re: []F32x4, im: []F32x4, count: u32) void { + assert(std.math.isPowerOfTwo(count)); + assert(re.len >= 4 * count); + assert(im.len >= 4 * count); + + const static = struct { + const unity_table_re = [4]F32x4{ + f32x4(1.0, 1.0, 1.0, 1.0), + f32x4(1.0, 0.92387950, 0.70710677, 0.38268343), + f32x4(1.0, 0.70710677, -4.3711388e-008, -0.70710677), + f32x4(1.0, 0.38268343, -0.70710677, -0.92387950), + }; + const unity_table_im = [4]F32x4{ + f32x4(-0.0, -0.0, -0.0, -0.0), + f32x4(-0.0, -0.38268343, -0.70710677, -0.92387950), + f32x4(-0.0, -0.70710677, -1.0, -0.70710677), + f32x4(-0.0, -0.92387950, -0.70710677, 0.38268343), + }; + }; + + var index: u32 = 0; + while (index < count) : (index += 1) { + fftButterflyDit4_4( + &re[index * 4], + &re[index * 4 + 1], + &re[index * 4 + 2], + &re[index * 4 + 3], + &im[index * 4], + &im[index * 4 + 1], + &im[index * 4 + 2], + &im[index * 4 + 3], + static.unity_table_re[0..], + static.unity_table_im[0..], + 1, + true, + ); + } +} +test "zmath.fft16" { + const epsilon = 0.0001; + var re = [_]F32x4{ + f32x4(1.0, 2.0, 3.0, 4.0), + f32x4(5.0, 6.0, 7.0, 8.0), + f32x4(9.0, 10.0, 11.0, 12.0), + f32x4(13.0, 14.0, 15.0, 16.0), + }; + var im = [_]F32x4{ f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0) }; + fft16(re[0..], im[0..], 1); + + var re_uns: [4]F32x4 = undefined; + var im_uns: [4]F32x4 = undefined; + fftUnswizzle(re[0..], re_uns[0..]); + fftUnswizzle(im[0..], im_uns[0..]); + + try expectVecApproxEqAbs(re_uns[0], f32x4(136.0, -8.0, -8.0, -8.0), epsilon); + try expectVecApproxEqAbs(re_uns[1], f32x4(-8.0, -8.0, -8.0, -8.0), epsilon); + try expectVecApproxEqAbs(re_uns[2], f32x4(-8.0, -8.0, -8.0, -8.0), epsilon); + try expectVecApproxEqAbs(re_uns[3], f32x4(-8.0, -8.0, -8.0, -8.0), epsilon); + try expectVecApproxEqAbs(im_uns[0], f32x4(0.0, 40.218716, 19.313708, 11.972846), epsilon); + try expectVecApproxEqAbs(im_uns[1], f32x4(8.0, 5.345429, 3.313708, 1.591299), epsilon); + try expectVecApproxEqAbs(im_uns[2], f32x4(0.0, -1.591299, -3.313708, -5.345429), epsilon); + try expectVecApproxEqAbs(im_uns[3], f32x4(-8.0, -11.972846, -19.313708, -40.218716), epsilon); +} + +fn fftN(re: []F32x4, im: []F32x4, unity_table: []const F32x4, length: u32, count: u32) void { + assert(length > 16); + assert(std.math.isPowerOfTwo(length)); + assert(std.math.isPowerOfTwo(count)); + assert(re.len >= length * count / 4); + assert(re.len == im.len); + + const total = count * length; + const total_vectors = total / 4; + const stage_vectors = length / 4; + const stage_vectors_mask = stage_vectors - 1; + const stride = length / 16; + const stride_mask = stride - 1; + const stride_inv_mask = ~stride_mask; + + var unity_table_re = unity_table; + var unity_table_im = unity_table[length / 4 ..]; + + var index: u32 = 0; + while (index < total_vectors / 4) : (index += 1) { + const n = (index & stride_inv_mask) * 4 + (index & stride_mask); + fftButterflyDit4_4( + &re[n], + &re[n + stride], + &re[n + stride * 2], + &re[n + stride * 3], + &im[n], + &im[n + stride], + &im[n + stride * 2], + &im[n + stride * 3], + unity_table_re[(n & stage_vectors_mask)..], + unity_table_im[(n & stage_vectors_mask)..], + stride, + false, + ); + } + + if (length > 16 * 4) { + fftN(re, im, unity_table[(length / 2)..], length / 4, count * 4); + } else if (length == 16 * 4) { + fft16(re, im, count * 4); + } else if (length == 8 * 4) { + fft8(re, im, count * 4); + } else if (length == 4 * 4) { + fft4(re, im, count * 4); + } +} +test "zmath.fftN" { + var unity_table: [128]F32x4 = undefined; + const epsilon = 0.0001; + + // 32 samples + { + var re = [_]F32x4{ + f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), + f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), + f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), + f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), + }; + var im = [_]F32x4{ + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), + }; + + fftInitUnityTable(unity_table[0..32]); + fft(re[0..], im[0..], unity_table[0..32]); + + try expectVecApproxEqAbs(re[0], f32x4(528.0, -16.0, -16.0, -16.0), epsilon); + try expectVecApproxEqAbs(re[1], f32x4(-16.0, -16.0, -16.0, -16.0), epsilon); + try expectVecApproxEqAbs(re[2], f32x4(-16.0, -16.0, -16.0, -16.0), epsilon); + try expectVecApproxEqAbs(re[3], f32x4(-16.0, -16.0, -16.0, -16.0), epsilon); + try expectVecApproxEqAbs(re[4], f32x4(-16.0, -16.0, -16.0, -16.0), epsilon); + try expectVecApproxEqAbs(re[5], f32x4(-16.0, -16.0, -16.0, -16.0), epsilon); + try expectVecApproxEqAbs(re[6], f32x4(-16.0, -16.0, -16.0, -16.0), epsilon); + try expectVecApproxEqAbs(re[7], f32x4(-16.0, -16.0, -16.0, -16.0), epsilon); + try expectVecApproxEqAbs(im[0], f32x4(0.0, 162.450726, 80.437432, 52.744931), epsilon); + try expectVecApproxEqAbs(im[1], f32x4(38.627417, 29.933895, 23.945692, 19.496056), epsilon); + try expectVecApproxEqAbs(im[2], f32x4(16.0, 13.130861, 10.690858, 8.552178), epsilon); + try expectVecApproxEqAbs(im[3], f32x4(6.627417, 4.853547, 3.182598, 1.575862), epsilon); + try expectVecApproxEqAbs(im[4], f32x4(0.0, -1.575862, -3.182598, -4.853547), epsilon); + try expectVecApproxEqAbs(im[5], f32x4(-6.627417, -8.552178, -10.690858, -13.130861), epsilon); + try expectVecApproxEqAbs(im[6], f32x4(-16.0, -19.496056, -23.945692, -29.933895), epsilon); + try expectVecApproxEqAbs(im[7], f32x4(-38.627417, -52.744931, -80.437432, -162.450726), epsilon); + } + + // 64 samples + { + var re = [_]F32x4{ + f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), + f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), + f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), + f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), + f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), + f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), + f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), + f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), + }; + var im = [_]F32x4{ + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), + }; + + fftInitUnityTable(unity_table[0..64]); + fft(re[0..], im[0..], unity_table[0..64]); + + try expectVecApproxEqAbs(re[0], f32x4(1056.0, 0.0, -32.0, 0.0), epsilon); + var i: u32 = 1; + while (i < 16) : (i += 1) { + try expectVecApproxEqAbs(re[i], f32x4(-32.0, 0.0, -32.0, 0.0), epsilon); + } + + const expected = [_]f32{ + 0.0, 0.0, 324.901452, 0.000000, 160.874864, 0.0, 105.489863, 0.000000, + 77.254834, 0.0, 59.867789, 0.0, 47.891384, 0.0, 38.992113, 0.0, + 32.000000, 0.000000, 26.261721, 0.000000, 21.381716, 0.000000, 17.104356, 0.000000, + 13.254834, 0.000000, 9.707094, 0.000000, 6.365196, 0.000000, 3.151725, 0.000000, + 0.000000, 0.000000, -3.151725, 0.000000, -6.365196, 0.000000, -9.707094, 0.000000, + -13.254834, 0.000000, -17.104356, 0.000000, -21.381716, 0.000000, -26.261721, 0.000000, + -32.000000, 0.000000, -38.992113, 0.000000, -47.891384, 0.000000, -59.867789, 0.000000, + -77.254834, 0.000000, -105.489863, 0.000000, -160.874864, 0.000000, -324.901452, 0.000000, + }; + for (expected, 0..) |e, ie| { + try expect(std.math.approxEqAbs(f32, e, im[(ie / 4)][ie % 4], epsilon)); + } + } + + // 128 samples + { + var re = [_]F32x4{ + f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), + f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), + f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), + f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), + f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), + f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), + f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), + f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), + f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), + f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), + f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), + f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), + f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), + f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), + f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), + f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), + }; + var im = [_]F32x4{ + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), + }; + + fftInitUnityTable(unity_table[0..128]); + fft(re[0..], im[0..], unity_table[0..128]); + + try expectVecApproxEqAbs(re[0], f32x4(2112.0, 0.0, 0.0, 0.0), epsilon); + var i: u32 = 1; + while (i < 32) : (i += 1) { + try expectVecApproxEqAbs(re[i], f32x4(-64.0, 0.0, 0.0, 0.0), epsilon); + } + + const expected = [_]f32{ + 0.000000, 0.000000, 0.000000, 0.000000, 649.802905, 0.000000, 0.000000, 0.000000, + 321.749727, 0.000000, 0.000000, 0.000000, 210.979725, 0.000000, 0.000000, 0.000000, + 154.509668, 0.000000, 0.000000, 0.000000, 119.735578, 0.000000, 0.000000, 0.000000, + 95.782769, 0.000000, 0.000000, 0.000000, 77.984226, 0.000000, 0.000000, 0.000000, + 64.000000, 0.000000, 0.000000, 0.000000, 52.523443, 0.000000, 0.000000, 0.000000, + 42.763433, 0.000000, 0.000000, 0.000000, 34.208713, 0.000000, 0.000000, 0.000000, + 26.509668, 0.000000, 0.000000, 0.000000, 19.414188, 0.000000, 0.000000, 0.000000, + 12.730392, 0.000000, 0.000000, 0.000000, 6.303450, 0.000000, 0.000000, 0.000000, + 0.000000, 0.000000, 0.000000, 0.000000, -6.303450, 0.000000, 0.000000, 0.000000, + -12.730392, 0.000000, 0.000000, 0.000000, -19.414188, 0.000000, 0.000000, 0.000000, + -26.509668, 0.000000, 0.000000, 0.000000, -34.208713, 0.000000, 0.000000, 0.000000, + -42.763433, 0.000000, 0.000000, 0.000000, -52.523443, 0.000000, 0.000000, 0.000000, + -64.000000, 0.000000, 0.000000, 0.000000, -77.984226, 0.000000, 0.000000, 0.000000, + -95.782769, 0.000000, 0.000000, 0.000000, -119.735578, 0.000000, 0.000000, 0.000000, + -154.509668, 0.000000, 0.000000, 0.000000, -210.979725, 0.000000, 0.000000, 0.000000, + -321.749727, 0.000000, 0.000000, 0.000000, -649.802905, 0.000000, 0.000000, 0.000000, + }; + for (expected, 0..) |e, ie| { + try expect(std.math.approxEqAbs(f32, e, im[(ie / 4)][ie % 4], epsilon)); + } + } +} + +fn fftUnswizzle(input: []const F32x4, output: []F32x4) void { + assert(std.math.isPowerOfTwo(input.len)); + assert(input.len == output.len); + assert(input.ptr != output.ptr); + + const log2_length = std.math.log2_int(usize, input.len * 4); + assert(log2_length >= 2); + + const length = input.len; + + const f32_output = @as([*]f32, @ptrCast(output.ptr))[0 .. output.len * 4]; + + const static = struct { + const swizzle_table = [256]u8{ + 0x00, 0x40, 0x80, 0xC0, 0x10, 0x50, 0x90, 0xD0, 0x20, 0x60, 0xA0, 0xE0, 0x30, 0x70, 0xB0, 0xF0, + 0x04, 0x44, 0x84, 0xC4, 0x14, 0x54, 0x94, 0xD4, 0x24, 0x64, 0xA4, 0xE4, 0x34, 0x74, 0xB4, 0xF4, + 0x08, 0x48, 0x88, 0xC8, 0x18, 0x58, 0x98, 0xD8, 0x28, 0x68, 0xA8, 0xE8, 0x38, 0x78, 0xB8, 0xF8, + 0x0C, 0x4C, 0x8C, 0xCC, 0x1C, 0x5C, 0x9C, 0xDC, 0x2C, 0x6C, 0xAC, 0xEC, 0x3C, 0x7C, 0xBC, 0xFC, + 0x01, 0x41, 0x81, 0xC1, 0x11, 0x51, 0x91, 0xD1, 0x21, 0x61, 0xA1, 0xE1, 0x31, 0x71, 0xB1, 0xF1, + 0x05, 0x45, 0x85, 0xC5, 0x15, 0x55, 0x95, 0xD5, 0x25, 0x65, 0xA5, 0xE5, 0x35, 0x75, 0xB5, 0xF5, + 0x09, 0x49, 0x89, 0xC9, 0x19, 0x59, 0x99, 0xD9, 0x29, 0x69, 0xA9, 0xE9, 0x39, 0x79, 0xB9, 0xF9, + 0x0D, 0x4D, 0x8D, 0xCD, 0x1D, 0x5D, 0x9D, 0xDD, 0x2D, 0x6D, 0xAD, 0xED, 0x3D, 0x7D, 0xBD, 0xFD, + 0x02, 0x42, 0x82, 0xC2, 0x12, 0x52, 0x92, 0xD2, 0x22, 0x62, 0xA2, 0xE2, 0x32, 0x72, 0xB2, 0xF2, + 0x06, 0x46, 0x86, 0xC6, 0x16, 0x56, 0x96, 0xD6, 0x26, 0x66, 0xA6, 0xE6, 0x36, 0x76, 0xB6, 0xF6, + 0x0A, 0x4A, 0x8A, 0xCA, 0x1A, 0x5A, 0x9A, 0xDA, 0x2A, 0x6A, 0xAA, 0xEA, 0x3A, 0x7A, 0xBA, 0xFA, + 0x0E, 0x4E, 0x8E, 0xCE, 0x1E, 0x5E, 0x9E, 0xDE, 0x2E, 0x6E, 0xAE, 0xEE, 0x3E, 0x7E, 0xBE, 0xFE, + 0x03, 0x43, 0x83, 0xC3, 0x13, 0x53, 0x93, 0xD3, 0x23, 0x63, 0xA3, 0xE3, 0x33, 0x73, 0xB3, 0xF3, + 0x07, 0x47, 0x87, 0xC7, 0x17, 0x57, 0x97, 0xD7, 0x27, 0x67, 0xA7, 0xE7, 0x37, 0x77, 0xB7, 0xF7, + 0x0B, 0x4B, 0x8B, 0xCB, 0x1B, 0x5B, 0x9B, 0xDB, 0x2B, 0x6B, 0xAB, 0xEB, 0x3B, 0x7B, 0xBB, 0xFB, + 0x0F, 0x4F, 0x8F, 0xCF, 0x1F, 0x5F, 0x9F, 0xDF, 0x2F, 0x6F, 0xAF, 0xEF, 0x3F, 0x7F, 0xBF, 0xFF, + }; + }; + + if ((log2_length & 1) == 0) { + const rev32 = @as(u6, @intCast(32 - log2_length)); + var index: usize = 0; + while (index < length) : (index += 1) { + const n = index * 4; + const addr = + (@as(usize, @intCast(static.swizzle_table[n & 0xff])) << 24) | + (@as(usize, @intCast(static.swizzle_table[(n >> 8) & 0xff])) << 16) | + (@as(usize, @intCast(static.swizzle_table[(n >> 16) & 0xff])) << 8) | + @as(usize, @intCast(static.swizzle_table[(n >> 24) & 0xff])); + f32_output[addr >> rev32] = input[index][0]; + f32_output[(0x40000000 | addr) >> rev32] = input[index][1]; + f32_output[(0x80000000 | addr) >> rev32] = input[index][2]; + f32_output[(0xC0000000 | addr) >> rev32] = input[index][3]; + } + } else { + const rev7 = @as(usize, 1) << @as(u6, @intCast(log2_length - 3)); + const rev32 = @as(u6, @intCast(32 - (log2_length - 3))); + var index: usize = 0; + while (index < length) : (index += 1) { + const n = index / 2; + var addr = + (((@as(usize, @intCast(static.swizzle_table[n & 0xff])) << 24) | + (@as(usize, @intCast(static.swizzle_table[(n >> 8) & 0xff])) << 16) | + (@as(usize, @intCast(static.swizzle_table[(n >> 16) & 0xff])) << 8) | + (@as(usize, @intCast(static.swizzle_table[(n >> 24) & 0xff])))) >> rev32) | + ((index & 1) * rev7 * 4); + f32_output[addr] = input[index][0]; + addr += rev7; + f32_output[addr] = input[index][1]; + addr += rev7; + f32_output[addr] = input[index][2]; + addr += rev7; + f32_output[addr] = input[index][3]; + } + } +} + +pub fn fftInitUnityTable(out_unity_table: []F32x4) void { + assert(std.math.isPowerOfTwo(out_unity_table.len)); + assert(out_unity_table.len >= 32 and out_unity_table.len <= 512); + + var unity_table = out_unity_table; + + const v0123 = f32x4(0.0, 1.0, 2.0, 3.0); + var length = out_unity_table.len / 4; + var vlstep = f32x4s(0.5 * math.pi / @as(f32, @floatFromInt(length))); + + while (true) { + length /= 4; + var vjp = v0123; + + var j: u32 = 0; + while (j < length) : (j += 1) { + unity_table[j] = f32x4s(1.0); + unity_table[j + length * 4] = f32x4s(0.0); + + var vls = vjp * vlstep; + var sin_cos = sincos(vls); + unity_table[j + length] = sin_cos[1]; + unity_table[j + length * 5] = sin_cos[0] * f32x4s(-1.0); + + var vijp = vjp + vjp; + vls = vijp * vlstep; + sin_cos = sincos(vls); + unity_table[j + length * 2] = sin_cos[1]; + unity_table[j + length * 6] = sin_cos[0] * f32x4s(-1.0); + + vijp = vijp + vjp; + vls = vijp * vlstep; + sin_cos = sincos(vls); + unity_table[j + length * 3] = sin_cos[1]; + unity_table[j + length * 7] = sin_cos[0] * f32x4s(-1.0); + + vjp += f32x4s(4.0); + } + vlstep *= f32x4s(4.0); + unity_table = unity_table[8 * length ..]; + + if (length <= 4) + break; + } +} + +pub fn fft(re: []F32x4, im: []F32x4, unity_table: []const F32x4) void { + const length = @as(u32, @intCast(re.len * 4)); + assert(std.math.isPowerOfTwo(length)); + assert(length >= 4 and length <= 512); + assert(re.len == im.len); + + var re_temp_storage: [128]F32x4 = undefined; + var im_temp_storage: [128]F32x4 = undefined; + const re_temp = re_temp_storage[0..re.len]; + const im_temp = im_temp_storage[0..im.len]; + + @memcpy(re_temp, re); + @memcpy(im_temp, im); + + if (length > 16) { + assert(unity_table.len == length); + fftN(re_temp, im_temp, unity_table, length, 1); + } else if (length == 16) { + fft16(re_temp, im_temp, 1); + } else if (length == 8) { + fft8(re_temp, im_temp, 1); + } else if (length == 4) { + fft4(re_temp, im_temp, 1); + } + + fftUnswizzle(re_temp, re); + fftUnswizzle(im_temp, im); +} + +pub fn ifft(re: []F32x4, im: []const F32x4, unity_table: []const F32x4) void { + const length = @as(u32, @intCast(re.len * 4)); + assert(std.math.isPowerOfTwo(length)); + assert(length >= 4 and length <= 512); + assert(re.len == im.len); + + var re_temp_storage: [128]F32x4 = undefined; + var im_temp_storage: [128]F32x4 = undefined; + var re_temp = re_temp_storage[0..re.len]; + var im_temp = im_temp_storage[0..im.len]; + + const rnp = f32x4s(1.0 / @as(f32, @floatFromInt(length))); + const rnm = f32x4s(-1.0 / @as(f32, @floatFromInt(length))); + + for (re, 0..) |_, i| { + re_temp[i] = re[i] * rnp; + im_temp[i] = im[i] * rnm; + } + + if (length > 16) { + assert(unity_table.len == length); + fftN(re_temp, im_temp, unity_table, length, 1); + } else if (length == 16) { + fft16(re_temp, im_temp, 1); + } else if (length == 8) { + fft8(re_temp, im_temp, 1); + } else if (length == 4) { + fft4(re_temp, im_temp, 1); + } + + fftUnswizzle(re_temp, re); +} +test "zmath.ifft" { + var unity_table: [512]F32x4 = undefined; + const epsilon = 0.0001; + + // 64 samples + { + var re = [_]F32x4{ + f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), + f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), + f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), + f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), + f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), + f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), + f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), + f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), + }; + var im = [_]F32x4{ + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), + }; + + fftInitUnityTable(unity_table[0..64]); + fft(re[0..], im[0..], unity_table[0..64]); + + try expectVecApproxEqAbs(re[0], f32x4(1056.0, 0.0, -32.0, 0.0), epsilon); + var i: u32 = 1; + while (i < 16) : (i += 1) { + try expectVecApproxEqAbs(re[i], f32x4(-32.0, 0.0, -32.0, 0.0), epsilon); + } + + ifft(re[0..], im[0..], unity_table[0..64]); + + try expectVecApproxEqAbs(re[0], f32x4(1.0, 2.0, 3.0, 4.0), epsilon); + try expectVecApproxEqAbs(re[1], f32x4(5.0, 6.0, 7.0, 8.0), epsilon); + try expectVecApproxEqAbs(re[2], f32x4(9.0, 10.0, 11.0, 12.0), epsilon); + try expectVecApproxEqAbs(re[3], f32x4(13.0, 14.0, 15.0, 16.0), epsilon); + try expectVecApproxEqAbs(re[4], f32x4(17.0, 18.0, 19.0, 20.0), epsilon); + try expectVecApproxEqAbs(re[5], f32x4(21.0, 22.0, 23.0, 24.0), epsilon); + try expectVecApproxEqAbs(re[6], f32x4(25.0, 26.0, 27.0, 28.0), epsilon); + try expectVecApproxEqAbs(re[7], f32x4(29.0, 30.0, 31.0, 32.0), epsilon); + } + + // 512 samples + { + var re: [128]F32x4 = undefined; + var im = [_]F32x4{f32x4s(0.0)} ** 128; + + for (&re, 0..) |*v, i| { + const f = @as(f32, @floatFromInt(i * 4)); + v.* = f32x4(f + 1.0, f + 2.0, f + 3.0, f + 4.0); + } + + fftInitUnityTable(unity_table[0..512]); + fft(re[0..], im[0..], unity_table[0..512]); + + for (re, 0..) |v, i| { + const f = @as(f32, @floatFromInt(i * 4)); + try expect(!approxEqAbs(v, f32x4(f + 1.0, f + 2.0, f + 3.0, f + 4.0), epsilon)); + } + + ifft(re[0..], im[0..], unity_table[0..512]); + + for (re, 0..) |v, i| { + const f = @as(f32, @floatFromInt(i * 4)); + try expectVecApproxEqAbs(v, f32x4(f + 1.0, f + 2.0, f + 3.0, f + 4.0), epsilon); + } + } +} +// ------------------------------------------------------------------------------ +// +// Private functions and constants +// +// ------------------------------------------------------------------------------ +const f32x4_sign_mask1: F32x4 = F32x4{ @as(f32, @bitCast(@as(u32, 0x8000_0000))), 0, 0, 0 }; +const f32x4_mask2: F32x4 = F32x4{ + @as(f32, @bitCast(@as(u32, 0xffff_ffff))), + @as(f32, @bitCast(@as(u32, 0xffff_ffff))), + 0, + 0, +}; +const f32x4_mask3: F32x4 = F32x4{ + @as(f32, @bitCast(@as(u32, 0xffff_ffff))), + @as(f32, @bitCast(@as(u32, 0xffff_ffff))), + @as(f32, @bitCast(@as(u32, 0xffff_ffff))), + 0, +}; + +inline fn splatNegativeZero(comptime T: type) T { + return @splat(@as(f32, @bitCast(@as(u32, 0x8000_0000)))); +} +inline fn splatNoFraction(comptime T: type) T { + return @splat(@as(f32, 8_388_608.0)); +} +inline fn splatAbsMask(comptime T: type) T { + return @splat(@as(f32, @bitCast(@as(u32, 0x7fff_ffff)))); +} + +fn floatToIntAndBack(v: anytype) @TypeOf(v) { + // This routine won't handle nan, inf and numbers greater than 8_388_608.0 (will generate undefined values). + @setRuntimeSafety(false); + + const T = @TypeOf(v); + const len = veclen(T); + + var vi32: [len]i32 = undefined; + comptime var i: u32 = 0; + // vcvttps2dq + inline while (i < len) : (i += 1) { + vi32[i] = @as(i32, @intFromFloat(v[i])); + } + + var vf32: [len]f32 = undefined; + i = 0; + // vcvtdq2ps + inline while (i < len) : (i += 1) { + vf32[i] = @as(f32, @floatFromInt(vi32[i])); + } + + return vf32; +} +test "zmath.floatToIntAndBack" { + { + const v = floatToIntAndBack(f32x4(1.1, 2.9, 3.0, -4.5)); + try expectVecEqual(v, f32x4(1.0, 2.0, 3.0, -4.0)); + } + { + const v = floatToIntAndBack(f32x8(1.1, 2.9, 3.0, -4.5, 2.5, -2.5, 1.1, -100.2)); + try expectVecEqual(v, f32x8(1.0, 2.0, 3.0, -4.0, 2.0, -2.0, 1.0, -100.0)); + } + { + const v = floatToIntAndBack(f32x4(math.inf(f32), 2.9, math.nan(f32), math.snan(f32))); + try expect(v[1] == 2.0); + } +} + +pub fn expectVecEqual(expected: anytype, actual: anytype) !void { + const T = @TypeOf(expected, actual); + inline for (0..veclen(T)) |i| { + try std.testing.expectEqual(expected[i], actual[i]); + } +} + +pub fn expectVecApproxEqAbs(expected: anytype, actual: anytype, eps: f32) !void { + const T = @TypeOf(expected, actual); + inline for (0..veclen(T)) |i| { + try std.testing.expectApproxEqAbs(expected[i], actual[i], eps); + } +} + +pub fn approxEqAbs(v0: anytype, v1: anytype, eps: f32) bool { + const T = @TypeOf(v0, v1); + comptime var i: comptime_int = 0; + inline while (i < veclen(T)) : (i += 1) { + if (!math.approxEqAbs(f32, v0[i], v1[i], eps)) { + return false; + } + } + return true; +} + +// ------------------------------------------------------------------------------ +// This software is available under 2 licenses -- choose whichever you prefer. +// ------------------------------------------------------------------------------ +// ALTERNATIVE A - MIT License +// Copyright (c) 2022 Michal Ziulek and Contributors +// Permission is hereby granted, free of charge, to any person obtaining a copy of +// this software and associated documentation files (the "Software"), to deal in +// the Software without restriction, including without limitation the rights to +// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +// of the Software, and to permit persons to whom the Software is furnished to do +// so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +// ------------------------------------------------------------------------------ +// ALTERNATIVE B - Public Domain (www.unlicense.org) +// This is free and unencumbered software released into the public domain. +// Anyone is free to copy, modify, publish, use, compile, sell, or distribute this +// software, either in source code form or as a compiled binary, for any purpose, +// commercial or non-commercial, and by any means. +// In jurisdictions that recognize copyright laws, the author or authors of this +// software dedicate any and all copyright interest in the software to the public +// domain. We make this dedication for the benefit of the public at large and to +// the detriment of our heirs and successors. We intend this dedication to be an +// overt act of relinquishment in perpetuity of all present and future rights to +// this software under copyright law. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// ------------------------------------------------------------------------------ diff --git a/src/util.zig b/src/util.zig index 3a626ef..e5afb27 100644 --- a/src/util.zig +++ b/src/util.zig @@ -19,7 +19,7 @@ // // ============================================================================== -const zm = @import("zmath.zig"); +const zm = @import("main.zig"); const std = @import("std"); const math = std.math; const expect = std.testing.expect; diff --git a/src/zmath.zig b/src/zmath.zig deleted file mode 100644 index 9ff0cb5..0000000 --- a/src/zmath.zig +++ /dev/null @@ -1,4571 +0,0 @@ -// ============================================================================== -// -// SIMD math library for game developers -// https://github.com/michal-z/zig-gamedev/tree/main/libs/zmath -// -// Should work on all OSes supported by Zig. Works on x86_64 and ARM. -// Provides ~140 optimized routines and ~70 extensive tests. -// Can be used with any graphics API. -// -// zmath uses row-major matrices, row vectors (each row vector is stored in a SIMD register). -// Handedness is determined by which function version is used (Rh vs. Lh), -// otherwise the function works with either left-handed or right-handed view coordinates. -// -// const va = f32x4(1.0, 2.0, 3.0, 1.0); -// const vb = f32x4(-1.0, 1.0, -1.0, 1.0); -// const v0 = va + vb - f32x4(0.0, 1.0, 0.0, 1.0) * f32x4s(3.0); -// const v1 = cross3(va, vb) + f32x4(1.0, 1.0, 1.0, 1.0); -// const v2 = va + dot3(va, vb) / v1; // dotN() returns scalar replicated on all vector components -// -// const m = rotationX(math.pi * 0.25); -// const v = f32x4(...); -// const v0 = mul(v, m); // 'v' treated as a row vector -// const v1 = mul(m, v); // 'v' treated as a column vector -// const f = m[row][column]; -// -// const b = va < vb; -// if (all(b, 0)) { ... } // '0' means check all vector components; if all are 'true' -// if (all(b, 3)) { ... } // '3' means check first three vector components; if all first three are 'true' -// if (any(b, 0)) { ... } // '0' means check all vector components; if any is 'true' -// if (any(b, 3)) { ... } // '3' means check first three vector components; if any from first three is 'true' -// -// var v4 = load(mem[0..], F32x4, 0); -// var v8 = load(mem[100..], F32x8, 0); -// var v16 = load(mem[200..], F32x16, 0); -// -// var camera_position = [3]f32{ 1.0, 2.0, 3.0 }; -// var cam_pos = loadArr3(camera_position); -// ... -// storeArr3(&camera_position, cam_pos); -// -// v4 = sin(v4); // SIMDx4 -// v8 = cos(v8); // .x86_64 -> 2 x SIMDx4, .x86_64+avx+fma -> SIMDx8 -// v16 = atan(v16); // .x86_64 -> 4 x SIMDx4, .x86_64+avx+fma -> 2 x SIMDx8, .x86_64+avx512f -> SIMDx16 -// -// store(mem[0..], v4, 0); -// store(mem[100..], v8, 0); -// store(mem[200..], v16, 0); -// -// ------------------------------------------------------------------------------ -// 1. Initialization functions -// ------------------------------------------------------------------------------ -// -// f32x4(e0: f32, e1: f32, e2: f32, e3: f32) F32x4 -// f32x8(e0: f32, e1: f32, e2: f32, e3: f32, e4: f32, e5: f32, e6: f32, e7: f32) F32x8 -// f32x16(e0: f32, e1: f32, e2: f32, e3: f32, e4: f32, e5: f32, e6: f32, e7: f32, -// e8: f32, e9: f32, ea: f32, eb: f32, ec: f32, ed: f32, ee: f32, ef: f32) F32x16 -// -// f32x4s(e0: f32) F32x4 -// f32x8s(e0: f32) F32x8 -// f32x16s(e0: f32) F32x16 -// -// boolx4(e0: bool, e1: bool, e2: bool, e3: bool) Boolx4 -// boolx8(e0: bool, e1: bool, e2: bool, e3: bool, e4: bool, e5: bool, e6: bool, e7: bool) Boolx8 -// boolx16(e0: bool, e1: bool, e2: bool, e3: bool, e4: bool, e5: bool, e6: bool, e7: bool, -// e8: bool, e9: bool, ea: bool, eb: bool, ec: bool, ed: bool, ee: bool, ef: bool) Boolx16 -// -// load(mem: []const f32, comptime T: type, comptime len: u32) T -// store(mem: []f32, v: anytype, comptime len: u32) void -// -// loadArr2(arr: [2]f32) F32x4 -// loadArr2zw(arr: [2]f32, z: f32, w: f32) F32x4 -// loadArr3(arr: [3]f32) F32x4 -// loadArr3w(arr: [3]f32, w: f32) F32x4 -// loadArr4(arr: [4]f32) F32x4 -// -// storeArr2(arr: *[2]f32, v: F32x4) void -// storeArr3(arr: *[3]f32, v: F32x4) void -// storeArr4(arr: *[4]f32, v: F32x4) void -// -// arr3Ptr(ptr: anytype) *const [3]f32 -// arrNPtr(ptr: anytype) [*]const f32 -// -// splat(comptime T: type, value: f32) T -// splatInt(comptime T: type, value: u32) T -// -// ------------------------------------------------------------------------------ -// 2. Functions that work on all vector components (F32xN = F32x4 or F32x8 or F32x16) -// ------------------------------------------------------------------------------ -// -// all(vb: anytype, comptime len: u32) bool -// any(vb: anytype, comptime len: u32) bool -// -// isNearEqual(v0: F32xN, v1: F32xN, epsilon: F32xN) BoolxN -// isNan(v: F32xN) BoolxN -// isInf(v: F32xN) BoolxN -// isInBounds(v: F32xN, bounds: F32xN) BoolxN -// -// andInt(v0: F32xN, v1: F32xN) F32xN -// andNotInt(v0: F32xN, v1: F32xN) F32xN -// orInt(v0: F32xN, v1: F32xN) F32xN -// norInt(v0: F32xN, v1: F32xN) F32xN -// xorInt(v0: F32xN, v1: F32xN) F32xN -// -// minFast(v0: F32xN, v1: F32xN) F32xN -// maxFast(v0: F32xN, v1: F32xN) F32xN -// min(v0: F32xN, v1: F32xN) F32xN -// max(v0: F32xN, v1: F32xN) F32xN -// round(v: F32xN) F32xN -// floor(v: F32xN) F32xN -// trunc(v: F32xN) F32xN -// ceil(v: F32xN) F32xN -// clamp(v0: F32xN, v1: F32xN) F32xN -// clampFast(v0: F32xN, v1: F32xN) F32xN -// saturate(v: F32xN) F32xN -// saturateFast(v: F32xN) F32xN -// lerp(v0: F32xN, v1: F32xN, t: f32) F32xN -// lerpV(v0: F32xN, v1: F32xN, t: F32xN) F32xN -// lerpInverse(v0: F32xN, v1: F32xN, t: f32) F32xN -// lerpInverseV(v0: F32xN, v1: F32xN, t: F32xN) F32xN -// mapLinear(v: F32xN, min1: f32, max1: f32, min2: f32, max2: f32) F32xN -// mapLinearV(v: F32xN, min1: F32xN, max1: F32xN, min2: F32xN, max2: F32xN) F32xN -// sqrt(v: F32xN) F32xN -// abs(v: F32xN) F32xN -// mod(v0: F32xN, v1: F32xN) F32xN -// modAngle(v: F32xN) F32xN -// mulAdd(v0: F32xN, v1: F32xN, v2: F32xN) F32xN -// select(mask: BoolxN, v0: F32xN, v1: F32xN) -// sin(v: F32xN) F32xN -// cos(v: F32xN) F32xN -// sincos(v: F32xN) [2]F32xN -// asin(v: F32xN) F32xN -// acos(v: F32xN) F32xN -// atan(v: F32xN) F32xN -// atan2(vy: F32xN, vx: F32xN) F32xN -// cmulSoa(re0: F32xN, im0: F32xN, re1: F32xN, im1: F32xN) [2]F32xN -// -// ------------------------------------------------------------------------------ -// 3. 2D, 3D, 4D vector functions -// ------------------------------------------------------------------------------ -// -// swizzle(v: Vec, c, c, c, c) Vec (comptime c = .x | .y | .z | .w) -// dot2(v0: Vec, v1: Vec) F32x4 -// dot3(v0: Vec, v1: Vec) F32x4 -// dot4(v0: Vec, v1: Vec) F32x4 -// cross3(v0: Vec, v1: Vec) Vec -// lengthSq2(v: Vec) F32x4 -// lengthSq3(v: Vec) F32x4 -// lengthSq4(v: Vec) F32x4 -// length2(v: Vec) F32x4 -// length3(v: Vec) F32x4 -// length4(v: Vec) F32x4 -// normalize2(v: Vec) Vec -// normalize3(v: Vec) Vec -// normalize4(v: Vec) Vec -// -// vecToArr2(v: Vec) [2]f32 -// vecToArr3(v: Vec) [3]f32 -// vecToArr4(v: Vec) [4]f32 -// -// ------------------------------------------------------------------------------ -// 4. Matrix functions -// ------------------------------------------------------------------------------ -// -// identity() Mat -// mul(m0: Mat, m1: Mat) Mat -// mul(s: f32, m: Mat) Mat -// mul(m: Mat, s: f32) Mat -// mul(v: Vec, m: Mat) Vec -// mul(m: Mat, v: Vec) Vec -// transpose(m: Mat) Mat -// rotationX(angle: f32) Mat -// rotationY(angle: f32) Mat -// rotationZ(angle: f32) Mat -// translation(x: f32, y: f32, z: f32) Mat -// translationV(v: Vec) Mat -// scaling(x: f32, y: f32, z: f32) Mat -// scalingV(v: Vec) Mat -// lookToLh(eyepos: Vec, eyedir: Vec, updir: Vec) Mat -// lookAtLh(eyepos: Vec, focuspos: Vec, updir: Vec) Mat -// lookToRh(eyepos: Vec, eyedir: Vec, updir: Vec) Mat -// lookAtRh(eyepos: Vec, focuspos: Vec, updir: Vec) Mat -// perspectiveFovLh(fovy: f32, aspect: f32, near: f32, far: f32) Mat -// perspectiveFovRh(fovy: f32, aspect: f32, near: f32, far: f32) Mat -// perspectiveFovLhGl(fovy: f32, aspect: f32, near: f32, far: f32) Mat -// perspectiveFovRhGl(fovy: f32, aspect: f32, near: f32, far: f32) Mat -// orthographicLh(w: f32, h: f32, near: f32, far: f32) Mat -// orthographicRh(w: f32, h: f32, near: f32, far: f32) Mat -// orthographicLhGl(w: f32, h: f32, near: f32, far: f32) Mat -// orthographicRhGl(w: f32, h: f32, near: f32, far: f32) Mat -// orthographicOffCenterLh(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat -// orthographicOffCenterRh(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat -// orthographicOffCenterLhGl(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat -// orthographicOffCenterRhGl(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat -// determinant(m: Mat) F32x4 -// inverse(m: Mat) Mat -// inverseDet(m: Mat, det: ?*F32x4) Mat -// matToQuat(m: Mat) Quat -// matFromAxisAngle(axis: Vec, angle: f32) Mat -// matFromNormAxisAngle(axis: Vec, angle: f32) Mat -// matFromQuat(quat: Quat) Mat -// matFromRollPitchYaw(pitch: f32, yaw: f32, roll: f32) Mat -// matFromRollPitchYawV(angles: Vec) Mat -// matFromArr(arr: [16]f32) Mat -// -// loadMat(mem: []const f32) Mat -// loadMat43(mem: []const f32) Mat -// loadMat34(mem: []const f32) Mat -// storeMat(mem: []f32, m: Mat) void -// storeMat43(mem: []f32, m: Mat) void -// storeMat34(mem: []f32, m: Mat) void -// -// matToArr(m: Mat) [16]f32 -// matToArr43(m: Mat) [12]f32 -// matToArr34(m: Mat) [12]f32 -// -// ------------------------------------------------------------------------------ -// 5. Quaternion functions -// ------------------------------------------------------------------------------ -// -// qmul(q0: Quat, q1: Quat) Quat -// qidentity() Quat -// conjugate(quat: Quat) Quat -// inverse(q: Quat) Quat -// rotate(q: Quat, v: Vec) Vec -// slerp(q0: Quat, q1: Quat, t: f32) Quat -// slerpV(q0: Quat, q1: Quat, t: F32x4) Quat -// quatToMat(quat: Quat) Mat -// quatToAxisAngle(quat: Quat, axis: *Vec, angle: *f32) void -// quatFromMat(m: Mat) Quat -// quatFromAxisAngle(axis: Vec, angle: f32) Quat -// quatFromNormAxisAngle(axis: Vec, angle: f32) Quat -// quatFromRollPitchYaw(pitch: f32, yaw: f32, roll: f32) Quat -// quatFromRollPitchYawV(angles: Vec) Quat -// -// ------------------------------------------------------------------------------ -// 6. Color functions -// ------------------------------------------------------------------------------ -// -// adjustSaturation(color: F32x4, saturation: f32) F32x4 -// adjustContrast(color: F32x4, contrast: f32) F32x4 -// rgbToHsl(rgb: F32x4) F32x4 -// hslToRgb(hsl: F32x4) F32x4 -// rgbToHsv(rgb: F32x4) F32x4 -// hsvToRgb(hsv: F32x4) F32x4 -// rgbToSrgb(rgb: F32x4) F32x4 -// srgbToRgb(srgb: F32x4) F32x4 -// -// ------------------------------------------------------------------------------ -// X. Misc functions -// ------------------------------------------------------------------------------ -// -// linePointDistance(linept0: Vec, linept1: Vec, pt: Vec) F32x4 -// sin(v: f32) f32 -// cos(v: f32) f32 -// sincos(v: f32) [2]f32 -// asin(v: f32) f32 -// acos(v: f32) f32 -// -// fftInitUnityTable(unitytable: []F32x4) void -// fft(re: []F32x4, im: []F32x4, unitytable: []const F32x4) void -// ifft(re: []F32x4, im: []const F32x4, unitytable: []const F32x4) void -// -// ============================================================================== - -// Fundamental types -pub const F32x4 = @Vector(4, f32); -pub const F32x8 = @Vector(8, f32); -pub const F32x16 = @Vector(16, f32); -pub const Boolx4 = @Vector(4, bool); -pub const Boolx8 = @Vector(8, bool); -pub const Boolx16 = @Vector(16, bool); - -// "Higher-level" aliases -pub const Vec = F32x4; -pub const Mat = [4]F32x4; -pub const Quat = F32x4; - -const builtin = @import("builtin"); -const std = @import("std"); -const math = std.math; -const assert = std.debug.assert; -const expect = std.testing.expect; - -const cpu_arch = builtin.cpu.arch; -const has_avx = if (cpu_arch == .x86_64) std.Target.x86.featureSetHas(builtin.cpu.features, .avx) else false; -const has_avx512f = if (cpu_arch == .x86_64) std.Target.x86.featureSetHas(builtin.cpu.features, .avx512f) else false; -const has_fma = if (cpu_arch == .x86_64) std.Target.x86.featureSetHas(builtin.cpu.features, .fma) else false; -// ------------------------------------------------------------------------------ -// -// 1. Initialization functions -// -// ------------------------------------------------------------------------------ -pub inline fn f32x4(e0: f32, e1: f32, e2: f32, e3: f32) F32x4 { - return .{ e0, e1, e2, e3 }; -} -pub inline fn f32x8(e0: f32, e1: f32, e2: f32, e3: f32, e4: f32, e5: f32, e6: f32, e7: f32) F32x8 { - return .{ e0, e1, e2, e3, e4, e5, e6, e7 }; -} -// zig fmt: off -pub inline fn f32x16( - e0: f32, e1: f32, e2: f32, e3: f32, e4: f32, e5: f32, e6: f32, e7: f32, - e8: f32, e9: f32, ea: f32, eb: f32, ec: f32, ed: f32, ee: f32, ef: f32) F32x16 { - return .{ e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, ea, eb, ec, ed, ee, ef }; -} -// zig fmt: on - -pub inline fn f32x4s(e0: f32) F32x4 { - return splat(F32x4, e0); -} -pub inline fn f32x8s(e0: f32) F32x8 { - return splat(F32x8, e0); -} -pub inline fn f32x16s(e0: f32) F32x16 { - return splat(F32x16, e0); -} - -pub inline fn boolx4(e0: bool, e1: bool, e2: bool, e3: bool) Boolx4 { - return .{ e0, e1, e2, e3 }; -} -pub inline fn boolx8(e0: bool, e1: bool, e2: bool, e3: bool, e4: bool, e5: bool, e6: bool, e7: bool) Boolx8 { - return .{ e0, e1, e2, e3, e4, e5, e6, e7 }; -} -// zig fmt: off -pub inline fn boolx16( - e0: bool, e1: bool, e2: bool, e3: bool, e4: bool, e5: bool, e6: bool, e7: bool, - e8: bool, e9: bool, ea: bool, eb: bool, ec: bool, ed: bool, ee: bool, ef: bool) Boolx16 { - return .{ e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, ea, eb, ec, ed, ee, ef }; -} -// zig fmt: on - -pub inline fn veclen(comptime T: type) comptime_int { - return @typeInfo(T).vector.len; -} - -pub inline fn splat(comptime T: type, value: f32) T { - return @splat(value); -} -pub inline fn splatInt(comptime T: type, value: u32) T { - return @splat(@bitCast(value)); -} - -pub fn load(mem: []const f32, comptime T: type, comptime len: u32) T { - var v = splat(T, 0.0); - const loop_len = if (len == 0) veclen(T) else len; - comptime var i: u32 = 0; - inline while (i < loop_len) : (i += 1) { - v[i] = mem[i]; - } - return v; -} -test "zmath.load" { - const a = [7]f32{ 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0 }; - var ptr = &a; - var i: u32 = 0; - const v0 = load(a[i..], F32x4, 2); - try expectVecEqual(v0, F32x4{ 1.0, 2.0, 0.0, 0.0 }); - i += 2; - const v1 = load(a[i .. i + 2], F32x4, 2); - try expectVecEqual(v1, F32x4{ 3.0, 4.0, 0.0, 0.0 }); - const v2 = load(a[5..7], F32x4, 2); - try expectVecEqual(v2, F32x4{ 6.0, 7.0, 0.0, 0.0 }); - const v3 = load(ptr[1..], F32x4, 2); - try expectVecEqual(v3, F32x4{ 2.0, 3.0, 0.0, 0.0 }); - i += 1; - const v4 = load(ptr[i .. i + 2], F32x4, 2); - try expectVecEqual(v4, F32x4{ 4.0, 5.0, 0.0, 0.0 }); -} - -pub fn store(mem: []f32, v: anytype, comptime len: u32) void { - const T = @TypeOf(v); - const loop_len = if (len == 0) veclen(T) else len; - comptime var i: u32 = 0; - inline while (i < loop_len) : (i += 1) { - mem[i] = v[i]; - } -} -test "zmath.store" { - var a = [7]f32{ 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0 }; - const v = load(a[1..], F32x4, 3); - store(a[2..], v, 4); - try expect(a[0] == 1.0); - try expect(a[1] == 2.0); - try expect(a[2] == 2.0); - try expect(a[3] == 3.0); - try expect(a[4] == 4.0); - try expect(a[5] == 0.0); -} - -pub inline fn loadArr2(arr: [2]f32) F32x4 { - return f32x4(arr[0], arr[1], 0.0, 0.0); -} -pub inline fn loadArr2zw(arr: [2]f32, z: f32, w: f32) F32x4 { - return f32x4(arr[0], arr[1], z, w); -} -pub inline fn loadArr3(arr: [3]f32) F32x4 { - return f32x4(arr[0], arr[1], arr[2], 0.0); -} -pub inline fn loadArr3w(arr: [3]f32, w: f32) F32x4 { - return f32x4(arr[0], arr[1], arr[2], w); -} -pub inline fn loadArr4(arr: [4]f32) F32x4 { - return f32x4(arr[0], arr[1], arr[2], arr[3]); -} - -pub inline fn storeArr2(arr: *[2]f32, v: F32x4) void { - arr.* = .{ v[0], v[1] }; -} -pub inline fn storeArr3(arr: *[3]f32, v: F32x4) void { - arr.* = .{ v[0], v[1], v[2] }; -} -pub inline fn storeArr4(arr: *[4]f32, v: F32x4) void { - arr.* = .{ v[0], v[1], v[2], v[3] }; -} - -pub inline fn arr3Ptr(ptr: anytype) *const [3]f32 { - comptime assert(@typeInfo(@TypeOf(ptr)) == .pointer); - const T = std.meta.Child(@TypeOf(ptr)); - comptime assert(T == F32x4); - return @as(*const [3]f32, @ptrCast(ptr)); -} - -pub inline fn arrNPtr(ptr: anytype) [*]const f32 { - comptime assert(@typeInfo(@TypeOf(ptr)) == .pointer); - const T = std.meta.Child(@TypeOf(ptr)); - comptime assert(T == Mat or T == F32x4 or T == F32x8 or T == F32x16); - return @as([*]const f32, @ptrCast(ptr)); -} -test "zmath.arrNPtr" { - { - const mat = identity(); - const f32ptr = arrNPtr(&mat); - try expect(f32ptr[0] == 1.0); - try expect(f32ptr[5] == 1.0); - try expect(f32ptr[10] == 1.0); - try expect(f32ptr[15] == 1.0); - } - { - const v8 = f32x8s(1.0); - const f32ptr = arrNPtr(&v8); - try expect(f32ptr[1] == 1.0); - try expect(f32ptr[7] == 1.0); - } -} - -test "zmath.loadArr" { - { - const camera_position = [3]f32{ 1.0, 2.0, 3.0 }; - const simd_reg = loadArr3(camera_position); - try expectVecEqual(simd_reg, f32x4(1.0, 2.0, 3.0, 0.0)); - } - { - const camera_position = [3]f32{ 1.0, 2.0, 3.0 }; - const simd_reg = loadArr3w(camera_position, 1.0); - try expectVecEqual(simd_reg, f32x4(1.0, 2.0, 3.0, 1.0)); - } -} - -pub inline fn vecToArr2(v: Vec) [2]f32 { - return .{ v[0], v[1] }; -} -pub inline fn vecToArr3(v: Vec) [3]f32 { - return .{ v[0], v[1], v[2] }; -} -pub inline fn vecToArr4(v: Vec) [4]f32 { - return .{ v[0], v[1], v[2], v[3] }; -} -// ------------------------------------------------------------------------------ -// -// 2. Functions that work on all vector components (F32xN = F32x4 or F32x8 or F32x16) -// -// ------------------------------------------------------------------------------ -pub fn all(vb: anytype, comptime len: u32) bool { - const T = @TypeOf(vb); - if (len > veclen(T)) { - @compileError("zmath.all(): 'len' is greater than vector len of type " ++ @typeName(T)); - } - const loop_len = if (len == 0) veclen(T) else len; - const ab: [veclen(T)]bool = vb; - comptime var i: u32 = 0; - var result = true; - inline while (i < loop_len) : (i += 1) { - result = result and ab[i]; - } - return result; -} -test "zmath.all" { - try expect(all(boolx8(true, true, true, true, true, false, true, false), 5) == true); - try expect(all(boolx8(true, true, true, true, true, false, true, false), 6) == false); - try expect(all(boolx8(true, true, true, true, false, false, false, false), 4) == true); - try expect(all(boolx4(true, true, true, false), 3) == true); - try expect(all(boolx4(true, true, true, false), 1) == true); - try expect(all(boolx4(true, false, false, false), 1) == true); - try expect(all(boolx4(false, true, false, false), 1) == false); - try expect(all(boolx8(true, true, true, true, true, false, true, false), 0) == false); - try expect(all(boolx4(false, true, false, false), 0) == false); - try expect(all(boolx4(true, true, true, true), 0) == true); -} - -pub fn any(vb: anytype, comptime len: u32) bool { - const T = @TypeOf(vb); - if (len > veclen(T)) { - @compileError("zmath.any(): 'len' is greater than vector len of type " ++ @typeName(T)); - } - const loop_len = if (len == 0) veclen(T) else len; - const ab: [veclen(T)]bool = vb; - comptime var i: u32 = 0; - var result = false; - inline while (i < loop_len) : (i += 1) { - result = result or ab[i]; - } - return result; -} -test "zmath.any" { - try expect(any(boolx8(true, true, true, true, true, false, true, false), 0) == true); - try expect(any(boolx8(false, false, false, true, true, false, true, false), 3) == false); - try expect(any(boolx8(false, false, false, false, false, true, false, false), 4) == false); -} - -pub inline fn isNearEqual( - v0: anytype, - v1: anytype, - epsilon: anytype, -) @Vector(veclen(@TypeOf(v0)), bool) { - const T = @TypeOf(v0, v1, epsilon); - const delta = v0 - v1; - const temp = maxFast(delta, splat(T, 0.0) - delta); - return temp <= epsilon; -} -test "zmath.isNearEqual" { - { - const v0 = f32x4(1.0, 2.0, -3.0, 4.001); - const v1 = f32x4(1.0, 2.1, 3.0, 4.0); - const b = isNearEqual(v0, v1, splat(F32x4, 0.01)); - try expect(@reduce(.And, b == boolx4(true, false, false, true))); - } - { - const v0 = f32x8(1.0, 2.0, -3.0, 4.001, 1.001, 2.3, -0.0, 0.0); - const v1 = f32x8(1.0, 2.1, 3.0, 4.0, -1.001, 2.1, 0.0, 0.0); - const b = isNearEqual(v0, v1, splat(F32x8, 0.01)); - try expect(@reduce(.And, b == boolx8(true, false, false, true, false, false, true, true))); - } - try expect(all(isNearEqual( - splat(F32x4, math.inf(f32)), - splat(F32x4, math.inf(f32)), - splat(F32x4, 0.0001), - ), 0) == false); - try expect(all(isNearEqual( - splat(F32x4, -math.inf(f32)), - splat(F32x4, math.inf(f32)), - splat(F32x4, 0.0001), - ), 0) == false); - try expect(all(isNearEqual( - splat(F32x4, -math.inf(f32)), - splat(F32x4, -math.inf(f32)), - splat(F32x4, 0.0001), - ), 0) == false); - try expect(all(isNearEqual( - splat(F32x4, -math.nan(f32)), - splat(F32x4, math.inf(f32)), - splat(F32x4, 0.0001), - ), 0) == false); -} - -pub inline fn isNan( - v: anytype, -) @Vector(veclen(@TypeOf(v)), bool) { - return v != v; -} -test "zmath.isNan" { - { - const v0 = f32x4(math.inf(f32), math.nan(f32), math.nan(f32), 7.0); - const b = isNan(v0); - try expect(@reduce(.And, b == boolx4(false, true, true, false))); - } - { - const v0 = f32x8(0, math.nan(f32), 0, 0, math.inf(f32), math.nan(f32), math.snan(f32), 7.0); - const b = isNan(v0); - try expect(@reduce(.And, b == boolx8(false, true, false, false, false, true, true, false))); - } -} - -pub inline fn isInf( - v: anytype, -) @Vector(veclen(@TypeOf(v)), bool) { - const T = @TypeOf(v); - return abs(v) == splat(T, math.inf(f32)); -} -test "zmath.isInf" { - { - const v0 = f32x4(math.inf(f32), math.nan(f32), math.snan(f32), 7.0); - const b = isInf(v0); - try expect(@reduce(.And, b == boolx4(true, false, false, false))); - } - { - const v0 = f32x8(0, math.inf(f32), 0, 0, math.inf(f32), math.nan(f32), math.snan(f32), 7.0); - const b = isInf(v0); - try expect(@reduce(.And, b == boolx8(false, true, false, false, true, false, false, false))); - } -} - -pub inline fn isInBounds( - v: anytype, - bounds: anytype, -) @Vector(veclen(@TypeOf(v)), bool) { - const T = @TypeOf(v, bounds); - const Tu = @Vector(veclen(T), u1); - const Tr = @Vector(veclen(T), bool); - - // 2 x cmpleps, xorps, load, andps - const b0 = v <= bounds; - const b1 = (bounds * splat(T, -1.0)) <= v; - const b0u = @as(Tu, @bitCast(b0)); - const b1u = @as(Tu, @bitCast(b1)); - return @as(Tr, @bitCast(b0u & b1u)); -} -test "zmath.isInBounds" { - { - const v0 = f32x4(0.5, -2.0, -1.0, 1.9); - const v1 = f32x4(-1.6, -2.001, -1.0, 1.9); - const bounds = f32x4(1.0, 2.0, 1.0, 2.0); - const b0 = isInBounds(v0, bounds); - const b1 = isInBounds(v1, bounds); - try expect(@reduce(.And, b0 == boolx4(true, true, true, true))); - try expect(@reduce(.And, b1 == boolx4(false, false, true, true))); - } - { - const v0 = f32x8(2.0, 1.0, 2.0, 1.0, 0.5, -2.0, -1.0, 1.9); - const bounds = f32x8(1.0, 1.0, 1.0, math.inf(f32), 1.0, math.nan(f32), 1.0, 2.0); - const b0 = isInBounds(v0, bounds); - try expect(@reduce(.And, b0 == boolx8(false, true, false, true, true, false, true, true))); - } -} - -pub inline fn andInt(v0: anytype, v1: anytype) @TypeOf(v0, v1) { - const T = @TypeOf(v0, v1); - const Tu = @Vector(veclen(T), u32); - const v0u = @as(Tu, @bitCast(v0)); - const v1u = @as(Tu, @bitCast(v1)); - return @as(T, @bitCast(v0u & v1u)); // andps -} -test "zmath.andInt" { - { - const v0 = f32x4(0, @as(f32, @bitCast(~@as(u32, 0))), 0, @as(f32, @bitCast(~@as(u32, 0)))); - const v1 = f32x4(1.0, 2.0, 3.0, math.inf(f32)); - const v = andInt(v0, v1); - try expect(v[3] == math.inf(f32)); - try expectVecEqual(v, f32x4(0.0, 2.0, 0.0, math.inf(f32))); - } - { - const v0 = f32x8(0, 0, 0, 0, 0, @as(f32, @bitCast(~@as(u32, 0))), 0, @as(f32, @bitCast(~@as(u32, 0)))); - const v1 = f32x8(0, 0, 0, 0, 1.0, 2.0, 3.0, math.inf(f32)); - const v = andInt(v0, v1); - try expect(v[7] == math.inf(f32)); - try expectVecEqual(v, f32x8(0, 0, 0, 0, 0.0, 2.0, 0.0, math.inf(f32))); - } -} - -pub inline fn andNotInt(v0: anytype, v1: anytype) @TypeOf(v0, v1) { - const T = @TypeOf(v0, v1); - const Tu = @Vector(veclen(T), u32); - const v0u = @as(Tu, @bitCast(v0)); - const v1u = @as(Tu, @bitCast(v1)); - return @as(T, @bitCast(~v0u & v1u)); // andnps -} -test "zmath.andNotInt" { - { - const v0 = f32x4(1.0, 2.0, 3.0, 4.0); - const v1 = f32x4(0, @as(f32, @bitCast(~@as(u32, 0))), 0, @as(f32, @bitCast(~@as(u32, 0)))); - const v = andNotInt(v1, v0); - try expectVecEqual(v, f32x4(1.0, 0.0, 3.0, 0.0)); - } - { - const v0 = f32x8(0, 0, 0, 0, 1.0, 2.0, 3.0, 4.0); - const v1 = f32x8(0, 0, 0, 0, 0, @as(f32, @bitCast(~@as(u32, 0))), 0, @as(f32, @bitCast(~@as(u32, 0)))); - const v = andNotInt(v1, v0); - try expectVecEqual(v, f32x8(0, 0, 0, 0, 1.0, 0.0, 3.0, 0.0)); - } -} - -pub inline fn orInt(v0: anytype, v1: anytype) @TypeOf(v0, v1) { - const T = @TypeOf(v0, v1); - const Tu = @Vector(veclen(T), u32); - const v0u = @as(Tu, @bitCast(v0)); - const v1u = @as(Tu, @bitCast(v1)); - return @as(T, @bitCast(v0u | v1u)); // orps -} -test "zmath.orInt" { - { - const v0 = f32x4(0, @as(f32, @bitCast(~@as(u32, 0))), 0, 0); - const v1 = f32x4(1.0, 2.0, 3.0, 4.0); - const v = orInt(v0, v1); - try expect(v[0] == 1.0); - try expect(@as(u32, @bitCast(v[1])) == ~@as(u32, 0)); - try expect(v[2] == 3.0); - try expect(v[3] == 4.0); - } - { - const v0 = f32x8(0, 0, 0, 0, 0, @as(f32, @bitCast(~@as(u32, 0))), 0, 0); - const v1 = f32x8(0, 0, 0, 0, 1.0, 2.0, 3.0, 4.0); - const v = orInt(v0, v1); - try expect(v[4] == 1.0); - try expect(@as(u32, @bitCast(v[5])) == ~@as(u32, 0)); - try expect(v[6] == 3.0); - try expect(v[7] == 4.0); - } -} - -pub inline fn norInt(v0: anytype, v1: anytype) @TypeOf(v0, v1) { - const T = @TypeOf(v0, v1); - const Tu = @Vector(veclen(T), u32); - const v0u = @as(Tu, @bitCast(v0)); - const v1u = @as(Tu, @bitCast(v1)); - return @as(T, @bitCast(~(v0u | v1u))); // por, pcmpeqd, pxor -} - -pub inline fn xorInt(v0: anytype, v1: anytype) @TypeOf(v0, v1) { - const T = @TypeOf(v0, v1); - const Tu = @Vector(veclen(T), u32); - const v0u = @as(Tu, @bitCast(v0)); - const v1u = @as(Tu, @bitCast(v1)); - return @as(T, @bitCast(v0u ^ v1u)); // xorps -} -test "zmath.xorInt" { - { - const v0 = f32x4(1.0, @as(f32, @bitCast(~@as(u32, 0))), 0, 0); - const v1 = f32x4(1.0, 0, 0, 0); - const v = xorInt(v0, v1); - try expect(v[0] == 0.0); - try expect(@as(u32, @bitCast(v[1])) == ~@as(u32, 0)); - try expect(v[2] == 0.0); - try expect(v[3] == 0.0); - } - { - const v0 = f32x8(0, 0, 0, 0, 1.0, @as(f32, @bitCast(~@as(u32, 0))), 0, 0); - const v1 = f32x8(0, 0, 0, 0, 1.0, 0, 0, 0); - const v = xorInt(v0, v1); - try expect(v[4] == 0.0); - try expect(@as(u32, @bitCast(v[5])) == ~@as(u32, 0)); - try expect(v[6] == 0.0); - try expect(v[7] == 0.0); - } -} - -pub inline fn minFast(v0: anytype, v1: anytype) @TypeOf(v0, v1) { - return select(v0 < v1, v0, v1); // minps -} -test "zmath.minFast" { - { - const v0 = f32x4(1.0, 3.0, 2.0, 7.0); - const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32)); - const v = minFast(v0, v1); - try expectVecEqual(v, f32x4(1.0, 1.0, 2.0, 7.0)); - } - { - const v0 = f32x4(1.0, math.nan(f32), 5.0, math.snan(f32)); - const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32)); - const v = minFast(v0, v1); - try expect(v[0] == 1.0); - try expect(v[1] == 1.0); - try expect(!math.isNan(v[1])); - try expect(v[2] == 4.0); - try expect(v[3] == math.inf(f32)); - try expect(!math.isNan(v[3])); - } -} - -pub inline fn maxFast(v0: anytype, v1: anytype) @TypeOf(v0, v1) { - return select(v0 > v1, v0, v1); // maxps -} -test "zmath.maxFast" { - { - const v0 = f32x4(1.0, 3.0, 2.0, 7.0); - const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32)); - const v = maxFast(v0, v1); - try expectVecEqual(v, f32x4(2.0, 3.0, 4.0, math.inf(f32))); - } - { - const v0 = f32x4(1.0, math.nan(f32), 5.0, math.snan(f32)); - const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32)); - const v = maxFast(v0, v1); - try expect(v[0] == 2.0); - try expect(v[1] == 1.0); - try expect(v[2] == 5.0); - try expect(v[3] == math.inf(f32)); - try expect(!math.isNan(v[3])); - } -} - -pub inline fn min(v0: anytype, v1: anytype) @TypeOf(v0, v1) { - // This will handle inf & nan - return @min(v0, v1); // minps, cmpunordps, andps, andnps, orps -} -test "zmath.min" { - // Calling math.inf causes test to fail! - if (builtin.target.os.tag == .macos and builtin.target.cpu.arch == .aarch64) return error.SkipZigTest; - { - const v0 = f32x4(1.0, 3.0, 2.0, 7.0); - const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32)); - const v = min(v0, v1); - try expectVecEqual(v, f32x4(1.0, 1.0, 2.0, 7.0)); - } - { - const v0 = f32x8(0, 0, -2.0, 0, 1.0, 3.0, 2.0, 7.0); - const v1 = f32x8(0, 1.0, 0, 0, 2.0, 1.0, 4.0, math.inf(f32)); - const v = min(v0, v1); - try expectVecEqual(v, f32x8(0.0, 0.0, -2.0, 0.0, 1.0, 1.0, 2.0, 7.0)); - } - { - const v0 = f32x4(1.0, math.nan(f32), 5.0, math.snan(f32)); - const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32)); - const v = min(v0, v1); - try expect(v[0] == 1.0); - try expect(v[1] == 1.0); - try expect(!math.isNan(v[1])); - try expect(v[2] == 4.0); - try expect(v[3] == math.inf(f32)); - try expect(!math.isNan(v[3])); - } - - { - const v0 = f32x4(-math.inf(f32), math.inf(f32), math.inf(f32), math.snan(f32)); - const v1 = f32x4(math.snan(f32), -math.inf(f32), math.snan(f32), math.nan(f32)); - const v = min(v0, v1); - try expect(v[0] == -math.inf(f32)); - try expect(v[1] == -math.inf(f32)); - try expect(v[2] == math.inf(f32)); - try expect(!math.isNan(v[2])); - try expect(math.isNan(v[3])); - try expect(!math.isInf(v[3])); - } -} - -pub inline fn max(v0: anytype, v1: anytype) @TypeOf(v0, v1) { - // This will handle inf & nan - return @max(v0, v1); // maxps, cmpunordps, andps, andnps, orps -} -test "zmath.max" { - // Calling math.inf causes test to fail! - if (builtin.target.os.tag == .macos and builtin.target.cpu.arch == .aarch64) return error.SkipZigTest; - { - const v0 = f32x4(1.0, 3.0, 2.0, 7.0); - const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32)); - const v = max(v0, v1); - try expectVecEqual(v, f32x4(2.0, 3.0, 4.0, math.inf(f32))); - } - { - const v0 = f32x8(0, 0, -2.0, 0, 1.0, 3.0, 2.0, 7.0); - const v1 = f32x8(0, 1.0, 0, 0, 2.0, 1.0, 4.0, math.inf(f32)); - const v = max(v0, v1); - try expectVecEqual(v, f32x8(0.0, 1.0, 0.0, 0.0, 2.0, 3.0, 4.0, math.inf(f32))); - } - { - const v0 = f32x4(1.0, math.nan(f32), 5.0, math.snan(f32)); - const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32)); - const v = max(v0, v1); - try expect(v[0] == 2.0); - try expect(v[1] == 1.0); - try expect(v[2] == 5.0); - try expect(v[3] == math.inf(f32)); - try expect(!math.isNan(v[3])); - } - { - const v0 = f32x4(-math.inf(f32), math.inf(f32), math.inf(f32), math.snan(f32)); - const v1 = f32x4(math.snan(f32), -math.inf(f32), math.snan(f32), math.nan(f32)); - const v = max(v0, v1); - try expect(v[0] == -math.inf(f32)); - try expect(v[1] == math.inf(f32)); - try expect(v[2] == math.inf(f32)); - try expect(!math.isNan(v[2])); - try expect(math.isNan(v[3])); - try expect(!math.isInf(v[3])); - } -} - -pub fn round(v: anytype) @TypeOf(v) { - const T = @TypeOf(v); - if (cpu_arch == .x86_64 and has_avx) { - if (T == F32x4) { - return asm ("vroundps $0, %%xmm0, %%xmm0" - : [ret] "={xmm0}" (-> T), - : [v] "{xmm0}" (v), - ); - } else if (T == F32x8) { - return asm ("vroundps $0, %%ymm0, %%ymm0" - : [ret] "={ymm0}" (-> T), - : [v] "{ymm0}" (v), - ); - } else if (T == F32x16 and has_avx512f) { - return asm ("vrndscaleps $0, %%zmm0, %%zmm0" - : [ret] "={zmm0}" (-> T), - : [v] "{zmm0}" (v), - ); - } else if (T == F32x16 and !has_avx512f) { - const arr: [16]f32 = v; - var ymm0 = @as(F32x8, arr[0..8].*); - var ymm1 = @as(F32x8, arr[8..16].*); - ymm0 = asm ("vroundps $0, %%ymm0, %%ymm0" - : [ret] "={ymm0}" (-> F32x8), - : [v] "{ymm0}" (ymm0), - ); - ymm1 = asm ("vroundps $0, %%ymm1, %%ymm1" - : [ret] "={ymm1}" (-> F32x8), - : [v] "{ymm1}" (ymm1), - ); - return @shuffle(f32, ymm0, ymm1, [16]i32{ 0, 1, 2, 3, 4, 5, 6, 7, -1, -2, -3, -4, -5, -6, -7, -8 }); - } - } else { - const sign = andInt(v, splatNegativeZero(T)); - const magic = orInt(splatNoFraction(T), sign); - var r1 = v + magic; - r1 = r1 - magic; - const r2 = abs(v); - const mask = r2 <= splatNoFraction(T); - return select(mask, r1, v); - } -} -test "zmath.round" { - { - try expect(all(round(splat(F32x4, math.inf(f32))) == splat(F32x4, math.inf(f32)), 0)); - try expect(all(round(splat(F32x4, -math.inf(f32))) == splat(F32x4, -math.inf(f32)), 0)); - try expect(all(isNan(round(splat(F32x4, math.nan(f32)))), 0)); - try expect(all(isNan(round(splat(F32x4, -math.nan(f32)))), 0)); - try expect(all(isNan(round(splat(F32x4, math.snan(f32)))), 0)); - try expect(all(isNan(round(splat(F32x4, -math.snan(f32)))), 0)); - } - { - const v = round(f32x16(1.1, -1.1, -1.5, 1.5, 2.1, 2.8, 2.9, 4.1, 5.8, 6.1, 7.9, 8.9, 10.1, 11.2, 12.7, 13.1)); - try expectVecApproxEqAbs( - v, - f32x16(1.0, -1.0, -2.0, 2.0, 2.0, 3.0, 3.0, 4.0, 6.0, 6.0, 8.0, 9.0, 10.0, 11.0, 13.0, 13.0), - 0.0, - ); - } - var v = round(f32x4(1.1, -1.1, -1.5, 1.5)); - try expectVecEqual(v, f32x4(1.0, -1.0, -2.0, 2.0)); - - const v1 = f32x4(-10_000_000.1, -math.inf(f32), 10_000_001.5, math.inf(f32)); - v = round(v1); - try expect(v[3] == math.inf(f32)); - try expectVecEqual(v, f32x4(-10_000_000.1, -math.inf(f32), 10_000_001.5, math.inf(f32))); - - const v2 = f32x4(-math.snan(f32), math.snan(f32), math.nan(f32), -math.inf(f32)); - v = round(v2); - try expect(math.isNan(v2[0])); - try expect(math.isNan(v2[1])); - try expect(math.isNan(v2[2])); - try expect(v2[3] == -math.inf(f32)); - - const v3 = f32x4(1001.5, -201.499, -10000.99, -101.5); - v = round(v3); - try expectVecEqual(v, f32x4(1002.0, -201.0, -10001.0, -102.0)); - - const v4 = f32x4(-1_388_609.9, 1_388_609.5, 1_388_109.01, 2_388_609.5); - v = round(v4); - try expectVecEqual(v, f32x4(-1_388_610.0, 1_388_610.0, 1_388_109.0, 2_388_610.0)); - - var f: f32 = -100.0; - var i: u32 = 0; - while (i < 100) : (i += 1) { - const vr = round(splat(F32x4, f)); - const fr = @round(splat(F32x4, f)); - const vr8 = round(splat(F32x8, f)); - const fr8 = @round(splat(F32x8, f)); - const vr16 = round(splat(F32x16, f)); - const fr16 = @round(splat(F32x16, f)); - try expectVecEqual(vr, fr); - try expectVecEqual(vr8, fr8); - try expectVecEqual(vr16, fr16); - f += 0.12345 * @as(f32, @floatFromInt(i)); - } -} - -pub fn trunc(v: anytype) @TypeOf(v) { - const T = @TypeOf(v); - if (cpu_arch == .x86_64 and has_avx) { - if (T == F32x4) { - return asm ("vroundps $3, %%xmm0, %%xmm0" - : [ret] "={xmm0}" (-> T), - : [v] "{xmm0}" (v), - ); - } else if (T == F32x8) { - return asm ("vroundps $3, %%ymm0, %%ymm0" - : [ret] "={ymm0}" (-> T), - : [v] "{ymm0}" (v), - ); - } else if (T == F32x16 and has_avx512f) { - return asm ("vrndscaleps $3, %%zmm0, %%zmm0" - : [ret] "={zmm0}" (-> T), - : [v] "{zmm0}" (v), - ); - } else if (T == F32x16 and !has_avx512f) { - const arr: [16]f32 = v; - var ymm0 = @as(F32x8, arr[0..8].*); - var ymm1 = @as(F32x8, arr[8..16].*); - ymm0 = asm ("vroundps $3, %%ymm0, %%ymm0" - : [ret] "={ymm0}" (-> F32x8), - : [v] "{ymm0}" (ymm0), - ); - ymm1 = asm ("vroundps $3, %%ymm1, %%ymm1" - : [ret] "={ymm1}" (-> F32x8), - : [v] "{ymm1}" (ymm1), - ); - return @shuffle(f32, ymm0, ymm1, [16]i32{ 0, 1, 2, 3, 4, 5, 6, 7, -1, -2, -3, -4, -5, -6, -7, -8 }); - } - } else { - const mask = abs(v) < splatNoFraction(T); - const result = floatToIntAndBack(v); - return select(mask, result, v); - } -} -test "zmath.trunc" { - { - try expect(all(trunc(splat(F32x4, math.inf(f32))) == splat(F32x4, math.inf(f32)), 0)); - try expect(all(trunc(splat(F32x4, -math.inf(f32))) == splat(F32x4, -math.inf(f32)), 0)); - try expect(all(isNan(trunc(splat(F32x4, math.nan(f32)))), 0)); - try expect(all(isNan(trunc(splat(F32x4, -math.nan(f32)))), 0)); - try expect(all(isNan(trunc(splat(F32x4, math.snan(f32)))), 0)); - try expect(all(isNan(trunc(splat(F32x4, -math.snan(f32)))), 0)); - } - { - const v = trunc(f32x16(1.1, -1.1, -1.5, 1.5, 2.1, 2.8, 2.9, 4.1, 5.8, 6.1, 7.9, 8.9, 10.1, 11.2, 12.7, 13.1)); - try expectVecApproxEqAbs( - v, - f32x16(1.0, -1.0, -1.0, 1.0, 2.0, 2.0, 2.0, 4.0, 5.0, 6.0, 7.0, 8.0, 10.0, 11.0, 12.0, 13.0), - 0.0, - ); - } - var v = trunc(f32x4(1.1, -1.1, -1.5, 1.5)); - try expectVecEqual(v, f32x4(1.0, -1.0, -1.0, 1.0)); - - v = trunc(f32x4(-10_000_002.1, -math.inf(f32), 10_000_001.5, math.inf(f32))); - try expectVecEqual(v, f32x4(-10_000_002.1, -math.inf(f32), 10_000_001.5, math.inf(f32))); - - v = trunc(f32x4(-math.snan(f32), math.snan(f32), math.nan(f32), -math.inf(f32))); - try expect(math.isNan(v[0])); - try expect(math.isNan(v[1])); - try expect(math.isNan(v[2])); - try expect(v[3] == -math.inf(f32)); - - v = trunc(f32x4(1000.5001, -201.499, -10000.99, 100.750001)); - try expectVecEqual(v, f32x4(1000.0, -201.0, -10000.0, 100.0)); - - v = trunc(f32x4(-7_388_609.5, 7_388_609.1, 8_388_109.5, -8_388_509.5)); - try expectVecEqual(v, f32x4(-7_388_609.0, 7_388_609.0, 8_388_109.0, -8_388_509.0)); - - var f: f32 = -100.0; - var i: u32 = 0; - while (i < 100) : (i += 1) { - const vr = trunc(splat(F32x4, f)); - const fr = @trunc(splat(F32x4, f)); - const vr8 = trunc(splat(F32x8, f)); - const fr8 = @trunc(splat(F32x8, f)); - const vr16 = trunc(splat(F32x16, f)); - const fr16 = @trunc(splat(F32x16, f)); - try expectVecEqual(vr, fr); - try expectVecEqual(vr8, fr8); - try expectVecEqual(vr16, fr16); - f += 0.12345 * @as(f32, @floatFromInt(i)); - } -} - -pub fn floor(v: anytype) @TypeOf(v) { - const T = @TypeOf(v); - if (cpu_arch == .x86_64 and has_avx) { - if (T == F32x4) { - return asm ("vroundps $1, %%xmm0, %%xmm0" - : [ret] "={xmm0}" (-> T), - : [v] "{xmm0}" (v), - ); - } else if (T == F32x8) { - return asm ("vroundps $1, %%ymm0, %%ymm0" - : [ret] "={ymm0}" (-> T), - : [v] "{ymm0}" (v), - ); - } else if (T == F32x16 and has_avx512f) { - return asm ("vrndscaleps $1, %%zmm0, %%zmm0" - : [ret] "={zmm0}" (-> T), - : [v] "{zmm0}" (v), - ); - } else if (T == F32x16 and !has_avx512f) { - const arr: [16]f32 = v; - var ymm0 = @as(F32x8, arr[0..8].*); - var ymm1 = @as(F32x8, arr[8..16].*); - ymm0 = asm ("vroundps $1, %%ymm0, %%ymm0" - : [ret] "={ymm0}" (-> F32x8), - : [v] "{ymm0}" (ymm0), - ); - ymm1 = asm ("vroundps $1, %%ymm1, %%ymm1" - : [ret] "={ymm1}" (-> F32x8), - : [v] "{ymm1}" (ymm1), - ); - return @shuffle(f32, ymm0, ymm1, [16]i32{ 0, 1, 2, 3, 4, 5, 6, 7, -1, -2, -3, -4, -5, -6, -7, -8 }); - } - } else { - const mask = abs(v) < splatNoFraction(T); - var result = floatToIntAndBack(v); - const larger_mask = result > v; - const larger = select(larger_mask, splat(T, -1.0), splat(T, 0.0)); - result = result + larger; - return select(mask, result, v); - } -} -test "zmath.floor" { - { - try expect(all(floor(splat(F32x4, math.inf(f32))) == splat(F32x4, math.inf(f32)), 0)); - try expect(all(floor(splat(F32x4, -math.inf(f32))) == splat(F32x4, -math.inf(f32)), 0)); - try expect(all(isNan(floor(splat(F32x4, math.nan(f32)))), 0)); - try expect(all(isNan(floor(splat(F32x4, -math.nan(f32)))), 0)); - try expect(all(isNan(floor(splat(F32x4, math.snan(f32)))), 0)); - try expect(all(isNan(floor(splat(F32x4, -math.snan(f32)))), 0)); - } - { - const v = floor(f32x16(1.1, -1.1, -1.5, 1.5, 2.1, 2.8, 2.9, 4.1, 5.8, 6.1, 7.9, 8.9, 10.1, 11.2, 12.7, 13.1)); - try expectVecApproxEqAbs( - v, - f32x16(1.0, -2.0, -2.0, 1.0, 2.0, 2.0, 2.0, 4.0, 5.0, 6.0, 7.0, 8.0, 10.0, 11.0, 12.0, 13.0), - 0.0, - ); - } - var v = floor(f32x4(1.5, -1.5, -1.7, -2.1)); - try expectVecEqual(v, f32x4(1.0, -2.0, -2.0, -3.0)); - - v = floor(f32x4(-10_000_002.1, -math.inf(f32), 10_000_001.5, math.inf(f32))); - try expectVecEqual(v, f32x4(-10_000_002.1, -math.inf(f32), 10_000_001.5, math.inf(f32))); - - v = floor(f32x4(-math.snan(f32), math.snan(f32), math.nan(f32), -math.inf(f32))); - try expect(math.isNan(v[0])); - try expect(math.isNan(v[1])); - try expect(math.isNan(v[2])); - try expect(v[3] == -math.inf(f32)); - - v = floor(f32x4(1000.5001, -201.499, -10000.99, 100.75001)); - try expectVecEqual(v, f32x4(1000.0, -202.0, -10001.0, 100.0)); - - v = floor(f32x4(-7_388_609.5, 7_388_609.1, 8_388_109.5, -8_388_509.5)); - try expectVecEqual(v, f32x4(-7_388_610.0, 7_388_609.0, 8_388_109.0, -8_388_510.0)); - - var f: f32 = -100.0; - var i: u32 = 0; - while (i < 100) : (i += 1) { - const vr = floor(splat(F32x4, f)); - const fr = @floor(splat(F32x4, f)); - const vr8 = floor(splat(F32x8, f)); - const fr8 = @floor(splat(F32x8, f)); - const vr16 = floor(splat(F32x16, f)); - const fr16 = @floor(splat(F32x16, f)); - try expectVecEqual(vr, fr); - try expectVecEqual(vr8, fr8); - try expectVecEqual(vr16, fr16); - f += 0.12345 * @as(f32, @floatFromInt(i)); - } -} - -pub fn ceil(v: anytype) @TypeOf(v) { - const T = @TypeOf(v); - if (cpu_arch == .x86_64 and has_avx) { - if (T == F32x4) { - return asm ("vroundps $2, %%xmm0, %%xmm0" - : [ret] "={xmm0}" (-> T), - : [v] "{xmm0}" (v), - ); - } else if (T == F32x8) { - return asm ("vroundps $2, %%ymm0, %%ymm0" - : [ret] "={ymm0}" (-> T), - : [v] "{ymm0}" (v), - ); - } else if (T == F32x16 and has_avx512f) { - return asm ("vrndscaleps $2, %%zmm0, %%zmm0" - : [ret] "={zmm0}" (-> T), - : [v] "{zmm0}" (v), - ); - } else if (T == F32x16 and !has_avx512f) { - const arr: [16]f32 = v; - var ymm0 = @as(F32x8, arr[0..8].*); - var ymm1 = @as(F32x8, arr[8..16].*); - ymm0 = asm ("vroundps $2, %%ymm0, %%ymm0" - : [ret] "={ymm0}" (-> F32x8), - : [v] "{ymm0}" (ymm0), - ); - ymm1 = asm ("vroundps $2, %%ymm1, %%ymm1" - : [ret] "={ymm1}" (-> F32x8), - : [v] "{ymm1}" (ymm1), - ); - return @shuffle(f32, ymm0, ymm1, [16]i32{ 0, 1, 2, 3, 4, 5, 6, 7, -1, -2, -3, -4, -5, -6, -7, -8 }); - } - } else { - const mask = abs(v) < splatNoFraction(T); - var result = floatToIntAndBack(v); - const smaller_mask = result < v; - const smaller = select(smaller_mask, splat(T, -1.0), splat(T, 0.0)); - result = result - smaller; - return select(mask, result, v); - } -} -test "zmath.ceil" { - { - try expect(all(ceil(splat(F32x4, math.inf(f32))) == splat(F32x4, math.inf(f32)), 0)); - try expect(all(ceil(splat(F32x4, -math.inf(f32))) == splat(F32x4, -math.inf(f32)), 0)); - try expect(all(isNan(ceil(splat(F32x4, math.nan(f32)))), 0)); - try expect(all(isNan(ceil(splat(F32x4, -math.nan(f32)))), 0)); - try expect(all(isNan(ceil(splat(F32x4, math.snan(f32)))), 0)); - try expect(all(isNan(ceil(splat(F32x4, -math.snan(f32)))), 0)); - } - { - const v = ceil(f32x16(1.1, -1.1, -1.5, 1.5, 2.1, 2.8, 2.9, 4.1, 5.8, 6.1, 7.9, 8.9, 10.1, 11.2, 12.7, 13.1)); - try expectVecApproxEqAbs( - v, - f32x16(2.0, -1.0, -1.0, 2.0, 3.0, 3.0, 3.0, 5.0, 6.0, 7.0, 8.0, 9.0, 11.0, 12.0, 13.0, 14.0), - 0.0, - ); - } - var v = ceil(f32x4(1.5, -1.5, -1.7, -2.1)); - try expectVecEqual(v, f32x4(2.0, -1.0, -1.0, -2.0)); - - v = ceil(f32x4(-10_000_002.1, -math.inf(f32), 10_000_001.5, math.inf(f32))); - try expectVecEqual(v, f32x4(-10_000_002.1, -math.inf(f32), 10_000_001.5, math.inf(f32))); - - v = ceil(f32x4(-math.snan(f32), math.snan(f32), math.nan(f32), -math.inf(f32))); - try expect(math.isNan(v[0])); - try expect(math.isNan(v[1])); - try expect(math.isNan(v[2])); - try expect(v[3] == -math.inf(f32)); - - v = ceil(f32x4(1000.5001, -201.499, -10000.99, 100.75001)); - try expectVecEqual(v, f32x4(1001.0, -201.0, -10000.0, 101.0)); - - v = ceil(f32x4(-1_388_609.5, 1_388_609.1, 1_388_109.9, -1_388_509.9)); - try expectVecEqual(v, f32x4(-1_388_609.0, 1_388_610.0, 1_388_110.0, -1_388_509.0)); - - var f: f32 = -100.0; - var i: u32 = 0; - while (i < 100) : (i += 1) { - const vr = ceil(splat(F32x4, f)); - const fr = @ceil(splat(F32x4, f)); - const vr8 = ceil(splat(F32x8, f)); - const fr8 = @ceil(splat(F32x8, f)); - const vr16 = ceil(splat(F32x16, f)); - const fr16 = @ceil(splat(F32x16, f)); - try expectVecEqual(vr, fr); - try expectVecEqual(vr8, fr8); - try expectVecEqual(vr16, fr16); - f += 0.12345 * @as(f32, @floatFromInt(i)); - } -} - -pub inline fn clamp(v: anytype, vmin: anytype, vmax: anytype) @TypeOf(v, vmin, vmax) { - var result = max(vmin, v); - result = min(vmax, result); - return result; -} -test "zmath.clamp" { - // Calling math.inf causes test to fail! - if (builtin.target.os.tag == .macos and builtin.target.cpu.arch == .aarch64) return error.SkipZigTest; - { - const v0 = f32x4(-1.0, 0.2, 1.1, -0.3); - const v = clamp(v0, splat(F32x4, -0.5), splat(F32x4, 0.5)); - try expectVecApproxEqAbs(v, f32x4(-0.5, 0.2, 0.5, -0.3), 0.0001); - } - { - const v0 = f32x8(-2.0, 0.25, -0.25, 100.0, -1.0, 0.2, 1.1, -0.3); - const v = clamp(v0, splat(F32x8, -0.5), splat(F32x8, 0.5)); - try expectVecApproxEqAbs(v, f32x8(-0.5, 0.25, -0.25, 0.5, -0.5, 0.2, 0.5, -0.3), 0.0001); - } - { - const v0 = f32x4(-math.inf(f32), math.inf(f32), math.nan(f32), math.snan(f32)); - const v = clamp(v0, f32x4(-100.0, 0.0, -100.0, 0.0), f32x4(0.0, 100.0, 0.0, 100.0)); - try expectVecApproxEqAbs(v, f32x4(-100.0, 100.0, -100.0, 0.0), 0.0001); - } - { - const v0 = f32x4(math.inf(f32), math.inf(f32), -math.nan(f32), -math.snan(f32)); - const v = clamp(v0, splat(F32x4, -1.0), splat(F32x4, 1.0)); - try expectVecApproxEqAbs(v, f32x4(1.0, 1.0, -1.0, -1.0), 0.0001); - } -} - -pub inline fn clampFast(v: anytype, vmin: anytype, vmax: anytype) @TypeOf(v, vmin, vmax) { - var result = maxFast(vmin, v); - result = minFast(vmax, result); - return result; -} -test "zmath.clampFast" { - { - const v0 = f32x4(-1.0, 0.2, 1.1, -0.3); - const v = clampFast(v0, splat(F32x4, -0.5), splat(F32x4, 0.5)); - try expectVecApproxEqAbs(v, f32x4(-0.5, 0.2, 0.5, -0.3), 0.0001); - } -} - -pub inline fn saturate(v: anytype) @TypeOf(v) { - const T = @TypeOf(v); - var result = max(v, splat(T, 0.0)); - result = min(result, splat(T, 1.0)); - return result; -} -test "zmath.saturate" { - // Calling math.inf causes test to fail! - if (builtin.target.os.tag == .macos and builtin.target.cpu.arch == .aarch64) return error.SkipZigTest; - { - const v0 = f32x4(-1.0, 0.2, 1.1, -0.3); - const v = saturate(v0); - try expectVecApproxEqAbs(v, f32x4(0.0, 0.2, 1.0, 0.0), 0.0001); - } - { - const v0 = f32x8(0.0, 0.0, 2.0, -2.0, -1.0, 0.2, 1.1, -0.3); - const v = saturate(v0); - try expectVecApproxEqAbs(v, f32x8(0.0, 0.0, 1.0, 0.0, 0.0, 0.2, 1.0, 0.0), 0.0001); - } - { - const v0 = f32x4(-math.inf(f32), math.inf(f32), math.nan(f32), math.snan(f32)); - const v = saturate(v0); - try expectVecApproxEqAbs(v, f32x4(0.0, 1.0, 0.0, 0.0), 0.0001); - } - { - const v0 = f32x4(math.inf(f32), math.inf(f32), -math.nan(f32), -math.snan(f32)); - const v = saturate(v0); - try expectVecApproxEqAbs(v, f32x4(1.0, 1.0, 0.0, 0.0), 0.0001); - } -} - -pub inline fn saturateFast(v: anytype) @TypeOf(v) { - const T = @TypeOf(v); - var result = maxFast(v, splat(T, 0.0)); - result = minFast(result, splat(T, 1.0)); - return result; -} -test "zmath.saturateFast" { - { - const v0 = f32x4(-1.0, 0.2, 1.1, -0.3); - const v = saturateFast(v0); - try expectVecApproxEqAbs(v, f32x4(0.0, 0.2, 1.0, 0.0), 0.0001); - } - { - const v0 = f32x8(0.0, 0.0, 2.0, -2.0, -1.0, 0.2, 1.1, -0.3); - const v = saturateFast(v0); - try expectVecApproxEqAbs(v, f32x8(0.0, 0.0, 1.0, 0.0, 0.0, 0.2, 1.0, 0.0), 0.0001); - } - { - const v0 = f32x4(-math.inf(f32), math.inf(f32), math.nan(f32), math.snan(f32)); - const v = saturateFast(v0); - try expectVecApproxEqAbs(v, f32x4(0.0, 1.0, 0.0, 0.0), 0.0001); - } - { - const v0 = f32x4(math.inf(f32), math.inf(f32), -math.nan(f32), -math.snan(f32)); - const v = saturateFast(v0); - try expectVecApproxEqAbs(v, f32x4(1.0, 1.0, 0.0, 0.0), 0.0001); - } -} - -pub inline fn sqrt(v: anytype) @TypeOf(v) { - return @sqrt(v); // sqrtps -} - -pub inline fn abs(v: anytype) @TypeOf(v) { - return @abs(v); // load, andps -} - -pub inline fn select(mask: anytype, v0: anytype, v1: anytype) @TypeOf(v0, v1) { - return @select(f32, mask, v0, v1); -} - -pub inline fn lerp(v0: anytype, v1: anytype, t: f32) @TypeOf(v0, v1) { - const T = @TypeOf(v0, v1); - return v0 + (v1 - v0) * splat(T, t); // subps, shufps, addps, mulps -} - -pub inline fn lerpV(v0: anytype, v1: anytype, t: anytype) @TypeOf(v0, v1, t) { - return v0 + (v1 - v0) * t; // subps, addps, mulps -} - -pub inline fn lerpInverse(v0: anytype, v1: anytype, t: anytype) @TypeOf(v0, v1) { - const T = @TypeOf(v0, v1); - return (splat(T, t) - v0) / (v1 - v0); -} - -pub inline fn lerpInverseV(v0: anytype, v1: anytype, t: anytype) @TypeOf(v0, v1, t) { - return (t - v0) / (v1 - v0); -} -test "zmath.lerpInverse" { - try expect(math.approxEqAbs(f32, lerpInverseV(10.0, 100.0, 10.0), 0, 0.0005)); - try expect(math.approxEqAbs(f32, lerpInverseV(10.0, 100.0, 100.0), 1, 0.0005)); - try expect(math.approxEqAbs(f32, lerpInverseV(10.0, 100.0, 55.0), 0.5, 0.05)); - try expectVecApproxEqAbs(lerpInverse(f32x4(0, 0, 10, 10), f32x4(100, 200, 100, 100), 10.0), f32x4(0.1, 0.05, 0, 0), 0.0005); -} - -// Frame rate independent lerp (or "damp"), for approaching things over time. -// Reference: https://www.gamedeveloper.com/programming/improved-lerp-smoothing- -pub inline fn lerpOverTime(v0: anytype, v1: anytype, rate: anytype, dt: anytype) @TypeOf(v0, v1) { - const t = std.math.exp2(-rate * dt); - return lerp(v1, v0, t); -} - -pub inline fn lerpVOverTime(v0: anytype, v1: anytype, rate: anytype, dt: anytype) @TypeOf(v0, v1, rate, dt) { - const t = std.math.exp2(-rate * dt); - return lerpV(v1, v0, t); -} - -test "zmath.lerpOverTime" { - try expect(math.approxEqAbs(f32, lerpVOverTime(0.0, 1.0, 1.0, 1.0), 0.5, 0.0005)); - try expect(math.approxEqAbs(f32, lerpVOverTime(0.5, 1.0, 1.0, 1.0), 0.75, 0.0005)); - try expect(math.approxEqAbs(f32, lerpVOverTime(0.0, 1.0, 1.0, 0.0), 0.0, 0.0005)); - try expect(math.approxEqAbs(f32, lerpVOverTime(0.0, 1.0, 1.0, std.math.inf(f32)), 1.0, 0.0005)); - try expectVecApproxEqAbs(lerpOverTime(f32x4(0, 0, 10, 10), f32x4(100, 200, 100, 100), 1.0, 1.0), f32x4(50, 100, 55, 55), 0.0005); -} - -/// To transform a vector of values from one range to another. -pub inline fn mapLinear(v: anytype, min1: anytype, max1: anytype, min2: anytype, max2: anytype) @TypeOf(v) { - const T = @TypeOf(v); - const min1V = splat(T, min1); - const max1V = splat(T, max1); - const min2V = splat(T, min2); - const max2V = splat(T, max2); - const dV = max1V - min1V; - return min2V + (v - min1V) * (max2V - min2V) / dV; -} - -pub inline fn mapLinearV(v: anytype, min1: anytype, max1: anytype, min2: anytype, max2: anytype) @TypeOf(v, min1, max1, min2, max2) { - const d = max1 - min1; - return min2 + (v - min1) * (max2 - min2) / d; -} -test "zmath.mapLinear" { - try expect(math.approxEqAbs(f32, mapLinearV(0, 0, 1.2, 10, 100), 10, 0.0005)); - try expect(math.approxEqAbs(f32, mapLinearV(1.2, 0, 1.2, 10, 100), 100, 0.0005)); - try expect(math.approxEqAbs(f32, mapLinearV(0.6, 0, 1.2, 10, 100), 55, 0.0005)); - try expectVecApproxEqAbs(mapLinearV(splat(F32x4, 0), splat(F32x4, 0), splat(F32x4, 1.2), splat(F32x4, 10), splat(F32x4, 100)), splat(F32x4, 10), 0.0005); - try expectVecApproxEqAbs(mapLinear(f32x4(0, 0, 0.6, 1.2), 0, 1.2, 10, 100), f32x4(10, 10, 55, 100), 0.0005); -} - -pub const F32x4Component = enum { x, y, z, w }; - -pub inline fn swizzle( - v: F32x4, - comptime x: F32x4Component, - comptime y: F32x4Component, - comptime z: F32x4Component, - comptime w: F32x4Component, -) F32x4 { - return @shuffle(f32, v, undefined, [4]i32{ @intFromEnum(x), @intFromEnum(y), @intFromEnum(z), @intFromEnum(w) }); -} - -pub inline fn mod(v0: anytype, v1: anytype) @TypeOf(v0, v1) { - // vdivps, vroundps, vmulps, vsubps - return v0 - v1 * trunc(v0 / v1); -} -test "zmath.mod" { - try expectVecApproxEqAbs(mod(splat(F32x4, 3.1), splat(F32x4, 1.7)), splat(F32x4, 1.4), 0.0005); - try expectVecApproxEqAbs(mod(splat(F32x4, -3.0), splat(F32x4, 2.0)), splat(F32x4, -1.0), 0.0005); - try expectVecApproxEqAbs(mod(splat(F32x4, -3.0), splat(F32x4, -2.0)), splat(F32x4, -1.0), 0.0005); - try expectVecApproxEqAbs(mod(splat(F32x4, 3.0), splat(F32x4, -2.0)), splat(F32x4, 1.0), 0.0005); - try expect(all(isNan(mod(splat(F32x4, math.inf(f32)), splat(F32x4, 1.0))), 0)); - try expect(all(isNan(mod(splat(F32x4, -math.inf(f32)), splat(F32x4, 123.456))), 0)); - try expect(all(isNan(mod(splat(F32x4, math.nan(f32)), splat(F32x4, 123.456))), 0)); - try expect(all(isNan(mod(splat(F32x4, math.snan(f32)), splat(F32x4, 123.456))), 0)); - try expect(all(isNan(mod(splat(F32x4, -math.snan(f32)), splat(F32x4, 123.456))), 0)); - try expect(all(isNan(mod(splat(F32x4, 123.456), splat(F32x4, math.inf(f32)))), 0)); - try expect(all(isNan(mod(splat(F32x4, 123.456), splat(F32x4, -math.inf(f32)))), 0)); - try expect(all(isNan(mod(splat(F32x4, math.inf(f32)), splat(F32x4, math.inf(f32)))), 0)); - try expect(all(isNan(mod(splat(F32x4, 123.456), splat(F32x4, math.nan(f32)))), 0)); - try expect(all(isNan(mod(splat(F32x4, math.inf(f32)), splat(F32x4, math.nan(f32)))), 0)); -} - -pub fn modAngle(v: anytype) @TypeOf(v) { - const T = @TypeOf(v); - return switch (T) { - f32 => modAngle32(v), - F32x4, F32x8, F32x16 => modAngle32xN(v), - else => @compileError("zmath.modAngle() not implemented for " ++ @typeName(T)), - }; -} - -pub inline fn modAngle32xN(v: anytype) @TypeOf(v) { - const T = @TypeOf(v); - return v - splat(T, math.tau) * round(v * splat(T, 1.0 / math.tau)); // 2 x vmulps, 2 x load, vroundps, vaddps -} -test "zmath.modAngle" { - try expectVecApproxEqAbs(modAngle(splat(F32x4, math.tau)), splat(F32x4, 0.0), 0.0005); - try expectVecApproxEqAbs(modAngle(splat(F32x4, 0.0)), splat(F32x4, 0.0), 0.0005); - try expectVecApproxEqAbs(modAngle(splat(F32x4, math.pi)), splat(F32x4, math.pi), 0.0005); - try expectVecApproxEqAbs(modAngle(splat(F32x4, 11 * math.pi)), splat(F32x4, math.pi), 0.0005); - try expectVecApproxEqAbs(modAngle(splat(F32x4, 3.5 * math.pi)), splat(F32x4, -0.5 * math.pi), 0.0005); - try expectVecApproxEqAbs(modAngle(splat(F32x4, 2.5 * math.pi)), splat(F32x4, 0.5 * math.pi), 0.0005); -} - -pub inline fn mulAdd(v0: anytype, v1: anytype, v2: anytype) @TypeOf(v0, v1, v2) { - const T = @TypeOf(v0, v1, v2); - if (@import("zmath_options").enable_cross_platform_determinism) { - return v0 * v1 + v2; // Compiler will generate mul, add sequence (no fma even if the target supports it). - } else { - if (cpu_arch == .x86_64 and has_avx and has_fma) { - return @mulAdd(T, v0, v1, v2); - } else { - // NOTE(mziulek): On .x86_64 without HW fma instructions @mulAdd maps to really slow code! - return v0 * v1 + v2; - } - } -} - -fn sin32xN(v: anytype) @TypeOf(v) { - // 11-degree minimax approximation - const T = @TypeOf(v); - - var x = modAngle(v); - const sign = andInt(x, splatNegativeZero(T)); - const c = orInt(sign, splat(T, math.pi)); - const absx = andNotInt(sign, x); - const rflx = c - x; - const comp = absx <= splat(T, 0.5 * math.pi); - x = select(comp, x, rflx); - const x2 = x * x; - - var result = mulAdd(splat(T, -2.3889859e-08), x2, splat(T, 2.7525562e-06)); - result = mulAdd(result, x2, splat(T, -0.00019840874)); - result = mulAdd(result, x2, splat(T, 0.0083333310)); - result = mulAdd(result, x2, splat(T, -0.16666667)); - result = mulAdd(result, x2, splat(T, 1.0)); - return x * result; -} -test "zmath.sin" { - const epsilon = 0.0001; - - try expectVecApproxEqAbs(sin(splat(F32x4, 0.5 * math.pi)), splat(F32x4, 1.0), epsilon); - try expectVecApproxEqAbs(sin(splat(F32x4, 0.0)), splat(F32x4, 0.0), epsilon); - try expectVecApproxEqAbs(sin(splat(F32x4, -0.0)), splat(F32x4, -0.0), epsilon); - try expectVecApproxEqAbs(sin(splat(F32x4, 89.123)), splat(F32x4, 0.916166), epsilon); - try expectVecApproxEqAbs(sin(splat(F32x8, 89.123)), splat(F32x8, 0.916166), epsilon); - try expectVecApproxEqAbs(sin(splat(F32x16, 89.123)), splat(F32x16, 0.916166), epsilon); - try expect(all(isNan(sin(splat(F32x4, math.inf(f32)))), 0) == true); - try expect(all(isNan(sin(splat(F32x4, -math.inf(f32)))), 0) == true); - try expect(all(isNan(sin(splat(F32x4, math.nan(f32)))), 0) == true); - try expect(all(isNan(sin(splat(F32x4, math.snan(f32)))), 0) == true); - - var f: f32 = -100.0; - var i: u32 = 0; - while (i < 100) : (i += 1) { - const vr = sin(splat(F32x4, f)); - const fr = @sin(splat(F32x4, f)); - const vr8 = sin(splat(F32x8, f)); - const fr8 = @sin(splat(F32x8, f)); - const vr16 = sin(splat(F32x16, f)); - const fr16 = @sin(splat(F32x16, f)); - try expectVecApproxEqAbs(vr, fr, epsilon); - try expectVecApproxEqAbs(vr8, fr8, epsilon); - try expectVecApproxEqAbs(vr16, fr16, epsilon); - f += 0.12345 * @as(f32, @floatFromInt(i)); - } -} - -fn cos32xN(v: anytype) @TypeOf(v) { - // 10-degree minimax approximation - const T = @TypeOf(v); - - var x = modAngle(v); - var sign = andInt(x, splatNegativeZero(T)); - const c = orInt(sign, splat(T, math.pi)); - const absx = andNotInt(sign, x); - const rflx = c - x; - const comp = absx <= splat(T, 0.5 * math.pi); - x = select(comp, x, rflx); - sign = select(comp, splat(T, 1.0), splat(T, -1.0)); - const x2 = x * x; - - var result = mulAdd(splat(T, -2.6051615e-07), x2, splat(T, 2.4760495e-05)); - result = mulAdd(result, x2, splat(T, -0.0013888378)); - result = mulAdd(result, x2, splat(T, 0.041666638)); - result = mulAdd(result, x2, splat(T, -0.5)); - result = mulAdd(result, x2, splat(T, 1.0)); - return sign * result; -} -test "zmath.cos" { - const epsilon = 0.0001; - - try expectVecApproxEqAbs(cos(splat(F32x4, 0.5 * math.pi)), splat(F32x4, 0.0), epsilon); - try expectVecApproxEqAbs(cos(splat(F32x4, 0.0)), splat(F32x4, 1.0), epsilon); - try expectVecApproxEqAbs(cos(splat(F32x4, -0.0)), splat(F32x4, 1.0), epsilon); - try expect(all(isNan(cos(splat(F32x4, math.inf(f32)))), 0) == true); - try expect(all(isNan(cos(splat(F32x4, -math.inf(f32)))), 0) == true); - try expect(all(isNan(cos(splat(F32x4, math.nan(f32)))), 0) == true); - try expect(all(isNan(cos(splat(F32x4, math.snan(f32)))), 0) == true); - - var f: f32 = -100.0; - var i: u32 = 0; - while (i < 100) : (i += 1) { - const vr = cos(splat(F32x4, f)); - const fr = @cos(splat(F32x4, f)); - const vr8 = cos(splat(F32x8, f)); - const fr8 = @cos(splat(F32x8, f)); - const vr16 = cos(splat(F32x16, f)); - const fr16 = @cos(splat(F32x16, f)); - try expectVecApproxEqAbs(vr, fr, epsilon); - try expectVecApproxEqAbs(vr8, fr8, epsilon); - try expectVecApproxEqAbs(vr16, fr16, epsilon); - f += 0.12345 * @as(f32, @floatFromInt(i)); - } -} - -pub fn sin(v: anytype) @TypeOf(v) { - const T = @TypeOf(v); - return switch (T) { - f32 => sin32(v), - F32x4, F32x8, F32x16 => sin32xN(v), - else => @compileError("zmath.sin() not implemented for " ++ @typeName(T)), - }; -} - -pub fn cos(v: anytype) @TypeOf(v) { - const T = @TypeOf(v); - return switch (T) { - f32 => cos32(v), - F32x4, F32x8, F32x16 => cos32xN(v), - else => @compileError("zmath.cos() not implemented for " ++ @typeName(T)), - }; -} - -pub fn sincos(v: anytype) [2]@TypeOf(v) { - const T = @TypeOf(v); - return switch (T) { - f32 => sincos32(v), - F32x4, F32x8, F32x16 => sincos32xN(v), - else => @compileError("zmath.sincos() not implemented for " ++ @typeName(T)), - }; -} - -pub fn asin(v: anytype) @TypeOf(v) { - const T = @TypeOf(v); - return switch (T) { - f32 => asin32(v), - F32x4, F32x8, F32x16 => asin32xN(v), - else => @compileError("zmath.asin() not implemented for " ++ @typeName(T)), - }; -} - -pub fn acos(v: anytype) @TypeOf(v) { - const T = @TypeOf(v); - return switch (T) { - f32 => acos32(v), - F32x4, F32x8, F32x16 => acos32xN(v), - else => @compileError("zmath.acos() not implemented for " ++ @typeName(T)), - }; -} - -fn sincos32xN(v: anytype) [2]@TypeOf(v) { - const T = @TypeOf(v); - - var x = modAngle(v); - var sign = andInt(x, splatNegativeZero(T)); - const c = orInt(sign, splat(T, math.pi)); - const absx = andNotInt(sign, x); - const rflx = c - x; - const comp = absx <= splat(T, 0.5 * math.pi); - x = select(comp, x, rflx); - sign = select(comp, splat(T, 1.0), splat(T, -1.0)); - const x2 = x * x; - - var sresult = mulAdd(splat(T, -2.3889859e-08), x2, splat(T, 2.7525562e-06)); - sresult = mulAdd(sresult, x2, splat(T, -0.00019840874)); - sresult = mulAdd(sresult, x2, splat(T, 0.0083333310)); - sresult = mulAdd(sresult, x2, splat(T, -0.16666667)); - sresult = x * mulAdd(sresult, x2, splat(T, 1.0)); - - var cresult = mulAdd(splat(T, -2.6051615e-07), x2, splat(T, 2.4760495e-05)); - cresult = mulAdd(cresult, x2, splat(T, -0.0013888378)); - cresult = mulAdd(cresult, x2, splat(T, 0.041666638)); - cresult = mulAdd(cresult, x2, splat(T, -0.5)); - cresult = sign * mulAdd(cresult, x2, splat(T, 1.0)); - - return .{ sresult, cresult }; -} -test "zmath.sincos32xN" { - const epsilon = 0.0001; - - var f: f32 = -100.0; - var i: u32 = 0; - while (i < 100) : (i += 1) { - const sc = sincos(splat(F32x4, f)); - const sc8 = sincos(splat(F32x8, f)); - const sc16 = sincos(splat(F32x16, f)); - const s4 = @sin(splat(F32x4, f)); - const s8 = @sin(splat(F32x8, f)); - const s16 = @sin(splat(F32x16, f)); - const c4 = @cos(splat(F32x4, f)); - const c8 = @cos(splat(F32x8, f)); - const c16 = @cos(splat(F32x16, f)); - try expectVecApproxEqAbs(sc[0], s4, epsilon); - try expectVecApproxEqAbs(sc8[0], s8, epsilon); - try expectVecApproxEqAbs(sc16[0], s16, epsilon); - try expectVecApproxEqAbs(sc[1], c4, epsilon); - try expectVecApproxEqAbs(sc8[1], c8, epsilon); - try expectVecApproxEqAbs(sc16[1], c16, epsilon); - f += 0.12345 * @as(f32, @floatFromInt(i)); - } -} - -fn asin32xN(v: anytype) @TypeOf(v) { - // 7-degree minimax approximation - const T = @TypeOf(v); - - const x = abs(v); - const root = sqrt(maxFast(splat(T, 0.0), splat(T, 1.0) - x)); - - var t0 = mulAdd(splat(T, -0.0012624911), x, splat(T, 0.0066700901)); - t0 = mulAdd(t0, x, splat(T, -0.0170881256)); - t0 = mulAdd(t0, x, splat(T, 0.0308918810)); - t0 = mulAdd(t0, x, splat(T, -0.0501743046)); - t0 = mulAdd(t0, x, splat(T, 0.0889789874)); - t0 = mulAdd(t0, x, splat(T, -0.2145988016)); - t0 = root * mulAdd(t0, x, splat(T, 1.5707963050)); - - const t1 = splat(T, math.pi) - t0; - return splat(T, 0.5 * math.pi) - select(v >= splat(T, 0.0), t0, t1); -} - -fn acos32xN(v: anytype) @TypeOf(v) { - // 7-degree minimax approximation - const T = @TypeOf(v); - - const x = abs(v); - const root = sqrt(maxFast(splat(T, 0.0), splat(T, 1.0) - x)); - - var t0 = mulAdd(splat(T, -0.0012624911), x, splat(T, 0.0066700901)); - t0 = mulAdd(t0, x, splat(T, -0.0170881256)); - t0 = mulAdd(t0, x, splat(T, 0.0308918810)); - t0 = mulAdd(t0, x, splat(T, -0.0501743046)); - t0 = mulAdd(t0, x, splat(T, 0.0889789874)); - t0 = mulAdd(t0, x, splat(T, -0.2145988016)); - t0 = root * mulAdd(t0, x, splat(T, 1.5707963050)); - - const t1 = splat(T, math.pi) - t0; - return select(v >= splat(T, 0.0), t0, t1); -} - -pub fn atan(v: anytype) @TypeOf(v) { - // 17-degree minimax approximation - const T = @TypeOf(v); - - const vabs = abs(v); - const vinv = splat(T, 1.0) / v; - var sign = select(v > splat(T, 1.0), splat(T, 1.0), splat(T, -1.0)); - const comp = vabs <= splat(T, 1.0); - sign = select(comp, splat(T, 0.0), sign); - const x = select(comp, v, vinv); - const x2 = x * x; - - var result = mulAdd(splat(T, 0.0028662257), x2, splat(T, -0.0161657367)); - result = mulAdd(result, x2, splat(T, 0.0429096138)); - result = mulAdd(result, x2, splat(T, -0.0752896400)); - result = mulAdd(result, x2, splat(T, 0.1065626393)); - result = mulAdd(result, x2, splat(T, -0.1420889944)); - result = mulAdd(result, x2, splat(T, 0.1999355085)); - result = mulAdd(result, x2, splat(T, -0.3333314528)); - result = x * mulAdd(result, x2, splat(T, 1.0)); - - const result1 = sign * splat(T, 0.5 * math.pi) - result; - return select(sign == splat(T, 0.0), result, result1); -} -test "zmath.atan" { - const epsilon = 0.0001; - { - const v = f32x4(0.25, 0.5, 1.0, 1.25); - const e = f32x4(math.atan(v[0]), math.atan(v[1]), math.atan(v[2]), math.atan(v[3])); - try expectVecApproxEqAbs(e, atan(v), epsilon); - } - { - const v = f32x8(-0.25, 0.5, -1.0, 1.25, 100.0, -200.0, 300.0, 400.0); - // zig fmt: off - const e = f32x8( - math.atan(v[0]), math.atan(v[1]), math.atan(v[2]), math.atan(v[3]), - math.atan(v[4]), math.atan(v[5]), math.atan(v[6]), math.atan(v[7]), - ); - // zig fmt: on - try expectVecApproxEqAbs(e, atan(v), epsilon); - } - { - // zig fmt: off - const v = f32x16( - -0.25, 0.5, -1.0, 0.0, 0.1, -0.2, 30.0, 400.0, - -0.25, 0.5, -1.0, -0.0, -0.05, -0.125, 0.0625, 4000.0 - ); - const e = f32x16( - math.atan(v[0]), math.atan(v[1]), math.atan(v[2]), math.atan(v[3]), - math.atan(v[4]), math.atan(v[5]), math.atan(v[6]), math.atan(v[7]), - math.atan(v[8]), math.atan(v[9]), math.atan(v[10]), math.atan(v[11]), - math.atan(v[12]), math.atan(v[13]), math.atan(v[14]), math.atan(v[15]), - ); - // zig fmt: on - try expectVecApproxEqAbs(e, atan(v), epsilon); - } - { - try expectVecApproxEqAbs(atan(splat(F32x4, math.inf(f32))), splat(F32x4, 0.5 * math.pi), epsilon); - try expectVecApproxEqAbs(atan(splat(F32x4, -math.inf(f32))), splat(F32x4, -0.5 * math.pi), epsilon); - try expect(all(isNan(atan(splat(F32x4, math.nan(f32)))), 0) == true); - try expect(all(isNan(atan(splat(F32x4, -math.nan(f32)))), 0) == true); - } -} - -pub fn atan2(vy: anytype, vx: anytype) @TypeOf(vx, vy) { - const T = @TypeOf(vx, vy); - const Tu = @Vector(veclen(T), u32); - - const vx_is_positive = - (@as(Tu, @bitCast(vx)) & @as(Tu, @splat(0x8000_0000))) == @as(Tu, @splat(0)); - - const vy_sign = andInt(vy, splatNegativeZero(T)); - const c0_25pi = orInt(vy_sign, @as(T, @splat(0.25 * math.pi))); - const c0_50pi = orInt(vy_sign, @as(T, @splat(0.50 * math.pi))); - const c0_75pi = orInt(vy_sign, @as(T, @splat(0.75 * math.pi))); - const c1_00pi = orInt(vy_sign, @as(T, @splat(1.00 * math.pi))); - - var r1 = select(vx_is_positive, vy_sign, c1_00pi); - var r2 = select(vx == splat(T, 0.0), c0_50pi, splatInt(T, 0xffff_ffff)); - const r3 = select(vy == splat(T, 0.0), r1, r2); - const r4 = select(vx_is_positive, c0_25pi, c0_75pi); - const r5 = select(isInf(vx), r4, c0_50pi); - const result = select(isInf(vy), r5, r3); - const result_valid = @as(Tu, @bitCast(result)) == @as(Tu, @splat(0xffff_ffff)); - - const v = vy / vx; - const r0 = atan(v); - - r1 = select(vx_is_positive, splatNegativeZero(T), c1_00pi); - r2 = r0 + r1; - - return select(result_valid, r2, result); -} -test "zmath.atan2" { - // From DirectXMath XMVectorATan2(): - // - // Return the inverse tangent of Y / X in the range of -Pi to Pi with the following exceptions: - - // Y == 0 and X is Negative -> Pi with the sign of Y - // y == 0 and x is positive -> 0 with the sign of y - // Y != 0 and X == 0 -> Pi / 2 with the sign of Y - // Y != 0 and X is Negative -> atan(y/x) + (PI with the sign of Y) - // X == -Infinity and Finite Y -> Pi with the sign of Y - // X == +Infinity and Finite Y -> 0 with the sign of Y - // Y == Infinity and X is Finite -> Pi / 2 with the sign of Y - // Y == Infinity and X == -Infinity -> 3Pi / 4 with the sign of Y - // Y == Infinity and X == +Infinity -> Pi / 4 with the sign of Y - - const epsilon = 0.0001; - try expectVecApproxEqAbs(atan2(splat(F32x4, 0.0), splat(F32x4, -1.0)), splat(F32x4, math.pi), epsilon); - try expectVecApproxEqAbs(atan2(splat(F32x4, -0.0), splat(F32x4, -1.0)), splat(F32x4, -math.pi), epsilon); - try expectVecApproxEqAbs(atan2(splat(F32x4, 1.0), splat(F32x4, 0.0)), splat(F32x4, 0.5 * math.pi), epsilon); - try expectVecApproxEqAbs(atan2(splat(F32x4, -1.0), splat(F32x4, 0.0)), splat(F32x4, -0.5 * math.pi), epsilon); - try expectVecApproxEqAbs( - atan2(splat(F32x4, 1.0), splat(F32x4, -1.0)), - splat(F32x4, math.atan(@as(f32, -1.0)) + math.pi), - epsilon, - ); - try expectVecApproxEqAbs( - atan2(splat(F32x4, -10.0), splat(F32x4, -2.0)), - splat(F32x4, math.atan(@as(f32, 5.0)) - math.pi), - epsilon, - ); - try expectVecApproxEqAbs(atan2(splat(F32x4, 1.0), splat(F32x4, -math.inf(f32))), splat(F32x4, math.pi), epsilon); - try expectVecApproxEqAbs(atan2(splat(F32x4, -1.0), splat(F32x4, -math.inf(f32))), splat(F32x4, -math.pi), epsilon); - try expectVecApproxEqAbs(atan2(splat(F32x4, 1.0), splat(F32x4, math.inf(f32))), splat(F32x4, 0.0), epsilon); - try expectVecApproxEqAbs(atan2(splat(F32x4, -1.0), splat(F32x4, math.inf(f32))), splat(F32x4, -0.0), epsilon); - try expectVecApproxEqAbs( - atan2(splat(F32x4, math.inf(f32)), splat(F32x4, 2.0)), - splat(F32x4, 0.5 * math.pi), - epsilon, - ); - try expectVecApproxEqAbs( - atan2(splat(F32x4, -math.inf(f32)), splat(F32x4, 2.0)), - splat(F32x4, -0.5 * math.pi), - epsilon, - ); - try expectVecApproxEqAbs( - atan2(splat(F32x4, math.inf(f32)), splat(F32x4, -math.inf(f32))), - splat(F32x4, 0.75 * math.pi), - epsilon, - ); - try expectVecApproxEqAbs( - atan2(splat(F32x4, -math.inf(f32)), splat(F32x4, -math.inf(f32))), - splat(F32x4, -0.75 * math.pi), - epsilon, - ); - try expectVecApproxEqAbs( - atan2(splat(F32x4, math.inf(f32)), splat(F32x4, math.inf(f32))), - splat(F32x4, 0.25 * math.pi), - epsilon, - ); - try expectVecApproxEqAbs( - atan2(splat(F32x4, -math.inf(f32)), splat(F32x4, math.inf(f32))), - splat(F32x4, -0.25 * math.pi), - epsilon, - ); - try expectVecApproxEqAbs( - atan2( - f32x8(0.0, -math.inf(f32), -0.0, 2.0, math.inf(f32), math.inf(f32), 1.0, -math.inf(f32)), - f32x8(-2.0, math.inf(f32), 1.0, 0.0, 10.0, -math.inf(f32), 1.0, -math.inf(f32)), - ), - f32x8( - math.pi, - -0.25 * math.pi, - -0.0, - 0.5 * math.pi, - 0.5 * math.pi, - 0.75 * math.pi, - math.atan(@as(f32, 1.0)), - -0.75 * math.pi, - ), - epsilon, - ); - try expectVecApproxEqAbs(atan2(splat(F32x4, 0.0), splat(F32x4, 0.0)), splat(F32x4, 0.0), epsilon); - try expectVecApproxEqAbs(atan2(splat(F32x4, -0.0), splat(F32x4, 0.0)), splat(F32x4, 0.0), epsilon); - try expect(all(isNan(atan2(splat(F32x4, 1.0), splat(F32x4, math.nan(f32)))), 0) == true); - try expect(all(isNan(atan2(splat(F32x4, -1.0), splat(F32x4, math.nan(f32)))), 0) == true); - try expect(all(isNan(atan2(splat(F32x4, math.nan(f32)), splat(F32x4, -1.0))), 0) == true); - try expect(all(isNan(atan2(splat(F32x4, -math.nan(f32)), splat(F32x4, 1.0))), 0) == true); -} -// ------------------------------------------------------------------------------ -// -// 3. 2D, 3D, 4D vector functions -// -// ------------------------------------------------------------------------------ -pub inline fn dot2(v0: Vec, v1: Vec) F32x4 { - var xmm0 = v0 * v1; // | x0*x1 | y0*y1 | -- | -- | - const xmm1 = swizzle(xmm0, .y, .x, .x, .x); // | y0*y1 | -- | -- | -- | - xmm0 = f32x4(xmm0[0] + xmm1[0], xmm0[1], xmm0[2], xmm0[3]); // | x0*x1 + y0*y1 | -- | -- | -- | - return swizzle(xmm0, .x, .x, .x, .x); -} -test "zmath.dot2" { - const v0 = f32x4(-1.0, 2.0, 300.0, -2.0); - const v1 = f32x4(4.0, 5.0, 600.0, 2.0); - const v = dot2(v0, v1); - try expectVecApproxEqAbs(v, splat(F32x4, 6.0), 0.0001); -} - -pub inline fn dot3(v0: Vec, v1: Vec) F32x4 { - const dot = v0 * v1; - return f32x4s(dot[0] + dot[1] + dot[2]); -} -test "zmath.dot3" { - const v0 = f32x4(-1.0, 2.0, 3.0, 1.0); - const v1 = f32x4(4.0, 5.0, 6.0, 1.0); - const v = dot3(v0, v1); - try expectVecApproxEqAbs(v, splat(F32x4, 24.0), 0.0001); -} - -pub inline fn dot4(v0: Vec, v1: Vec) F32x4 { - var xmm0 = v0 * v1; // | x0*x1 | y0*y1 | z0*z1 | w0*w1 | - var xmm1 = swizzle(xmm0, .y, .x, .w, .x); // | y0*y1 | -- | w0*w1 | -- | - xmm1 = xmm0 + xmm1; // | x0*x1 + y0*y1 | -- | z0*z1 + w0*w1 | -- | - xmm0 = swizzle(xmm1, .z, .x, .x, .x); // | z0*z1 + w0*w1 | -- | -- | -- | - xmm0 = f32x4(xmm0[0] + xmm1[0], xmm0[1], xmm0[2], xmm0[2]); // addss - return swizzle(xmm0, .x, .x, .x, .x); -} -test "zmath.dot4" { - const v0 = f32x4(-1.0, 2.0, 3.0, -2.0); - const v1 = f32x4(4.0, 5.0, 6.0, 2.0); - const v = dot4(v0, v1); - try expectVecApproxEqAbs(v, splat(F32x4, 20.0), 0.0001); -} - -pub inline fn cross3(v0: Vec, v1: Vec) Vec { - var xmm0 = swizzle(v0, .y, .z, .x, .w); - var xmm1 = swizzle(v1, .z, .x, .y, .w); - var result = xmm0 * xmm1; - xmm0 = swizzle(xmm0, .y, .z, .x, .w); - xmm1 = swizzle(xmm1, .z, .x, .y, .w); - result = result - xmm0 * xmm1; - return andInt(result, f32x4_mask3); -} -test "zmath.cross3" { - { - const v0 = f32x4(1.0, 0.0, 0.0, 1.0); - const v1 = f32x4(0.0, 1.0, 0.0, 1.0); - const v = cross3(v0, v1); - try expectVecApproxEqAbs(v, f32x4(0.0, 0.0, 1.0, 0.0), 0.0001); - } - { - const v0 = f32x4(1.0, 0.0, 0.0, 1.0); - const v1 = f32x4(0.0, -1.0, 0.0, 1.0); - const v = cross3(v0, v1); - try expectVecApproxEqAbs(v, f32x4(0.0, 0.0, -1.0, 0.0), 0.0001); - } - { - const v0 = f32x4(-3.0, 0, -2.0, 1.0); - const v1 = f32x4(5.0, -1.0, 2.0, 1.0); - const v = cross3(v0, v1); - try expectVecApproxEqAbs(v, f32x4(-2.0, -4.0, 3.0, 0.0), 0.0001); - } -} - -pub inline fn lengthSq2(v: Vec) F32x4 { - return dot2(v, v); -} -pub inline fn lengthSq3(v: Vec) F32x4 { - return dot3(v, v); -} -pub inline fn lengthSq4(v: Vec) F32x4 { - return dot4(v, v); -} - -pub inline fn length2(v: Vec) F32x4 { - return sqrt(dot2(v, v)); -} -pub inline fn length3(v: Vec) F32x4 { - return sqrt(dot3(v, v)); -} -pub inline fn length4(v: Vec) F32x4 { - return sqrt(dot4(v, v)); -} -test "zmath.length3" { - { - const v = length3(f32x4(1.0, -2.0, 3.0, 1000.0)); - try expectVecApproxEqAbs(v, splat(F32x4, math.sqrt(14.0)), 0.001); - } - { - const v = length3(f32x4(1.0, math.nan(f32), math.nan(f32), 1000.0)); - try expect(all(isNan(v), 0)); - } - { - const v = length3(f32x4(1.0, math.inf(f32), 3.0, 1000.0)); - try expect(all(isInf(v), 0)); - } - { - const v = length3(f32x4(3.0, 2.0, 1.0, math.nan(f32))); - try expectVecApproxEqAbs(v, splat(F32x4, math.sqrt(14.0)), 0.001); - } -} - -pub inline fn normalize2(v: Vec) Vec { - return v * splat(F32x4, 1.0) / sqrt(dot2(v, v)); -} -pub inline fn normalize3(v: Vec) Vec { - return v * splat(F32x4, 1.0) / sqrt(dot3(v, v)); -} -pub inline fn normalize4(v: Vec) Vec { - return v * splat(F32x4, 1.0) / sqrt(dot4(v, v)); -} -test "zmath.normalize3" { - { - const v0 = f32x4(1.0, -2.0, 3.0, 1000.0); - const v = normalize3(v0); - try expectVecApproxEqAbs(v, v0 * splat(F32x4, 1.0 / math.sqrt(14.0)), 0.0005); - } - { - try expect(any(isNan(normalize3(f32x4(1.0, math.inf(f32), 1.0, 1.0))), 0)); - try expect(any(isNan(normalize3(f32x4(-math.inf(f32), math.inf(f32), 0.0, 0.0))), 0)); - try expect(any(isNan(normalize3(f32x4(-math.nan(f32), math.snan(f32), 0.0, 0.0))), 0)); - try expect(any(isNan(normalize3(f32x4(0, 0, 0, 0))), 0)); - } -} -test "zmath.normalize4" { - { - const v0 = f32x4(1.0, -2.0, 3.0, 10.0); - const v = normalize4(v0); - try expectVecApproxEqAbs(v, v0 * splat(F32x4, 1.0 / math.sqrt(114.0)), 0.0005); - } - { - try expect(any(isNan(normalize4(f32x4(1.0, math.inf(f32), 1.0, 1.0))), 0)); - try expect(any(isNan(normalize4(f32x4(-math.inf(f32), math.inf(f32), 0.0, 0.0))), 0)); - try expect(any(isNan(normalize4(f32x4(-math.nan(f32), math.snan(f32), 0.0, 0.0))), 0)); - try expect(any(isNan(normalize4(f32x4(0, 0, 0, 0))), 0)); - } -} - -fn vecMulMat(v: Vec, m: Mat) Vec { - const vx = @shuffle(f32, v, undefined, [4]i32{ 0, 0, 0, 0 }); - const vy = @shuffle(f32, v, undefined, [4]i32{ 1, 1, 1, 1 }); - const vz = @shuffle(f32, v, undefined, [4]i32{ 2, 2, 2, 2 }); - const vw = @shuffle(f32, v, undefined, [4]i32{ 3, 3, 3, 3 }); - return vx * m[0] + vy * m[1] + vz * m[2] + vw * m[3]; -} -fn matMulVec(m: Mat, v: Vec) Vec { - return .{ dot4(m[0], v)[0], dot4(m[1], v)[0], dot4(m[2], v)[0], dot4(m[3], v)[0] }; -} -test "zmath.vecMulMat" { - const m = Mat{ - f32x4(1.0, 0.0, 0.0, 0.0), - f32x4(0.0, 1.0, 0.0, 0.0), - f32x4(0.0, 0.0, 1.0, 0.0), - f32x4(2.0, 3.0, 4.0, 1.0), - }; - const vm = mul(f32x4(1.0, 2.0, 3.0, 1.0), m); - const mv = mul(m, f32x4(1.0, 2.0, 3.0, 1.0)); - const v = mul(transpose(m), f32x4(1.0, 2.0, 3.0, 1.0)); - try expectVecApproxEqAbs(vm, f32x4(3.0, 5.0, 7.0, 1.0), 0.0001); - try expectVecApproxEqAbs(mv, f32x4(1.0, 2.0, 3.0, 21.0), 0.0001); - try expectVecApproxEqAbs(v, f32x4(3.0, 5.0, 7.0, 1.0), 0.0001); -} -// ------------------------------------------------------------------------------ -// -// 4. Matrix functions -// -// ------------------------------------------------------------------------------ -pub fn identity() Mat { - const static = struct { - const identity = Mat{ - f32x4(1.0, 0.0, 0.0, 0.0), - f32x4(0.0, 1.0, 0.0, 0.0), - f32x4(0.0, 0.0, 1.0, 0.0), - f32x4(0.0, 0.0, 0.0, 1.0), - }; - }; - return static.identity; -} - -pub fn matFromArr(arr: [16]f32) Mat { - return Mat{ - f32x4(arr[0], arr[1], arr[2], arr[3]), - f32x4(arr[4], arr[5], arr[6], arr[7]), - f32x4(arr[8], arr[9], arr[10], arr[11]), - f32x4(arr[12], arr[13], arr[14], arr[15]), - }; -} - -fn mulRetType(comptime Ta: type, comptime Tb: type) type { - if (Ta == Mat and Tb == Mat) { - return Mat; - } else if ((Ta == f32 and Tb == Mat) or (Ta == Mat and Tb == f32)) { - return Mat; - } else if ((Ta == Vec and Tb == Mat) or (Ta == Mat and Tb == Vec)) { - return Vec; - } - @compileError("zmath.mul() not implemented for types: " ++ @typeName(Ta) ++ @typeName(Tb)); -} - -pub fn mul(a: anytype, b: anytype) mulRetType(@TypeOf(a), @TypeOf(b)) { - const Ta = @TypeOf(a); - const Tb = @TypeOf(b); - if (Ta == Mat and Tb == Mat) { - return mulMat(a, b); - } else if (Ta == f32 and Tb == Mat) { - const va = splat(F32x4, a); - return Mat{ va * b[0], va * b[1], va * b[2], va * b[3] }; - } else if (Ta == Mat and Tb == f32) { - const vb = splat(F32x4, b); - return Mat{ a[0] * vb, a[1] * vb, a[2] * vb, a[3] * vb }; - } else if (Ta == Vec and Tb == Mat) { - return vecMulMat(a, b); - } else if (Ta == Mat and Tb == Vec) { - return matMulVec(a, b); - } else { - @compileError("zmath.mul() not implemented for types: " ++ @typeName(Ta) ++ ", " ++ @typeName(Tb)); - } -} -test "zmath.mul" { - { - const m = Mat{ - f32x4(0.1, 0.2, 0.3, 0.4), - f32x4(0.5, 0.6, 0.7, 0.8), - f32x4(0.9, 1.0, 1.1, 1.2), - f32x4(1.3, 1.4, 1.5, 1.6), - }; - const ms = mul(@as(f32, 2.0), m); - try expectVecApproxEqAbs(ms[0], f32x4(0.2, 0.4, 0.6, 0.8), 0.0001); - try expectVecApproxEqAbs(ms[1], f32x4(1.0, 1.2, 1.4, 1.6), 0.0001); - try expectVecApproxEqAbs(ms[2], f32x4(1.8, 2.0, 2.2, 2.4), 0.0001); - try expectVecApproxEqAbs(ms[3], f32x4(2.6, 2.8, 3.0, 3.2), 0.0001); - } -} - -fn mulMat(m0: Mat, m1: Mat) Mat { - var result: Mat = undefined; - comptime var row: u32 = 0; - inline while (row < 4) : (row += 1) { - const vx = swizzle(m0[row], .x, .x, .x, .x); - const vy = swizzle(m0[row], .y, .y, .y, .y); - const vz = swizzle(m0[row], .z, .z, .z, .z); - const vw = swizzle(m0[row], .w, .w, .w, .w); - result[row] = mulAdd(vx, m1[0], vz * m1[2]) + mulAdd(vy, m1[1], vw * m1[3]); - } - return result; -} -test "zmath.matrix.mul" { - const a = Mat{ - f32x4(0.1, 0.2, 0.3, 0.4), - f32x4(0.5, 0.6, 0.7, 0.8), - f32x4(0.9, 1.0, 1.1, 1.2), - f32x4(1.3, 1.4, 1.5, 1.6), - }; - const b = Mat{ - f32x4(1.7, 1.8, 1.9, 2.0), - f32x4(2.1, 2.2, 2.3, 2.4), - f32x4(2.5, 2.6, 2.7, 2.8), - f32x4(2.9, 3.0, 3.1, 3.2), - }; - const c = mul(a, b); - try expectVecApproxEqAbs(c[0], f32x4(2.5, 2.6, 2.7, 2.8), 0.0001); - try expectVecApproxEqAbs(c[1], f32x4(6.18, 6.44, 6.7, 6.96), 0.0001); - try expectVecApproxEqAbs(c[2], f32x4(9.86, 10.28, 10.7, 11.12), 0.0001); - try expectVecApproxEqAbs(c[3], f32x4(13.54, 14.12, 14.7, 15.28), 0.0001); -} - -pub fn transpose(m: Mat) Mat { - const temp1 = @shuffle(f32, m[0], m[1], [4]i32{ 0, 1, ~@as(i32, 0), ~@as(i32, 1) }); - const temp3 = @shuffle(f32, m[0], m[1], [4]i32{ 2, 3, ~@as(i32, 2), ~@as(i32, 3) }); - const temp2 = @shuffle(f32, m[2], m[3], [4]i32{ 0, 1, ~@as(i32, 0), ~@as(i32, 1) }); - const temp4 = @shuffle(f32, m[2], m[3], [4]i32{ 2, 3, ~@as(i32, 2), ~@as(i32, 3) }); - return .{ - @shuffle(f32, temp1, temp2, [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) }), - @shuffle(f32, temp1, temp2, [4]i32{ 1, 3, ~@as(i32, 1), ~@as(i32, 3) }), - @shuffle(f32, temp3, temp4, [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) }), - @shuffle(f32, temp3, temp4, [4]i32{ 1, 3, ~@as(i32, 1), ~@as(i32, 3) }), - }; -} -test "zmath.matrix.transpose" { - const m = Mat{ - f32x4(1.0, 2.0, 3.0, 4.0), - f32x4(5.0, 6.0, 7.0, 8.0), - f32x4(9.0, 10.0, 11.0, 12.0), - f32x4(13.0, 14.0, 15.0, 16.0), - }; - const mt = transpose(m); - try expectVecApproxEqAbs(mt[0], f32x4(1.0, 5.0, 9.0, 13.0), 0.0001); - try expectVecApproxEqAbs(mt[1], f32x4(2.0, 6.0, 10.0, 14.0), 0.0001); - try expectVecApproxEqAbs(mt[2], f32x4(3.0, 7.0, 11.0, 15.0), 0.0001); - try expectVecApproxEqAbs(mt[3], f32x4(4.0, 8.0, 12.0, 16.0), 0.0001); -} - -pub fn rotationX(angle: f32) Mat { - const sc = sincos(angle); - return .{ - f32x4(1.0, 0.0, 0.0, 0.0), - f32x4(0.0, sc[1], sc[0], 0.0), - f32x4(0.0, -sc[0], sc[1], 0.0), - f32x4(0.0, 0.0, 0.0, 1.0), - }; -} - -pub fn rotationY(angle: f32) Mat { - const sc = sincos(angle); - return .{ - f32x4(sc[1], 0.0, -sc[0], 0.0), - f32x4(0.0, 1.0, 0.0, 0.0), - f32x4(sc[0], 0.0, sc[1], 0.0), - f32x4(0.0, 0.0, 0.0, 1.0), - }; -} - -pub fn rotationZ(angle: f32) Mat { - const sc = sincos(angle); - return .{ - f32x4(sc[1], sc[0], 0.0, 0.0), - f32x4(-sc[0], sc[1], 0.0, 0.0), - f32x4(0.0, 0.0, 1.0, 0.0), - f32x4(0.0, 0.0, 0.0, 1.0), - }; -} - -pub fn translation(x: f32, y: f32, z: f32) Mat { - return .{ - f32x4(1.0, 0.0, 0.0, 0.0), - f32x4(0.0, 1.0, 0.0, 0.0), - f32x4(0.0, 0.0, 1.0, 0.0), - f32x4(x, y, z, 1.0), - }; -} -pub fn translationV(v: Vec) Mat { - return translation(v[0], v[1], v[2]); -} - -pub fn scaling(x: f32, y: f32, z: f32) Mat { - return .{ - f32x4(x, 0.0, 0.0, 0.0), - f32x4(0.0, y, 0.0, 0.0), - f32x4(0.0, 0.0, z, 0.0), - f32x4(0.0, 0.0, 0.0, 1.0), - }; -} -pub fn scalingV(v: Vec) Mat { - return scaling(v[0], v[1], v[2]); -} - -pub fn lookToLh(eyepos: Vec, eyedir: Vec, updir: Vec) Mat { - const az = normalize3(eyedir); - const ax = normalize3(cross3(updir, az)); - const ay = normalize3(cross3(az, ax)); - return .{ - f32x4(ax[0], ay[0], az[0], 0), - f32x4(ax[1], ay[1], az[1], 0), - f32x4(ax[2], ay[2], az[2], 0), - f32x4(-dot3(ax, eyepos)[0], -dot3(ay, eyepos)[0], -dot3(az, eyepos)[0], 1.0), - }; -} -pub fn lookToRh(eyepos: Vec, eyedir: Vec, updir: Vec) Mat { - return lookToLh(eyepos, -eyedir, updir); -} -pub fn lookAtLh(eyepos: Vec, focuspos: Vec, updir: Vec) Mat { - return lookToLh(eyepos, focuspos - eyepos, updir); -} -pub fn lookAtRh(eyepos: Vec, focuspos: Vec, updir: Vec) Mat { - return lookToLh(eyepos, eyepos - focuspos, updir); -} -test "zmath.matrix.lookToLh" { - const m = lookToLh(f32x4(0.0, 0.0, -3.0, 1.0), f32x4(0.0, 0.0, 1.0, 0.0), f32x4(0.0, 1.0, 0.0, 0.0)); - try expectVecApproxEqAbs(m[0], f32x4(1.0, 0.0, 0.0, 0.0), 0.001); - try expectVecApproxEqAbs(m[1], f32x4(0.0, 1.0, 0.0, 0.0), 0.001); - try expectVecApproxEqAbs(m[2], f32x4(0.0, 0.0, 1.0, 0.0), 0.001); - try expectVecApproxEqAbs(m[3], f32x4(0.0, 0.0, 3.0, 1.0), 0.001); -} - -pub fn perspectiveFovLh(fovy: f32, aspect: f32, near: f32, far: f32) Mat { - const scfov = sincos(0.5 * fovy); - - assert(near > 0.0 and far > 0.0); - assert(!math.approxEqAbs(f32, scfov[0], 0.0, 0.001)); - assert(!math.approxEqAbs(f32, far, near, 0.001)); - assert(!math.approxEqAbs(f32, aspect, 0.0, 0.01)); - - const h = scfov[1] / scfov[0]; - const w = h / aspect; - const r = far / (far - near); - return .{ - f32x4(w, 0.0, 0.0, 0.0), - f32x4(0.0, h, 0.0, 0.0), - f32x4(0.0, 0.0, r, 1.0), - f32x4(0.0, 0.0, -r * near, 0.0), - }; -} -pub fn perspectiveFovRh(fovy: f32, aspect: f32, near: f32, far: f32) Mat { - const scfov = sincos(0.5 * fovy); - - assert(near > 0.0 and far > 0.0); - assert(!math.approxEqAbs(f32, scfov[0], 0.0, 0.001)); - assert(!math.approxEqAbs(f32, far, near, 0.001)); - assert(!math.approxEqAbs(f32, aspect, 0.0, 0.01)); - - const h = scfov[1] / scfov[0]; - const w = h / aspect; - const r = far / (near - far); - return .{ - f32x4(w, 0.0, 0.0, 0.0), - f32x4(0.0, h, 0.0, 0.0), - f32x4(0.0, 0.0, r, -1.0), - f32x4(0.0, 0.0, r * near, 0.0), - }; -} - -// Produces Z values in [-1.0, 1.0] range (OpenGL defaults) -pub fn perspectiveFovLhGl(fovy: f32, aspect: f32, near: f32, far: f32) Mat { - const scfov = sincos(0.5 * fovy); - - assert(near > 0.0 and far > 0.0); - assert(!math.approxEqAbs(f32, scfov[0], 0.0, 0.001)); - assert(!math.approxEqAbs(f32, far, near, 0.001)); - assert(!math.approxEqAbs(f32, aspect, 0.0, 0.01)); - - const h = scfov[1] / scfov[0]; - const w = h / aspect; - const r = far - near; - return .{ - f32x4(w, 0.0, 0.0, 0.0), - f32x4(0.0, h, 0.0, 0.0), - f32x4(0.0, 0.0, (near + far) / r, 1.0), - f32x4(0.0, 0.0, 2.0 * near * far / -r, 0.0), - }; -} - -// Produces Z values in [-1.0, 1.0] range (OpenGL defaults) -pub fn perspectiveFovRhGl(fovy: f32, aspect: f32, near: f32, far: f32) Mat { - const scfov = sincos(0.5 * fovy); - - assert(near > 0.0 and far > 0.0); - assert(!math.approxEqAbs(f32, scfov[0], 0.0, 0.001)); - assert(!math.approxEqAbs(f32, far, near, 0.001)); - assert(!math.approxEqAbs(f32, aspect, 0.0, 0.01)); - - const h = scfov[1] / scfov[0]; - const w = h / aspect; - const r = near - far; - return .{ - f32x4(w, 0.0, 0.0, 0.0), - f32x4(0.0, h, 0.0, 0.0), - f32x4(0.0, 0.0, (near + far) / r, -1.0), - f32x4(0.0, 0.0, 2.0 * near * far / r, 0.0), - }; -} - -pub fn orthographicLh(w: f32, h: f32, near: f32, far: f32) Mat { - assert(!math.approxEqAbs(f32, w, 0.0, 0.001)); - assert(!math.approxEqAbs(f32, h, 0.0, 0.001)); - assert(!math.approxEqAbs(f32, far, near, 0.001)); - - const r = 1 / (far - near); - return .{ - f32x4(2 / w, 0.0, 0.0, 0.0), - f32x4(0.0, 2 / h, 0.0, 0.0), - f32x4(0.0, 0.0, r, 0.0), - f32x4(0.0, 0.0, -r * near, 1.0), - }; -} - -pub fn orthographicRh(w: f32, h: f32, near: f32, far: f32) Mat { - assert(!math.approxEqAbs(f32, w, 0.0, 0.001)); - assert(!math.approxEqAbs(f32, h, 0.0, 0.001)); - assert(!math.approxEqAbs(f32, far, near, 0.001)); - - const r = 1 / (near - far); - return .{ - f32x4(2 / w, 0.0, 0.0, 0.0), - f32x4(0.0, 2 / h, 0.0, 0.0), - f32x4(0.0, 0.0, r, 0.0), - f32x4(0.0, 0.0, r * near, 1.0), - }; -} - -// Produces Z values in [-1.0, 1.0] range (OpenGL defaults) -pub fn orthographicLhGl(w: f32, h: f32, near: f32, far: f32) Mat { - assert(!math.approxEqAbs(f32, w, 0.0, 0.001)); - assert(!math.approxEqAbs(f32, h, 0.0, 0.001)); - assert(!math.approxEqAbs(f32, far, near, 0.001)); - - const r = far - near; - return .{ - f32x4(2 / w, 0.0, 0.0, 0.0), - f32x4(0.0, 2 / h, 0.0, 0.0), - f32x4(0.0, 0.0, 2 / r, 0.0), - f32x4(0.0, 0.0, (near + far) / -r, 1.0), - }; -} - -// Produces Z values in [-1.0, 1.0] range (OpenGL defaults) -pub fn orthographicRhGl(w: f32, h: f32, near: f32, far: f32) Mat { - assert(!math.approxEqAbs(f32, w, 0.0, 0.001)); - assert(!math.approxEqAbs(f32, h, 0.0, 0.001)); - assert(!math.approxEqAbs(f32, far, near, 0.001)); - - const r = near - far; - return .{ - f32x4(2 / w, 0.0, 0.0, 0.0), - f32x4(0.0, 2 / h, 0.0, 0.0), - f32x4(0.0, 0.0, 2 / r, 0.0), - f32x4(0.0, 0.0, (near + far) / r, 1.0), - }; -} - -pub fn orthographicOffCenterLh(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat { - assert(!math.approxEqAbs(f32, far, near, 0.001)); - - const r = 1 / (far - near); - return .{ - f32x4(2 / (right - left), 0.0, 0.0, 0.0), - f32x4(0.0, 2 / (top - bottom), 0.0, 0.0), - f32x4(0.0, 0.0, r, 0.0), - f32x4(-(right + left) / (right - left), -(top + bottom) / (top - bottom), -r * near, 1.0), - }; -} - -pub fn orthographicOffCenterRh(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat { - assert(!math.approxEqAbs(f32, far, near, 0.001)); - - const r = 1 / (near - far); - return .{ - f32x4(2 / (right - left), 0.0, 0.0, 0.0), - f32x4(0.0, 2 / (top - bottom), 0.0, 0.0), - f32x4(0.0, 0.0, r, 0.0), - f32x4(-(right + left) / (right - left), -(top + bottom) / (top - bottom), r * near, 1.0), - }; -} - -// Produces Z values in [-1.0, 1.0] range (OpenGL defaults) -pub fn orthographicOffCenterLhGl(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat { - assert(!math.approxEqAbs(f32, far, near, 0.001)); - - const r = far - near; - return .{ - f32x4(2 / (right - left), 0.0, 0.0, 0.0), - f32x4(0.0, 2 / (top - bottom), 0.0, 0.0), - f32x4(0.0, 0.0, 2 / r, 0.0), - f32x4(-(right + left) / (right - left), -(top + bottom) / (top - bottom), (near + far) / -r, 1.0), - }; -} - -// Produces Z values in [-1.0, 1.0] range (OpenGL defaults) -pub fn orthographicOffCenterRhGl(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat { - assert(!math.approxEqAbs(f32, far, near, 0.001)); - - const r = near - far; - return .{ - f32x4(2 / (right - left), 0.0, 0.0, 0.0), - f32x4(0.0, 2 / (top - bottom), 0.0, 0.0), - f32x4(0.0, 0.0, 2 / r, 0.0), - f32x4(-(right + left) / (right - left), -(top + bottom) / (top - bottom), (near + far) / r, 1.0), - }; -} - -pub fn determinant(m: Mat) F32x4 { - var v0 = swizzle(m[2], .y, .x, .x, .x); - var v1 = swizzle(m[3], .z, .z, .y, .y); - var v2 = swizzle(m[2], .y, .x, .x, .x); - var v3 = swizzle(m[3], .w, .w, .w, .z); - var v4 = swizzle(m[2], .z, .z, .y, .y); - var v5 = swizzle(m[3], .w, .w, .w, .z); - - var p0 = v0 * v1; - var p1 = v2 * v3; - var p2 = v4 * v5; - - v0 = swizzle(m[2], .z, .z, .y, .y); - v1 = swizzle(m[3], .y, .x, .x, .x); - v2 = swizzle(m[2], .w, .w, .w, .z); - v3 = swizzle(m[3], .y, .x, .x, .x); - v4 = swizzle(m[2], .w, .w, .w, .z); - v5 = swizzle(m[3], .z, .z, .y, .y); - - p0 = mulAdd(-v0, v1, p0); - p1 = mulAdd(-v2, v3, p1); - p2 = mulAdd(-v4, v5, p2); - - v0 = swizzle(m[1], .w, .w, .w, .z); - v1 = swizzle(m[1], .z, .z, .y, .y); - v2 = swizzle(m[1], .y, .x, .x, .x); - - const s = m[0] * f32x4(1.0, -1.0, 1.0, -1.0); - var r = v0 * p0; - r = mulAdd(-v1, p1, r); - r = mulAdd(v2, p2, r); - return dot4(s, r); -} -test "zmath.matrix.determinant" { - const m = Mat{ - f32x4(10.0, -9.0, -12.0, 1.0), - f32x4(7.0, -12.0, 11.0, 1.0), - f32x4(-10.0, 10.0, 3.0, 1.0), - f32x4(1.0, 2.0, 3.0, 4.0), - }; - try expectVecApproxEqAbs(determinant(m), splat(F32x4, 2939.0), 0.0001); -} - -pub fn inverse(a: anytype) @TypeOf(a) { - const T = @TypeOf(a); - return switch (T) { - Mat => inverseMat(a), - Quat => inverseQuat(a), - else => @compileError("zmath.inverse() not implemented for " ++ @typeName(T)), - }; -} - -fn inverseMat(m: Mat) Mat { - return inverseDet(m, null); -} - -pub fn inverseDet(m: Mat, out_det: ?*F32x4) Mat { - const mt = transpose(m); - var v0: [4]F32x4 = undefined; - var v1: [4]F32x4 = undefined; - - v0[0] = swizzle(mt[2], .x, .x, .y, .y); - v1[0] = swizzle(mt[3], .z, .w, .z, .w); - v0[1] = swizzle(mt[0], .x, .x, .y, .y); - v1[1] = swizzle(mt[1], .z, .w, .z, .w); - v0[2] = @shuffle(f32, mt[2], mt[0], [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) }); - v1[2] = @shuffle(f32, mt[3], mt[1], [4]i32{ 1, 3, ~@as(i32, 1), ~@as(i32, 3) }); - - var d0 = v0[0] * v1[0]; - var d1 = v0[1] * v1[1]; - var d2 = v0[2] * v1[2]; - - v0[0] = swizzle(mt[2], .z, .w, .z, .w); - v1[0] = swizzle(mt[3], .x, .x, .y, .y); - v0[1] = swizzle(mt[0], .z, .w, .z, .w); - v1[1] = swizzle(mt[1], .x, .x, .y, .y); - v0[2] = @shuffle(f32, mt[2], mt[0], [4]i32{ 1, 3, ~@as(i32, 1), ~@as(i32, 3) }); - v1[2] = @shuffle(f32, mt[3], mt[1], [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) }); - - d0 = mulAdd(-v0[0], v1[0], d0); - d1 = mulAdd(-v0[1], v1[1], d1); - d2 = mulAdd(-v0[2], v1[2], d2); - - v0[0] = swizzle(mt[1], .y, .z, .x, .y); - v1[0] = @shuffle(f32, d0, d2, [4]i32{ ~@as(i32, 1), 1, 3, 0 }); - v0[1] = swizzle(mt[0], .z, .x, .y, .x); - v1[1] = @shuffle(f32, d0, d2, [4]i32{ 3, ~@as(i32, 1), 1, 2 }); - v0[2] = swizzle(mt[3], .y, .z, .x, .y); - v1[2] = @shuffle(f32, d1, d2, [4]i32{ ~@as(i32, 3), 1, 3, 0 }); - v0[3] = swizzle(mt[2], .z, .x, .y, .x); - v1[3] = @shuffle(f32, d1, d2, [4]i32{ 3, ~@as(i32, 3), 1, 2 }); - - var c0 = v0[0] * v1[0]; - var c2 = v0[1] * v1[1]; - var c4 = v0[2] * v1[2]; - var c6 = v0[3] * v1[3]; - - v0[0] = swizzle(mt[1], .z, .w, .y, .z); - v1[0] = @shuffle(f32, d0, d2, [4]i32{ 3, 0, 1, ~@as(i32, 0) }); - v0[1] = swizzle(mt[0], .w, .z, .w, .y); - v1[1] = @shuffle(f32, d0, d2, [4]i32{ 2, 1, ~@as(i32, 0), 0 }); - v0[2] = swizzle(mt[3], .z, .w, .y, .z); - v1[2] = @shuffle(f32, d1, d2, [4]i32{ 3, 0, 1, ~@as(i32, 2) }); - v0[3] = swizzle(mt[2], .w, .z, .w, .y); - v1[3] = @shuffle(f32, d1, d2, [4]i32{ 2, 1, ~@as(i32, 2), 0 }); - - c0 = mulAdd(-v0[0], v1[0], c0); - c2 = mulAdd(-v0[1], v1[1], c2); - c4 = mulAdd(-v0[2], v1[2], c4); - c6 = mulAdd(-v0[3], v1[3], c6); - - v0[0] = swizzle(mt[1], .w, .x, .w, .x); - v1[0] = @shuffle(f32, d0, d2, [4]i32{ 2, ~@as(i32, 1), ~@as(i32, 0), 2 }); - v0[1] = swizzle(mt[0], .y, .w, .x, .z); - v1[1] = @shuffle(f32, d0, d2, [4]i32{ ~@as(i32, 1), 0, 3, ~@as(i32, 0) }); - v0[2] = swizzle(mt[3], .w, .x, .w, .x); - v1[2] = @shuffle(f32, d1, d2, [4]i32{ 2, ~@as(i32, 3), ~@as(i32, 2), 2 }); - v0[3] = swizzle(mt[2], .y, .w, .x, .z); - v1[3] = @shuffle(f32, d1, d2, [4]i32{ ~@as(i32, 3), 0, 3, ~@as(i32, 2) }); - - const c1 = mulAdd(-v0[0], v1[0], c0); - const c3 = mulAdd(v0[1], v1[1], c2); - const c5 = mulAdd(-v0[2], v1[2], c4); - const c7 = mulAdd(v0[3], v1[3], c6); - - c0 = mulAdd(v0[0], v1[0], c0); - c2 = mulAdd(-v0[1], v1[1], c2); - c4 = mulAdd(v0[2], v1[2], c4); - c6 = mulAdd(-v0[3], v1[3], c6); - - var mr = Mat{ - f32x4(c0[0], c1[1], c0[2], c1[3]), - f32x4(c2[0], c3[1], c2[2], c3[3]), - f32x4(c4[0], c5[1], c4[2], c5[3]), - f32x4(c6[0], c7[1], c6[2], c7[3]), - }; - - const det = dot4(mr[0], mt[0]); - if (out_det != null) { - out_det.?.* = det; - } - - if (math.approxEqAbs(f32, det[0], 0.0, math.floatEps(f32))) { - return .{ - f32x4(0.0, 0.0, 0.0, 0.0), - f32x4(0.0, 0.0, 0.0, 0.0), - f32x4(0.0, 0.0, 0.0, 0.0), - f32x4(0.0, 0.0, 0.0, 0.0), - }; - } - - const scale = splat(F32x4, 1.0) / det; - mr[0] *= scale; - mr[1] *= scale; - mr[2] *= scale; - mr[3] *= scale; - return mr; -} -test "zmath.matrix.inverse" { - const m = Mat{ - f32x4(10.0, -9.0, -12.0, 1.0), - f32x4(7.0, -12.0, 11.0, 1.0), - f32x4(-10.0, 10.0, 3.0, 1.0), - f32x4(1.0, 2.0, 3.0, 4.0), - }; - var det: F32x4 = undefined; - const mi = inverseDet(m, &det); - try expectVecApproxEqAbs(det, splat(F32x4, 2939.0), 0.0001); - - try expectVecApproxEqAbs(mi[0], f32x4(-0.170806, -0.13576, -0.349439, 0.164001), 0.0001); - try expectVecApproxEqAbs(mi[1], f32x4(-0.163661, -0.14801, -0.253147, 0.141204), 0.0001); - try expectVecApproxEqAbs(mi[2], f32x4(-0.0871045, 0.00646478, -0.0785982, 0.0398095), 0.0001); - try expectVecApproxEqAbs(mi[3], f32x4(0.18986, 0.103096, 0.272882, 0.10854), 0.0001); -} - -pub fn matFromNormAxisAngle(axis: Vec, angle: f32) Mat { - const sincos_angle = sincos(angle); - - const c2 = splat(F32x4, 1.0 - sincos_angle[1]); - const c1 = splat(F32x4, sincos_angle[1]); - const c0 = splat(F32x4, sincos_angle[0]); - - const n0 = swizzle(axis, .y, .z, .x, .w); - const n1 = swizzle(axis, .z, .x, .y, .w); - - var v0 = c2 * n0 * n1; - const r0 = c2 * axis * axis + c1; - const r1 = c0 * axis + v0; - var r2 = v0 - c0 * axis; - - v0 = andInt(r0, f32x4_mask3); - - var v1 = @shuffle(f32, r1, r2, [4]i32{ 0, 2, ~@as(i32, 1), ~@as(i32, 2) }); - v1 = swizzle(v1, .y, .z, .w, .x); - - var v2 = @shuffle(f32, r1, r2, [4]i32{ 1, 1, ~@as(i32, 0), ~@as(i32, 0) }); - v2 = swizzle(v2, .x, .z, .x, .z); - - r2 = @shuffle(f32, v0, v1, [4]i32{ 0, 3, ~@as(i32, 0), ~@as(i32, 1) }); - r2 = swizzle(r2, .x, .z, .w, .y); - - var m: Mat = undefined; - m[0] = r2; - - r2 = @shuffle(f32, v0, v1, [4]i32{ 1, 3, ~@as(i32, 2), ~@as(i32, 3) }); - r2 = swizzle(r2, .z, .x, .w, .y); - m[1] = r2; - - v2 = @shuffle(f32, v2, v0, [4]i32{ 0, 1, ~@as(i32, 2), ~@as(i32, 3) }); - m[2] = v2; - m[3] = f32x4(0.0, 0.0, 0.0, 1.0); - return m; -} -pub fn matFromAxisAngle(axis: Vec, angle: f32) Mat { - assert(!all(axis == splat(F32x4, 0.0), 3)); - assert(!all(isInf(axis), 3)); - const normal = normalize3(axis); - return matFromNormAxisAngle(normal, angle); -} -test "zmath.matrix.matFromAxisAngle" { - { - const m0 = matFromAxisAngle(f32x4(1.0, 0.0, 0.0, 0.0), math.pi * 0.25); - const m1 = rotationX(math.pi * 0.25); - try expectVecApproxEqAbs(m0[0], m1[0], 0.001); - try expectVecApproxEqAbs(m0[1], m1[1], 0.001); - try expectVecApproxEqAbs(m0[2], m1[2], 0.001); - try expectVecApproxEqAbs(m0[3], m1[3], 0.001); - } - { - const m0 = matFromAxisAngle(f32x4(0.0, 1.0, 0.0, 0.0), math.pi * 0.125); - const m1 = rotationY(math.pi * 0.125); - try expectVecApproxEqAbs(m0[0], m1[0], 0.001); - try expectVecApproxEqAbs(m0[1], m1[1], 0.001); - try expectVecApproxEqAbs(m0[2], m1[2], 0.001); - try expectVecApproxEqAbs(m0[3], m1[3], 0.001); - } - { - const m0 = matFromAxisAngle(f32x4(0.0, 0.0, 1.0, 0.0), math.pi * 0.333); - const m1 = rotationZ(math.pi * 0.333); - try expectVecApproxEqAbs(m0[0], m1[0], 0.001); - try expectVecApproxEqAbs(m0[1], m1[1], 0.001); - try expectVecApproxEqAbs(m0[2], m1[2], 0.001); - try expectVecApproxEqAbs(m0[3], m1[3], 0.001); - } -} - -pub fn matFromQuat(quat: Quat) Mat { - const q0 = quat + quat; - var q1 = quat * q0; - - var v0 = swizzle(q1, .y, .x, .x, .w); - v0 = andInt(v0, f32x4_mask3); - - var v1 = swizzle(q1, .z, .z, .y, .w); - v1 = andInt(v1, f32x4_mask3); - - const r0 = (f32x4(1.0, 1.0, 1.0, 0.0) - v0) - v1; - - v0 = swizzle(quat, .x, .x, .y, .w); - v1 = swizzle(q0, .z, .y, .z, .w); - v0 = v0 * v1; - - v1 = swizzle(quat, .w, .w, .w, .w); - const v2 = swizzle(q0, .y, .z, .x, .w); - v1 = v1 * v2; - - const r1 = v0 + v1; - const r2 = v0 - v1; - - v0 = @shuffle(f32, r1, r2, [4]i32{ 1, 2, ~@as(i32, 0), ~@as(i32, 1) }); - v0 = swizzle(v0, .x, .z, .w, .y); - v1 = @shuffle(f32, r1, r2, [4]i32{ 0, 0, ~@as(i32, 2), ~@as(i32, 2) }); - v1 = swizzle(v1, .x, .z, .x, .z); - - q1 = @shuffle(f32, r0, v0, [4]i32{ 0, 3, ~@as(i32, 0), ~@as(i32, 1) }); - q1 = swizzle(q1, .x, .z, .w, .y); - - var m: Mat = undefined; - m[0] = q1; - - q1 = @shuffle(f32, r0, v0, [4]i32{ 1, 3, ~@as(i32, 2), ~@as(i32, 3) }); - q1 = swizzle(q1, .z, .x, .w, .y); - m[1] = q1; - - q1 = @shuffle(f32, v1, r0, [4]i32{ 0, 1, ~@as(i32, 2), ~@as(i32, 3) }); - m[2] = q1; - m[3] = f32x4(0.0, 0.0, 0.0, 1.0); - return m; -} -test "zmath.matrix.matFromQuat" { - { - const m = matFromQuat(f32x4(0.0, 0.0, 0.0, 1.0)); - try expectVecApproxEqAbs(m[0], f32x4(1.0, 0.0, 0.0, 0.0), 0.0001); - try expectVecApproxEqAbs(m[1], f32x4(0.0, 1.0, 0.0, 0.0), 0.0001); - try expectVecApproxEqAbs(m[2], f32x4(0.0, 0.0, 1.0, 0.0), 0.0001); - try expectVecApproxEqAbs(m[3], f32x4(0.0, 0.0, 0.0, 1.0), 0.0001); - } -} - -pub fn matFromRollPitchYaw(pitch: f32, yaw: f32, roll: f32) Mat { - return matFromRollPitchYawV(f32x4(pitch, yaw, roll, 0.0)); -} -pub fn matFromRollPitchYawV(angles: Vec) Mat { - return matFromQuat(quatFromRollPitchYawV(angles)); -} - -pub fn matToQuat(m: Mat) Quat { - return quatFromMat(m); -} - -pub inline fn loadMat(mem: []const f32) Mat { - return .{ - load(mem[0..4], F32x4, 0), - load(mem[4..8], F32x4, 0), - load(mem[8..12], F32x4, 0), - load(mem[12..16], F32x4, 0), - }; -} -test "zmath.loadMat" { - const a = [18]f32{ - 1.0, 2.0, 3.0, 4.0, - 5.0, 6.0, 7.0, 8.0, - 9.0, 10.0, 11.0, 12.0, - 13.0, 14.0, 15.0, 16.0, - 17.0, 18.0, - }; - const m = loadMat(a[1..]); - try expectVecEqual(m[0], f32x4(2.0, 3.0, 4.0, 5.0)); - try expectVecEqual(m[1], f32x4(6.0, 7.0, 8.0, 9.0)); - try expectVecEqual(m[2], f32x4(10.0, 11.0, 12.0, 13.0)); - try expectVecEqual(m[3], f32x4(14.0, 15.0, 16.0, 17.0)); -} - -pub inline fn storeMat(mem: []f32, m: Mat) void { - store(mem[0..4], m[0], 0); - store(mem[4..8], m[1], 0); - store(mem[8..12], m[2], 0); - store(mem[12..16], m[3], 0); -} - -pub inline fn loadMat43(mem: []const f32) Mat { - return .{ - f32x4(mem[0], mem[1], mem[2], 0.0), - f32x4(mem[3], mem[4], mem[5], 0.0), - f32x4(mem[6], mem[7], mem[8], 0.0), - f32x4(mem[9], mem[10], mem[11], 1.0), - }; -} - -pub inline fn storeMat43(mem: []f32, m: Mat) void { - store(mem[0..3], m[0], 3); - store(mem[3..6], m[1], 3); - store(mem[6..9], m[2], 3); - store(mem[9..12], m[3], 3); -} - -pub inline fn loadMat34(mem: []const f32) Mat { - return .{ - load(mem[0..4], F32x4, 0), - load(mem[4..8], F32x4, 0), - load(mem[8..12], F32x4, 0), - f32x4(0.0, 0.0, 0.0, 1.0), - }; -} - -pub inline fn storeMat34(mem: []f32, m: Mat) void { - store(mem[0..4], m[0], 0); - store(mem[4..8], m[1], 0); - store(mem[8..12], m[2], 0); -} - -pub inline fn matToArr(m: Mat) [16]f32 { - var array: [16]f32 = undefined; - storeMat(array[0..], m); - return array; -} - -pub inline fn matToArr43(m: Mat) [12]f32 { - var array: [12]f32 = undefined; - storeMat43(array[0..], m); - return array; -} - -pub inline fn matToArr34(m: Mat) [12]f32 { - var array: [12]f32 = undefined; - storeMat34(array[0..], m); - return array; -} -// ------------------------------------------------------------------------------ -// -// 5. Quaternion functions -// -// ------------------------------------------------------------------------------ -pub fn qmul(q0: Quat, q1: Quat) Quat { - var result = swizzle(q1, .w, .w, .w, .w); - var q1x = swizzle(q1, .x, .x, .x, .x); - var q1y = swizzle(q1, .y, .y, .y, .y); - var q1z = swizzle(q1, .z, .z, .z, .z); - result = result * q0; - var q0_shuf = swizzle(q0, .w, .z, .y, .x); - q1x = q1x * q0_shuf; - q0_shuf = swizzle(q0_shuf, .y, .x, .w, .z); - result = mulAdd(q1x, f32x4(1.0, -1.0, 1.0, -1.0), result); - q1y = q1y * q0_shuf; - q0_shuf = swizzle(q0_shuf, .w, .z, .y, .x); - q1y = q1y * f32x4(1.0, 1.0, -1.0, -1.0); - q1z = q1z * q0_shuf; - q1y = mulAdd(q1z, f32x4(-1.0, 1.0, 1.0, -1.0), q1y); - return result + q1y; -} -test "zmath.quaternion.mul" { - { - const q0 = f32x4(2.0, 3.0, 4.0, 1.0); - const q1 = f32x4(3.0, 2.0, 1.0, 4.0); - try expectVecApproxEqAbs(qmul(q0, q1), f32x4(16.0, 4.0, 22.0, -12.0), 0.0001); - } -} - -pub fn quatToMat(quat: Quat) Mat { - return matFromQuat(quat); -} - -pub fn quatToAxisAngle(quat: Quat, axis: *Vec, angle: *f32) void { - axis.* = quat; - angle.* = 2.0 * acos(quat[3]); -} -test "zmath.quaternion.quatToAxisAngle" { - { - const q0 = quatFromNormAxisAngle(f32x4(1.0, 0.0, 0.0, 0.0), 0.25 * math.pi); - var axis: Vec = f32x4(4.0, 3.0, 2.0, 1.0); - var angle: f32 = 10.0; - quatToAxisAngle(q0, &axis, &angle); - try expect(math.approxEqAbs(f32, axis[0], @sin(@as(f32, 0.25) * math.pi * 0.5), 0.0001)); - try expect(axis[1] == 0.0); - try expect(axis[2] == 0.0); - try expect(math.approxEqAbs(f32, angle, 0.25 * math.pi, 0.0001)); - } -} - -pub fn quatFromMat(m: Mat) Quat { - const r0 = m[0]; - const r1 = m[1]; - const r2 = m[2]; - const r00 = swizzle(r0, .x, .x, .x, .x); - const r11 = swizzle(r1, .y, .y, .y, .y); - const r22 = swizzle(r2, .z, .z, .z, .z); - - const x2gey2 = (r11 - r00) <= splat(F32x4, 0.0); - const z2gew2 = (r11 + r00) <= splat(F32x4, 0.0); - const x2py2gez2pw2 = r22 <= splat(F32x4, 0.0); - - var t0 = mulAdd(r00, f32x4(1.0, -1.0, -1.0, 1.0), splat(F32x4, 1.0)); - var t1 = r11 * f32x4(-1.0, 1.0, -1.0, 1.0); - var t2 = mulAdd(r22, f32x4(-1.0, -1.0, 1.0, 1.0), t0); - const x2y2z2w2 = t1 + t2; - - t0 = @shuffle(f32, r0, r1, [4]i32{ 1, 2, ~@as(i32, 2), ~@as(i32, 1) }); - t1 = @shuffle(f32, r1, r2, [4]i32{ 0, 0, ~@as(i32, 0), ~@as(i32, 1) }); - t1 = swizzle(t1, .x, .z, .w, .y); - const xyxzyz = t0 + t1; - - t0 = @shuffle(f32, r2, r1, [4]i32{ 1, 0, ~@as(i32, 0), ~@as(i32, 0) }); - t1 = @shuffle(f32, r1, r0, [4]i32{ 2, 2, ~@as(i32, 2), ~@as(i32, 1) }); - t1 = swizzle(t1, .x, .z, .w, .y); - const xwywzw = (t0 - t1) * f32x4(-1.0, 1.0, -1.0, 1.0); - - t0 = @shuffle(f32, x2y2z2w2, xyxzyz, [4]i32{ 0, 1, ~@as(i32, 0), ~@as(i32, 0) }); - t1 = @shuffle(f32, x2y2z2w2, xwywzw, [4]i32{ 2, 3, ~@as(i32, 2), ~@as(i32, 0) }); - t2 = @shuffle(f32, xyxzyz, xwywzw, [4]i32{ 1, 2, ~@as(i32, 0), ~@as(i32, 1) }); - - const tensor0 = @shuffle(f32, t0, t2, [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) }); - const tensor1 = @shuffle(f32, t0, t2, [4]i32{ 2, 1, ~@as(i32, 1), ~@as(i32, 3) }); - const tensor2 = @shuffle(f32, t2, t1, [4]i32{ 0, 1, ~@as(i32, 0), ~@as(i32, 2) }); - const tensor3 = @shuffle(f32, t2, t1, [4]i32{ 2, 3, ~@as(i32, 2), ~@as(i32, 1) }); - - t0 = select(x2gey2, tensor0, tensor1); - t1 = select(z2gew2, tensor2, tensor3); - t2 = select(x2py2gez2pw2, t0, t1); - - return t2 / length4(t2); -} -test "zmath.quatFromMat" { - { - const q0 = quatFromAxisAngle(f32x4(1.0, 0.0, 0.0, 0.0), 0.25 * math.pi); - const q1 = quatFromMat(rotationX(0.25 * math.pi)); - try expectVecApproxEqAbs(q0, q1, 0.0001); - } - { - const q0 = quatFromAxisAngle(f32x4(1.0, 2.0, 0.5, 0.0), 0.25 * math.pi); - const q1 = quatFromMat(matFromAxisAngle(f32x4(1.0, 2.0, 0.5, 0.0), 0.25 * math.pi)); - try expectVecApproxEqAbs(q0, q1, 0.0001); - } - { - const q0 = quatFromRollPitchYaw(0.1 * math.pi, -0.2 * math.pi, 0.3 * math.pi); - const q1 = quatFromMat(matFromRollPitchYaw(0.1 * math.pi, -0.2 * math.pi, 0.3 * math.pi)); - try expectVecApproxEqAbs(q0, q1, 0.0001); - } -} - -pub fn quatFromNormAxisAngle(axis: Vec, angle: f32) Quat { - const n = f32x4(axis[0], axis[1], axis[2], 1.0); - const sc = sincos(0.5 * angle); - return n * f32x4(sc[0], sc[0], sc[0], sc[1]); -} -pub fn quatFromAxisAngle(axis: Vec, angle: f32) Quat { - assert(!all(axis == splat(F32x4, 0.0), 3)); - assert(!all(isInf(axis), 3)); - const normal = normalize3(axis); - return quatFromNormAxisAngle(normal, angle); -} -test "zmath.quaternion.quatFromNormAxisAngle" { - { - const q0 = quatFromAxisAngle(f32x4(1.0, 0.0, 0.0, 0.0), 0.25 * math.pi); - const q1 = quatFromAxisAngle(f32x4(0.0, 1.0, 0.0, 0.0), 0.125 * math.pi); - const m0 = rotationX(0.25 * math.pi); - const m1 = rotationY(0.125 * math.pi); - const mr0 = quatToMat(qmul(q0, q1)); - const mr1 = mul(m0, m1); - try expectVecApproxEqAbs(mr0[0], mr1[0], 0.0001); - try expectVecApproxEqAbs(mr0[1], mr1[1], 0.0001); - try expectVecApproxEqAbs(mr0[2], mr1[2], 0.0001); - try expectVecApproxEqAbs(mr0[3], mr1[3], 0.0001); - } - { - const m0 = quatToMat(quatFromAxisAngle(f32x4(1.0, 2.0, 0.5, 0.0), 0.25 * math.pi)); - const m1 = matFromAxisAngle(f32x4(1.0, 2.0, 0.5, 0.0), 0.25 * math.pi); - try expectVecApproxEqAbs(m0[0], m1[0], 0.0001); - try expectVecApproxEqAbs(m0[1], m1[1], 0.0001); - try expectVecApproxEqAbs(m0[2], m1[2], 0.0001); - try expectVecApproxEqAbs(m0[3], m1[3], 0.0001); - } -} - -pub inline fn qidentity() Quat { - return f32x4(@as(f32, 0.0), @as(f32, 0.0), @as(f32, 0.0), @as(f32, 1.0)); -} - -pub inline fn conjugate(quat: Quat) Quat { - return quat * f32x4(-1.0, -1.0, -1.0, 1.0); -} - -fn inverseQuat(quat: Quat) Quat { - const l = lengthSq4(quat); - const conj = conjugate(quat); - return select(l <= splat(F32x4, math.floatEps(f32)), splat(F32x4, 0.0), conj / l); -} -test "zmath.quaternion.inverseQuat" { - try expectVecApproxEqAbs( - inverse(f32x4(2.0, 3.0, 4.0, 1.0)), - f32x4(-1.0 / 15.0, -1.0 / 10.0, -2.0 / 15.0, 1.0 / 30.0), - 0.0001, - ); - try expectVecApproxEqAbs(inverse(qidentity()), qidentity(), 0.0001); -} - -// Algorithm from: https://github.com/g-truc/glm/blob/master/glm/detail/type_quat.inl -pub fn rotate(q: Quat, v: Vec) Vec { - const w = splat(F32x4, q[3]); - const axis = f32x4(q[0], q[1], q[2], 0.0); - const uv = cross3(axis, v); - return v + ((uv * w) + cross3(axis, uv)) * splat(F32x4, 2.0); -} -test "zmath.quaternion.rotate" { - const quat = quatFromRollPitchYaw(0.1 * math.pi, 0.2 * math.pi, 0.3 * math.pi); - const mat = matFromQuat(quat); - const forward = f32x4(0.0, 0.0, -1.0, 0.0); - const up = f32x4(0.0, 1.0, 0.0, 0.0); - const right = f32x4(1.0, 0.0, 0.0, 0.0); - try expectVecApproxEqAbs(rotate(quat, forward), mul(forward, mat), 0.0001); - try expectVecApproxEqAbs(rotate(quat, up), mul(up, mat), 0.0001); - try expectVecApproxEqAbs(rotate(quat, right), mul(right, mat), 0.0001); -} - -pub fn slerp(q0: Quat, q1: Quat, t: f32) Quat { - return slerpV(q0, q1, splat(F32x4, t)); -} -pub fn slerpV(q0: Quat, q1: Quat, t: F32x4) Quat { - var cos_omega = dot4(q0, q1); - const sign = select(cos_omega < splat(F32x4, 0.0), splat(F32x4, -1.0), splat(F32x4, 1.0)); - - cos_omega = cos_omega * sign; - const sin_omega = sqrt(splat(F32x4, 1.0) - cos_omega * cos_omega); - - const omega = atan2(sin_omega, cos_omega); - - var v01 = t; - v01 = xorInt(andInt(v01, f32x4_mask2), f32x4_sign_mask1); - v01 = f32x4(1.0, 0.0, 0.0, 0.0) + v01; - - var s0 = sin(v01 * omega) / sin_omega; - s0 = select(cos_omega < splat(F32x4, 1.0 - 0.00001), s0, v01); - - const s1 = swizzle(s0, .y, .y, .y, .y); - s0 = swizzle(s0, .x, .x, .x, .x); - - return q0 * s0 + sign * q1 * s1; -} -test "zmath.quaternion.slerp" { - const from = f32x4(0.0, 0.0, 0.0, 1.0); - const to = f32x4(0.5, 0.5, -0.5, 0.5); - const result = slerp(from, to, 0.5); - try expectVecApproxEqAbs(result, f32x4(0.28867513, 0.28867513, -0.28867513, 0.86602540), 0.0001); -} - -// Converts q back to euler angles, assuming a YXZ rotation order. -// See: http://www.euclideanspace.com/maths/geometry/rotations/conversions/quaternionToEuler -pub fn quatToRollPitchYaw(q: Quat) [3]f32 { - var angles: [3]f32 = undefined; - - const p = swizzle(q, .w, .y, .x, .z); - const sign = -1.0; - - const singularity = p[0] * p[2] + sign * p[1] * p[3]; - if (singularity > 0.499) { - angles[0] = math.pi * 0.5; - angles[1] = 2.0 * math.atan2(p[1], p[0]); - angles[2] = 0.0; - } else if (singularity < -0.499) { - angles[0] = -math.pi * 0.5; - angles[1] = 2.0 * math.atan2(p[1], p[0]); - angles[2] = 0.0; - } else { - const sq = p * p; - const y = splat(F32x4, 2.0) * f32x4(p[0] * p[1] - sign * p[2] * p[3], p[0] * p[3] - sign * p[1] * p[2], 0.0, 0.0); - const x = splat(F32x4, 1.0) - (splat(F32x4, 2.0) * f32x4(sq[1] + sq[2], sq[2] + sq[3], 0.0, 0.0)); - const res = atan2(y, x); - angles[0] = math.asin(2.0 * singularity); - angles[1] = res[0]; - angles[2] = res[1]; - } - - return angles; -} - -test "zmath.quaternion.quatToRollPitchYaw" { - { - const expected = f32x4(0.1 * math.pi, 0.2 * math.pi, 0.3 * math.pi, 0.0); - const quat = quatFromRollPitchYaw(expected[0], expected[1], expected[2]); - const result = quatToRollPitchYaw(quat); - try expectVecApproxEqAbs(loadArr3(result), expected, 0.0001); - } - - { - const expected = f32x4(0.3 * math.pi, 0.1 * math.pi, 0.2 * math.pi, 0.0); - const quat = quatFromRollPitchYaw(expected[0], expected[1], expected[2]); - const result = quatToRollPitchYaw(quat); - try expectVecApproxEqAbs(loadArr3(result), expected, 0.0001); - } - - // North pole singularity - { - const angle = f32x4(0.5 * math.pi, 0.2 * math.pi, 0.3 * math.pi, 0.0); - const expected = f32x4(0.5 * math.pi, -0.1 * math.pi, 0.0, 0.0); - const quat = quatFromRollPitchYaw(angle[0], angle[1], angle[2]); - const result = quatToRollPitchYaw(quat); - try expectVecApproxEqAbs(loadArr3(result), expected, 0.0001); - } - - // South pole singularity - { - const angle = f32x4(-0.5 * math.pi, 0.2 * math.pi, 0.3 * math.pi, 0.0); - const expected = f32x4(-0.5 * math.pi, 0.5 * math.pi, 0.0, 0.0); - const quat = quatFromRollPitchYaw(angle[0], angle[1], angle[2]); - const result = quatToRollPitchYaw(quat); - try expectVecApproxEqAbs(loadArr3(result), expected, 0.0001); - } -} - -pub fn quatFromRollPitchYaw(pitch: f32, yaw: f32, roll: f32) Quat { - return quatFromRollPitchYawV(f32x4(pitch, yaw, roll, 0.0)); -} -pub fn quatFromRollPitchYawV(angles: Vec) Quat { // | pitch | yaw | roll | 0 | - const sc = sincos(splat(Vec, 0.5) * angles); - const p0 = @shuffle(f32, sc[1], sc[0], [4]i32{ ~@as(i32, 0), 0, 0, 0 }); - const p1 = @shuffle(f32, sc[0], sc[1], [4]i32{ ~@as(i32, 0), 0, 0, 0 }); - const y0 = @shuffle(f32, sc[1], sc[0], [4]i32{ 1, ~@as(i32, 1), 1, 1 }); - const y1 = @shuffle(f32, sc[0], sc[1], [4]i32{ 1, ~@as(i32, 1), 1, 1 }); - const r0 = @shuffle(f32, sc[1], sc[0], [4]i32{ 2, 2, ~@as(i32, 2), 2 }); - const r1 = @shuffle(f32, sc[0], sc[1], [4]i32{ 2, 2, ~@as(i32, 2), 2 }); - const q1 = p1 * f32x4(1.0, -1.0, -1.0, 1.0) * y1; - const q0 = p0 * y0 * r0; - return mulAdd(q1, r1, q0); -} -test "zmath.quaternion.quatFromRollPitchYawV" { - { - const m0 = quatToMat(quatFromRollPitchYawV(f32x4(0.25 * math.pi, 0.0, 0.0, 0.0))); - const m1 = rotationX(0.25 * math.pi); - try expectVecApproxEqAbs(m0[0], m1[0], 0.0001); - try expectVecApproxEqAbs(m0[1], m1[1], 0.0001); - try expectVecApproxEqAbs(m0[2], m1[2], 0.0001); - try expectVecApproxEqAbs(m0[3], m1[3], 0.0001); - } - { - const m0 = quatToMat(quatFromRollPitchYaw(0.1 * math.pi, 0.2 * math.pi, 0.3 * math.pi)); - const m1 = mul( - rotationZ(0.3 * math.pi), - mul(rotationX(0.1 * math.pi), rotationY(0.2 * math.pi)), - ); - try expectVecApproxEqAbs(m0[0], m1[0], 0.0001); - try expectVecApproxEqAbs(m0[1], m1[1], 0.0001); - try expectVecApproxEqAbs(m0[2], m1[2], 0.0001); - try expectVecApproxEqAbs(m0[3], m1[3], 0.0001); - } -} -// ------------------------------------------------------------------------------ -// -// 6. Color functions -// -// ------------------------------------------------------------------------------ -pub fn adjustSaturation(color: F32x4, saturation: f32) F32x4 { - const luminance = dot3(f32x4(0.2125, 0.7154, 0.0721, 0.0), color); - var result = mulAdd(color - luminance, f32x4s(saturation), luminance); - result[3] = color[3]; - return result; -} - -pub fn adjustContrast(color: F32x4, contrast: f32) F32x4 { - var result = mulAdd(color - f32x4s(0.5), f32x4s(contrast), f32x4s(0.5)); - result[3] = color[3]; - return result; -} - -pub fn rgbToHsl(rgb: F32x4) F32x4 { - const r = swizzle(rgb, .x, .x, .x, .x); - const g = swizzle(rgb, .y, .y, .y, .y); - const b = swizzle(rgb, .z, .z, .z, .z); - - const minv = min(r, min(g, b)); - const maxv = max(r, max(g, b)); - - const l = (minv + maxv) * f32x4s(0.5); - const d = maxv - minv; - const la = select(boolx4(true, true, true, false), l, rgb); - - if (all(d < f32x4s(math.floatEps(f32)), 3)) { - return select(boolx4(true, true, false, false), f32x4s(0.0), la); - } else { - var s: F32x4 = undefined; - var h: F32x4 = undefined; - - const d2 = minv + maxv; - - if (all(l > f32x4s(0.5), 3)) { - s = d / (f32x4s(2.0) - d2); - } else { - s = d / d2; - } - - if (all(r == maxv, 3)) { - h = (g - b) / d; - } else if (all(g == maxv, 3)) { - h = f32x4s(2.0) + (b - r) / d; - } else { - h = f32x4s(4.0) + (r - g) / d; - } - - h /= f32x4s(6.0); - - if (all(h < f32x4s(0.0), 3)) { - h += f32x4s(1.0); - } - - const lha = select(boolx4(true, true, false, false), h, la); - return select(boolx4(true, false, true, true), lha, s); - } -} -test "zmath.color.rgbToHsl" { - try expectVecApproxEqAbs(rgbToHsl(f32x4(0.2, 0.4, 0.8, 1.0)), f32x4(0.6111, 0.6, 0.5, 1.0), 0.0001); - try expectVecApproxEqAbs(rgbToHsl(f32x4(1.0, 0.0, 0.0, 0.5)), f32x4(0.0, 1.0, 0.5, 0.5), 0.0001); - try expectVecApproxEqAbs(rgbToHsl(f32x4(0.0, 1.0, 0.0, 0.25)), f32x4(0.3333, 1.0, 0.5, 0.25), 0.0001); - try expectVecApproxEqAbs(rgbToHsl(f32x4(0.0, 0.0, 1.0, 1.0)), f32x4(0.6666, 1.0, 0.5, 1.0), 0.0001); - try expectVecApproxEqAbs(rgbToHsl(f32x4(0.0, 0.0, 0.0, 1.0)), f32x4(0.0, 0.0, 0.0, 1.0), 0.0001); - try expectVecApproxEqAbs(rgbToHsl(f32x4(1.0, 1.0, 1.0, 1.0)), f32x4(0.0, 0.0, 1.0, 1.0), 0.0001); -} - -fn hueToClr(p: F32x4, q: F32x4, h: F32x4) F32x4 { - var t = h; - - if (all(t < f32x4s(0.0), 3)) - t += f32x4s(1.0); - - if (all(t > f32x4s(1.0), 3)) - t -= f32x4s(1.0); - - if (all(t < f32x4s(1.0 / 6.0), 3)) - return mulAdd(q - p, f32x4s(6.0) * t, p); - - if (all(t < f32x4s(0.5), 3)) - return q; - - if (all(t < f32x4s(2.0 / 3.0), 3)) - return mulAdd(q - p, f32x4s(6.0) * (f32x4s(2.0 / 3.0) - t), p); - - return p; -} - -pub fn hslToRgb(hsl: F32x4) F32x4 { - const s = swizzle(hsl, .y, .y, .y, .y); - const l = swizzle(hsl, .z, .z, .z, .z); - - if (all(isNearEqual(s, f32x4s(0.0), f32x4s(math.floatEps(f32))), 3)) { - return select(boolx4(true, true, true, false), l, hsl); - } else { - const h = swizzle(hsl, .x, .x, .x, .x); - var q: F32x4 = undefined; - if (all(l < f32x4s(0.5), 3)) { - q = l * (f32x4s(1.0) + s); - } else { - q = (l + s) - (l * s); - } - - const p = f32x4s(2.0) * l - q; - - const r = hueToClr(p, q, h + f32x4s(1.0 / 3.0)); - const g = hueToClr(p, q, h); - const b = hueToClr(p, q, h - f32x4s(1.0 / 3.0)); - - const rg = select(boolx4(true, false, false, false), r, g); - const ba = select(boolx4(true, true, true, false), b, hsl); - return select(boolx4(true, true, false, false), rg, ba); - } -} -test "zmath.color.hslToRgb" { - try expectVecApproxEqAbs(f32x4(0.2, 0.4, 0.8, 1.0), hslToRgb(f32x4(0.6111, 0.6, 0.5, 1.0)), 0.0001); - try expectVecApproxEqAbs(f32x4(1.0, 0.0, 0.0, 0.5), hslToRgb(f32x4(0.0, 1.0, 0.5, 0.5)), 0.0001); - try expectVecApproxEqAbs(f32x4(0.0, 1.0, 0.0, 0.25), hslToRgb(f32x4(0.3333, 1.0, 0.5, 0.25)), 0.0005); - try expectVecApproxEqAbs(f32x4(0.0, 0.0, 1.0, 1.0), hslToRgb(f32x4(0.6666, 1.0, 0.5, 1.0)), 0.0005); - try expectVecApproxEqAbs(f32x4(0.0, 0.0, 0.0, 1.0), hslToRgb(f32x4(0.0, 0.0, 0.0, 1.0)), 0.0001); - try expectVecApproxEqAbs(f32x4(1.0, 1.0, 1.0, 1.0), hslToRgb(f32x4(0.0, 0.0, 1.0, 1.0)), 0.0001); - try expectVecApproxEqAbs(hslToRgb(rgbToHsl(f32x4(1.0, 1.0, 1.0, 1.0))), f32x4(1.0, 1.0, 1.0, 1.0), 0.0005); - try expectVecApproxEqAbs( - hslToRgb(rgbToHsl(f32x4(0.82198, 0.1839, 0.632, 1.0))), - f32x4(0.82198, 0.1839, 0.632, 1.0), - 0.0005, - ); - try expectVecApproxEqAbs( - rgbToHsl(hslToRgb(f32x4(0.82198, 0.1839, 0.632, 1.0))), - f32x4(0.82198, 0.1839, 0.632, 1.0), - 0.0005, - ); - try expectVecApproxEqAbs( - rgbToHsl(hslToRgb(f32x4(0.1839, 0.82198, 0.632, 1.0))), - f32x4(0.1839, 0.82198, 0.632, 1.0), - 0.0005, - ); - try expectVecApproxEqAbs( - hslToRgb(rgbToHsl(f32x4(0.1839, 0.632, 0.82198, 1.0))), - f32x4(0.1839, 0.632, 0.82198, 1.0), - 0.0005, - ); -} - -pub fn rgbToHsv(rgb: F32x4) F32x4 { - const r = swizzle(rgb, .x, .x, .x, .x); - const g = swizzle(rgb, .y, .y, .y, .y); - const b = swizzle(rgb, .z, .z, .z, .z); - - const minv = min(r, min(g, b)); - const v = max(r, max(g, b)); - const d = v - minv; - const s = if (all(isNearEqual(v, f32x4s(0.0), f32x4s(math.floatEps(f32))), 3)) f32x4s(0.0) else d / v; - - if (all(d < f32x4s(math.floatEps(f32)), 3)) { - const hv = select(boolx4(true, false, false, false), f32x4s(0.0), v); - const hva = select(boolx4(true, true, true, false), hv, rgb); - return select(boolx4(true, false, true, true), hva, s); - } else { - var h: F32x4 = undefined; - if (all(r == v, 3)) { - h = (g - b) / d; - if (all(g < b, 3)) - h += f32x4s(6.0); - } else if (all(g == v, 3)) { - h = f32x4s(2.0) + (b - r) / d; - } else { - h = f32x4s(4.0) + (r - g) / d; - } - - h /= f32x4s(6.0); - const hv = select(boolx4(true, false, false, false), h, v); - const hva = select(boolx4(true, true, true, false), hv, rgb); - return select(boolx4(true, false, true, true), hva, s); - } -} -test "zmath.color.rgbToHsv" { - try expectVecApproxEqAbs(rgbToHsv(f32x4(0.2, 0.4, 0.8, 1.0)), f32x4(0.6111, 0.75, 0.8, 1.0), 0.0001); - try expectVecApproxEqAbs(rgbToHsv(f32x4(0.4, 0.2, 0.8, 1.0)), f32x4(0.7222, 0.75, 0.8, 1.0), 0.0001); - try expectVecApproxEqAbs(rgbToHsv(f32x4(0.4, 0.8, 0.2, 1.0)), f32x4(0.2777, 0.75, 0.8, 1.0), 0.0001); - try expectVecApproxEqAbs(rgbToHsv(f32x4(1.0, 0.0, 0.0, 0.5)), f32x4(0.0, 1.0, 1.0, 0.5), 0.0001); - try expectVecApproxEqAbs(rgbToHsv(f32x4(0.0, 1.0, 0.0, 0.25)), f32x4(0.3333, 1.0, 1.0, 0.25), 0.0001); - try expectVecApproxEqAbs(rgbToHsv(f32x4(0.0, 0.0, 1.0, 1.0)), f32x4(0.6666, 1.0, 1.0, 1.0), 0.0001); - try expectVecApproxEqAbs(rgbToHsv(f32x4(0.0, 0.0, 0.0, 1.0)), f32x4(0.0, 0.0, 0.0, 1.0), 0.0001); - try expectVecApproxEqAbs(rgbToHsv(f32x4(1.0, 1.0, 1.0, 1.0)), f32x4(0.0, 0.0, 1.0, 1.0), 0.0001); -} - -pub fn hsvToRgb(hsv: F32x4) F32x4 { - const h = swizzle(hsv, .x, .x, .x, .x); - const s = swizzle(hsv, .y, .y, .y, .y); - const v = swizzle(hsv, .z, .z, .z, .z); - - const h6 = h * f32x4s(6.0); - const i = floor(h6); - const f = h6 - i; - - const p = v * (f32x4s(1.0) - s); - const q = v * (f32x4s(1.0) - f * s); - const t = v * (f32x4s(1.0) - (f32x4s(1.0) - f) * s); - - const ii = @as(i32, @intFromFloat(mod(i, f32x4s(6.0))[0])); - const rgb = switch (ii) { - 0 => blk: { - const vt = select(boolx4(true, false, false, false), v, t); - break :blk select(boolx4(true, true, false, false), vt, p); - }, - 1 => blk: { - const qv = select(boolx4(true, false, false, false), q, v); - break :blk select(boolx4(true, true, false, false), qv, p); - }, - 2 => blk: { - const pv = select(boolx4(true, false, false, false), p, v); - break :blk select(boolx4(true, true, false, false), pv, t); - }, - 3 => blk: { - const pq = select(boolx4(true, false, false, false), p, q); - break :blk select(boolx4(true, true, false, false), pq, v); - }, - 4 => blk: { - const tp = select(boolx4(true, false, false, false), t, p); - break :blk select(boolx4(true, true, false, false), tp, v); - }, - 5 => blk: { - const vp = select(boolx4(true, false, false, false), v, p); - break :blk select(boolx4(true, true, false, false), vp, q); - }, - else => unreachable, - }; - return select(boolx4(true, true, true, false), rgb, hsv); -} -test "zmath.color.hsvToRgb" { - const epsilon = 0.0005; - try expectVecApproxEqAbs(f32x4(0.2, 0.4, 0.8, 1.0), hsvToRgb(f32x4(0.6111, 0.75, 0.8, 1.0)), epsilon); - try expectVecApproxEqAbs(f32x4(0.4, 0.2, 0.8, 1.0), hsvToRgb(f32x4(0.7222, 0.75, 0.8, 1.0)), epsilon); - try expectVecApproxEqAbs(f32x4(0.4, 0.8, 0.2, 1.0), hsvToRgb(f32x4(0.2777, 0.75, 0.8, 1.0)), epsilon); - try expectVecApproxEqAbs(f32x4(1.0, 0.0, 0.0, 0.5), hsvToRgb(f32x4(0.0, 1.0, 1.0, 0.5)), epsilon); - try expectVecApproxEqAbs(f32x4(0.0, 1.0, 0.0, 0.25), hsvToRgb(f32x4(0.3333, 1.0, 1.0, 0.25)), epsilon); - try expectVecApproxEqAbs(f32x4(0.0, 0.0, 1.0, 1.0), hsvToRgb(f32x4(0.6666, 1.0, 1.0, 1.0)), epsilon); - try expectVecApproxEqAbs(f32x4(0.0, 0.0, 0.0, 1.0), hsvToRgb(f32x4(0.0, 0.0, 0.0, 1.0)), epsilon); - try expectVecApproxEqAbs(f32x4(1.0, 1.0, 1.0, 1.0), hsvToRgb(f32x4(0.0, 0.0, 1.0, 1.0)), epsilon); - try expectVecApproxEqAbs( - hsvToRgb(rgbToHsv(f32x4(0.1839, 0.632, 0.82198, 1.0))), - f32x4(0.1839, 0.632, 0.82198, 1.0), - epsilon, - ); - try expectVecApproxEqAbs( - hsvToRgb(rgbToHsv(f32x4(0.82198, 0.1839, 0.632, 1.0))), - f32x4(0.82198, 0.1839, 0.632, 1.0), - epsilon, - ); - try expectVecApproxEqAbs( - rgbToHsv(hsvToRgb(f32x4(0.82198, 0.1839, 0.632, 1.0))), - f32x4(0.82198, 0.1839, 0.632, 1.0), - epsilon, - ); - try expectVecApproxEqAbs( - rgbToHsv(hsvToRgb(f32x4(0.1839, 0.82198, 0.632, 1.0))), - f32x4(0.1839, 0.82198, 0.632, 1.0), - epsilon, - ); -} - -pub fn rgbToSrgb(rgb: F32x4) F32x4 { - const static = struct { - const cutoff = f32x4(0.0031308, 0.0031308, 0.0031308, 1.0); - const linear = f32x4(12.92, 12.92, 12.92, 1.0); - const scale = f32x4(1.055, 1.055, 1.055, 1.0); - const bias = f32x4(0.055, 0.055, 0.055, 1.0); - const rgamma = 1.0 / 2.4; - }; - var v = saturate(rgb); - const v0 = v * static.linear; - const v1 = static.scale * f32x4( - math.pow(f32, v[0], static.rgamma), - math.pow(f32, v[1], static.rgamma), - math.pow(f32, v[2], static.rgamma), - v[3], - ) - static.bias; - v = select(v < static.cutoff, v0, v1); - return select(boolx4(true, true, true, false), v, rgb); -} -test "zmath.color.rgbToSrgb" { - const epsilon = 0.001; - try expectVecApproxEqAbs(rgbToSrgb(f32x4(0.2, 0.4, 0.8, 1.0)), f32x4(0.484, 0.665, 0.906, 1.0), epsilon); -} - -pub fn srgbToRgb(srgb: F32x4) F32x4 { - const static = struct { - const cutoff = f32x4(0.04045, 0.04045, 0.04045, 1.0); - const rlinear = f32x4(1.0 / 12.92, 1.0 / 12.92, 1.0 / 12.92, 1.0); - const scale = f32x4(1.0 / 1.055, 1.0 / 1.055, 1.0 / 1.055, 1.0); - const bias = f32x4(0.055, 0.055, 0.055, 1.0); - const gamma = 2.4; - }; - var v = saturate(srgb); - const v0 = v * static.rlinear; - var v1 = static.scale * (v + static.bias); - v1 = f32x4( - math.pow(f32, v1[0], static.gamma), - math.pow(f32, v1[1], static.gamma), - math.pow(f32, v1[2], static.gamma), - v1[3], - ); - v = select(v > static.cutoff, v1, v0); - return select(boolx4(true, true, true, false), v, srgb); -} -test "zmath.color.srgbToRgb" { - const epsilon = 0.0007; - try expectVecApproxEqAbs(f32x4(0.2, 0.4, 0.8, 1.0), srgbToRgb(f32x4(0.484, 0.665, 0.906, 1.0)), epsilon); - try expectVecApproxEqAbs( - rgbToSrgb(srgbToRgb(f32x4(0.1839, 0.82198, 0.632, 1.0))), - f32x4(0.1839, 0.82198, 0.632, 1.0), - epsilon, - ); -} -// ------------------------------------------------------------------------------ -// -// X. Misc functions -// -// ------------------------------------------------------------------------------ -pub fn linePointDistance(linept0: Vec, linept1: Vec, pt: Vec) F32x4 { - const ptvec = pt - linept0; - const linevec = linept1 - linept0; - const scale = dot3(ptvec, linevec) / lengthSq3(linevec); - return length3(ptvec - linevec * scale); -} -test "zmath.linePointDistance" { - { - const linept0 = f32x4(-1.0, -2.0, -3.0, 1.0); - const linept1 = f32x4(1.0, 2.0, 3.0, 1.0); - const pt = f32x4(1.0, 1.0, 1.0, 1.0); - const v = linePointDistance(linept0, linept1, pt); - try expectVecApproxEqAbs(v, splat(F32x4, 0.654), 0.001); - } -} - -fn sin32(v: f32) f32 { - var y = v - math.tau * @round(v * 1.0 / math.tau); - - if (y > 0.5 * math.pi) { - y = math.pi - y; - } else if (y < -math.pi * 0.5) { - y = -math.pi - y; - } - const y2 = y * y; - - // 11-degree minimax approximation - var sinv = mulAdd(@as(f32, -2.3889859e-08), y2, 2.7525562e-06); - sinv = mulAdd(sinv, y2, -0.00019840874); - sinv = mulAdd(sinv, y2, 0.0083333310); - sinv = mulAdd(sinv, y2, -0.16666667); - return y * mulAdd(sinv, y2, 1.0); -} -fn cos32(v: f32) f32 { - var y = v - math.tau * @round(v * 1.0 / math.tau); - - const sign = blk: { - if (y > 0.5 * math.pi) { - y = math.pi - y; - break :blk @as(f32, -1.0); - } else if (y < -math.pi * 0.5) { - y = -math.pi - y; - break :blk @as(f32, -1.0); - } else { - break :blk @as(f32, 1.0); - } - }; - const y2 = y * y; - - // 10-degree minimax approximation - var cosv = mulAdd(@as(f32, -2.6051615e-07), y2, 2.4760495e-05); - cosv = mulAdd(cosv, y2, -0.0013888378); - cosv = mulAdd(cosv, y2, 0.041666638); - cosv = mulAdd(cosv, y2, -0.5); - return sign * mulAdd(cosv, y2, 1.0); -} -fn sincos32(v: f32) [2]f32 { - var y = v - math.tau * @round(v * 1.0 / math.tau); - - const sign = blk: { - if (y > 0.5 * math.pi) { - y = math.pi - y; - break :blk @as(f32, -1.0); - } else if (y < -math.pi * 0.5) { - y = -math.pi - y; - break :blk @as(f32, -1.0); - } else { - break :blk @as(f32, 1.0); - } - }; - const y2 = y * y; - - // 11-degree minimax approximation - var sinv = mulAdd(@as(f32, -2.3889859e-08), y2, 2.7525562e-06); - sinv = mulAdd(sinv, y2, -0.00019840874); - sinv = mulAdd(sinv, y2, 0.0083333310); - sinv = mulAdd(sinv, y2, -0.16666667); - sinv = y * mulAdd(sinv, y2, 1.0); - - // 10-degree minimax approximation - var cosv = mulAdd(@as(f32, -2.6051615e-07), y2, 2.4760495e-05); - cosv = mulAdd(cosv, y2, -0.0013888378); - cosv = mulAdd(cosv, y2, 0.041666638); - cosv = mulAdd(cosv, y2, -0.5); - cosv = sign * mulAdd(cosv, y2, 1.0); - - return .{ sinv, cosv }; -} -test "zmath.sincos32" { - const epsilon = 0.0001; - - try expect(math.isNan(sincos32(math.inf(f32))[0])); - try expect(math.isNan(sincos32(math.inf(f32))[1])); - try expect(math.isNan(sincos32(-math.inf(f32))[0])); - try expect(math.isNan(sincos32(-math.inf(f32))[1])); - try expect(math.isNan(sincos32(math.nan(f32))[0])); - try expect(math.isNan(sincos32(-math.nan(f32))[1])); - - try expect(math.isNan(sin32(math.inf(f32)))); - try expect(math.isNan(cos32(math.inf(f32)))); - try expect(math.isNan(sin32(-math.inf(f32)))); - try expect(math.isNan(cos32(-math.inf(f32)))); - try expect(math.isNan(sin32(math.nan(f32)))); - try expect(math.isNan(cos32(-math.nan(f32)))); - - var f: f32 = -100.0; - var i: u32 = 0; - while (i < 100) : (i += 1) { - const sc = sincos32(f); - const s0 = sin32(f); - const c0 = cos32(f); - const s = @sin(f); - const c = @cos(f); - try expect(math.approxEqAbs(f32, sc[0], s, epsilon)); - try expect(math.approxEqAbs(f32, sc[1], c, epsilon)); - try expect(math.approxEqAbs(f32, s0, s, epsilon)); - try expect(math.approxEqAbs(f32, c0, c, epsilon)); - f += 0.12345 * @as(f32, @floatFromInt(i)); - } -} - -fn asin32(v: f32) f32 { - const x = @abs(v); - var omx = 1.0 - x; - if (omx < 0.0) { - omx = 0.0; - } - const root = @sqrt(omx); - - // 7-degree minimax approximation - var result = mulAdd(@as(f32, -0.0012624911), x, 0.0066700901); - result = mulAdd(result, x, -0.0170881256); - result = mulAdd(result, x, 0.0308918810); - result = mulAdd(result, x, -0.0501743046); - result = mulAdd(result, x, 0.0889789874); - result = mulAdd(result, x, -0.2145988016); - result = root * mulAdd(result, x, 1.5707963050); - - return if (v >= 0.0) 0.5 * math.pi - result else result - 0.5 * math.pi; -} -test "zmath.asin32" { - const epsilon = 0.0001; - - try expect(math.approxEqAbs(f32, asin(@as(f32, -1.1)), -0.5 * math.pi, epsilon)); - try expect(math.approxEqAbs(f32, asin(@as(f32, 1.1)), 0.5 * math.pi, epsilon)); - try expect(math.approxEqAbs(f32, asin(@as(f32, -1000.1)), -0.5 * math.pi, epsilon)); - try expect(math.approxEqAbs(f32, asin(@as(f32, 100000.1)), 0.5 * math.pi, epsilon)); - try expect(math.isNan(asin(math.inf(f32)))); - try expect(math.isNan(asin(-math.inf(f32)))); - try expect(math.isNan(asin(math.nan(f32)))); - try expect(math.isNan(asin(-math.nan(f32)))); - - try expectVecApproxEqAbs(asin(splat(F32x8, -100.0)), splat(F32x8, -0.5 * math.pi), epsilon); - try expectVecApproxEqAbs(asin(splat(F32x16, 100.0)), splat(F32x16, 0.5 * math.pi), epsilon); - try expect(all(isNan(asin(splat(F32x4, math.inf(f32)))), 0) == true); - try expect(all(isNan(asin(splat(F32x4, -math.inf(f32)))), 0) == true); - try expect(all(isNan(asin(splat(F32x4, math.nan(f32)))), 0) == true); - try expect(all(isNan(asin(splat(F32x4, math.snan(f32)))), 0) == true); - - var f: f32 = -1.0; - var i: u32 = 0; - while (i < 8) : (i += 1) { - const r0 = asin32(f); - const r1 = math.asin(f); - const r4 = asin(splat(F32x4, f)); - const r8 = asin(splat(F32x8, f)); - const r16 = asin(splat(F32x16, f)); - try expect(math.approxEqAbs(f32, r0, r1, epsilon)); - try expectVecApproxEqAbs(r4, splat(F32x4, r1), epsilon); - try expectVecApproxEqAbs(r8, splat(F32x8, r1), epsilon); - try expectVecApproxEqAbs(r16, splat(F32x16, r1), epsilon); - f += 0.09 * @as(f32, @floatFromInt(i)); - } -} - -fn acos32(v: f32) f32 { - const x = @abs(v); - var omx = 1.0 - x; - if (omx < 0.0) { - omx = 0.0; - } - const root = @sqrt(omx); - - // 7-degree minimax approximation - var result = mulAdd(@as(f32, -0.0012624911), x, 0.0066700901); - result = mulAdd(result, x, -0.0170881256); - result = mulAdd(result, x, 0.0308918810); - result = mulAdd(result, x, -0.0501743046); - result = mulAdd(result, x, 0.0889789874); - result = mulAdd(result, x, -0.2145988016); - result = root * mulAdd(result, x, 1.5707963050); - - return if (v >= 0.0) result else math.pi - result; -} -test "zmath.acos32" { - const epsilon = 0.1; - - try expect(math.approxEqAbs(f32, acos(@as(f32, -1.1)), math.pi, epsilon)); - try expect(math.approxEqAbs(f32, acos(@as(f32, -10000.1)), math.pi, epsilon)); - try expect(math.approxEqAbs(f32, acos(@as(f32, 1.1)), 0.0, epsilon)); - try expect(math.approxEqAbs(f32, acos(@as(f32, 1000.1)), 0.0, epsilon)); - try expect(math.isNan(acos(math.inf(f32)))); - try expect(math.isNan(acos(-math.inf(f32)))); - try expect(math.isNan(acos(math.nan(f32)))); - try expect(math.isNan(acos(-math.nan(f32)))); - - try expectVecApproxEqAbs(acos(splat(F32x8, -100.0)), splat(F32x8, math.pi), epsilon); - try expectVecApproxEqAbs(acos(splat(F32x16, 100.0)), splat(F32x16, 0.0), epsilon); - try expect(all(isNan(acos(splat(F32x4, math.inf(f32)))), 0) == true); - try expect(all(isNan(acos(splat(F32x4, -math.inf(f32)))), 0) == true); - try expect(all(isNan(acos(splat(F32x4, math.nan(f32)))), 0) == true); - try expect(all(isNan(acos(splat(F32x4, math.snan(f32)))), 0) == true); - - var f: f32 = -1.0; - var i: u32 = 0; - while (i < 8) : (i += 1) { - const r0 = acos32(f); - const r1 = math.acos(f); - const r4 = acos(splat(F32x4, f)); - const r8 = acos(splat(F32x8, f)); - const r16 = acos(splat(F32x16, f)); - try expect(math.approxEqAbs(f32, r0, r1, epsilon)); - try expectVecApproxEqAbs(r4, splat(F32x4, r1), epsilon); - try expectVecApproxEqAbs(r8, splat(F32x8, r1), epsilon); - try expectVecApproxEqAbs(r16, splat(F32x16, r1), epsilon); - f += 0.09 * @as(f32, @floatFromInt(i)); - } -} - -pub fn modAngle32(in_angle: f32) f32 { - const angle = in_angle + math.pi; - var temp: f32 = @abs(angle); - temp = temp - (2.0 * math.pi * @as(f32, @floatFromInt(@as(i32, @intFromFloat(temp / math.pi))))); - temp = temp - math.pi; - if (angle < 0.0) { - temp = -temp; - } - return temp; -} - -pub fn cmulSoa(re0: anytype, im0: anytype, re1: anytype, im1: anytype) [2]@TypeOf(re0, im0, re1, im1) { - const re0_re1 = re0 * re1; - const re0_im1 = re0 * im1; - return .{ - mulAdd(-im0, im1, re0_re1), // re - mulAdd(re1, im0, re0_im1), // im - }; -} -// ------------------------------------------------------------------------------ -// -// FFT (implementation based on xdsp.h from DirectXMath) -// -// ------------------------------------------------------------------------------ -fn fftButterflyDit4_1(re0: *F32x4, im0: *F32x4) void { - const re0l = swizzle(re0.*, .x, .x, .y, .y); - const re0h = swizzle(re0.*, .z, .z, .w, .w); - - const im0l = swizzle(im0.*, .x, .x, .y, .y); - const im0h = swizzle(im0.*, .z, .z, .w, .w); - - const re_temp = mulAdd(re0h, f32x4(1.0, -1.0, 1.0, -1.0), re0l); - const im_temp = mulAdd(im0h, f32x4(1.0, -1.0, 1.0, -1.0), im0l); - - const re_shuf0 = @shuffle(f32, re_temp, im_temp, [4]i32{ 2, 3, ~@as(i32, 2), ~@as(i32, 3) }); - const re_shuf = swizzle(re_shuf0, .x, .w, .x, .w); - const im_shuf = swizzle(re_shuf0, .z, .y, .z, .y); - - const re_templ = swizzle(re_temp, .x, .y, .x, .y); - const im_templ = swizzle(im_temp, .x, .y, .x, .y); - - re0.* = mulAdd(re_shuf, f32x4(1.0, 1.0, -1.0, -1.0), re_templ); - im0.* = mulAdd(im_shuf, f32x4(1.0, -1.0, -1.0, 1.0), im_templ); -} - -fn fftButterflyDit4_4( - re0: *F32x4, - re1: *F32x4, - re2: *F32x4, - re3: *F32x4, - im0: *F32x4, - im1: *F32x4, - im2: *F32x4, - im3: *F32x4, - unity_table_re: []const F32x4, - unity_table_im: []const F32x4, - stride: u32, - last: bool, -) void { - const re_temp0 = re0.* + re2.*; - const im_temp0 = im0.* + im2.*; - - const re_temp2 = re1.* + re3.*; - const im_temp2 = im1.* + im3.*; - - const re_temp1 = re0.* - re2.*; - const im_temp1 = im0.* - im2.*; - - const re_temp3 = re1.* - re3.*; - const im_temp3 = im1.* - im3.*; - - var re_temp4 = re_temp0 + re_temp2; - var im_temp4 = im_temp0 + im_temp2; - - var re_temp5 = re_temp1 + im_temp3; - var im_temp5 = im_temp1 - re_temp3; - - var re_temp6 = re_temp0 - re_temp2; - var im_temp6 = im_temp0 - im_temp2; - - var re_temp7 = re_temp1 - im_temp3; - var im_temp7 = im_temp1 + re_temp3; - - { - const re_im = cmulSoa(re_temp5, im_temp5, unity_table_re[stride], unity_table_im[stride]); - re_temp5 = re_im[0]; - im_temp5 = re_im[1]; - } - { - const re_im = cmulSoa(re_temp6, im_temp6, unity_table_re[stride * 2], unity_table_im[stride * 2]); - re_temp6 = re_im[0]; - im_temp6 = re_im[1]; - } - { - const re_im = cmulSoa(re_temp7, im_temp7, unity_table_re[stride * 3], unity_table_im[stride * 3]); - re_temp7 = re_im[0]; - im_temp7 = re_im[1]; - } - - if (last) { - fftButterflyDit4_1(&re_temp4, &im_temp4); - fftButterflyDit4_1(&re_temp5, &im_temp5); - fftButterflyDit4_1(&re_temp6, &im_temp6); - fftButterflyDit4_1(&re_temp7, &im_temp7); - } - - re0.* = re_temp4; - im0.* = im_temp4; - - re1.* = re_temp5; - im1.* = im_temp5; - - re2.* = re_temp6; - im2.* = im_temp6; - - re3.* = re_temp7; - im3.* = im_temp7; -} - -fn fft4(re: []F32x4, im: []F32x4, count: u32) void { - assert(std.math.isPowerOfTwo(count)); - assert(re.len >= count); - assert(im.len >= count); - - var index: u32 = 0; - while (index < count) : (index += 1) { - fftButterflyDit4_1(&re[index], &im[index]); - } -} -test "zmath.fft4" { - const epsilon = 0.0001; - var re = [_]F32x4{f32x4(1.0, 2.0, 3.0, 4.0)}; - var im = [_]F32x4{f32x4s(0.0)}; - fft4(re[0..], im[0..], 1); - - var re_uns: [1]F32x4 = undefined; - var im_uns: [1]F32x4 = undefined; - fftUnswizzle(re[0..], re_uns[0..]); - fftUnswizzle(im[0..], im_uns[0..]); - - try expectVecApproxEqAbs(re_uns[0], f32x4(10.0, -2.0, -2.0, -2.0), epsilon); - try expectVecApproxEqAbs(im_uns[0], f32x4(0.0, 2.0, 0.0, -2.0), epsilon); -} - -fn fft8(re: []F32x4, im: []F32x4, count: u32) void { - assert(std.math.isPowerOfTwo(count)); - assert(re.len >= 2 * count); - assert(im.len >= 2 * count); - - var index: u32 = 0; - while (index < count) : (index += 1) { - var pre = re[index * 2 ..]; - var pim = im[index * 2 ..]; - - var odds_re = @shuffle(f32, pre[0], pre[1], [4]i32{ 1, 3, ~@as(i32, 1), ~@as(i32, 3) }); - var evens_re = @shuffle(f32, pre[0], pre[1], [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) }); - var odds_im = @shuffle(f32, pim[0], pim[1], [4]i32{ 1, 3, ~@as(i32, 1), ~@as(i32, 3) }); - var evens_im = @shuffle(f32, pim[0], pim[1], [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) }); - fftButterflyDit4_1(&odds_re, &odds_im); - fftButterflyDit4_1(&evens_re, &evens_im); - - { - const re_im = cmulSoa( - odds_re, - odds_im, - f32x4(1.0, 0.70710677, 0.0, -0.70710677), - f32x4(0.0, -0.70710677, -1.0, -0.70710677), - ); - pre[0] = evens_re + re_im[0]; - pim[0] = evens_im + re_im[1]; - } - { - const re_im = cmulSoa( - odds_re, - odds_im, - f32x4(-1.0, -0.70710677, 0.0, 0.70710677), - f32x4(0.0, 0.70710677, 1.0, 0.70710677), - ); - pre[1] = evens_re + re_im[0]; - pim[1] = evens_im + re_im[1]; - } - } -} -test "zmath.fft8" { - const epsilon = 0.0001; - var re = [_]F32x4{ f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0) }; - var im = [_]F32x4{ f32x4s(0.0), f32x4s(0.0) }; - fft8(re[0..], im[0..], 1); - - var re_uns: [2]F32x4 = undefined; - var im_uns: [2]F32x4 = undefined; - fftUnswizzle(re[0..], re_uns[0..]); - fftUnswizzle(im[0..], im_uns[0..]); - - try expectVecApproxEqAbs(re_uns[0], f32x4(36.0, -4.0, -4.0, -4.0), epsilon); - try expectVecApproxEqAbs(re_uns[1], f32x4(-4.0, -4.0, -4.0, -4.0), epsilon); - try expectVecApproxEqAbs(im_uns[0], f32x4(0.0, 9.656854, 4.0, 1.656854), epsilon); - try expectVecApproxEqAbs(im_uns[1], f32x4(0.0, -1.656854, -4.0, -9.656854), epsilon); -} - -fn fft16(re: []F32x4, im: []F32x4, count: u32) void { - assert(std.math.isPowerOfTwo(count)); - assert(re.len >= 4 * count); - assert(im.len >= 4 * count); - - const static = struct { - const unity_table_re = [4]F32x4{ - f32x4(1.0, 1.0, 1.0, 1.0), - f32x4(1.0, 0.92387950, 0.70710677, 0.38268343), - f32x4(1.0, 0.70710677, -4.3711388e-008, -0.70710677), - f32x4(1.0, 0.38268343, -0.70710677, -0.92387950), - }; - const unity_table_im = [4]F32x4{ - f32x4(-0.0, -0.0, -0.0, -0.0), - f32x4(-0.0, -0.38268343, -0.70710677, -0.92387950), - f32x4(-0.0, -0.70710677, -1.0, -0.70710677), - f32x4(-0.0, -0.92387950, -0.70710677, 0.38268343), - }; - }; - - var index: u32 = 0; - while (index < count) : (index += 1) { - fftButterflyDit4_4( - &re[index * 4], - &re[index * 4 + 1], - &re[index * 4 + 2], - &re[index * 4 + 3], - &im[index * 4], - &im[index * 4 + 1], - &im[index * 4 + 2], - &im[index * 4 + 3], - static.unity_table_re[0..], - static.unity_table_im[0..], - 1, - true, - ); - } -} -test "zmath.fft16" { - const epsilon = 0.0001; - var re = [_]F32x4{ - f32x4(1.0, 2.0, 3.0, 4.0), - f32x4(5.0, 6.0, 7.0, 8.0), - f32x4(9.0, 10.0, 11.0, 12.0), - f32x4(13.0, 14.0, 15.0, 16.0), - }; - var im = [_]F32x4{ f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0) }; - fft16(re[0..], im[0..], 1); - - var re_uns: [4]F32x4 = undefined; - var im_uns: [4]F32x4 = undefined; - fftUnswizzle(re[0..], re_uns[0..]); - fftUnswizzle(im[0..], im_uns[0..]); - - try expectVecApproxEqAbs(re_uns[0], f32x4(136.0, -8.0, -8.0, -8.0), epsilon); - try expectVecApproxEqAbs(re_uns[1], f32x4(-8.0, -8.0, -8.0, -8.0), epsilon); - try expectVecApproxEqAbs(re_uns[2], f32x4(-8.0, -8.0, -8.0, -8.0), epsilon); - try expectVecApproxEqAbs(re_uns[3], f32x4(-8.0, -8.0, -8.0, -8.0), epsilon); - try expectVecApproxEqAbs(im_uns[0], f32x4(0.0, 40.218716, 19.313708, 11.972846), epsilon); - try expectVecApproxEqAbs(im_uns[1], f32x4(8.0, 5.345429, 3.313708, 1.591299), epsilon); - try expectVecApproxEqAbs(im_uns[2], f32x4(0.0, -1.591299, -3.313708, -5.345429), epsilon); - try expectVecApproxEqAbs(im_uns[3], f32x4(-8.0, -11.972846, -19.313708, -40.218716), epsilon); -} - -fn fftN(re: []F32x4, im: []F32x4, unity_table: []const F32x4, length: u32, count: u32) void { - assert(length > 16); - assert(std.math.isPowerOfTwo(length)); - assert(std.math.isPowerOfTwo(count)); - assert(re.len >= length * count / 4); - assert(re.len == im.len); - - const total = count * length; - const total_vectors = total / 4; - const stage_vectors = length / 4; - const stage_vectors_mask = stage_vectors - 1; - const stride = length / 16; - const stride_mask = stride - 1; - const stride_inv_mask = ~stride_mask; - - var unity_table_re = unity_table; - var unity_table_im = unity_table[length / 4 ..]; - - var index: u32 = 0; - while (index < total_vectors / 4) : (index += 1) { - const n = (index & stride_inv_mask) * 4 + (index & stride_mask); - fftButterflyDit4_4( - &re[n], - &re[n + stride], - &re[n + stride * 2], - &re[n + stride * 3], - &im[n], - &im[n + stride], - &im[n + stride * 2], - &im[n + stride * 3], - unity_table_re[(n & stage_vectors_mask)..], - unity_table_im[(n & stage_vectors_mask)..], - stride, - false, - ); - } - - if (length > 16 * 4) { - fftN(re, im, unity_table[(length / 2)..], length / 4, count * 4); - } else if (length == 16 * 4) { - fft16(re, im, count * 4); - } else if (length == 8 * 4) { - fft8(re, im, count * 4); - } else if (length == 4 * 4) { - fft4(re, im, count * 4); - } -} -test "zmath.fftN" { - var unity_table: [128]F32x4 = undefined; - const epsilon = 0.0001; - - // 32 samples - { - var re = [_]F32x4{ - f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), - f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), - f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), - f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), - }; - var im = [_]F32x4{ - f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), - f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), - }; - - fftInitUnityTable(unity_table[0..32]); - fft(re[0..], im[0..], unity_table[0..32]); - - try expectVecApproxEqAbs(re[0], f32x4(528.0, -16.0, -16.0, -16.0), epsilon); - try expectVecApproxEqAbs(re[1], f32x4(-16.0, -16.0, -16.0, -16.0), epsilon); - try expectVecApproxEqAbs(re[2], f32x4(-16.0, -16.0, -16.0, -16.0), epsilon); - try expectVecApproxEqAbs(re[3], f32x4(-16.0, -16.0, -16.0, -16.0), epsilon); - try expectVecApproxEqAbs(re[4], f32x4(-16.0, -16.0, -16.0, -16.0), epsilon); - try expectVecApproxEqAbs(re[5], f32x4(-16.0, -16.0, -16.0, -16.0), epsilon); - try expectVecApproxEqAbs(re[6], f32x4(-16.0, -16.0, -16.0, -16.0), epsilon); - try expectVecApproxEqAbs(re[7], f32x4(-16.0, -16.0, -16.0, -16.0), epsilon); - try expectVecApproxEqAbs(im[0], f32x4(0.0, 162.450726, 80.437432, 52.744931), epsilon); - try expectVecApproxEqAbs(im[1], f32x4(38.627417, 29.933895, 23.945692, 19.496056), epsilon); - try expectVecApproxEqAbs(im[2], f32x4(16.0, 13.130861, 10.690858, 8.552178), epsilon); - try expectVecApproxEqAbs(im[3], f32x4(6.627417, 4.853547, 3.182598, 1.575862), epsilon); - try expectVecApproxEqAbs(im[4], f32x4(0.0, -1.575862, -3.182598, -4.853547), epsilon); - try expectVecApproxEqAbs(im[5], f32x4(-6.627417, -8.552178, -10.690858, -13.130861), epsilon); - try expectVecApproxEqAbs(im[6], f32x4(-16.0, -19.496056, -23.945692, -29.933895), epsilon); - try expectVecApproxEqAbs(im[7], f32x4(-38.627417, -52.744931, -80.437432, -162.450726), epsilon); - } - - // 64 samples - { - var re = [_]F32x4{ - f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), - f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), - f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), - f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), - f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), - f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), - f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), - f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), - }; - var im = [_]F32x4{ - f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), - f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), - f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), - f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), - }; - - fftInitUnityTable(unity_table[0..64]); - fft(re[0..], im[0..], unity_table[0..64]); - - try expectVecApproxEqAbs(re[0], f32x4(1056.0, 0.0, -32.0, 0.0), epsilon); - var i: u32 = 1; - while (i < 16) : (i += 1) { - try expectVecApproxEqAbs(re[i], f32x4(-32.0, 0.0, -32.0, 0.0), epsilon); - } - - const expected = [_]f32{ - 0.0, 0.0, 324.901452, 0.000000, 160.874864, 0.0, 105.489863, 0.000000, - 77.254834, 0.0, 59.867789, 0.0, 47.891384, 0.0, 38.992113, 0.0, - 32.000000, 0.000000, 26.261721, 0.000000, 21.381716, 0.000000, 17.104356, 0.000000, - 13.254834, 0.000000, 9.707094, 0.000000, 6.365196, 0.000000, 3.151725, 0.000000, - 0.000000, 0.000000, -3.151725, 0.000000, -6.365196, 0.000000, -9.707094, 0.000000, - -13.254834, 0.000000, -17.104356, 0.000000, -21.381716, 0.000000, -26.261721, 0.000000, - -32.000000, 0.000000, -38.992113, 0.000000, -47.891384, 0.000000, -59.867789, 0.000000, - -77.254834, 0.000000, -105.489863, 0.000000, -160.874864, 0.000000, -324.901452, 0.000000, - }; - for (expected, 0..) |e, ie| { - try expect(std.math.approxEqAbs(f32, e, im[(ie / 4)][ie % 4], epsilon)); - } - } - - // 128 samples - { - var re = [_]F32x4{ - f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), - f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), - f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), - f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), - f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), - f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), - f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), - f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), - f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), - f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), - f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), - f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), - f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), - f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), - f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), - f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), - }; - var im = [_]F32x4{ - f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), - f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), - f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), - f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), - f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), - f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), - f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), - f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), - }; - - fftInitUnityTable(unity_table[0..128]); - fft(re[0..], im[0..], unity_table[0..128]); - - try expectVecApproxEqAbs(re[0], f32x4(2112.0, 0.0, 0.0, 0.0), epsilon); - var i: u32 = 1; - while (i < 32) : (i += 1) { - try expectVecApproxEqAbs(re[i], f32x4(-64.0, 0.0, 0.0, 0.0), epsilon); - } - - const expected = [_]f32{ - 0.000000, 0.000000, 0.000000, 0.000000, 649.802905, 0.000000, 0.000000, 0.000000, - 321.749727, 0.000000, 0.000000, 0.000000, 210.979725, 0.000000, 0.000000, 0.000000, - 154.509668, 0.000000, 0.000000, 0.000000, 119.735578, 0.000000, 0.000000, 0.000000, - 95.782769, 0.000000, 0.000000, 0.000000, 77.984226, 0.000000, 0.000000, 0.000000, - 64.000000, 0.000000, 0.000000, 0.000000, 52.523443, 0.000000, 0.000000, 0.000000, - 42.763433, 0.000000, 0.000000, 0.000000, 34.208713, 0.000000, 0.000000, 0.000000, - 26.509668, 0.000000, 0.000000, 0.000000, 19.414188, 0.000000, 0.000000, 0.000000, - 12.730392, 0.000000, 0.000000, 0.000000, 6.303450, 0.000000, 0.000000, 0.000000, - 0.000000, 0.000000, 0.000000, 0.000000, -6.303450, 0.000000, 0.000000, 0.000000, - -12.730392, 0.000000, 0.000000, 0.000000, -19.414188, 0.000000, 0.000000, 0.000000, - -26.509668, 0.000000, 0.000000, 0.000000, -34.208713, 0.000000, 0.000000, 0.000000, - -42.763433, 0.000000, 0.000000, 0.000000, -52.523443, 0.000000, 0.000000, 0.000000, - -64.000000, 0.000000, 0.000000, 0.000000, -77.984226, 0.000000, 0.000000, 0.000000, - -95.782769, 0.000000, 0.000000, 0.000000, -119.735578, 0.000000, 0.000000, 0.000000, - -154.509668, 0.000000, 0.000000, 0.000000, -210.979725, 0.000000, 0.000000, 0.000000, - -321.749727, 0.000000, 0.000000, 0.000000, -649.802905, 0.000000, 0.000000, 0.000000, - }; - for (expected, 0..) |e, ie| { - try expect(std.math.approxEqAbs(f32, e, im[(ie / 4)][ie % 4], epsilon)); - } - } -} - -fn fftUnswizzle(input: []const F32x4, output: []F32x4) void { - assert(std.math.isPowerOfTwo(input.len)); - assert(input.len == output.len); - assert(input.ptr != output.ptr); - - const log2_length = std.math.log2_int(usize, input.len * 4); - assert(log2_length >= 2); - - const length = input.len; - - const f32_output = @as([*]f32, @ptrCast(output.ptr))[0 .. output.len * 4]; - - const static = struct { - const swizzle_table = [256]u8{ - 0x00, 0x40, 0x80, 0xC0, 0x10, 0x50, 0x90, 0xD0, 0x20, 0x60, 0xA0, 0xE0, 0x30, 0x70, 0xB0, 0xF0, - 0x04, 0x44, 0x84, 0xC4, 0x14, 0x54, 0x94, 0xD4, 0x24, 0x64, 0xA4, 0xE4, 0x34, 0x74, 0xB4, 0xF4, - 0x08, 0x48, 0x88, 0xC8, 0x18, 0x58, 0x98, 0xD8, 0x28, 0x68, 0xA8, 0xE8, 0x38, 0x78, 0xB8, 0xF8, - 0x0C, 0x4C, 0x8C, 0xCC, 0x1C, 0x5C, 0x9C, 0xDC, 0x2C, 0x6C, 0xAC, 0xEC, 0x3C, 0x7C, 0xBC, 0xFC, - 0x01, 0x41, 0x81, 0xC1, 0x11, 0x51, 0x91, 0xD1, 0x21, 0x61, 0xA1, 0xE1, 0x31, 0x71, 0xB1, 0xF1, - 0x05, 0x45, 0x85, 0xC5, 0x15, 0x55, 0x95, 0xD5, 0x25, 0x65, 0xA5, 0xE5, 0x35, 0x75, 0xB5, 0xF5, - 0x09, 0x49, 0x89, 0xC9, 0x19, 0x59, 0x99, 0xD9, 0x29, 0x69, 0xA9, 0xE9, 0x39, 0x79, 0xB9, 0xF9, - 0x0D, 0x4D, 0x8D, 0xCD, 0x1D, 0x5D, 0x9D, 0xDD, 0x2D, 0x6D, 0xAD, 0xED, 0x3D, 0x7D, 0xBD, 0xFD, - 0x02, 0x42, 0x82, 0xC2, 0x12, 0x52, 0x92, 0xD2, 0x22, 0x62, 0xA2, 0xE2, 0x32, 0x72, 0xB2, 0xF2, - 0x06, 0x46, 0x86, 0xC6, 0x16, 0x56, 0x96, 0xD6, 0x26, 0x66, 0xA6, 0xE6, 0x36, 0x76, 0xB6, 0xF6, - 0x0A, 0x4A, 0x8A, 0xCA, 0x1A, 0x5A, 0x9A, 0xDA, 0x2A, 0x6A, 0xAA, 0xEA, 0x3A, 0x7A, 0xBA, 0xFA, - 0x0E, 0x4E, 0x8E, 0xCE, 0x1E, 0x5E, 0x9E, 0xDE, 0x2E, 0x6E, 0xAE, 0xEE, 0x3E, 0x7E, 0xBE, 0xFE, - 0x03, 0x43, 0x83, 0xC3, 0x13, 0x53, 0x93, 0xD3, 0x23, 0x63, 0xA3, 0xE3, 0x33, 0x73, 0xB3, 0xF3, - 0x07, 0x47, 0x87, 0xC7, 0x17, 0x57, 0x97, 0xD7, 0x27, 0x67, 0xA7, 0xE7, 0x37, 0x77, 0xB7, 0xF7, - 0x0B, 0x4B, 0x8B, 0xCB, 0x1B, 0x5B, 0x9B, 0xDB, 0x2B, 0x6B, 0xAB, 0xEB, 0x3B, 0x7B, 0xBB, 0xFB, - 0x0F, 0x4F, 0x8F, 0xCF, 0x1F, 0x5F, 0x9F, 0xDF, 0x2F, 0x6F, 0xAF, 0xEF, 0x3F, 0x7F, 0xBF, 0xFF, - }; - }; - - if ((log2_length & 1) == 0) { - const rev32 = @as(u6, @intCast(32 - log2_length)); - var index: usize = 0; - while (index < length) : (index += 1) { - const n = index * 4; - const addr = - (@as(usize, @intCast(static.swizzle_table[n & 0xff])) << 24) | - (@as(usize, @intCast(static.swizzle_table[(n >> 8) & 0xff])) << 16) | - (@as(usize, @intCast(static.swizzle_table[(n >> 16) & 0xff])) << 8) | - @as(usize, @intCast(static.swizzle_table[(n >> 24) & 0xff])); - f32_output[addr >> rev32] = input[index][0]; - f32_output[(0x40000000 | addr) >> rev32] = input[index][1]; - f32_output[(0x80000000 | addr) >> rev32] = input[index][2]; - f32_output[(0xC0000000 | addr) >> rev32] = input[index][3]; - } - } else { - const rev7 = @as(usize, 1) << @as(u6, @intCast(log2_length - 3)); - const rev32 = @as(u6, @intCast(32 - (log2_length - 3))); - var index: usize = 0; - while (index < length) : (index += 1) { - const n = index / 2; - var addr = - (((@as(usize, @intCast(static.swizzle_table[n & 0xff])) << 24) | - (@as(usize, @intCast(static.swizzle_table[(n >> 8) & 0xff])) << 16) | - (@as(usize, @intCast(static.swizzle_table[(n >> 16) & 0xff])) << 8) | - (@as(usize, @intCast(static.swizzle_table[(n >> 24) & 0xff])))) >> rev32) | - ((index & 1) * rev7 * 4); - f32_output[addr] = input[index][0]; - addr += rev7; - f32_output[addr] = input[index][1]; - addr += rev7; - f32_output[addr] = input[index][2]; - addr += rev7; - f32_output[addr] = input[index][3]; - } - } -} - -pub fn fftInitUnityTable(out_unity_table: []F32x4) void { - assert(std.math.isPowerOfTwo(out_unity_table.len)); - assert(out_unity_table.len >= 32 and out_unity_table.len <= 512); - - var unity_table = out_unity_table; - - const v0123 = f32x4(0.0, 1.0, 2.0, 3.0); - var length = out_unity_table.len / 4; - var vlstep = f32x4s(0.5 * math.pi / @as(f32, @floatFromInt(length))); - - while (true) { - length /= 4; - var vjp = v0123; - - var j: u32 = 0; - while (j < length) : (j += 1) { - unity_table[j] = f32x4s(1.0); - unity_table[j + length * 4] = f32x4s(0.0); - - var vls = vjp * vlstep; - var sin_cos = sincos(vls); - unity_table[j + length] = sin_cos[1]; - unity_table[j + length * 5] = sin_cos[0] * f32x4s(-1.0); - - var vijp = vjp + vjp; - vls = vijp * vlstep; - sin_cos = sincos(vls); - unity_table[j + length * 2] = sin_cos[1]; - unity_table[j + length * 6] = sin_cos[0] * f32x4s(-1.0); - - vijp = vijp + vjp; - vls = vijp * vlstep; - sin_cos = sincos(vls); - unity_table[j + length * 3] = sin_cos[1]; - unity_table[j + length * 7] = sin_cos[0] * f32x4s(-1.0); - - vjp += f32x4s(4.0); - } - vlstep *= f32x4s(4.0); - unity_table = unity_table[8 * length ..]; - - if (length <= 4) - break; - } -} - -pub fn fft(re: []F32x4, im: []F32x4, unity_table: []const F32x4) void { - const length = @as(u32, @intCast(re.len * 4)); - assert(std.math.isPowerOfTwo(length)); - assert(length >= 4 and length <= 512); - assert(re.len == im.len); - - var re_temp_storage: [128]F32x4 = undefined; - var im_temp_storage: [128]F32x4 = undefined; - const re_temp = re_temp_storage[0..re.len]; - const im_temp = im_temp_storage[0..im.len]; - - @memcpy(re_temp, re); - @memcpy(im_temp, im); - - if (length > 16) { - assert(unity_table.len == length); - fftN(re_temp, im_temp, unity_table, length, 1); - } else if (length == 16) { - fft16(re_temp, im_temp, 1); - } else if (length == 8) { - fft8(re_temp, im_temp, 1); - } else if (length == 4) { - fft4(re_temp, im_temp, 1); - } - - fftUnswizzle(re_temp, re); - fftUnswizzle(im_temp, im); -} - -pub fn ifft(re: []F32x4, im: []const F32x4, unity_table: []const F32x4) void { - const length = @as(u32, @intCast(re.len * 4)); - assert(std.math.isPowerOfTwo(length)); - assert(length >= 4 and length <= 512); - assert(re.len == im.len); - - var re_temp_storage: [128]F32x4 = undefined; - var im_temp_storage: [128]F32x4 = undefined; - var re_temp = re_temp_storage[0..re.len]; - var im_temp = im_temp_storage[0..im.len]; - - const rnp = f32x4s(1.0 / @as(f32, @floatFromInt(length))); - const rnm = f32x4s(-1.0 / @as(f32, @floatFromInt(length))); - - for (re, 0..) |_, i| { - re_temp[i] = re[i] * rnp; - im_temp[i] = im[i] * rnm; - } - - if (length > 16) { - assert(unity_table.len == length); - fftN(re_temp, im_temp, unity_table, length, 1); - } else if (length == 16) { - fft16(re_temp, im_temp, 1); - } else if (length == 8) { - fft8(re_temp, im_temp, 1); - } else if (length == 4) { - fft4(re_temp, im_temp, 1); - } - - fftUnswizzle(re_temp, re); -} -test "zmath.ifft" { - var unity_table: [512]F32x4 = undefined; - const epsilon = 0.0001; - - // 64 samples - { - var re = [_]F32x4{ - f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), - f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), - f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), - f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), - f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), - f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), - f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), - f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), - }; - var im = [_]F32x4{ - f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), - f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), - f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), - f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), - }; - - fftInitUnityTable(unity_table[0..64]); - fft(re[0..], im[0..], unity_table[0..64]); - - try expectVecApproxEqAbs(re[0], f32x4(1056.0, 0.0, -32.0, 0.0), epsilon); - var i: u32 = 1; - while (i < 16) : (i += 1) { - try expectVecApproxEqAbs(re[i], f32x4(-32.0, 0.0, -32.0, 0.0), epsilon); - } - - ifft(re[0..], im[0..], unity_table[0..64]); - - try expectVecApproxEqAbs(re[0], f32x4(1.0, 2.0, 3.0, 4.0), epsilon); - try expectVecApproxEqAbs(re[1], f32x4(5.0, 6.0, 7.0, 8.0), epsilon); - try expectVecApproxEqAbs(re[2], f32x4(9.0, 10.0, 11.0, 12.0), epsilon); - try expectVecApproxEqAbs(re[3], f32x4(13.0, 14.0, 15.0, 16.0), epsilon); - try expectVecApproxEqAbs(re[4], f32x4(17.0, 18.0, 19.0, 20.0), epsilon); - try expectVecApproxEqAbs(re[5], f32x4(21.0, 22.0, 23.0, 24.0), epsilon); - try expectVecApproxEqAbs(re[6], f32x4(25.0, 26.0, 27.0, 28.0), epsilon); - try expectVecApproxEqAbs(re[7], f32x4(29.0, 30.0, 31.0, 32.0), epsilon); - } - - // 512 samples - { - var re: [128]F32x4 = undefined; - var im = [_]F32x4{f32x4s(0.0)} ** 128; - - for (&re, 0..) |*v, i| { - const f = @as(f32, @floatFromInt(i * 4)); - v.* = f32x4(f + 1.0, f + 2.0, f + 3.0, f + 4.0); - } - - fftInitUnityTable(unity_table[0..512]); - fft(re[0..], im[0..], unity_table[0..512]); - - for (re, 0..) |v, i| { - const f = @as(f32, @floatFromInt(i * 4)); - try expect(!approxEqAbs(v, f32x4(f + 1.0, f + 2.0, f + 3.0, f + 4.0), epsilon)); - } - - ifft(re[0..], im[0..], unity_table[0..512]); - - for (re, 0..) |v, i| { - const f = @as(f32, @floatFromInt(i * 4)); - try expectVecApproxEqAbs(v, f32x4(f + 1.0, f + 2.0, f + 3.0, f + 4.0), epsilon); - } - } -} -// ------------------------------------------------------------------------------ -// -// Private functions and constants -// -// ------------------------------------------------------------------------------ -const f32x4_sign_mask1: F32x4 = F32x4{ @as(f32, @bitCast(@as(u32, 0x8000_0000))), 0, 0, 0 }; -const f32x4_mask2: F32x4 = F32x4{ - @as(f32, @bitCast(@as(u32, 0xffff_ffff))), - @as(f32, @bitCast(@as(u32, 0xffff_ffff))), - 0, - 0, -}; -const f32x4_mask3: F32x4 = F32x4{ - @as(f32, @bitCast(@as(u32, 0xffff_ffff))), - @as(f32, @bitCast(@as(u32, 0xffff_ffff))), - @as(f32, @bitCast(@as(u32, 0xffff_ffff))), - 0, -}; - -inline fn splatNegativeZero(comptime T: type) T { - return @splat(@as(f32, @bitCast(@as(u32, 0x8000_0000)))); -} -inline fn splatNoFraction(comptime T: type) T { - return @splat(@as(f32, 8_388_608.0)); -} -inline fn splatAbsMask(comptime T: type) T { - return @splat(@as(f32, @bitCast(@as(u32, 0x7fff_ffff)))); -} - -fn floatToIntAndBack(v: anytype) @TypeOf(v) { - // This routine won't handle nan, inf and numbers greater than 8_388_608.0 (will generate undefined values). - @setRuntimeSafety(false); - - const T = @TypeOf(v); - const len = veclen(T); - - var vi32: [len]i32 = undefined; - comptime var i: u32 = 0; - // vcvttps2dq - inline while (i < len) : (i += 1) { - vi32[i] = @as(i32, @intFromFloat(v[i])); - } - - var vf32: [len]f32 = undefined; - i = 0; - // vcvtdq2ps - inline while (i < len) : (i += 1) { - vf32[i] = @as(f32, @floatFromInt(vi32[i])); - } - - return vf32; -} -test "zmath.floatToIntAndBack" { - { - const v = floatToIntAndBack(f32x4(1.1, 2.9, 3.0, -4.5)); - try expectVecEqual(v, f32x4(1.0, 2.0, 3.0, -4.0)); - } - { - const v = floatToIntAndBack(f32x8(1.1, 2.9, 3.0, -4.5, 2.5, -2.5, 1.1, -100.2)); - try expectVecEqual(v, f32x8(1.0, 2.0, 3.0, -4.0, 2.0, -2.0, 1.0, -100.0)); - } - { - const v = floatToIntAndBack(f32x4(math.inf(f32), 2.9, math.nan(f32), math.snan(f32))); - try expect(v[1] == 2.0); - } -} - -pub fn expectVecEqual(expected: anytype, actual: anytype) !void { - const T = @TypeOf(expected, actual); - inline for (0..veclen(T)) |i| { - try std.testing.expectEqual(expected[i], actual[i]); - } -} - -pub fn expectVecApproxEqAbs(expected: anytype, actual: anytype, eps: f32) !void { - const T = @TypeOf(expected, actual); - inline for (0..veclen(T)) |i| { - try std.testing.expectApproxEqAbs(expected[i], actual[i], eps); - } -} - -pub fn approxEqAbs(v0: anytype, v1: anytype, eps: f32) bool { - const T = @TypeOf(v0, v1); - comptime var i: comptime_int = 0; - inline while (i < veclen(T)) : (i += 1) { - if (!math.approxEqAbs(f32, v0[i], v1[i], eps)) { - return false; - } - } - return true; -} - -// ------------------------------------------------------------------------------ -// This software is available under 2 licenses -- choose whichever you prefer. -// ------------------------------------------------------------------------------ -// ALTERNATIVE A - MIT License -// Copyright (c) 2022 Michal Ziulek and Contributors -// Permission is hereby granted, free of charge, to any person obtaining a copy of -// this software and associated documentation files (the "Software"), to deal in -// the Software without restriction, including without limitation the rights to -// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -// of the Software, and to permit persons to whom the Software is furnished to do -// so, subject to the following conditions: -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. -// ------------------------------------------------------------------------------ -// ALTERNATIVE B - Public Domain (www.unlicense.org) -// This is free and unencumbered software released into the public domain. -// Anyone is free to copy, modify, publish, use, compile, sell, or distribute this -// software, either in source code form or as a compiled binary, for any purpose, -// commercial or non-commercial, and by any means. -// In jurisdictions that recognize copyright laws, the author or authors of this -// software dedicate any and all copyright interest in the software to the public -// domain. We make this dedication for the benefit of the public at large and to -// the detriment of our heirs and successors. We intend this dedication to be an -// overt act of relinquishment in perpetuity of all present and future rights to -// this software under copyright law. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN -// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION -// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -// ------------------------------------------------------------------------------ From 940b6d5ea7288c2f2426205a9a45dd526bbcb389 Mon Sep 17 00:00:00 2001 From: kk Date: Mon, 24 Mar 2025 13:55:41 +0800 Subject: [PATCH 3/3] Fix how the comments are structured --- src/main.zig | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/src/main.zig b/src/main.zig index fa5a5cd..3f67b59 100644 --- a/src/main.zig +++ b/src/main.zig @@ -1,24 +1,10 @@ -//-------------------------------------------------------------------------------------------------- +// ============================================================================== // // SIMD math library for game developers // https://github.com/michal-z/zig-gamedev/tree/main/libs/zmath // -// See zmath.zig for more details. // See util.zig for additional functionality. // -//-------------------------------------------------------------------------------------------------- -pub const util = @import("util.zig"); - -// ensure transitive closure of test coverage -comptime { - _ = util; -} - -// ============================================================================== -// -// SIMD math library for game developers -// https://github.com/michal-z/zig-gamedev/tree/main/libs/zmath -// // Should work on all OSes supported by Zig. Works on x86_64 and ARM. // Provides ~140 optimized routines and ~70 extensive tests. // Can be used with any graphics API. @@ -278,6 +264,13 @@ comptime { // // ============================================================================== +pub const util = @import("util.zig"); + +// ensure transitive closure of test coverage +comptime { + _ = util; +} + // Fundamental types pub const F32x4 = @Vector(4, f32); pub const F32x8 = @Vector(8, f32);