From 8ea6712019184dabec4d7108216c7ba1e88c6c61 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 7 Feb 2025 01:46:06 +0000 Subject: [PATCH 01/38] avx512 multiply --- src/Nethermind.Int256/UInt256.cs | 143 ++++++++++++++++++++++++++++--- 1 file changed, 130 insertions(+), 13 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 75a4c73..454e16c 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1010,25 +1010,142 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) Unsafe.AsRef(in res.u1) = high; return; } + + if (!Avx512F.IsSupported || !Avx512DQ.IsSupported) + { + ref ulong rx = ref Unsafe.As(ref Unsafe.AsRef(in x)); + ref ulong ry = ref Unsafe.As(ref Unsafe.AsRef(in y)); - ref ulong rx = ref Unsafe.As(ref Unsafe.AsRef(in x)); - ref ulong ry = ref Unsafe.As(ref Unsafe.AsRef(in y)); + (ulong carry, ulong r0) = Multiply64(rx, ry); + UmulHop(carry, Unsafe.Add(ref rx, 1), ry, out carry, out ulong res1); + UmulHop(carry, Unsafe.Add(ref rx, 2), ry, out carry, out ulong res2); + ulong res3 = Unsafe.Add(ref rx, 3) * ry + carry; - (ulong carry, ulong r0) = Multiply64(rx, ry); - UmulHop(carry, Unsafe.Add(ref rx, 1), ry, out carry, out ulong res1); - UmulHop(carry, Unsafe.Add(ref rx, 2), ry, out carry, out ulong res2); - ulong res3 = Unsafe.Add(ref rx, 3) * ry + carry; + UmulHop(res1, rx, Unsafe.Add(ref ry, 1), out carry, out ulong r1); + UmulStep(res2, Unsafe.Add(ref rx, 1), Unsafe.Add(ref ry, 1), carry, out carry, out res2); + res3 = res3 + Unsafe.Add(ref rx, 2) * Unsafe.Add(ref ry, 1) + carry; - UmulHop(res1, rx, Unsafe.Add(ref ry, 1), out carry, out ulong r1); - UmulStep(res2, Unsafe.Add(ref rx, 1), Unsafe.Add(ref ry, 1), carry, out carry, out res2); - res3 = res3 + Unsafe.Add(ref rx, 2) * Unsafe.Add(ref ry, 1) + carry; + UmulHop(res2, rx, Unsafe.Add(ref ry, 2), out carry, out ulong r2); + res3 = res3 + Unsafe.Add(ref rx, 1) * Unsafe.Add(ref ry, 2) + carry; - UmulHop(res2, rx, Unsafe.Add(ref ry, 2), out carry, out ulong r2); - res3 = res3 + Unsafe.Add(ref rx, 1) * Unsafe.Add(ref ry, 2) + carry; + ulong r3 = res3 + rx * Unsafe.Add(ref ry, 3); - ulong r3 = res3 + rx * Unsafe.Add(ref ry, 3); + res = new UInt256(r0, r1, r2, r3); + } + else + { - res = new UInt256(r0, r1, r2, r3); + // Unpack the four 64-bit limbs (little-endian: u0 is least-significant) + ulong a0 = x.u0, a1 = x.u1, a2 = x.u2, a3 = x.u3; + ulong b0 = y.u0, b1 = y.u1, b2 = y.u2, b3 = y.u3; + + // --- Compute the 10 64x64–bit products using our vectorized method --- + + // Group 1: 8 products + Vector512 vecA1 = Vector512.Create(a0, a0, a1, a0, a1, a2, a0, a1); + Vector512 vecB1 = Vector512.Create(b0, b1, b0, b2, b1, b0, b3, b2); + Mul64Vector(vecA1, vecB1, out Vector512 lo1, out Vector512 hi1); + + // Extract products from group1 + ulong P00_lo = lo1.GetElement(0), P00_hi = hi1.GetElement(0); + ulong P01_lo = lo1.GetElement(1), P01_hi = hi1.GetElement(1); + ulong P10_lo = lo1.GetElement(2), P10_hi = hi1.GetElement(2); + ulong P02_lo = lo1.GetElement(3), P02_hi = hi1.GetElement(3); + ulong P11_lo = lo1.GetElement(4), P11_hi = hi1.GetElement(4); + ulong P20_lo = lo1.GetElement(5), P20_hi = hi1.GetElement(5); + ulong P03_lo = lo1.GetElement(6), P03_hi = hi1.GetElement(6); + ulong P12_lo = lo1.GetElement(7), P12_hi = hi1.GetElement(7); + + // Group 2: 2 products + Vector512 vecA2 = Vector512.Create(a2, a3, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL); + Vector512 vecB2 = Vector512.Create(b1, b0, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL); + Mul64Vector(vecA2, 
vecB2, out Vector512 lo2, out Vector512 hi2); + ulong P21_lo = lo2.GetElement(0); // P21_hi is not needed (contributes only above 256 bits) + ulong P30_lo = lo2.GetElement(1); // Likewise for P30_hi + + // --- Package each 128-bit partial product into a UInt256 (with proper shifting) --- + // (Recall: a 128–bit product is given as (lo, hi), where lo is the lower 64 bits and hi the upper 64 bits.) + + // P00 (no shift) + UInt256 part0 = new UInt256(P00_lo, P00_hi, 0, 0); + + // P01 and P10 (each shifted left by 64 bits) + UInt256 part64a = new UInt256(0, P01_lo, P01_hi, 0); + UInt256 part64b = new UInt256(0, P10_lo, P10_hi, 0); + UInt256 sum64; + AddImpl(part64a, part64b, out sum64); + + // P02, P11 and P20 (each shifted left by 128 bits) + UInt256 part128a = new UInt256(0, 0, P02_lo, P02_hi); + UInt256 part128b = new UInt256(0, 0, P11_lo, P11_hi); + UInt256 part128c = new UInt256(0, 0, P20_lo, P20_hi); + UInt256 sum128, temp; + AddImpl(part128a, part128b, out temp); + AddImpl(temp, part128c, out sum128); + + // P03, P12, P21 and P30 (shifted left by 192 bits – note only the low 64 bits matter) + UInt256 part192a = new UInt256(0, 0, 0, P03_lo); + UInt256 part192b = new UInt256(0, 0, 0, P12_lo); + UInt256 part192c = new UInt256(0, 0, 0, P21_lo); + UInt256 part192d = new UInt256(0, 0, 0, P30_lo); + UInt256 sum192; + AddImpl(part192a, part192b, out temp); + AddImpl(temp, part192c, out temp); + AddImpl(temp, part192d, out sum192); + + // --- Sum all the partial products using AddImpl --- + UInt256 intermediate; + AddImpl(part0, sum64, out intermediate); + AddImpl(intermediate, sum128, out intermediate); + AddImpl(intermediate, sum192, out res); + } + } + + + // Vectorized 64x64 multiply: given vectors 'a' and 'b' (each 8 lanes), + // computes per lane: + // product = a * b = (hi, lo) + // using the splitting method since there is no MultiplyHigh intrinsic. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Mul64Vector(Vector512 a, Vector512 b, + out Vector512 lo, out Vector512 hi) + { + // Mask for the lower 32 bits. + Vector512 mask32 = Vector512.Create(0xFFFFFFFFUL); + + // Split each 64-bit operand into 32-bit halves: + // a0 = lower 32 bits, a1 = upper 32 bits + Vector512 a0 = Avx512F.And(a, mask32); + Vector512 a1 = Avx512F.ShiftRightLogical(a, 32); + Vector512 b0 = Avx512F.And(b, mask32); + Vector512 b1 = Avx512F.ShiftRightLogical(b, 32); + + // Compute the four 32x32 partial products. + // Each multiplication here is on 32-bit values, so the result fits in 64 bits. + Vector512 u0 = Avx512DQ.MultiplyLow(a0, b0); // a0 * b0 + Vector512 u1 = Avx512DQ.MultiplyLow(a0, b1); // a0 * b1 + Vector512 u2 = Avx512DQ.MultiplyLow(a1, b0); // a1 * b0 + Vector512 u3 = Avx512DQ.MultiplyLow(a1, b1); // a1 * b1 + + // Now, compute t = (u0 >> 32) + (u1 & mask32) + (u2 & mask32) + Vector512 u0_hi = Avx512F.ShiftRightLogical(u0, 32); + Vector512 u1_lo = Avx512F.And(u1, mask32); + Vector512 u2_lo = Avx512F.And(u2, mask32); + Vector512 t = Avx512F.Add(Avx512F.Add(u0_hi, u1_lo), u2_lo); + + // The extra carry: c = t >> 32. + Vector512 c = Avx512F.ShiftRightLogical(t, 32); + + // Now, assemble the lower 64 bits: + // low part of u0 is u0 & mask32; low 32 bits of t are (t & mask32) shifted left 32. + Vector512 u0_lo = Avx512F.And(u0, mask32); + Vector512 t_lo = Avx512F.And(t, mask32); + lo = Avx512F.Or(u0_lo, Avx512F.ShiftLeftLogical(t_lo, 32)); + + // The high 64 bits are: u3 + (u1 >> 32) + (u2 >> 32) + c. 
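// Why the formulas above and below are correct (a short derivation sketch, using the standard
// 32-bit schoolbook split; M denotes the low-32-bit mask):
//   a = a1*2^32 + a0,  b = b1*2^32 + b0
//   a*b = u3*2^64 + (u1 + u2)*2^32 + u0   where u0 = a0*b0, u1 = a0*b1, u2 = a1*b0, u3 = a1*b1
// Folding the low halves of u1 and u2 together with the carry out of u0 gives
//   t = (u0 >> 32) + (u1 & M) + (u2 & M)  (at most about 3*2^32, so it fits in 64 bits)
// The low 64 bits of the product are (u0 & M) | (t << 32), and c = t >> 32 is the carry into the
// high half, so hi = u3 + (u1 >> 32) + (u2 >> 32) + c, which is exactly what follows.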
+ Vector512 u1_hi = Avx512F.ShiftRightLogical(u1, 32); + Vector512 u2_hi = Avx512F.ShiftRightLogical(u2, 32); + hi = Avx512F.Add(Avx512F.Add(Avx512F.Add(u3, u1_hi), u2_hi), c); } public void Multiply(in UInt256 a, out UInt256 res) => Multiply(this, a, out res); From f69cad2098d4ee99631c2d86b16a01a37cd01bcf Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 7 Feb 2025 02:36:22 +0000 Subject: [PATCH 02/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 55 ++++++++++++++------------------ 1 file changed, 24 insertions(+), 31 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 454e16c..d59efed 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1034,19 +1034,19 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) } else { - + // Vectorized branch using AVX-512. // Unpack the four 64-bit limbs (little-endian: u0 is least-significant) ulong a0 = x.u0, a1 = x.u1, a2 = x.u2, a3 = x.u3; ulong b0 = y.u0, b1 = y.u1, b2 = y.u2, b3 = y.u3; - // --- Compute the 10 64x64–bit products using our vectorized method --- + // --- Compute the 10 64x64–bit partial products using our vectorized method --- // Group 1: 8 products Vector512 vecA1 = Vector512.Create(a0, a0, a1, a0, a1, a2, a0, a1); Vector512 vecB1 = Vector512.Create(b0, b1, b0, b2, b1, b0, b3, b2); Mul64Vector(vecA1, vecB1, out Vector512 lo1, out Vector512 hi1); - // Extract products from group1 + // Extract products from group 1. ulong P00_lo = lo1.GetElement(0), P00_hi = hi1.GetElement(0); ulong P01_lo = lo1.GetElement(1), P01_hi = hi1.GetElement(1); ulong P10_lo = lo1.GetElement(2), P10_hi = hi1.GetElement(2); @@ -1056,51 +1056,44 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) ulong P03_lo = lo1.GetElement(6), P03_hi = hi1.GetElement(6); ulong P12_lo = lo1.GetElement(7), P12_hi = hi1.GetElement(7); - // Group 2: 2 products - Vector512 vecA2 = Vector512.Create(a2, a3, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL); - Vector512 vecB2 = Vector512.Create(b1, b0, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL); - Mul64Vector(vecA2, vecB2, out Vector512 lo2, out Vector512 hi2); - ulong P21_lo = lo2.GetElement(0); // P21_hi is not needed (contributes only above 256 bits) - ulong P30_lo = lo2.GetElement(1); // Likewise for P30_hi - // --- Package each 128-bit partial product into a UInt256 (with proper shifting) --- - // (Recall: a 128–bit product is given as (lo, hi), where lo is the lower 64 bits and hi the upper 64 bits.) - // P00 (no shift) + // Group with no shift. UInt256 part0 = new UInt256(P00_lo, P00_hi, 0, 0); - // P01 and P10 (each shifted left by 64 bits) + // Group shifted left by 64 bits. UInt256 part64a = new UInt256(0, P01_lo, P01_hi, 0); UInt256 part64b = new UInt256(0, P10_lo, P10_hi, 0); UInt256 sum64; AddImpl(part64a, part64b, out sum64); - // P02, P11 and P20 (each shifted left by 128 bits) + // Group shifted left by 128 bits. 
UInt256 part128a = new UInt256(0, 0, P02_lo, P02_hi); UInt256 part128b = new UInt256(0, 0, P11_lo, P11_hi); UInt256 part128c = new UInt256(0, 0, P20_lo, P20_hi); - UInt256 sum128, temp; - AddImpl(part128a, part128b, out temp); - AddImpl(temp, part128c, out sum128); - - // P03, P12, P21 and P30 (shifted left by 192 bits – note only the low 64 bits matter) - UInt256 part192a = new UInt256(0, 0, 0, P03_lo); - UInt256 part192b = new UInt256(0, 0, 0, P12_lo); - UInt256 part192c = new UInt256(0, 0, 0, P21_lo); - UInt256 part192d = new UInt256(0, 0, 0, P30_lo); - UInt256 sum192; - AddImpl(part192a, part192b, out temp); - AddImpl(temp, part192c, out temp); - AddImpl(temp, part192d, out sum192); - - // --- Sum all the partial products using AddImpl --- + UInt256 sum128, temp256; + AddImpl(part128a, part128b, out temp256); + AddImpl(temp256, part128c, out sum128); + + + // Group 2: 2 products + Vector512 vecA2 = Vector512.Create(a2, a3, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL); + Vector512 vecB2 = Vector512.Create(b1, b0, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL); + Mul64Vector(vecA2, vecB2, out Vector512 lo2, out Vector512 hi2); + ulong P21_lo = lo2.GetElement(0); // Only lower 64 bits matter. + ulong P30_lo = lo2.GetElement(1); + + // Group shifted left by 192 bits – only the lower 64 bits contribute. + // Any carry is discarded, so just use normal addition. + UInt256 part192256 = new UInt256(0, 0, 0, (P03_lo + P12_lo + P21_lo + P30_lo)); + + // --- Sum all the partial products using the proven UInt256 adder (AddImpl) --- UInt256 intermediate; AddImpl(part0, sum64, out intermediate); AddImpl(intermediate, sum128, out intermediate); - AddImpl(intermediate, sum192, out res); + AddImpl(intermediate, part192256, out res); } } - // Vectorized 64x64 multiply: given vectors 'a' and 'b' (each 8 lanes), // computes per lane: From e8c03a6fa78a647921c589ed1cbb423a83c26841 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 7 Feb 2025 02:54:58 +0000 Subject: [PATCH 03/38] optimize --- src/Nethermind.Int256/UInt256.cs | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index d59efed..d8e5656 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1034,16 +1034,17 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) } else { + Vector256 vecA = Unsafe.As>(ref Unsafe.AsRef(in x)); + Vector256 vecB = Unsafe.As>(ref Unsafe.AsRef(in y)); + // Vectorized branch using AVX-512. // Unpack the four 64-bit limbs (little-endian: u0 is least-significant) - ulong a0 = x.u0, a1 = x.u1, a2 = x.u2, a3 = x.u3; - ulong b0 = y.u0, b1 = y.u1, b2 = y.u2, b3 = y.u3; // --- Compute the 10 64x64–bit partial products using our vectorized method --- // Group 1: 8 products - Vector512 vecA1 = Vector512.Create(a0, a0, a1, a0, a1, a2, a0, a1); - Vector512 vecB1 = Vector512.Create(b0, b1, b0, b2, b1, b0, b3, b2); + Vector512 vecA1 = Vector512.Create(Avx2.Permute4x64(vecA, 16), Avx2.Permute4x64(vecA, 73)); + Vector512 vecB1 = Vector512.Create(Avx2.Permute4x64(vecB, 132), Avx2.Permute4x64(vecB, 177)); Mul64Vector(vecA1, vecB1, out Vector512 lo1, out Vector512 hi1); // Extract products from group 1. 
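// Decoding sketch for the permute immediates (assuming the usual Permute4x64 semantics,
// result[i] = source[(imm >> 2*i) & 3]); this reproduces the limb order of the removed
// Vector512.Create calls above:
//   Permute4x64(vecA, 16)  -> (a0, a0, a1, a0)    16  = 0b00_01_00_00
//   Permute4x64(vecA, 73)  -> (a1, a2, a0, a1)    73  = 0b01_00_10_01
//   Permute4x64(vecB, 132) -> (b0, b1, b0, b2)    132 = 0b10_00_01_00
//   Permute4x64(vecB, 177) -> (b1, b0, b3, b2)    177 = 0b10_11_00_01
// so vecA1 = (a0,a0,a1,a0,a1,a2,a0,a1) and vecB1 = (b0,b1,b0,b2,b1,b0,b3,b2) as before, e.g.
//   Debug.Assert(Avx2.Permute4x64(vecA, 16) == Vector256.Create(x.u0, x.u0, x.u1, x.u0));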
@@ -1077,6 +1078,8 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) // Group 2: 2 products + ulong a2 = x.u2, a3 = x.u3; + ulong b0 = y.u0, b1 = y.u1; Vector512 vecA2 = Vector512.Create(a2, a3, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL); Vector512 vecB2 = Vector512.Create(b1, b0, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL); Mul64Vector(vecA2, vecB2, out Vector512 lo2, out Vector512 hi2); From 30aea1a50657f39d039a7211b07d4a02e0ad3e4a Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 7 Feb 2025 02:59:03 +0000 Subject: [PATCH 04/38] optimize --- src/Nethermind.Int256/UInt256.cs | 41 +++++++++++++++++--------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index d8e5656..76b358b 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1059,22 +1059,28 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) // --- Package each 128-bit partial product into a UInt256 (with proper shifting) --- - // Group with no shift. - UInt256 part0 = new UInt256(P00_lo, P00_hi, 0, 0); - - // Group shifted left by 64 bits. - UInt256 part64a = new UInt256(0, P01_lo, P01_hi, 0); - UInt256 part64b = new UInt256(0, P10_lo, P10_hi, 0); - UInt256 sum64; - AddImpl(part64a, part64b, out sum64); - - // Group shifted left by 128 bits. - UInt256 part128a = new UInt256(0, 0, P02_lo, P02_hi); - UInt256 part128b = new UInt256(0, 0, P11_lo, P11_hi); - UInt256 part128c = new UInt256(0, 0, P20_lo, P20_hi); - UInt256 sum128, temp256; - AddImpl(part128a, part128b, out temp256); - AddImpl(temp256, part128c, out sum128); + UInt256 intermediate; + { + // Group with no shift. + UInt256 part0 = new UInt256(P00_lo, P00_hi, 0, 0); + + // Group shifted left by 64 bits. + UInt256 part64a = new UInt256(0, P01_lo, P01_hi, 0); + UInt256 part64b = new UInt256(0, P10_lo, P10_hi, 0); + UInt256 sum64; + AddImpl(part64a, part64b, out sum64); + AddImpl(part0, sum64, out intermediate); + } + { + // Group shifted left by 128 bits. 
+ UInt256 part128a = new UInt256(0, 0, P02_lo, P02_hi); + UInt256 part128b = new UInt256(0, 0, P11_lo, P11_hi); + UInt256 part128c = new UInt256(0, 0, P20_lo, P20_hi); + UInt256 sum128, temp256; + AddImpl(part128a, part128b, out temp256); + AddImpl(temp256, part128c, out sum128); + AddImpl(intermediate, sum128, out intermediate); + } // Group 2: 2 products @@ -1091,9 +1097,6 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) UInt256 part192256 = new UInt256(0, 0, 0, (P03_lo + P12_lo + P21_lo + P30_lo)); // --- Sum all the partial products using the proven UInt256 adder (AddImpl) --- - UInt256 intermediate; - AddImpl(part0, sum64, out intermediate); - AddImpl(intermediate, sum128, out intermediate); AddImpl(intermediate, part192256, out res); } } From 11428ea467d9601391ccf6190ee29c8bc0bf84d9 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 7 Feb 2025 03:33:31 +0000 Subject: [PATCH 05/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 76b358b..a579ff7 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1084,20 +1084,26 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) // Group 2: 2 products - ulong a2 = x.u2, a3 = x.u3; - ulong b0 = y.u0, b1 = y.u1; - Vector512 vecA2 = Vector512.Create(a2, a3, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL); - Vector512 vecB2 = Vector512.Create(b1, b0, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL); - Mul64Vector(vecA2, vecB2, out Vector512 lo2, out Vector512 hi2); - ulong P21_lo = lo2.GetElement(0); // Only lower 64 bits matter. + // Pack the two 64-bit values from x into a Vector128 + Vector128 vecA2 = Vector128.Create(x.u2, x.u3); + // Pack the two 64-bit values from y into a Vector128 in the required order. + // Here, we want lane 0 to contain b1 (for P21_lo) and lane 1 to contain b0 (for P30_lo). + Vector128 vecB2 = Vector128.Create(y.u1, y.u0); + + // Use MultiplyLow to multiply corresponding lanes and keep only the lower 64 bits. + Vector128 lo2 = Avx512DQ.VL.MultiplyLow(vecA2, vecB2); + + // Extract the results: + ulong P21_lo = lo2.GetElement(0); ulong P30_lo = lo2.GetElement(1); - // Group shifted left by 192 bits – only the lower 64 bits contribute. - // Any carry is discarded, so just use normal addition. - UInt256 part192256 = new UInt256(0, 0, 0, (P03_lo + P12_lo + P21_lo + P30_lo)); + ulong group192 = P03_lo + P12_lo + P21_lo + P30_lo; - // --- Sum all the partial products using the proven UInt256 adder (AddImpl) --- - AddImpl(intermediate, part192256, out res); + // Now add that to the most-significant limb of the intermediate result. + res = new UInt256(intermediate.u0, + intermediate.u1, + intermediate.u2, + intermediate.u3 + group192); // any carry here is dropped modulo 2^256 } } From 33c724208e59355faa5102744266ac4ca93975a9 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 7 Feb 2025 03:57:57 +0000 Subject: [PATCH 06/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index a579ff7..eaeb4ca 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1092,18 +1092,25 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) // Use MultiplyLow to multiply corresponding lanes and keep only the lower 64 bits. 
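// Lane pairing at this point: lane 0 multiplies x.u2 by y.u1 (the P21 term) and lane 1 multiplies
// x.u3 by y.u0 (the P30 term). Both terms sit at bit offset 192 of the 256-bit result, so only
// their low 64 bits can survive modulo 2^256, which is why a low-only 64-bit multiply
// (MultiplyLow, i.e. vpmullq) is enough and the high halves are never computed.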
Vector128 lo2 = Avx512DQ.VL.MultiplyLow(vecA2, vecB2); + + lo2 = Sse2.Add(lo2, Vector128.Create(P03_lo, P12_lo)); + // Reinterpret lo2 as a vector of doubles. + Vector128 lo2Double = lo2.AsDouble(); - // Extract the results: - ulong P21_lo = lo2.GetElement(0); - ulong P30_lo = lo2.GetElement(1); + // Use Sse2.Shuffle (which is _mm_shuffle_pd) with control mask 0x1 to swap the two lanes. + Vector128 shufDouble = Sse2.Shuffle(lo2Double, lo2Double, 0x1); - ulong group192 = P03_lo + P12_lo + P21_lo + P30_lo; + // Reinterpret back to ulong. + Vector128 shuf = shufDouble.AsUInt64(); - // Now add that to the most-significant limb of the intermediate result. - res = new UInt256(intermediate.u0, - intermediate.u1, - intermediate.u2, - intermediate.u3 + group192); // any carry here is dropped modulo 2^256 + // Add the original vector and the shuffled one. + Vector128 sumVec = Sse2.Add(lo2, shuf); + + // Now the horizontal sum is in lane 0. + ulong group192 = intermediate.u3 + sumVec.GetElement(0); + + Unsafe.SkipInit(out res); + Unsafe.As>(ref res) = Unsafe.As>(ref intermediate).WithElement(3, group192); } } From 6ccee9980b1f7ff15c771967b229d17d955501cf Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 7 Feb 2025 04:31:28 +0000 Subject: [PATCH 07/38] Simplify --- src/Nethermind.Int256/UInt256.cs | 74 ++++++++++++++++++-------------- 1 file changed, 42 insertions(+), 32 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index eaeb4ca..153e6a1 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1037,12 +1037,6 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) Vector256 vecA = Unsafe.As>(ref Unsafe.AsRef(in x)); Vector256 vecB = Unsafe.As>(ref Unsafe.AsRef(in y)); - // Vectorized branch using AVX-512. - // Unpack the four 64-bit limbs (little-endian: u0 is least-significant) - - // --- Compute the 10 64x64–bit partial products using our vectorized method --- - - // Group 1: 8 products Vector512 vecA1 = Vector512.Create(Avx2.Permute4x64(vecA, 16), Avx2.Permute4x64(vecA, 73)); Vector512 vecB1 = Vector512.Create(Avx2.Permute4x64(vecB, 132), Avx2.Permute4x64(vecB, 177)); Mul64Vector(vecA1, vecB1, out Vector512 lo1, out Vector512 hi1); @@ -1054,63 +1048,79 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) ulong P02_lo = lo1.GetElement(3), P02_hi = hi1.GetElement(3); ulong P11_lo = lo1.GetElement(4), P11_hi = hi1.GetElement(4); ulong P20_lo = lo1.GetElement(5), P20_hi = hi1.GetElement(5); - ulong P03_lo = lo1.GetElement(6), P03_hi = hi1.GetElement(6); - ulong P12_lo = lo1.GetElement(7), P12_hi = hi1.GetElement(7); + ulong P03_lo = lo1.GetElement(6); + ulong P12_lo = lo1.GetElement(7); // --- Package each 128-bit partial product into a UInt256 (with proper shifting) --- UInt256 intermediate; { - // Group with no shift. - UInt256 part0 = new UInt256(P00_lo, P00_hi, 0, 0); - - // Group shifted left by 64 bits. UInt256 part64a = new UInt256(0, P01_lo, P01_hi, 0); UInt256 part64b = new UInt256(0, P10_lo, P10_hi, 0); UInt256 sum64; AddImpl(part64a, part64b, out sum64); + + UInt256 part0 = new UInt256(P00_lo, P00_hi, 0, 0); AddImpl(part0, sum64, out intermediate); } { - // Group shifted left by 128 bits. 
- UInt256 part128a = new UInt256(0, 0, P02_lo, P02_hi); - UInt256 part128b = new UInt256(0, 0, P11_lo, P11_hi); - UInt256 part128c = new UInt256(0, 0, P20_lo, P20_hi); - UInt256 sum128, temp256; - AddImpl(part128a, part128b, out temp256); - AddImpl(temp256, part128c, out sum128); - AddImpl(intermediate, sum128, out intermediate); + // Pack the nonzero (upper 128-bit) parts into Vector128 + Vector128 v128a = Vector128.Create(P02_lo, P02_hi); + Vector128 v128b = Vector128.Create(P11_lo, P11_hi); + Vector128 v128c = Vector128.Create(P20_lo, P20_hi); + + // Use our 128-bit adder to sum these. + // (This helper adds two 128-bit values with proper carry propagation.) + Vector128 temp128 = Vector128AddWithCarry(v128a, v128b); + Vector128 sum128 = Vector128AddWithCarry(temp128, v128c); + + // Now, these two 64-bit lanes represent the contribution from group 128. + // They belong in the upper half (limbs u2 and u3) of our full 256-bit intermediate result. + // Extract the current upper half of the intermediate sum. + Vector128 interUpper = Vector128.Create(intermediate.u2, intermediate.u3); + // Add the computed 128-bit group sum to that upper half. + Vector128 newInterUpper = Vector128AddWithCarry(interUpper, sum128); + + // Update the intermediate result—its lower half (u0 and u1) remains unchanged. + intermediate = new UInt256( + intermediate.u0, + intermediate.u1, + newInterUpper.GetElement(0), + newInterUpper.GetElement(1)); } - - // Group 2: 2 products - // Pack the two 64-bit values from x into a Vector128 Vector128 vecA2 = Vector128.Create(x.u2, x.u3); - // Pack the two 64-bit values from y into a Vector128 in the required order. - // Here, we want lane 0 to contain b1 (for P21_lo) and lane 1 to contain b0 (for P30_lo). Vector128 vecB2 = Vector128.Create(y.u1, y.u0); - // Use MultiplyLow to multiply corresponding lanes and keep only the lower 64 bits. Vector128 lo2 = Avx512DQ.VL.MultiplyLow(vecA2, vecB2); lo2 = Sse2.Add(lo2, Vector128.Create(P03_lo, P12_lo)); - // Reinterpret lo2 as a vector of doubles. Vector128 lo2Double = lo2.AsDouble(); - // Use Sse2.Shuffle (which is _mm_shuffle_pd) with control mask 0x1 to swap the two lanes. Vector128 shufDouble = Sse2.Shuffle(lo2Double, lo2Double, 0x1); - // Reinterpret back to ulong. Vector128 shuf = shufDouble.AsUInt64(); - // Add the original vector and the shuffled one. Vector128 sumVec = Sse2.Add(lo2, shuf); - // Now the horizontal sum is in lane 0. 
ulong group192 = intermediate.u3 + sumVec.GetElement(0); Unsafe.SkipInit(out res); - Unsafe.As>(ref res) = Unsafe.As>(ref intermediate).WithElement(3, group192); + Unsafe.As>(ref res) = + Unsafe.As>(ref intermediate).WithElement(3, group192); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static Vector128 Vector128AddWithCarry(Vector128 a, Vector128 b) + { + Vector128 sum = Sse2.Add(a, b); + Vector128 carryMask = Avx512F.VL.CompareLessThan(sum, a); + carryMask = Sse2.ShiftRightLogical(carryMask, 63); + ulong s0 = sum.GetElement(0); + ulong s1 = sum.GetElement(1); + ulong c0 = carryMask.GetElement(0); + s1 += c0; + return Vector128.Create(s0, s1); } } From 1be23d338bcf77c590f1d525358d245f7df50568 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 7 Feb 2025 06:08:07 +0000 Subject: [PATCH 08/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 153e6a1..5a3a6c5 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1055,23 +1055,29 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) UInt256 intermediate; { - UInt256 part64a = new UInt256(0, P01_lo, P01_hi, 0); - UInt256 part64b = new UInt256(0, P10_lo, P10_hi, 0); - UInt256 sum64; - AddImpl(part64a, part64b, out sum64); - - UInt256 part0 = new UInt256(P00_lo, P00_hi, 0, 0); - AddImpl(part0, sum64, out intermediate); + Vector128 v128a = Vector128.Create(P01_lo, P01_hi); + Vector128 v128b = Vector128.Create(P10_lo, P10_hi); + Vector128 temp128 = Vector128AddWithCarry(v128a, v128b); + + var hi = temp128.GetElement(1); + var combine = P00_hi + temp128.GetElement(0); + + intermediate = new UInt256( + P00_lo, + combine, + hi + (P00_hi > combine ? 1ul : 0ul), + P01_hi > hi ? 1ul : 0ul); } { // Pack the nonzero (upper 128-bit) parts into Vector128 Vector128 v128a = Vector128.Create(P02_lo, P02_hi); Vector128 v128b = Vector128.Create(P11_lo, P11_hi); - Vector128 v128c = Vector128.Create(P20_lo, P20_hi); // Use our 128-bit adder to sum these. // (This helper adds two 128-bit values with proper carry propagation.) Vector128 temp128 = Vector128AddWithCarry(v128a, v128b); + + Vector128 v128c = Vector128.Create(P20_lo, P20_hi); Vector128 sum128 = Vector128AddWithCarry(temp128, v128c); // Now, these two 64-bit lanes represent the contribution from group 128. From 726fbccb3b0dbe2a7f10208ce8dcd1d519282bf9 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 7 Feb 2025 06:12:42 +0000 Subject: [PATCH 09/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 5a3a6c5..8670695 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1053,7 +1053,7 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) // --- Package each 128-bit partial product into a UInt256 (with proper shifting) --- - UInt256 intermediate; + Vector256 intermediate; { Vector128 v128a = Vector128.Create(P01_lo, P01_hi); Vector128 v128b = Vector128.Create(P10_lo, P10_hi); @@ -1062,7 +1062,7 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) var hi = temp128.GetElement(1); var combine = P00_hi + temp128.GetElement(0); - intermediate = new UInt256( + intermediate = Vector256.Create( P00_lo, combine, hi + (P00_hi > combine ? 
1ul : 0ul), @@ -1083,16 +1083,14 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) // Now, these two 64-bit lanes represent the contribution from group 128. // They belong in the upper half (limbs u2 and u3) of our full 256-bit intermediate result. // Extract the current upper half of the intermediate sum. - Vector128 interUpper = Vector128.Create(intermediate.u2, intermediate.u3); + Vector128 interUpper = intermediate.GetUpper(); // Add the computed 128-bit group sum to that upper half. Vector128 newInterUpper = Vector128AddWithCarry(interUpper, sum128); // Update the intermediate result—its lower half (u0 and u1) remains unchanged. - intermediate = new UInt256( - intermediate.u0, - intermediate.u1, - newInterUpper.GetElement(0), - newInterUpper.GetElement(1)); + intermediate = Vector256.Create( + intermediate.GetLower(), + newInterUpper); } Vector128 vecA2 = Vector128.Create(x.u2, x.u3); @@ -1109,11 +1107,11 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) Vector128 sumVec = Sse2.Add(lo2, shuf); - ulong group192 = intermediate.u3 + sumVec.GetElement(0); + ulong group192 = intermediate.GetElement(3) + sumVec.GetElement(0); Unsafe.SkipInit(out res); Unsafe.As>(ref res) = - Unsafe.As>(ref intermediate).WithElement(3, group192); + intermediate.WithElement(3, group192); } [MethodImpl(MethodImplOptions.AggressiveInlining)] From 503bdb8c5a19fede30ff0132a55d6be9efde6d8c Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 7 Feb 2025 06:15:35 +0000 Subject: [PATCH 10/38] optimize --- src/Nethermind.Int256/UInt256.cs | 62 +++++++++++++++----------------- 1 file changed, 29 insertions(+), 33 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 8670695..6b89e5d 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1053,45 +1053,41 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) // --- Package each 128-bit partial product into a UInt256 (with proper shifting) --- - Vector256 intermediate; - { - Vector128 v128a = Vector128.Create(P01_lo, P01_hi); - Vector128 v128b = Vector128.Create(P10_lo, P10_hi); - Vector128 temp128 = Vector128AddWithCarry(v128a, v128b); + Vector128 v128aa = Vector128.Create(P01_lo, P01_hi); + Vector128 v128bb = Vector128.Create(P10_lo, P10_hi); + Vector128 temp128a = Vector128AddWithCarry(v128aa, v128bb); - var hi = temp128.GetElement(1); - var combine = P00_hi + temp128.GetElement(0); + var hi = temp128a.GetElement(1); + var combine = P00_hi + temp128a.GetElement(0); - intermediate = Vector256.Create( - P00_lo, - combine, - hi + (P00_hi > combine ? 1ul : 0ul), - P01_hi > hi ? 1ul : 0ul); - } - { - // Pack the nonzero (upper 128-bit) parts into Vector128 - Vector128 v128a = Vector128.Create(P02_lo, P02_hi); - Vector128 v128b = Vector128.Create(P11_lo, P11_hi); + Vector256 intermediate = Vector256.Create( + P00_lo, + combine, + hi + (P00_hi > combine ? 1ul : 0ul), + P01_hi > hi ? 1ul : 0ul); + + // Pack the nonzero (upper 128-bit) parts into Vector128 + Vector128 v128a = Vector128.Create(P02_lo, P02_hi); + Vector128 v128b = Vector128.Create(P11_lo, P11_hi); - // Use our 128-bit adder to sum these. - // (This helper adds two 128-bit values with proper carry propagation.) - Vector128 temp128 = Vector128AddWithCarry(v128a, v128b); + // Use our 128-bit adder to sum these. + // (This helper adds two 128-bit values with proper carry propagation.) 
+ Vector128 temp128 = Vector128AddWithCarry(v128a, v128b); - Vector128 v128c = Vector128.Create(P20_lo, P20_hi); - Vector128 sum128 = Vector128AddWithCarry(temp128, v128c); + Vector128 v128c = Vector128.Create(P20_lo, P20_hi); + Vector128 sum128 = Vector128AddWithCarry(temp128, v128c); - // Now, these two 64-bit lanes represent the contribution from group 128. - // They belong in the upper half (limbs u2 and u3) of our full 256-bit intermediate result. - // Extract the current upper half of the intermediate sum. - Vector128 interUpper = intermediate.GetUpper(); - // Add the computed 128-bit group sum to that upper half. - Vector128 newInterUpper = Vector128AddWithCarry(interUpper, sum128); + // Now, these two 64-bit lanes represent the contribution from group 128. + // They belong in the upper half (limbs u2 and u3) of our full 256-bit intermediate result. + // Extract the current upper half of the intermediate sum. + Vector128 interUpper = intermediate.GetUpper(); + // Add the computed 128-bit group sum to that upper half. + Vector128 newInterUpper = Vector128AddWithCarry(interUpper, sum128); - // Update the intermediate result—its lower half (u0 and u1) remains unchanged. - intermediate = Vector256.Create( - intermediate.GetLower(), - newInterUpper); - } + // Update the intermediate result—its lower half (u0 and u1) remains unchanged. + intermediate = Vector256.Create( + intermediate.GetLower(), + newInterUpper); Vector128 vecA2 = Vector128.Create(x.u2, x.u3); Vector128 vecB2 = Vector128.Create(y.u1, y.u0); From 71d893f30ed8d632a2b4f99229de7dc7af3a1a62 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 7 Feb 2025 06:18:20 +0000 Subject: [PATCH 11/38] Recoment and rename --- src/Nethermind.Int256/UInt256.cs | 171 ++++++++++++++++++------------- 1 file changed, 99 insertions(+), 72 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 6b89e5d..2d01415 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1034,80 +1034,107 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) } else { - Vector256 vecA = Unsafe.As>(ref Unsafe.AsRef(in x)); - Vector256 vecB = Unsafe.As>(ref Unsafe.AsRef(in y)); - - Vector512 vecA1 = Vector512.Create(Avx2.Permute4x64(vecA, 16), Avx2.Permute4x64(vecA, 73)); - Vector512 vecB1 = Vector512.Create(Avx2.Permute4x64(vecB, 132), Avx2.Permute4x64(vecB, 177)); - Mul64Vector(vecA1, vecB1, out Vector512 lo1, out Vector512 hi1); - - // Extract products from group 1. - ulong P00_lo = lo1.GetElement(0), P00_hi = hi1.GetElement(0); - ulong P01_lo = lo1.GetElement(1), P01_hi = hi1.GetElement(1); - ulong P10_lo = lo1.GetElement(2), P10_hi = hi1.GetElement(2); - ulong P02_lo = lo1.GetElement(3), P02_hi = hi1.GetElement(3); - ulong P11_lo = lo1.GetElement(4), P11_hi = hi1.GetElement(4); - ulong P20_lo = lo1.GetElement(5), P20_hi = hi1.GetElement(5); - ulong P03_lo = lo1.GetElement(6); - ulong P12_lo = lo1.GetElement(7); - - // --- Package each 128-bit partial product into a UInt256 (with proper shifting) --- - - Vector128 v128aa = Vector128.Create(P01_lo, P01_hi); - Vector128 v128bb = Vector128.Create(P10_lo, P10_hi); - Vector128 temp128a = Vector128AddWithCarry(v128aa, v128bb); - - var hi = temp128a.GetElement(1); - var combine = P00_hi + temp128a.GetElement(0); - - Vector256 intermediate = Vector256.Create( - P00_lo, - combine, - hi + (P00_hi > combine ? 1ul : 0ul), - P01_hi > hi ? 
1ul : 0ul); - - // Pack the nonzero (upper 128-bit) parts into Vector128 - Vector128 v128a = Vector128.Create(P02_lo, P02_hi); - Vector128 v128b = Vector128.Create(P11_lo, P11_hi); - - // Use our 128-bit adder to sum these. - // (This helper adds two 128-bit values with proper carry propagation.) - Vector128 temp128 = Vector128AddWithCarry(v128a, v128b); - - Vector128 v128c = Vector128.Create(P20_lo, P20_hi); - Vector128 sum128 = Vector128AddWithCarry(temp128, v128c); - - // Now, these two 64-bit lanes represent the contribution from group 128. - // They belong in the upper half (limbs u2 and u3) of our full 256-bit intermediate result. - // Extract the current upper half of the intermediate sum. - Vector128 interUpper = intermediate.GetUpper(); - // Add the computed 128-bit group sum to that upper half. - Vector128 newInterUpper = Vector128AddWithCarry(interUpper, sum128); - - // Update the intermediate result—its lower half (u0 and u1) remains unchanged. - intermediate = Vector256.Create( - intermediate.GetLower(), - newInterUpper); - - Vector128 vecA2 = Vector128.Create(x.u2, x.u3); - Vector128 vecB2 = Vector128.Create(y.u1, y.u0); - - Vector128 lo2 = Avx512DQ.VL.MultiplyLow(vecA2, vecB2); - - lo2 = Sse2.Add(lo2, Vector128.Create(P03_lo, P12_lo)); - Vector128 lo2Double = lo2.AsDouble(); - - Vector128 shufDouble = Sse2.Shuffle(lo2Double, lo2Double, 0x1); - - Vector128 shuf = shufDouble.AsUInt64(); - - Vector128 sumVec = Sse2.Add(lo2, shuf); - - ulong group192 = intermediate.GetElement(3) + sumVec.GetElement(0); - + // Load the 256‐bit inputs into 256‐bit vector registers. + Vector256 aVector = Unsafe.As>(ref Unsafe.AsRef(in x)); + Vector256 bVector = Unsafe.As>(ref Unsafe.AsRef(in y)); + + // Rearrange the 64‐bit limbs of each input into 512‐bit vectors. + // The chosen permutations align the limbs so that later 64‐bit multiplications yield the correct cross‐products. + Vector512 rearrangedA = Vector512.Create( + Avx2.Permute4x64(aVector, 16), // Lower part permutation for A + Avx2.Permute4x64(aVector, 73)); // Upper part permutation for A + + Vector512 rearrangedB = Vector512.Create( + Avx2.Permute4x64(bVector, 132), // Lower part permutation for B + Avx2.Permute4x64(bVector, 177)); // Upper part permutation for B + + // Multiply the corresponding 64‐bit limbs of the rearranged inputs. + // Each multiplication yields a 128‐bit product split into a low and high 64‐bit part. + Mul64Vector(rearrangedA, rearrangedB, out Vector512 partialLo, out Vector512 partialHi); + + // --- Extract Partial Products from the First Group --- + // + // The following partial products (with both low and high parts) result from the 64‐bit multiplications. + // They are named to indicate their source position in the multiplication grid. + ulong prod00_Lo = partialLo.GetElement(0), prod00_Hi = partialHi.GetElement(0); + ulong prod01_Lo = partialLo.GetElement(1), prod01_Hi = partialHi.GetElement(1); + ulong prod10_Lo = partialLo.GetElement(2), prod10_Hi = partialHi.GetElement(2); + ulong prod02_Lo = partialLo.GetElement(3), prod02_Hi = partialHi.GetElement(3); + ulong prod11_Lo = partialLo.GetElement(4), prod11_Hi = partialHi.GetElement(4); + ulong prod20_Lo = partialLo.GetElement(5), prod20_Hi = partialHi.GetElement(5); + ulong prod03_Lo = partialLo.GetElement(6); // Only lower 64‐bits produced. + ulong prod12_Lo = partialLo.GetElement(7); // Only lower 64‐bits produced. 
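// Layout sketch: prodIJ denotes x.uI * y.uJ and contributes to the result at bit offset 64*(I+J):
//   offset   0: prod00
//   offset  64: prod01, prod10
//   offset 128: prod02, prod11, prod20
//   offset 192: prod03, prod12 (plus prod21 and prod30, computed separately further down);
//               at this offset only the low 64 bits matter, since the high halves would land at
//               offset 256 and the multiply is modulo 2^256.
// That is why only eight of the ten products are taken from the single 512-bit multiply above.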
+ + // --- Combine Lower-Group Partial Products into an Intermediate 256-bit Result --- + // + // The cross-terms prod01 and prod10 contribute to the middle limbs of the full product. + // First, add these two 128‐bit values (each stored as two 64‐bit limbs) with proper carry propagation. + Vector128 crossTermA = Vector128.Create(prod01_Lo, prod01_Hi); + Vector128 crossTermB = Vector128.Create(prod10_Lo, prod10_Hi); + Vector128 crossSum = Vector128AddWithCarry(crossTermA, crossTermB); + + // The lower 64‐bit lane of the cross‐sum will be added to the high part of prod00. + ulong crossLowPart = crossSum.GetElement(0); + ulong combinedProd00_Hi = prod00_Hi + crossLowPart; + + // Build the initial 256‐bit intermediate result from the lower-group products: + // • Limb 0: prod00_Lo (lowest 64 bits of prod00) + // • Limb 1: combinedProd00_Hi (prod00_Hi plus the low cross‐term) + // • Limb 2: The high lane of the cross‐sum plus a carry if the addition in limb 1 overflowed. + // • Limb 3: A final carry from the cross‐term addition (if prod01_Hi exceeds crossSum’s high lane). + Vector256 intermediateResult = Vector256.Create( + prod00_Lo, + combinedProd00_Hi, + crossSum.GetElement(1) + (prod00_Hi > combinedProd00_Hi ? 1ul : 0ul), + (prod01_Hi > crossSum.GetElement(1) ? 1ul : 0ul)); + + // --- Add Contributions from the Upper Group Partial Products --- + // + // The products prod02 and prod11 form one 128‐bit group. + Vector128 group2_A = Vector128.Create(prod02_Lo, prod02_Hi); + Vector128 group2_B = Vector128.Create(prod11_Lo, prod11_Hi); + Vector128 group2Sum = Vector128AddWithCarry(group2_A, group2_B); + + // Include the contribution from prod20 into the group sum. + Vector128 group2_C = Vector128.Create(prod20_Lo, prod20_Hi); + Vector128 totalGroup2 = Vector128AddWithCarry(group2Sum, group2_C); + + // These 128 bits (two 64-bit lanes) belong in the upper half (limbs 2 and 3) of the intermediate result. + // Retrieve the current upper 128 bits of the intermediate result and add the group2 sum. + Vector128 currentUpperHalf = intermediateResult.GetUpper(); + Vector128 newUpperHalf = Vector128AddWithCarry(currentUpperHalf, totalGroup2); + + // Update the intermediate result with the new upper half (the lower half remains unchanged). + intermediateResult = Vector256.Create( + intermediateResult.GetLower(), + newUpperHalf); + + // --- Process and Add the Final (Group 3) Contributions --- + // + // For the remaining contribution, multiply selected limbs from the inputs. + // Here, the upper 128 bits of x and the lower 128 bits of y (in reversed order) are multiplied. + Vector128 aHigh = Vector128.Create(x.u2, x.u3); + Vector128 bLow = Vector128.Create(y.u1, y.u0); + Vector128 finalProdLow = Avx512DQ.VL.MultiplyLow(aHigh, bLow); + + // Add the remaining lower parts from prod03 and prod12. + finalProdLow = Sse2.Add(finalProdLow, Vector128.Create(prod03_Lo, prod12_Lo)); + + // Perform a horizontal add on finalProdLow to collapse its two 64‐bit lanes into one sum. + // (This is done by shuffling the 64-bit lanes using a double-precision view and then adding them.) + Vector128 finalProdAsDouble = finalProdLow.AsDouble(); + Vector128 shuffledDouble = Sse2.Shuffle(finalProdAsDouble, finalProdAsDouble, 0x1); + Vector128 shuffledULong = shuffledDouble.AsUInt64(); + Vector128 horizontalSum = Sse2.Add(finalProdLow, shuffledULong); + + // Add the horizontal sum (the final contribution) to the most-significant limb (limb 3) of the intermediate result. 
+ ulong updatedMostSignificant = intermediateResult.GetElement(3) + horizontalSum.GetElement(0); + + // Write the final 256-bit product, updating limb 3 with the new value. Unsafe.SkipInit(out res); Unsafe.As>(ref res) = - intermediate.WithElement(3, group192); + intermediateResult.WithElement(3, updatedMostSignificant); + } [MethodImpl(MethodImplOptions.AggressiveInlining)] From 81a49f5541baeb589f442066a631ba5a961a4273 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 7 Feb 2025 11:49:17 +0000 Subject: [PATCH 12/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 48 +++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 13 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 2d01415..726269d 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1122,9 +1122,7 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) // Perform a horizontal add on finalProdLow to collapse its two 64‐bit lanes into one sum. // (This is done by shuffling the 64-bit lanes using a double-precision view and then adding them.) - Vector128 finalProdAsDouble = finalProdLow.AsDouble(); - Vector128 shuffledDouble = Sse2.Shuffle(finalProdAsDouble, finalProdAsDouble, 0x1); - Vector128 shuffledULong = shuffledDouble.AsUInt64(); + Vector128 shuffledULong = Sse2.Shuffle(finalProdLow.AsDouble(), finalProdLow.AsDouble(), 0x1).AsUInt64(); Vector128 horizontalSum = Sse2.Add(finalProdLow, shuffledULong); // Add the horizontal sum (the final contribution) to the most-significant limb (limb 3) of the intermediate result. @@ -1137,17 +1135,41 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) } + /// + /// Adds two 128-bit unsigned integers while propagating an overflow (carry) from the lower 64-bit lane to the higher lane. + /// Each 128-bit integer is represented as a , with element 0 holding the lower 64 bits + /// and element 1 holding the higher 64 bits. + /// + /// The first 128-bit unsigned integer operand. + /// The second 128-bit unsigned integer operand. + /// + /// A representing the sum of the two operands, with any carry from the lower lane added + /// into the higher lane. + /// [MethodImpl(MethodImplOptions.AggressiveInlining)] - static Vector128 Vector128AddWithCarry(Vector128 a, Vector128 b) - { - Vector128 sum = Sse2.Add(a, b); - Vector128 carryMask = Avx512F.VL.CompareLessThan(sum, a); - carryMask = Sse2.ShiftRightLogical(carryMask, 63); - ulong s0 = sum.GetElement(0); - ulong s1 = sum.GetElement(1); - ulong c0 = carryMask.GetElement(0); - s1 += c0; - return Vector128.Create(s0, s1); + static Vector128 Vector128AddWithCarry(Vector128 left, Vector128 right) + { + // Perform a lane-wise addition of the two operands. + Vector128 sum = Sse2.Add(left, right); + + // For unsigned addition, an overflow in a lane occurs if the result is less than one of the operands. + // Comparing 'sum' with 'operand1' produces a mask where each 64-bit lane is all ones if an overflow occurred, or zero otherwise. + Vector128 overflowMask = Avx512F.VL.CompareLessThan(sum, left); + + // Normalize the overflow mask: shift each 64-bit lane right by 63 bits. + // This converts a full mask (0xFFFFFFFFFFFFFFFF) to 1, leaving lanes with no overflow as 0. + overflowMask = Sse2.ShiftRightLogical(overflowMask, 63); + + // Promote the carry from the lower lane (element 0) into the upper lane. + // First, swap the two 64-bit lanes so that the lower lane's carry moves to the higher lane. 
+ Vector128 swappedCarry = Sse2.Shuffle(overflowMask.AsDouble(), overflowMask.AsDouble(), 0x1).AsUInt64(); + + // Next, clear the (now swapped) lower lane by blending with a zero vector. + // The immediate mask 0x1 indicates that lane 0 should come from the zero vector and lane 1 remains unchanged. + Vector128 promotedCarry = Sse41.Blend(swappedCarry.AsDouble(), Vector128.Zero, 0x1).AsUInt64(); + + // Add the propagated carry to the sum. + return Sse2.Add(sum, promotedCarry); } } From 2c5c7a38e90e2a4fca26e12c7b30f212d48fb755 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 7 Feb 2025 13:59:15 +0000 Subject: [PATCH 13/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 380 ++++++++++++++++++------------- 1 file changed, 216 insertions(+), 164 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 726269d..95149d7 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1010,7 +1010,7 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) Unsafe.AsRef(in res.u1) = high; return; } - + if (!Avx512F.IsSupported || !Avx512DQ.IsSupported) { ref ulong rx = ref Unsafe.As(ref Unsafe.AsRef(in x)); @@ -1034,189 +1034,241 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) } else { - // Load the 256‐bit inputs into 256‐bit vector registers. + // 1. Load the 256‐bit inputs into 256‐bit vector registers. Vector256 aVector = Unsafe.As>(ref Unsafe.AsRef(in x)); Vector256 bVector = Unsafe.As>(ref Unsafe.AsRef(in y)); - // Rearrange the 64‐bit limbs of each input into 512‐bit vectors. - // The chosen permutations align the limbs so that later 64‐bit multiplications yield the correct cross‐products. - Vector512 rearrangedA = Vector512.Create( - Avx2.Permute4x64(aVector, 16), // Lower part permutation for A - Avx2.Permute4x64(aVector, 73)); // Upper part permutation for A + // 2. Rearrange the 64‐bit limbs into 512‐bit vectors. + Vector256 aPerm0 = Avx2.Permute4x64(aVector, 16); + Vector256 aPerm1 = Avx2.Permute4x64(aVector, 73); + Vector512 rearrangedA = Vector512.Create(aPerm0, aPerm1); - Vector512 rearrangedB = Vector512.Create( - Avx2.Permute4x64(bVector, 132), // Lower part permutation for B - Avx2.Permute4x64(bVector, 177)); // Upper part permutation for B + Vector256 bPerm0 = Avx2.Permute4x64(bVector, 132); + Vector256 bPerm1 = Avx2.Permute4x64(bVector, 177); + Vector512 rearrangedB = Vector512.Create(bPerm0, bPerm1); - // Multiply the corresponding 64‐bit limbs of the rearranged inputs. - // Each multiplication yields a 128‐bit product split into a low and high 64‐bit part. + // 3. Multiply the corresponding 64‐bit limbs. Mul64Vector(rearrangedA, rearrangedB, out Vector512 partialLo, out Vector512 partialHi); - // --- Extract Partial Products from the First Group --- - // - // The following partial products (with both low and high parts) result from the 64‐bit multiplications. - // They are named to indicate their source position in the multiplication grid. 
- ulong prod00_Lo = partialLo.GetElement(0), prod00_Hi = partialHi.GetElement(0); - ulong prod01_Lo = partialLo.GetElement(1), prod01_Hi = partialHi.GetElement(1); - ulong prod10_Lo = partialLo.GetElement(2), prod10_Hi = partialHi.GetElement(2); - ulong prod02_Lo = partialLo.GetElement(3), prod02_Hi = partialHi.GetElement(3); - ulong prod11_Lo = partialLo.GetElement(4), prod11_Hi = partialHi.GetElement(4); - ulong prod20_Lo = partialLo.GetElement(5), prod20_Hi = partialHi.GetElement(5); - ulong prod03_Lo = partialLo.GetElement(6); // Only lower 64‐bits produced. - ulong prod12_Lo = partialLo.GetElement(7); // Only lower 64‐bits produced. - - // --- Combine Lower-Group Partial Products into an Intermediate 256-bit Result --- - // - // The cross-terms prod01 and prod10 contribute to the middle limbs of the full product. - // First, add these two 128‐bit values (each stored as two 64‐bit limbs) with proper carry propagation. - Vector128 crossTermA = Vector128.Create(prod01_Lo, prod01_Hi); - Vector128 crossTermB = Vector128.Create(prod10_Lo, prod10_Hi); - Vector128 crossSum = Vector128AddWithCarry(crossTermA, crossTermB); - - // The lower 64‐bit lane of the cross‐sum will be added to the high part of prod00. - ulong crossLowPart = crossSum.GetElement(0); - ulong combinedProd00_Hi = prod00_Hi + crossLowPart; - - // Build the initial 256‐bit intermediate result from the lower-group products: - // • Limb 0: prod00_Lo (lowest 64 bits of prod00) - // • Limb 1: combinedProd00_Hi (prod00_Hi plus the low cross‐term) - // • Limb 2: The high lane of the cross‐sum plus a carry if the addition in limb 1 overflowed. - // • Limb 3: A final carry from the cross‐term addition (if prod01_Hi exceeds crossSum’s high lane). - Vector256 intermediateResult = Vector256.Create( - prod00_Lo, - combinedProd00_Hi, - crossSum.GetElement(1) + (prod00_Hi > combinedProd00_Hi ? 1ul : 0ul), - (prod01_Hi > crossSum.GetElement(1) ? 1ul : 0ul)); - - // --- Add Contributions from the Upper Group Partial Products --- - // - // The products prod02 and prod11 form one 128‐bit group. - Vector128 group2_A = Vector128.Create(prod02_Lo, prod02_Hi); - Vector128 group2_B = Vector128.Create(prod11_Lo, prod11_Hi); - Vector128 group2Sum = Vector128AddWithCarry(group2_A, group2_B); - - // Include the contribution from prod20 into the group sum. - Vector128 group2_C = Vector128.Create(prod20_Lo, prod20_Hi); - Vector128 totalGroup2 = Vector128AddWithCarry(group2Sum, group2_C); - - // These 128 bits (two 64-bit lanes) belong in the upper half (limbs 2 and 3) of the intermediate result. - // Retrieve the current upper 128 bits of the intermediate result and add the group2 sum. - Vector128 currentUpperHalf = intermediateResult.GetUpper(); - Vector128 newUpperHalf = Vector128AddWithCarry(currentUpperHalf, totalGroup2); - - // Update the intermediate result with the new upper half (the lower half remains unchanged). - intermediateResult = Vector256.Create( - intermediateResult.GetLower(), - newUpperHalf); - - // --- Process and Add the Final (Group 3) Contributions --- - // - // For the remaining contribution, multiply selected limbs from the inputs. - // Here, the upper 128 bits of x and the lower 128 bits of y (in reversed order) are multiplied. + // 4. Rearrange the six “group‑1” products (prod00, prod01, prod10, prod02, prod11, prod20) + // into 128‑bit quantities. (Here we use the AVX‑512 “extract 128‑bit” function to get two adjacent 64‑bit lanes.) 
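// Regrouping sketch (standard unpack semantics for 64-bit lanes): ExtractVector128(v, k) returns
// lanes {2k, 2k+1} of the 512-bit vector, UnpackLow(a, b) = {a[0], b[0]} and
// UnpackHigh(a, b) = {a[1], b[1]}. Pairing the same 128-bit slice of partialLo and partialHi
// therefore turns two adjacent lanes into two (lo, hi) products, e.g.
//   UnpackLow(pair01Lo, pair01Hi)  -> {prod00_lo, prod00_hi}
//   UnpackHigh(pair01Lo, pair01Hi) -> {prod01_lo, prod01_hi}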
+ // – Products 0 and 1 come from index 0: + Vector128 pair01Lo = Avx512F.ExtractVector128(partialLo, 0); // lanes 0–1: prod00_lo, prod01_lo + Vector128 pair01Hi = Avx512F.ExtractVector128(partialHi, 0); // lanes 0–1: prod00_hi, prod01_hi + // Unpack lower (lane0) and upper (lane1) to form product0 and product1: + Vector128 prod0 = Sse2.UnpackLow(pair01Lo, pair01Hi); // prod00 = {lo, hi} + Vector128 prod1 = Sse2.UnpackHigh(pair01Lo, pair01Hi); // prod01 = {lo, hi} + + // – Products 2 and 3 come from index 1: + Vector128 pair23Lo = Avx512F.ExtractVector128(partialLo, 1); // lanes 2–3: prod10_lo, prod02_lo + Vector128 pair23Hi = Avx512F.ExtractVector128(partialHi, 1); // lanes 2–3: prod10_hi, prod02_hi + Vector128 prod2 = Sse2.UnpackLow(pair23Lo, pair23Hi); // prod10 + Vector128 prod3 = Sse2.UnpackHigh(pair23Lo, pair23Hi); // prod02 + + // – Products 4 and 5 come from index 2: + Vector128 pair45Lo = Avx512F.ExtractVector128(partialLo, 2); // lanes 4–5: prod11_lo, prod20_lo + Vector128 pair45Hi = Avx512F.ExtractVector128(partialHi, 2); // lanes 4–5: prod11_hi, prod20_hi + Vector128 prod4 = Sse2.UnpackLow(pair45Lo, pair45Hi); // prod11 + Vector128 prod5 = Sse2.UnpackHigh(pair45Lo, pair45Hi); // prod20 + + // 5. Group‑1 “cross‑term” addition: + // crossSum = prod01 + prod10 (i.e. add the 128‑bit numbers prod1 and prod2) + Vector128 crossSum = Add128(prod1, prod2); + + // 6. Add the low half of crossSum (i.e. its lower 64 bits) to prod00’s high limb. + // Instead of extracting a scalar, we broadcast the lower 64 bits to a vector. + // (Assume BroadcastLower128 returns a copy with both lanes equal to element0.) + Vector128 csLow = BroadcastLower128(crossSum); + // Create a mask to add only to the high lane: mask = {0, ulong.MaxValue} + Vector128 highMask = Vector128.Create(0ul, ulong.MaxValue); + Vector128 addMask = Sse2.And(csLow, highMask); + Vector128 prod0Updated = Sse2.Add(prod0, addMask); + + // Now, compute the “carry” from that addition. (Again, we must compute a one‐bit flag.) + uint carryFlag = (uint)Sse2.MoveMask(Avx512F.VL.CompareLessThan( + ExtractHighLimb(prod0Updated), // compare updated high limb... + ExtractHighLimb(prod0) // ...with the original high limb + ).AsByte()) & 1; + // Add the carry to the high half of crossSum. (Broadcast the carry into a 128‑bit vector.) + Vector128 csHigh = BroadcastUpper128(crossSum); + Vector128 limb2 = Sse2.Add(csHigh, Vector128.CreateScalar((ulong)carryFlag)); + + // And form limb3 from a comparison of prod01’s high limb with crossSum’s high: + uint limb3 = (uint)(Sse2.MoveMask(Avx512F.VL.CompareGreaterThan( + ExtractHighLimb(prod1), csHigh).AsByte()) & 1); + Vector128 limb3Vec = Vector128.CreateScalar((ulong)limb3); + + // 7. Build the 256‑bit “intermediate” result from group‑1: + // Lower 128 bits = prod00 (with updated high limb) + // Upper 128 bits = (limb2, limb3) packed into a 128‑bit vector. + Vector128 lowerIntermediate = prod0Updated; + // Pack limb2 into the lower half and limb3 into the upper half. + Vector128 upperIntermediate = Sse2.UnpackLow(limb2, limb3Vec); + Vector256 intermediateResult = Vector256.Create(lowerIntermediate, upperIntermediate); + + // 8. Process group‑2: (prod02, prod11, prod20) + Vector128 group2Sum = Add128(prod3, prod4); + Vector128 totalGroup2 = Add128(group2Sum, prod5); + // Add totalGroup2 into the current upper 128 bits of intermediateResult. 
+ Vector128 currentUpper = GetUpper(intermediateResult); + Vector128 newUpper = Add128(currentUpper, totalGroup2); + intermediateResult = WithUpper(intermediateResult, newUpper); + + // 9. Process group‑3: + // Multiply “aHigh” and “bLow” (with the proper reversed order) then add in the remaining lower parts. Vector128 aHigh = Vector128.Create(x.u2, x.u3); Vector128 bLow = Vector128.Create(y.u1, y.u0); + // Use the AVX512DQ MultiplyLow intrinsic (which multiplies 64‑bit integers and returns the low 64 bits) Vector128 finalProdLow = Avx512DQ.VL.MultiplyLow(aHigh, bLow); - // Add the remaining lower parts from prod03 and prod12. - finalProdLow = Sse2.Add(finalProdLow, Vector128.Create(prod03_Lo, prod12_Lo)); + // Extract from partialLo the two lower parts for prod03 and prod12. + // With partialLo logically split into Lower (lanes 0–3) and Upper (lanes 4–7), + // lanes 6 and 7 are in the Upper half; extracting the second 128‐bit portion of Upper gives us these lanes. + Vector128 prod6 = Avx2.ExtractVector128(partialLo.GetUpper(), 1); + // Extract from index 3 the two lower‐parts from prod03 and prod12 (which we stored in “prod6”): + // (Note: prod6 already holds both lower parts.) + finalProdLow = Sse2.Add(finalProdLow, prod6); + // Now perform a horizontal add so that the two 64‑bit lanes collapse to a single 64‑bit value. + Vector128 horizontalSum = HorizontalAdd(finalProdLow); + // Add the horizontal sum (broadcast into the high lane) to the most–significant limb of intermediateResult. + Vector128 upperTemp = GetUpper(intermediateResult); + Vector128 hsBroadcast = Sse2.And(BroadcastLower128(horizontalSum), highMask); + Vector128 newUpperTemp = Sse2.Add(upperTemp, hsBroadcast); + intermediateResult = WithUpper(intermediateResult, newUpperTemp); + + // 10. Write out the final 256‑bit result. + Unsafe.SkipInit(out res); + Unsafe.As>(ref res) = intermediateResult; - // Perform a horizontal add on finalProdLow to collapse its two 64‐bit lanes into one sum. - // (This is done by shuffling the 64-bit lanes using a double-precision view and then adding them.) - Vector128 shuffledULong = Sse2.Shuffle(finalProdLow.AsDouble(), finalProdLow.AsDouble(), 0x1).AsUInt64(); - Vector128 horizontalSum = Sse2.Add(finalProdLow, shuffledULong); + static Vector128 HorizontalAdd(Vector128 vec) + { + // Reinterpret the 64-bit integer vector as a vector of two doubles. + // Then use _mm_shuffle_pd (exposed as Sse2.Shuffle for doubles) to swap the two lanes. + Vector128 swapped = Sse2.Shuffle(vec.AsDouble(), vec.AsDouble(), 0x1).AsUInt64(); - // Add the horizontal sum (the final contribution) to the most-significant limb (limb 3) of the intermediate result. - ulong updatedMostSignificant = intermediateResult.GetElement(3) + horizontalSum.GetElement(0); + // Add the original vector and the swapped vector. + // This results in a vector where both lanes equal (vec[0] + vec[1]). + return Sse2.Add(vec, swapped); + } - // Write the final 256-bit product, updating limb 3 with the new value. - Unsafe.SkipInit(out res); - Unsafe.As>(ref res) = - intermediateResult.WithElement(3, updatedMostSignificant); - - } - - /// - /// Adds two 128-bit unsigned integers while propagating an overflow (carry) from the lower 64-bit lane to the higher lane. - /// Each 128-bit integer is represented as a , with element 0 holding the lower 64 bits - /// and element 1 holding the higher 64 bits. - /// - /// The first 128-bit unsigned integer operand. - /// The second 128-bit unsigned integer operand. 
- /// - /// A representing the sum of the two operands, with any carry from the lower lane added - /// into the higher lane. - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - static Vector128 Vector128AddWithCarry(Vector128 left, Vector128 right) - { - // Perform a lane-wise addition of the two operands. - Vector128 sum = Sse2.Add(left, right); + // Helpers that mimic “GetUpper” and “WithUpper” on a 256‑bit vector. + // (You might implement these as extension methods on Vector256.) + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static Vector128 GetUpper(Vector256 vec) + { + // For example, using Avx2.ExtractVector128: + return Avx2.ExtractVector128(vec, 1); + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static Vector256 WithUpper(Vector256 vec, Vector128 upper) + { + // Replace the upper 128 bits of vec with upper. + return Avx2.InsertVector128(vec, upper, 1); + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static Vector128 ExtractHighLimb(Vector128 vec) + { + // Reinterpret the 64-bit vector as 32-bit elements, shuffle to replicate the upper 64-bit limb, + // then reinterpret back as 64-bit. + return Sse2.Shuffle(vec.AsUInt32(), 0xEE).AsUInt64(); + } - // For unsigned addition, an overflow in a lane occurs if the result is less than one of the operands. - // Comparing 'sum' with 'operand1' produces a mask where each 64-bit lane is all ones if an overflow occurred, or zero otherwise. - Vector128 overflowMask = Avx512F.VL.CompareLessThan(sum, left); + // Helpers to “broadcast” the lower or upper 64‐bit lane of a Vector128. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static Vector128 BroadcastLower128(Vector128 vec) + { + // Replicate element0 to both lanes. + return Sse2.Shuffle(vec.AsDouble(), vec.AsDouble(), 0).AsUInt64(); + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static Vector128 BroadcastUpper128(Vector128 vec) + { + // Replicate element1 to both lanes. + return Sse2.Shuffle(vec.AsDouble(), vec.AsDouble(), 3).AsUInt64(); // 0xFF means both lanes come from the original element1 + } + /// + /// Adds two 128-bit unsigned integers while propagating an overflow (carry) from the lower 64-bit lane to the higher lane. + /// Each 128-bit integer is represented as a , with element 0 holding the lower 64 bits + /// and element 1 holding the higher 64 bits. + /// + /// The first 128-bit unsigned integer operand. + /// The second 128-bit unsigned integer operand. + /// + /// A representing the sum of the two operands, with any carry from the lower lane added + /// into the higher lane. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static Vector128 Add128(Vector128 left, Vector128 right) + { + // Perform a lane-wise addition of the two operands. + Vector128 sum = Sse2.Add(left, right); + + // For unsigned addition, an overflow in a lane occurs if the result is less than one of the operands. + // Comparing 'sum' with 'operand1' produces a mask where each 64-bit lane is all ones if an overflow occurred, or zero otherwise. + Vector128 overflowMask = Avx512F.VL.CompareLessThan(sum, left); - // Normalize the overflow mask: shift each 64-bit lane right by 63 bits. - // This converts a full mask (0xFFFFFFFFFFFFFFFF) to 1, leaving lanes with no overflow as 0. - overflowMask = Sse2.ShiftRightLogical(overflowMask, 63); + // Normalize the overflow mask: shift each 64-bit lane right by 63 bits. + // This converts a full mask (0xFFFFFFFFFFFFFFFF) to 1, leaving lanes with no overflow as 0. 
+ overflowMask = Sse2.ShiftRightLogical(overflowMask, 63); - // Promote the carry from the lower lane (element 0) into the upper lane. - // First, swap the two 64-bit lanes so that the lower lane's carry moves to the higher lane. - Vector128 swappedCarry = Sse2.Shuffle(overflowMask.AsDouble(), overflowMask.AsDouble(), 0x1).AsUInt64(); + // Promote the carry from the lower lane (element 0) into the upper lane. + // First, swap the two 64-bit lanes so that the lower lane's carry moves to the higher lane. + Vector128 swappedCarry = Sse2.Shuffle(overflowMask.AsDouble(), overflowMask.AsDouble(), 0x1).AsUInt64(); - // Next, clear the (now swapped) lower lane by blending with a zero vector. - // The immediate mask 0x1 indicates that lane 0 should come from the zero vector and lane 1 remains unchanged. - Vector128 promotedCarry = Sse41.Blend(swappedCarry.AsDouble(), Vector128.Zero, 0x1).AsUInt64(); + // Next, clear the (now swapped) lower lane by blending with a zero vector. + // The immediate mask 0x1 indicates that lane 0 should come from the zero vector and lane 1 remains unchanged. + Vector128 promotedCarry = Sse41.Blend(swappedCarry.AsDouble(), Vector128.Zero, 0x1).AsUInt64(); - // Add the propagated carry to the sum. - return Sse2.Add(sum, promotedCarry); + // Add the propagated carry to the sum. + return Sse2.Add(sum, promotedCarry); + } + } + + // Vectorized 64x64 multiply: given vectors 'a' and 'b' (each 8 lanes), + // computes per lane: + // product = a * b = (hi, lo) + // using the splitting method since there is no MultiplyHigh intrinsic. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static void Mul64Vector(Vector512 a, Vector512 b, + out Vector512 lo, out Vector512 hi) + { + // Mask for the lower 32 bits. + Vector512 mask32 = Vector512.Create(0xFFFFFFFFUL); + + // Split each 64-bit operand into 32-bit halves: + // a0 = lower 32 bits, a1 = upper 32 bits + Vector512 a0 = Avx512F.And(a, mask32); + Vector512 a1 = Avx512F.ShiftRightLogical(a, 32); + Vector512 b0 = Avx512F.And(b, mask32); + Vector512 b1 = Avx512F.ShiftRightLogical(b, 32); + + // Compute the four 32x32 partial products. + // Each multiplication here is on 32-bit values, so the result fits in 64 bits. + Vector512 u0 = Avx512DQ.MultiplyLow(a0, b0); // a0 * b0 + Vector512 u1 = Avx512DQ.MultiplyLow(a0, b1); // a0 * b1 + Vector512 u2 = Avx512DQ.MultiplyLow(a1, b0); // a1 * b0 + Vector512 u3 = Avx512DQ.MultiplyLow(a1, b1); // a1 * b1 + + // Now, compute t = (u0 >> 32) + (u1 & mask32) + (u2 & mask32) + Vector512 u0_hi = Avx512F.ShiftRightLogical(u0, 32); + Vector512 u1_lo = Avx512F.And(u1, mask32); + Vector512 u2_lo = Avx512F.And(u2, mask32); + Vector512 t = Avx512F.Add(Avx512F.Add(u0_hi, u1_lo), u2_lo); + + // The extra carry: c = t >> 32. + Vector512 c = Avx512F.ShiftRightLogical(t, 32); + + // Now, assemble the lower 64 bits: + // low part of u0 is u0 & mask32; low 32 bits of t are (t & mask32) shifted left 32. + Vector512 u0_lo = Avx512F.And(u0, mask32); + Vector512 t_lo = Avx512F.And(t, mask32); + lo = Avx512F.Or(u0_lo, Avx512F.ShiftLeftLogical(t_lo, 32)); + + // The high 64 bits are: u3 + (u1 >> 32) + (u2 >> 32) + c. + Vector512 u1_hi = Avx512F.ShiftRightLogical(u1, 32); + Vector512 u2_hi = Avx512F.ShiftRightLogical(u2, 32); + hi = Avx512F.Add(Avx512F.Add(Avx512F.Add(u3, u1_hi), u2_hi), c); } - } - - // Vectorized 64x64 multiply: given vectors 'a' and 'b' (each 8 lanes), - // computes per lane: - // product = a * b = (hi, lo) - // using the splitting method since there is no MultiplyHigh intrinsic. 
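The comment removed above notes that the vectorized path uses "the splitting method since there is no MultiplyHigh intrinsic". For reference, each lane of Mul64Vector performs the same 64x64 -> 128 computation as the scalar sketch below (a minimal sketch only; the helper name is illustrative and not part of the patch, and the result matches Math.BigMul):

static (ulong Hi, ulong Lo) Mul64Split(ulong a, ulong b)
{
    const ulong Mask32 = 0xFFFFFFFFUL;
    // Split each operand into 32-bit halves.
    ulong a0 = a & Mask32, a1 = a >> 32;
    ulong b0 = b & Mask32, b1 = b >> 32;

    // Four 32x32 partial products; each fits in 64 bits.
    ulong u0 = a0 * b0;
    ulong u1 = a0 * b1;
    ulong u2 = a1 * b0;
    ulong u3 = a1 * b1;

    // Accumulate the middle 32-bit column, then assemble the two halves.
    ulong t = (u0 >> 32) + (u1 & Mask32) + (u2 & Mask32);
    ulong lo = (u0 & Mask32) | (t << 32);
    ulong hi = u3 + (u1 >> 32) + (u2 >> 32) + (t >> 32);
    return (hi, lo);
}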
- [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static void Mul64Vector(Vector512 a, Vector512 b, - out Vector512 lo, out Vector512 hi) - { - // Mask for the lower 32 bits. - Vector512 mask32 = Vector512.Create(0xFFFFFFFFUL); - - // Split each 64-bit operand into 32-bit halves: - // a0 = lower 32 bits, a1 = upper 32 bits - Vector512 a0 = Avx512F.And(a, mask32); - Vector512 a1 = Avx512F.ShiftRightLogical(a, 32); - Vector512 b0 = Avx512F.And(b, mask32); - Vector512 b1 = Avx512F.ShiftRightLogical(b, 32); - - // Compute the four 32x32 partial products. - // Each multiplication here is on 32-bit values, so the result fits in 64 bits. - Vector512 u0 = Avx512DQ.MultiplyLow(a0, b0); // a0 * b0 - Vector512 u1 = Avx512DQ.MultiplyLow(a0, b1); // a0 * b1 - Vector512 u2 = Avx512DQ.MultiplyLow(a1, b0); // a1 * b0 - Vector512 u3 = Avx512DQ.MultiplyLow(a1, b1); // a1 * b1 - - // Now, compute t = (u0 >> 32) + (u1 & mask32) + (u2 & mask32) - Vector512 u0_hi = Avx512F.ShiftRightLogical(u0, 32); - Vector512 u1_lo = Avx512F.And(u1, mask32); - Vector512 u2_lo = Avx512F.And(u2, mask32); - Vector512 t = Avx512F.Add(Avx512F.Add(u0_hi, u1_lo), u2_lo); - - // The extra carry: c = t >> 32. - Vector512 c = Avx512F.ShiftRightLogical(t, 32); - - // Now, assemble the lower 64 bits: - // low part of u0 is u0 & mask32; low 32 bits of t are (t & mask32) shifted left 32. - Vector512 u0_lo = Avx512F.And(u0, mask32); - Vector512 t_lo = Avx512F.And(t, mask32); - lo = Avx512F.Or(u0_lo, Avx512F.ShiftLeftLogical(t_lo, 32)); - - // The high 64 bits are: u3 + (u1 >> 32) + (u2 >> 32) + c. - Vector512 u1_hi = Avx512F.ShiftRightLogical(u1, 32); - Vector512 u2_hi = Avx512F.ShiftRightLogical(u2, 32); - hi = Avx512F.Add(Avx512F.Add(Avx512F.Add(u3, u1_hi), u2_hi), c); } public void Multiply(in UInt256 a, out UInt256 res) => Multiply(this, a, out res); From d0195e9671b7ac19ea3e4bff100fe849ab169944 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 04:59:50 +0000 Subject: [PATCH 14/38] Fix benchmarks --- src/Nethermind.Int256.Benchmark/Benchmarks.cs | 92 +++++++++---------- .../NoIntrinsicsJobAttribute.cs | 4 +- 2 files changed, 49 insertions(+), 47 deletions(-) diff --git a/src/Nethermind.Int256.Benchmark/Benchmarks.cs b/src/Nethermind.Int256.Benchmark/Benchmarks.cs index 616aac1..b22a1da 100644 --- a/src/Nethermind.Int256.Benchmark/Benchmarks.cs +++ b/src/Nethermind.Int256.Benchmark/Benchmarks.cs @@ -89,8 +89,8 @@ public class SignedIntTwoParamBenchmarkBase : SignedBenchmarkBase public (int, Int256) D; } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class AddUnsigned : UnsignedTwoParamBenchmarkBase { @@ -108,8 +108,8 @@ public UInt256 Add_UInt256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class AddSigned : SignedTwoParamBenchmarkBase { @@ -127,8 +127,8 @@ public Int256 Add_Int256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class SubtractUnsigned : UnsignedTwoParamBenchmarkBase { @@ -146,8 +146,8 @@ public UInt256 Subtract_UInt256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - 
[NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class SubtractSigned : SignedTwoParamBenchmarkBase { @@ -165,8 +165,8 @@ public Int256 Subtract_Int256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class AddModUnsinged : UnsignedThreeParamBenchmarkBase { @@ -184,8 +184,8 @@ public UInt256 AddMod_UInt256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class AddModSinged : SignedThreeParamBenchmarkBase { @@ -203,8 +203,8 @@ public Int256 AddMod_Int256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class SubtractModUnsinged : UnsignedThreeParamBenchmarkBase { @@ -222,8 +222,8 @@ public UInt256 SubtractMod_UInt256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class SubtractModSigned : SignedThreeParamBenchmarkBase { @@ -241,8 +241,8 @@ public Int256 SubtractMod_Int256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class MultiplyUnsigned : UnsignedTwoParamBenchmarkBase { @@ -260,8 +260,8 @@ public UInt256 Multiply_UInt256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class MultiplySigned : SignedTwoParamBenchmarkBase { @@ -279,8 +279,8 @@ public Int256 Multiply_Int256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class MultiplyModUnsigned : UnsignedThreeParamBenchmarkBase { @@ -298,8 +298,8 @@ public UInt256 MultiplyMod_UInt256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class MultiplyModSigned : SignedThreeParamBenchmarkBase { @@ -317,8 +317,8 @@ public Int256 MultiplyMod_Int256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class DivideUnsigned : UnsignedTwoParamBenchmarkBase { @@ -336,8 +336,8 @@ public UInt256 Divide_UInt256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class DivideSigned : SignedTwoParamBenchmarkBase { @@ -355,8 +355,8 @@ public Int256 Divide_Int256() } } - 
[SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class ExpUnsigned : UnsignedIntTwoParamBenchmarkBase { @@ -374,8 +374,8 @@ public UInt256 Exp_UInt256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class ExpSigned : SignedIntTwoParamBenchmarkBase { @@ -393,8 +393,8 @@ public Int256 Exp_Int256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class ExpModUnsigned : UnsignedThreeParamBenchmarkBase { @@ -412,8 +412,8 @@ public UInt256 ExpMod_UInt256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class ExpModSigned : SignedBenchmarkBase { @@ -440,8 +440,8 @@ public Int256 ExpMod_Int256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class LeftShiftUnsigned : UnsignedIntTwoParamBenchmarkBase { @@ -459,8 +459,8 @@ public UInt256 LeftShift_UInt256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class LeftShiftSigned : SignedIntTwoParamBenchmarkBase { @@ -478,8 +478,8 @@ public Int256 LeftShift_Int256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class RightShiftUnsigned : UnsignedIntTwoParamBenchmarkBase { @@ -497,8 +497,8 @@ public UInt256 RightShift_UInt256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class RightShiftSigned : SignedIntTwoParamBenchmarkBase { @@ -516,8 +516,8 @@ public Int256 RightShift_Int256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class IsZeroOne { diff --git a/src/Nethermind.Int256.Benchmark/NoIntrinsicsJobAttribute.cs b/src/Nethermind.Int256.Benchmark/NoIntrinsicsJobAttribute.cs index d90679e..f3307cb 100644 --- a/src/Nethermind.Int256.Benchmark/NoIntrinsicsJobAttribute.cs +++ b/src/Nethermind.Int256.Benchmark/NoIntrinsicsJobAttribute.cs @@ -1,4 +1,4 @@ -using System; +using System; using BenchmarkDotNet.Attributes; using BenchmarkDotNet.Jobs; @@ -116,6 +116,8 @@ internal static Runtime GetRuntime(this RuntimeMoniker runtimeMoniker) return CoreRuntime.Core70; case RuntimeMoniker.Net80: return CoreRuntime.Core80; + case RuntimeMoniker.Net90: + return CoreRuntime.Core90; case RuntimeMoniker.Mono: return MonoRuntime.Default; case RuntimeMoniker.NativeAot60: From 
27f98c4eebca35ac2a7d36e366c1ab1e7966aeeb Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 05:00:06 +0000 Subject: [PATCH 15/38] Temp refactor --- src/Nethermind.Int256/UInt256.cs | 486 +++++++++++++++---------------- 1 file changed, 238 insertions(+), 248 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 95149d7..16adf6f 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1002,273 +1002,263 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) { if ((x.u1 | x.u2 | x.u3 | y.u1 | y.u2 | y.u3) == 0) { - // Fast multiply for numbers less than 2^64 (18,446,744,073,709,551,615) - ulong high = Math.BigMul(x.u0, y.u0, out ulong low); - // Assignment to res after multiply in case is used as input for x or y (by ref aliasing) - res = default; - Unsafe.AsRef(in res.u0) = low; - Unsafe.AsRef(in res.u1) = high; + MultiplyULong(x, y, out res); return; } - if (!Avx512F.IsSupported || !Avx512DQ.IsSupported) { - ref ulong rx = ref Unsafe.As(ref Unsafe.AsRef(in x)); - ref ulong ry = ref Unsafe.As(ref Unsafe.AsRef(in y)); - - (ulong carry, ulong r0) = Multiply64(rx, ry); - UmulHop(carry, Unsafe.Add(ref rx, 1), ry, out carry, out ulong res1); - UmulHop(carry, Unsafe.Add(ref rx, 2), ry, out carry, out ulong res2); - ulong res3 = Unsafe.Add(ref rx, 3) * ry + carry; - - UmulHop(res1, rx, Unsafe.Add(ref ry, 1), out carry, out ulong r1); - UmulStep(res2, Unsafe.Add(ref rx, 1), Unsafe.Add(ref ry, 1), carry, out carry, out res2); - res3 = res3 + Unsafe.Add(ref rx, 2) * Unsafe.Add(ref ry, 1) + carry; + MultiplyNonAvx512(x, y, out res); + return; + } - UmulHop(res2, rx, Unsafe.Add(ref ry, 2), out carry, out ulong r2); - res3 = res3 + Unsafe.Add(ref rx, 1) * Unsafe.Add(ref ry, 2) + carry; + // 1. Load the 256‐bit inputs into 256‐bit vector registers. + Vector256 aVector = Unsafe.As>(ref Unsafe.AsRef(in x)); + Vector256 bVector = Unsafe.As>(ref Unsafe.AsRef(in y)); + + // 2. Rearrange the 64‐bit limbs into 512‐bit vectors. + Vector256 aPerm0 = Avx2.Permute4x64(aVector, 16); + Vector256 aPerm1 = Avx2.Permute4x64(aVector, 73); + Vector512 rearrangedA = Vector512.Create(aPerm0, aPerm1); + + Vector256 bPerm0 = Avx2.Permute4x64(bVector, 132); + Vector256 bPerm1 = Avx2.Permute4x64(bVector, 177); + Vector512 rearrangedB = Vector512.Create(bPerm0, bPerm1); + + // 3. Multiply the corresponding 64‐bit limbs. + + // Mask for the lower 32 bits. + Vector512 mask32 = Vector512.Create(0xFFFFFFFFUL); + + // Split each 64-bit operand into 32-bit halves: + // a0 = lower 32 bits, a1 = upper 32 bits + Vector512 a0 = Avx512F.And(rearrangedA, mask32); + Vector512 a1 = Avx512F.ShiftRightLogical(rearrangedA, 32); + Vector512 b0 = Avx512F.And(rearrangedB, mask32); + Vector512 b1 = Avx512F.ShiftRightLogical(rearrangedB, 32); + + // Compute the four 32x32 partial products. + // Each multiplication here is on 32-bit values, so the result fits in 64 bits. + Vector512 u0 = Avx512DQ.MultiplyLow(a0, b0); // a0 * b0 + Vector512 u1 = Avx512DQ.MultiplyLow(a0, b1); // a0 * b1 + Vector512 u2 = Avx512DQ.MultiplyLow(a1, b0); // a1 * b0 + Vector512 u3 = Avx512DQ.MultiplyLow(a1, b1); // a1 * b1 + + // Now, compute t = (u0 >> 32) + (u1 & mask32) + (u2 & mask32) + Vector512 u0_hi = Avx512F.ShiftRightLogical(u0, 32); + Vector512 u1_lo = Avx512F.And(u1, mask32); + Vector512 u2_lo = Avx512F.And(u2, mask32); + Vector512 t = Avx512F.Add(Avx512F.Add(u0_hi, u1_lo), u2_lo); + + // The extra carry: c = t >> 32. 
+ Vector512 c = Avx512F.ShiftRightLogical(t, 32); + + // Now, assemble the lower 64 bits: + // low part of u0 is u0 & mask32; low 32 bits of t are (t & mask32) shifted left 32. + Vector512 u0_lo = Avx512F.And(u0, mask32); + Vector512 t_lo = Avx512F.And(t, mask32); + Vector512 partialLo = Avx512F.Or(u0_lo, Avx512F.ShiftLeftLogical(t_lo, 32)); + + // The high 64 bits are: u3 + (u1 >> 32) + (u2 >> 32) + c. + Vector512 u1_hi = Avx512F.ShiftRightLogical(u1, 32); + Vector512 u2_hi = Avx512F.ShiftRightLogical(u2, 32); + Vector512 partialHi = Avx512F.Add(Avx512F.Add(Avx512F.Add(u3, u1_hi), u2_hi), c); + + // 4. Rearrange the six “group‑1” products (prod00, prod01, prod10, prod02, prod11, prod20) + // into 128‑bit quantities. (Here we use the AVX‑512 “extract 128‑bit” function to get two adjacent 64‑bit lanes.) + // – Products 0 and 1 come from index 0: + Vector128 pair01Lo = Avx512F.ExtractVector128(partialLo, 0); // lanes 0–1: prod00_lo, prod01_lo + Vector128 pair01Hi = Avx512F.ExtractVector128(partialHi, 0); // lanes 0–1: prod00_hi, prod01_hi + // Unpack lower (lane0) and upper (lane1) to form product0 and product1: + Vector128 prod0 = Sse2.UnpackLow(pair01Lo, pair01Hi); // prod00 = {lo, hi} + Vector128 prod1 = Sse2.UnpackHigh(pair01Lo, pair01Hi); // prod01 = {lo, hi} + + // – Products 2 and 3 come from index 1: + Vector128 pair23Lo = Avx512F.ExtractVector128(partialLo, 1); // lanes 2–3: prod10_lo, prod02_lo + Vector128 pair23Hi = Avx512F.ExtractVector128(partialHi, 1); // lanes 2–3: prod10_hi, prod02_hi + Vector128 prod2 = Sse2.UnpackLow(pair23Lo, pair23Hi); // prod10 + Vector128 prod3 = Sse2.UnpackHigh(pair23Lo, pair23Hi); // prod02 + + // – Products 4 and 5 come from index 2: + Vector128 pair45Lo = Avx512F.ExtractVector128(partialLo, 2); // lanes 4–5: prod11_lo, prod20_lo + Vector128 pair45Hi = Avx512F.ExtractVector128(partialHi, 2); // lanes 4–5: prod11_hi, prod20_hi + Vector128 prod4 = Sse2.UnpackLow(pair45Lo, pair45Hi); // prod11 + Vector128 prod5 = Sse2.UnpackHigh(pair45Lo, pair45Hi); // prod20 + + // 5. Group‑1 “cross‑term” addition: + // crossSum = prod01 + prod10 (i.e. add the 128‑bit numbers prod1 and prod2) + Vector128 crossSum = Add128(prod1, prod2); + + // 6. Add the low half of crossSum (i.e. its lower 64 bits) to prod00’s high limb. + // Instead of extracting a scalar, we broadcast the lower 64 bits to a vector. + // (Assume BroadcastLower128 returns a copy with both lanes equal to element0.) + Vector128 csLow = BroadcastLower128(crossSum); + // Create a mask to add only to the high lane: mask = {0, ulong.MaxValue} + Vector128 highMask = Vector128.Create(0ul, ulong.MaxValue); + Vector128 addMask = Sse2.And(csLow, highMask); + Vector128 prod0Updated = Sse2.Add(prod0, addMask); + + // Now, compute the “carry” from that addition. (Again, we must compute a one‐bit flag.) + uint carryFlag = (uint)Sse2.MoveMask(Avx512F.VL.CompareLessThan( + ExtractHighLimb(prod0Updated), // compare updated high limb... + ExtractHighLimb(prod0) // ...with the original high limb + ).AsByte()) & 1; + // Add the carry to the high half of crossSum. (Broadcast the carry into a 128‑bit vector.) + Vector128 csHigh = BroadcastUpper128(crossSum); + Vector128 limb2 = Sse2.Add(csHigh, Vector128.CreateScalar((ulong)carryFlag)); + + // And form limb3 from a comparison of prod01’s high limb with crossSum’s high: + uint limb3 = (uint)(Sse2.MoveMask(Avx512F.VL.CompareGreaterThan( + ExtractHighLimb(prod1), csHigh).AsByte()) & 1); + Vector128 limb3Vec = Vector128.CreateScalar((ulong)limb3); + + // 7. 
Build the 256‑bit “intermediate” result from group‑1: + // Lower 128 bits = prod00 (with updated high limb) + // Upper 128 bits = (limb2, limb3) packed into a 128‑bit vector. + Vector128 lowerIntermediate = prod0Updated; + // Pack limb2 into the lower half and limb3 into the upper half. + Vector128 upperIntermediate = Sse2.UnpackLow(limb2, limb3Vec); + Vector256 intermediateResult = Vector256.Create(lowerIntermediate, upperIntermediate); + + // 8. Process group‑2: (prod02, prod11, prod20) + Vector128 group2Sum = Add128(prod3, prod4); + Vector128 totalGroup2 = Add128(group2Sum, prod5); + // Add totalGroup2 into the current upper 128 bits of intermediateResult. + Vector128 currentUpper = intermediateResult.GetUpper(); + Vector128 newUpper = Add128(currentUpper, totalGroup2); + intermediateResult = WithUpper(intermediateResult, newUpper); + + // 9. Process group‑3: + // Multiply “aHigh” and “bLow” (with the proper reversed order) then add in the remaining lower parts. + Vector128 aHigh = Vector128.Create(x.u2, x.u3); + Vector128 bLow = Vector128.Create(y.u1, y.u0); + // Use the AVX512DQ MultiplyLow intrinsic (which multiplies 64‑bit integers and returns the low 64 bits) + Vector128 finalProdLow = Avx512DQ.VL.MultiplyLow(aHigh, bLow); + + // Extract from partialLo the two lower parts for prod03 and prod12. + // With partialLo logically split into Lower (lanes 0–3) and Upper (lanes 4–7), + // lanes 6 and 7 are in the Upper half; extracting the second 128‐bit portion of Upper gives us these lanes. + Vector128 prod6 = Avx2.ExtractVector128(partialLo.GetUpper(), 1); + // Extract from index 3 the two lower‐parts from prod03 and prod12 (which we stored in “prod6”): + // (Note: prod6 already holds both lower parts.) + finalProdLow = Sse2.Add(finalProdLow, prod6); + // Now perform a horizontal add so that the two 64‑bit lanes collapse to a single 64‑bit value. + Vector128 horizontalSum = HorizontalAdd(finalProdLow); + // Add the horizontal sum (broadcast into the high lane) to the most–significant limb of intermediateResult. + Vector128 upperTemp = intermediateResult.GetUpper(); + Vector128 hsBroadcast = Sse2.And(BroadcastLower128(horizontalSum), highMask); + Vector128 newUpperTemp = Sse2.Add(upperTemp, hsBroadcast); + intermediateResult = WithUpper(intermediateResult, newUpperTemp); + + // 10. Write out the final 256‑bit result. + Unsafe.SkipInit(out res); + Unsafe.As>(ref res) = intermediateResult; + + static Vector128 HorizontalAdd(Vector128 vec) + { + // Reinterpret the 64-bit integer vector as a vector of two doubles. + // Then use _mm_shuffle_pd (exposed as Sse2.Shuffle for doubles) to swap the two lanes. + Vector128 swapped = Sse2.Shuffle(vec.AsDouble(), vec.AsDouble(), 0x1).AsUInt64(); + + // Add the original vector and the swapped vector. + // This results in a vector where both lanes equal (vec[0] + vec[1]). + return Sse2.Add(vec, swapped); + } - ulong r3 = res3 + rx * Unsafe.Add(ref ry, 3); + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static Vector256 WithUpper(Vector256 vec, Vector128 upper) + { + // Replace the upper 128 bits of vec with upper. + return Avx2.InsertVector128(vec, upper, 1); + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static Vector128 ExtractHighLimb(Vector128 vec) + { + // Reinterpret the 64-bit vector as 32-bit elements, shuffle to replicate the upper 64-bit limb, + // then reinterpret back as 64-bit. 
+ return Sse2.Shuffle(vec.AsUInt32(), 0xEE).AsUInt64(); + } - res = new UInt256(r0, r1, r2, r3); + // Helpers to “broadcast” the lower or upper 64‐bit lane of a Vector128. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static Vector128 BroadcastLower128(Vector128 vec) + { + // Replicate element0 to both lanes. + return Sse2.Shuffle(vec.AsDouble(), vec.AsDouble(), 0).AsUInt64(); } - else + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static Vector128 BroadcastUpper128(Vector128 vec) + { + // Replicate element1 to both lanes. + return Sse2.Shuffle(vec.AsDouble(), vec.AsDouble(), 3).AsUInt64(); // 0xFF means both lanes come from the original element1 + } + /// + /// Adds two 128-bit unsigned integers while propagating an overflow (carry) from the lower 64-bit lane to the higher lane. + /// Each 128-bit integer is represented as a , with element 0 holding the lower 64 bits + /// and element 1 holding the higher 64 bits. + /// + /// The first 128-bit unsigned integer operand. + /// The second 128-bit unsigned integer operand. + /// + /// A representing the sum of the two operands, with any carry from the lower lane added + /// into the higher lane. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static Vector128 Add128(Vector128 left, Vector128 right) { - // 1. Load the 256‐bit inputs into 256‐bit vector registers. - Vector256 aVector = Unsafe.As>(ref Unsafe.AsRef(in x)); - Vector256 bVector = Unsafe.As>(ref Unsafe.AsRef(in y)); - - // 2. Rearrange the 64‐bit limbs into 512‐bit vectors. - Vector256 aPerm0 = Avx2.Permute4x64(aVector, 16); - Vector256 aPerm1 = Avx2.Permute4x64(aVector, 73); - Vector512 rearrangedA = Vector512.Create(aPerm0, aPerm1); - - Vector256 bPerm0 = Avx2.Permute4x64(bVector, 132); - Vector256 bPerm1 = Avx2.Permute4x64(bVector, 177); - Vector512 rearrangedB = Vector512.Create(bPerm0, bPerm1); - - // 3. Multiply the corresponding 64‐bit limbs. - Mul64Vector(rearrangedA, rearrangedB, out Vector512 partialLo, out Vector512 partialHi); - - // 4. Rearrange the six “group‑1” products (prod00, prod01, prod10, prod02, prod11, prod20) - // into 128‑bit quantities. (Here we use the AVX‑512 “extract 128‑bit” function to get two adjacent 64‑bit lanes.) - // – Products 0 and 1 come from index 0: - Vector128 pair01Lo = Avx512F.ExtractVector128(partialLo, 0); // lanes 0–1: prod00_lo, prod01_lo - Vector128 pair01Hi = Avx512F.ExtractVector128(partialHi, 0); // lanes 0–1: prod00_hi, prod01_hi - // Unpack lower (lane0) and upper (lane1) to form product0 and product1: - Vector128 prod0 = Sse2.UnpackLow(pair01Lo, pair01Hi); // prod00 = {lo, hi} - Vector128 prod1 = Sse2.UnpackHigh(pair01Lo, pair01Hi); // prod01 = {lo, hi} - - // – Products 2 and 3 come from index 1: - Vector128 pair23Lo = Avx512F.ExtractVector128(partialLo, 1); // lanes 2–3: prod10_lo, prod02_lo - Vector128 pair23Hi = Avx512F.ExtractVector128(partialHi, 1); // lanes 2–3: prod10_hi, prod02_hi - Vector128 prod2 = Sse2.UnpackLow(pair23Lo, pair23Hi); // prod10 - Vector128 prod3 = Sse2.UnpackHigh(pair23Lo, pair23Hi); // prod02 - - // – Products 4 and 5 come from index 2: - Vector128 pair45Lo = Avx512F.ExtractVector128(partialLo, 2); // lanes 4–5: prod11_lo, prod20_lo - Vector128 pair45Hi = Avx512F.ExtractVector128(partialHi, 2); // lanes 4–5: prod11_hi, prod20_hi - Vector128 prod4 = Sse2.UnpackLow(pair45Lo, pair45Hi); // prod11 - Vector128 prod5 = Sse2.UnpackHigh(pair45Lo, pair45Hi); // prod20 - - // 5. Group‑1 “cross‑term” addition: - // crossSum = prod01 + prod10 (i.e. 
add the 128‑bit numbers prod1 and prod2) - Vector128 crossSum = Add128(prod1, prod2); - - // 6. Add the low half of crossSum (i.e. its lower 64 bits) to prod00’s high limb. - // Instead of extracting a scalar, we broadcast the lower 64 bits to a vector. - // (Assume BroadcastLower128 returns a copy with both lanes equal to element0.) - Vector128 csLow = BroadcastLower128(crossSum); - // Create a mask to add only to the high lane: mask = {0, ulong.MaxValue} - Vector128 highMask = Vector128.Create(0ul, ulong.MaxValue); - Vector128 addMask = Sse2.And(csLow, highMask); - Vector128 prod0Updated = Sse2.Add(prod0, addMask); - - // Now, compute the “carry” from that addition. (Again, we must compute a one‐bit flag.) - uint carryFlag = (uint)Sse2.MoveMask(Avx512F.VL.CompareLessThan( - ExtractHighLimb(prod0Updated), // compare updated high limb... - ExtractHighLimb(prod0) // ...with the original high limb - ).AsByte()) & 1; - // Add the carry to the high half of crossSum. (Broadcast the carry into a 128‑bit vector.) - Vector128 csHigh = BroadcastUpper128(crossSum); - Vector128 limb2 = Sse2.Add(csHigh, Vector128.CreateScalar((ulong)carryFlag)); - - // And form limb3 from a comparison of prod01’s high limb with crossSum’s high: - uint limb3 = (uint)(Sse2.MoveMask(Avx512F.VL.CompareGreaterThan( - ExtractHighLimb(prod1), csHigh).AsByte()) & 1); - Vector128 limb3Vec = Vector128.CreateScalar((ulong)limb3); - - // 7. Build the 256‑bit “intermediate” result from group‑1: - // Lower 128 bits = prod00 (with updated high limb) - // Upper 128 bits = (limb2, limb3) packed into a 128‑bit vector. - Vector128 lowerIntermediate = prod0Updated; - // Pack limb2 into the lower half and limb3 into the upper half. - Vector128 upperIntermediate = Sse2.UnpackLow(limb2, limb3Vec); - Vector256 intermediateResult = Vector256.Create(lowerIntermediate, upperIntermediate); - - // 8. Process group‑2: (prod02, prod11, prod20) - Vector128 group2Sum = Add128(prod3, prod4); - Vector128 totalGroup2 = Add128(group2Sum, prod5); - // Add totalGroup2 into the current upper 128 bits of intermediateResult. - Vector128 currentUpper = GetUpper(intermediateResult); - Vector128 newUpper = Add128(currentUpper, totalGroup2); - intermediateResult = WithUpper(intermediateResult, newUpper); - - // 9. Process group‑3: - // Multiply “aHigh” and “bLow” (with the proper reversed order) then add in the remaining lower parts. - Vector128 aHigh = Vector128.Create(x.u2, x.u3); - Vector128 bLow = Vector128.Create(y.u1, y.u0); - // Use the AVX512DQ MultiplyLow intrinsic (which multiplies 64‑bit integers and returns the low 64 bits) - Vector128 finalProdLow = Avx512DQ.VL.MultiplyLow(aHigh, bLow); - - // Extract from partialLo the two lower parts for prod03 and prod12. - // With partialLo logically split into Lower (lanes 0–3) and Upper (lanes 4–7), - // lanes 6 and 7 are in the Upper half; extracting the second 128‐bit portion of Upper gives us these lanes. - Vector128 prod6 = Avx2.ExtractVector128(partialLo.GetUpper(), 1); - // Extract from index 3 the two lower‐parts from prod03 and prod12 (which we stored in “prod6”): - // (Note: prod6 already holds both lower parts.) - finalProdLow = Sse2.Add(finalProdLow, prod6); - // Now perform a horizontal add so that the two 64‑bit lanes collapse to a single 64‑bit value. - Vector128 horizontalSum = HorizontalAdd(finalProdLow); - // Add the horizontal sum (broadcast into the high lane) to the most–significant limb of intermediateResult. 
- Vector128 upperTemp = GetUpper(intermediateResult); - Vector128 hsBroadcast = Sse2.And(BroadcastLower128(horizontalSum), highMask); - Vector128 newUpperTemp = Sse2.Add(upperTemp, hsBroadcast); - intermediateResult = WithUpper(intermediateResult, newUpperTemp); - - // 10. Write out the final 256‑bit result. - Unsafe.SkipInit(out res); - Unsafe.As>(ref res) = intermediateResult; + // Perform a lane-wise addition of the two operands. + Vector128 sum = Sse2.Add(left, right); - static Vector128 HorizontalAdd(Vector128 vec) - { - // Reinterpret the 64-bit integer vector as a vector of two doubles. - // Then use _mm_shuffle_pd (exposed as Sse2.Shuffle for doubles) to swap the two lanes. - Vector128 swapped = Sse2.Shuffle(vec.AsDouble(), vec.AsDouble(), 0x1).AsUInt64(); + // For unsigned addition, an overflow in a lane occurs if the result is less than one of the operands. + // Comparing 'sum' with 'operand1' produces a mask where each 64-bit lane is all ones if an overflow occurred, or zero otherwise. + Vector128 overflowMask = Avx512F.VL.CompareLessThan(sum, left); - // Add the original vector and the swapped vector. - // This results in a vector where both lanes equal (vec[0] + vec[1]). - return Sse2.Add(vec, swapped); - } + // Normalize the overflow mask: shift each 64-bit lane right by 63 bits. + // This converts a full mask (0xFFFFFFFFFFFFFFFF) to 1, leaving lanes with no overflow as 0. + overflowMask = Sse2.ShiftRightLogical(overflowMask, 63); - // Helpers that mimic “GetUpper” and “WithUpper” on a 256‑bit vector. - // (You might implement these as extension methods on Vector256.) - [MethodImpl(MethodImplOptions.AggressiveInlining)] - static Vector128 GetUpper(Vector256 vec) - { - // For example, using Avx2.ExtractVector128: - return Avx2.ExtractVector128(vec, 1); - } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - static Vector256 WithUpper(Vector256 vec, Vector128 upper) - { - // Replace the upper 128 bits of vec with upper. - return Avx2.InsertVector128(vec, upper, 1); - } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - static Vector128 ExtractHighLimb(Vector128 vec) - { - // Reinterpret the 64-bit vector as 32-bit elements, shuffle to replicate the upper 64-bit limb, - // then reinterpret back as 64-bit. - return Sse2.Shuffle(vec.AsUInt32(), 0xEE).AsUInt64(); - } + // Promote the carry from the lower lane (element 0) into the upper lane. + // First, swap the two 64-bit lanes so that the lower lane's carry moves to the higher lane. + Vector128 swappedCarry = Sse2.Shuffle(overflowMask.AsDouble(), overflowMask.AsDouble(), 0x1).AsUInt64(); - // Helpers to “broadcast” the lower or upper 64‐bit lane of a Vector128. - [MethodImpl(MethodImplOptions.AggressiveInlining)] - static Vector128 BroadcastLower128(Vector128 vec) - { - // Replicate element0 to both lanes. - return Sse2.Shuffle(vec.AsDouble(), vec.AsDouble(), 0).AsUInt64(); - } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - static Vector128 BroadcastUpper128(Vector128 vec) - { - // Replicate element1 to both lanes. - return Sse2.Shuffle(vec.AsDouble(), vec.AsDouble(), 3).AsUInt64(); // 0xFF means both lanes come from the original element1 - } - /// - /// Adds two 128-bit unsigned integers while propagating an overflow (carry) from the lower 64-bit lane to the higher lane. - /// Each 128-bit integer is represented as a , with element 0 holding the lower 64 bits - /// and element 1 holding the higher 64 bits. - /// - /// The first 128-bit unsigned integer operand. - /// The second 128-bit unsigned integer operand. 
- /// - /// A representing the sum of the two operands, with any carry from the lower lane added - /// into the higher lane. - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - static Vector128 Add128(Vector128 left, Vector128 right) - { - // Perform a lane-wise addition of the two operands. - Vector128 sum = Sse2.Add(left, right); + // Next, clear the (now swapped) lower lane by blending with a zero vector. + // The immediate mask 0x1 indicates that lane 0 should come from the zero vector and lane 1 remains unchanged. + Vector128 promotedCarry = Sse41.Blend(swappedCarry.AsDouble(), Vector128.Zero, 0x1).AsUInt64(); - // For unsigned addition, an overflow in a lane occurs if the result is less than one of the operands. - // Comparing 'sum' with 'operand1' produces a mask where each 64-bit lane is all ones if an overflow occurred, or zero otherwise. - Vector128 overflowMask = Avx512F.VL.CompareLessThan(sum, left); + // Add the propagated carry to the sum. + return Sse2.Add(sum, promotedCarry); + } + } - // Normalize the overflow mask: shift each 64-bit lane right by 63 bits. - // This converts a full mask (0xFFFFFFFFFFFFFFFF) to 1, leaving lanes with no overflow as 0. - overflowMask = Sse2.ShiftRightLogical(overflowMask, 63); + private static void MultiplyNonAvx512(UInt256 x, UInt256 y, out UInt256 res) + { + ref ulong rx = ref Unsafe.As(ref Unsafe.AsRef(in x)); + ref ulong ry = ref Unsafe.As(ref Unsafe.AsRef(in y)); - // Promote the carry from the lower lane (element 0) into the upper lane. - // First, swap the two 64-bit lanes so that the lower lane's carry moves to the higher lane. - Vector128 swappedCarry = Sse2.Shuffle(overflowMask.AsDouble(), overflowMask.AsDouble(), 0x1).AsUInt64(); + (ulong carry, ulong r0) = Multiply64(rx, ry); + UmulHop(carry, Unsafe.Add(ref rx, 1), ry, out carry, out ulong res1); + UmulHop(carry, Unsafe.Add(ref rx, 2), ry, out carry, out ulong res2); + ulong res3 = Unsafe.Add(ref rx, 3) * ry + carry; - // Next, clear the (now swapped) lower lane by blending with a zero vector. - // The immediate mask 0x1 indicates that lane 0 should come from the zero vector and lane 1 remains unchanged. - Vector128 promotedCarry = Sse41.Blend(swappedCarry.AsDouble(), Vector128.Zero, 0x1).AsUInt64(); + UmulHop(res1, rx, Unsafe.Add(ref ry, 1), out carry, out ulong r1); + UmulStep(res2, Unsafe.Add(ref rx, 1), Unsafe.Add(ref ry, 1), carry, out carry, out res2); + res3 = res3 + Unsafe.Add(ref rx, 2) * Unsafe.Add(ref ry, 1) + carry; - // Add the propagated carry to the sum. - return Sse2.Add(sum, promotedCarry); - } - } + UmulHop(res2, rx, Unsafe.Add(ref ry, 2), out carry, out ulong r2); + res3 = res3 + Unsafe.Add(ref rx, 1) * Unsafe.Add(ref ry, 2) + carry; - // Vectorized 64x64 multiply: given vectors 'a' and 'b' (each 8 lanes), - // computes per lane: - // product = a * b = (hi, lo) - // using the splitting method since there is no MultiplyHigh intrinsic. - [MethodImpl(MethodImplOptions.AggressiveInlining)] - static void Mul64Vector(Vector512 a, Vector512 b, - out Vector512 lo, out Vector512 hi) - { - // Mask for the lower 32 bits. - Vector512 mask32 = Vector512.Create(0xFFFFFFFFUL); - - // Split each 64-bit operand into 32-bit halves: - // a0 = lower 32 bits, a1 = upper 32 bits - Vector512 a0 = Avx512F.And(a, mask32); - Vector512 a1 = Avx512F.ShiftRightLogical(a, 32); - Vector512 b0 = Avx512F.And(b, mask32); - Vector512 b1 = Avx512F.ShiftRightLogical(b, 32); - - // Compute the four 32x32 partial products. 
- // Each multiplication here is on 32-bit values, so the result fits in 64 bits. - Vector512 u0 = Avx512DQ.MultiplyLow(a0, b0); // a0 * b0 - Vector512 u1 = Avx512DQ.MultiplyLow(a0, b1); // a0 * b1 - Vector512 u2 = Avx512DQ.MultiplyLow(a1, b0); // a1 * b0 - Vector512 u3 = Avx512DQ.MultiplyLow(a1, b1); // a1 * b1 - - // Now, compute t = (u0 >> 32) + (u1 & mask32) + (u2 & mask32) - Vector512 u0_hi = Avx512F.ShiftRightLogical(u0, 32); - Vector512 u1_lo = Avx512F.And(u1, mask32); - Vector512 u2_lo = Avx512F.And(u2, mask32); - Vector512 t = Avx512F.Add(Avx512F.Add(u0_hi, u1_lo), u2_lo); - - // The extra carry: c = t >> 32. - Vector512 c = Avx512F.ShiftRightLogical(t, 32); - - // Now, assemble the lower 64 bits: - // low part of u0 is u0 & mask32; low 32 bits of t are (t & mask32) shifted left 32. - Vector512 u0_lo = Avx512F.And(u0, mask32); - Vector512 t_lo = Avx512F.And(t, mask32); - lo = Avx512F.Or(u0_lo, Avx512F.ShiftLeftLogical(t_lo, 32)); - - // The high 64 bits are: u3 + (u1 >> 32) + (u2 >> 32) + c. - Vector512 u1_hi = Avx512F.ShiftRightLogical(u1, 32); - Vector512 u2_hi = Avx512F.ShiftRightLogical(u2, 32); - hi = Avx512F.Add(Avx512F.Add(Avx512F.Add(u3, u1_hi), u2_hi), c); - } + ulong r3 = res3 + rx * Unsafe.Add(ref ry, 3); + + res = new UInt256(r0, r1, r2, r3); + } + + private static void MultiplyULong(UInt256 x, UInt256 y, out UInt256 res) + { + // Fast multiply for numbers less than 2^64 (18,446,744,073,709,551,615) + ulong high = Math.BigMul(x.u0, y.u0, out ulong low); + // Assignment to res after multiply in case is used as input for x or y (by ref aliasing) + res = default; + Unsafe.AsRef(in res.u0) = low; + Unsafe.AsRef(in res.u1) = high; } public void Multiply(in UInt256 a, out UInt256 res) => Multiply(this, a, out res); From ff5584171858d1f17695b8f191affb4a6b7a8e44 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 06:15:16 +0000 Subject: [PATCH 16/38] Improved comments --- src/Nethermind.Int256/UInt256.cs | 69 +++++++++++++++++--------------- 1 file changed, 37 insertions(+), 32 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 16adf6f..3fe2716 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1012,56 +1012,61 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) } // 1. Load the 256‐bit inputs into 256‐bit vector registers. - Vector256 aVector = Unsafe.As>(ref Unsafe.AsRef(in x)); - Vector256 bVector = Unsafe.As>(ref Unsafe.AsRef(in y)); + Vector256 x0123 = Unsafe.As>(ref Unsafe.AsRef(in x)); + Vector256 y0123 = Unsafe.As>(ref Unsafe.AsRef(in y)); - // 2. Rearrange the 64‐bit limbs into 512‐bit vectors. - Vector256 aPerm0 = Avx2.Permute4x64(aVector, 16); - Vector256 aPerm1 = Avx2.Permute4x64(aVector, 73); - Vector512 rearrangedA = Vector512.Create(aPerm0, aPerm1); + // Mask for the lower 32 bits. + Vector512 mask32 = Vector512.Create(0xFFFFFFFFUL); - Vector256 bPerm0 = Avx2.Permute4x64(bVector, 132); - Vector256 bPerm1 = Avx2.Permute4x64(bVector, 177); - Vector512 rearrangedB = Vector512.Create(bPerm0, bPerm1); + // 2. Rearrange the 64‐bit limbs into 512‐bit vectors. 
+ // x0010 = [ x0, x0, x1, x0 ] + Vector256 x0010 = Avx2.Permute4x64(x0123, 16); + // x1201 = [ x1, x2, x0, x1 ] + Vector256 x1201 = Avx2.Permute4x64(x0123, 73); + // x00101201 = [ x0, x0, x1, x0, x1, x2, x0, x1 ] + Vector512 x00101201 = Vector512.Create(x0010, x1201); + + // y0102 = [ y0, y1, y0, y2 ] + Vector256 y0102 = Avx2.Permute4x64(y0123, 132); + // y1032 = [ y1, y0, y3, y2 ] + Vector256 y1032 = Avx2.Permute4x64(y0123, 177); + // y01021032 = [ y0, y1, y0, y2, y1, y0, y3, y2 ] + Vector512 y01021032 = Vector512.Create(y0102, y1032); // 3. Multiply the corresponding 64‐bit limbs. - // Mask for the lower 32 bits. - Vector512 mask32 = Vector512.Create(0xFFFFFFFFUL); - // Split each 64-bit operand into 32-bit halves: - // a0 = lower 32 bits, a1 = upper 32 bits - Vector512 a0 = Avx512F.And(rearrangedA, mask32); - Vector512 a1 = Avx512F.ShiftRightLogical(rearrangedA, 32); - Vector512 b0 = Avx512F.And(rearrangedB, mask32); - Vector512 b1 = Avx512F.ShiftRightLogical(rearrangedB, 32); + Vector512 xLo = Avx512F.And(x00101201, mask32); + Vector512 xHi = Avx512F.ShiftRightLogical(x00101201, 32); + Vector512 yLo = Avx512F.And(y01021032, mask32); + Vector512 yHi = Avx512F.ShiftRightLogical(y01021032, 32); // Compute the four 32x32 partial products. // Each multiplication here is on 32-bit values, so the result fits in 64 bits. - Vector512 u0 = Avx512DQ.MultiplyLow(a0, b0); // a0 * b0 - Vector512 u1 = Avx512DQ.MultiplyLow(a0, b1); // a0 * b1 - Vector512 u2 = Avx512DQ.MultiplyLow(a1, b0); // a1 * b0 - Vector512 u3 = Avx512DQ.MultiplyLow(a1, b1); // a1 * b1 + Vector512 u0 = Avx512DQ.MultiplyLow(xLo, yLo); + Vector512 u1 = Avx512DQ.MultiplyLow(xLo, yHi); + Vector512 u2 = Avx512DQ.MultiplyLow(xHi, yLo); + Vector512 u3 = Avx512DQ.MultiplyLow(xHi, yHi); // Now, compute t = (u0 >> 32) + (u1 & mask32) + (u2 & mask32) - Vector512 u0_hi = Avx512F.ShiftRightLogical(u0, 32); - Vector512 u1_lo = Avx512F.And(u1, mask32); - Vector512 u2_lo = Avx512F.And(u2, mask32); - Vector512 t = Avx512F.Add(Avx512F.Add(u0_hi, u1_lo), u2_lo); + Vector512 u0Hi = Avx512F.ShiftRightLogical(u0, 32); + Vector512 u1Lo = Avx512F.And(u1, mask32); + Vector512 u2Lo = Avx512F.And(u2, mask32); + Vector512 t = Avx512F.Add(Avx512F.Add(u0Hi, u1Lo), u2Lo); // The extra carry: c = t >> 32. - Vector512 c = Avx512F.ShiftRightLogical(t, 32); + Vector512 carry = Avx512F.ShiftRightLogical(t, 32); // Now, assemble the lower 64 bits: // low part of u0 is u0 & mask32; low 32 bits of t are (t & mask32) shifted left 32. - Vector512 u0_lo = Avx512F.And(u0, mask32); - Vector512 t_lo = Avx512F.And(t, mask32); - Vector512 partialLo = Avx512F.Or(u0_lo, Avx512F.ShiftLeftLogical(t_lo, 32)); + Vector512 u0Lo = Avx512F.And(u0, mask32); + Vector512 tLo = Avx512F.And(t, mask32); + Vector512 partialLo = Avx512F.Or(u0Lo, Avx512F.ShiftLeftLogical(tLo, 32)); // The high 64 bits are: u3 + (u1 >> 32) + (u2 >> 32) + c. - Vector512 u1_hi = Avx512F.ShiftRightLogical(u1, 32); - Vector512 u2_hi = Avx512F.ShiftRightLogical(u2, 32); - Vector512 partialHi = Avx512F.Add(Avx512F.Add(Avx512F.Add(u3, u1_hi), u2_hi), c); + Vector512 u1Hi = Avx512F.ShiftRightLogical(u1, 32); + Vector512 u2Hi = Avx512F.ShiftRightLogical(u2, 32); + Vector512 partialHi = Avx512F.Add(Avx512F.Add(Avx512F.Add(u3, u1Hi), u2Hi), carry); // 4. Rearrange the six “group‑1” products (prod00, prod01, prod10, prod02, prod11, prod20) // into 128‑bit quantities. (Here we use the AVX‑512 “extract 128‑bit” function to get two adjacent 64‑bit lanes.) 
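The lane-layout comments added by the "Improved comments" patch can be checked directly against the Permute4x64 immediates: result lane j takes source lane (imm >> (2 * j)) & 3, so 16 = 0b00_01_00_00 gives [ x0, x0, x1, x0 ], 73 = 0b01_00_10_01 gives [ x1, x2, x0, x1 ], 132 = 0b10_00_01_00 gives [ y0, y1, y0, y2 ], and 177 = 0b10_11_00_01 gives [ y1, y0, y3, y2 ]. A small decoder sketch for verifying such immediates (the helper name is illustrative and not part of the patch):

static int[] DecodePermute4x64(byte imm)
{
    // Each 2-bit field of the immediate selects a source lane, lowest field first.
    int[] lanes = new int[4];
    for (int j = 0; j < 4; j++)
        lanes[j] = (imm >> (2 * j)) & 3;
    return lanes; // e.g. DecodePermute4x64(16) => { 0, 0, 1, 0 }
}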
From 5c8329c31678d877ef4f7efbd198b84737e98cc7 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 07:16:42 +0000 Subject: [PATCH 17/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 3fe2716..a10f800 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1049,24 +1049,14 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) Vector512 u3 = Avx512DQ.MultiplyLow(xHi, yHi); // Now, compute t = (u0 >> 32) + (u1 & mask32) + (u2 & mask32) - Vector512 u0Hi = Avx512F.ShiftRightLogical(u0, 32); - Vector512 u1Lo = Avx512F.And(u1, mask32); - Vector512 u2Lo = Avx512F.And(u2, mask32); - Vector512 t = Avx512F.Add(Avx512F.Add(u0Hi, u1Lo), u2Lo); - - // The extra carry: c = t >> 32. - Vector512 carry = Avx512F.ShiftRightLogical(t, 32); + Vector512 t = Avx512F.Add(Avx512F.Add(Avx512F.ShiftRightLogical(u0, 32), Avx512F.And(u1, mask32)), Avx512F.And(u2, mask32)); // Now, assemble the lower 64 bits: // low part of u0 is u0 & mask32; low 32 bits of t are (t & mask32) shifted left 32. - Vector512 u0Lo = Avx512F.And(u0, mask32); - Vector512 tLo = Avx512F.And(t, mask32); - Vector512 partialLo = Avx512F.Or(u0Lo, Avx512F.ShiftLeftLogical(tLo, 32)); + Vector512 partialLo = Avx512F.Or(Avx512F.And(u0, mask32), Avx512F.ShiftLeftLogical(Avx512F.And(t, mask32), 32)); - // The high 64 bits are: u3 + (u1 >> 32) + (u2 >> 32) + c. - Vector512 u1Hi = Avx512F.ShiftRightLogical(u1, 32); - Vector512 u2Hi = Avx512F.ShiftRightLogical(u2, 32); - Vector512 partialHi = Avx512F.Add(Avx512F.Add(Avx512F.Add(u3, u1Hi), u2Hi), carry); + // The high 64 bits are: u3 + (u1 >> 32) + (u2 >> 32) + (t >> 32). + Vector512 partialHi = Avx512F.Add(Avx512F.Add(Avx512F.Add(u3, Avx512F.ShiftRightLogical(u1, 32)), Avx512F.ShiftRightLogical(u2, 32)), Avx512F.ShiftRightLogical(t, 32)); // 4. Rearrange the six “group‑1” products (prod00, prod01, prod10, prod02, prod11, prod20) // into 128‑bit quantities. (Here we use the AVX‑512 “extract 128‑bit” function to get two adjacent 64‑bit lanes.) @@ -1133,7 +1123,7 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) intermediateResult = WithUpper(intermediateResult, newUpper); // 9. Process group‑3: - // Multiply “aHigh” and “bLow” (with the proper reversed order) then add in the remaining lower parts. + // Multiply x23 and y10 (with the proper reversed order) then add in the remaining lower parts. Vector128 aHigh = Vector128.Create(x.u2, x.u3); Vector128 bLow = Vector128.Create(y.u1, y.u0); // Use the AVX512DQ MultiplyLow intrinsic (which multiplies 64‑bit integers and returns the low 64 bits) @@ -1149,10 +1139,7 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) // Now perform a horizontal add so that the two 64‑bit lanes collapse to a single 64‑bit value. Vector128 horizontalSum = HorizontalAdd(finalProdLow); // Add the horizontal sum (broadcast into the high lane) to the most–significant limb of intermediateResult. - Vector128 upperTemp = intermediateResult.GetUpper(); - Vector128 hsBroadcast = Sse2.And(BroadcastLower128(horizontalSum), highMask); - Vector128 newUpperTemp = Sse2.Add(upperTemp, hsBroadcast); - intermediateResult = WithUpper(intermediateResult, newUpperTemp); + intermediateResult = Avx2.Add(intermediateResult, Vector256.Create(default, Sse2.And(horizontalSum, highMask))); // 10. Write out the final 256‑bit result. 
Unsafe.SkipInit(out res); From cc6cac73695892df8d305b0c27ae95c39e2bb8ca Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 07:42:59 +0000 Subject: [PATCH 18/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index a10f800..ef3ea1e 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1086,10 +1086,7 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) // 6. Add the low half of crossSum (i.e. its lower 64 bits) to prod00’s high limb. // Instead of extracting a scalar, we broadcast the lower 64 bits to a vector. // (Assume BroadcastLower128 returns a copy with both lanes equal to element0.) - Vector128 csLow = BroadcastLower128(crossSum); - // Create a mask to add only to the high lane: mask = {0, ulong.MaxValue} - Vector128 highMask = Vector128.Create(0ul, ulong.MaxValue); - Vector128 addMask = Sse2.And(csLow, highMask); + Vector128 addMask = Sse2.Shuffle(Vector128.Zero, crossSum.AsDouble(), 0).AsUInt64(); Vector128 prod0Updated = Sse2.Add(prod0, addMask); // Now, compute the “carry” from that addition. (Again, we must compute a one‐bit flag.) @@ -1139,6 +1136,7 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) // Now perform a horizontal add so that the two 64‑bit lanes collapse to a single 64‑bit value. Vector128 horizontalSum = HorizontalAdd(finalProdLow); // Add the horizontal sum (broadcast into the high lane) to the most–significant limb of intermediateResult. + Vector128 highMask = Vector128.Create(0ul, ulong.MaxValue); intermediateResult = Avx2.Add(intermediateResult, Vector256.Create(default, Sse2.And(horizontalSum, highMask))); // 10. Write out the final 256‑bit result. @@ -1170,13 +1168,6 @@ static Vector128 ExtractHighLimb(Vector128 vec) return Sse2.Shuffle(vec.AsUInt32(), 0xEE).AsUInt64(); } - // Helpers to “broadcast” the lower or upper 64‐bit lane of a Vector128. - [MethodImpl(MethodImplOptions.AggressiveInlining)] - static Vector128 BroadcastLower128(Vector128 vec) - { - // Replicate element0 to both lanes. - return Sse2.Shuffle(vec.AsDouble(), vec.AsDouble(), 0).AsUInt64(); - } [MethodImpl(MethodImplOptions.AggressiveInlining)] static Vector128 BroadcastUpper128(Vector128 vec) { From bfaa88cfaa2766d4de2dad443b600aa9bed25ca7 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 07:50:47 +0000 Subject: [PATCH 19/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index ef3ea1e..a0facc1 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1136,8 +1136,9 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) // Now perform a horizontal add so that the two 64‑bit lanes collapse to a single 64‑bit value. Vector128 horizontalSum = HorizontalAdd(finalProdLow); // Add the horizontal sum (broadcast into the high lane) to the most–significant limb of intermediateResult. - Vector128 highMask = Vector128.Create(0ul, ulong.MaxValue); - intermediateResult = Avx2.Add(intermediateResult, Vector256.Create(default, Sse2.And(horizontalSum, highMask))); + // 2. 
Use a shuffle with a zero vector to directly form { 0, horizontalSum[0] } + Vector128 high = Sse2.Shuffle(Vector128.Zero, horizontalSum.AsDouble(), 0).AsUInt64(); + intermediateResult = Avx2.Add(intermediateResult, Vector256.Create(Vector128.Zero, high)); // 10. Write out the final 256‑bit result. Unsafe.SkipInit(out res); From f7152ad3ddd480f049703fce56530c3b9f8e2494 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 08:12:13 +0000 Subject: [PATCH 20/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index a0facc1..5872224 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1199,14 +1199,9 @@ static Vector128 Add128(Vector128 left, Vector128 right) // Normalize the overflow mask: shift each 64-bit lane right by 63 bits. // This converts a full mask (0xFFFFFFFFFFFFFFFF) to 1, leaving lanes with no overflow as 0. overflowMask = Sse2.ShiftRightLogical(overflowMask, 63); - - // Promote the carry from the lower lane (element 0) into the upper lane. - // First, swap the two 64-bit lanes so that the lower lane's carry moves to the higher lane. - Vector128 swappedCarry = Sse2.Shuffle(overflowMask.AsDouble(), overflowMask.AsDouble(), 0x1).AsUInt64(); - - // Next, clear the (now swapped) lower lane by blending with a zero vector. - // The immediate mask 0x1 indicates that lane 0 should come from the zero vector and lane 1 remains unchanged. - Vector128 promotedCarry = Sse41.Blend(swappedCarry.AsDouble(), Vector128.Zero, 0x1).AsUInt64(); + // Next, clear the (now swapped) lower lane by shuffle with a zero vector. + // The immediate mask 0x0 indicates that lane 0 should come from the zero vector and lane 1 from overflow. + Vector128 promotedCarry = Sse2.Shuffle(Vector128.Zero, overflowMask.AsDouble(), 0).AsUInt64(); // Add the propagated carry to the sum. return Sse2.Add(sum, promotedCarry); From 17f5729ab91f455d4f2ab9a9afce90392a7dbfb7 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 08:27:06 +0000 Subject: [PATCH 21/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 5872224..1650aa6 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1133,28 +1133,21 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) // Extract from index 3 the two lower‐parts from prod03 and prod12 (which we stored in “prod6”): // (Note: prod6 already holds both lower parts.) finalProdLow = Sse2.Add(finalProdLow, prod6); - // Now perform a horizontal add so that the two 64‑bit lanes collapse to a single 64‑bit value. - Vector128 horizontalSum = HorizontalAdd(finalProdLow); + // Reinterpret the 64-bit integer vector as a vector of two doubles. + // Then use _mm_shuffle_pd (exposed as Sse2.Shuffle for doubles) to swap the two lanes. + Vector128 swapped = Sse2.Shuffle(finalProdLow.AsDouble(), finalProdLow.AsDouble(), 0x1).AsUInt64(); + // Add the original vector and the swapped vector. + // This results in a vector where both lanes equal (vec[0] + vec[1]). + Vector128 horizontalSum = Sse2.Add(finalProdLow, swapped); // Add the horizontal sum (broadcast into the high lane) to the most–significant limb of intermediateResult. - // 2. 
Use a shuffle with a zero vector to directly form { 0, horizontalSum[0] } - Vector128 high = Sse2.Shuffle(Vector128.Zero, horizontalSum.AsDouble(), 0).AsUInt64(); + // 2. Use a unpackHigh with a zero vector to directly form { 0, horizontalSum[0] } + Vector128 high = Sse2.UnpackHigh(Vector128.Zero, horizontalSum); intermediateResult = Avx2.Add(intermediateResult, Vector256.Create(Vector128.Zero, high)); // 10. Write out the final 256‑bit result. Unsafe.SkipInit(out res); Unsafe.As>(ref res) = intermediateResult; - static Vector128 HorizontalAdd(Vector128 vec) - { - // Reinterpret the 64-bit integer vector as a vector of two doubles. - // Then use _mm_shuffle_pd (exposed as Sse2.Shuffle for doubles) to swap the two lanes. - Vector128 swapped = Sse2.Shuffle(vec.AsDouble(), vec.AsDouble(), 0x1).AsUInt64(); - - // Add the original vector and the swapped vector. - // This results in a vector where both lanes equal (vec[0] + vec[1]). - return Sse2.Add(vec, swapped); - } - [MethodImpl(MethodImplOptions.AggressiveInlining)] static Vector256 WithUpper(Vector256 vec, Vector128 upper) { From abb4081b4fde40c93bfc8626164a2e75b61544dc Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 08:31:19 +0000 Subject: [PATCH 22/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 1650aa6..63d9c4d 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1194,7 +1194,7 @@ static Vector128 Add128(Vector128 left, Vector128 right) overflowMask = Sse2.ShiftRightLogical(overflowMask, 63); // Next, clear the (now swapped) lower lane by shuffle with a zero vector. // The immediate mask 0x0 indicates that lane 0 should come from the zero vector and lane 1 from overflow. - Vector128 promotedCarry = Sse2.Shuffle(Vector128.Zero, overflowMask.AsDouble(), 0).AsUInt64(); + Vector128 promotedCarry = Sse2.UnpackLow(Vector128.Zero, overflowMask); // Add the propagated carry to the sum. return Sse2.Add(sum, promotedCarry); From 684ce56b3b9e6673819f09244a444a24152b8f74 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 08:33:54 +0000 Subject: [PATCH 23/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 63d9c4d..3b4343e 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1086,7 +1086,7 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) // 6. Add the low half of crossSum (i.e. its lower 64 bits) to prod00’s high limb. // Instead of extracting a scalar, we broadcast the lower 64 bits to a vector. // (Assume BroadcastLower128 returns a copy with both lanes equal to element0.) - Vector128 addMask = Sse2.Shuffle(Vector128.Zero, crossSum.AsDouble(), 0).AsUInt64(); + Vector128 addMask = Sse2.UnpackLow(Vector128.Zero, crossSum); Vector128 prod0Updated = Sse2.Add(prod0, addMask); // Now, compute the “carry” from that addition. (Again, we must compute a one‐bit flag.) 
From c9118f2b4ee683d7aa2391d0e24794b35720de1b Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 08:38:49 +0000 Subject: [PATCH 24/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 3b4343e..d447f5d 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1091,8 +1091,8 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) // Now, compute the “carry” from that addition. (Again, we must compute a one‐bit flag.) uint carryFlag = (uint)Sse2.MoveMask(Avx512F.VL.CompareLessThan( - ExtractHighLimb(prod0Updated), // compare updated high limb... - ExtractHighLimb(prod0) // ...with the original high limb + Sse2.UnpackHigh(prod0Updated, prod0Updated), // compare updated high limb... + Sse2.UnpackHigh(prod0, prod0) // ...with the original high limb ).AsByte()) & 1; // Add the carry to the high half of crossSum. (Broadcast the carry into a 128‑bit vector.) Vector128 csHigh = BroadcastUpper128(crossSum); @@ -1100,7 +1100,7 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) // And form limb3 from a comparison of prod01’s high limb with crossSum’s high: uint limb3 = (uint)(Sse2.MoveMask(Avx512F.VL.CompareGreaterThan( - ExtractHighLimb(prod1), csHigh).AsByte()) & 1); + Sse2.UnpackHigh(prod1, prod1), csHigh).AsByte()) & 1); Vector128 limb3Vec = Vector128.CreateScalar((ulong)limb3); // 7. Build the 256‑bit “intermediate” result from group‑1: @@ -1154,13 +1154,6 @@ static Vector256 WithUpper(Vector256 vec, Vector128 upper) // Replace the upper 128 bits of vec with upper. return Avx2.InsertVector128(vec, upper, 1); } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - static Vector128 ExtractHighLimb(Vector128 vec) - { - // Reinterpret the 64-bit vector as 32-bit elements, shuffle to replicate the upper 64-bit limb, - // then reinterpret back as 64-bit. - return Sse2.Shuffle(vec.AsUInt32(), 0xEE).AsUInt64(); - } [MethodImpl(MethodImplOptions.AggressiveInlining)] static Vector128 BroadcastUpper128(Vector128 vec) From 8fa3b377715477a2cd9d818114e0191bfc78e51c Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 08:41:47 +0000 Subject: [PATCH 25/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index d447f5d..3732faf 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1095,7 +1095,7 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) Sse2.UnpackHigh(prod0, prod0) // ...with the original high limb ).AsByte()) & 1; // Add the carry to the high half of crossSum. (Broadcast the carry into a 128‑bit vector.) - Vector128 csHigh = BroadcastUpper128(crossSum); + Vector128 csHigh = Sse2.UnpackHigh(crossSum, crossSum); Vector128 limb2 = Sse2.Add(csHigh, Vector128.CreateScalar((ulong)carryFlag)); // And form limb3 from a comparison of prod01’s high limb with crossSum’s high: @@ -1155,12 +1155,6 @@ static Vector256 WithUpper(Vector256 vec, Vector128 upper) return Avx2.InsertVector128(vec, upper, 1); } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - static Vector128 BroadcastUpper128(Vector128 vec) - { - // Replicate element1 to both lanes. 
- return Sse2.Shuffle(vec.AsDouble(), vec.AsDouble(), 3).AsUInt64(); // 0xFF means both lanes come from the original element1 - } /// <summary> /// Adds two 128-bit unsigned integers while propagating an overflow (carry) from the lower 64-bit lane to the higher lane. /// Each 128-bit integer is represented as a <see cref="Vector128{T}"/>, with element 0 holding the lower 64 bits From ae34bf9f5b826b41b72ba7f2ef8c742ac9c3adb6 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 08:48:11 +0000 Subject: [PATCH 26/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 3732faf..493c5c6 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1099,16 +1099,16 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) Vector128<ulong> limb2 = Sse2.Add(csHigh, Vector128.CreateScalar((ulong)carryFlag)); // And form limb3 from a comparison of prod01’s high limb with crossSum’s high: - uint limb3 = (uint)(Sse2.MoveMask(Avx512F.VL.CompareGreaterThan( - Sse2.UnpackHigh(prod1, prod1), csHigh).AsByte()) & 1); - Vector128<ulong> limb3Vec = Vector128.CreateScalar((ulong)limb3); + // Shift right each lane by 63 bits, so that 0 becomes 0 and 0xFFFFFFFFFFFFFFFF becomes 1. + Vector128<ulong> limb3Vec = Sse2.ShiftRightLogical(Avx512F.VL.CompareGreaterThan( + Sse2.UnpackHigh(prod1, prod1), csHigh), 63); + // Pack limb2 into the lower half and limb3 into the upper half. + Vector128<ulong> upperIntermediate = Sse2.UnpackLow(limb2, limb3Vec); // 7. Build the 256‑bit “intermediate” result from group‑1: // Lower 128 bits = prod00 (with updated high limb) // Upper 128 bits = (limb2, limb3) packed into a 128‑bit vector. Vector128<ulong> lowerIntermediate = prod0Updated; - // Pack limb2 into the lower half and limb3 into the upper half. - Vector128<ulong> upperIntermediate = Sse2.UnpackLow(limb2, limb3Vec); Vector256<ulong> intermediateResult = Vector256.Create(lowerIntermediate, upperIntermediate); // 8. Process group‑2: (prod02, prod11, prod20) From 450ec9e4d2203d32dda7905f860664cf6a0fb43a Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 08:51:15 +0000 Subject: [PATCH 27/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 493c5c6..28b4a62 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1090,13 +1090,13 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) Vector128<ulong> prod0Updated = Sse2.Add(prod0, addMask); // Now, compute the “carry” from that addition. (Again, we must compute a one‐bit flag.) - uint carryFlag = (uint)Sse2.MoveMask(Avx512F.VL.CompareLessThan( + Vector128<ulong> carryFlag = Sse2.ShiftRightLogical(Avx512F.VL.CompareLessThan( Sse2.UnpackHigh(prod0Updated, prod0Updated), // compare updated high limb... Sse2.UnpackHigh(prod0, prod0) // ...with the original high limb - ).AsByte()) & 1; + ), 63); // Add the carry to the high half of crossSum. (Broadcast the carry into a 128‑bit vector.) Vector128<ulong> csHigh = Sse2.UnpackHigh(crossSum, crossSum); - Vector128<ulong> limb2 = Sse2.Add(csHigh, Vector128.CreateScalar((ulong)carryFlag)); + Vector128<ulong> limb2 = Sse2.Add(csHigh, carryFlag); // And form limb3 from a comparison of prod01’s high limb with crossSum’s high: // Shift right each lane by 63 bits, so that 0 becomes 0 and 0xFFFFFFFFFFFFFFFF becomes 1.
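Patches 26 and 27 switch the carry computation from Sse2.MoveMask, which forces a round trip through a general-purpose register, to an unsigned vector compare whose all-ones mask is shifted right by 63, so the 0/1 carry stays in a SIMD register. Below is a minimal sketch of that pattern, assuming AVX-512VL support; the class name and sample values are illustrative, not from the patch.

// Standalone sketch: derive a per-lane 0/1 carry from an unsigned compare mask
// and use it to propagate the carry from the low 64-bit lane to the high lane.
// Assumes Avx512F.VL is supported (for the unsigned 64-bit compare).
using System;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

internal static class CarryDemo
{
    private static void Main()
    {
        Vector128<ulong> a = Vector128.Create(ulong.MaxValue, 1UL); // low lane will overflow
        Vector128<ulong> b = Vector128.Create(2UL, 3UL);

        Vector128<ulong> sum = Sse2.Add(a, b);

        // Unsigned overflow in a lane <=> sum < a. The compare yields all-ones
        // (0xFFFFFFFFFFFFFFFF) in overflowing lanes; shifting right by 63 turns that into 1.
        Vector128<ulong> carry = Sse2.ShiftRightLogical(
            Avx512F.VL.CompareLessThan(sum, a), 63);

        // Move the low lane's carry into the high lane ({ 0, carry[0] }) and add it.
        Vector128<ulong> promoted = Sse2.UnpackLow(Vector128<ulong>.Zero, carry);
        Vector128<ulong> result = Sse2.Add(sum, promoted);

        Console.WriteLine(result); // low lane = 1 (wrapped), high lane = 4 + 1 = 5
    }
}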
From 18ba5fc1c86a792f3f70fc161fbe42afca4df4b7 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 09:02:53 +0000 Subject: [PATCH 28/38] Refactor --- src/Nethermind.Int256/UInt256.cs | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 28b4a62..e11e435 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1117,7 +1117,7 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) // Add totalGroup2 into the current upper 128 bits of intermediateResult. Vector128 currentUpper = intermediateResult.GetUpper(); Vector128 newUpper = Add128(currentUpper, totalGroup2); - intermediateResult = WithUpper(intermediateResult, newUpper); + intermediateResult = Avx2.InsertVector128(intermediateResult, newUpper, 1); // 9. Process group‑3: // Multiply x23 and y10 (with the proper reversed order) then add in the remaining lower parts. @@ -1148,13 +1148,6 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) Unsafe.SkipInit(out res); Unsafe.As>(ref res) = intermediateResult; - [MethodImpl(MethodImplOptions.AggressiveInlining)] - static Vector256 WithUpper(Vector256 vec, Vector128 upper) - { - // Replace the upper 128 bits of vec with upper. - return Avx2.InsertVector128(vec, upper, 1); - } - /// /// Adds two 128-bit unsigned integers while propagating an overflow (carry) from the lower 64-bit lane to the higher lane. /// Each 128-bit integer is represented as a , with element 0 holding the lower 64 bits From 040b8addf4fb61d94efc8c1e65073c9d3ba13041 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 09:47:57 +0000 Subject: [PATCH 29/38] Clean up comments --- src/Nethermind.Int256/UInt256.cs | 261 ++++++++++++++++--------------- 1 file changed, 134 insertions(+), 127 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index e11e435..eb30ef2 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -996,153 +996,160 @@ private static void SubtractWithBorrow(ulong a, ulong b, ref ulong borrow, out u res = a - b - borrow; borrow = (((~a) & b) | (~(a ^ b)) & res) >> 63; } - - // Multiply sets res to the product x*y + /// + /// Multiplies two 256‑bit unsigned integers ( and ) and + /// writes the 256‑bit product to . This implementation uses AVX‑512, + /// AVX2, and SSE2 intrinsics for high‑performance multi‑precision arithmetic. + /// + /// The first 256‑bit unsigned integer. + /// The second 256‑bit unsigned integer. + /// When this method returns, contains the 256‑bit product of x and y. public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) { + // If both inputs fit in 64 bits, use a simple multiplication routine. if ((x.u1 | x.u2 | x.u3 | y.u1 | y.u2 | y.u3) == 0) { MultiplyULong(x, y, out res); return; } + // Fallback to a non‑AVX‑512 implementation if the required intrinsics are not supported. if (!Avx512F.IsSupported || !Avx512DQ.IsSupported) { MultiplyNonAvx512(x, y, out res); return; } - // 1. Load the 256‐bit inputs into 256‐bit vector registers. - Vector256 x0123 = Unsafe.As>(ref Unsafe.AsRef(in x)); - Vector256 y0123 = Unsafe.As>(ref Unsafe.AsRef(in y)); + // 1. Load the 256‑bit inputs into 256‑bit vector registers. + Vector256 vecX = Unsafe.As>(ref Unsafe.AsRef(in x)); + Vector256 vecY = Unsafe.As>(ref Unsafe.AsRef(in y)); - // Mask for the lower 32 bits. 
+ // Create a 512‑bit mask to isolate the lower 32 bits of each 64‑bit limb. Vector512 mask32 = Vector512.Create(0xFFFFFFFFUL); - // 2. Rearrange the 64‐bit limbs into 512‐bit vectors. - // x0010 = [ x0, x0, x1, x0 ] - Vector256 x0010 = Avx2.Permute4x64(x0123, 16); - // x1201 = [ x1, x2, x0, x1 ] - Vector256 x1201 = Avx2.Permute4x64(x0123, 73); - // x00101201 = [ x0, x0, x1, x0, x1, x2, x0, x1 ] - Vector512 x00101201 = Vector512.Create(x0010, x1201); - - // y0102 = [ y0, y1, y0, y2 ] - Vector256 y0102 = Avx2.Permute4x64(y0123, 132); - // y1032 = [ y1, y0, y3, y2 ] - Vector256 y1032 = Avx2.Permute4x64(y0123, 177); - // y01021032 = [ y0, y1, y0, y2, y1, y0, y3, y2 ] - Vector512 y01021032 = Vector512.Create(y0102, y1032); - - // 3. Multiply the corresponding 64‐bit limbs. - - // Split each 64-bit operand into 32-bit halves: - Vector512 xLo = Avx512F.And(x00101201, mask32); - Vector512 xHi = Avx512F.ShiftRightLogical(x00101201, 32); - Vector512 yLo = Avx512F.And(y01021032, mask32); - Vector512 yHi = Avx512F.ShiftRightLogical(y01021032, 32); - - // Compute the four 32x32 partial products. - // Each multiplication here is on 32-bit values, so the result fits in 64 bits. - Vector512 u0 = Avx512DQ.MultiplyLow(xLo, yLo); - Vector512 u1 = Avx512DQ.MultiplyLow(xLo, yHi); - Vector512 u2 = Avx512DQ.MultiplyLow(xHi, yLo); - Vector512 u3 = Avx512DQ.MultiplyLow(xHi, yHi); - - // Now, compute t = (u0 >> 32) + (u1 & mask32) + (u2 & mask32) - Vector512 t = Avx512F.Add(Avx512F.Add(Avx512F.ShiftRightLogical(u0, 32), Avx512F.And(u1, mask32)), Avx512F.And(u2, mask32)); - - // Now, assemble the lower 64 bits: - // low part of u0 is u0 & mask32; low 32 bits of t are (t & mask32) shifted left 32. - Vector512 partialLo = Avx512F.Or(Avx512F.And(u0, mask32), Avx512F.ShiftLeftLogical(Avx512F.And(t, mask32), 32)); - - // The high 64 bits are: u3 + (u1 >> 32) + (u2 >> 32) + (t >> 32). - Vector512 partialHi = Avx512F.Add(Avx512F.Add(Avx512F.Add(u3, Avx512F.ShiftRightLogical(u1, 32)), Avx512F.ShiftRightLogical(u2, 32)), Avx512F.ShiftRightLogical(t, 32)); - - // 4. Rearrange the six “group‑1” products (prod00, prod01, prod10, prod02, prod11, prod20) - // into 128‑bit quantities. (Here we use the AVX‑512 “extract 128‑bit” function to get two adjacent 64‑bit lanes.) - // – Products 0 and 1 come from index 0: - Vector128 pair01Lo = Avx512F.ExtractVector128(partialLo, 0); // lanes 0–1: prod00_lo, prod01_lo - Vector128 pair01Hi = Avx512F.ExtractVector128(partialHi, 0); // lanes 0–1: prod00_hi, prod01_hi - // Unpack lower (lane0) and upper (lane1) to form product0 and product1: - Vector128 prod0 = Sse2.UnpackLow(pair01Lo, pair01Hi); // prod00 = {lo, hi} - Vector128 prod1 = Sse2.UnpackHigh(pair01Lo, pair01Hi); // prod01 = {lo, hi} - - // – Products 2 and 3 come from index 1: - Vector128 pair23Lo = Avx512F.ExtractVector128(partialLo, 1); // lanes 2–3: prod10_lo, prod02_lo - Vector128 pair23Hi = Avx512F.ExtractVector128(partialHi, 1); // lanes 2–3: prod10_hi, prod02_hi - Vector128 prod2 = Sse2.UnpackLow(pair23Lo, pair23Hi); // prod10 - Vector128 prod3 = Sse2.UnpackHigh(pair23Lo, pair23Hi); // prod02 - - // – Products 4 and 5 come from index 2: - Vector128 pair45Lo = Avx512F.ExtractVector128(partialLo, 2); // lanes 4–5: prod11_lo, prod20_lo - Vector128 pair45Hi = Avx512F.ExtractVector128(partialHi, 2); // lanes 4–5: prod11_hi, prod20_hi - Vector128 prod4 = Sse2.UnpackLow(pair45Lo, pair45Hi); // prod11 - Vector128 prod5 = Sse2.UnpackHigh(pair45Lo, pair45Hi); // prod20 - - // 5. 
Group‑1 “cross‑term” addition: - // crossSum = prod01 + prod10 (i.e. add the 128‑bit numbers prod1 and prod2) - Vector128 crossSum = Add128(prod1, prod2); - - // 6. Add the low half of crossSum (i.e. its lower 64 bits) to prod00’s high limb. - // Instead of extracting a scalar, we broadcast the lower 64 bits to a vector. - // (Assume BroadcastLower128 returns a copy with both lanes equal to element0.) - Vector128 addMask = Sse2.UnpackLow(Vector128.Zero, crossSum); - Vector128 prod0Updated = Sse2.Add(prod0, addMask); - - // Now, compute the “carry” from that addition. (Again, we must compute a one‐bit flag.) - Vector128 carryFlag = Sse2.ShiftRightLogical(Avx512F.VL.CompareLessThan( - Sse2.UnpackHigh(prod0Updated, prod0Updated), // compare updated high limb... - Sse2.UnpackHigh(prod0, prod0) // ...with the original high limb - ), 63); - // Add the carry to the high half of crossSum. (Broadcast the carry into a 128‑bit vector.) - Vector128 csHigh = Sse2.UnpackHigh(crossSum, crossSum); - Vector128 limb2 = Sse2.Add(csHigh, carryFlag); - - // And form limb3 from a comparison of prod01’s high limb with crossSum’s high: - // Shift right each lane by 63 bits, so that 0 becomes 0 and 0xFFFFFFFFFFFFFFFF becomes 1. - Vector128 limb3Vec = Sse2.ShiftRightLogical(Avx512F.VL.CompareGreaterThan( - Sse2.UnpackHigh(prod1, prod1), csHigh), 63); - - // Pack limb2 into the lower half and limb3 into the upper half. - Vector128 upperIntermediate = Sse2.UnpackLow(limb2, limb3Vec); - // 7. Build the 256‑bit “intermediate” result from group‑1: - // Lower 128 bits = prod00 (with updated high limb) - // Upper 128 bits = (limb2, limb3) packed into a 128‑bit vector. - Vector128 lowerIntermediate = prod0Updated; - Vector256 intermediateResult = Vector256.Create(lowerIntermediate, upperIntermediate); - - // 8. Process group‑2: (prod02, prod11, prod20) - Vector128 group2Sum = Add128(prod3, prod4); - Vector128 totalGroup2 = Add128(group2Sum, prod5); - // Add totalGroup2 into the current upper 128 bits of intermediateResult. + // 2. Rearrange the 64‑bit limbs into 512‑bit vectors for the partial products. + // For x: + // xPerm1 = [ x0, x0, x1, x0 ] + Vector256 xPerm1 = Avx2.Permute4x64(vecX, 16); + // xPerm2 = [ x1, x2, x0, x1 ] + Vector256 xPerm2 = Avx2.Permute4x64(vecX, 73); + // xRearranged = [ x0, x0, x1, x0, x1, x2, x0, x1 ] + Vector512 xRearranged = Vector512.Create(xPerm1, xPerm2); + + // For y: + // yPerm1 = [ y0, y1, y0, y2 ] + Vector256 yPerm1 = Avx2.Permute4x64(vecY, 132); + // yPerm2 = [ y1, y0, y3, y2 ] + Vector256 yPerm2 = Avx2.Permute4x64(vecY, 177); + // yRearranged = [ y0, y1, y0, y2, y1, y0, y3, y2 ] + Vector512 yRearranged = Vector512.Create(yPerm1, yPerm2); + + // 3. Split each 64‑bit limb into its lower and upper 32‑bit halves. + Vector512 xLowerParts = Avx512F.And(xRearranged, mask32); + Vector512 xUpperParts = Avx512F.ShiftRightLogical(xRearranged, 32); + Vector512 yLowerParts = Avx512F.And(yRearranged, mask32); + Vector512 yUpperParts = Avx512F.ShiftRightLogical(yRearranged, 32); + + // Compute four 32x32‑bit partial products (each fits in 64 bits). 
+ Vector512 prodLL = Avx512DQ.MultiplyLow(xLowerParts, yLowerParts); // lower×lower + Vector512 prodLH = Avx512DQ.MultiplyLow(xLowerParts, yUpperParts); // lower×upper + Vector512 prodHL = Avx512DQ.MultiplyLow(xUpperParts, yLowerParts); // upper×lower + Vector512 prodHH = Avx512DQ.MultiplyLow(xUpperParts, yUpperParts); // upper×upper + + // Compute an intermediate term: + // termT = (prodLL >> 32) + (prodLH & mask32) + (prodHL & mask32) + Vector512 termT = Avx512F.Add( + Avx512F.Add(Avx512F.ShiftRightLogical(prodLL, 32), + Avx512F.And(prodLH, mask32)), + Avx512F.And(prodHL, mask32)); + + // Assemble the lower 64 bits of each partial product: + // lowerPartial = (prodLL & mask32) OR ((termT & mask32) << 32) + Vector512 lowerPartial = Avx512F.Or( + Avx512F.And(prodLL, mask32), + Avx512F.ShiftLeftLogical(Avx512F.And(termT, mask32), 32)); + + // Assemble the higher 64 bits: + // higherPartial = prodHH + (prodLH >> 32) + (prodHL >> 32) + (termT >> 32) + Vector512 higherPartial = Avx512F.Add( + Avx512F.Add( + Avx512F.Add(prodHH, Avx512F.ShiftRightLogical(prodLH, 32)), + Avx512F.ShiftRightLogical(prodHL, 32)), + Avx512F.ShiftRightLogical(termT, 32)); + + // 4. Unpack the 512‑bit partial results into six 128‑bit values. + // Group 1 (products 0 and 1): + Vector128 pair01Lo = Avx512F.ExtractVector128(lowerPartial, 0); // lanes 0–1: product0 (low), product1 (low) + Vector128 pair01Hi = Avx512F.ExtractVector128(higherPartial, 0); // lanes 0–1: product0 (high), product1 (high) + Vector128 product0 = Sse2.UnpackLow(pair01Lo, pair01Hi); // product0 = { low, high } + Vector128 product1 = Sse2.UnpackHigh(pair01Lo, pair01Hi); // product1 = { low, high } + + // Group 2 (products 2 and 3): + Vector128 pair23Lo = Avx512F.ExtractVector128(lowerPartial, 1); // lanes 2–3 + Vector128 pair23Hi = Avx512F.ExtractVector128(higherPartial, 1); // lanes 2–3 + Vector128 product2 = Sse2.UnpackLow(pair23Lo, pair23Hi); + Vector128 product3 = Sse2.UnpackHigh(pair23Lo, pair23Hi); + + // Group 3 (products 4 and 5): + Vector128 pair45Lo = Avx512F.ExtractVector128(lowerPartial, 2); // lanes 4–5 + Vector128 pair45Hi = Avx512F.ExtractVector128(higherPartial, 2); // lanes 4–5 + Vector128 product4 = Sse2.UnpackLow(pair45Lo, pair45Hi); + Vector128 product5 = Sse2.UnpackHigh(pair45Lo, pair45Hi); + + // 5. Group 1 cross‑term addition: + // Compute crossSum = product1 + product2 (as 128‑bit numbers). + Vector128 crossSum = Add128(product1, product2); + + // 6. Add the lower 64 bits of crossSum to the high limb of product0. + // Broadcast crossSum’s low 64 bits into both lanes. + Vector128 crossAddMask = Sse2.UnpackLow(Vector128.Zero, crossSum); + Vector128 updatedProduct0 = Sse2.Add(product0, crossAddMask); + + // Compute the carry from that addition by comparing the high limbs before and after. + Vector128 product0HighBefore = Sse2.UnpackHigh(product0, product0); + Vector128 product0HighAfter = Sse2.UnpackHigh(updatedProduct0, updatedProduct0); + Vector128 carryFlag = Sse2.ShiftRightLogical( + Avx512F.VL.CompareLessThan(product0HighAfter, product0HighBefore), + 63); + // Propagate the carry by adding it to crossSum’s high limb. + Vector128 crossSumHigh = Sse2.UnpackHigh(crossSum, crossSum); + Vector128 limb2 = Sse2.Add(crossSumHigh, carryFlag); + + // Determine an extra carry if product1’s high limb exceeds crossSum’s high limb. + Vector128 limb3 = Sse2.ShiftRightLogical( + Avx512F.VL.CompareGreaterThan(Sse2.UnpackHigh(product1, product1), crossSumHigh), + 63); + + // Pack limb2 (low) and limb3 (high) to form the new upper half. 
+ Vector128 upperIntermediate = Sse2.UnpackLow(limb2, limb3); + + // 7. Build the intermediate 256‑bit result. + // Lower 128 bits come from the updated product0; upper 128 bits come from (limb2, limb3). + Vector256 intermediateResult = Vector256.Create(updatedProduct0, upperIntermediate); + + // 8. Group 2 combination: + // Sum product3, product4, and product5. + Vector128 group2Sum = Add128(product3, product4); + Vector128 totalGroup2 = Add128(group2Sum, product5); + // Add this total into the upper 128 bits of the intermediate result. Vector128 currentUpper = intermediateResult.GetUpper(); Vector128 newUpper = Add128(currentUpper, totalGroup2); intermediateResult = Avx2.InsertVector128(intermediateResult, newUpper, 1); - // 9. Process group‑3: - // Multiply x23 and y10 (with the proper reversed order) then add in the remaining lower parts. - Vector128 aHigh = Vector128.Create(x.u2, x.u3); - Vector128 bLow = Vector128.Create(y.u1, y.u0); - // Use the AVX512DQ MultiplyLow intrinsic (which multiplies 64‑bit integers and returns the low 64 bits) - Vector128 finalProdLow = Avx512DQ.VL.MultiplyLow(aHigh, bLow); - - // Extract from partialLo the two lower parts for prod03 and prod12. - // With partialLo logically split into Lower (lanes 0–3) and Upper (lanes 4–7), - // lanes 6 and 7 are in the Upper half; extracting the second 128‐bit portion of Upper gives us these lanes. - Vector128 prod6 = Avx2.ExtractVector128(partialLo.GetUpper(), 1); - // Extract from index 3 the two lower‐parts from prod03 and prod12 (which we stored in “prod6”): - // (Note: prod6 already holds both lower parts.) - finalProdLow = Sse2.Add(finalProdLow, prod6); - // Reinterpret the 64-bit integer vector as a vector of two doubles. - // Then use _mm_shuffle_pd (exposed as Sse2.Shuffle for doubles) to swap the two lanes. - Vector128 swapped = Sse2.Shuffle(finalProdLow.AsDouble(), finalProdLow.AsDouble(), 0x1).AsUInt64(); - // Add the original vector and the swapped vector. - // This results in a vector where both lanes equal (vec[0] + vec[1]). - Vector128 horizontalSum = Sse2.Add(finalProdLow, swapped); - // Add the horizontal sum (broadcast into the high lane) to the most–significant limb of intermediateResult. - // 2. Use a unpackHigh with a zero vector to directly form { 0, horizontalSum[0] } - Vector128 high = Sse2.UnpackHigh(Vector128.Zero, horizontalSum); - intermediateResult = Avx2.Add(intermediateResult, Vector256.Create(Vector128.Zero, high)); + // 9. Group 3 cross‑terms: + // Multiply the high limbs of x (x.u2, x.u3) with the low limbs of y (y.u1, y.u0) in reversed order. + Vector128 xHigh = Vector128.Create(x.u2, x.u3); + Vector128 yLow = Vector128.Create(y.u1, y.u0); + Vector128 finalProdLow = Avx512DQ.VL.MultiplyLow(xHigh, yLow); + + // Add in the extra lower parts from the upper half of lowerPartial. + Vector128 extraLow = Avx2.ExtractVector128(lowerPartial.GetUpper(), 1); + finalProdLow = Sse2.Add(finalProdLow, extraLow); + // Perform a horizontal sum so that both lanes contain the same result. + Vector128 swappedFinal = Sse2.UnpackLow(finalProdLow, finalProdLow); + Vector128 horizontalSum = Sse2.Add(finalProdLow, swappedFinal); + // Add the horizontal sum (broadcast into the high lane) to the most‑significant limb. + Vector128 highCarry = Sse2.UnpackHigh(Vector128.Zero, horizontalSum); + intermediateResult = Avx2.Add(intermediateResult, Vector256.Create(Vector128.Zero, highCarry)); // 10. Write out the final 256‑bit result. 
Unsafe.SkipInit(out res); From de893e8d9485a9fb478a7d14ecc2fdc5128d8688 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 10:23:48 +0000 Subject: [PATCH 30/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index eb30ef2..2633f48 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1076,24 +1076,21 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) Avx512F.ShiftRightLogical(prodHL, 32)), Avx512F.ShiftRightLogical(termT, 32)); + Vector512 productLow = Avx512F.UnpackLow(lowerPartial, higherPartial); + Vector512 productHi = Avx512F.UnpackHigh(lowerPartial, higherPartial); + // 4. Unpack the 512‑bit partial results into six 128‑bit values. // Group 1 (products 0 and 1): - Vector128 pair01Lo = Avx512F.ExtractVector128(lowerPartial, 0); // lanes 0–1: product0 (low), product1 (low) - Vector128 pair01Hi = Avx512F.ExtractVector128(higherPartial, 0); // lanes 0–1: product0 (high), product1 (high) - Vector128 product0 = Sse2.UnpackLow(pair01Lo, pair01Hi); // product0 = { low, high } - Vector128 product1 = Sse2.UnpackHigh(pair01Lo, pair01Hi); // product1 = { low, high } + Vector128 product0 = Avx512F.ExtractVector128(productLow, 0); + Vector128 product1 = Avx512F.ExtractVector128(productHi, 0); // Group 2 (products 2 and 3): - Vector128 pair23Lo = Avx512F.ExtractVector128(lowerPartial, 1); // lanes 2–3 - Vector128 pair23Hi = Avx512F.ExtractVector128(higherPartial, 1); // lanes 2–3 - Vector128 product2 = Sse2.UnpackLow(pair23Lo, pair23Hi); - Vector128 product3 = Sse2.UnpackHigh(pair23Lo, pair23Hi); + Vector128 product2 = Avx512F.ExtractVector128(productLow, 1); + Vector128 product3 = Avx512F.ExtractVector128(productHi, 1); // Group 3 (products 4 and 5): - Vector128 pair45Lo = Avx512F.ExtractVector128(lowerPartial, 2); // lanes 4–5 - Vector128 pair45Hi = Avx512F.ExtractVector128(higherPartial, 2); // lanes 4–5 - Vector128 product4 = Sse2.UnpackLow(pair45Lo, pair45Hi); - Vector128 product5 = Sse2.UnpackHigh(pair45Lo, pair45Hi); + Vector128 product4 = Avx512F.ExtractVector128(productLow, 2); + Vector128 product5 = Avx512F.ExtractVector128(productHi, 2); // 5. Group 1 cross‑term addition: // Compute crossSum = product1 + product2 (as 128‑bit numbers). From d163c6590ba4b2f3e44542fe0c7e06644af89bdf Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 10:29:56 +0000 Subject: [PATCH 31/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 2633f48..d787dab 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1119,18 +1119,16 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) // Pack limb2 (low) and limb3 (high) to form the new upper half. Vector128 upperIntermediate = Sse2.UnpackLow(limb2, limb3); - // 7. Build the intermediate 256‑bit result. - // Lower 128 bits come from the updated product0; upper 128 bits come from (limb2, limb3). - Vector256 intermediateResult = Vector256.Create(updatedProduct0, upperIntermediate); // 8. Group 2 combination: // Sum product3, product4, and product5. Vector128 group2Sum = Add128(product3, product4); Vector128 totalGroup2 = Add128(group2Sum, product5); // Add this total into the upper 128 bits of the intermediate result. 
- Vector128 currentUpper = intermediateResult.GetUpper(); - Vector128 newUpper = Add128(currentUpper, totalGroup2); - intermediateResult = Avx2.InsertVector128(intermediateResult, newUpper, 1); + Vector128 newUpper = Add128(upperIntermediate, totalGroup2); + // 7. Build the intermediate 256‑bit result. + // Lower 128 bits come from the updated product0; upper 128 bits come from (limb2, limb3). + Vector256 intermediateResult = Vector256.Create(updatedProduct0, newUpper); // 9. Group 3 cross‑terms: // Multiply the high limbs of x (x.u2, x.u3) with the low limbs of y (y.u1, y.u0) in reversed order. From 0bae58314e06c92d868e2a689cebdbf80e5dc838 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 10:33:33 +0000 Subject: [PATCH 32/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index d787dab..6fe0ae0 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1128,7 +1128,6 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) Vector128 newUpper = Add128(upperIntermediate, totalGroup2); // 7. Build the intermediate 256‑bit result. // Lower 128 bits come from the updated product0; upper 128 bits come from (limb2, limb3). - Vector256 intermediateResult = Vector256.Create(updatedProduct0, newUpper); // 9. Group 3 cross‑terms: // Multiply the high limbs of x (x.u2, x.u3) with the low limbs of y (y.u1, y.u0) in reversed order. @@ -1144,7 +1143,8 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) Vector128 horizontalSum = Sse2.Add(finalProdLow, swappedFinal); // Add the horizontal sum (broadcast into the high lane) to the most‑significant limb. Vector128 highCarry = Sse2.UnpackHigh(Vector128.Zero, horizontalSum); - intermediateResult = Avx2.Add(intermediateResult, Vector256.Create(Vector128.Zero, highCarry)); + newUpper = Sse2.Add(newUpper, highCarry); + Vector256 intermediateResult = Vector256.Create(updatedProduct0, newUpper); // 10. Write out the final 256‑bit result. Unsafe.SkipInit(out res); From 8a92748f87e4b60fd0eed619a3e96168c433fefc Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 10:45:59 +0000 Subject: [PATCH 33/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 6fe0ae0..a88322e 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1092,6 +1092,8 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) Vector128 product4 = Avx512F.ExtractVector128(productLow, 2); Vector128 product5 = Avx512F.ExtractVector128(productHi, 2); + Vector128 xHigh = Vector128.Create(x.u2, x.u3); + Vector128 yLow = Vector128.Create(y.u1, y.u0); // 5. Group 1 cross‑term addition: // Compute crossSum = product1 + product2 (as 128‑bit numbers). Vector128 crossSum = Add128(product1, product2); @@ -1119,7 +1121,6 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) // Pack limb2 (low) and limb3 (high) to form the new upper half. Vector128 upperIntermediate = Sse2.UnpackLow(limb2, limb3); - // 8. Group 2 combination: // Sum product3, product4, and product5. 
Vector128 group2Sum = Add128(product3, product4); From 9b843d521d1203ef5c2b7dcab2b458422356b2d4 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 11:09:32 +0000 Subject: [PATCH 34/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index a88322e..3f6a851 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1132,12 +1132,10 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) // 9. Group 3 cross‑terms: // Multiply the high limbs of x (x.u2, x.u3) with the low limbs of y (y.u1, y.u0) in reversed order. - Vector128 xHigh = Vector128.Create(x.u2, x.u3); - Vector128 yLow = Vector128.Create(y.u1, y.u0); Vector128 finalProdLow = Avx512DQ.VL.MultiplyLow(xHigh, yLow); // Add in the extra lower parts from the upper half of lowerPartial. - Vector128 extraLow = Avx2.ExtractVector128(lowerPartial.GetUpper(), 1); + Vector128 extraLow = Avx512F.ExtractVector128(lowerPartial, 3); finalProdLow = Sse2.Add(finalProdLow, extraLow); // Perform a horizontal sum so that both lanes contain the same result. Vector128 swappedFinal = Sse2.UnpackLow(finalProdLow, finalProdLow); From bca256d2fa0916fffcec1015460ad48adc99cbae Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 11:35:21 +0000 Subject: [PATCH 35/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 202 ++++++++++++------------------- 1 file changed, 80 insertions(+), 122 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 3f6a851..2ea030e 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -998,8 +998,7 @@ private static void SubtractWithBorrow(ulong a, ulong b, ref ulong borrow, out u } /// /// Multiplies two 256‑bit unsigned integers ( and ) and - /// writes the 256‑bit product to . This implementation uses AVX‑512, - /// AVX2, and SSE2 intrinsics for high‑performance multi‑precision arithmetic. + /// writes the 256‑bit product to . /// /// The first 256‑bit unsigned integer. /// The second 256‑bit unsigned integer. @@ -1009,145 +1008,136 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) // If both inputs fit in 64 bits, use a simple multiplication routine. if ((x.u1 | x.u2 | x.u3 | y.u1 | y.u2 | y.u3) == 0) { - MultiplyULong(x, y, out res); + // Fast multiply for numbers less than 2^64 (18,446,744,073,709,551,615) + ulong high = Math.BigMul(x.u0, y.u0, out ulong low); + // Assignment to res after multiply in case is used as input for x or y (by ref aliasing) + res = default; + Unsafe.AsRef(in res.u0) = low; + Unsafe.AsRef(in res.u1) = high; return; } - // Fallback to a non‑AVX‑512 implementation if the required intrinsics are not supported. + // Fallback if the required AVX‑512 intrinsics are not supported. 
if (!Avx512F.IsSupported || !Avx512DQ.IsSupported) { - MultiplyNonAvx512(x, y, out res); + ref ulong rx = ref Unsafe.As(ref Unsafe.AsRef(in x)); + ref ulong ry = ref Unsafe.As(ref Unsafe.AsRef(in y)); + + (ulong carry, ulong r0) = Multiply64(rx, ry); + UmulHop(carry, Unsafe.Add(ref rx, 1), ry, out carry, out ulong res1); + UmulHop(carry, Unsafe.Add(ref rx, 2), ry, out carry, out ulong res2); + ulong res3 = Unsafe.Add(ref rx, 3) * ry + carry; + + UmulHop(res1, rx, Unsafe.Add(ref ry, 1), out carry, out ulong r1); + UmulStep(res2, Unsafe.Add(ref rx, 1), Unsafe.Add(ref ry, 1), carry, out carry, out res2); + res3 = res3 + Unsafe.Add(ref rx, 2) * Unsafe.Add(ref ry, 1) + carry; + + UmulHop(res2, rx, Unsafe.Add(ref ry, 2), out carry, out ulong r2); + res3 = res3 + Unsafe.Add(ref rx, 1) * Unsafe.Add(ref ry, 2) + carry; + + ulong r3 = res3 + rx * Unsafe.Add(ref ry, 3); + + res = new UInt256(r0, r1, r2, r3); return; } - // 1. Load the 256‑bit inputs into 256‑bit vector registers. + // Step 1: load the inputs and prepare the mask constant. Vector256 vecX = Unsafe.As>(ref Unsafe.AsRef(in x)); Vector256 vecY = Unsafe.As>(ref Unsafe.AsRef(in y)); - - // Create a 512‑bit mask to isolate the lower 32 bits of each 64‑bit limb. Vector512 mask32 = Vector512.Create(0xFFFFFFFFUL); - // 2. Rearrange the 64‑bit limbs into 512‑bit vectors for the partial products. - // For x: - // xPerm1 = [ x0, x0, x1, x0 ] - Vector256 xPerm1 = Avx2.Permute4x64(vecX, 16); - // xPerm2 = [ x1, x2, x0, x1 ] - Vector256 xPerm2 = Avx2.Permute4x64(vecX, 73); - // xRearranged = [ x0, x0, x1, x0, x1, x2, x0, x1 ] - Vector512 xRearranged = Vector512.Create(xPerm1, xPerm2); + // Step 2: permute x and y. These operations are independent. + Vector256 xPerm1 = Avx2.Permute4x64(vecX, 16); // [ x0, x0, x1, x0 ] + Vector256 yPerm1 = Avx2.Permute4x64(vecY, 132); // [ y0, y1, y0, y2 ] + Vector256 xPerm2 = Avx2.Permute4x64(vecX, 73); // [ x1, x2, x0, x1 ] + Vector256 yPerm2 = Avx2.Permute4x64(vecY, 177); // [ y1, y0, y3, y2 ] - // For y: - // yPerm1 = [ y0, y1, y0, y2 ] - Vector256 yPerm1 = Avx2.Permute4x64(vecY, 132); - // yPerm2 = [ y1, y0, y3, y2 ] - Vector256 yPerm2 = Avx2.Permute4x64(vecY, 177); - // yRearranged = [ y0, y1, y0, y2, y1, y0, y3, y2 ] + Vector512 xRearranged = Vector512.Create(xPerm1, xPerm2); Vector512 yRearranged = Vector512.Create(yPerm1, yPerm2); - // 3. Split each 64‑bit limb into its lower and upper 32‑bit halves. + // Step 3: split each 64‑bit limb into its lower and upper 32‑bit parts. Vector512 xLowerParts = Avx512F.And(xRearranged, mask32); - Vector512 xUpperParts = Avx512F.ShiftRightLogical(xRearranged, 32); Vector512 yLowerParts = Avx512F.And(yRearranged, mask32); + Vector512 xUpperParts = Avx512F.ShiftRightLogical(xRearranged, 32); Vector512 yUpperParts = Avx512F.ShiftRightLogical(yRearranged, 32); - // Compute four 32x32‑bit partial products (each fits in 64 bits). 
- Vector512 prodLL = Avx512DQ.MultiplyLow(xLowerParts, yLowerParts); // lower×lower - Vector512 prodLH = Avx512DQ.MultiplyLow(xLowerParts, yUpperParts); // lower×upper - Vector512 prodHL = Avx512DQ.MultiplyLow(xUpperParts, yLowerParts); // upper×lower - Vector512 prodHH = Avx512DQ.MultiplyLow(xUpperParts, yUpperParts); // upper×upper - - // Compute an intermediate term: - // termT = (prodLL >> 32) + (prodLH & mask32) + (prodHL & mask32) - Vector512 termT = Avx512F.Add( - Avx512F.Add(Avx512F.ShiftRightLogical(prodLL, 32), - Avx512F.And(prodLH, mask32)), - Avx512F.And(prodHL, mask32)); - - // Assemble the lower 64 bits of each partial product: - // lowerPartial = (prodLL & mask32) OR ((termT & mask32) << 32) - Vector512 lowerPartial = Avx512F.Or( - Avx512F.And(prodLL, mask32), - Avx512F.ShiftLeftLogical(Avx512F.And(termT, mask32), 32)); - - // Assemble the higher 64 bits: - // higherPartial = prodHH + (prodLH >> 32) + (prodHL >> 32) + (termT >> 32) - Vector512 higherPartial = Avx512F.Add( - Avx512F.Add( - Avx512F.Add(prodHH, Avx512F.ShiftRightLogical(prodLH, 32)), - Avx512F.ShiftRightLogical(prodHL, 32)), - Avx512F.ShiftRightLogical(termT, 32)); - + // Step 4: launch four 32×32‑bit multiplications in parallel. + Vector512 prodLL = Avx512DQ.MultiplyLow(xLowerParts, yLowerParts); // lower × lower + Vector512 prodLH = Avx512DQ.MultiplyLow(xLowerParts, yUpperParts); // lower × upper + Vector512 prodHL = Avx512DQ.MultiplyLow(xUpperParts, yLowerParts); // upper × lower + Vector512 prodHH = Avx512DQ.MultiplyLow(xUpperParts, yUpperParts); // upper × upper + + // Step 5: compute the intermediate term while the multiplications are in flight. + Vector512 prodLL_hi = Avx512F.ShiftRightLogical(prodLL, 32); + Vector512 prodLH_lo = Avx512F.And(prodLH, mask32); + Vector512 prodHL_lo = Avx512F.And(prodHL, mask32); + Vector512 termT = Avx512F.Add(Avx512F.Add(prodLL_hi, prodLH_lo), prodHL_lo); + + // Step 6: assemble the lower and higher partial results. + Vector512 lowerPartial = + Avx512F.Or( + Avx512F.And(prodLL, mask32), + Avx512F.ShiftLeftLogical(Avx512F.And(termT, mask32), 32)); + Vector512 higherPartial = + Avx512F.Add( + Avx512F.Add( + Avx512F.Add(prodHH, Avx512F.ShiftRightLogical(prodLH, 32)), + Avx512F.ShiftRightLogical(prodHL, 32)), + Avx512F.ShiftRightLogical(termT, 32)); + + // Step 7: unpack the 512‑bit results into two groups. Vector512 productLow = Avx512F.UnpackLow(lowerPartial, higherPartial); Vector512 productHi = Avx512F.UnpackHigh(lowerPartial, higherPartial); - // 4. Unpack the 512‑bit partial results into six 128‑bit values. - // Group 1 (products 0 and 1): + // Step 8: extract the 128‑bit groups. Vector128 product0 = Avx512F.ExtractVector128(productLow, 0); Vector128 product1 = Avx512F.ExtractVector128(productHi, 0); - - // Group 2 (products 2 and 3): Vector128 product2 = Avx512F.ExtractVector128(productLow, 1); Vector128 product3 = Avx512F.ExtractVector128(productHi, 1); - - // Group 3 (products 4 and 5): Vector128 product4 = Avx512F.ExtractVector128(productLow, 2); Vector128 product5 = Avx512F.ExtractVector128(productHi, 2); + // Step 9: issue memory request for remaining parts. Vector128 xHigh = Vector128.Create(x.u2, x.u3); Vector128 yLow = Vector128.Create(y.u1, y.u0); - // 5. Group 1 cross‑term addition: - // Compute crossSum = product1 + product2 (as 128‑bit numbers). - Vector128 crossSum = Add128(product1, product2); - // 6. Add the lower 64 bits of crossSum to the high limb of product0. - // Broadcast crossSum’s low 64 bits into both lanes. 
+ // Step 10: perform the group 1 cross‑term addition. + Vector128 crossSum = Add128(product1, product2); Vector128 crossAddMask = Sse2.UnpackLow(Vector128.Zero, crossSum); Vector128 updatedProduct0 = Sse2.Add(product0, crossAddMask); - // Compute the carry from that addition by comparing the high limbs before and after. + // Compute the carry from adding crossSum’s low 64 bits. Vector128 product0HighBefore = Sse2.UnpackHigh(product0, product0); Vector128 product0HighAfter = Sse2.UnpackHigh(updatedProduct0, updatedProduct0); - Vector128 carryFlag = Sse2.ShiftRightLogical( - Avx512F.VL.CompareLessThan(product0HighAfter, product0HighBefore), - 63); - // Propagate the carry by adding it to crossSum’s high limb. + Vector128 carryFlag = + Sse2.ShiftRightLogical( + Avx512F.VL.CompareLessThan(product0HighAfter, product0HighBefore), + 63); Vector128 crossSumHigh = Sse2.UnpackHigh(crossSum, crossSum); Vector128 limb2 = Sse2.Add(crossSumHigh, carryFlag); - - // Determine an extra carry if product1’s high limb exceeds crossSum’s high limb. - Vector128 limb3 = Sse2.ShiftRightLogical( - Avx512F.VL.CompareGreaterThan(Sse2.UnpackHigh(product1, product1), crossSumHigh), - 63); - - // Pack limb2 (low) and limb3 (high) to form the new upper half. + Vector128 limb3 = + Sse2.ShiftRightLogical( + Avx512F.VL.CompareGreaterThan(Sse2.UnpackHigh(product1, product1), crossSumHigh), + 63); Vector128 upperIntermediate = Sse2.UnpackLow(limb2, limb3); - // 8. Group 2 combination: - // Sum product3, product4, and product5. + // Step 11: combine group 2 partial results. Vector128 group2Sum = Add128(product3, product4); Vector128 totalGroup2 = Add128(group2Sum, product5); - // Add this total into the upper 128 bits of the intermediate result. - Vector128 newUpper = Add128(upperIntermediate, totalGroup2); - // 7. Build the intermediate 256‑bit result. - // Lower 128 bits come from the updated product0; upper 128 bits come from (limb2, limb3). + Vector128 newHalf = Add128(upperIntermediate, totalGroup2); - // 9. Group 3 cross‑terms: - // Multiply the high limbs of x (x.u2, x.u3) with the low limbs of y (y.u1, y.u0) in reversed order. + // Step 12: process group 3 cross‑terms. Vector128 finalProdLow = Avx512DQ.VL.MultiplyLow(xHigh, yLow); - - // Add in the extra lower parts from the upper half of lowerPartial. Vector128 extraLow = Avx512F.ExtractVector128(lowerPartial, 3); finalProdLow = Sse2.Add(finalProdLow, extraLow); - // Perform a horizontal sum so that both lanes contain the same result. Vector128 swappedFinal = Sse2.UnpackLow(finalProdLow, finalProdLow); Vector128 horizontalSum = Sse2.Add(finalProdLow, swappedFinal); - // Add the horizontal sum (broadcast into the high lane) to the most‑significant limb. Vector128 highCarry = Sse2.UnpackHigh(Vector128.Zero, horizontalSum); - newUpper = Sse2.Add(newUpper, highCarry); - Vector256 intermediateResult = Vector256.Create(updatedProduct0, newUpper); + newHalf = Sse2.Add(newHalf, highCarry); - // 10. Write out the final 256‑bit result. + // Combine the results into the final 256‑bit value. + Vector256 finalResult = Vector256.Create(updatedProduct0, newHalf); Unsafe.SkipInit(out res); - Unsafe.As>(ref res) = intermediateResult; + Unsafe.As>(ref res) = finalResult; /// /// Adds two 128-bit unsigned integers while propagating an overflow (carry) from the lower 64-bit lane to the higher lane. 
@@ -1182,38 +1172,6 @@ static Vector128 Add128(Vector128 left, Vector128 right) } } - private static void MultiplyNonAvx512(UInt256 x, UInt256 y, out UInt256 res) - { - ref ulong rx = ref Unsafe.As(ref Unsafe.AsRef(in x)); - ref ulong ry = ref Unsafe.As(ref Unsafe.AsRef(in y)); - - (ulong carry, ulong r0) = Multiply64(rx, ry); - UmulHop(carry, Unsafe.Add(ref rx, 1), ry, out carry, out ulong res1); - UmulHop(carry, Unsafe.Add(ref rx, 2), ry, out carry, out ulong res2); - ulong res3 = Unsafe.Add(ref rx, 3) * ry + carry; - - UmulHop(res1, rx, Unsafe.Add(ref ry, 1), out carry, out ulong r1); - UmulStep(res2, Unsafe.Add(ref rx, 1), Unsafe.Add(ref ry, 1), carry, out carry, out res2); - res3 = res3 + Unsafe.Add(ref rx, 2) * Unsafe.Add(ref ry, 1) + carry; - - UmulHop(res2, rx, Unsafe.Add(ref ry, 2), out carry, out ulong r2); - res3 = res3 + Unsafe.Add(ref rx, 1) * Unsafe.Add(ref ry, 2) + carry; - - ulong r3 = res3 + rx * Unsafe.Add(ref ry, 3); - - res = new UInt256(r0, r1, r2, r3); - } - - private static void MultiplyULong(UInt256 x, UInt256 y, out UInt256 res) - { - // Fast multiply for numbers less than 2^64 (18,446,744,073,709,551,615) - ulong high = Math.BigMul(x.u0, y.u0, out ulong low); - // Assignment to res after multiply in case is used as input for x or y (by ref aliasing) - res = default; - Unsafe.AsRef(in res.u0) = low; - Unsafe.AsRef(in res.u1) = high; - } - public void Multiply(in UInt256 a, out UInt256 res) => Multiply(this, a, out res); public static bool MultiplyOverflow(in UInt256 x, in UInt256 y, out UInt256 res) From f380ecf55157073e113c8376d93bcd37b86c7fdf Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 21:12:08 +0000 Subject: [PATCH 36/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 2ea030e..6bdd8b8 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1003,6 +1003,7 @@ private static void SubtractWithBorrow(ulong a, ulong b, ref ulong borrow, out u /// The first 256‑bit unsigned integer. /// The second 256‑bit unsigned integer. /// When this method returns, contains the 256‑bit product of x and y. + [SkipLocalsInit] public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) { // If both inputs fit in 64 bits, use a simple multiplication routine. @@ -1041,18 +1042,17 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) } // Step 1: load the inputs and prepare the mask constant. - Vector256 vecX = Unsafe.As>(ref Unsafe.AsRef(in x)); - Vector256 vecY = Unsafe.As>(ref Unsafe.AsRef(in y)); + Vector512 xPermute = Vector512.Create(0ul, 0, 1, 0, 1, 2, 0, 1); + Vector512 yPermute = Vector512.Create(0ul, 1, 0, 2, 1, 0, 3, 2); + Unsafe.SkipInit(out Vector512 vecX); + Unsafe.SkipInit(out Vector512 vecY); + vecX = Avx512F.InsertVector256(vecX, Unsafe.As>(ref Unsafe.AsRef(in x)), 0); + vecY = Avx512F.InsertVector256(vecY, Unsafe.As>(ref Unsafe.AsRef(in y)), 0); Vector512 mask32 = Vector512.Create(0xFFFFFFFFUL); // Step 2: permute x and y. These operations are independent. 
- Vector256 xPerm1 = Avx2.Permute4x64(vecX, 16); // [ x0, x0, x1, x0 ] - Vector256 yPerm1 = Avx2.Permute4x64(vecY, 132); // [ y0, y1, y0, y2 ] - Vector256 xPerm2 = Avx2.Permute4x64(vecX, 73); // [ x1, x2, x0, x1 ] - Vector256 yPerm2 = Avx2.Permute4x64(vecY, 177); // [ y1, y0, y3, y2 ] - - Vector512 xRearranged = Vector512.Create(xPerm1, xPerm2); - Vector512 yRearranged = Vector512.Create(yPerm1, yPerm2); + Vector512 xRearranged = Avx512F.PermuteVar8x64(vecX, xPermute); + Vector512 yRearranged = Avx512F.PermuteVar8x64(vecY, yPermute); // Step 3: split each 64‑bit limb into its lower and upper 32‑bit parts. Vector512 xLowerParts = Avx512F.And(xRearranged, mask32); From 2e4113b15d6de40c39a9425608f23ebf28f058f3 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 21:28:52 +0000 Subject: [PATCH 37/38] Revert "Optimize" This reverts commit f380ecf55157073e113c8376d93bcd37b86c7fdf. --- src/Nethermind.Int256/UInt256.cs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 6bdd8b8..2ea030e 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1003,7 +1003,6 @@ private static void SubtractWithBorrow(ulong a, ulong b, ref ulong borrow, out u /// The first 256‑bit unsigned integer. /// The second 256‑bit unsigned integer. /// When this method returns, contains the 256‑bit product of x and y. - [SkipLocalsInit] public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) { // If both inputs fit in 64 bits, use a simple multiplication routine. @@ -1042,17 +1041,18 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) } // Step 1: load the inputs and prepare the mask constant. - Vector512 xPermute = Vector512.Create(0ul, 0, 1, 0, 1, 2, 0, 1); - Vector512 yPermute = Vector512.Create(0ul, 1, 0, 2, 1, 0, 3, 2); - Unsafe.SkipInit(out Vector512 vecX); - Unsafe.SkipInit(out Vector512 vecY); - vecX = Avx512F.InsertVector256(vecX, Unsafe.As>(ref Unsafe.AsRef(in x)), 0); - vecY = Avx512F.InsertVector256(vecY, Unsafe.As>(ref Unsafe.AsRef(in y)), 0); + Vector256 vecX = Unsafe.As>(ref Unsafe.AsRef(in x)); + Vector256 vecY = Unsafe.As>(ref Unsafe.AsRef(in y)); Vector512 mask32 = Vector512.Create(0xFFFFFFFFUL); // Step 2: permute x and y. These operations are independent. - Vector512 xRearranged = Avx512F.PermuteVar8x64(vecX, xPermute); - Vector512 yRearranged = Avx512F.PermuteVar8x64(vecY, yPermute); + Vector256 xPerm1 = Avx2.Permute4x64(vecX, 16); // [ x0, x0, x1, x0 ] + Vector256 yPerm1 = Avx2.Permute4x64(vecY, 132); // [ y0, y1, y0, y2 ] + Vector256 xPerm2 = Avx2.Permute4x64(vecX, 73); // [ x1, x2, x0, x1 ] + Vector256 yPerm2 = Avx2.Permute4x64(vecY, 177); // [ y1, y0, y3, y2 ] + + Vector512 xRearranged = Vector512.Create(xPerm1, xPerm2); + Vector512 yRearranged = Vector512.Create(yPerm1, yPerm2); // Step 3: split each 64‑bit limb into its lower and upper 32‑bit parts. 
Vector512<ulong> xLowerParts = Avx512F.And(xRearranged, mask32); From f4045aa318cf6cb47ec3b53dd0021555a32543a6 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 21:37:46 +0000 Subject: [PATCH 38/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 2ea030e..0905a83 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1003,6 +1003,7 @@ private static void SubtractWithBorrow(ulong a, ulong b, ref ulong borrow, out u /// <param name="x">The first 256‑bit unsigned integer.</param> /// <param name="y">The second 256‑bit unsigned integer.</param> /// <param name="res">When this method returns, contains the 256‑bit product of x and y.</param> + [SkipLocalsInit] public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) { // If both inputs fit in 64 bits, use a simple multiplication routine.
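For reference, the per-lane arithmetic that the AVX-512 path's prodLL/prodLH/prodHL/prodHH and termT vectors implement is the classic 64x64-to-128-bit multiply built from 32-bit halves. The scalar sketch below is illustrative only and not part of the patch series; it follows the same formulas and cross-checks them against Math.BigMul, the routine the 64-bit fast path relies on.

// Scalar reference for the 64x64 -> 128-bit product evaluated lane-wise by the
// AVX-512 path. Names mirror the vector code; values are arbitrary test inputs.
using System;

internal static class Mul64Reference
{
    private static (ulong Hi, ulong Lo) Mul64(ulong x, ulong y)
    {
        const ulong Mask32 = 0xFFFFFFFFUL;
        ulong xLo = x & Mask32, xHi = x >> 32;
        ulong yLo = y & Mask32, yHi = y >> 32;

        ulong ll = xLo * yLo; // lower x lower (prodLL)
        ulong lh = xLo * yHi; // lower x upper (prodLH)
        ulong hl = xHi * yLo; // upper x lower (prodHL)
        ulong hh = xHi * yHi; // upper x upper (prodHH)

        // termT gathers the middle 32-bit columns plus the carry out of ll.
        ulong t = (ll >> 32) + (lh & Mask32) + (hl & Mask32);

        ulong lo = (ll & Mask32) | ((t & Mask32) << 32);
        ulong hi = hh + (lh >> 32) + (hl >> 32) + (t >> 32);
        return (hi, lo);
    }

    private static void Main()
    {
        ulong x = 0xDEADBEEF_12345678UL, y = 0xFEEDFACE_87654321UL;
        (ulong hi, ulong lo) = Mul64(x, y);
        ulong refHi = Math.BigMul(x, y, out ulong refLo);
        Console.WriteLine(hi == refHi && lo == refLo); // True
    }
}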