diff --git a/src/Nethermind.Int256.Benchmark/Benchmarks.cs b/src/Nethermind.Int256.Benchmark/Benchmarks.cs
index 616aac1..b22a1da 100644
--- a/src/Nethermind.Int256.Benchmark/Benchmarks.cs
+++ b/src/Nethermind.Int256.Benchmark/Benchmarks.cs
@@ -89,8 +89,8 @@ public class SignedIntTwoParamBenchmarkBase : SignedBenchmarkBase
public (int, Int256) D;
}
- [SimpleJob(RuntimeMoniker.Net70, baseline: true)]
- [NoIntrinsicsJob(RuntimeMoniker.Net70)]
+ [SimpleJob(RuntimeMoniker.Net90, baseline: true)]
+ [NoIntrinsicsJob(RuntimeMoniker.Net90)]
[MemoryDiagnoser]
public class AddUnsigned : UnsignedTwoParamBenchmarkBase
{
@@ -108,8 +108,8 @@ public UInt256 Add_UInt256()
}
}
- [SimpleJob(RuntimeMoniker.Net70, baseline: true)]
- [NoIntrinsicsJob(RuntimeMoniker.Net70)]
+ [SimpleJob(RuntimeMoniker.Net90, baseline: true)]
+ [NoIntrinsicsJob(RuntimeMoniker.Net90)]
[MemoryDiagnoser]
public class AddSigned : SignedTwoParamBenchmarkBase
{
@@ -127,8 +127,8 @@ public Int256 Add_Int256()
}
}
- [SimpleJob(RuntimeMoniker.Net70, baseline: true)]
- [NoIntrinsicsJob(RuntimeMoniker.Net70)]
+ [SimpleJob(RuntimeMoniker.Net90, baseline: true)]
+ [NoIntrinsicsJob(RuntimeMoniker.Net90)]
[MemoryDiagnoser]
public class SubtractUnsigned : UnsignedTwoParamBenchmarkBase
{
@@ -146,8 +146,8 @@ public UInt256 Subtract_UInt256()
}
}
- [SimpleJob(RuntimeMoniker.Net70, baseline: true)]
- [NoIntrinsicsJob(RuntimeMoniker.Net70)]
+ [SimpleJob(RuntimeMoniker.Net90, baseline: true)]
+ [NoIntrinsicsJob(RuntimeMoniker.Net90)]
[MemoryDiagnoser]
public class SubtractSigned : SignedTwoParamBenchmarkBase
{
@@ -165,8 +165,8 @@ public Int256 Subtract_Int256()
}
}
- [SimpleJob(RuntimeMoniker.Net70, baseline: true)]
- [NoIntrinsicsJob(RuntimeMoniker.Net70)]
+ [SimpleJob(RuntimeMoniker.Net90, baseline: true)]
+ [NoIntrinsicsJob(RuntimeMoniker.Net90)]
[MemoryDiagnoser]
public class AddModUnsinged : UnsignedThreeParamBenchmarkBase
{
@@ -184,8 +184,8 @@ public UInt256 AddMod_UInt256()
}
}
- [SimpleJob(RuntimeMoniker.Net70, baseline: true)]
- [NoIntrinsicsJob(RuntimeMoniker.Net70)]
+ [SimpleJob(RuntimeMoniker.Net90, baseline: true)]
+ [NoIntrinsicsJob(RuntimeMoniker.Net90)]
[MemoryDiagnoser]
public class AddModSinged : SignedThreeParamBenchmarkBase
{
@@ -203,8 +203,8 @@ public Int256 AddMod_Int256()
}
}
- [SimpleJob(RuntimeMoniker.Net70, baseline: true)]
- [NoIntrinsicsJob(RuntimeMoniker.Net70)]
+ [SimpleJob(RuntimeMoniker.Net90, baseline: true)]
+ [NoIntrinsicsJob(RuntimeMoniker.Net90)]
[MemoryDiagnoser]
public class SubtractModUnsinged : UnsignedThreeParamBenchmarkBase
{
@@ -222,8 +222,8 @@ public UInt256 SubtractMod_UInt256()
}
}
- [SimpleJob(RuntimeMoniker.Net70, baseline: true)]
- [NoIntrinsicsJob(RuntimeMoniker.Net70)]
+ [SimpleJob(RuntimeMoniker.Net90, baseline: true)]
+ [NoIntrinsicsJob(RuntimeMoniker.Net90)]
[MemoryDiagnoser]
public class SubtractModSigned : SignedThreeParamBenchmarkBase
{
@@ -241,8 +241,8 @@ public Int256 SubtractMod_Int256()
}
}
- [SimpleJob(RuntimeMoniker.Net70, baseline: true)]
- [NoIntrinsicsJob(RuntimeMoniker.Net70)]
+ [SimpleJob(RuntimeMoniker.Net90, baseline: true)]
+ [NoIntrinsicsJob(RuntimeMoniker.Net90)]
[MemoryDiagnoser]
public class MultiplyUnsigned : UnsignedTwoParamBenchmarkBase
{
@@ -260,8 +260,8 @@ public UInt256 Multiply_UInt256()
}
}
- [SimpleJob(RuntimeMoniker.Net70, baseline: true)]
- [NoIntrinsicsJob(RuntimeMoniker.Net70)]
+ [SimpleJob(RuntimeMoniker.Net90, baseline: true)]
+ [NoIntrinsicsJob(RuntimeMoniker.Net90)]
[MemoryDiagnoser]
public class MultiplySigned : SignedTwoParamBenchmarkBase
{
@@ -279,8 +279,8 @@ public Int256 Multiply_Int256()
}
}
- [SimpleJob(RuntimeMoniker.Net70, baseline: true)]
- [NoIntrinsicsJob(RuntimeMoniker.Net70)]
+ [SimpleJob(RuntimeMoniker.Net90, baseline: true)]
+ [NoIntrinsicsJob(RuntimeMoniker.Net90)]
[MemoryDiagnoser]
public class MultiplyModUnsigned : UnsignedThreeParamBenchmarkBase
{
@@ -298,8 +298,8 @@ public UInt256 MultiplyMod_UInt256()
}
}
- [SimpleJob(RuntimeMoniker.Net70, baseline: true)]
- [NoIntrinsicsJob(RuntimeMoniker.Net70)]
+ [SimpleJob(RuntimeMoniker.Net90, baseline: true)]
+ [NoIntrinsicsJob(RuntimeMoniker.Net90)]
[MemoryDiagnoser]
public class MultiplyModSigned : SignedThreeParamBenchmarkBase
{
@@ -317,8 +317,8 @@ public Int256 MultiplyMod_Int256()
}
}
- [SimpleJob(RuntimeMoniker.Net70, baseline: true)]
- [NoIntrinsicsJob(RuntimeMoniker.Net70)]
+ [SimpleJob(RuntimeMoniker.Net90, baseline: true)]
+ [NoIntrinsicsJob(RuntimeMoniker.Net90)]
[MemoryDiagnoser]
public class DivideUnsigned : UnsignedTwoParamBenchmarkBase
{
@@ -336,8 +336,8 @@ public UInt256 Divide_UInt256()
}
}
- [SimpleJob(RuntimeMoniker.Net70, baseline: true)]
- [NoIntrinsicsJob(RuntimeMoniker.Net70)]
+ [SimpleJob(RuntimeMoniker.Net90, baseline: true)]
+ [NoIntrinsicsJob(RuntimeMoniker.Net90)]
[MemoryDiagnoser]
public class DivideSigned : SignedTwoParamBenchmarkBase
{
@@ -355,8 +355,8 @@ public Int256 Divide_Int256()
}
}
- [SimpleJob(RuntimeMoniker.Net70, baseline: true)]
- [NoIntrinsicsJob(RuntimeMoniker.Net70)]
+ [SimpleJob(RuntimeMoniker.Net90, baseline: true)]
+ [NoIntrinsicsJob(RuntimeMoniker.Net90)]
[MemoryDiagnoser]
public class ExpUnsigned : UnsignedIntTwoParamBenchmarkBase
{
@@ -374,8 +374,8 @@ public UInt256 Exp_UInt256()
}
}
- [SimpleJob(RuntimeMoniker.Net70, baseline: true)]
- [NoIntrinsicsJob(RuntimeMoniker.Net70)]
+ [SimpleJob(RuntimeMoniker.Net90, baseline: true)]
+ [NoIntrinsicsJob(RuntimeMoniker.Net90)]
[MemoryDiagnoser]
public class ExpSigned : SignedIntTwoParamBenchmarkBase
{
@@ -393,8 +393,8 @@ public Int256 Exp_Int256()
}
}
- [SimpleJob(RuntimeMoniker.Net70, baseline: true)]
- [NoIntrinsicsJob(RuntimeMoniker.Net70)]
+ [SimpleJob(RuntimeMoniker.Net90, baseline: true)]
+ [NoIntrinsicsJob(RuntimeMoniker.Net90)]
[MemoryDiagnoser]
public class ExpModUnsigned : UnsignedThreeParamBenchmarkBase
{
@@ -412,8 +412,8 @@ public UInt256 ExpMod_UInt256()
}
}
- [SimpleJob(RuntimeMoniker.Net70, baseline: true)]
- [NoIntrinsicsJob(RuntimeMoniker.Net70)]
+ [SimpleJob(RuntimeMoniker.Net90, baseline: true)]
+ [NoIntrinsicsJob(RuntimeMoniker.Net90)]
[MemoryDiagnoser]
public class ExpModSigned : SignedBenchmarkBase
{
@@ -440,8 +440,8 @@ public Int256 ExpMod_Int256()
}
}
- [SimpleJob(RuntimeMoniker.Net70, baseline: true)]
- [NoIntrinsicsJob(RuntimeMoniker.Net70)]
+ [SimpleJob(RuntimeMoniker.Net90, baseline: true)]
+ [NoIntrinsicsJob(RuntimeMoniker.Net90)]
[MemoryDiagnoser]
public class LeftShiftUnsigned : UnsignedIntTwoParamBenchmarkBase
{
@@ -459,8 +459,8 @@ public UInt256 LeftShift_UInt256()
}
}
- [SimpleJob(RuntimeMoniker.Net70, baseline: true)]
- [NoIntrinsicsJob(RuntimeMoniker.Net70)]
+ [SimpleJob(RuntimeMoniker.Net90, baseline: true)]
+ [NoIntrinsicsJob(RuntimeMoniker.Net90)]
[MemoryDiagnoser]
public class LeftShiftSigned : SignedIntTwoParamBenchmarkBase
{
@@ -478,8 +478,8 @@ public Int256 LeftShift_Int256()
}
}
- [SimpleJob(RuntimeMoniker.Net70, baseline: true)]
- [NoIntrinsicsJob(RuntimeMoniker.Net70)]
+ [SimpleJob(RuntimeMoniker.Net90, baseline: true)]
+ [NoIntrinsicsJob(RuntimeMoniker.Net90)]
[MemoryDiagnoser]
public class RightShiftUnsigned : UnsignedIntTwoParamBenchmarkBase
{
@@ -497,8 +497,8 @@ public UInt256 RightShift_UInt256()
}
}
- [SimpleJob(RuntimeMoniker.Net70, baseline: true)]
- [NoIntrinsicsJob(RuntimeMoniker.Net70)]
+ [SimpleJob(RuntimeMoniker.Net90, baseline: true)]
+ [NoIntrinsicsJob(RuntimeMoniker.Net90)]
[MemoryDiagnoser]
public class RightShiftSigned : SignedIntTwoParamBenchmarkBase
{
@@ -516,8 +516,8 @@ public Int256 RightShift_Int256()
}
}
- [SimpleJob(RuntimeMoniker.Net70, baseline: true)]
- [NoIntrinsicsJob(RuntimeMoniker.Net70)]
+ [SimpleJob(RuntimeMoniker.Net90, baseline: true)]
+ [NoIntrinsicsJob(RuntimeMoniker.Net90)]
[MemoryDiagnoser]
public class IsZeroOne
{
diff --git a/src/Nethermind.Int256.Benchmark/NoIntrinsicsJobAttribute.cs b/src/Nethermind.Int256.Benchmark/NoIntrinsicsJobAttribute.cs
index d90679e..f3307cb 100644
--- a/src/Nethermind.Int256.Benchmark/NoIntrinsicsJobAttribute.cs
+++ b/src/Nethermind.Int256.Benchmark/NoIntrinsicsJobAttribute.cs
@@ -1,4 +1,4 @@
-using System;
+using System;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Jobs;
@@ -116,6 +116,8 @@ internal static Runtime GetRuntime(this RuntimeMoniker runtimeMoniker)
return CoreRuntime.Core70;
case RuntimeMoniker.Net80:
return CoreRuntime.Core80;
+ case RuntimeMoniker.Net90:
+ return CoreRuntime.Core90;
case RuntimeMoniker.Mono:
return MonoRuntime.Default;
case RuntimeMoniker.NativeAot60:
diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs
index 75a4c73..0905a83 100644
--- a/src/Nethermind.Int256/UInt256.cs
+++ b/src/Nethermind.Int256/UInt256.cs
@@ -996,10 +996,17 @@ private static void SubtractWithBorrow(ulong a, ulong b, ref ulong borrow, out u
res = a - b - borrow;
borrow = (((~a) & b) | (~(a ^ b)) & res) >> 63;
}
-
- // Multiply sets res to the product x*y
+ /// <summary>
+ /// Multiplies two 256-bit unsigned integers (<paramref name="x"/> and <paramref name="y"/>) and
+ /// writes the low 256 bits of the product to <paramref name="res"/>.
+ /// </summary>
+ /// <param name="x">The first 256-bit unsigned integer.</param>
+ /// <param name="y">The second 256-bit unsigned integer.</param>
+ /// <param name="res">When this method returns, contains the low 256 bits of the product of x and y.</param>
+ [SkipLocalsInit]
public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res)
{
+ // If both inputs fit in 64 bits, use a simple multiplication routine.
if ((x.u1 | x.u2 | x.u3 | y.u1 | y.u2 | y.u3) == 0)
{
// Fast multiply for numbers less than 2^64 (18,446,744,073,709,551,615)
@@ -1010,25 +1017,169 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res)
Unsafe.AsRef(in res.u1) = high;
return;
}
+ // Scalar fallback when the AVX-512 intrinsics used below (including their 128-bit VL forms) are unavailable.
+ if (!Avx512F.VL.IsSupported || !Avx512DQ.VL.IsSupported)
+ {
+ ref ulong rx = ref Unsafe.As<UInt256, ulong>(ref Unsafe.AsRef(in x));
+ ref ulong ry = ref Unsafe.As<UInt256, ulong>(ref Unsafe.AsRef(in y));
- ref ulong rx = ref Unsafe.As<UInt256, ulong>(ref Unsafe.AsRef(in x));
- ref ulong ry = ref Unsafe.As<UInt256, ulong>(ref Unsafe.AsRef(in y));
+ (ulong carry, ulong r0) = Multiply64(rx, ry);
+ UmulHop(carry, Unsafe.Add(ref rx, 1), ry, out carry, out ulong res1);
+ UmulHop(carry, Unsafe.Add(ref rx, 2), ry, out carry, out ulong res2);
+ ulong res3 = Unsafe.Add(ref rx, 3) * ry + carry;
- (ulong carry, ulong r0) = Multiply64(rx, ry);
- UmulHop(carry, Unsafe.Add(ref rx, 1), ry, out carry, out ulong res1);
- UmulHop(carry, Unsafe.Add(ref rx, 2), ry, out carry, out ulong res2);
- ulong res3 = Unsafe.Add(ref rx, 3) * ry + carry;
+ UmulHop(res1, rx, Unsafe.Add(ref ry, 1), out carry, out ulong r1);
+ UmulStep(res2, Unsafe.Add(ref rx, 1), Unsafe.Add(ref ry, 1), carry, out carry, out res2);
+ res3 = res3 + Unsafe.Add(ref rx, 2) * Unsafe.Add(ref ry, 1) + carry;
- UmulHop(res1, rx, Unsafe.Add(ref ry, 1), out carry, out ulong r1);
- UmulStep(res2, Unsafe.Add(ref rx, 1), Unsafe.Add(ref ry, 1), carry, out carry, out res2);
- res3 = res3 + Unsafe.Add(ref rx, 2) * Unsafe.Add(ref ry, 1) + carry;
+ UmulHop(res2, rx, Unsafe.Add(ref ry, 2), out carry, out ulong r2);
+ res3 = res3 + Unsafe.Add(ref rx, 1) * Unsafe.Add(ref ry, 2) + carry;
- UmulHop(res2, rx, Unsafe.Add(ref ry, 2), out carry, out ulong r2);
- res3 = res3 + Unsafe.Add(ref rx, 1) * Unsafe.Add(ref ry, 2) + carry;
+ ulong r3 = res3 + rx * Unsafe.Add(ref ry, 3);
- ulong r3 = res3 + rx * Unsafe.Add(ref ry, 3);
+ res = new UInt256(r0, r1, r2, r3);
+ return;
+ }
- res = new UInt256(r0, r1, r2, r3);
+ // Step 1: load the inputs and prepare the mask constant.
+ Vector256<ulong> vecX = Unsafe.As<UInt256, Vector256<ulong>>(ref Unsafe.AsRef(in x));
+ Vector256<ulong> vecY = Unsafe.As<UInt256, Vector256<ulong>>(ref Unsafe.AsRef(in y));
+ Vector512<ulong> mask32 = Vector512.Create(0xFFFFFFFFUL);
+
+ // Step 2: permute x and y. These operations are independent.
+ Vector256<ulong> xPerm1 = Avx2.Permute4x64(vecX, 16); // [ x0, x0, x1, x0 ]
+ Vector256<ulong> yPerm1 = Avx2.Permute4x64(vecY, 132); // [ y0, y1, y0, y2 ]
+ Vector256<ulong> xPerm2 = Avx2.Permute4x64(vecX, 73); // [ x1, x2, x0, x1 ]
+ Vector256<ulong> yPerm2 = Avx2.Permute4x64(vecY, 177); // [ y1, y0, y3, y2 ]
+
+ Vector512<ulong> xRearranged = Vector512.Create(xPerm1, xPerm2);
+ Vector512<ulong> yRearranged = Vector512.Create(yPerm1, yPerm2);
+
+ // Step 3: split each 64-bit limb into its lower and upper 32-bit parts.
+ Vector512<ulong> xLowerParts = Avx512F.And(xRearranged, mask32);
+ Vector512<ulong> yLowerParts = Avx512F.And(yRearranged, mask32);
+ Vector512<ulong> xUpperParts = Avx512F.ShiftRightLogical(xRearranged, 32);
+ Vector512<ulong> yUpperParts = Avx512F.ShiftRightLogical(yRearranged, 32);
+
+ // Step 4: launch four 32x32-bit multiplications in parallel.
+ Vector512<ulong> prodLL = Avx512DQ.MultiplyLow(xLowerParts, yLowerParts); // lower x lower
+ Vector512<ulong> prodLH = Avx512DQ.MultiplyLow(xLowerParts, yUpperParts); // lower x upper
+ Vector512<ulong> prodHL = Avx512DQ.MultiplyLow(xUpperParts, yLowerParts); // upper x lower
+ Vector512<ulong> prodHH = Avx512DQ.MultiplyLow(xUpperParts, yUpperParts); // upper x upper
+
+ // Step 5: compute the intermediate term that straddles bit 32 of each 128-bit product.
+ Vector512<ulong> prodLL_hi = Avx512F.ShiftRightLogical(prodLL, 32);
+ Vector512<ulong> prodLH_lo = Avx512F.And(prodLH, mask32);
+ Vector512<ulong> prodHL_lo = Avx512F.And(prodHL, mask32);
+ Vector512<ulong> termT = Avx512F.Add(Avx512F.Add(prodLL_hi, prodLH_lo), prodHL_lo);
+
+ // Step 6: assemble the lower and higher 64-bit halves of each of the eight 128-bit products.
+ Vector512<ulong> lowerPartial =
+ Avx512F.Or(
+ Avx512F.And(prodLL, mask32),
+ Avx512F.ShiftLeftLogical(Avx512F.And(termT, mask32), 32));
+ Vector512<ulong> higherPartial =
+ Avx512F.Add(
+ Avx512F.Add(
+ Avx512F.Add(prodHH, Avx512F.ShiftRightLogical(prodLH, 32)),
+ Avx512F.ShiftRightLogical(prodHL, 32)),
+ Avx512F.ShiftRightLogical(termT, 32));
+
+ // Step 7: interleave low/high halves so each 128-bit lane holds one full product.
+ Vector512<ulong> productLow = Avx512F.UnpackLow(lowerPartial, higherPartial);
+ Vector512<ulong> productHi = Avx512F.UnpackHigh(lowerPartial, higherPartial);
+
+ // Step 8: extract the 128-bit products (x0*y0, x0*y1, x1*y0, x0*y2, x1*y1, x2*y0).
+ Vector128<ulong> product0 = Avx512F.ExtractVector128(productLow, 0);
+ Vector128<ulong> product1 = Avx512F.ExtractVector128(productHi, 0);
+ Vector128<ulong> product2 = Avx512F.ExtractVector128(productLow, 1);
+ Vector128<ulong> product3 = Avx512F.ExtractVector128(productHi, 1);
+ Vector128<ulong> product4 = Avx512F.ExtractVector128(productLow, 2);
+ Vector128<ulong> product5 = Avx512F.ExtractVector128(productHi, 2);
+
+ // Step 9: load the remaining limbs needed for the top-limb cross-terms (x2*y1, x3*y0).
+ Vector128<ulong> xHigh = Vector128.Create(x.u2, x.u3);
+ Vector128<ulong> yLow = Vector128.Create(y.u1, y.u0);
+
+ // Step 10: perform the group 1 cross-term addition.
+ Vector128<ulong> crossSum = Add128(product1, product2);
+ Vector128<ulong> crossAddMask = Sse2.UnpackLow(Vector128<ulong>.Zero, crossSum);
+ Vector128<ulong> updatedProduct0 = Sse2.Add(product0, crossAddMask);
+
+ // Compute the carry from adding crossSum's low 64 bits.
+ Vector128<ulong> product0HighBefore = Sse2.UnpackHigh(product0, product0);
+ Vector128<ulong> product0HighAfter = Sse2.UnpackHigh(updatedProduct0, updatedProduct0);
+ Vector128<ulong> carryFlag =
+ Sse2.ShiftRightLogical(
+ Avx512F.VL.CompareLessThan(product0HighAfter, product0HighBefore),
+ 63);
+ Vector128<ulong> crossSumHigh = Sse2.UnpackHigh(crossSum, crossSum);
+ Vector128<ulong> limb2 = Sse2.Add(crossSumHigh, carryFlag);
+ // crossSumHigh + carryFlag can itself wrap (crossSumHigh == ulong.MaxValue with carryFlag == 1);
+ // that carry belongs to limb 3 and must not be dropped.
+ Vector128<ulong> limb2Carry =
+ Sse2.ShiftRightLogical(
+ Avx512F.VL.CompareLessThan(limb2, crossSumHigh),
+ 63);
+ // Carry out of the 128-bit product1 + product2 addition (Add128 discards it), plus limb2Carry.
+ Vector128<ulong> limb3 =
+ Sse2.Add(
+ Sse2.ShiftRightLogical(
+ Avx512F.VL.CompareGreaterThan(Sse2.UnpackHigh(product1, product1), crossSumHigh),
+ 63),
+ limb2Carry);
+ Vector128<ulong> upperIntermediate = Sse2.UnpackLow(limb2, limb3);
+
+ // Step 11: combine group 2 partial results.
+ Vector128<ulong> group2Sum = Add128(product3, product4);
+ Vector128<ulong> totalGroup2 = Add128(group2Sum, product5);
+ Vector128<ulong> newHalf = Add128(upperIntermediate, totalGroup2);
+
+ // Step 12: process group 3 cross-terms (only their low 64 bits reach limb 3).
+ Vector128<ulong> finalProdLow = Avx512DQ.VL.MultiplyLow(xHigh, yLow);
+ Vector128<ulong> extraLow = Avx512F.ExtractVector128(lowerPartial, 3);
+ finalProdLow = Sse2.Add(finalProdLow, extraLow);
+ Vector128<ulong> broadcastLow = Sse2.UnpackLow(finalProdLow, finalProdLow);
+ Vector128<ulong> horizontalSum = Sse2.Add(finalProdLow, broadcastLow);
+ Vector128<ulong> highCarry = Sse2.UnpackHigh(Vector128<ulong>.Zero, horizontalSum);
+ newHalf = Sse2.Add(newHalf, highCarry);
+
+ // Combine the results into the final 256-bit value.
+ Vector256<ulong> finalResult = Vector256.Create(updatedProduct0, newHalf);
+ Unsafe.SkipInit(out res);
+ Unsafe.As<UInt256, Vector256<ulong>>(ref res) = finalResult;
+
+ /// <summary>
+ /// Adds two 128-bit unsigned integers while propagating an overflow (carry) from the lower 64-bit lane to the higher lane.
+ /// Each 128-bit integer is represented as a <see cref="Vector128{T}"/>, with element 0 holding the lower 64 bits
+ /// and element 1 holding the higher 64 bits.
+ /// </summary>
+ /// <param name="left">The first 128-bit unsigned integer operand.</param>
+ /// <param name="right">The second 128-bit unsigned integer operand.</param>
+ /// <returns>
+ /// A <see cref="Vector128{T}"/> representing the sum of the two operands, with any carry from the lower lane added
+ /// into the higher lane. A carry out of the higher lane is discarded.
+ /// </returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ static Vector128<ulong> Add128(Vector128<ulong> left, Vector128<ulong> right)
+ {
+ // Perform a lane-wise addition of the two operands.
+ Vector128<ulong> sum = Sse2.Add(left, right);
+
+ // For unsigned addition, an overflow in a lane occurs if the result is less than one of the operands.
+ // Comparing 'sum' with 'left' produces a mask where each 64-bit lane is all ones if an overflow occurred, or zero otherwise.
+ Vector128<ulong> overflowMask = Avx512F.VL.CompareLessThan(sum, left);
+
+ // Normalize the overflow mask: shift each 64-bit lane right by 63 bits.
+ // This converts a full mask (0xFFFFFFFFFFFFFFFF) to 1, leaving lanes with no overflow as 0.
+ overflowMask = Sse2.ShiftRightLogical(overflowMask, 63);
+ // Move the low lane's carry into the high lane: UnpackLow(Zero, overflowMask) yields
+ // [ 0, overflowMask[0] ], so lane 0 is cleared and the high lane's own overflow bit is dropped.
+ Vector128<ulong> promotedCarry = Sse2.UnpackLow(Vector128<ulong>.Zero, overflowMask);
+
+ // Add the propagated carry to the sum.
+ return Sse2.Add(sum, promotedCarry);
+ }
}
public void Multiply(in UInt256 a, out UInt256 res) => Multiply(this, a, out res);