From 8ea6712019184dabec4d7108216c7ba1e88c6c61 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 7 Feb 2025 01:46:06 +0000 Subject: [PATCH 01/38] avx512 multiply --- src/Nethermind.Int256/UInt256.cs | 143 ++++++++++++++++++++++++++++--- 1 file changed, 130 insertions(+), 13 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 75a4c73..454e16c 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1010,25 +1010,142 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) Unsafe.AsRef(in res.u1) = high; return; } + + if (!Avx512F.IsSupported || !Avx512DQ.IsSupported) + { + ref ulong rx = ref Unsafe.As(ref Unsafe.AsRef(in x)); + ref ulong ry = ref Unsafe.As(ref Unsafe.AsRef(in y)); - ref ulong rx = ref Unsafe.As(ref Unsafe.AsRef(in x)); - ref ulong ry = ref Unsafe.As(ref Unsafe.AsRef(in y)); + (ulong carry, ulong r0) = Multiply64(rx, ry); + UmulHop(carry, Unsafe.Add(ref rx, 1), ry, out carry, out ulong res1); + UmulHop(carry, Unsafe.Add(ref rx, 2), ry, out carry, out ulong res2); + ulong res3 = Unsafe.Add(ref rx, 3) * ry + carry; - (ulong carry, ulong r0) = Multiply64(rx, ry); - UmulHop(carry, Unsafe.Add(ref rx, 1), ry, out carry, out ulong res1); - UmulHop(carry, Unsafe.Add(ref rx, 2), ry, out carry, out ulong res2); - ulong res3 = Unsafe.Add(ref rx, 3) * ry + carry; + UmulHop(res1, rx, Unsafe.Add(ref ry, 1), out carry, out ulong r1); + UmulStep(res2, Unsafe.Add(ref rx, 1), Unsafe.Add(ref ry, 1), carry, out carry, out res2); + res3 = res3 + Unsafe.Add(ref rx, 2) * Unsafe.Add(ref ry, 1) + carry; - UmulHop(res1, rx, Unsafe.Add(ref ry, 1), out carry, out ulong r1); - UmulStep(res2, Unsafe.Add(ref rx, 1), Unsafe.Add(ref ry, 1), carry, out carry, out res2); - res3 = res3 + Unsafe.Add(ref rx, 2) * Unsafe.Add(ref ry, 1) + carry; + UmulHop(res2, rx, Unsafe.Add(ref ry, 2), out carry, out ulong r2); + res3 = res3 + Unsafe.Add(ref rx, 1) * Unsafe.Add(ref ry, 2) + carry; - UmulHop(res2, rx, Unsafe.Add(ref ry, 2), out carry, out ulong r2); - res3 = res3 + Unsafe.Add(ref rx, 1) * Unsafe.Add(ref ry, 2) + carry; + ulong r3 = res3 + rx * Unsafe.Add(ref ry, 3); - ulong r3 = res3 + rx * Unsafe.Add(ref ry, 3); + res = new UInt256(r0, r1, r2, r3); + } + else + { - res = new UInt256(r0, r1, r2, r3); + // Unpack the four 64-bit limbs (little-endian: u0 is least-significant) + ulong a0 = x.u0, a1 = x.u1, a2 = x.u2, a3 = x.u3; + ulong b0 = y.u0, b1 = y.u1, b2 = y.u2, b3 = y.u3; + + // --- Compute the 10 64x64–bit products using our vectorized method --- + + // Group 1: 8 products + Vector512 vecA1 = Vector512.Create(a0, a0, a1, a0, a1, a2, a0, a1); + Vector512 vecB1 = Vector512.Create(b0, b1, b0, b2, b1, b0, b3, b2); + Mul64Vector(vecA1, vecB1, out Vector512 lo1, out Vector512 hi1); + + // Extract products from group1 + ulong P00_lo = lo1.GetElement(0), P00_hi = hi1.GetElement(0); + ulong P01_lo = lo1.GetElement(1), P01_hi = hi1.GetElement(1); + ulong P10_lo = lo1.GetElement(2), P10_hi = hi1.GetElement(2); + ulong P02_lo = lo1.GetElement(3), P02_hi = hi1.GetElement(3); + ulong P11_lo = lo1.GetElement(4), P11_hi = hi1.GetElement(4); + ulong P20_lo = lo1.GetElement(5), P20_hi = hi1.GetElement(5); + ulong P03_lo = lo1.GetElement(6), P03_hi = hi1.GetElement(6); + ulong P12_lo = lo1.GetElement(7), P12_hi = hi1.GetElement(7); + + // Group 2: 2 products + Vector512 vecA2 = Vector512.Create(a2, a3, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL); + Vector512 vecB2 = Vector512.Create(b1, b0, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL); + Mul64Vector(vecA2, 
vecB2, out Vector512 lo2, out Vector512 hi2); + ulong P21_lo = lo2.GetElement(0); // P21_hi is not needed (contributes only above 256 bits) + ulong P30_lo = lo2.GetElement(1); // Likewise for P30_hi + + // --- Package each 128-bit partial product into a UInt256 (with proper shifting) --- + // (Recall: a 128–bit product is given as (lo, hi), where lo is the lower 64 bits and hi the upper 64 bits.) + + // P00 (no shift) + UInt256 part0 = new UInt256(P00_lo, P00_hi, 0, 0); + + // P01 and P10 (each shifted left by 64 bits) + UInt256 part64a = new UInt256(0, P01_lo, P01_hi, 0); + UInt256 part64b = new UInt256(0, P10_lo, P10_hi, 0); + UInt256 sum64; + AddImpl(part64a, part64b, out sum64); + + // P02, P11 and P20 (each shifted left by 128 bits) + UInt256 part128a = new UInt256(0, 0, P02_lo, P02_hi); + UInt256 part128b = new UInt256(0, 0, P11_lo, P11_hi); + UInt256 part128c = new UInt256(0, 0, P20_lo, P20_hi); + UInt256 sum128, temp; + AddImpl(part128a, part128b, out temp); + AddImpl(temp, part128c, out sum128); + + // P03, P12, P21 and P30 (shifted left by 192 bits – note only the low 64 bits matter) + UInt256 part192a = new UInt256(0, 0, 0, P03_lo); + UInt256 part192b = new UInt256(0, 0, 0, P12_lo); + UInt256 part192c = new UInt256(0, 0, 0, P21_lo); + UInt256 part192d = new UInt256(0, 0, 0, P30_lo); + UInt256 sum192; + AddImpl(part192a, part192b, out temp); + AddImpl(temp, part192c, out temp); + AddImpl(temp, part192d, out sum192); + + // --- Sum all the partial products using AddImpl --- + UInt256 intermediate; + AddImpl(part0, sum64, out intermediate); + AddImpl(intermediate, sum128, out intermediate); + AddImpl(intermediate, sum192, out res); + } + } + + + // Vectorized 64x64 multiply: given vectors 'a' and 'b' (each 8 lanes), + // computes per lane: + // product = a * b = (hi, lo) + // using the splitting method since there is no MultiplyHigh intrinsic. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Mul64Vector(Vector512 a, Vector512 b, + out Vector512 lo, out Vector512 hi) + { + // Mask for the lower 32 bits. + Vector512 mask32 = Vector512.Create(0xFFFFFFFFUL); + + // Split each 64-bit operand into 32-bit halves: + // a0 = lower 32 bits, a1 = upper 32 bits + Vector512 a0 = Avx512F.And(a, mask32); + Vector512 a1 = Avx512F.ShiftRightLogical(a, 32); + Vector512 b0 = Avx512F.And(b, mask32); + Vector512 b1 = Avx512F.ShiftRightLogical(b, 32); + + // Compute the four 32x32 partial products. + // Each multiplication here is on 32-bit values, so the result fits in 64 bits. + Vector512 u0 = Avx512DQ.MultiplyLow(a0, b0); // a0 * b0 + Vector512 u1 = Avx512DQ.MultiplyLow(a0, b1); // a0 * b1 + Vector512 u2 = Avx512DQ.MultiplyLow(a1, b0); // a1 * b0 + Vector512 u3 = Avx512DQ.MultiplyLow(a1, b1); // a1 * b1 + + // Now, compute t = (u0 >> 32) + (u1 & mask32) + (u2 & mask32) + Vector512 u0_hi = Avx512F.ShiftRightLogical(u0, 32); + Vector512 u1_lo = Avx512F.And(u1, mask32); + Vector512 u2_lo = Avx512F.And(u2, mask32); + Vector512 t = Avx512F.Add(Avx512F.Add(u0_hi, u1_lo), u2_lo); + + // The extra carry: c = t >> 32. + Vector512 c = Avx512F.ShiftRightLogical(t, 32); + + // Now, assemble the lower 64 bits: + // low part of u0 is u0 & mask32; low 32 bits of t are (t & mask32) shifted left 32. + Vector512 u0_lo = Avx512F.And(u0, mask32); + Vector512 t_lo = Avx512F.And(t, mask32); + lo = Avx512F.Or(u0_lo, Avx512F.ShiftLeftLogical(t_lo, 32)); + + // The high 64 bits are: u3 + (u1 >> 32) + (u2 >> 32) + c. 
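// Why the formulas above and below are correct (a short derivation sketch, using the standard
// 32-bit schoolbook split; M denotes the low-32-bit mask):
//   a = a1*2^32 + a0,  b = b1*2^32 + b0
//   a*b = u3*2^64 + (u1 + u2)*2^32 + u0   where u0 = a0*b0, u1 = a0*b1, u2 = a1*b0, u3 = a1*b1
// Folding the low halves of u1 and u2 together with the carry out of u0 gives
//   t = (u0 >> 32) + (u1 & M) + (u2 & M)  (at most about 3*2^32, so it fits in 64 bits)
// The low 64 bits of the product are (u0 & M) | (t << 32), and c = t >> 32 is the carry into the
// high half, so hi = u3 + (u1 >> 32) + (u2 >> 32) + c, which is exactly what follows.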
+ Vector512 u1_hi = Avx512F.ShiftRightLogical(u1, 32); + Vector512 u2_hi = Avx512F.ShiftRightLogical(u2, 32); + hi = Avx512F.Add(Avx512F.Add(Avx512F.Add(u3, u1_hi), u2_hi), c); } public void Multiply(in UInt256 a, out UInt256 res) => Multiply(this, a, out res); From f69cad2098d4ee99631c2d86b16a01a37cd01bcf Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 7 Feb 2025 02:36:22 +0000 Subject: [PATCH 02/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 55 ++++++++++++++------------------ 1 file changed, 24 insertions(+), 31 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 454e16c..d59efed 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1034,19 +1034,19 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) } else { - + // Vectorized branch using AVX-512. // Unpack the four 64-bit limbs (little-endian: u0 is least-significant) ulong a0 = x.u0, a1 = x.u1, a2 = x.u2, a3 = x.u3; ulong b0 = y.u0, b1 = y.u1, b2 = y.u2, b3 = y.u3; - // --- Compute the 10 64x64–bit products using our vectorized method --- + // --- Compute the 10 64x64–bit partial products using our vectorized method --- // Group 1: 8 products Vector512 vecA1 = Vector512.Create(a0, a0, a1, a0, a1, a2, a0, a1); Vector512 vecB1 = Vector512.Create(b0, b1, b0, b2, b1, b0, b3, b2); Mul64Vector(vecA1, vecB1, out Vector512 lo1, out Vector512 hi1); - // Extract products from group1 + // Extract products from group 1. ulong P00_lo = lo1.GetElement(0), P00_hi = hi1.GetElement(0); ulong P01_lo = lo1.GetElement(1), P01_hi = hi1.GetElement(1); ulong P10_lo = lo1.GetElement(2), P10_hi = hi1.GetElement(2); @@ -1056,51 +1056,44 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) ulong P03_lo = lo1.GetElement(6), P03_hi = hi1.GetElement(6); ulong P12_lo = lo1.GetElement(7), P12_hi = hi1.GetElement(7); - // Group 2: 2 products - Vector512 vecA2 = Vector512.Create(a2, a3, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL); - Vector512 vecB2 = Vector512.Create(b1, b0, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL); - Mul64Vector(vecA2, vecB2, out Vector512 lo2, out Vector512 hi2); - ulong P21_lo = lo2.GetElement(0); // P21_hi is not needed (contributes only above 256 bits) - ulong P30_lo = lo2.GetElement(1); // Likewise for P30_hi - // --- Package each 128-bit partial product into a UInt256 (with proper shifting) --- - // (Recall: a 128–bit product is given as (lo, hi), where lo is the lower 64 bits and hi the upper 64 bits.) - // P00 (no shift) + // Group with no shift. UInt256 part0 = new UInt256(P00_lo, P00_hi, 0, 0); - // P01 and P10 (each shifted left by 64 bits) + // Group shifted left by 64 bits. UInt256 part64a = new UInt256(0, P01_lo, P01_hi, 0); UInt256 part64b = new UInt256(0, P10_lo, P10_hi, 0); UInt256 sum64; AddImpl(part64a, part64b, out sum64); - // P02, P11 and P20 (each shifted left by 128 bits) + // Group shifted left by 128 bits. 
UInt256 part128a = new UInt256(0, 0, P02_lo, P02_hi); UInt256 part128b = new UInt256(0, 0, P11_lo, P11_hi); UInt256 part128c = new UInt256(0, 0, P20_lo, P20_hi); - UInt256 sum128, temp; - AddImpl(part128a, part128b, out temp); - AddImpl(temp, part128c, out sum128); - - // P03, P12, P21 and P30 (shifted left by 192 bits – note only the low 64 bits matter) - UInt256 part192a = new UInt256(0, 0, 0, P03_lo); - UInt256 part192b = new UInt256(0, 0, 0, P12_lo); - UInt256 part192c = new UInt256(0, 0, 0, P21_lo); - UInt256 part192d = new UInt256(0, 0, 0, P30_lo); - UInt256 sum192; - AddImpl(part192a, part192b, out temp); - AddImpl(temp, part192c, out temp); - AddImpl(temp, part192d, out sum192); - - // --- Sum all the partial products using AddImpl --- + UInt256 sum128, temp256; + AddImpl(part128a, part128b, out temp256); + AddImpl(temp256, part128c, out sum128); + + + // Group 2: 2 products + Vector512 vecA2 = Vector512.Create(a2, a3, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL); + Vector512 vecB2 = Vector512.Create(b1, b0, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL); + Mul64Vector(vecA2, vecB2, out Vector512 lo2, out Vector512 hi2); + ulong P21_lo = lo2.GetElement(0); // Only lower 64 bits matter. + ulong P30_lo = lo2.GetElement(1); + + // Group shifted left by 192 bits – only the lower 64 bits contribute. + // Any carry is discarded, so just use normal addition. + UInt256 part192256 = new UInt256(0, 0, 0, (P03_lo + P12_lo + P21_lo + P30_lo)); + + // --- Sum all the partial products using the proven UInt256 adder (AddImpl) --- UInt256 intermediate; AddImpl(part0, sum64, out intermediate); AddImpl(intermediate, sum128, out intermediate); - AddImpl(intermediate, sum192, out res); + AddImpl(intermediate, part192256, out res); } } - // Vectorized 64x64 multiply: given vectors 'a' and 'b' (each 8 lanes), // computes per lane: From e8c03a6fa78a647921c589ed1cbb423a83c26841 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 7 Feb 2025 02:54:58 +0000 Subject: [PATCH 03/38] optimize --- src/Nethermind.Int256/UInt256.cs | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index d59efed..d8e5656 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1034,16 +1034,17 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) } else { + Vector256 vecA = Unsafe.As>(ref Unsafe.AsRef(in x)); + Vector256 vecB = Unsafe.As>(ref Unsafe.AsRef(in y)); + // Vectorized branch using AVX-512. // Unpack the four 64-bit limbs (little-endian: u0 is least-significant) - ulong a0 = x.u0, a1 = x.u1, a2 = x.u2, a3 = x.u3; - ulong b0 = y.u0, b1 = y.u1, b2 = y.u2, b3 = y.u3; // --- Compute the 10 64x64–bit partial products using our vectorized method --- // Group 1: 8 products - Vector512 vecA1 = Vector512.Create(a0, a0, a1, a0, a1, a2, a0, a1); - Vector512 vecB1 = Vector512.Create(b0, b1, b0, b2, b1, b0, b3, b2); + Vector512 vecA1 = Vector512.Create(Avx2.Permute4x64(vecA, 16), Avx2.Permute4x64(vecA, 73)); + Vector512 vecB1 = Vector512.Create(Avx2.Permute4x64(vecB, 132), Avx2.Permute4x64(vecB, 177)); Mul64Vector(vecA1, vecB1, out Vector512 lo1, out Vector512 hi1); // Extract products from group 1. 
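// Decoding sketch for the permute immediates (assuming the usual Permute4x64 semantics,
// result[i] = source[(imm >> 2*i) & 3]); this reproduces the limb order of the removed
// Vector512.Create calls above:
//   Permute4x64(vecA, 16)  -> (a0, a0, a1, a0)    16  = 0b00_01_00_00
//   Permute4x64(vecA, 73)  -> (a1, a2, a0, a1)    73  = 0b01_00_10_01
//   Permute4x64(vecB, 132) -> (b0, b1, b0, b2)    132 = 0b10_00_01_00
//   Permute4x64(vecB, 177) -> (b1, b0, b3, b2)    177 = 0b10_11_00_01
// so vecA1 = (a0,a0,a1,a0,a1,a2,a0,a1) and vecB1 = (b0,b1,b0,b2,b1,b0,b3,b2) as before, e.g.
//   Debug.Assert(Avx2.Permute4x64(vecA, 16) == Vector256.Create(x.u0, x.u0, x.u1, x.u0));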
@@ -1077,6 +1078,8 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) // Group 2: 2 products + ulong a2 = x.u2, a3 = x.u3; + ulong b0 = y.u0, b1 = y.u1; Vector512 vecA2 = Vector512.Create(a2, a3, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL); Vector512 vecB2 = Vector512.Create(b1, b0, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL); Mul64Vector(vecA2, vecB2, out Vector512 lo2, out Vector512 hi2); From 30aea1a50657f39d039a7211b07d4a02e0ad3e4a Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 7 Feb 2025 02:59:03 +0000 Subject: [PATCH 04/38] optimize --- src/Nethermind.Int256/UInt256.cs | 41 +++++++++++++++++--------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index d8e5656..76b358b 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1059,22 +1059,28 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) // --- Package each 128-bit partial product into a UInt256 (with proper shifting) --- - // Group with no shift. - UInt256 part0 = new UInt256(P00_lo, P00_hi, 0, 0); - - // Group shifted left by 64 bits. - UInt256 part64a = new UInt256(0, P01_lo, P01_hi, 0); - UInt256 part64b = new UInt256(0, P10_lo, P10_hi, 0); - UInt256 sum64; - AddImpl(part64a, part64b, out sum64); - - // Group shifted left by 128 bits. - UInt256 part128a = new UInt256(0, 0, P02_lo, P02_hi); - UInt256 part128b = new UInt256(0, 0, P11_lo, P11_hi); - UInt256 part128c = new UInt256(0, 0, P20_lo, P20_hi); - UInt256 sum128, temp256; - AddImpl(part128a, part128b, out temp256); - AddImpl(temp256, part128c, out sum128); + UInt256 intermediate; + { + // Group with no shift. + UInt256 part0 = new UInt256(P00_lo, P00_hi, 0, 0); + + // Group shifted left by 64 bits. + UInt256 part64a = new UInt256(0, P01_lo, P01_hi, 0); + UInt256 part64b = new UInt256(0, P10_lo, P10_hi, 0); + UInt256 sum64; + AddImpl(part64a, part64b, out sum64); + AddImpl(part0, sum64, out intermediate); + } + { + // Group shifted left by 128 bits. 
+ UInt256 part128a = new UInt256(0, 0, P02_lo, P02_hi); + UInt256 part128b = new UInt256(0, 0, P11_lo, P11_hi); + UInt256 part128c = new UInt256(0, 0, P20_lo, P20_hi); + UInt256 sum128, temp256; + AddImpl(part128a, part128b, out temp256); + AddImpl(temp256, part128c, out sum128); + AddImpl(intermediate, sum128, out intermediate); + } // Group 2: 2 products @@ -1091,9 +1097,6 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) UInt256 part192256 = new UInt256(0, 0, 0, (P03_lo + P12_lo + P21_lo + P30_lo)); // --- Sum all the partial products using the proven UInt256 adder (AddImpl) --- - UInt256 intermediate; - AddImpl(part0, sum64, out intermediate); - AddImpl(intermediate, sum128, out intermediate); AddImpl(intermediate, part192256, out res); } } From 11428ea467d9601391ccf6190ee29c8bc0bf84d9 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 7 Feb 2025 03:33:31 +0000 Subject: [PATCH 05/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 76b358b..a579ff7 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1084,20 +1084,26 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) // Group 2: 2 products - ulong a2 = x.u2, a3 = x.u3; - ulong b0 = y.u0, b1 = y.u1; - Vector512 vecA2 = Vector512.Create(a2, a3, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL); - Vector512 vecB2 = Vector512.Create(b1, b0, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL); - Mul64Vector(vecA2, vecB2, out Vector512 lo2, out Vector512 hi2); - ulong P21_lo = lo2.GetElement(0); // Only lower 64 bits matter. + // Pack the two 64-bit values from x into a Vector128 + Vector128 vecA2 = Vector128.Create(x.u2, x.u3); + // Pack the two 64-bit values from y into a Vector128 in the required order. + // Here, we want lane 0 to contain b1 (for P21_lo) and lane 1 to contain b0 (for P30_lo). + Vector128 vecB2 = Vector128.Create(y.u1, y.u0); + + // Use MultiplyLow to multiply corresponding lanes and keep only the lower 64 bits. + Vector128 lo2 = Avx512DQ.VL.MultiplyLow(vecA2, vecB2); + + // Extract the results: + ulong P21_lo = lo2.GetElement(0); ulong P30_lo = lo2.GetElement(1); - // Group shifted left by 192 bits – only the lower 64 bits contribute. - // Any carry is discarded, so just use normal addition. - UInt256 part192256 = new UInt256(0, 0, 0, (P03_lo + P12_lo + P21_lo + P30_lo)); + ulong group192 = P03_lo + P12_lo + P21_lo + P30_lo; - // --- Sum all the partial products using the proven UInt256 adder (AddImpl) --- - AddImpl(intermediate, part192256, out res); + // Now add that to the most-significant limb of the intermediate result. + res = new UInt256(intermediate.u0, + intermediate.u1, + intermediate.u2, + intermediate.u3 + group192); // any carry here is dropped modulo 2^256 } } From 33c724208e59355faa5102744266ac4ca93975a9 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 7 Feb 2025 03:57:57 +0000 Subject: [PATCH 06/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index a579ff7..eaeb4ca 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1092,18 +1092,25 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) // Use MultiplyLow to multiply corresponding lanes and keep only the lower 64 bits. 
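// Lane pairing at this point: lane 0 multiplies x.u2 by y.u1 (the P21 term) and lane 1 multiplies
// x.u3 by y.u0 (the P30 term). Both terms sit at bit offset 192 of the 256-bit result, so only
// their low 64 bits can survive modulo 2^256, which is why a low-only 64-bit multiply
// (MultiplyLow, i.e. vpmullq) is enough and the high halves are never computed.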
Vector128 lo2 = Avx512DQ.VL.MultiplyLow(vecA2, vecB2); + + lo2 = Sse2.Add(lo2, Vector128.Create(P03_lo, P12_lo)); + // Reinterpret lo2 as a vector of doubles. + Vector128 lo2Double = lo2.AsDouble(); - // Extract the results: - ulong P21_lo = lo2.GetElement(0); - ulong P30_lo = lo2.GetElement(1); + // Use Sse2.Shuffle (which is _mm_shuffle_pd) with control mask 0x1 to swap the two lanes. + Vector128 shufDouble = Sse2.Shuffle(lo2Double, lo2Double, 0x1); - ulong group192 = P03_lo + P12_lo + P21_lo + P30_lo; + // Reinterpret back to ulong. + Vector128 shuf = shufDouble.AsUInt64(); - // Now add that to the most-significant limb of the intermediate result. - res = new UInt256(intermediate.u0, - intermediate.u1, - intermediate.u2, - intermediate.u3 + group192); // any carry here is dropped modulo 2^256 + // Add the original vector and the shuffled one. + Vector128 sumVec = Sse2.Add(lo2, shuf); + + // Now the horizontal sum is in lane 0. + ulong group192 = intermediate.u3 + sumVec.GetElement(0); + + Unsafe.SkipInit(out res); + Unsafe.As>(ref res) = Unsafe.As>(ref intermediate).WithElement(3, group192); } } From 6ccee9980b1f7ff15c771967b229d17d955501cf Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 7 Feb 2025 04:31:28 +0000 Subject: [PATCH 07/38] Simplify --- src/Nethermind.Int256/UInt256.cs | 74 ++++++++++++++++++-------------- 1 file changed, 42 insertions(+), 32 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index eaeb4ca..153e6a1 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1037,12 +1037,6 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) Vector256 vecA = Unsafe.As>(ref Unsafe.AsRef(in x)); Vector256 vecB = Unsafe.As>(ref Unsafe.AsRef(in y)); - // Vectorized branch using AVX-512. - // Unpack the four 64-bit limbs (little-endian: u0 is least-significant) - - // --- Compute the 10 64x64–bit partial products using our vectorized method --- - - // Group 1: 8 products Vector512 vecA1 = Vector512.Create(Avx2.Permute4x64(vecA, 16), Avx2.Permute4x64(vecA, 73)); Vector512 vecB1 = Vector512.Create(Avx2.Permute4x64(vecB, 132), Avx2.Permute4x64(vecB, 177)); Mul64Vector(vecA1, vecB1, out Vector512 lo1, out Vector512 hi1); @@ -1054,63 +1048,79 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) ulong P02_lo = lo1.GetElement(3), P02_hi = hi1.GetElement(3); ulong P11_lo = lo1.GetElement(4), P11_hi = hi1.GetElement(4); ulong P20_lo = lo1.GetElement(5), P20_hi = hi1.GetElement(5); - ulong P03_lo = lo1.GetElement(6), P03_hi = hi1.GetElement(6); - ulong P12_lo = lo1.GetElement(7), P12_hi = hi1.GetElement(7); + ulong P03_lo = lo1.GetElement(6); + ulong P12_lo = lo1.GetElement(7); // --- Package each 128-bit partial product into a UInt256 (with proper shifting) --- UInt256 intermediate; { - // Group with no shift. - UInt256 part0 = new UInt256(P00_lo, P00_hi, 0, 0); - - // Group shifted left by 64 bits. UInt256 part64a = new UInt256(0, P01_lo, P01_hi, 0); UInt256 part64b = new UInt256(0, P10_lo, P10_hi, 0); UInt256 sum64; AddImpl(part64a, part64b, out sum64); + + UInt256 part0 = new UInt256(P00_lo, P00_hi, 0, 0); AddImpl(part0, sum64, out intermediate); } { - // Group shifted left by 128 bits. 
- UInt256 part128a = new UInt256(0, 0, P02_lo, P02_hi); - UInt256 part128b = new UInt256(0, 0, P11_lo, P11_hi); - UInt256 part128c = new UInt256(0, 0, P20_lo, P20_hi); - UInt256 sum128, temp256; - AddImpl(part128a, part128b, out temp256); - AddImpl(temp256, part128c, out sum128); - AddImpl(intermediate, sum128, out intermediate); + // Pack the nonzero (upper 128-bit) parts into Vector128 + Vector128 v128a = Vector128.Create(P02_lo, P02_hi); + Vector128 v128b = Vector128.Create(P11_lo, P11_hi); + Vector128 v128c = Vector128.Create(P20_lo, P20_hi); + + // Use our 128-bit adder to sum these. + // (This helper adds two 128-bit values with proper carry propagation.) + Vector128 temp128 = Vector128AddWithCarry(v128a, v128b); + Vector128 sum128 = Vector128AddWithCarry(temp128, v128c); + + // Now, these two 64-bit lanes represent the contribution from group 128. + // They belong in the upper half (limbs u2 and u3) of our full 256-bit intermediate result. + // Extract the current upper half of the intermediate sum. + Vector128 interUpper = Vector128.Create(intermediate.u2, intermediate.u3); + // Add the computed 128-bit group sum to that upper half. + Vector128 newInterUpper = Vector128AddWithCarry(interUpper, sum128); + + // Update the intermediate result—its lower half (u0 and u1) remains unchanged. + intermediate = new UInt256( + intermediate.u0, + intermediate.u1, + newInterUpper.GetElement(0), + newInterUpper.GetElement(1)); } - - // Group 2: 2 products - // Pack the two 64-bit values from x into a Vector128 Vector128 vecA2 = Vector128.Create(x.u2, x.u3); - // Pack the two 64-bit values from y into a Vector128 in the required order. - // Here, we want lane 0 to contain b1 (for P21_lo) and lane 1 to contain b0 (for P30_lo). Vector128 vecB2 = Vector128.Create(y.u1, y.u0); - // Use MultiplyLow to multiply corresponding lanes and keep only the lower 64 bits. Vector128 lo2 = Avx512DQ.VL.MultiplyLow(vecA2, vecB2); lo2 = Sse2.Add(lo2, Vector128.Create(P03_lo, P12_lo)); - // Reinterpret lo2 as a vector of doubles. Vector128 lo2Double = lo2.AsDouble(); - // Use Sse2.Shuffle (which is _mm_shuffle_pd) with control mask 0x1 to swap the two lanes. Vector128 shufDouble = Sse2.Shuffle(lo2Double, lo2Double, 0x1); - // Reinterpret back to ulong. Vector128 shuf = shufDouble.AsUInt64(); - // Add the original vector and the shuffled one. Vector128 sumVec = Sse2.Add(lo2, shuf); - // Now the horizontal sum is in lane 0. 
ulong group192 = intermediate.u3 + sumVec.GetElement(0); Unsafe.SkipInit(out res); - Unsafe.As>(ref res) = Unsafe.As>(ref intermediate).WithElement(3, group192); + Unsafe.As>(ref res) = + Unsafe.As>(ref intermediate).WithElement(3, group192); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static Vector128 Vector128AddWithCarry(Vector128 a, Vector128 b) + { + Vector128 sum = Sse2.Add(a, b); + Vector128 carryMask = Avx512F.VL.CompareLessThan(sum, a); + carryMask = Sse2.ShiftRightLogical(carryMask, 63); + ulong s0 = sum.GetElement(0); + ulong s1 = sum.GetElement(1); + ulong c0 = carryMask.GetElement(0); + s1 += c0; + return Vector128.Create(s0, s1); } } From 1be23d338bcf77c590f1d525358d245f7df50568 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 7 Feb 2025 06:08:07 +0000 Subject: [PATCH 08/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 153e6a1..5a3a6c5 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1055,23 +1055,29 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) UInt256 intermediate; { - UInt256 part64a = new UInt256(0, P01_lo, P01_hi, 0); - UInt256 part64b = new UInt256(0, P10_lo, P10_hi, 0); - UInt256 sum64; - AddImpl(part64a, part64b, out sum64); - - UInt256 part0 = new UInt256(P00_lo, P00_hi, 0, 0); - AddImpl(part0, sum64, out intermediate); + Vector128 v128a = Vector128.Create(P01_lo, P01_hi); + Vector128 v128b = Vector128.Create(P10_lo, P10_hi); + Vector128 temp128 = Vector128AddWithCarry(v128a, v128b); + + var hi = temp128.GetElement(1); + var combine = P00_hi + temp128.GetElement(0); + + intermediate = new UInt256( + P00_lo, + combine, + hi + (P00_hi > combine ? 1ul : 0ul), + P01_hi > hi ? 1ul : 0ul); } { // Pack the nonzero (upper 128-bit) parts into Vector128 Vector128 v128a = Vector128.Create(P02_lo, P02_hi); Vector128 v128b = Vector128.Create(P11_lo, P11_hi); - Vector128 v128c = Vector128.Create(P20_lo, P20_hi); // Use our 128-bit adder to sum these. // (This helper adds two 128-bit values with proper carry propagation.) Vector128 temp128 = Vector128AddWithCarry(v128a, v128b); + + Vector128 v128c = Vector128.Create(P20_lo, P20_hi); Vector128 sum128 = Vector128AddWithCarry(temp128, v128c); // Now, these two 64-bit lanes represent the contribution from group 128. From 726fbccb3b0dbe2a7f10208ce8dcd1d519282bf9 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 7 Feb 2025 06:12:42 +0000 Subject: [PATCH 09/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 5a3a6c5..8670695 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1053,7 +1053,7 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) // --- Package each 128-bit partial product into a UInt256 (with proper shifting) --- - UInt256 intermediate; + Vector256 intermediate; { Vector128 v128a = Vector128.Create(P01_lo, P01_hi); Vector128 v128b = Vector128.Create(P10_lo, P10_hi); @@ -1062,7 +1062,7 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) var hi = temp128.GetElement(1); var combine = P00_hi + temp128.GetElement(0); - intermediate = new UInt256( + intermediate = Vector256.Create( P00_lo, combine, hi + (P00_hi > combine ? 
1ul : 0ul), @@ -1083,16 +1083,14 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) // Now, these two 64-bit lanes represent the contribution from group 128. // They belong in the upper half (limbs u2 and u3) of our full 256-bit intermediate result. // Extract the current upper half of the intermediate sum. - Vector128 interUpper = Vector128.Create(intermediate.u2, intermediate.u3); + Vector128 interUpper = intermediate.GetUpper(); // Add the computed 128-bit group sum to that upper half. Vector128 newInterUpper = Vector128AddWithCarry(interUpper, sum128); // Update the intermediate result—its lower half (u0 and u1) remains unchanged. - intermediate = new UInt256( - intermediate.u0, - intermediate.u1, - newInterUpper.GetElement(0), - newInterUpper.GetElement(1)); + intermediate = Vector256.Create( + intermediate.GetLower(), + newInterUpper); } Vector128 vecA2 = Vector128.Create(x.u2, x.u3); @@ -1109,11 +1107,11 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) Vector128 sumVec = Sse2.Add(lo2, shuf); - ulong group192 = intermediate.u3 + sumVec.GetElement(0); + ulong group192 = intermediate.GetElement(3) + sumVec.GetElement(0); Unsafe.SkipInit(out res); Unsafe.As>(ref res) = - Unsafe.As>(ref intermediate).WithElement(3, group192); + intermediate.WithElement(3, group192); } [MethodImpl(MethodImplOptions.AggressiveInlining)] From 503bdb8c5a19fede30ff0132a55d6be9efde6d8c Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 7 Feb 2025 06:15:35 +0000 Subject: [PATCH 10/38] optimize --- src/Nethermind.Int256/UInt256.cs | 62 +++++++++++++++----------------- 1 file changed, 29 insertions(+), 33 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 8670695..6b89e5d 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1053,45 +1053,41 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) // --- Package each 128-bit partial product into a UInt256 (with proper shifting) --- - Vector256 intermediate; - { - Vector128 v128a = Vector128.Create(P01_lo, P01_hi); - Vector128 v128b = Vector128.Create(P10_lo, P10_hi); - Vector128 temp128 = Vector128AddWithCarry(v128a, v128b); + Vector128 v128aa = Vector128.Create(P01_lo, P01_hi); + Vector128 v128bb = Vector128.Create(P10_lo, P10_hi); + Vector128 temp128a = Vector128AddWithCarry(v128aa, v128bb); - var hi = temp128.GetElement(1); - var combine = P00_hi + temp128.GetElement(0); + var hi = temp128a.GetElement(1); + var combine = P00_hi + temp128a.GetElement(0); - intermediate = Vector256.Create( - P00_lo, - combine, - hi + (P00_hi > combine ? 1ul : 0ul), - P01_hi > hi ? 1ul : 0ul); - } - { - // Pack the nonzero (upper 128-bit) parts into Vector128 - Vector128 v128a = Vector128.Create(P02_lo, P02_hi); - Vector128 v128b = Vector128.Create(P11_lo, P11_hi); + Vector256 intermediate = Vector256.Create( + P00_lo, + combine, + hi + (P00_hi > combine ? 1ul : 0ul), + P01_hi > hi ? 1ul : 0ul); + + // Pack the nonzero (upper 128-bit) parts into Vector128 + Vector128 v128a = Vector128.Create(P02_lo, P02_hi); + Vector128 v128b = Vector128.Create(P11_lo, P11_hi); - // Use our 128-bit adder to sum these. - // (This helper adds two 128-bit values with proper carry propagation.) - Vector128 temp128 = Vector128AddWithCarry(v128a, v128b); + // Use our 128-bit adder to sum these. + // (This helper adds two 128-bit values with proper carry propagation.) 
+ Vector128 temp128 = Vector128AddWithCarry(v128a, v128b); - Vector128 v128c = Vector128.Create(P20_lo, P20_hi); - Vector128 sum128 = Vector128AddWithCarry(temp128, v128c); + Vector128 v128c = Vector128.Create(P20_lo, P20_hi); + Vector128 sum128 = Vector128AddWithCarry(temp128, v128c); - // Now, these two 64-bit lanes represent the contribution from group 128. - // They belong in the upper half (limbs u2 and u3) of our full 256-bit intermediate result. - // Extract the current upper half of the intermediate sum. - Vector128 interUpper = intermediate.GetUpper(); - // Add the computed 128-bit group sum to that upper half. - Vector128 newInterUpper = Vector128AddWithCarry(interUpper, sum128); + // Now, these two 64-bit lanes represent the contribution from group 128. + // They belong in the upper half (limbs u2 and u3) of our full 256-bit intermediate result. + // Extract the current upper half of the intermediate sum. + Vector128 interUpper = intermediate.GetUpper(); + // Add the computed 128-bit group sum to that upper half. + Vector128 newInterUpper = Vector128AddWithCarry(interUpper, sum128); - // Update the intermediate result—its lower half (u0 and u1) remains unchanged. - intermediate = Vector256.Create( - intermediate.GetLower(), - newInterUpper); - } + // Update the intermediate result—its lower half (u0 and u1) remains unchanged. + intermediate = Vector256.Create( + intermediate.GetLower(), + newInterUpper); Vector128 vecA2 = Vector128.Create(x.u2, x.u3); Vector128 vecB2 = Vector128.Create(y.u1, y.u0); From 71d893f30ed8d632a2b4f99229de7dc7af3a1a62 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 7 Feb 2025 06:18:20 +0000 Subject: [PATCH 11/38] Recoment and rename --- src/Nethermind.Int256/UInt256.cs | 171 ++++++++++++++++++------------- 1 file changed, 99 insertions(+), 72 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 6b89e5d..2d01415 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1034,80 +1034,107 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) } else { - Vector256 vecA = Unsafe.As>(ref Unsafe.AsRef(in x)); - Vector256 vecB = Unsafe.As>(ref Unsafe.AsRef(in y)); - - Vector512 vecA1 = Vector512.Create(Avx2.Permute4x64(vecA, 16), Avx2.Permute4x64(vecA, 73)); - Vector512 vecB1 = Vector512.Create(Avx2.Permute4x64(vecB, 132), Avx2.Permute4x64(vecB, 177)); - Mul64Vector(vecA1, vecB1, out Vector512 lo1, out Vector512 hi1); - - // Extract products from group 1. - ulong P00_lo = lo1.GetElement(0), P00_hi = hi1.GetElement(0); - ulong P01_lo = lo1.GetElement(1), P01_hi = hi1.GetElement(1); - ulong P10_lo = lo1.GetElement(2), P10_hi = hi1.GetElement(2); - ulong P02_lo = lo1.GetElement(3), P02_hi = hi1.GetElement(3); - ulong P11_lo = lo1.GetElement(4), P11_hi = hi1.GetElement(4); - ulong P20_lo = lo1.GetElement(5), P20_hi = hi1.GetElement(5); - ulong P03_lo = lo1.GetElement(6); - ulong P12_lo = lo1.GetElement(7); - - // --- Package each 128-bit partial product into a UInt256 (with proper shifting) --- - - Vector128 v128aa = Vector128.Create(P01_lo, P01_hi); - Vector128 v128bb = Vector128.Create(P10_lo, P10_hi); - Vector128 temp128a = Vector128AddWithCarry(v128aa, v128bb); - - var hi = temp128a.GetElement(1); - var combine = P00_hi + temp128a.GetElement(0); - - Vector256 intermediate = Vector256.Create( - P00_lo, - combine, - hi + (P00_hi > combine ? 1ul : 0ul), - P01_hi > hi ? 
1ul : 0ul); - - // Pack the nonzero (upper 128-bit) parts into Vector128 - Vector128 v128a = Vector128.Create(P02_lo, P02_hi); - Vector128 v128b = Vector128.Create(P11_lo, P11_hi); - - // Use our 128-bit adder to sum these. - // (This helper adds two 128-bit values with proper carry propagation.) - Vector128 temp128 = Vector128AddWithCarry(v128a, v128b); - - Vector128 v128c = Vector128.Create(P20_lo, P20_hi); - Vector128 sum128 = Vector128AddWithCarry(temp128, v128c); - - // Now, these two 64-bit lanes represent the contribution from group 128. - // They belong in the upper half (limbs u2 and u3) of our full 256-bit intermediate result. - // Extract the current upper half of the intermediate sum. - Vector128 interUpper = intermediate.GetUpper(); - // Add the computed 128-bit group sum to that upper half. - Vector128 newInterUpper = Vector128AddWithCarry(interUpper, sum128); - - // Update the intermediate result—its lower half (u0 and u1) remains unchanged. - intermediate = Vector256.Create( - intermediate.GetLower(), - newInterUpper); - - Vector128 vecA2 = Vector128.Create(x.u2, x.u3); - Vector128 vecB2 = Vector128.Create(y.u1, y.u0); - - Vector128 lo2 = Avx512DQ.VL.MultiplyLow(vecA2, vecB2); - - lo2 = Sse2.Add(lo2, Vector128.Create(P03_lo, P12_lo)); - Vector128 lo2Double = lo2.AsDouble(); - - Vector128 shufDouble = Sse2.Shuffle(lo2Double, lo2Double, 0x1); - - Vector128 shuf = shufDouble.AsUInt64(); - - Vector128 sumVec = Sse2.Add(lo2, shuf); - - ulong group192 = intermediate.GetElement(3) + sumVec.GetElement(0); - + // Load the 256‐bit inputs into 256‐bit vector registers. + Vector256 aVector = Unsafe.As>(ref Unsafe.AsRef(in x)); + Vector256 bVector = Unsafe.As>(ref Unsafe.AsRef(in y)); + + // Rearrange the 64‐bit limbs of each input into 512‐bit vectors. + // The chosen permutations align the limbs so that later 64‐bit multiplications yield the correct cross‐products. + Vector512 rearrangedA = Vector512.Create( + Avx2.Permute4x64(aVector, 16), // Lower part permutation for A + Avx2.Permute4x64(aVector, 73)); // Upper part permutation for A + + Vector512 rearrangedB = Vector512.Create( + Avx2.Permute4x64(bVector, 132), // Lower part permutation for B + Avx2.Permute4x64(bVector, 177)); // Upper part permutation for B + + // Multiply the corresponding 64‐bit limbs of the rearranged inputs. + // Each multiplication yields a 128‐bit product split into a low and high 64‐bit part. + Mul64Vector(rearrangedA, rearrangedB, out Vector512 partialLo, out Vector512 partialHi); + + // --- Extract Partial Products from the First Group --- + // + // The following partial products (with both low and high parts) result from the 64‐bit multiplications. + // They are named to indicate their source position in the multiplication grid. + ulong prod00_Lo = partialLo.GetElement(0), prod00_Hi = partialHi.GetElement(0); + ulong prod01_Lo = partialLo.GetElement(1), prod01_Hi = partialHi.GetElement(1); + ulong prod10_Lo = partialLo.GetElement(2), prod10_Hi = partialHi.GetElement(2); + ulong prod02_Lo = partialLo.GetElement(3), prod02_Hi = partialHi.GetElement(3); + ulong prod11_Lo = partialLo.GetElement(4), prod11_Hi = partialHi.GetElement(4); + ulong prod20_Lo = partialLo.GetElement(5), prod20_Hi = partialHi.GetElement(5); + ulong prod03_Lo = partialLo.GetElement(6); // Only lower 64‐bits produced. + ulong prod12_Lo = partialLo.GetElement(7); // Only lower 64‐bits produced. 
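// Layout sketch: prodIJ denotes x.uI * y.uJ and contributes to the result at bit offset 64*(I+J):
//   offset   0: prod00
//   offset  64: prod01, prod10
//   offset 128: prod02, prod11, prod20
//   offset 192: prod03, prod12 (plus prod21 and prod30, computed separately further down);
//               at this offset only the low 64 bits matter, since the high halves would land at
//               offset 256 and the multiply is modulo 2^256.
// That is why only eight of the ten products are taken from the single 512-bit multiply above.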
+ + // --- Combine Lower-Group Partial Products into an Intermediate 256-bit Result --- + // + // The cross-terms prod01 and prod10 contribute to the middle limbs of the full product. + // First, add these two 128‐bit values (each stored as two 64‐bit limbs) with proper carry propagation. + Vector128 crossTermA = Vector128.Create(prod01_Lo, prod01_Hi); + Vector128 crossTermB = Vector128.Create(prod10_Lo, prod10_Hi); + Vector128 crossSum = Vector128AddWithCarry(crossTermA, crossTermB); + + // The lower 64‐bit lane of the cross‐sum will be added to the high part of prod00. + ulong crossLowPart = crossSum.GetElement(0); + ulong combinedProd00_Hi = prod00_Hi + crossLowPart; + + // Build the initial 256‐bit intermediate result from the lower-group products: + // • Limb 0: prod00_Lo (lowest 64 bits of prod00) + // • Limb 1: combinedProd00_Hi (prod00_Hi plus the low cross‐term) + // • Limb 2: The high lane of the cross‐sum plus a carry if the addition in limb 1 overflowed. + // • Limb 3: A final carry from the cross‐term addition (if prod01_Hi exceeds crossSum’s high lane). + Vector256 intermediateResult = Vector256.Create( + prod00_Lo, + combinedProd00_Hi, + crossSum.GetElement(1) + (prod00_Hi > combinedProd00_Hi ? 1ul : 0ul), + (prod01_Hi > crossSum.GetElement(1) ? 1ul : 0ul)); + + // --- Add Contributions from the Upper Group Partial Products --- + // + // The products prod02 and prod11 form one 128‐bit group. + Vector128 group2_A = Vector128.Create(prod02_Lo, prod02_Hi); + Vector128 group2_B = Vector128.Create(prod11_Lo, prod11_Hi); + Vector128 group2Sum = Vector128AddWithCarry(group2_A, group2_B); + + // Include the contribution from prod20 into the group sum. + Vector128 group2_C = Vector128.Create(prod20_Lo, prod20_Hi); + Vector128 totalGroup2 = Vector128AddWithCarry(group2Sum, group2_C); + + // These 128 bits (two 64-bit lanes) belong in the upper half (limbs 2 and 3) of the intermediate result. + // Retrieve the current upper 128 bits of the intermediate result and add the group2 sum. + Vector128 currentUpperHalf = intermediateResult.GetUpper(); + Vector128 newUpperHalf = Vector128AddWithCarry(currentUpperHalf, totalGroup2); + + // Update the intermediate result with the new upper half (the lower half remains unchanged). + intermediateResult = Vector256.Create( + intermediateResult.GetLower(), + newUpperHalf); + + // --- Process and Add the Final (Group 3) Contributions --- + // + // For the remaining contribution, multiply selected limbs from the inputs. + // Here, the upper 128 bits of x and the lower 128 bits of y (in reversed order) are multiplied. + Vector128 aHigh = Vector128.Create(x.u2, x.u3); + Vector128 bLow = Vector128.Create(y.u1, y.u0); + Vector128 finalProdLow = Avx512DQ.VL.MultiplyLow(aHigh, bLow); + + // Add the remaining lower parts from prod03 and prod12. + finalProdLow = Sse2.Add(finalProdLow, Vector128.Create(prod03_Lo, prod12_Lo)); + + // Perform a horizontal add on finalProdLow to collapse its two 64‐bit lanes into one sum. + // (This is done by shuffling the 64-bit lanes using a double-precision view and then adding them.) + Vector128 finalProdAsDouble = finalProdLow.AsDouble(); + Vector128 shuffledDouble = Sse2.Shuffle(finalProdAsDouble, finalProdAsDouble, 0x1); + Vector128 shuffledULong = shuffledDouble.AsUInt64(); + Vector128 horizontalSum = Sse2.Add(finalProdLow, shuffledULong); + + // Add the horizontal sum (the final contribution) to the most-significant limb (limb 3) of the intermediate result. 
+ ulong updatedMostSignificant = intermediateResult.GetElement(3) + horizontalSum.GetElement(0); + + // Write the final 256-bit product, updating limb 3 with the new value. Unsafe.SkipInit(out res); Unsafe.As>(ref res) = - intermediate.WithElement(3, group192); + intermediateResult.WithElement(3, updatedMostSignificant); + } [MethodImpl(MethodImplOptions.AggressiveInlining)] From 81a49f5541baeb589f442066a631ba5a961a4273 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 7 Feb 2025 11:49:17 +0000 Subject: [PATCH 12/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 48 +++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 13 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 2d01415..726269d 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1122,9 +1122,7 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) // Perform a horizontal add on finalProdLow to collapse its two 64‐bit lanes into one sum. // (This is done by shuffling the 64-bit lanes using a double-precision view and then adding them.) - Vector128 finalProdAsDouble = finalProdLow.AsDouble(); - Vector128 shuffledDouble = Sse2.Shuffle(finalProdAsDouble, finalProdAsDouble, 0x1); - Vector128 shuffledULong = shuffledDouble.AsUInt64(); + Vector128 shuffledULong = Sse2.Shuffle(finalProdLow.AsDouble(), finalProdLow.AsDouble(), 0x1).AsUInt64(); Vector128 horizontalSum = Sse2.Add(finalProdLow, shuffledULong); // Add the horizontal sum (the final contribution) to the most-significant limb (limb 3) of the intermediate result. @@ -1137,17 +1135,41 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) } + /// + /// Adds two 128-bit unsigned integers while propagating an overflow (carry) from the lower 64-bit lane to the higher lane. + /// Each 128-bit integer is represented as a , with element 0 holding the lower 64 bits + /// and element 1 holding the higher 64 bits. + /// + /// The first 128-bit unsigned integer operand. + /// The second 128-bit unsigned integer operand. + /// + /// A representing the sum of the two operands, with any carry from the lower lane added + /// into the higher lane. + /// [MethodImpl(MethodImplOptions.AggressiveInlining)] - static Vector128 Vector128AddWithCarry(Vector128 a, Vector128 b) - { - Vector128 sum = Sse2.Add(a, b); - Vector128 carryMask = Avx512F.VL.CompareLessThan(sum, a); - carryMask = Sse2.ShiftRightLogical(carryMask, 63); - ulong s0 = sum.GetElement(0); - ulong s1 = sum.GetElement(1); - ulong c0 = carryMask.GetElement(0); - s1 += c0; - return Vector128.Create(s0, s1); + static Vector128 Vector128AddWithCarry(Vector128 left, Vector128 right) + { + // Perform a lane-wise addition of the two operands. + Vector128 sum = Sse2.Add(left, right); + + // For unsigned addition, an overflow in a lane occurs if the result is less than one of the operands. + // Comparing 'sum' with 'operand1' produces a mask where each 64-bit lane is all ones if an overflow occurred, or zero otherwise. + Vector128 overflowMask = Avx512F.VL.CompareLessThan(sum, left); + + // Normalize the overflow mask: shift each 64-bit lane right by 63 bits. + // This converts a full mask (0xFFFFFFFFFFFFFFFF) to 1, leaving lanes with no overflow as 0. + overflowMask = Sse2.ShiftRightLogical(overflowMask, 63); + + // Promote the carry from the lower lane (element 0) into the upper lane. + // First, swap the two 64-bit lanes so that the lower lane's carry moves to the higher lane. 
+ Vector128 swappedCarry = Sse2.Shuffle(overflowMask.AsDouble(), overflowMask.AsDouble(), 0x1).AsUInt64(); + + // Next, clear the (now swapped) lower lane by blending with a zero vector. + // The immediate mask 0x1 indicates that lane 0 should come from the zero vector and lane 1 remains unchanged. + Vector128 promotedCarry = Sse41.Blend(swappedCarry.AsDouble(), Vector128.Zero, 0x1).AsUInt64(); + + // Add the propagated carry to the sum. + return Sse2.Add(sum, promotedCarry); } } From 2c5c7a38e90e2a4fca26e12c7b30f212d48fb755 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 7 Feb 2025 13:59:15 +0000 Subject: [PATCH 13/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 380 ++++++++++++++++++------------- 1 file changed, 216 insertions(+), 164 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 726269d..95149d7 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1010,7 +1010,7 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) Unsafe.AsRef(in res.u1) = high; return; } - + if (!Avx512F.IsSupported || !Avx512DQ.IsSupported) { ref ulong rx = ref Unsafe.As(ref Unsafe.AsRef(in x)); @@ -1034,189 +1034,241 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) } else { - // Load the 256‐bit inputs into 256‐bit vector registers. + // 1. Load the 256‐bit inputs into 256‐bit vector registers. Vector256 aVector = Unsafe.As>(ref Unsafe.AsRef(in x)); Vector256 bVector = Unsafe.As>(ref Unsafe.AsRef(in y)); - // Rearrange the 64‐bit limbs of each input into 512‐bit vectors. - // The chosen permutations align the limbs so that later 64‐bit multiplications yield the correct cross‐products. - Vector512 rearrangedA = Vector512.Create( - Avx2.Permute4x64(aVector, 16), // Lower part permutation for A - Avx2.Permute4x64(aVector, 73)); // Upper part permutation for A + // 2. Rearrange the 64‐bit limbs into 512‐bit vectors. + Vector256 aPerm0 = Avx2.Permute4x64(aVector, 16); + Vector256 aPerm1 = Avx2.Permute4x64(aVector, 73); + Vector512 rearrangedA = Vector512.Create(aPerm0, aPerm1); - Vector512 rearrangedB = Vector512.Create( - Avx2.Permute4x64(bVector, 132), // Lower part permutation for B - Avx2.Permute4x64(bVector, 177)); // Upper part permutation for B + Vector256 bPerm0 = Avx2.Permute4x64(bVector, 132); + Vector256 bPerm1 = Avx2.Permute4x64(bVector, 177); + Vector512 rearrangedB = Vector512.Create(bPerm0, bPerm1); - // Multiply the corresponding 64‐bit limbs of the rearranged inputs. - // Each multiplication yields a 128‐bit product split into a low and high 64‐bit part. + // 3. Multiply the corresponding 64‐bit limbs. Mul64Vector(rearrangedA, rearrangedB, out Vector512 partialLo, out Vector512 partialHi); - // --- Extract Partial Products from the First Group --- - // - // The following partial products (with both low and high parts) result from the 64‐bit multiplications. - // They are named to indicate their source position in the multiplication grid. 
- ulong prod00_Lo = partialLo.GetElement(0), prod00_Hi = partialHi.GetElement(0); - ulong prod01_Lo = partialLo.GetElement(1), prod01_Hi = partialHi.GetElement(1); - ulong prod10_Lo = partialLo.GetElement(2), prod10_Hi = partialHi.GetElement(2); - ulong prod02_Lo = partialLo.GetElement(3), prod02_Hi = partialHi.GetElement(3); - ulong prod11_Lo = partialLo.GetElement(4), prod11_Hi = partialHi.GetElement(4); - ulong prod20_Lo = partialLo.GetElement(5), prod20_Hi = partialHi.GetElement(5); - ulong prod03_Lo = partialLo.GetElement(6); // Only lower 64‐bits produced. - ulong prod12_Lo = partialLo.GetElement(7); // Only lower 64‐bits produced. - - // --- Combine Lower-Group Partial Products into an Intermediate 256-bit Result --- - // - // The cross-terms prod01 and prod10 contribute to the middle limbs of the full product. - // First, add these two 128‐bit values (each stored as two 64‐bit limbs) with proper carry propagation. - Vector128 crossTermA = Vector128.Create(prod01_Lo, prod01_Hi); - Vector128 crossTermB = Vector128.Create(prod10_Lo, prod10_Hi); - Vector128 crossSum = Vector128AddWithCarry(crossTermA, crossTermB); - - // The lower 64‐bit lane of the cross‐sum will be added to the high part of prod00. - ulong crossLowPart = crossSum.GetElement(0); - ulong combinedProd00_Hi = prod00_Hi + crossLowPart; - - // Build the initial 256‐bit intermediate result from the lower-group products: - // • Limb 0: prod00_Lo (lowest 64 bits of prod00) - // • Limb 1: combinedProd00_Hi (prod00_Hi plus the low cross‐term) - // • Limb 2: The high lane of the cross‐sum plus a carry if the addition in limb 1 overflowed. - // • Limb 3: A final carry from the cross‐term addition (if prod01_Hi exceeds crossSum’s high lane). - Vector256 intermediateResult = Vector256.Create( - prod00_Lo, - combinedProd00_Hi, - crossSum.GetElement(1) + (prod00_Hi > combinedProd00_Hi ? 1ul : 0ul), - (prod01_Hi > crossSum.GetElement(1) ? 1ul : 0ul)); - - // --- Add Contributions from the Upper Group Partial Products --- - // - // The products prod02 and prod11 form one 128‐bit group. - Vector128 group2_A = Vector128.Create(prod02_Lo, prod02_Hi); - Vector128 group2_B = Vector128.Create(prod11_Lo, prod11_Hi); - Vector128 group2Sum = Vector128AddWithCarry(group2_A, group2_B); - - // Include the contribution from prod20 into the group sum. - Vector128 group2_C = Vector128.Create(prod20_Lo, prod20_Hi); - Vector128 totalGroup2 = Vector128AddWithCarry(group2Sum, group2_C); - - // These 128 bits (two 64-bit lanes) belong in the upper half (limbs 2 and 3) of the intermediate result. - // Retrieve the current upper 128 bits of the intermediate result and add the group2 sum. - Vector128 currentUpperHalf = intermediateResult.GetUpper(); - Vector128 newUpperHalf = Vector128AddWithCarry(currentUpperHalf, totalGroup2); - - // Update the intermediate result with the new upper half (the lower half remains unchanged). - intermediateResult = Vector256.Create( - intermediateResult.GetLower(), - newUpperHalf); - - // --- Process and Add the Final (Group 3) Contributions --- - // - // For the remaining contribution, multiply selected limbs from the inputs. - // Here, the upper 128 bits of x and the lower 128 bits of y (in reversed order) are multiplied. + // 4. Rearrange the six “group‑1” products (prod00, prod01, prod10, prod02, prod11, prod20) + // into 128‑bit quantities. (Here we use the AVX‑512 “extract 128‑bit” function to get two adjacent 64‑bit lanes.) 
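// Regrouping sketch (standard unpack semantics for 64-bit lanes): ExtractVector128(v, k) returns
// lanes {2k, 2k+1} of the 512-bit vector, UnpackLow(a, b) = {a[0], b[0]} and
// UnpackHigh(a, b) = {a[1], b[1]}. Pairing the same 128-bit slice of partialLo and partialHi
// therefore turns two adjacent lanes into two (lo, hi) products, e.g.
//   UnpackLow(pair01Lo, pair01Hi)  -> {prod00_lo, prod00_hi}
//   UnpackHigh(pair01Lo, pair01Hi) -> {prod01_lo, prod01_hi}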
+ // – Products 0 and 1 come from index 0: + Vector128 pair01Lo = Avx512F.ExtractVector128(partialLo, 0); // lanes 0–1: prod00_lo, prod01_lo + Vector128 pair01Hi = Avx512F.ExtractVector128(partialHi, 0); // lanes 0–1: prod00_hi, prod01_hi + // Unpack lower (lane0) and upper (lane1) to form product0 and product1: + Vector128 prod0 = Sse2.UnpackLow(pair01Lo, pair01Hi); // prod00 = {lo, hi} + Vector128 prod1 = Sse2.UnpackHigh(pair01Lo, pair01Hi); // prod01 = {lo, hi} + + // – Products 2 and 3 come from index 1: + Vector128 pair23Lo = Avx512F.ExtractVector128(partialLo, 1); // lanes 2–3: prod10_lo, prod02_lo + Vector128 pair23Hi = Avx512F.ExtractVector128(partialHi, 1); // lanes 2–3: prod10_hi, prod02_hi + Vector128 prod2 = Sse2.UnpackLow(pair23Lo, pair23Hi); // prod10 + Vector128 prod3 = Sse2.UnpackHigh(pair23Lo, pair23Hi); // prod02 + + // – Products 4 and 5 come from index 2: + Vector128 pair45Lo = Avx512F.ExtractVector128(partialLo, 2); // lanes 4–5: prod11_lo, prod20_lo + Vector128 pair45Hi = Avx512F.ExtractVector128(partialHi, 2); // lanes 4–5: prod11_hi, prod20_hi + Vector128 prod4 = Sse2.UnpackLow(pair45Lo, pair45Hi); // prod11 + Vector128 prod5 = Sse2.UnpackHigh(pair45Lo, pair45Hi); // prod20 + + // 5. Group‑1 “cross‑term” addition: + // crossSum = prod01 + prod10 (i.e. add the 128‑bit numbers prod1 and prod2) + Vector128 crossSum = Add128(prod1, prod2); + + // 6. Add the low half of crossSum (i.e. its lower 64 bits) to prod00’s high limb. + // Instead of extracting a scalar, we broadcast the lower 64 bits to a vector. + // (Assume BroadcastLower128 returns a copy with both lanes equal to element0.) + Vector128 csLow = BroadcastLower128(crossSum); + // Create a mask to add only to the high lane: mask = {0, ulong.MaxValue} + Vector128 highMask = Vector128.Create(0ul, ulong.MaxValue); + Vector128 addMask = Sse2.And(csLow, highMask); + Vector128 prod0Updated = Sse2.Add(prod0, addMask); + + // Now, compute the “carry” from that addition. (Again, we must compute a one‐bit flag.) + uint carryFlag = (uint)Sse2.MoveMask(Avx512F.VL.CompareLessThan( + ExtractHighLimb(prod0Updated), // compare updated high limb... + ExtractHighLimb(prod0) // ...with the original high limb + ).AsByte()) & 1; + // Add the carry to the high half of crossSum. (Broadcast the carry into a 128‑bit vector.) + Vector128 csHigh = BroadcastUpper128(crossSum); + Vector128 limb2 = Sse2.Add(csHigh, Vector128.CreateScalar((ulong)carryFlag)); + + // And form limb3 from a comparison of prod01’s high limb with crossSum’s high: + uint limb3 = (uint)(Sse2.MoveMask(Avx512F.VL.CompareGreaterThan( + ExtractHighLimb(prod1), csHigh).AsByte()) & 1); + Vector128 limb3Vec = Vector128.CreateScalar((ulong)limb3); + + // 7. Build the 256‑bit “intermediate” result from group‑1: + // Lower 128 bits = prod00 (with updated high limb) + // Upper 128 bits = (limb2, limb3) packed into a 128‑bit vector. + Vector128 lowerIntermediate = prod0Updated; + // Pack limb2 into the lower half and limb3 into the upper half. + Vector128 upperIntermediate = Sse2.UnpackLow(limb2, limb3Vec); + Vector256 intermediateResult = Vector256.Create(lowerIntermediate, upperIntermediate); + + // 8. Process group‑2: (prod02, prod11, prod20) + Vector128 group2Sum = Add128(prod3, prod4); + Vector128 totalGroup2 = Add128(group2Sum, prod5); + // Add totalGroup2 into the current upper 128 bits of intermediateResult. 
+ Vector128 currentUpper = GetUpper(intermediateResult); + Vector128 newUpper = Add128(currentUpper, totalGroup2); + intermediateResult = WithUpper(intermediateResult, newUpper); + + // 9. Process group‑3: + // Multiply “aHigh” and “bLow” (with the proper reversed order) then add in the remaining lower parts. Vector128 aHigh = Vector128.Create(x.u2, x.u3); Vector128 bLow = Vector128.Create(y.u1, y.u0); + // Use the AVX512DQ MultiplyLow intrinsic (which multiplies 64‑bit integers and returns the low 64 bits) Vector128 finalProdLow = Avx512DQ.VL.MultiplyLow(aHigh, bLow); - // Add the remaining lower parts from prod03 and prod12. - finalProdLow = Sse2.Add(finalProdLow, Vector128.Create(prod03_Lo, prod12_Lo)); + // Extract from partialLo the two lower parts for prod03 and prod12. + // With partialLo logically split into Lower (lanes 0–3) and Upper (lanes 4–7), + // lanes 6 and 7 are in the Upper half; extracting the second 128‐bit portion of Upper gives us these lanes. + Vector128 prod6 = Avx2.ExtractVector128(partialLo.GetUpper(), 1); + // Extract from index 3 the two lower‐parts from prod03 and prod12 (which we stored in “prod6”): + // (Note: prod6 already holds both lower parts.) + finalProdLow = Sse2.Add(finalProdLow, prod6); + // Now perform a horizontal add so that the two 64‑bit lanes collapse to a single 64‑bit value. + Vector128 horizontalSum = HorizontalAdd(finalProdLow); + // Add the horizontal sum (broadcast into the high lane) to the most–significant limb of intermediateResult. + Vector128 upperTemp = GetUpper(intermediateResult); + Vector128 hsBroadcast = Sse2.And(BroadcastLower128(horizontalSum), highMask); + Vector128 newUpperTemp = Sse2.Add(upperTemp, hsBroadcast); + intermediateResult = WithUpper(intermediateResult, newUpperTemp); + + // 10. Write out the final 256‑bit result. + Unsafe.SkipInit(out res); + Unsafe.As>(ref res) = intermediateResult; - // Perform a horizontal add on finalProdLow to collapse its two 64‐bit lanes into one sum. - // (This is done by shuffling the 64-bit lanes using a double-precision view and then adding them.) - Vector128 shuffledULong = Sse2.Shuffle(finalProdLow.AsDouble(), finalProdLow.AsDouble(), 0x1).AsUInt64(); - Vector128 horizontalSum = Sse2.Add(finalProdLow, shuffledULong); + static Vector128 HorizontalAdd(Vector128 vec) + { + // Reinterpret the 64-bit integer vector as a vector of two doubles. + // Then use _mm_shuffle_pd (exposed as Sse2.Shuffle for doubles) to swap the two lanes. + Vector128 swapped = Sse2.Shuffle(vec.AsDouble(), vec.AsDouble(), 0x1).AsUInt64(); - // Add the horizontal sum (the final contribution) to the most-significant limb (limb 3) of the intermediate result. - ulong updatedMostSignificant = intermediateResult.GetElement(3) + horizontalSum.GetElement(0); + // Add the original vector and the swapped vector. + // This results in a vector where both lanes equal (vec[0] + vec[1]). + return Sse2.Add(vec, swapped); + } - // Write the final 256-bit product, updating limb 3 with the new value. - Unsafe.SkipInit(out res); - Unsafe.As>(ref res) = - intermediateResult.WithElement(3, updatedMostSignificant); - - } - - /// - /// Adds two 128-bit unsigned integers while propagating an overflow (carry) from the lower 64-bit lane to the higher lane. - /// Each 128-bit integer is represented as a , with element 0 holding the lower 64 bits - /// and element 1 holding the higher 64 bits. - /// - /// The first 128-bit unsigned integer operand. - /// The second 128-bit unsigned integer operand. 
- /// - /// A representing the sum of the two operands, with any carry from the lower lane added - /// into the higher lane. - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - static Vector128 Vector128AddWithCarry(Vector128 left, Vector128 right) - { - // Perform a lane-wise addition of the two operands. - Vector128 sum = Sse2.Add(left, right); + // Helpers that mimic “GetUpper” and “WithUpper” on a 256‑bit vector. + // (You might implement these as extension methods on Vector256.) + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static Vector128 GetUpper(Vector256 vec) + { + // For example, using Avx2.ExtractVector128: + return Avx2.ExtractVector128(vec, 1); + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static Vector256 WithUpper(Vector256 vec, Vector128 upper) + { + // Replace the upper 128 bits of vec with upper. + return Avx2.InsertVector128(vec, upper, 1); + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static Vector128 ExtractHighLimb(Vector128 vec) + { + // Reinterpret the 64-bit vector as 32-bit elements, shuffle to replicate the upper 64-bit limb, + // then reinterpret back as 64-bit. + return Sse2.Shuffle(vec.AsUInt32(), 0xEE).AsUInt64(); + } - // For unsigned addition, an overflow in a lane occurs if the result is less than one of the operands. - // Comparing 'sum' with 'operand1' produces a mask where each 64-bit lane is all ones if an overflow occurred, or zero otherwise. - Vector128 overflowMask = Avx512F.VL.CompareLessThan(sum, left); + // Helpers to “broadcast” the lower or upper 64‐bit lane of a Vector128. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static Vector128 BroadcastLower128(Vector128 vec) + { + // Replicate element0 to both lanes. + return Sse2.Shuffle(vec.AsDouble(), vec.AsDouble(), 0).AsUInt64(); + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static Vector128 BroadcastUpper128(Vector128 vec) + { + // Replicate element1 to both lanes. + return Sse2.Shuffle(vec.AsDouble(), vec.AsDouble(), 3).AsUInt64(); // 0xFF means both lanes come from the original element1 + } + /// + /// Adds two 128-bit unsigned integers while propagating an overflow (carry) from the lower 64-bit lane to the higher lane. + /// Each 128-bit integer is represented as a , with element 0 holding the lower 64 bits + /// and element 1 holding the higher 64 bits. + /// + /// The first 128-bit unsigned integer operand. + /// The second 128-bit unsigned integer operand. + /// + /// A representing the sum of the two operands, with any carry from the lower lane added + /// into the higher lane. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static Vector128 Add128(Vector128 left, Vector128 right) + { + // Perform a lane-wise addition of the two operands. + Vector128 sum = Sse2.Add(left, right); + + // For unsigned addition, an overflow in a lane occurs if the result is less than one of the operands. + // Comparing 'sum' with 'operand1' produces a mask where each 64-bit lane is all ones if an overflow occurred, or zero otherwise. + Vector128 overflowMask = Avx512F.VL.CompareLessThan(sum, left); - // Normalize the overflow mask: shift each 64-bit lane right by 63 bits. - // This converts a full mask (0xFFFFFFFFFFFFFFFF) to 1, leaving lanes with no overflow as 0. - overflowMask = Sse2.ShiftRightLogical(overflowMask, 63); + // Normalize the overflow mask: shift each 64-bit lane right by 63 bits. + // This converts a full mask (0xFFFFFFFFFFFFFFFF) to 1, leaving lanes with no overflow as 0. 
+ overflowMask = Sse2.ShiftRightLogical(overflowMask, 63); - // Promote the carry from the lower lane (element 0) into the upper lane. - // First, swap the two 64-bit lanes so that the lower lane's carry moves to the higher lane. - Vector128 swappedCarry = Sse2.Shuffle(overflowMask.AsDouble(), overflowMask.AsDouble(), 0x1).AsUInt64(); + // Promote the carry from the lower lane (element 0) into the upper lane. + // First, swap the two 64-bit lanes so that the lower lane's carry moves to the higher lane. + Vector128 swappedCarry = Sse2.Shuffle(overflowMask.AsDouble(), overflowMask.AsDouble(), 0x1).AsUInt64(); - // Next, clear the (now swapped) lower lane by blending with a zero vector. - // The immediate mask 0x1 indicates that lane 0 should come from the zero vector and lane 1 remains unchanged. - Vector128 promotedCarry = Sse41.Blend(swappedCarry.AsDouble(), Vector128.Zero, 0x1).AsUInt64(); + // Next, clear the (now swapped) lower lane by blending with a zero vector. + // The immediate mask 0x1 indicates that lane 0 should come from the zero vector and lane 1 remains unchanged. + Vector128 promotedCarry = Sse41.Blend(swappedCarry.AsDouble(), Vector128.Zero, 0x1).AsUInt64(); - // Add the propagated carry to the sum. - return Sse2.Add(sum, promotedCarry); + // Add the propagated carry to the sum. + return Sse2.Add(sum, promotedCarry); + } + } + + // Vectorized 64x64 multiply: given vectors 'a' and 'b' (each 8 lanes), + // computes per lane: + // product = a * b = (hi, lo) + // using the splitting method since there is no MultiplyHigh intrinsic. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static void Mul64Vector(Vector512 a, Vector512 b, + out Vector512 lo, out Vector512 hi) + { + // Mask for the lower 32 bits. + Vector512 mask32 = Vector512.Create(0xFFFFFFFFUL); + + // Split each 64-bit operand into 32-bit halves: + // a0 = lower 32 bits, a1 = upper 32 bits + Vector512 a0 = Avx512F.And(a, mask32); + Vector512 a1 = Avx512F.ShiftRightLogical(a, 32); + Vector512 b0 = Avx512F.And(b, mask32); + Vector512 b1 = Avx512F.ShiftRightLogical(b, 32); + + // Compute the four 32x32 partial products. + // Each multiplication here is on 32-bit values, so the result fits in 64 bits. + Vector512 u0 = Avx512DQ.MultiplyLow(a0, b0); // a0 * b0 + Vector512 u1 = Avx512DQ.MultiplyLow(a0, b1); // a0 * b1 + Vector512 u2 = Avx512DQ.MultiplyLow(a1, b0); // a1 * b0 + Vector512 u3 = Avx512DQ.MultiplyLow(a1, b1); // a1 * b1 + + // Now, compute t = (u0 >> 32) + (u1 & mask32) + (u2 & mask32) + Vector512 u0_hi = Avx512F.ShiftRightLogical(u0, 32); + Vector512 u1_lo = Avx512F.And(u1, mask32); + Vector512 u2_lo = Avx512F.And(u2, mask32); + Vector512 t = Avx512F.Add(Avx512F.Add(u0_hi, u1_lo), u2_lo); + + // The extra carry: c = t >> 32. + Vector512 c = Avx512F.ShiftRightLogical(t, 32); + + // Now, assemble the lower 64 bits: + // low part of u0 is u0 & mask32; low 32 bits of t are (t & mask32) shifted left 32. + Vector512 u0_lo = Avx512F.And(u0, mask32); + Vector512 t_lo = Avx512F.And(t, mask32); + lo = Avx512F.Or(u0_lo, Avx512F.ShiftLeftLogical(t_lo, 32)); + + // The high 64 bits are: u3 + (u1 >> 32) + (u2 >> 32) + c. + Vector512 u1_hi = Avx512F.ShiftRightLogical(u1, 32); + Vector512 u2_hi = Avx512F.ShiftRightLogical(u2, 32); + hi = Avx512F.Add(Avx512F.Add(Avx512F.Add(u3, u1_hi), u2_hi), c); } - } - - // Vectorized 64x64 multiply: given vectors 'a' and 'b' (each 8 lanes), - // computes per lane: - // product = a * b = (hi, lo) - // using the splitting method since there is no MultiplyHigh intrinsic. 
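The comment removed above notes that the vectorized path uses "the splitting method since there is no MultiplyHigh intrinsic". For reference, each lane of Mul64Vector performs the same 64x64 -> 128 computation as the scalar sketch below (a minimal sketch only; the helper name is illustrative and not part of the patch, and the result matches Math.BigMul):

static (ulong Hi, ulong Lo) Mul64Split(ulong a, ulong b)
{
    const ulong Mask32 = 0xFFFFFFFFUL;
    // Split each operand into 32-bit halves.
    ulong a0 = a & Mask32, a1 = a >> 32;
    ulong b0 = b & Mask32, b1 = b >> 32;

    // Four 32x32 partial products; each fits in 64 bits.
    ulong u0 = a0 * b0;
    ulong u1 = a0 * b1;
    ulong u2 = a1 * b0;
    ulong u3 = a1 * b1;

    // Accumulate the middle 32-bit column, then assemble the two halves.
    ulong t = (u0 >> 32) + (u1 & Mask32) + (u2 & Mask32);
    ulong lo = (u0 & Mask32) | (t << 32);
    ulong hi = u3 + (u1 >> 32) + (u2 >> 32) + (t >> 32);
    return (hi, lo);
}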
- [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static void Mul64Vector(Vector512 a, Vector512 b, - out Vector512 lo, out Vector512 hi) - { - // Mask for the lower 32 bits. - Vector512 mask32 = Vector512.Create(0xFFFFFFFFUL); - - // Split each 64-bit operand into 32-bit halves: - // a0 = lower 32 bits, a1 = upper 32 bits - Vector512 a0 = Avx512F.And(a, mask32); - Vector512 a1 = Avx512F.ShiftRightLogical(a, 32); - Vector512 b0 = Avx512F.And(b, mask32); - Vector512 b1 = Avx512F.ShiftRightLogical(b, 32); - - // Compute the four 32x32 partial products. - // Each multiplication here is on 32-bit values, so the result fits in 64 bits. - Vector512 u0 = Avx512DQ.MultiplyLow(a0, b0); // a0 * b0 - Vector512 u1 = Avx512DQ.MultiplyLow(a0, b1); // a0 * b1 - Vector512 u2 = Avx512DQ.MultiplyLow(a1, b0); // a1 * b0 - Vector512 u3 = Avx512DQ.MultiplyLow(a1, b1); // a1 * b1 - - // Now, compute t = (u0 >> 32) + (u1 & mask32) + (u2 & mask32) - Vector512 u0_hi = Avx512F.ShiftRightLogical(u0, 32); - Vector512 u1_lo = Avx512F.And(u1, mask32); - Vector512 u2_lo = Avx512F.And(u2, mask32); - Vector512 t = Avx512F.Add(Avx512F.Add(u0_hi, u1_lo), u2_lo); - - // The extra carry: c = t >> 32. - Vector512 c = Avx512F.ShiftRightLogical(t, 32); - - // Now, assemble the lower 64 bits: - // low part of u0 is u0 & mask32; low 32 bits of t are (t & mask32) shifted left 32. - Vector512 u0_lo = Avx512F.And(u0, mask32); - Vector512 t_lo = Avx512F.And(t, mask32); - lo = Avx512F.Or(u0_lo, Avx512F.ShiftLeftLogical(t_lo, 32)); - - // The high 64 bits are: u3 + (u1 >> 32) + (u2 >> 32) + c. - Vector512 u1_hi = Avx512F.ShiftRightLogical(u1, 32); - Vector512 u2_hi = Avx512F.ShiftRightLogical(u2, 32); - hi = Avx512F.Add(Avx512F.Add(Avx512F.Add(u3, u1_hi), u2_hi), c); } public void Multiply(in UInt256 a, out UInt256 res) => Multiply(this, a, out res); From d0195e9671b7ac19ea3e4bff100fe849ab169944 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 04:59:50 +0000 Subject: [PATCH 14/38] Fix benchmarks --- src/Nethermind.Int256.Benchmark/Benchmarks.cs | 92 +++++++++---------- .../NoIntrinsicsJobAttribute.cs | 4 +- 2 files changed, 49 insertions(+), 47 deletions(-) diff --git a/src/Nethermind.Int256.Benchmark/Benchmarks.cs b/src/Nethermind.Int256.Benchmark/Benchmarks.cs index 616aac1..b22a1da 100644 --- a/src/Nethermind.Int256.Benchmark/Benchmarks.cs +++ b/src/Nethermind.Int256.Benchmark/Benchmarks.cs @@ -89,8 +89,8 @@ public class SignedIntTwoParamBenchmarkBase : SignedBenchmarkBase public (int, Int256) D; } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class AddUnsigned : UnsignedTwoParamBenchmarkBase { @@ -108,8 +108,8 @@ public UInt256 Add_UInt256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class AddSigned : SignedTwoParamBenchmarkBase { @@ -127,8 +127,8 @@ public Int256 Add_Int256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class SubtractUnsigned : UnsignedTwoParamBenchmarkBase { @@ -146,8 +146,8 @@ public UInt256 Subtract_UInt256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - 
[NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class SubtractSigned : SignedTwoParamBenchmarkBase { @@ -165,8 +165,8 @@ public Int256 Subtract_Int256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class AddModUnsinged : UnsignedThreeParamBenchmarkBase { @@ -184,8 +184,8 @@ public UInt256 AddMod_UInt256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class AddModSinged : SignedThreeParamBenchmarkBase { @@ -203,8 +203,8 @@ public Int256 AddMod_Int256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class SubtractModUnsinged : UnsignedThreeParamBenchmarkBase { @@ -222,8 +222,8 @@ public UInt256 SubtractMod_UInt256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class SubtractModSigned : SignedThreeParamBenchmarkBase { @@ -241,8 +241,8 @@ public Int256 SubtractMod_Int256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class MultiplyUnsigned : UnsignedTwoParamBenchmarkBase { @@ -260,8 +260,8 @@ public UInt256 Multiply_UInt256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class MultiplySigned : SignedTwoParamBenchmarkBase { @@ -279,8 +279,8 @@ public Int256 Multiply_Int256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class MultiplyModUnsigned : UnsignedThreeParamBenchmarkBase { @@ -298,8 +298,8 @@ public UInt256 MultiplyMod_UInt256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class MultiplyModSigned : SignedThreeParamBenchmarkBase { @@ -317,8 +317,8 @@ public Int256 MultiplyMod_Int256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class DivideUnsigned : UnsignedTwoParamBenchmarkBase { @@ -336,8 +336,8 @@ public UInt256 Divide_UInt256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class DivideSigned : SignedTwoParamBenchmarkBase { @@ -355,8 +355,8 @@ public Int256 Divide_Int256() } } - 
[SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class ExpUnsigned : UnsignedIntTwoParamBenchmarkBase { @@ -374,8 +374,8 @@ public UInt256 Exp_UInt256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class ExpSigned : SignedIntTwoParamBenchmarkBase { @@ -393,8 +393,8 @@ public Int256 Exp_Int256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class ExpModUnsigned : UnsignedThreeParamBenchmarkBase { @@ -412,8 +412,8 @@ public UInt256 ExpMod_UInt256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class ExpModSigned : SignedBenchmarkBase { @@ -440,8 +440,8 @@ public Int256 ExpMod_Int256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class LeftShiftUnsigned : UnsignedIntTwoParamBenchmarkBase { @@ -459,8 +459,8 @@ public UInt256 LeftShift_UInt256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class LeftShiftSigned : SignedIntTwoParamBenchmarkBase { @@ -478,8 +478,8 @@ public Int256 LeftShift_Int256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class RightShiftUnsigned : UnsignedIntTwoParamBenchmarkBase { @@ -497,8 +497,8 @@ public UInt256 RightShift_UInt256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class RightShiftSigned : SignedIntTwoParamBenchmarkBase { @@ -516,8 +516,8 @@ public Int256 RightShift_Int256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class IsZeroOne { diff --git a/src/Nethermind.Int256.Benchmark/NoIntrinsicsJobAttribute.cs b/src/Nethermind.Int256.Benchmark/NoIntrinsicsJobAttribute.cs index d90679e..f3307cb 100644 --- a/src/Nethermind.Int256.Benchmark/NoIntrinsicsJobAttribute.cs +++ b/src/Nethermind.Int256.Benchmark/NoIntrinsicsJobAttribute.cs @@ -1,4 +1,4 @@ -using System; +using System; using BenchmarkDotNet.Attributes; using BenchmarkDotNet.Jobs; @@ -116,6 +116,8 @@ internal static Runtime GetRuntime(this RuntimeMoniker runtimeMoniker) return CoreRuntime.Core70; case RuntimeMoniker.Net80: return CoreRuntime.Core80; + case RuntimeMoniker.Net90: + return CoreRuntime.Core90; case RuntimeMoniker.Mono: return MonoRuntime.Default; case RuntimeMoniker.NativeAot60: From 
27f98c4eebca35ac2a7d36e366c1ab1e7966aeeb Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 05:00:06 +0000 Subject: [PATCH 15/38] Temp refactor --- src/Nethermind.Int256/UInt256.cs | 486 +++++++++++++++---------------- 1 file changed, 238 insertions(+), 248 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 95149d7..16adf6f 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1002,273 +1002,263 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) { if ((x.u1 | x.u2 | x.u3 | y.u1 | y.u2 | y.u3) == 0) { - // Fast multiply for numbers less than 2^64 (18,446,744,073,709,551,615) - ulong high = Math.BigMul(x.u0, y.u0, out ulong low); - // Assignment to res after multiply in case is used as input for x or y (by ref aliasing) - res = default; - Unsafe.AsRef(in res.u0) = low; - Unsafe.AsRef(in res.u1) = high; + MultiplyULong(x, y, out res); return; } - if (!Avx512F.IsSupported || !Avx512DQ.IsSupported) { - ref ulong rx = ref Unsafe.As(ref Unsafe.AsRef(in x)); - ref ulong ry = ref Unsafe.As(ref Unsafe.AsRef(in y)); - - (ulong carry, ulong r0) = Multiply64(rx, ry); - UmulHop(carry, Unsafe.Add(ref rx, 1), ry, out carry, out ulong res1); - UmulHop(carry, Unsafe.Add(ref rx, 2), ry, out carry, out ulong res2); - ulong res3 = Unsafe.Add(ref rx, 3) * ry + carry; - - UmulHop(res1, rx, Unsafe.Add(ref ry, 1), out carry, out ulong r1); - UmulStep(res2, Unsafe.Add(ref rx, 1), Unsafe.Add(ref ry, 1), carry, out carry, out res2); - res3 = res3 + Unsafe.Add(ref rx, 2) * Unsafe.Add(ref ry, 1) + carry; + MultiplyNonAvx512(x, y, out res); + return; + } - UmulHop(res2, rx, Unsafe.Add(ref ry, 2), out carry, out ulong r2); - res3 = res3 + Unsafe.Add(ref rx, 1) * Unsafe.Add(ref ry, 2) + carry; + // 1. Load the 256‐bit inputs into 256‐bit vector registers. + Vector256 aVector = Unsafe.As>(ref Unsafe.AsRef(in x)); + Vector256 bVector = Unsafe.As>(ref Unsafe.AsRef(in y)); + + // 2. Rearrange the 64‐bit limbs into 512‐bit vectors. + Vector256 aPerm0 = Avx2.Permute4x64(aVector, 16); + Vector256 aPerm1 = Avx2.Permute4x64(aVector, 73); + Vector512 rearrangedA = Vector512.Create(aPerm0, aPerm1); + + Vector256 bPerm0 = Avx2.Permute4x64(bVector, 132); + Vector256 bPerm1 = Avx2.Permute4x64(bVector, 177); + Vector512 rearrangedB = Vector512.Create(bPerm0, bPerm1); + + // 3. Multiply the corresponding 64‐bit limbs. + + // Mask for the lower 32 bits. + Vector512 mask32 = Vector512.Create(0xFFFFFFFFUL); + + // Split each 64-bit operand into 32-bit halves: + // a0 = lower 32 bits, a1 = upper 32 bits + Vector512 a0 = Avx512F.And(rearrangedA, mask32); + Vector512 a1 = Avx512F.ShiftRightLogical(rearrangedA, 32); + Vector512 b0 = Avx512F.And(rearrangedB, mask32); + Vector512 b1 = Avx512F.ShiftRightLogical(rearrangedB, 32); + + // Compute the four 32x32 partial products. + // Each multiplication here is on 32-bit values, so the result fits in 64 bits. + Vector512 u0 = Avx512DQ.MultiplyLow(a0, b0); // a0 * b0 + Vector512 u1 = Avx512DQ.MultiplyLow(a0, b1); // a0 * b1 + Vector512 u2 = Avx512DQ.MultiplyLow(a1, b0); // a1 * b0 + Vector512 u3 = Avx512DQ.MultiplyLow(a1, b1); // a1 * b1 + + // Now, compute t = (u0 >> 32) + (u1 & mask32) + (u2 & mask32) + Vector512 u0_hi = Avx512F.ShiftRightLogical(u0, 32); + Vector512 u1_lo = Avx512F.And(u1, mask32); + Vector512 u2_lo = Avx512F.And(u2, mask32); + Vector512 t = Avx512F.Add(Avx512F.Add(u0_hi, u1_lo), u2_lo); + + // The extra carry: c = t >> 32. 
+ Vector512 c = Avx512F.ShiftRightLogical(t, 32); + + // Now, assemble the lower 64 bits: + // low part of u0 is u0 & mask32; low 32 bits of t are (t & mask32) shifted left 32. + Vector512 u0_lo = Avx512F.And(u0, mask32); + Vector512 t_lo = Avx512F.And(t, mask32); + Vector512 partialLo = Avx512F.Or(u0_lo, Avx512F.ShiftLeftLogical(t_lo, 32)); + + // The high 64 bits are: u3 + (u1 >> 32) + (u2 >> 32) + c. + Vector512 u1_hi = Avx512F.ShiftRightLogical(u1, 32); + Vector512 u2_hi = Avx512F.ShiftRightLogical(u2, 32); + Vector512 partialHi = Avx512F.Add(Avx512F.Add(Avx512F.Add(u3, u1_hi), u2_hi), c); + + // 4. Rearrange the six “group‑1” products (prod00, prod01, prod10, prod02, prod11, prod20) + // into 128‑bit quantities. (Here we use the AVX‑512 “extract 128‑bit” function to get two adjacent 64‑bit lanes.) + // – Products 0 and 1 come from index 0: + Vector128 pair01Lo = Avx512F.ExtractVector128(partialLo, 0); // lanes 0–1: prod00_lo, prod01_lo + Vector128 pair01Hi = Avx512F.ExtractVector128(partialHi, 0); // lanes 0–1: prod00_hi, prod01_hi + // Unpack lower (lane0) and upper (lane1) to form product0 and product1: + Vector128 prod0 = Sse2.UnpackLow(pair01Lo, pair01Hi); // prod00 = {lo, hi} + Vector128 prod1 = Sse2.UnpackHigh(pair01Lo, pair01Hi); // prod01 = {lo, hi} + + // – Products 2 and 3 come from index 1: + Vector128 pair23Lo = Avx512F.ExtractVector128(partialLo, 1); // lanes 2–3: prod10_lo, prod02_lo + Vector128 pair23Hi = Avx512F.ExtractVector128(partialHi, 1); // lanes 2–3: prod10_hi, prod02_hi + Vector128 prod2 = Sse2.UnpackLow(pair23Lo, pair23Hi); // prod10 + Vector128 prod3 = Sse2.UnpackHigh(pair23Lo, pair23Hi); // prod02 + + // – Products 4 and 5 come from index 2: + Vector128 pair45Lo = Avx512F.ExtractVector128(partialLo, 2); // lanes 4–5: prod11_lo, prod20_lo + Vector128 pair45Hi = Avx512F.ExtractVector128(partialHi, 2); // lanes 4–5: prod11_hi, prod20_hi + Vector128 prod4 = Sse2.UnpackLow(pair45Lo, pair45Hi); // prod11 + Vector128 prod5 = Sse2.UnpackHigh(pair45Lo, pair45Hi); // prod20 + + // 5. Group‑1 “cross‑term” addition: + // crossSum = prod01 + prod10 (i.e. add the 128‑bit numbers prod1 and prod2) + Vector128 crossSum = Add128(prod1, prod2); + + // 6. Add the low half of crossSum (i.e. its lower 64 bits) to prod00’s high limb. + // Instead of extracting a scalar, we broadcast the lower 64 bits to a vector. + // (Assume BroadcastLower128 returns a copy with both lanes equal to element0.) + Vector128 csLow = BroadcastLower128(crossSum); + // Create a mask to add only to the high lane: mask = {0, ulong.MaxValue} + Vector128 highMask = Vector128.Create(0ul, ulong.MaxValue); + Vector128 addMask = Sse2.And(csLow, highMask); + Vector128 prod0Updated = Sse2.Add(prod0, addMask); + + // Now, compute the “carry” from that addition. (Again, we must compute a one‐bit flag.) + uint carryFlag = (uint)Sse2.MoveMask(Avx512F.VL.CompareLessThan( + ExtractHighLimb(prod0Updated), // compare updated high limb... + ExtractHighLimb(prod0) // ...with the original high limb + ).AsByte()) & 1; + // Add the carry to the high half of crossSum. (Broadcast the carry into a 128‑bit vector.) + Vector128 csHigh = BroadcastUpper128(crossSum); + Vector128 limb2 = Sse2.Add(csHigh, Vector128.CreateScalar((ulong)carryFlag)); + + // And form limb3 from a comparison of prod01’s high limb with crossSum’s high: + uint limb3 = (uint)(Sse2.MoveMask(Avx512F.VL.CompareGreaterThan( + ExtractHighLimb(prod1), csHigh).AsByte()) & 1); + Vector128 limb3Vec = Vector128.CreateScalar((ulong)limb3); + + // 7. 
Build the 256‑bit “intermediate” result from group‑1: + // Lower 128 bits = prod00 (with updated high limb) + // Upper 128 bits = (limb2, limb3) packed into a 128‑bit vector. + Vector128 lowerIntermediate = prod0Updated; + // Pack limb2 into the lower half and limb3 into the upper half. + Vector128 upperIntermediate = Sse2.UnpackLow(limb2, limb3Vec); + Vector256 intermediateResult = Vector256.Create(lowerIntermediate, upperIntermediate); + + // 8. Process group‑2: (prod02, prod11, prod20) + Vector128 group2Sum = Add128(prod3, prod4); + Vector128 totalGroup2 = Add128(group2Sum, prod5); + // Add totalGroup2 into the current upper 128 bits of intermediateResult. + Vector128 currentUpper = intermediateResult.GetUpper(); + Vector128 newUpper = Add128(currentUpper, totalGroup2); + intermediateResult = WithUpper(intermediateResult, newUpper); + + // 9. Process group‑3: + // Multiply “aHigh” and “bLow” (with the proper reversed order) then add in the remaining lower parts. + Vector128 aHigh = Vector128.Create(x.u2, x.u3); + Vector128 bLow = Vector128.Create(y.u1, y.u0); + // Use the AVX512DQ MultiplyLow intrinsic (which multiplies 64‑bit integers and returns the low 64 bits) + Vector128 finalProdLow = Avx512DQ.VL.MultiplyLow(aHigh, bLow); + + // Extract from partialLo the two lower parts for prod03 and prod12. + // With partialLo logically split into Lower (lanes 0–3) and Upper (lanes 4–7), + // lanes 6 and 7 are in the Upper half; extracting the second 128‐bit portion of Upper gives us these lanes. + Vector128 prod6 = Avx2.ExtractVector128(partialLo.GetUpper(), 1); + // Extract from index 3 the two lower‐parts from prod03 and prod12 (which we stored in “prod6”): + // (Note: prod6 already holds both lower parts.) + finalProdLow = Sse2.Add(finalProdLow, prod6); + // Now perform a horizontal add so that the two 64‑bit lanes collapse to a single 64‑bit value. + Vector128 horizontalSum = HorizontalAdd(finalProdLow); + // Add the horizontal sum (broadcast into the high lane) to the most–significant limb of intermediateResult. + Vector128 upperTemp = intermediateResult.GetUpper(); + Vector128 hsBroadcast = Sse2.And(BroadcastLower128(horizontalSum), highMask); + Vector128 newUpperTemp = Sse2.Add(upperTemp, hsBroadcast); + intermediateResult = WithUpper(intermediateResult, newUpperTemp); + + // 10. Write out the final 256‑bit result. + Unsafe.SkipInit(out res); + Unsafe.As>(ref res) = intermediateResult; + + static Vector128 HorizontalAdd(Vector128 vec) + { + // Reinterpret the 64-bit integer vector as a vector of two doubles. + // Then use _mm_shuffle_pd (exposed as Sse2.Shuffle for doubles) to swap the two lanes. + Vector128 swapped = Sse2.Shuffle(vec.AsDouble(), vec.AsDouble(), 0x1).AsUInt64(); + + // Add the original vector and the swapped vector. + // This results in a vector where both lanes equal (vec[0] + vec[1]). + return Sse2.Add(vec, swapped); + } - ulong r3 = res3 + rx * Unsafe.Add(ref ry, 3); + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static Vector256 WithUpper(Vector256 vec, Vector128 upper) + { + // Replace the upper 128 bits of vec with upper. + return Avx2.InsertVector128(vec, upper, 1); + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static Vector128 ExtractHighLimb(Vector128 vec) + { + // Reinterpret the 64-bit vector as 32-bit elements, shuffle to replicate the upper 64-bit limb, + // then reinterpret back as 64-bit. 
+ return Sse2.Shuffle(vec.AsUInt32(), 0xEE).AsUInt64(); + } - res = new UInt256(r0, r1, r2, r3); + // Helpers to “broadcast” the lower or upper 64‐bit lane of a Vector128. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static Vector128 BroadcastLower128(Vector128 vec) + { + // Replicate element0 to both lanes. + return Sse2.Shuffle(vec.AsDouble(), vec.AsDouble(), 0).AsUInt64(); } - else + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static Vector128 BroadcastUpper128(Vector128 vec) + { + // Replicate element1 to both lanes. + return Sse2.Shuffle(vec.AsDouble(), vec.AsDouble(), 3).AsUInt64(); // 0xFF means both lanes come from the original element1 + } + /// + /// Adds two 128-bit unsigned integers while propagating an overflow (carry) from the lower 64-bit lane to the higher lane. + /// Each 128-bit integer is represented as a , with element 0 holding the lower 64 bits + /// and element 1 holding the higher 64 bits. + /// + /// The first 128-bit unsigned integer operand. + /// The second 128-bit unsigned integer operand. + /// + /// A representing the sum of the two operands, with any carry from the lower lane added + /// into the higher lane. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static Vector128 Add128(Vector128 left, Vector128 right) { - // 1. Load the 256‐bit inputs into 256‐bit vector registers. - Vector256 aVector = Unsafe.As>(ref Unsafe.AsRef(in x)); - Vector256 bVector = Unsafe.As>(ref Unsafe.AsRef(in y)); - - // 2. Rearrange the 64‐bit limbs into 512‐bit vectors. - Vector256 aPerm0 = Avx2.Permute4x64(aVector, 16); - Vector256 aPerm1 = Avx2.Permute4x64(aVector, 73); - Vector512 rearrangedA = Vector512.Create(aPerm0, aPerm1); - - Vector256 bPerm0 = Avx2.Permute4x64(bVector, 132); - Vector256 bPerm1 = Avx2.Permute4x64(bVector, 177); - Vector512 rearrangedB = Vector512.Create(bPerm0, bPerm1); - - // 3. Multiply the corresponding 64‐bit limbs. - Mul64Vector(rearrangedA, rearrangedB, out Vector512 partialLo, out Vector512 partialHi); - - // 4. Rearrange the six “group‑1” products (prod00, prod01, prod10, prod02, prod11, prod20) - // into 128‑bit quantities. (Here we use the AVX‑512 “extract 128‑bit” function to get two adjacent 64‑bit lanes.) - // – Products 0 and 1 come from index 0: - Vector128 pair01Lo = Avx512F.ExtractVector128(partialLo, 0); // lanes 0–1: prod00_lo, prod01_lo - Vector128 pair01Hi = Avx512F.ExtractVector128(partialHi, 0); // lanes 0–1: prod00_hi, prod01_hi - // Unpack lower (lane0) and upper (lane1) to form product0 and product1: - Vector128 prod0 = Sse2.UnpackLow(pair01Lo, pair01Hi); // prod00 = {lo, hi} - Vector128 prod1 = Sse2.UnpackHigh(pair01Lo, pair01Hi); // prod01 = {lo, hi} - - // – Products 2 and 3 come from index 1: - Vector128 pair23Lo = Avx512F.ExtractVector128(partialLo, 1); // lanes 2–3: prod10_lo, prod02_lo - Vector128 pair23Hi = Avx512F.ExtractVector128(partialHi, 1); // lanes 2–3: prod10_hi, prod02_hi - Vector128 prod2 = Sse2.UnpackLow(pair23Lo, pair23Hi); // prod10 - Vector128 prod3 = Sse2.UnpackHigh(pair23Lo, pair23Hi); // prod02 - - // – Products 4 and 5 come from index 2: - Vector128 pair45Lo = Avx512F.ExtractVector128(partialLo, 2); // lanes 4–5: prod11_lo, prod20_lo - Vector128 pair45Hi = Avx512F.ExtractVector128(partialHi, 2); // lanes 4–5: prod11_hi, prod20_hi - Vector128 prod4 = Sse2.UnpackLow(pair45Lo, pair45Hi); // prod11 - Vector128 prod5 = Sse2.UnpackHigh(pair45Lo, pair45Hi); // prod20 - - // 5. Group‑1 “cross‑term” addition: - // crossSum = prod01 + prod10 (i.e. 
add the 128‑bit numbers prod1 and prod2) - Vector128 crossSum = Add128(prod1, prod2); - - // 6. Add the low half of crossSum (i.e. its lower 64 bits) to prod00’s high limb. - // Instead of extracting a scalar, we broadcast the lower 64 bits to a vector. - // (Assume BroadcastLower128 returns a copy with both lanes equal to element0.) - Vector128 csLow = BroadcastLower128(crossSum); - // Create a mask to add only to the high lane: mask = {0, ulong.MaxValue} - Vector128 highMask = Vector128.Create(0ul, ulong.MaxValue); - Vector128 addMask = Sse2.And(csLow, highMask); - Vector128 prod0Updated = Sse2.Add(prod0, addMask); - - // Now, compute the “carry” from that addition. (Again, we must compute a one‐bit flag.) - uint carryFlag = (uint)Sse2.MoveMask(Avx512F.VL.CompareLessThan( - ExtractHighLimb(prod0Updated), // compare updated high limb... - ExtractHighLimb(prod0) // ...with the original high limb - ).AsByte()) & 1; - // Add the carry to the high half of crossSum. (Broadcast the carry into a 128‑bit vector.) - Vector128 csHigh = BroadcastUpper128(crossSum); - Vector128 limb2 = Sse2.Add(csHigh, Vector128.CreateScalar((ulong)carryFlag)); - - // And form limb3 from a comparison of prod01’s high limb with crossSum’s high: - uint limb3 = (uint)(Sse2.MoveMask(Avx512F.VL.CompareGreaterThan( - ExtractHighLimb(prod1), csHigh).AsByte()) & 1); - Vector128 limb3Vec = Vector128.CreateScalar((ulong)limb3); - - // 7. Build the 256‑bit “intermediate” result from group‑1: - // Lower 128 bits = prod00 (with updated high limb) - // Upper 128 bits = (limb2, limb3) packed into a 128‑bit vector. - Vector128 lowerIntermediate = prod0Updated; - // Pack limb2 into the lower half and limb3 into the upper half. - Vector128 upperIntermediate = Sse2.UnpackLow(limb2, limb3Vec); - Vector256 intermediateResult = Vector256.Create(lowerIntermediate, upperIntermediate); - - // 8. Process group‑2: (prod02, prod11, prod20) - Vector128 group2Sum = Add128(prod3, prod4); - Vector128 totalGroup2 = Add128(group2Sum, prod5); - // Add totalGroup2 into the current upper 128 bits of intermediateResult. - Vector128 currentUpper = GetUpper(intermediateResult); - Vector128 newUpper = Add128(currentUpper, totalGroup2); - intermediateResult = WithUpper(intermediateResult, newUpper); - - // 9. Process group‑3: - // Multiply “aHigh” and “bLow” (with the proper reversed order) then add in the remaining lower parts. - Vector128 aHigh = Vector128.Create(x.u2, x.u3); - Vector128 bLow = Vector128.Create(y.u1, y.u0); - // Use the AVX512DQ MultiplyLow intrinsic (which multiplies 64‑bit integers and returns the low 64 bits) - Vector128 finalProdLow = Avx512DQ.VL.MultiplyLow(aHigh, bLow); - - // Extract from partialLo the two lower parts for prod03 and prod12. - // With partialLo logically split into Lower (lanes 0–3) and Upper (lanes 4–7), - // lanes 6 and 7 are in the Upper half; extracting the second 128‐bit portion of Upper gives us these lanes. - Vector128 prod6 = Avx2.ExtractVector128(partialLo.GetUpper(), 1); - // Extract from index 3 the two lower‐parts from prod03 and prod12 (which we stored in “prod6”): - // (Note: prod6 already holds both lower parts.) - finalProdLow = Sse2.Add(finalProdLow, prod6); - // Now perform a horizontal add so that the two 64‑bit lanes collapse to a single 64‑bit value. - Vector128 horizontalSum = HorizontalAdd(finalProdLow); - // Add the horizontal sum (broadcast into the high lane) to the most–significant limb of intermediateResult. 
- Vector128 upperTemp = GetUpper(intermediateResult); - Vector128 hsBroadcast = Sse2.And(BroadcastLower128(horizontalSum), highMask); - Vector128 newUpperTemp = Sse2.Add(upperTemp, hsBroadcast); - intermediateResult = WithUpper(intermediateResult, newUpperTemp); - - // 10. Write out the final 256‑bit result. - Unsafe.SkipInit(out res); - Unsafe.As>(ref res) = intermediateResult; + // Perform a lane-wise addition of the two operands. + Vector128 sum = Sse2.Add(left, right); - static Vector128 HorizontalAdd(Vector128 vec) - { - // Reinterpret the 64-bit integer vector as a vector of two doubles. - // Then use _mm_shuffle_pd (exposed as Sse2.Shuffle for doubles) to swap the two lanes. - Vector128 swapped = Sse2.Shuffle(vec.AsDouble(), vec.AsDouble(), 0x1).AsUInt64(); + // For unsigned addition, an overflow in a lane occurs if the result is less than one of the operands. + // Comparing 'sum' with 'operand1' produces a mask where each 64-bit lane is all ones if an overflow occurred, or zero otherwise. + Vector128 overflowMask = Avx512F.VL.CompareLessThan(sum, left); - // Add the original vector and the swapped vector. - // This results in a vector where both lanes equal (vec[0] + vec[1]). - return Sse2.Add(vec, swapped); - } + // Normalize the overflow mask: shift each 64-bit lane right by 63 bits. + // This converts a full mask (0xFFFFFFFFFFFFFFFF) to 1, leaving lanes with no overflow as 0. + overflowMask = Sse2.ShiftRightLogical(overflowMask, 63); - // Helpers that mimic “GetUpper” and “WithUpper” on a 256‑bit vector. - // (You might implement these as extension methods on Vector256.) - [MethodImpl(MethodImplOptions.AggressiveInlining)] - static Vector128 GetUpper(Vector256 vec) - { - // For example, using Avx2.ExtractVector128: - return Avx2.ExtractVector128(vec, 1); - } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - static Vector256 WithUpper(Vector256 vec, Vector128 upper) - { - // Replace the upper 128 bits of vec with upper. - return Avx2.InsertVector128(vec, upper, 1); - } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - static Vector128 ExtractHighLimb(Vector128 vec) - { - // Reinterpret the 64-bit vector as 32-bit elements, shuffle to replicate the upper 64-bit limb, - // then reinterpret back as 64-bit. - return Sse2.Shuffle(vec.AsUInt32(), 0xEE).AsUInt64(); - } + // Promote the carry from the lower lane (element 0) into the upper lane. + // First, swap the two 64-bit lanes so that the lower lane's carry moves to the higher lane. + Vector128 swappedCarry = Sse2.Shuffle(overflowMask.AsDouble(), overflowMask.AsDouble(), 0x1).AsUInt64(); - // Helpers to “broadcast” the lower or upper 64‐bit lane of a Vector128. - [MethodImpl(MethodImplOptions.AggressiveInlining)] - static Vector128 BroadcastLower128(Vector128 vec) - { - // Replicate element0 to both lanes. - return Sse2.Shuffle(vec.AsDouble(), vec.AsDouble(), 0).AsUInt64(); - } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - static Vector128 BroadcastUpper128(Vector128 vec) - { - // Replicate element1 to both lanes. - return Sse2.Shuffle(vec.AsDouble(), vec.AsDouble(), 3).AsUInt64(); // 0xFF means both lanes come from the original element1 - } - /// - /// Adds two 128-bit unsigned integers while propagating an overflow (carry) from the lower 64-bit lane to the higher lane. - /// Each 128-bit integer is represented as a , with element 0 holding the lower 64 bits - /// and element 1 holding the higher 64 bits. - /// - /// The first 128-bit unsigned integer operand. - /// The second 128-bit unsigned integer operand. 
- /// - /// A representing the sum of the two operands, with any carry from the lower lane added - /// into the higher lane. - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - static Vector128 Add128(Vector128 left, Vector128 right) - { - // Perform a lane-wise addition of the two operands. - Vector128 sum = Sse2.Add(left, right); + // Next, clear the (now swapped) lower lane by blending with a zero vector. + // The immediate mask 0x1 indicates that lane 0 should come from the zero vector and lane 1 remains unchanged. + Vector128 promotedCarry = Sse41.Blend(swappedCarry.AsDouble(), Vector128.Zero, 0x1).AsUInt64(); - // For unsigned addition, an overflow in a lane occurs if the result is less than one of the operands. - // Comparing 'sum' with 'operand1' produces a mask where each 64-bit lane is all ones if an overflow occurred, or zero otherwise. - Vector128 overflowMask = Avx512F.VL.CompareLessThan(sum, left); + // Add the propagated carry to the sum. + return Sse2.Add(sum, promotedCarry); + } + } - // Normalize the overflow mask: shift each 64-bit lane right by 63 bits. - // This converts a full mask (0xFFFFFFFFFFFFFFFF) to 1, leaving lanes with no overflow as 0. - overflowMask = Sse2.ShiftRightLogical(overflowMask, 63); + private static void MultiplyNonAvx512(UInt256 x, UInt256 y, out UInt256 res) + { + ref ulong rx = ref Unsafe.As(ref Unsafe.AsRef(in x)); + ref ulong ry = ref Unsafe.As(ref Unsafe.AsRef(in y)); - // Promote the carry from the lower lane (element 0) into the upper lane. - // First, swap the two 64-bit lanes so that the lower lane's carry moves to the higher lane. - Vector128 swappedCarry = Sse2.Shuffle(overflowMask.AsDouble(), overflowMask.AsDouble(), 0x1).AsUInt64(); + (ulong carry, ulong r0) = Multiply64(rx, ry); + UmulHop(carry, Unsafe.Add(ref rx, 1), ry, out carry, out ulong res1); + UmulHop(carry, Unsafe.Add(ref rx, 2), ry, out carry, out ulong res2); + ulong res3 = Unsafe.Add(ref rx, 3) * ry + carry; - // Next, clear the (now swapped) lower lane by blending with a zero vector. - // The immediate mask 0x1 indicates that lane 0 should come from the zero vector and lane 1 remains unchanged. - Vector128 promotedCarry = Sse41.Blend(swappedCarry.AsDouble(), Vector128.Zero, 0x1).AsUInt64(); + UmulHop(res1, rx, Unsafe.Add(ref ry, 1), out carry, out ulong r1); + UmulStep(res2, Unsafe.Add(ref rx, 1), Unsafe.Add(ref ry, 1), carry, out carry, out res2); + res3 = res3 + Unsafe.Add(ref rx, 2) * Unsafe.Add(ref ry, 1) + carry; - // Add the propagated carry to the sum. - return Sse2.Add(sum, promotedCarry); - } - } + UmulHop(res2, rx, Unsafe.Add(ref ry, 2), out carry, out ulong r2); + res3 = res3 + Unsafe.Add(ref rx, 1) * Unsafe.Add(ref ry, 2) + carry; - // Vectorized 64x64 multiply: given vectors 'a' and 'b' (each 8 lanes), - // computes per lane: - // product = a * b = (hi, lo) - // using the splitting method since there is no MultiplyHigh intrinsic. - [MethodImpl(MethodImplOptions.AggressiveInlining)] - static void Mul64Vector(Vector512 a, Vector512 b, - out Vector512 lo, out Vector512 hi) - { - // Mask for the lower 32 bits. - Vector512 mask32 = Vector512.Create(0xFFFFFFFFUL); - - // Split each 64-bit operand into 32-bit halves: - // a0 = lower 32 bits, a1 = upper 32 bits - Vector512 a0 = Avx512F.And(a, mask32); - Vector512 a1 = Avx512F.ShiftRightLogical(a, 32); - Vector512 b0 = Avx512F.And(b, mask32); - Vector512 b1 = Avx512F.ShiftRightLogical(b, 32); - - // Compute the four 32x32 partial products. 
- // Each multiplication here is on 32-bit values, so the result fits in 64 bits. - Vector512 u0 = Avx512DQ.MultiplyLow(a0, b0); // a0 * b0 - Vector512 u1 = Avx512DQ.MultiplyLow(a0, b1); // a0 * b1 - Vector512 u2 = Avx512DQ.MultiplyLow(a1, b0); // a1 * b0 - Vector512 u3 = Avx512DQ.MultiplyLow(a1, b1); // a1 * b1 - - // Now, compute t = (u0 >> 32) + (u1 & mask32) + (u2 & mask32) - Vector512 u0_hi = Avx512F.ShiftRightLogical(u0, 32); - Vector512 u1_lo = Avx512F.And(u1, mask32); - Vector512 u2_lo = Avx512F.And(u2, mask32); - Vector512 t = Avx512F.Add(Avx512F.Add(u0_hi, u1_lo), u2_lo); - - // The extra carry: c = t >> 32. - Vector512 c = Avx512F.ShiftRightLogical(t, 32); - - // Now, assemble the lower 64 bits: - // low part of u0 is u0 & mask32; low 32 bits of t are (t & mask32) shifted left 32. - Vector512 u0_lo = Avx512F.And(u0, mask32); - Vector512 t_lo = Avx512F.And(t, mask32); - lo = Avx512F.Or(u0_lo, Avx512F.ShiftLeftLogical(t_lo, 32)); - - // The high 64 bits are: u3 + (u1 >> 32) + (u2 >> 32) + c. - Vector512 u1_hi = Avx512F.ShiftRightLogical(u1, 32); - Vector512 u2_hi = Avx512F.ShiftRightLogical(u2, 32); - hi = Avx512F.Add(Avx512F.Add(Avx512F.Add(u3, u1_hi), u2_hi), c); - } + ulong r3 = res3 + rx * Unsafe.Add(ref ry, 3); + + res = new UInt256(r0, r1, r2, r3); + } + + private static void MultiplyULong(UInt256 x, UInt256 y, out UInt256 res) + { + // Fast multiply for numbers less than 2^64 (18,446,744,073,709,551,615) + ulong high = Math.BigMul(x.u0, y.u0, out ulong low); + // Assignment to res after multiply in case is used as input for x or y (by ref aliasing) + res = default; + Unsafe.AsRef(in res.u0) = low; + Unsafe.AsRef(in res.u1) = high; } public void Multiply(in UInt256 a, out UInt256 res) => Multiply(this, a, out res); From ff5584171858d1f17695b8f191affb4a6b7a8e44 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 06:15:16 +0000 Subject: [PATCH 16/38] Improved comments --- src/Nethermind.Int256/UInt256.cs | 69 +++++++++++++++++--------------- 1 file changed, 37 insertions(+), 32 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 16adf6f..3fe2716 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1012,56 +1012,61 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) } // 1. Load the 256‐bit inputs into 256‐bit vector registers. - Vector256 aVector = Unsafe.As>(ref Unsafe.AsRef(in x)); - Vector256 bVector = Unsafe.As>(ref Unsafe.AsRef(in y)); + Vector256 x0123 = Unsafe.As>(ref Unsafe.AsRef(in x)); + Vector256 y0123 = Unsafe.As>(ref Unsafe.AsRef(in y)); - // 2. Rearrange the 64‐bit limbs into 512‐bit vectors. - Vector256 aPerm0 = Avx2.Permute4x64(aVector, 16); - Vector256 aPerm1 = Avx2.Permute4x64(aVector, 73); - Vector512 rearrangedA = Vector512.Create(aPerm0, aPerm1); + // Mask for the lower 32 bits. + Vector512 mask32 = Vector512.Create(0xFFFFFFFFUL); - Vector256 bPerm0 = Avx2.Permute4x64(bVector, 132); - Vector256 bPerm1 = Avx2.Permute4x64(bVector, 177); - Vector512 rearrangedB = Vector512.Create(bPerm0, bPerm1); + // 2. Rearrange the 64‐bit limbs into 512‐bit vectors. 
+ // x0010 = [ x0, x0, x1, x0 ] + Vector256 x0010 = Avx2.Permute4x64(x0123, 16); + // x1201 = [ x1, x2, x0, x1 ] + Vector256 x1201 = Avx2.Permute4x64(x0123, 73); + // x00101201 = [ x0, x0, x1, x0, x1, x2, x0, x1 ] + Vector512 x00101201 = Vector512.Create(x0010, x1201); + + // y0102 = [ y0, y1, y0, y2 ] + Vector256 y0102 = Avx2.Permute4x64(y0123, 132); + // y1032 = [ y1, y0, y3, y2 ] + Vector256 y1032 = Avx2.Permute4x64(y0123, 177); + // y01021032 = [ y0, y1, y0, y2, y1, y0, y3, y2 ] + Vector512 y01021032 = Vector512.Create(y0102, y1032); // 3. Multiply the corresponding 64‐bit limbs. - // Mask for the lower 32 bits. - Vector512 mask32 = Vector512.Create(0xFFFFFFFFUL); - // Split each 64-bit operand into 32-bit halves: - // a0 = lower 32 bits, a1 = upper 32 bits - Vector512 a0 = Avx512F.And(rearrangedA, mask32); - Vector512 a1 = Avx512F.ShiftRightLogical(rearrangedA, 32); - Vector512 b0 = Avx512F.And(rearrangedB, mask32); - Vector512 b1 = Avx512F.ShiftRightLogical(rearrangedB, 32); + Vector512 xLo = Avx512F.And(x00101201, mask32); + Vector512 xHi = Avx512F.ShiftRightLogical(x00101201, 32); + Vector512 yLo = Avx512F.And(y01021032, mask32); + Vector512 yHi = Avx512F.ShiftRightLogical(y01021032, 32); // Compute the four 32x32 partial products. // Each multiplication here is on 32-bit values, so the result fits in 64 bits. - Vector512 u0 = Avx512DQ.MultiplyLow(a0, b0); // a0 * b0 - Vector512 u1 = Avx512DQ.MultiplyLow(a0, b1); // a0 * b1 - Vector512 u2 = Avx512DQ.MultiplyLow(a1, b0); // a1 * b0 - Vector512 u3 = Avx512DQ.MultiplyLow(a1, b1); // a1 * b1 + Vector512 u0 = Avx512DQ.MultiplyLow(xLo, yLo); + Vector512 u1 = Avx512DQ.MultiplyLow(xLo, yHi); + Vector512 u2 = Avx512DQ.MultiplyLow(xHi, yLo); + Vector512 u3 = Avx512DQ.MultiplyLow(xHi, yHi); // Now, compute t = (u0 >> 32) + (u1 & mask32) + (u2 & mask32) - Vector512 u0_hi = Avx512F.ShiftRightLogical(u0, 32); - Vector512 u1_lo = Avx512F.And(u1, mask32); - Vector512 u2_lo = Avx512F.And(u2, mask32); - Vector512 t = Avx512F.Add(Avx512F.Add(u0_hi, u1_lo), u2_lo); + Vector512 u0Hi = Avx512F.ShiftRightLogical(u0, 32); + Vector512 u1Lo = Avx512F.And(u1, mask32); + Vector512 u2Lo = Avx512F.And(u2, mask32); + Vector512 t = Avx512F.Add(Avx512F.Add(u0Hi, u1Lo), u2Lo); // The extra carry: c = t >> 32. - Vector512 c = Avx512F.ShiftRightLogical(t, 32); + Vector512 carry = Avx512F.ShiftRightLogical(t, 32); // Now, assemble the lower 64 bits: // low part of u0 is u0 & mask32; low 32 bits of t are (t & mask32) shifted left 32. - Vector512 u0_lo = Avx512F.And(u0, mask32); - Vector512 t_lo = Avx512F.And(t, mask32); - Vector512 partialLo = Avx512F.Or(u0_lo, Avx512F.ShiftLeftLogical(t_lo, 32)); + Vector512 u0Lo = Avx512F.And(u0, mask32); + Vector512 tLo = Avx512F.And(t, mask32); + Vector512 partialLo = Avx512F.Or(u0Lo, Avx512F.ShiftLeftLogical(tLo, 32)); // The high 64 bits are: u3 + (u1 >> 32) + (u2 >> 32) + c. - Vector512 u1_hi = Avx512F.ShiftRightLogical(u1, 32); - Vector512 u2_hi = Avx512F.ShiftRightLogical(u2, 32); - Vector512 partialHi = Avx512F.Add(Avx512F.Add(Avx512F.Add(u3, u1_hi), u2_hi), c); + Vector512 u1Hi = Avx512F.ShiftRightLogical(u1, 32); + Vector512 u2Hi = Avx512F.ShiftRightLogical(u2, 32); + Vector512 partialHi = Avx512F.Add(Avx512F.Add(Avx512F.Add(u3, u1Hi), u2Hi), carry); // 4. Rearrange the six “group‑1” products (prod00, prod01, prod10, prod02, prod11, prod20) // into 128‑bit quantities. (Here we use the AVX‑512 “extract 128‑bit” function to get two adjacent 64‑bit lanes.) 
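The lane-layout comments added by the "Improved comments" patch can be checked directly against the Permute4x64 immediates: result lane j takes source lane (imm >> (2 * j)) & 3, so 16 = 0b00_01_00_00 gives [ x0, x0, x1, x0 ], 73 = 0b01_00_10_01 gives [ x1, x2, x0, x1 ], 132 = 0b10_00_01_00 gives [ y0, y1, y0, y2 ], and 177 = 0b10_11_00_01 gives [ y1, y0, y3, y2 ]. A small decoder sketch for verifying such immediates (the helper name is illustrative and not part of the patch):

static int[] DecodePermute4x64(byte imm)
{
    // Each 2-bit field of the immediate selects a source lane, lowest field first.
    int[] lanes = new int[4];
    for (int j = 0; j < 4; j++)
        lanes[j] = (imm >> (2 * j)) & 3;
    return lanes; // e.g. DecodePermute4x64(16) => { 0, 0, 1, 0 }
}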
From 5c8329c31678d877ef4f7efbd198b84737e98cc7 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 07:16:42 +0000 Subject: [PATCH 17/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 3fe2716..a10f800 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1049,24 +1049,14 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) Vector512 u3 = Avx512DQ.MultiplyLow(xHi, yHi); // Now, compute t = (u0 >> 32) + (u1 & mask32) + (u2 & mask32) - Vector512 u0Hi = Avx512F.ShiftRightLogical(u0, 32); - Vector512 u1Lo = Avx512F.And(u1, mask32); - Vector512 u2Lo = Avx512F.And(u2, mask32); - Vector512 t = Avx512F.Add(Avx512F.Add(u0Hi, u1Lo), u2Lo); - - // The extra carry: c = t >> 32. - Vector512 carry = Avx512F.ShiftRightLogical(t, 32); + Vector512 t = Avx512F.Add(Avx512F.Add(Avx512F.ShiftRightLogical(u0, 32), Avx512F.And(u1, mask32)), Avx512F.And(u2, mask32)); // Now, assemble the lower 64 bits: // low part of u0 is u0 & mask32; low 32 bits of t are (t & mask32) shifted left 32. - Vector512 u0Lo = Avx512F.And(u0, mask32); - Vector512 tLo = Avx512F.And(t, mask32); - Vector512 partialLo = Avx512F.Or(u0Lo, Avx512F.ShiftLeftLogical(tLo, 32)); + Vector512 partialLo = Avx512F.Or(Avx512F.And(u0, mask32), Avx512F.ShiftLeftLogical(Avx512F.And(t, mask32), 32)); - // The high 64 bits are: u3 + (u1 >> 32) + (u2 >> 32) + c. - Vector512 u1Hi = Avx512F.ShiftRightLogical(u1, 32); - Vector512 u2Hi = Avx512F.ShiftRightLogical(u2, 32); - Vector512 partialHi = Avx512F.Add(Avx512F.Add(Avx512F.Add(u3, u1Hi), u2Hi), carry); + // The high 64 bits are: u3 + (u1 >> 32) + (u2 >> 32) + (t >> 32). + Vector512 partialHi = Avx512F.Add(Avx512F.Add(Avx512F.Add(u3, Avx512F.ShiftRightLogical(u1, 32)), Avx512F.ShiftRightLogical(u2, 32)), Avx512F.ShiftRightLogical(t, 32)); // 4. Rearrange the six “group‑1” products (prod00, prod01, prod10, prod02, prod11, prod20) // into 128‑bit quantities. (Here we use the AVX‑512 “extract 128‑bit” function to get two adjacent 64‑bit lanes.) @@ -1133,7 +1123,7 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) intermediateResult = WithUpper(intermediateResult, newUpper); // 9. Process group‑3: - // Multiply “aHigh” and “bLow” (with the proper reversed order) then add in the remaining lower parts. + // Multiply x23 and y10 (with the proper reversed order) then add in the remaining lower parts. Vector128 aHigh = Vector128.Create(x.u2, x.u3); Vector128 bLow = Vector128.Create(y.u1, y.u0); // Use the AVX512DQ MultiplyLow intrinsic (which multiplies 64‑bit integers and returns the low 64 bits) @@ -1149,10 +1139,7 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) // Now perform a horizontal add so that the two 64‑bit lanes collapse to a single 64‑bit value. Vector128 horizontalSum = HorizontalAdd(finalProdLow); // Add the horizontal sum (broadcast into the high lane) to the most–significant limb of intermediateResult. - Vector128 upperTemp = intermediateResult.GetUpper(); - Vector128 hsBroadcast = Sse2.And(BroadcastLower128(horizontalSum), highMask); - Vector128 newUpperTemp = Sse2.Add(upperTemp, hsBroadcast); - intermediateResult = WithUpper(intermediateResult, newUpperTemp); + intermediateResult = Avx2.Add(intermediateResult, Vector256.Create(default, Sse2.And(horizontalSum, highMask))); // 10. Write out the final 256‑bit result. 
Unsafe.SkipInit(out res); From cc6cac73695892df8d305b0c27ae95c39e2bb8ca Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 07:42:59 +0000 Subject: [PATCH 18/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index a10f800..ef3ea1e 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1086,10 +1086,7 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) // 6. Add the low half of crossSum (i.e. its lower 64 bits) to prod00’s high limb. // Instead of extracting a scalar, we broadcast the lower 64 bits to a vector. // (Assume BroadcastLower128 returns a copy with both lanes equal to element0.) - Vector128 csLow = BroadcastLower128(crossSum); - // Create a mask to add only to the high lane: mask = {0, ulong.MaxValue} - Vector128 highMask = Vector128.Create(0ul, ulong.MaxValue); - Vector128 addMask = Sse2.And(csLow, highMask); + Vector128 addMask = Sse2.Shuffle(Vector128.Zero, crossSum.AsDouble(), 0).AsUInt64(); Vector128 prod0Updated = Sse2.Add(prod0, addMask); // Now, compute the “carry” from that addition. (Again, we must compute a one‐bit flag.) @@ -1139,6 +1136,7 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) // Now perform a horizontal add so that the two 64‑bit lanes collapse to a single 64‑bit value. Vector128 horizontalSum = HorizontalAdd(finalProdLow); // Add the horizontal sum (broadcast into the high lane) to the most–significant limb of intermediateResult. + Vector128 highMask = Vector128.Create(0ul, ulong.MaxValue); intermediateResult = Avx2.Add(intermediateResult, Vector256.Create(default, Sse2.And(horizontalSum, highMask))); // 10. Write out the final 256‑bit result. @@ -1170,13 +1168,6 @@ static Vector128 ExtractHighLimb(Vector128 vec) return Sse2.Shuffle(vec.AsUInt32(), 0xEE).AsUInt64(); } - // Helpers to “broadcast” the lower or upper 64‐bit lane of a Vector128. - [MethodImpl(MethodImplOptions.AggressiveInlining)] - static Vector128 BroadcastLower128(Vector128 vec) - { - // Replicate element0 to both lanes. - return Sse2.Shuffle(vec.AsDouble(), vec.AsDouble(), 0).AsUInt64(); - } [MethodImpl(MethodImplOptions.AggressiveInlining)] static Vector128 BroadcastUpper128(Vector128 vec) { From bfaa88cfaa2766d4de2dad443b600aa9bed25ca7 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 07:50:47 +0000 Subject: [PATCH 19/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index ef3ea1e..a0facc1 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1136,8 +1136,9 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) // Now perform a horizontal add so that the two 64‑bit lanes collapse to a single 64‑bit value. Vector128 horizontalSum = HorizontalAdd(finalProdLow); // Add the horizontal sum (broadcast into the high lane) to the most–significant limb of intermediateResult. - Vector128 highMask = Vector128.Create(0ul, ulong.MaxValue); - intermediateResult = Avx2.Add(intermediateResult, Vector256.Create(default, Sse2.And(horizontalSum, highMask))); + // 2. 
Use a shuffle with a zero vector to directly form { 0, horizontalSum[0] } + Vector128 high = Sse2.Shuffle(Vector128.Zero, horizontalSum.AsDouble(), 0).AsUInt64(); + intermediateResult = Avx2.Add(intermediateResult, Vector256.Create(Vector128.Zero, high)); // 10. Write out the final 256‑bit result. Unsafe.SkipInit(out res); From f7152ad3ddd480f049703fce56530c3b9f8e2494 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 08:12:13 +0000 Subject: [PATCH 20/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index a0facc1..5872224 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1199,14 +1199,9 @@ static Vector128 Add128(Vector128 left, Vector128 right) // Normalize the overflow mask: shift each 64-bit lane right by 63 bits. // This converts a full mask (0xFFFFFFFFFFFFFFFF) to 1, leaving lanes with no overflow as 0. overflowMask = Sse2.ShiftRightLogical(overflowMask, 63); - - // Promote the carry from the lower lane (element 0) into the upper lane. - // First, swap the two 64-bit lanes so that the lower lane's carry moves to the higher lane. - Vector128 swappedCarry = Sse2.Shuffle(overflowMask.AsDouble(), overflowMask.AsDouble(), 0x1).AsUInt64(); - - // Next, clear the (now swapped) lower lane by blending with a zero vector. - // The immediate mask 0x1 indicates that lane 0 should come from the zero vector and lane 1 remains unchanged. - Vector128 promotedCarry = Sse41.Blend(swappedCarry.AsDouble(), Vector128.Zero, 0x1).AsUInt64(); + // Next, clear the (now swapped) lower lane by shuffle with a zero vector. + // The immediate mask 0x0 indicates that lane 0 should come from the zero vector and lane 1 from overflow. + Vector128 promotedCarry = Sse2.Shuffle(Vector128.Zero, overflowMask.AsDouble(), 0).AsUInt64(); // Add the propagated carry to the sum. return Sse2.Add(sum, promotedCarry); From 17f5729ab91f455d4f2ab9a9afce90392a7dbfb7 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 08:27:06 +0000 Subject: [PATCH 21/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 5872224..1650aa6 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1133,28 +1133,21 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) // Extract from index 3 the two lower‐parts from prod03 and prod12 (which we stored in “prod6”): // (Note: prod6 already holds both lower parts.) finalProdLow = Sse2.Add(finalProdLow, prod6); - // Now perform a horizontal add so that the two 64‑bit lanes collapse to a single 64‑bit value. - Vector128 horizontalSum = HorizontalAdd(finalProdLow); + // Reinterpret the 64-bit integer vector as a vector of two doubles. + // Then use _mm_shuffle_pd (exposed as Sse2.Shuffle for doubles) to swap the two lanes. + Vector128 swapped = Sse2.Shuffle(finalProdLow.AsDouble(), finalProdLow.AsDouble(), 0x1).AsUInt64(); + // Add the original vector and the swapped vector. + // This results in a vector where both lanes equal (vec[0] + vec[1]). + Vector128 horizontalSum = Sse2.Add(finalProdLow, swapped); // Add the horizontal sum (broadcast into the high lane) to the most–significant limb of intermediateResult. - // 2. 
Use a shuffle with a zero vector to directly form { 0, horizontalSum[0] } - Vector128 high = Sse2.Shuffle(Vector128.Zero, horizontalSum.AsDouble(), 0).AsUInt64(); + // 2. Use a unpackHigh with a zero vector to directly form { 0, horizontalSum[0] } + Vector128 high = Sse2.UnpackHigh(Vector128.Zero, horizontalSum); intermediateResult = Avx2.Add(intermediateResult, Vector256.Create(Vector128.Zero, high)); // 10. Write out the final 256‑bit result. Unsafe.SkipInit(out res); Unsafe.As>(ref res) = intermediateResult; - static Vector128 HorizontalAdd(Vector128 vec) - { - // Reinterpret the 64-bit integer vector as a vector of two doubles. - // Then use _mm_shuffle_pd (exposed as Sse2.Shuffle for doubles) to swap the two lanes. - Vector128 swapped = Sse2.Shuffle(vec.AsDouble(), vec.AsDouble(), 0x1).AsUInt64(); - - // Add the original vector and the swapped vector. - // This results in a vector where both lanes equal (vec[0] + vec[1]). - return Sse2.Add(vec, swapped); - } - [MethodImpl(MethodImplOptions.AggressiveInlining)] static Vector256 WithUpper(Vector256 vec, Vector128 upper) { From abb4081b4fde40c93bfc8626164a2e75b61544dc Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 08:31:19 +0000 Subject: [PATCH 22/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 1650aa6..63d9c4d 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1194,7 +1194,7 @@ static Vector128 Add128(Vector128 left, Vector128 right) overflowMask = Sse2.ShiftRightLogical(overflowMask, 63); // Next, clear the (now swapped) lower lane by shuffle with a zero vector. // The immediate mask 0x0 indicates that lane 0 should come from the zero vector and lane 1 from overflow. - Vector128 promotedCarry = Sse2.Shuffle(Vector128.Zero, overflowMask.AsDouble(), 0).AsUInt64(); + Vector128 promotedCarry = Sse2.UnpackLow(Vector128.Zero, overflowMask); // Add the propagated carry to the sum. return Sse2.Add(sum, promotedCarry); From 684ce56b3b9e6673819f09244a444a24152b8f74 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 08:33:54 +0000 Subject: [PATCH 23/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 63d9c4d..3b4343e 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1086,7 +1086,7 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) // 6. Add the low half of crossSum (i.e. its lower 64 bits) to prod00’s high limb. // Instead of extracting a scalar, we broadcast the lower 64 bits to a vector. // (Assume BroadcastLower128 returns a copy with both lanes equal to element0.) - Vector128 addMask = Sse2.Shuffle(Vector128.Zero, crossSum.AsDouble(), 0).AsUInt64(); + Vector128 addMask = Sse2.UnpackLow(Vector128.Zero, crossSum); Vector128 prod0Updated = Sse2.Add(prod0, addMask); // Now, compute the “carry” from that addition. (Again, we must compute a one‐bit flag.) 
From c9118f2b4ee683d7aa2391d0e24794b35720de1b Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 08:38:49 +0000 Subject: [PATCH 24/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 3b4343e..d447f5d 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1091,8 +1091,8 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) // Now, compute the “carry” from that addition. (Again, we must compute a one‐bit flag.) uint carryFlag = (uint)Sse2.MoveMask(Avx512F.VL.CompareLessThan( - ExtractHighLimb(prod0Updated), // compare updated high limb... - ExtractHighLimb(prod0) // ...with the original high limb + Sse2.UnpackHigh(prod0Updated, prod0Updated), // compare updated high limb... + Sse2.UnpackHigh(prod0, prod0) // ...with the original high limb ).AsByte()) & 1; // Add the carry to the high half of crossSum. (Broadcast the carry into a 128‑bit vector.) Vector128 csHigh = BroadcastUpper128(crossSum); @@ -1100,7 +1100,7 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) // And form limb3 from a comparison of prod01’s high limb with crossSum’s high: uint limb3 = (uint)(Sse2.MoveMask(Avx512F.VL.CompareGreaterThan( - ExtractHighLimb(prod1), csHigh).AsByte()) & 1); + Sse2.UnpackHigh(prod1, prod1), csHigh).AsByte()) & 1); Vector128 limb3Vec = Vector128.CreateScalar((ulong)limb3); // 7. Build the 256‑bit “intermediate” result from group‑1: @@ -1154,13 +1154,6 @@ static Vector256 WithUpper(Vector256 vec, Vector128 upper) // Replace the upper 128 bits of vec with upper. return Avx2.InsertVector128(vec, upper, 1); } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - static Vector128 ExtractHighLimb(Vector128 vec) - { - // Reinterpret the 64-bit vector as 32-bit elements, shuffle to replicate the upper 64-bit limb, - // then reinterpret back as 64-bit. - return Sse2.Shuffle(vec.AsUInt32(), 0xEE).AsUInt64(); - } [MethodImpl(MethodImplOptions.AggressiveInlining)] static Vector128 BroadcastUpper128(Vector128 vec) From 8fa3b377715477a2cd9d818114e0191bfc78e51c Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 08:41:47 +0000 Subject: [PATCH 25/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index d447f5d..3732faf 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1095,7 +1095,7 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) Sse2.UnpackHigh(prod0, prod0) // ...with the original high limb ).AsByte()) & 1; // Add the carry to the high half of crossSum. (Broadcast the carry into a 128‑bit vector.) - Vector128 csHigh = BroadcastUpper128(crossSum); + Vector128 csHigh = Sse2.UnpackHigh(crossSum, crossSum); Vector128 limb2 = Sse2.Add(csHigh, Vector128.CreateScalar((ulong)carryFlag)); // And form limb3 from a comparison of prod01’s high limb with crossSum’s high: @@ -1155,12 +1155,6 @@ static Vector256 WithUpper(Vector256 vec, Vector128 upper) return Avx2.InsertVector128(vec, upper, 1); } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - static Vector128 BroadcastUpper128(Vector128 vec) - { - // Replicate element1 to both lanes. 
- return Sse2.Shuffle(vec.AsDouble(), vec.AsDouble(), 3).AsUInt64(); // 0xFF means both lanes come from the original element1 - } /// <summary> /// Adds two 128-bit unsigned integers while propagating an overflow (carry) from the lower 64-bit lane to the higher lane. /// Each 128-bit integer is represented as a <see cref="Vector128{T}"/>, with element 0 holding the lower 64 bits From ae34bf9f5b826b41b72ba7f2ef8c742ac9c3adb6 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 08:48:11 +0000 Subject: [PATCH 26/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 3732faf..493c5c6 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1099,16 +1099,16 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) Vector128<ulong> limb2 = Sse2.Add(csHigh, Vector128.CreateScalar((ulong)carryFlag)); // And form limb3 from a comparison of prod01’s high limb with crossSum’s high: - uint limb3 = (uint)(Sse2.MoveMask(Avx512F.VL.CompareGreaterThan( - Sse2.UnpackHigh(prod1, prod1), csHigh).AsByte()) & 1); - Vector128<ulong> limb3Vec = Vector128.CreateScalar((ulong)limb3); + // Shift right each lane by 63 bits, so that 0 becomes 0 and 0xFFFFFFFFFFFFFFFF becomes 1. + Vector128<ulong> limb3Vec = Sse2.ShiftRightLogical(Avx512F.VL.CompareGreaterThan( + Sse2.UnpackHigh(prod1, prod1), csHigh), 63); + // Pack limb2 into the lower half and limb3 into the upper half. + Vector128<ulong> upperIntermediate = Sse2.UnpackLow(limb2, limb3Vec); // 7. Build the 256‑bit “intermediate” result from group‑1: // Lower 128 bits = prod00 (with updated high limb) // Upper 128 bits = (limb2, limb3) packed into a 128‑bit vector. Vector128<ulong> lowerIntermediate = prod0Updated; - // Pack limb2 into the lower half and limb3 into the upper half. - Vector128<ulong> upperIntermediate = Sse2.UnpackLow(limb2, limb3Vec); Vector256<ulong> intermediateResult = Vector256.Create(lowerIntermediate, upperIntermediate); // 8. Process group‑2: (prod02, prod11, prod20) From 450ec9e4d2203d32dda7905f860664cf6a0fb43a Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 08:51:15 +0000 Subject: [PATCH 27/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 493c5c6..28b4a62 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1090,13 +1090,13 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) Vector128<ulong> prod0Updated = Sse2.Add(prod0, addMask); // Now, compute the “carry” from that addition. (Again, we must compute a one‐bit flag.) - uint carryFlag = (uint)Sse2.MoveMask(Avx512F.VL.CompareLessThan( + Vector128<ulong> carryFlag = Sse2.ShiftRightLogical(Avx512F.VL.CompareLessThan( Sse2.UnpackHigh(prod0Updated, prod0Updated), // compare updated high limb... Sse2.UnpackHigh(prod0, prod0) // ...with the original high limb - ).AsByte()) & 1; + ), 63); // Add the carry to the high half of crossSum. (Broadcast the carry into a 128‑bit vector.) Vector128<ulong> csHigh = Sse2.UnpackHigh(crossSum, crossSum); - Vector128<ulong> limb2 = Sse2.Add(csHigh, Vector128.CreateScalar((ulong)carryFlag)); + Vector128<ulong> limb2 = Sse2.Add(csHigh, carryFlag); // And form limb3 from a comparison of prod01’s high limb with crossSum’s high: // Shift right each lane by 63 bits, so that 0 becomes 0 and 0xFFFFFFFFFFFFFFFF becomes 1.
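Patches 26 and 27 switch the carry computation from Sse2.MoveMask, which forces a round trip through a general-purpose register, to an unsigned vector compare whose all-ones mask is shifted right by 63, so the 0/1 carry stays in a SIMD register. Below is a minimal sketch of that pattern, assuming AVX-512VL support; the class name and sample values are illustrative, not from the patch.

// Standalone sketch: derive a per-lane 0/1 carry from an unsigned compare mask
// and use it to propagate the carry from the low 64-bit lane to the high lane.
// Assumes Avx512F.VL is supported (for the unsigned 64-bit compare).
using System;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

internal static class CarryDemo
{
    private static void Main()
    {
        Vector128<ulong> a = Vector128.Create(ulong.MaxValue, 1UL); // low lane will overflow
        Vector128<ulong> b = Vector128.Create(2UL, 3UL);

        Vector128<ulong> sum = Sse2.Add(a, b);

        // Unsigned overflow in a lane <=> sum < a. The compare yields all-ones
        // (0xFFFFFFFFFFFFFFFF) in overflowing lanes; shifting right by 63 turns that into 1.
        Vector128<ulong> carry = Sse2.ShiftRightLogical(
            Avx512F.VL.CompareLessThan(sum, a), 63);

        // Move the low lane's carry into the high lane ({ 0, carry[0] }) and add it.
        Vector128<ulong> promoted = Sse2.UnpackLow(Vector128<ulong>.Zero, carry);
        Vector128<ulong> result = Sse2.Add(sum, promoted);

        Console.WriteLine(result); // low lane = 1 (wrapped), high lane = 4 + 1 = 5
    }
}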
From 18ba5fc1c86a792f3f70fc161fbe42afca4df4b7 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 09:02:53 +0000 Subject: [PATCH 28/38] Refactor --- src/Nethermind.Int256/UInt256.cs | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 28b4a62..e11e435 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1117,7 +1117,7 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) // Add totalGroup2 into the current upper 128 bits of intermediateResult. Vector128 currentUpper = intermediateResult.GetUpper(); Vector128 newUpper = Add128(currentUpper, totalGroup2); - intermediateResult = WithUpper(intermediateResult, newUpper); + intermediateResult = Avx2.InsertVector128(intermediateResult, newUpper, 1); // 9. Process group‑3: // Multiply x23 and y10 (with the proper reversed order) then add in the remaining lower parts. @@ -1148,13 +1148,6 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) Unsafe.SkipInit(out res); Unsafe.As>(ref res) = intermediateResult; - [MethodImpl(MethodImplOptions.AggressiveInlining)] - static Vector256 WithUpper(Vector256 vec, Vector128 upper) - { - // Replace the upper 128 bits of vec with upper. - return Avx2.InsertVector128(vec, upper, 1); - } - /// /// Adds two 128-bit unsigned integers while propagating an overflow (carry) from the lower 64-bit lane to the higher lane. /// Each 128-bit integer is represented as a , with element 0 holding the lower 64 bits From 040b8addf4fb61d94efc8c1e65073c9d3ba13041 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 09:47:57 +0000 Subject: [PATCH 29/38] Clean up comments --- src/Nethermind.Int256/UInt256.cs | 261 ++++++++++++++++--------------- 1 file changed, 134 insertions(+), 127 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index e11e435..eb30ef2 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -996,153 +996,160 @@ private static void SubtractWithBorrow(ulong a, ulong b, ref ulong borrow, out u res = a - b - borrow; borrow = (((~a) & b) | (~(a ^ b)) & res) >> 63; } - - // Multiply sets res to the product x*y + /// + /// Multiplies two 256‑bit unsigned integers ( and ) and + /// writes the 256‑bit product to . This implementation uses AVX‑512, + /// AVX2, and SSE2 intrinsics for high‑performance multi‑precision arithmetic. + /// + /// The first 256‑bit unsigned integer. + /// The second 256‑bit unsigned integer. + /// When this method returns, contains the 256‑bit product of x and y. public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) { + // If both inputs fit in 64 bits, use a simple multiplication routine. if ((x.u1 | x.u2 | x.u3 | y.u1 | y.u2 | y.u3) == 0) { MultiplyULong(x, y, out res); return; } + // Fallback to a non‑AVX‑512 implementation if the required intrinsics are not supported. if (!Avx512F.IsSupported || !Avx512DQ.IsSupported) { MultiplyNonAvx512(x, y, out res); return; } - // 1. Load the 256‐bit inputs into 256‐bit vector registers. - Vector256 x0123 = Unsafe.As>(ref Unsafe.AsRef(in x)); - Vector256 y0123 = Unsafe.As>(ref Unsafe.AsRef(in y)); + // 1. Load the 256‑bit inputs into 256‑bit vector registers. + Vector256 vecX = Unsafe.As>(ref Unsafe.AsRef(in x)); + Vector256 vecY = Unsafe.As>(ref Unsafe.AsRef(in y)); - // Mask for the lower 32 bits. 
+ // Create a 512‑bit mask to isolate the lower 32 bits of each 64‑bit limb. Vector512 mask32 = Vector512.Create(0xFFFFFFFFUL); - // 2. Rearrange the 64‐bit limbs into 512‐bit vectors. - // x0010 = [ x0, x0, x1, x0 ] - Vector256 x0010 = Avx2.Permute4x64(x0123, 16); - // x1201 = [ x1, x2, x0, x1 ] - Vector256 x1201 = Avx2.Permute4x64(x0123, 73); - // x00101201 = [ x0, x0, x1, x0, x1, x2, x0, x1 ] - Vector512 x00101201 = Vector512.Create(x0010, x1201); - - // y0102 = [ y0, y1, y0, y2 ] - Vector256 y0102 = Avx2.Permute4x64(y0123, 132); - // y1032 = [ y1, y0, y3, y2 ] - Vector256 y1032 = Avx2.Permute4x64(y0123, 177); - // y01021032 = [ y0, y1, y0, y2, y1, y0, y3, y2 ] - Vector512 y01021032 = Vector512.Create(y0102, y1032); - - // 3. Multiply the corresponding 64‐bit limbs. - - // Split each 64-bit operand into 32-bit halves: - Vector512 xLo = Avx512F.And(x00101201, mask32); - Vector512 xHi = Avx512F.ShiftRightLogical(x00101201, 32); - Vector512 yLo = Avx512F.And(y01021032, mask32); - Vector512 yHi = Avx512F.ShiftRightLogical(y01021032, 32); - - // Compute the four 32x32 partial products. - // Each multiplication here is on 32-bit values, so the result fits in 64 bits. - Vector512 u0 = Avx512DQ.MultiplyLow(xLo, yLo); - Vector512 u1 = Avx512DQ.MultiplyLow(xLo, yHi); - Vector512 u2 = Avx512DQ.MultiplyLow(xHi, yLo); - Vector512 u3 = Avx512DQ.MultiplyLow(xHi, yHi); - - // Now, compute t = (u0 >> 32) + (u1 & mask32) + (u2 & mask32) - Vector512 t = Avx512F.Add(Avx512F.Add(Avx512F.ShiftRightLogical(u0, 32), Avx512F.And(u1, mask32)), Avx512F.And(u2, mask32)); - - // Now, assemble the lower 64 bits: - // low part of u0 is u0 & mask32; low 32 bits of t are (t & mask32) shifted left 32. - Vector512 partialLo = Avx512F.Or(Avx512F.And(u0, mask32), Avx512F.ShiftLeftLogical(Avx512F.And(t, mask32), 32)); - - // The high 64 bits are: u3 + (u1 >> 32) + (u2 >> 32) + (t >> 32). - Vector512 partialHi = Avx512F.Add(Avx512F.Add(Avx512F.Add(u3, Avx512F.ShiftRightLogical(u1, 32)), Avx512F.ShiftRightLogical(u2, 32)), Avx512F.ShiftRightLogical(t, 32)); - - // 4. Rearrange the six “group‑1” products (prod00, prod01, prod10, prod02, prod11, prod20) - // into 128‑bit quantities. (Here we use the AVX‑512 “extract 128‑bit” function to get two adjacent 64‑bit lanes.) - // – Products 0 and 1 come from index 0: - Vector128 pair01Lo = Avx512F.ExtractVector128(partialLo, 0); // lanes 0–1: prod00_lo, prod01_lo - Vector128 pair01Hi = Avx512F.ExtractVector128(partialHi, 0); // lanes 0–1: prod00_hi, prod01_hi - // Unpack lower (lane0) and upper (lane1) to form product0 and product1: - Vector128 prod0 = Sse2.UnpackLow(pair01Lo, pair01Hi); // prod00 = {lo, hi} - Vector128 prod1 = Sse2.UnpackHigh(pair01Lo, pair01Hi); // prod01 = {lo, hi} - - // – Products 2 and 3 come from index 1: - Vector128 pair23Lo = Avx512F.ExtractVector128(partialLo, 1); // lanes 2–3: prod10_lo, prod02_lo - Vector128 pair23Hi = Avx512F.ExtractVector128(partialHi, 1); // lanes 2–3: prod10_hi, prod02_hi - Vector128 prod2 = Sse2.UnpackLow(pair23Lo, pair23Hi); // prod10 - Vector128 prod3 = Sse2.UnpackHigh(pair23Lo, pair23Hi); // prod02 - - // – Products 4 and 5 come from index 2: - Vector128 pair45Lo = Avx512F.ExtractVector128(partialLo, 2); // lanes 4–5: prod11_lo, prod20_lo - Vector128 pair45Hi = Avx512F.ExtractVector128(partialHi, 2); // lanes 4–5: prod11_hi, prod20_hi - Vector128 prod4 = Sse2.UnpackLow(pair45Lo, pair45Hi); // prod11 - Vector128 prod5 = Sse2.UnpackHigh(pair45Lo, pair45Hi); // prod20 - - // 5. 
Group‑1 “cross‑term” addition: - // crossSum = prod01 + prod10 (i.e. add the 128‑bit numbers prod1 and prod2) - Vector128 crossSum = Add128(prod1, prod2); - - // 6. Add the low half of crossSum (i.e. its lower 64 bits) to prod00’s high limb. - // Instead of extracting a scalar, we broadcast the lower 64 bits to a vector. - // (Assume BroadcastLower128 returns a copy with both lanes equal to element0.) - Vector128 addMask = Sse2.UnpackLow(Vector128.Zero, crossSum); - Vector128 prod0Updated = Sse2.Add(prod0, addMask); - - // Now, compute the “carry” from that addition. (Again, we must compute a one‐bit flag.) - Vector128 carryFlag = Sse2.ShiftRightLogical(Avx512F.VL.CompareLessThan( - Sse2.UnpackHigh(prod0Updated, prod0Updated), // compare updated high limb... - Sse2.UnpackHigh(prod0, prod0) // ...with the original high limb - ), 63); - // Add the carry to the high half of crossSum. (Broadcast the carry into a 128‑bit vector.) - Vector128 csHigh = Sse2.UnpackHigh(crossSum, crossSum); - Vector128 limb2 = Sse2.Add(csHigh, carryFlag); - - // And form limb3 from a comparison of prod01’s high limb with crossSum’s high: - // Shift right each lane by 63 bits, so that 0 becomes 0 and 0xFFFFFFFFFFFFFFFF becomes 1. - Vector128 limb3Vec = Sse2.ShiftRightLogical(Avx512F.VL.CompareGreaterThan( - Sse2.UnpackHigh(prod1, prod1), csHigh), 63); - - // Pack limb2 into the lower half and limb3 into the upper half. - Vector128 upperIntermediate = Sse2.UnpackLow(limb2, limb3Vec); - // 7. Build the 256‑bit “intermediate” result from group‑1: - // Lower 128 bits = prod00 (with updated high limb) - // Upper 128 bits = (limb2, limb3) packed into a 128‑bit vector. - Vector128 lowerIntermediate = prod0Updated; - Vector256 intermediateResult = Vector256.Create(lowerIntermediate, upperIntermediate); - - // 8. Process group‑2: (prod02, prod11, prod20) - Vector128 group2Sum = Add128(prod3, prod4); - Vector128 totalGroup2 = Add128(group2Sum, prod5); - // Add totalGroup2 into the current upper 128 bits of intermediateResult. + // 2. Rearrange the 64‑bit limbs into 512‑bit vectors for the partial products. + // For x: + // xPerm1 = [ x0, x0, x1, x0 ] + Vector256 xPerm1 = Avx2.Permute4x64(vecX, 16); + // xPerm2 = [ x1, x2, x0, x1 ] + Vector256 xPerm2 = Avx2.Permute4x64(vecX, 73); + // xRearranged = [ x0, x0, x1, x0, x1, x2, x0, x1 ] + Vector512 xRearranged = Vector512.Create(xPerm1, xPerm2); + + // For y: + // yPerm1 = [ y0, y1, y0, y2 ] + Vector256 yPerm1 = Avx2.Permute4x64(vecY, 132); + // yPerm2 = [ y1, y0, y3, y2 ] + Vector256 yPerm2 = Avx2.Permute4x64(vecY, 177); + // yRearranged = [ y0, y1, y0, y2, y1, y0, y3, y2 ] + Vector512 yRearranged = Vector512.Create(yPerm1, yPerm2); + + // 3. Split each 64‑bit limb into its lower and upper 32‑bit halves. + Vector512 xLowerParts = Avx512F.And(xRearranged, mask32); + Vector512 xUpperParts = Avx512F.ShiftRightLogical(xRearranged, 32); + Vector512 yLowerParts = Avx512F.And(yRearranged, mask32); + Vector512 yUpperParts = Avx512F.ShiftRightLogical(yRearranged, 32); + + // Compute four 32x32‑bit partial products (each fits in 64 bits). 
+ Vector512 prodLL = Avx512DQ.MultiplyLow(xLowerParts, yLowerParts); // lower×lower + Vector512 prodLH = Avx512DQ.MultiplyLow(xLowerParts, yUpperParts); // lower×upper + Vector512 prodHL = Avx512DQ.MultiplyLow(xUpperParts, yLowerParts); // upper×lower + Vector512 prodHH = Avx512DQ.MultiplyLow(xUpperParts, yUpperParts); // upper×upper + + // Compute an intermediate term: + // termT = (prodLL >> 32) + (prodLH & mask32) + (prodHL & mask32) + Vector512 termT = Avx512F.Add( + Avx512F.Add(Avx512F.ShiftRightLogical(prodLL, 32), + Avx512F.And(prodLH, mask32)), + Avx512F.And(prodHL, mask32)); + + // Assemble the lower 64 bits of each partial product: + // lowerPartial = (prodLL & mask32) OR ((termT & mask32) << 32) + Vector512 lowerPartial = Avx512F.Or( + Avx512F.And(prodLL, mask32), + Avx512F.ShiftLeftLogical(Avx512F.And(termT, mask32), 32)); + + // Assemble the higher 64 bits: + // higherPartial = prodHH + (prodLH >> 32) + (prodHL >> 32) + (termT >> 32) + Vector512 higherPartial = Avx512F.Add( + Avx512F.Add( + Avx512F.Add(prodHH, Avx512F.ShiftRightLogical(prodLH, 32)), + Avx512F.ShiftRightLogical(prodHL, 32)), + Avx512F.ShiftRightLogical(termT, 32)); + + // 4. Unpack the 512‑bit partial results into six 128‑bit values. + // Group 1 (products 0 and 1): + Vector128 pair01Lo = Avx512F.ExtractVector128(lowerPartial, 0); // lanes 0–1: product0 (low), product1 (low) + Vector128 pair01Hi = Avx512F.ExtractVector128(higherPartial, 0); // lanes 0–1: product0 (high), product1 (high) + Vector128 product0 = Sse2.UnpackLow(pair01Lo, pair01Hi); // product0 = { low, high } + Vector128 product1 = Sse2.UnpackHigh(pair01Lo, pair01Hi); // product1 = { low, high } + + // Group 2 (products 2 and 3): + Vector128 pair23Lo = Avx512F.ExtractVector128(lowerPartial, 1); // lanes 2–3 + Vector128 pair23Hi = Avx512F.ExtractVector128(higherPartial, 1); // lanes 2–3 + Vector128 product2 = Sse2.UnpackLow(pair23Lo, pair23Hi); + Vector128 product3 = Sse2.UnpackHigh(pair23Lo, pair23Hi); + + // Group 3 (products 4 and 5): + Vector128 pair45Lo = Avx512F.ExtractVector128(lowerPartial, 2); // lanes 4–5 + Vector128 pair45Hi = Avx512F.ExtractVector128(higherPartial, 2); // lanes 4–5 + Vector128 product4 = Sse2.UnpackLow(pair45Lo, pair45Hi); + Vector128 product5 = Sse2.UnpackHigh(pair45Lo, pair45Hi); + + // 5. Group 1 cross‑term addition: + // Compute crossSum = product1 + product2 (as 128‑bit numbers). + Vector128 crossSum = Add128(product1, product2); + + // 6. Add the lower 64 bits of crossSum to the high limb of product0. + // Broadcast crossSum’s low 64 bits into both lanes. + Vector128 crossAddMask = Sse2.UnpackLow(Vector128.Zero, crossSum); + Vector128 updatedProduct0 = Sse2.Add(product0, crossAddMask); + + // Compute the carry from that addition by comparing the high limbs before and after. + Vector128 product0HighBefore = Sse2.UnpackHigh(product0, product0); + Vector128 product0HighAfter = Sse2.UnpackHigh(updatedProduct0, updatedProduct0); + Vector128 carryFlag = Sse2.ShiftRightLogical( + Avx512F.VL.CompareLessThan(product0HighAfter, product0HighBefore), + 63); + // Propagate the carry by adding it to crossSum’s high limb. + Vector128 crossSumHigh = Sse2.UnpackHigh(crossSum, crossSum); + Vector128 limb2 = Sse2.Add(crossSumHigh, carryFlag); + + // Determine an extra carry if product1’s high limb exceeds crossSum’s high limb. + Vector128 limb3 = Sse2.ShiftRightLogical( + Avx512F.VL.CompareGreaterThan(Sse2.UnpackHigh(product1, product1), crossSumHigh), + 63); + + // Pack limb2 (low) and limb3 (high) to form the new upper half. 
+ Vector128 upperIntermediate = Sse2.UnpackLow(limb2, limb3); + + // 7. Build the intermediate 256‑bit result. + // Lower 128 bits come from the updated product0; upper 128 bits come from (limb2, limb3). + Vector256 intermediateResult = Vector256.Create(updatedProduct0, upperIntermediate); + + // 8. Group 2 combination: + // Sum product3, product4, and product5. + Vector128 group2Sum = Add128(product3, product4); + Vector128 totalGroup2 = Add128(group2Sum, product5); + // Add this total into the upper 128 bits of the intermediate result. Vector128 currentUpper = intermediateResult.GetUpper(); Vector128 newUpper = Add128(currentUpper, totalGroup2); intermediateResult = Avx2.InsertVector128(intermediateResult, newUpper, 1); - // 9. Process group‑3: - // Multiply x23 and y10 (with the proper reversed order) then add in the remaining lower parts. - Vector128 aHigh = Vector128.Create(x.u2, x.u3); - Vector128 bLow = Vector128.Create(y.u1, y.u0); - // Use the AVX512DQ MultiplyLow intrinsic (which multiplies 64‑bit integers and returns the low 64 bits) - Vector128 finalProdLow = Avx512DQ.VL.MultiplyLow(aHigh, bLow); - - // Extract from partialLo the two lower parts for prod03 and prod12. - // With partialLo logically split into Lower (lanes 0–3) and Upper (lanes 4–7), - // lanes 6 and 7 are in the Upper half; extracting the second 128‐bit portion of Upper gives us these lanes. - Vector128 prod6 = Avx2.ExtractVector128(partialLo.GetUpper(), 1); - // Extract from index 3 the two lower‐parts from prod03 and prod12 (which we stored in “prod6”): - // (Note: prod6 already holds both lower parts.) - finalProdLow = Sse2.Add(finalProdLow, prod6); - // Reinterpret the 64-bit integer vector as a vector of two doubles. - // Then use _mm_shuffle_pd (exposed as Sse2.Shuffle for doubles) to swap the two lanes. - Vector128 swapped = Sse2.Shuffle(finalProdLow.AsDouble(), finalProdLow.AsDouble(), 0x1).AsUInt64(); - // Add the original vector and the swapped vector. - // This results in a vector where both lanes equal (vec[0] + vec[1]). - Vector128 horizontalSum = Sse2.Add(finalProdLow, swapped); - // Add the horizontal sum (broadcast into the high lane) to the most–significant limb of intermediateResult. - // 2. Use a unpackHigh with a zero vector to directly form { 0, horizontalSum[0] } - Vector128 high = Sse2.UnpackHigh(Vector128.Zero, horizontalSum); - intermediateResult = Avx2.Add(intermediateResult, Vector256.Create(Vector128.Zero, high)); + // 9. Group 3 cross‑terms: + // Multiply the high limbs of x (x.u2, x.u3) with the low limbs of y (y.u1, y.u0) in reversed order. + Vector128 xHigh = Vector128.Create(x.u2, x.u3); + Vector128 yLow = Vector128.Create(y.u1, y.u0); + Vector128 finalProdLow = Avx512DQ.VL.MultiplyLow(xHigh, yLow); + + // Add in the extra lower parts from the upper half of lowerPartial. + Vector128 extraLow = Avx2.ExtractVector128(lowerPartial.GetUpper(), 1); + finalProdLow = Sse2.Add(finalProdLow, extraLow); + // Perform a horizontal sum so that both lanes contain the same result. + Vector128 swappedFinal = Sse2.UnpackLow(finalProdLow, finalProdLow); + Vector128 horizontalSum = Sse2.Add(finalProdLow, swappedFinal); + // Add the horizontal sum (broadcast into the high lane) to the most‑significant limb. + Vector128 highCarry = Sse2.UnpackHigh(Vector128.Zero, horizontalSum); + intermediateResult = Avx2.Add(intermediateResult, Vector256.Create(Vector128.Zero, highCarry)); // 10. Write out the final 256‑bit result. 
Unsafe.SkipInit(out res); From de893e8d9485a9fb478a7d14ecc2fdc5128d8688 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 10:23:48 +0000 Subject: [PATCH 30/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index eb30ef2..2633f48 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1076,24 +1076,21 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) Avx512F.ShiftRightLogical(prodHL, 32)), Avx512F.ShiftRightLogical(termT, 32)); + Vector512 productLow = Avx512F.UnpackLow(lowerPartial, higherPartial); + Vector512 productHi = Avx512F.UnpackHigh(lowerPartial, higherPartial); + // 4. Unpack the 512‑bit partial results into six 128‑bit values. // Group 1 (products 0 and 1): - Vector128 pair01Lo = Avx512F.ExtractVector128(lowerPartial, 0); // lanes 0–1: product0 (low), product1 (low) - Vector128 pair01Hi = Avx512F.ExtractVector128(higherPartial, 0); // lanes 0–1: product0 (high), product1 (high) - Vector128 product0 = Sse2.UnpackLow(pair01Lo, pair01Hi); // product0 = { low, high } - Vector128 product1 = Sse2.UnpackHigh(pair01Lo, pair01Hi); // product1 = { low, high } + Vector128 product0 = Avx512F.ExtractVector128(productLow, 0); + Vector128 product1 = Avx512F.ExtractVector128(productHi, 0); // Group 2 (products 2 and 3): - Vector128 pair23Lo = Avx512F.ExtractVector128(lowerPartial, 1); // lanes 2–3 - Vector128 pair23Hi = Avx512F.ExtractVector128(higherPartial, 1); // lanes 2–3 - Vector128 product2 = Sse2.UnpackLow(pair23Lo, pair23Hi); - Vector128 product3 = Sse2.UnpackHigh(pair23Lo, pair23Hi); + Vector128 product2 = Avx512F.ExtractVector128(productLow, 1); + Vector128 product3 = Avx512F.ExtractVector128(productHi, 1); // Group 3 (products 4 and 5): - Vector128 pair45Lo = Avx512F.ExtractVector128(lowerPartial, 2); // lanes 4–5 - Vector128 pair45Hi = Avx512F.ExtractVector128(higherPartial, 2); // lanes 4–5 - Vector128 product4 = Sse2.UnpackLow(pair45Lo, pair45Hi); - Vector128 product5 = Sse2.UnpackHigh(pair45Lo, pair45Hi); + Vector128 product4 = Avx512F.ExtractVector128(productLow, 2); + Vector128 product5 = Avx512F.ExtractVector128(productHi, 2); // 5. Group 1 cross‑term addition: // Compute crossSum = product1 + product2 (as 128‑bit numbers). From d163c6590ba4b2f3e44542fe0c7e06644af89bdf Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 10:29:56 +0000 Subject: [PATCH 31/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 2633f48..d787dab 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1119,18 +1119,16 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) // Pack limb2 (low) and limb3 (high) to form the new upper half. Vector128 upperIntermediate = Sse2.UnpackLow(limb2, limb3); - // 7. Build the intermediate 256‑bit result. - // Lower 128 bits come from the updated product0; upper 128 bits come from (limb2, limb3). - Vector256 intermediateResult = Vector256.Create(updatedProduct0, upperIntermediate); // 8. Group 2 combination: // Sum product3, product4, and product5. Vector128 group2Sum = Add128(product3, product4); Vector128 totalGroup2 = Add128(group2Sum, product5); // Add this total into the upper 128 bits of the intermediate result. 
- Vector128 currentUpper = intermediateResult.GetUpper(); - Vector128 newUpper = Add128(currentUpper, totalGroup2); - intermediateResult = Avx2.InsertVector128(intermediateResult, newUpper, 1); + Vector128 newUpper = Add128(upperIntermediate, totalGroup2); + // 7. Build the intermediate 256‑bit result. + // Lower 128 bits come from the updated product0; upper 128 bits come from (limb2, limb3). + Vector256 intermediateResult = Vector256.Create(updatedProduct0, newUpper); // 9. Group 3 cross‑terms: // Multiply the high limbs of x (x.u2, x.u3) with the low limbs of y (y.u1, y.u0) in reversed order. From 0bae58314e06c92d868e2a689cebdbf80e5dc838 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 10:33:33 +0000 Subject: [PATCH 32/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index d787dab..6fe0ae0 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1128,7 +1128,6 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) Vector128 newUpper = Add128(upperIntermediate, totalGroup2); // 7. Build the intermediate 256‑bit result. // Lower 128 bits come from the updated product0; upper 128 bits come from (limb2, limb3). - Vector256 intermediateResult = Vector256.Create(updatedProduct0, newUpper); // 9. Group 3 cross‑terms: // Multiply the high limbs of x (x.u2, x.u3) with the low limbs of y (y.u1, y.u0) in reversed order. @@ -1144,7 +1143,8 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) Vector128 horizontalSum = Sse2.Add(finalProdLow, swappedFinal); // Add the horizontal sum (broadcast into the high lane) to the most‑significant limb. Vector128 highCarry = Sse2.UnpackHigh(Vector128.Zero, horizontalSum); - intermediateResult = Avx2.Add(intermediateResult, Vector256.Create(Vector128.Zero, highCarry)); + newUpper = Sse2.Add(newUpper, highCarry); + Vector256 intermediateResult = Vector256.Create(updatedProduct0, newUpper); // 10. Write out the final 256‑bit result. Unsafe.SkipInit(out res); From 8a92748f87e4b60fd0eed619a3e96168c433fefc Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 10:45:59 +0000 Subject: [PATCH 33/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 6fe0ae0..a88322e 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1092,6 +1092,8 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) Vector128 product4 = Avx512F.ExtractVector128(productLow, 2); Vector128 product5 = Avx512F.ExtractVector128(productHi, 2); + Vector128 xHigh = Vector128.Create(x.u2, x.u3); + Vector128 yLow = Vector128.Create(y.u1, y.u0); // 5. Group 1 cross‑term addition: // Compute crossSum = product1 + product2 (as 128‑bit numbers). Vector128 crossSum = Add128(product1, product2); @@ -1119,7 +1121,6 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) // Pack limb2 (low) and limb3 (high) to form the new upper half. Vector128 upperIntermediate = Sse2.UnpackLow(limb2, limb3); - // 8. Group 2 combination: // Sum product3, product4, and product5. 
Vector128 group2Sum = Add128(product3, product4); From 9b843d521d1203ef5c2b7dcab2b458422356b2d4 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 11:09:32 +0000 Subject: [PATCH 34/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index a88322e..3f6a851 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1132,12 +1132,10 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) // 9. Group 3 cross‑terms: // Multiply the high limbs of x (x.u2, x.u3) with the low limbs of y (y.u1, y.u0) in reversed order. - Vector128 xHigh = Vector128.Create(x.u2, x.u3); - Vector128 yLow = Vector128.Create(y.u1, y.u0); Vector128 finalProdLow = Avx512DQ.VL.MultiplyLow(xHigh, yLow); // Add in the extra lower parts from the upper half of lowerPartial. - Vector128 extraLow = Avx2.ExtractVector128(lowerPartial.GetUpper(), 1); + Vector128 extraLow = Avx512F.ExtractVector128(lowerPartial, 3); finalProdLow = Sse2.Add(finalProdLow, extraLow); // Perform a horizontal sum so that both lanes contain the same result. Vector128 swappedFinal = Sse2.UnpackLow(finalProdLow, finalProdLow); From bca256d2fa0916fffcec1015460ad48adc99cbae Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 11:35:21 +0000 Subject: [PATCH 35/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 202 ++++++++++++------------------- 1 file changed, 80 insertions(+), 122 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 3f6a851..2ea030e 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -998,8 +998,7 @@ private static void SubtractWithBorrow(ulong a, ulong b, ref ulong borrow, out u } /// /// Multiplies two 256‑bit unsigned integers ( and ) and - /// writes the 256‑bit product to . This implementation uses AVX‑512, - /// AVX2, and SSE2 intrinsics for high‑performance multi‑precision arithmetic. + /// writes the 256‑bit product to . /// /// The first 256‑bit unsigned integer. /// The second 256‑bit unsigned integer. @@ -1009,145 +1008,136 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) // If both inputs fit in 64 bits, use a simple multiplication routine. if ((x.u1 | x.u2 | x.u3 | y.u1 | y.u2 | y.u3) == 0) { - MultiplyULong(x, y, out res); + // Fast multiply for numbers less than 2^64 (18,446,744,073,709,551,615) + ulong high = Math.BigMul(x.u0, y.u0, out ulong low); + // Assignment to res after multiply in case is used as input for x or y (by ref aliasing) + res = default; + Unsafe.AsRef(in res.u0) = low; + Unsafe.AsRef(in res.u1) = high; return; } - // Fallback to a non‑AVX‑512 implementation if the required intrinsics are not supported. + // Fallback if the required AVX‑512 intrinsics are not supported. 
if (!Avx512F.IsSupported || !Avx512DQ.IsSupported) { - MultiplyNonAvx512(x, y, out res); + ref ulong rx = ref Unsafe.As(ref Unsafe.AsRef(in x)); + ref ulong ry = ref Unsafe.As(ref Unsafe.AsRef(in y)); + + (ulong carry, ulong r0) = Multiply64(rx, ry); + UmulHop(carry, Unsafe.Add(ref rx, 1), ry, out carry, out ulong res1); + UmulHop(carry, Unsafe.Add(ref rx, 2), ry, out carry, out ulong res2); + ulong res3 = Unsafe.Add(ref rx, 3) * ry + carry; + + UmulHop(res1, rx, Unsafe.Add(ref ry, 1), out carry, out ulong r1); + UmulStep(res2, Unsafe.Add(ref rx, 1), Unsafe.Add(ref ry, 1), carry, out carry, out res2); + res3 = res3 + Unsafe.Add(ref rx, 2) * Unsafe.Add(ref ry, 1) + carry; + + UmulHop(res2, rx, Unsafe.Add(ref ry, 2), out carry, out ulong r2); + res3 = res3 + Unsafe.Add(ref rx, 1) * Unsafe.Add(ref ry, 2) + carry; + + ulong r3 = res3 + rx * Unsafe.Add(ref ry, 3); + + res = new UInt256(r0, r1, r2, r3); return; } - // 1. Load the 256‑bit inputs into 256‑bit vector registers. + // Step 1: load the inputs and prepare the mask constant. Vector256 vecX = Unsafe.As>(ref Unsafe.AsRef(in x)); Vector256 vecY = Unsafe.As>(ref Unsafe.AsRef(in y)); - - // Create a 512‑bit mask to isolate the lower 32 bits of each 64‑bit limb. Vector512 mask32 = Vector512.Create(0xFFFFFFFFUL); - // 2. Rearrange the 64‑bit limbs into 512‑bit vectors for the partial products. - // For x: - // xPerm1 = [ x0, x0, x1, x0 ] - Vector256 xPerm1 = Avx2.Permute4x64(vecX, 16); - // xPerm2 = [ x1, x2, x0, x1 ] - Vector256 xPerm2 = Avx2.Permute4x64(vecX, 73); - // xRearranged = [ x0, x0, x1, x0, x1, x2, x0, x1 ] - Vector512 xRearranged = Vector512.Create(xPerm1, xPerm2); + // Step 2: permute x and y. These operations are independent. + Vector256 xPerm1 = Avx2.Permute4x64(vecX, 16); // [ x0, x0, x1, x0 ] + Vector256 yPerm1 = Avx2.Permute4x64(vecY, 132); // [ y0, y1, y0, y2 ] + Vector256 xPerm2 = Avx2.Permute4x64(vecX, 73); // [ x1, x2, x0, x1 ] + Vector256 yPerm2 = Avx2.Permute4x64(vecY, 177); // [ y1, y0, y3, y2 ] - // For y: - // yPerm1 = [ y0, y1, y0, y2 ] - Vector256 yPerm1 = Avx2.Permute4x64(vecY, 132); - // yPerm2 = [ y1, y0, y3, y2 ] - Vector256 yPerm2 = Avx2.Permute4x64(vecY, 177); - // yRearranged = [ y0, y1, y0, y2, y1, y0, y3, y2 ] + Vector512 xRearranged = Vector512.Create(xPerm1, xPerm2); Vector512 yRearranged = Vector512.Create(yPerm1, yPerm2); - // 3. Split each 64‑bit limb into its lower and upper 32‑bit halves. + // Step 3: split each 64‑bit limb into its lower and upper 32‑bit parts. Vector512 xLowerParts = Avx512F.And(xRearranged, mask32); - Vector512 xUpperParts = Avx512F.ShiftRightLogical(xRearranged, 32); Vector512 yLowerParts = Avx512F.And(yRearranged, mask32); + Vector512 xUpperParts = Avx512F.ShiftRightLogical(xRearranged, 32); Vector512 yUpperParts = Avx512F.ShiftRightLogical(yRearranged, 32); - // Compute four 32x32‑bit partial products (each fits in 64 bits). 
- Vector512 prodLL = Avx512DQ.MultiplyLow(xLowerParts, yLowerParts); // lower×lower - Vector512 prodLH = Avx512DQ.MultiplyLow(xLowerParts, yUpperParts); // lower×upper - Vector512 prodHL = Avx512DQ.MultiplyLow(xUpperParts, yLowerParts); // upper×lower - Vector512 prodHH = Avx512DQ.MultiplyLow(xUpperParts, yUpperParts); // upper×upper - - // Compute an intermediate term: - // termT = (prodLL >> 32) + (prodLH & mask32) + (prodHL & mask32) - Vector512 termT = Avx512F.Add( - Avx512F.Add(Avx512F.ShiftRightLogical(prodLL, 32), - Avx512F.And(prodLH, mask32)), - Avx512F.And(prodHL, mask32)); - - // Assemble the lower 64 bits of each partial product: - // lowerPartial = (prodLL & mask32) OR ((termT & mask32) << 32) - Vector512 lowerPartial = Avx512F.Or( - Avx512F.And(prodLL, mask32), - Avx512F.ShiftLeftLogical(Avx512F.And(termT, mask32), 32)); - - // Assemble the higher 64 bits: - // higherPartial = prodHH + (prodLH >> 32) + (prodHL >> 32) + (termT >> 32) - Vector512 higherPartial = Avx512F.Add( - Avx512F.Add( - Avx512F.Add(prodHH, Avx512F.ShiftRightLogical(prodLH, 32)), - Avx512F.ShiftRightLogical(prodHL, 32)), - Avx512F.ShiftRightLogical(termT, 32)); - + // Step 4: launch four 32×32‑bit multiplications in parallel. + Vector512 prodLL = Avx512DQ.MultiplyLow(xLowerParts, yLowerParts); // lower × lower + Vector512 prodLH = Avx512DQ.MultiplyLow(xLowerParts, yUpperParts); // lower × upper + Vector512 prodHL = Avx512DQ.MultiplyLow(xUpperParts, yLowerParts); // upper × lower + Vector512 prodHH = Avx512DQ.MultiplyLow(xUpperParts, yUpperParts); // upper × upper + + // Step 5: compute the intermediate term while the multiplications are in flight. + Vector512 prodLL_hi = Avx512F.ShiftRightLogical(prodLL, 32); + Vector512 prodLH_lo = Avx512F.And(prodLH, mask32); + Vector512 prodHL_lo = Avx512F.And(prodHL, mask32); + Vector512 termT = Avx512F.Add(Avx512F.Add(prodLL_hi, prodLH_lo), prodHL_lo); + + // Step 6: assemble the lower and higher partial results. + Vector512 lowerPartial = + Avx512F.Or( + Avx512F.And(prodLL, mask32), + Avx512F.ShiftLeftLogical(Avx512F.And(termT, mask32), 32)); + Vector512 higherPartial = + Avx512F.Add( + Avx512F.Add( + Avx512F.Add(prodHH, Avx512F.ShiftRightLogical(prodLH, 32)), + Avx512F.ShiftRightLogical(prodHL, 32)), + Avx512F.ShiftRightLogical(termT, 32)); + + // Step 7: unpack the 512‑bit results into two groups. Vector512 productLow = Avx512F.UnpackLow(lowerPartial, higherPartial); Vector512 productHi = Avx512F.UnpackHigh(lowerPartial, higherPartial); - // 4. Unpack the 512‑bit partial results into six 128‑bit values. - // Group 1 (products 0 and 1): + // Step 8: extract the 128‑bit groups. Vector128 product0 = Avx512F.ExtractVector128(productLow, 0); Vector128 product1 = Avx512F.ExtractVector128(productHi, 0); - - // Group 2 (products 2 and 3): Vector128 product2 = Avx512F.ExtractVector128(productLow, 1); Vector128 product3 = Avx512F.ExtractVector128(productHi, 1); - - // Group 3 (products 4 and 5): Vector128 product4 = Avx512F.ExtractVector128(productLow, 2); Vector128 product5 = Avx512F.ExtractVector128(productHi, 2); + // Step 9: issue memory request for remaining parts. Vector128 xHigh = Vector128.Create(x.u2, x.u3); Vector128 yLow = Vector128.Create(y.u1, y.u0); - // 5. Group 1 cross‑term addition: - // Compute crossSum = product1 + product2 (as 128‑bit numbers). - Vector128 crossSum = Add128(product1, product2); - // 6. Add the lower 64 bits of crossSum to the high limb of product0. - // Broadcast crossSum’s low 64 bits into both lanes. 
+ // Step 10: perform the group 1 cross‑term addition. + Vector128 crossSum = Add128(product1, product2); Vector128 crossAddMask = Sse2.UnpackLow(Vector128.Zero, crossSum); Vector128 updatedProduct0 = Sse2.Add(product0, crossAddMask); - // Compute the carry from that addition by comparing the high limbs before and after. + // Compute the carry from adding crossSum’s low 64 bits. Vector128 product0HighBefore = Sse2.UnpackHigh(product0, product0); Vector128 product0HighAfter = Sse2.UnpackHigh(updatedProduct0, updatedProduct0); - Vector128 carryFlag = Sse2.ShiftRightLogical( - Avx512F.VL.CompareLessThan(product0HighAfter, product0HighBefore), - 63); - // Propagate the carry by adding it to crossSum’s high limb. + Vector128 carryFlag = + Sse2.ShiftRightLogical( + Avx512F.VL.CompareLessThan(product0HighAfter, product0HighBefore), + 63); Vector128 crossSumHigh = Sse2.UnpackHigh(crossSum, crossSum); Vector128 limb2 = Sse2.Add(crossSumHigh, carryFlag); - - // Determine an extra carry if product1’s high limb exceeds crossSum’s high limb. - Vector128 limb3 = Sse2.ShiftRightLogical( - Avx512F.VL.CompareGreaterThan(Sse2.UnpackHigh(product1, product1), crossSumHigh), - 63); - - // Pack limb2 (low) and limb3 (high) to form the new upper half. + Vector128 limb3 = + Sse2.ShiftRightLogical( + Avx512F.VL.CompareGreaterThan(Sse2.UnpackHigh(product1, product1), crossSumHigh), + 63); Vector128 upperIntermediate = Sse2.UnpackLow(limb2, limb3); - // 8. Group 2 combination: - // Sum product3, product4, and product5. + // Step 11: combine group 2 partial results. Vector128 group2Sum = Add128(product3, product4); Vector128 totalGroup2 = Add128(group2Sum, product5); - // Add this total into the upper 128 bits of the intermediate result. - Vector128 newUpper = Add128(upperIntermediate, totalGroup2); - // 7. Build the intermediate 256‑bit result. - // Lower 128 bits come from the updated product0; upper 128 bits come from (limb2, limb3). + Vector128 newHalf = Add128(upperIntermediate, totalGroup2); - // 9. Group 3 cross‑terms: - // Multiply the high limbs of x (x.u2, x.u3) with the low limbs of y (y.u1, y.u0) in reversed order. + // Step 12: process group 3 cross‑terms. Vector128 finalProdLow = Avx512DQ.VL.MultiplyLow(xHigh, yLow); - - // Add in the extra lower parts from the upper half of lowerPartial. Vector128 extraLow = Avx512F.ExtractVector128(lowerPartial, 3); finalProdLow = Sse2.Add(finalProdLow, extraLow); - // Perform a horizontal sum so that both lanes contain the same result. Vector128 swappedFinal = Sse2.UnpackLow(finalProdLow, finalProdLow); Vector128 horizontalSum = Sse2.Add(finalProdLow, swappedFinal); - // Add the horizontal sum (broadcast into the high lane) to the most‑significant limb. Vector128 highCarry = Sse2.UnpackHigh(Vector128.Zero, horizontalSum); - newUpper = Sse2.Add(newUpper, highCarry); - Vector256 intermediateResult = Vector256.Create(updatedProduct0, newUpper); + newHalf = Sse2.Add(newHalf, highCarry); - // 10. Write out the final 256‑bit result. + // Combine the results into the final 256‑bit value. + Vector256 finalResult = Vector256.Create(updatedProduct0, newHalf); Unsafe.SkipInit(out res); - Unsafe.As>(ref res) = intermediateResult; + Unsafe.As>(ref res) = finalResult; /// /// Adds two 128-bit unsigned integers while propagating an overflow (carry) from the lower 64-bit lane to the higher lane. 
@@ -1182,38 +1172,6 @@ static Vector128 Add128(Vector128 left, Vector128 right) } } - private static void MultiplyNonAvx512(UInt256 x, UInt256 y, out UInt256 res) - { - ref ulong rx = ref Unsafe.As(ref Unsafe.AsRef(in x)); - ref ulong ry = ref Unsafe.As(ref Unsafe.AsRef(in y)); - - (ulong carry, ulong r0) = Multiply64(rx, ry); - UmulHop(carry, Unsafe.Add(ref rx, 1), ry, out carry, out ulong res1); - UmulHop(carry, Unsafe.Add(ref rx, 2), ry, out carry, out ulong res2); - ulong res3 = Unsafe.Add(ref rx, 3) * ry + carry; - - UmulHop(res1, rx, Unsafe.Add(ref ry, 1), out carry, out ulong r1); - UmulStep(res2, Unsafe.Add(ref rx, 1), Unsafe.Add(ref ry, 1), carry, out carry, out res2); - res3 = res3 + Unsafe.Add(ref rx, 2) * Unsafe.Add(ref ry, 1) + carry; - - UmulHop(res2, rx, Unsafe.Add(ref ry, 2), out carry, out ulong r2); - res3 = res3 + Unsafe.Add(ref rx, 1) * Unsafe.Add(ref ry, 2) + carry; - - ulong r3 = res3 + rx * Unsafe.Add(ref ry, 3); - - res = new UInt256(r0, r1, r2, r3); - } - - private static void MultiplyULong(UInt256 x, UInt256 y, out UInt256 res) - { - // Fast multiply for numbers less than 2^64 (18,446,744,073,709,551,615) - ulong high = Math.BigMul(x.u0, y.u0, out ulong low); - // Assignment to res after multiply in case is used as input for x or y (by ref aliasing) - res = default; - Unsafe.AsRef(in res.u0) = low; - Unsafe.AsRef(in res.u1) = high; - } - public void Multiply(in UInt256 a, out UInt256 res) => Multiply(this, a, out res); public static bool MultiplyOverflow(in UInt256 x, in UInt256 y, out UInt256 res) From f380ecf55157073e113c8376d93bcd37b86c7fdf Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 21:12:08 +0000 Subject: [PATCH 36/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 2ea030e..6bdd8b8 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1003,6 +1003,7 @@ private static void SubtractWithBorrow(ulong a, ulong b, ref ulong borrow, out u /// The first 256‑bit unsigned integer. /// The second 256‑bit unsigned integer. /// When this method returns, contains the 256‑bit product of x and y. + [SkipLocalsInit] public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) { // If both inputs fit in 64 bits, use a simple multiplication routine. @@ -1041,18 +1042,17 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) } // Step 1: load the inputs and prepare the mask constant. - Vector256 vecX = Unsafe.As>(ref Unsafe.AsRef(in x)); - Vector256 vecY = Unsafe.As>(ref Unsafe.AsRef(in y)); + Vector512 xPermute = Vector512.Create(0ul, 0, 1, 0, 1, 2, 0, 1); + Vector512 yPermute = Vector512.Create(0ul, 1, 0, 2, 1, 0, 3, 2); + Unsafe.SkipInit(out Vector512 vecX); + Unsafe.SkipInit(out Vector512 vecY); + vecX = Avx512F.InsertVector256(vecX, Unsafe.As>(ref Unsafe.AsRef(in x)), 0); + vecY = Avx512F.InsertVector256(vecY, Unsafe.As>(ref Unsafe.AsRef(in y)), 0); Vector512 mask32 = Vector512.Create(0xFFFFFFFFUL); // Step 2: permute x and y. These operations are independent. 
- Vector256 xPerm1 = Avx2.Permute4x64(vecX, 16); // [ x0, x0, x1, x0 ] - Vector256 yPerm1 = Avx2.Permute4x64(vecY, 132); // [ y0, y1, y0, y2 ] - Vector256 xPerm2 = Avx2.Permute4x64(vecX, 73); // [ x1, x2, x0, x1 ] - Vector256 yPerm2 = Avx2.Permute4x64(vecY, 177); // [ y1, y0, y3, y2 ] - - Vector512 xRearranged = Vector512.Create(xPerm1, xPerm2); - Vector512 yRearranged = Vector512.Create(yPerm1, yPerm2); + Vector512 xRearranged = Avx512F.PermuteVar8x64(vecX, xPermute); + Vector512 yRearranged = Avx512F.PermuteVar8x64(vecY, yPermute); // Step 3: split each 64‑bit limb into its lower and upper 32‑bit parts. Vector512 xLowerParts = Avx512F.And(xRearranged, mask32); From 2e4113b15d6de40c39a9425608f23ebf28f058f3 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 21:28:52 +0000 Subject: [PATCH 37/38] Revert "Optimize" This reverts commit f380ecf55157073e113c8376d93bcd37b86c7fdf. --- src/Nethermind.Int256/UInt256.cs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 6bdd8b8..2ea030e 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1003,7 +1003,6 @@ private static void SubtractWithBorrow(ulong a, ulong b, ref ulong borrow, out u /// The first 256‑bit unsigned integer. /// The second 256‑bit unsigned integer. /// When this method returns, contains the 256‑bit product of x and y. - [SkipLocalsInit] public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) { // If both inputs fit in 64 bits, use a simple multiplication routine. @@ -1042,17 +1041,18 @@ public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) } // Step 1: load the inputs and prepare the mask constant. - Vector512 xPermute = Vector512.Create(0ul, 0, 1, 0, 1, 2, 0, 1); - Vector512 yPermute = Vector512.Create(0ul, 1, 0, 2, 1, 0, 3, 2); - Unsafe.SkipInit(out Vector512 vecX); - Unsafe.SkipInit(out Vector512 vecY); - vecX = Avx512F.InsertVector256(vecX, Unsafe.As>(ref Unsafe.AsRef(in x)), 0); - vecY = Avx512F.InsertVector256(vecY, Unsafe.As>(ref Unsafe.AsRef(in y)), 0); + Vector256 vecX = Unsafe.As>(ref Unsafe.AsRef(in x)); + Vector256 vecY = Unsafe.As>(ref Unsafe.AsRef(in y)); Vector512 mask32 = Vector512.Create(0xFFFFFFFFUL); // Step 2: permute x and y. These operations are independent. - Vector512 xRearranged = Avx512F.PermuteVar8x64(vecX, xPermute); - Vector512 yRearranged = Avx512F.PermuteVar8x64(vecY, yPermute); + Vector256 xPerm1 = Avx2.Permute4x64(vecX, 16); // [ x0, x0, x1, x0 ] + Vector256 yPerm1 = Avx2.Permute4x64(vecY, 132); // [ y0, y1, y0, y2 ] + Vector256 xPerm2 = Avx2.Permute4x64(vecX, 73); // [ x1, x2, x0, x1 ] + Vector256 yPerm2 = Avx2.Permute4x64(vecY, 177); // [ y1, y0, y3, y2 ] + + Vector512 xRearranged = Vector512.Create(xPerm1, xPerm2); + Vector512 yRearranged = Vector512.Create(yPerm1, yPerm2); // Step 3: split each 64‑bit limb into its lower and upper 32‑bit parts. 
Vector512<ulong> xLowerParts = Avx512F.And(xRearranged, mask32); From f4045aa318cf6cb47ec3b53dd0021555a32543a6 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Feb 2025 21:37:46 +0000 Subject: [PATCH 38/38] Optimize --- src/Nethermind.Int256/UInt256.cs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 2ea030e..0905a83 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -1003,6 +1003,7 @@ private static void SubtractWithBorrow(ulong a, ulong b, ref ulong borrow, out u /// <param name="x">The first 256‑bit unsigned integer.</param> /// <param name="y">The second 256‑bit unsigned integer.</param> /// <param name="res">When this method returns, contains the 256‑bit product of x and y.</param> + [SkipLocalsInit] public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res) { // If both inputs fit in 64 bits, use a simple multiplication routine.
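For reference, the per-lane arithmetic that the AVX-512 path's prodLL/prodLH/prodHL/prodHH and termT vectors implement is the classic 64x64-to-128-bit multiply built from 32-bit halves. The scalar sketch below is illustrative only and not part of the patch series; it follows the same formulas and cross-checks them against Math.BigMul, the routine the 64-bit fast path relies on.

// Scalar reference for the 64x64 -> 128-bit product evaluated lane-wise by the
// AVX-512 path. Names mirror the vector code; values are arbitrary test inputs.
using System;

internal static class Mul64Reference
{
    private static (ulong Hi, ulong Lo) Mul64(ulong x, ulong y)
    {
        const ulong Mask32 = 0xFFFFFFFFUL;
        ulong xLo = x & Mask32, xHi = x >> 32;
        ulong yLo = y & Mask32, yHi = y >> 32;

        ulong ll = xLo * yLo; // lower x lower (prodLL)
        ulong lh = xLo * yHi; // lower x upper (prodLH)
        ulong hl = xHi * yLo; // upper x lower (prodHL)
        ulong hh = xHi * yHi; // upper x upper (prodHH)

        // termT gathers the middle 32-bit columns plus the carry out of ll.
        ulong t = (ll >> 32) + (lh & Mask32) + (hl & Mask32);

        ulong lo = (ll & Mask32) | ((t & Mask32) << 32);
        ulong hi = hh + (lh >> 32) + (hl >> 32) + (t >> 32);
        return (hi, lo);
    }

    private static void Main()
    {
        ulong x = 0xDEADBEEF_12345678UL, y = 0xFEEDFACE_87654321UL;
        (ulong hi, ulong lo) = Mul64(x, y);
        ulong refHi = Math.BigMul(x, y, out ulong refLo);
        Console.WriteLine(hi == refHi && lo == refLo); // True
    }
}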