From 7d7b936e1fdc7a6467658dbd442d1616c5023a94 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Mon, 10 Feb 2025 10:21:25 +0000 Subject: [PATCH] Vectorized Multiply (on Avx512) (#45) * avx512 multiply * Optimize * optimize * optimize * Optimize * Optimize * Simplify * Optimize * Optimize * optimize * Recoment and rename * Optimize * Optimize * Fix benchmarks * Temp refactor * Improved comments * Optimize * Optimize * Optimize * Optimize * Optimize * Optimize * Optimize * Optimize * Optimize * Optimize * Otpimize * Refactor * Clean up comments * Optimize * Optimize * Optimize * Optimize * Optimize * Optimize * Optimize * Revert "Optimize" This reverts commit f380ecf55157073e113c8376d93bcd37b86c7fdf. * Optimize --- src/Nethermind.Int256.Benchmark/Benchmarks.cs | 92 +++++----- .../NoIntrinsicsJobAttribute.cs | 4 +- src/Nethermind.Int256/UInt256.cs | 172 ++++++++++++++++-- 3 files changed, 206 insertions(+), 62 deletions(-) diff --git a/src/Nethermind.Int256.Benchmark/Benchmarks.cs b/src/Nethermind.Int256.Benchmark/Benchmarks.cs index 616aac1..b22a1da 100644 --- a/src/Nethermind.Int256.Benchmark/Benchmarks.cs +++ b/src/Nethermind.Int256.Benchmark/Benchmarks.cs @@ -89,8 +89,8 @@ public class SignedIntTwoParamBenchmarkBase : SignedBenchmarkBase public (int, Int256) D; } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class AddUnsigned : UnsignedTwoParamBenchmarkBase { @@ -108,8 +108,8 @@ public UInt256 Add_UInt256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class AddSigned : SignedTwoParamBenchmarkBase { @@ -127,8 +127,8 @@ public Int256 Add_Int256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - 
[NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class SubtractUnsigned : UnsignedTwoParamBenchmarkBase { @@ -146,8 +146,8 @@ public UInt256 Subtract_UInt256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class SubtractSigned : SignedTwoParamBenchmarkBase { @@ -165,8 +165,8 @@ public Int256 Subtract_Int256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class AddModUnsinged : UnsignedThreeParamBenchmarkBase { @@ -184,8 +184,8 @@ public UInt256 AddMod_UInt256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class AddModSinged : SignedThreeParamBenchmarkBase { @@ -203,8 +203,8 @@ public Int256 AddMod_Int256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class SubtractModUnsinged : UnsignedThreeParamBenchmarkBase { @@ -222,8 +222,8 @@ public UInt256 SubtractMod_UInt256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class SubtractModSigned : SignedThreeParamBenchmarkBase { @@ -241,8 +241,8 @@ public Int256 SubtractMod_Int256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - 
[NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class MultiplyUnsigned : UnsignedTwoParamBenchmarkBase { @@ -260,8 +260,8 @@ public UInt256 Multiply_UInt256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class MultiplySigned : SignedTwoParamBenchmarkBase { @@ -279,8 +279,8 @@ public Int256 Multiply_Int256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class MultiplyModUnsigned : UnsignedThreeParamBenchmarkBase { @@ -298,8 +298,8 @@ public UInt256 MultiplyMod_UInt256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class MultiplyModSigned : SignedThreeParamBenchmarkBase { @@ -317,8 +317,8 @@ public Int256 MultiplyMod_Int256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class DivideUnsigned : UnsignedTwoParamBenchmarkBase { @@ -336,8 +336,8 @@ public UInt256 Divide_UInt256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class DivideSigned : SignedTwoParamBenchmarkBase { @@ -355,8 +355,8 @@ public Int256 Divide_Int256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + 
[SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class ExpUnsigned : UnsignedIntTwoParamBenchmarkBase { @@ -374,8 +374,8 @@ public UInt256 Exp_UInt256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class ExpSigned : SignedIntTwoParamBenchmarkBase { @@ -393,8 +393,8 @@ public Int256 Exp_Int256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class ExpModUnsigned : UnsignedThreeParamBenchmarkBase { @@ -412,8 +412,8 @@ public UInt256 ExpMod_UInt256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class ExpModSigned : SignedBenchmarkBase { @@ -440,8 +440,8 @@ public Int256 ExpMod_Int256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class LeftShiftUnsigned : UnsignedIntTwoParamBenchmarkBase { @@ -459,8 +459,8 @@ public UInt256 LeftShift_UInt256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class LeftShiftSigned : SignedIntTwoParamBenchmarkBase { @@ -478,8 +478,8 @@ public Int256 LeftShift_Int256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + 
[NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class RightShiftUnsigned : UnsignedIntTwoParamBenchmarkBase { @@ -497,8 +497,8 @@ public UInt256 RightShift_UInt256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class RightShiftSigned : SignedIntTwoParamBenchmarkBase { @@ -516,8 +516,8 @@ public Int256 RightShift_Int256() } } - [SimpleJob(RuntimeMoniker.Net70, baseline: true)] - [NoIntrinsicsJob(RuntimeMoniker.Net70)] + [SimpleJob(RuntimeMoniker.Net90, baseline: true)] + [NoIntrinsicsJob(RuntimeMoniker.Net90)] [MemoryDiagnoser] public class IsZeroOne { diff --git a/src/Nethermind.Int256.Benchmark/NoIntrinsicsJobAttribute.cs b/src/Nethermind.Int256.Benchmark/NoIntrinsicsJobAttribute.cs index d90679e..f3307cb 100644 --- a/src/Nethermind.Int256.Benchmark/NoIntrinsicsJobAttribute.cs +++ b/src/Nethermind.Int256.Benchmark/NoIntrinsicsJobAttribute.cs @@ -1,4 +1,4 @@ -using System; +using System; using BenchmarkDotNet.Attributes; using BenchmarkDotNet.Jobs; @@ -116,6 +116,8 @@ internal static Runtime GetRuntime(this RuntimeMoniker runtimeMoniker) return CoreRuntime.Core70; case RuntimeMoniker.Net80: return CoreRuntime.Core80; + case RuntimeMoniker.Net90: + return CoreRuntime.Core90; case RuntimeMoniker.Mono: return MonoRuntime.Default; case RuntimeMoniker.NativeAot60: diff --git a/src/Nethermind.Int256/UInt256.cs b/src/Nethermind.Int256/UInt256.cs index 75a4c73..0905a83 100644 --- a/src/Nethermind.Int256/UInt256.cs +++ b/src/Nethermind.Int256/UInt256.cs @@ -996,10 +996,17 @@ private static void SubtractWithBorrow(ulong a, ulong b, ref ulong borrow, out u res = a - b - borrow; borrow = (((~a) & b) | (~(a ^ b)) & res) >> 63; } - - // Multiply sets res to the product x*y + /// + /// Multiplies two 256‑bit unsigned integers ( and ) and + /// writes the 256‑bit product to . 
/// <summary>
/// Multiplies two 256-bit unsigned integers (<paramref name="x"/> and <paramref name="y"/>) and
/// writes the low 256 bits of the product to <paramref name="res"/>.
/// </summary>
/// <remarks>
/// Three code paths are used:
/// <list type="bullet">
/// <item>both operands fit in 64 bits: a single 64x64 -> 128-bit multiply;</item>
/// <item>AVX-512 (F, DQ and their VL subsets) unavailable: scalar schoolbook multiplication;</item>
/// <item>otherwise: eight 64x64 -> 128-bit partial products are computed in one 512-bit vector
/// and then summed limb by limb with explicit carry propagation.</item>
/// </list>
/// Bits above 2^256 are discarded.
/// </remarks>
/// <param name="x">The first 256-bit unsigned integer.</param>
/// <param name="y">The second 256-bit unsigned integer.</param>
/// <param name="res">When this method returns, contains the 256-bit product of x and y.</param>
[SkipLocalsInit]
public static void Multiply(in UInt256 x, in UInt256 y, out UInt256 res)
{
    // If both inputs fit in 64 bits, use a simple multiplication routine.
    if ((x.u1 | x.u2 | x.u3 | y.u1 | y.u2 | y.u3) == 0)
    {
        // Fast multiply for numbers less than 2^64 (18,446,744,073,709,551,615)
        ulong high = Math.BigMul(x.u0, y.u0, out ulong low);
        // Write the result only after both inputs have been read, in case res aliases x or y.
        res = default;
        Unsafe.AsRef(in res.u0) = low;
        Unsafe.AsRef(in res.u1) = high;
        return;
    }

    // Fallback if any of the required AVX-512 instruction subsets is not supported.
    // The vector path below uses Avx512F, Avx512DQ and the 128-bit forms exposed through
    // Avx512F.VL / Avx512DQ.VL, so all of them have to be checked.
    if (!Avx512F.IsSupported || !Avx512DQ.IsSupported ||
        !Avx512F.VL.IsSupported || !Avx512DQ.VL.IsSupported)
    {
        ref ulong rx = ref Unsafe.As<UInt256, ulong>(ref Unsafe.AsRef(in x));
        ref ulong ry = ref Unsafe.As<UInt256, ulong>(ref Unsafe.AsRef(in y));

        // Row 0: x * y0
        (ulong carry, ulong r0) = Multiply64(rx, ry);
        UmulHop(carry, Unsafe.Add(ref rx, 1), ry, out carry, out ulong res1);
        UmulHop(carry, Unsafe.Add(ref rx, 2), ry, out carry, out ulong res2);
        ulong res3 = Unsafe.Add(ref rx, 3) * ry + carry;

        // Row 1: x * y1, shifted by one limb
        UmulHop(res1, rx, Unsafe.Add(ref ry, 1), out carry, out ulong r1);
        UmulStep(res2, Unsafe.Add(ref rx, 1), Unsafe.Add(ref ry, 1), carry, out carry, out res2);
        res3 = res3 + Unsafe.Add(ref rx, 2) * Unsafe.Add(ref ry, 1) + carry;

        // Row 2: x * y2, shifted by two limbs
        UmulHop(res2, rx, Unsafe.Add(ref ry, 2), out carry, out ulong r2);
        res3 = res3 + Unsafe.Add(ref rx, 1) * Unsafe.Add(ref ry, 2) + carry;

        // Row 3: only the low 64 bits of x0 * y3 land inside 256 bits.
        ulong r3 = res3 + rx * Unsafe.Add(ref ry, 3);

        res = new UInt256(r0, r1, r2, r3);
        return;
    }

    // Step 1: load the inputs and prepare the mask constant.
    Vector256<ulong> vecX = Unsafe.As<UInt256, Vector256<ulong>>(ref Unsafe.AsRef(in x));
    Vector256<ulong> vecY = Unsafe.As<UInt256, Vector256<ulong>>(ref Unsafe.AsRef(in y));
    Vector512<ulong> mask32 = Vector512.Create(0xFFFFFFFFUL);

    // Step 2: permute x and y. These operations are independent.
    // Lane i of the combined vectors holds the operands of partial product Pi:
    //   P0 = x0*y0, P1 = x0*y1, P2 = x1*y0, P3 = x0*y2,
    //   P4 = x1*y1, P5 = x2*y0, P6 = x0*y3, P7 = x1*y2.
    Vector256<ulong> xPerm1 = Avx2.Permute4x64(vecX, 16);  // [ x0, x0, x1, x0 ]
    Vector256<ulong> yPerm1 = Avx2.Permute4x64(vecY, 132); // [ y0, y1, y0, y2 ]
    Vector256<ulong> xPerm2 = Avx2.Permute4x64(vecX, 73);  // [ x1, x2, x0, x1 ]
    Vector256<ulong> yPerm2 = Avx2.Permute4x64(vecY, 177); // [ y1, y0, y3, y2 ]

    Vector512<ulong> xRearranged = Vector512.Create(xPerm1, xPerm2);
    Vector512<ulong> yRearranged = Vector512.Create(yPerm1, yPerm2);

    // Step 3: split each 64-bit limb into its lower and upper 32-bit parts.
    Vector512<ulong> xLowerParts = Avx512F.And(xRearranged, mask32);
    Vector512<ulong> yLowerParts = Avx512F.And(yRearranged, mask32);
    Vector512<ulong> xUpperParts = Avx512F.ShiftRightLogical(xRearranged, 32);
    Vector512<ulong> yUpperParts = Avx512F.ShiftRightLogical(yRearranged, 32);

    // Step 4: four 32x32-bit multiplications. Both factors are below 2^32, so the
    // 64-bit "low" multiply yields each full product.
    Vector512<ulong> prodLL = Avx512DQ.MultiplyLow(xLowerParts, yLowerParts); // lower x lower
    Vector512<ulong> prodLH = Avx512DQ.MultiplyLow(xLowerParts, yUpperParts); // lower x upper
    Vector512<ulong> prodHL = Avx512DQ.MultiplyLow(xUpperParts, yLowerParts); // upper x lower
    Vector512<ulong> prodHH = Avx512DQ.MultiplyLow(xUpperParts, yUpperParts); // upper x upper

    // Step 5: middle term of the 64x64 product: everything that lands in bits 32..63,
    // plus the carries it generates (bits 32 and above of termT).
    Vector512<ulong> prodLL_hi = Avx512F.ShiftRightLogical(prodLL, 32);
    Vector512<ulong> prodLH_lo = Avx512F.And(prodLH, mask32);
    Vector512<ulong> prodHL_lo = Avx512F.And(prodHL, mask32);
    Vector512<ulong> termT = Avx512F.Add(Avx512F.Add(prodLL_hi, prodLH_lo), prodHL_lo);

    // Step 6: assemble the low and high 64 bits of every 128-bit partial product.
    Vector512<ulong> lowerPartial =
        Avx512F.Or(
            Avx512F.And(prodLL, mask32),
            Avx512F.ShiftLeftLogical(Avx512F.And(termT, mask32), 32));
    Vector512<ulong> higherPartial =
        Avx512F.Add(
            Avx512F.Add(
                Avx512F.Add(prodHH, Avx512F.ShiftRightLogical(prodLH, 32)),
                Avx512F.ShiftRightLogical(prodHL, 32)),
            Avx512F.ShiftRightLogical(termT, 32));

    // Step 7: interleave low/high halves so each 128-bit lane is one [lo, hi] product.
    Vector512<ulong> productLow = Avx512F.UnpackLow(lowerPartial, higherPartial);  // P0, P2, P4, P6
    Vector512<ulong> productHi = Avx512F.UnpackHigh(lowerPartial, higherPartial);  // P1, P3, P5, P7

    // Step 8: extract the 128-bit products that contribute below limb 3.
    Vector128<ulong> product0 = Avx512F.ExtractVector128(productLow, 0); // x0*y0 -> limbs 0,1
    Vector128<ulong> product1 = Avx512F.ExtractVector128(productHi, 0);  // x0*y1 -> limbs 1,2
    Vector128<ulong> product2 = Avx512F.ExtractVector128(productLow, 1); // x1*y0 -> limbs 1,2
    Vector128<ulong> product3 = Avx512F.ExtractVector128(productHi, 1);  // x0*y2 -> limbs 2,3
    Vector128<ulong> product4 = Avx512F.ExtractVector128(productLow, 2); // x1*y1 -> limbs 2,3
    Vector128<ulong> product5 = Avx512F.ExtractVector128(productHi, 2);  // x2*y0 -> limbs 2,3

    // Step 9: load the operands of the two remaining limb-3 products (x2*y1 and x3*y0).
    Vector128<ulong> xHigh = Vector128.Create(x.u2, x.u3);
    Vector128<ulong> yLow = Vector128.Create(y.u1, y.u0);

    // Step 10: add the limb-1 cross terms (product1 + product2) onto product0.
    Vector128<ulong> crossSum = Add128(product1, product2);
    Vector128<ulong> crossAddMask = Sse2.UnpackLow(Vector128<ulong>.Zero, crossSum); // [0, crossSum.lo]
    Vector128<ulong> updatedProduct0 = Sse2.Add(product0, crossAddMask);             // limbs 0 and 1

    // Carry out of limb 1 (from adding crossSum's low 64 bits to product0's high half).
    Vector128<ulong> product0HighBefore = Sse2.UnpackHigh(product0, product0);
    Vector128<ulong> product0HighAfter = Sse2.UnpackHigh(updatedProduct0, updatedProduct0);
    Vector128<ulong> carryFlag =
        Sse2.ShiftRightLogical(
            Avx512F.VL.CompareLessThan(product0HighAfter, product0HighBefore),
            63);

    Vector128<ulong> crossSumHigh = Sse2.UnpackHigh(crossSum, crossSum);
    Vector128<ulong> limb2 = Sse2.Add(crossSumHigh, carryFlag);

    // Carry out of the 128-bit addition product1 + product2 (Add128 discards it).
    Vector128<ulong> crossCarry =
        Sse2.ShiftRightLogical(
            Avx512F.VL.CompareGreaterThan(Sse2.UnpackHigh(product1, product1), crossSumHigh),
            63);
    // Carry out of limb 2 itself: crossSumHigh + carryFlag wraps when crossSumHigh is
    // ulong.MaxValue and carryFlag is 1. Without this term the top limb is one too small,
    // e.g. for x = { ulong.MaxValue, 3, 0, 0 } and y = { 1UL << 63, ulong.MaxValue, 0, 0 }.
    Vector128<ulong> limb2Carry =
        Sse2.ShiftRightLogical(
            Avx512F.VL.CompareLessThan(limb2, crossSumHigh),
            63);
    Vector128<ulong> limb3 = Sse2.Add(crossCarry, limb2Carry);
    Vector128<ulong> upperIntermediate = Sse2.UnpackLow(limb2, limb3); // [limb2, limb3]

    // Step 11: add the products that start at limb 2.
    Vector128<ulong> group2Sum = Add128(product3, product4);
    Vector128<ulong> totalGroup2 = Add128(group2Sum, product5);
    Vector128<ulong> newHalf = Add128(upperIntermediate, totalGroup2);

    // Step 12: limb-3-only terms. Only the low 64 bits of x2*y1, x3*y0, x0*y3 and x1*y2
    // fall inside 256 bits; sum them and add the total to the high lane.
    Vector128<ulong> finalProdLow = Avx512DQ.VL.MultiplyLow(xHigh, yLow);         // [x2*y1, x3*y0]
    Vector128<ulong> extraLow = Avx512F.ExtractVector128(lowerPartial, 3);        // [lo(x0*y3), lo(x1*y2)]
    finalProdLow = Sse2.Add(finalProdLow, extraLow);
    Vector128<ulong> swappedFinal = Sse2.UnpackLow(finalProdLow, finalProdLow);   // [f0, f0]
    Vector128<ulong> horizontalSum = Sse2.Add(finalProdLow, swappedFinal);        // [2*f0, f0 + f1]
    Vector128<ulong> highCarry = Sse2.UnpackHigh(Vector128<ulong>.Zero, horizontalSum); // [0, f0 + f1]
    newHalf = Sse2.Add(newHalf, highCarry);

    // Combine the results into the final 256-bit value.
    Vector256<ulong> finalResult = Vector256.Create(updatedProduct0, newHalf);
    Unsafe.SkipInit(out res);
    Unsafe.As<UInt256, Vector256<ulong>>(ref res) = finalResult;

    /// <summary>
    /// Adds two 128-bit unsigned integers, propagating the carry from the lower 64-bit lane
    /// into the higher lane. Element 0 of each vector holds the lower 64 bits and element 1
    /// the higher 64 bits. A carry out of the higher lane is discarded.
    /// </summary>
    /// <param name="left">The first 128-bit unsigned integer operand.</param>
    /// <param name="right">The second 128-bit unsigned integer operand.</param>
    /// <returns>The sum of the two operands modulo 2^128.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    static Vector128<ulong> Add128(Vector128<ulong> left, Vector128<ulong> right)
    {
        // Lane-wise 64-bit addition of the two operands.
        Vector128<ulong> sum = Sse2.Add(left, right);

        // An unsigned lane overflowed exactly when its result is smaller than an operand.
        // The comparison yields all ones in such a lane and zero otherwise.
        Vector128<ulong> overflowMask = Avx512F.VL.CompareLessThan(sum, left);

        // Turn the all-ones mask (0xFFFFFFFFFFFFFFFF) into the value 1.
        overflowMask = Sse2.ShiftRightLogical(overflowMask, 63);

        // Move the low lane's carry bit into the high lane and zero the low lane:
        // UnpackLow(zero, mask) = [0, mask.lo]. The high lane's own overflow bit is dropped.
        Vector128<ulong> promotedCarry = Sse2.UnpackLow(Vector128<ulong>.Zero, overflowMask);

        // Add the propagated carry to the sum.
        return Sse2.Add(sum, promotedCarry);
    }
}

/// <summary>
/// Multiplies this value by <paramref name="a"/> and writes the 256-bit product to <paramref name="res"/>.
/// </summary>
/// <param name="a">The value to multiply this instance by.</param>
/// <param name="res">When this method returns, contains the product.</param>
public void Multiply(in UInt256 a, out UInt256 res) => Multiply(this, a, out res);