Skip to content

Commit 493d04a

Browse files
committed
Add unrolled FMA loop
1 parent 941e173 commit 493d04a

File tree

1 file changed

+26
-8
lines changed

1 file changed

+26
-8
lines changed

src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs

Lines changed: 26 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -74,8 +74,9 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef)
7474
if (Fma.IsSupported)
7575
{
7676
float* bufferStart = this.bufferPtr;
77-
float* bufferEnd = bufferStart + (this.Length & ~1);
78-
Vector256<float> result256 = Vector256<float>.Zero;
77+
float* bufferEnd = bufferStart + (this.Length & ~3);
78+
Vector256<float> result256_0 = Vector256<float>.Zero;
79+
Vector256<float> result256_1 = Vector256<float>.Zero;
7980
var mask = Vector256.Create(0, 0, 0, 0, 1, 1, 1, 1);
8081

8182
while (bufferStart < bufferEnd)
@@ -87,19 +88,36 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef)
8788
//
8889
// vmovsd xmm2, [rax] ; load *(double*)bufferStart into xmm2 as [ab, _]
8990
// vpermps ymm2, ymm1, ymm2 ; permute as a float YMM register to [a, a, a, a, b, b, b, b]
90-
// vfmadd231ps ymm0, ymm2, [r8] ; result256 = FMA(pixels, factors) + result256
91+
// vfmadd231ps ymm0, ymm2, [r8] ; result256_0 = FMA(pixels, factors) + result256_0
9192
//
9293
// For tracking the codegen issue with FMA, see: https://github.com/dotnet/runtime/issues/12212.
93-
result256 = Fma.MultiplyAdd(
94+
// Additionally, we're also unrolling two computations per loop iteration to leverage the
95+
// fact that most CPUs have two ports to schedule multiply operations for FMA instructions.
96+
result256_0 = Fma.MultiplyAdd(
9497
Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef),
9598
Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask),
96-
result256);
99+
result256_0);
97100

98-
bufferStart += 2;
99-
rowStartRef = ref Unsafe.Add(ref rowStartRef, 2);
101+
result256_1 = Fma.MultiplyAdd(
102+
Unsafe.As<Vector4, Vector256<float>>(ref Unsafe.Add(ref rowStartRef, 2)),
103+
Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)(bufferStart + 2)).AsSingle(), mask),
104+
result256_1);
105+
106+
bufferStart += 4;
107+
rowStartRef = ref Unsafe.Add(ref rowStartRef, 4);
108+
}
109+
110+
result256_0 = Avx.Add(result256_0, result256_1);
111+
112+
if ((this.Length & 3) >= 2)
113+
{
114+
result256_0 = Fma.MultiplyAdd(
115+
Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef),
116+
Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask),
117+
result256_0);
100118
}
101119

102-
Vector128<float> result128 = Sse.Add(result256.GetLower(), result256.GetUpper());
120+
Vector128<float> result128 = Sse.Add(result256_0.GetLower(), result256_0.GetUpper());
103121

104122
if ((this.Length & 1) != 0)
105123
{

0 commit comments

Comments
 (0)