@@ -74,8 +74,9 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef)
74
74
if ( Fma . IsSupported )
75
75
{
76
76
float * bufferStart = this . bufferPtr ;
77
- float * bufferEnd = bufferStart + ( this . Length & ~ 1 ) ;
78
- Vector256 < float > result256 = Vector256 < float > . Zero ;
77
+ float * bufferEnd = bufferStart + ( this . Length & ~ 3 ) ;
78
+ Vector256 < float > result256_0 = Vector256 < float > . Zero ;
79
+ Vector256 < float > result256_1 = Vector256 < float > . Zero ;
79
80
var mask = Vector256 . Create ( 0 , 0 , 0 , 0 , 1 , 1 , 1 , 1 ) ;
80
81
81
82
while ( bufferStart < bufferEnd )
@@ -87,19 +88,36 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef)
87
88
//
88
89
// vmovsd xmm2, [rax] ; load *(double*)bufferStart into xmm2 as [ab, _]
89
90
// vpermps ymm2, ymm1, ymm2 ; permute as a float YMM register to [a, a, a, a, b, b, b, b]
90
- // vfmadd231ps ymm0, ymm2, [r8] ; result256 = FMA(pixels, factors) + result256
91
+ // vfmadd231ps ymm0, ymm2, [r8] ; result256_0 = FMA(pixels, factors) + result256_0
91
92
//
92
93
// For tracking the codegen issue with FMA, see: https://github.com/dotnet/runtime/issues/12212.
93
- result256 = Fma . MultiplyAdd (
94
+ // Additionally, we're also unrolling two computations per loop iteration to leverage the
95
+ // fact that most CPUs have two ports to schedule multiply operations for FMA instructions.
96
+ result256_0 = Fma . MultiplyAdd (
94
97
Unsafe . As < Vector4 , Vector256 < float > > ( ref rowStartRef ) ,
95
98
Avx2 . PermuteVar8x32 ( Vector256 . CreateScalarUnsafe ( * ( double * ) bufferStart ) . AsSingle ( ) , mask ) ,
96
- result256 ) ;
99
+ result256_0 ) ;
97
100
98
- bufferStart += 2 ;
99
- rowStartRef = ref Unsafe . Add ( ref rowStartRef , 2 ) ;
101
+ result256_1 = Fma . MultiplyAdd (
102
+ Unsafe . As < Vector4 , Vector256 < float > > ( ref Unsafe . Add ( ref rowStartRef , 2 ) ) ,
103
+ Avx2 . PermuteVar8x32 ( Vector256 . CreateScalarUnsafe ( * ( double * ) ( bufferStart + 2 ) ) . AsSingle ( ) , mask ) ,
104
+ result256_1 ) ;
105
+
106
+ bufferStart += 4 ;
107
+ rowStartRef = ref Unsafe . Add ( ref rowStartRef , 4 ) ;
108
+ }
109
+
110
+ result256_0 = Avx . Add ( result256_0 , result256_1 ) ;
111
+
112
+ if ( ( this . Length & 3 ) >= 2 )
113
+ {
114
+ result256_0 = Fma . MultiplyAdd (
115
+ Unsafe . As < Vector4 , Vector256 < float > > ( ref rowStartRef ) ,
116
+ Avx2 . PermuteVar8x32 ( Vector256 . CreateScalarUnsafe ( * ( double * ) bufferStart ) . AsSingle ( ) , mask ) ,
117
+ result256_0 ) ;
100
118
}
101
119
102
- Vector128 < float > result128 = Sse . Add ( result256 . GetLower ( ) , result256 . GetUpper ( ) ) ;
120
+ Vector128 < float > result128 = Sse . Add ( result256_0 . GetLower ( ) , result256_0 . GetUpper ( ) ) ;
103
121
104
122
if ( ( this . Length & 1 ) != 0 )
105
123
{
0 commit comments