@@ -86,153 +86,150 @@ unsafe fn convolve_vertical_avx2_row_impl(
86
86
87
87
let mut rem = dst;
88
88
89
- #[ cfg( target_arch = "x86_64" ) ]
90
- {
91
- let iter_64 = rem. chunks_exact_mut ( 64 ) ;
92
-
93
- for dst in iter_64 {
94
- let mut store0 = _mm256_set1_epi16 ( ROUNDING ) ;
95
- let mut store1 = _mm256_set1_epi16 ( ROUNDING ) ;
96
- let mut store2 = _mm256_set1_epi16 ( ROUNDING ) ;
97
- let mut store3 = _mm256_set1_epi16 ( ROUNDING ) ;
98
-
99
- let px = cx;
100
-
101
- if bounds_size == 2 {
102
- let py = bounds. start ;
103
- let weights = weight. get_unchecked ( 0 ..2 ) ;
104
- let v_weight0 = _mm256_set1_epi16 ( weights[ 0 ] ) ;
105
- let v_weight1 = _mm256_set1_epi16 ( weights[ 1 ] ) ;
106
- let v_offset0 = src_stride * py + px;
107
- let src_ptr0 = src. get_unchecked ( v_offset0..) ;
108
- let v_offset1 = src_stride * ( py + 1 ) + px;
109
- let src_ptr1 = src. get_unchecked ( v_offset1..) ;
110
-
111
- let item_row0 = _mm256_loadu_si256 ( src_ptr0. as_ptr ( ) as * const __m256i ) ;
112
- let item_row1 =
113
- _mm256_loadu_si256 ( src_ptr0. get_unchecked ( 32 ..) . as_ptr ( ) as * const __m256i ) ;
114
-
115
- ( store0, store1) = m256dot ( store0, store1, item_row0, v_weight0) ;
116
- ( store2, store3) = m256dot ( store2, store3, item_row1, v_weight0) ;
117
-
118
- let item_row10 = _mm256_loadu_si256 ( src_ptr1. as_ptr ( ) as * const __m256i ) ;
119
- let item_row11 =
120
- _mm256_loadu_si256 ( src_ptr1. get_unchecked ( 32 ..) . as_ptr ( ) as * const __m256i ) ;
121
-
122
- ( store0, store1) = m256dot ( store0, store1, item_row10, v_weight1) ;
123
- ( store2, store3) = m256dot ( store2, store3, item_row11, v_weight1) ;
124
- } else if bounds_size == 3 {
125
- let py = bounds. start ;
126
- let weights = weight. get_unchecked ( 0 ..3 ) ;
127
- let v_weight0 = _mm256_set1_epi16 ( weights[ 0 ] ) ;
128
- let v_weight1 = _mm256_set1_epi16 ( weights[ 1 ] ) ;
129
- let v_weight2 = _mm256_set1_epi16 ( weights[ 2 ] ) ;
130
- let v_offset0 = src_stride * py + px;
131
- let src_ptr0 = src. get_unchecked ( v_offset0..) ;
132
- let v_offset1 = src_stride * ( py + 1 ) + px;
133
- let src_ptr1 = src. get_unchecked ( v_offset1..) ;
134
- let v_offset2 = src_stride * ( py + 2 ) + px;
135
- let src_ptr2 = src. get_unchecked ( v_offset2..) ;
136
-
137
- let item_row0 = _mm256_loadu_si256 ( src_ptr0. as_ptr ( ) as * const __m256i ) ;
138
- let item_row1 =
139
- _mm256_loadu_si256 ( src_ptr0. get_unchecked ( 32 ..) . as_ptr ( ) as * const __m256i ) ;
140
-
141
- ( store0, store1) = m256dot ( store0, store1, item_row0, v_weight0) ;
142
- ( store2, store3) = m256dot ( store2, store3, item_row1, v_weight0) ;
143
-
144
- let item_row10 = _mm256_loadu_si256 ( src_ptr1. as_ptr ( ) as * const __m256i ) ;
145
- let item_row11 =
146
- _mm256_loadu_si256 ( src_ptr1. get_unchecked ( 32 ..) . as_ptr ( ) as * const __m256i ) ;
147
-
148
- ( store0, store1) = m256dot ( store0, store1, item_row10, v_weight1) ;
149
- ( store2, store3) = m256dot ( store2, store3, item_row11, v_weight1) ;
150
-
151
- let item_row20 = _mm256_loadu_si256 ( src_ptr2. as_ptr ( ) as * const __m256i ) ;
152
- let item_row21 =
153
- _mm256_loadu_si256 ( src_ptr2. get_unchecked ( 32 ..) . as_ptr ( ) as * const __m256i ) ;
154
-
155
- ( store0, store1) = m256dot ( store0, store1, item_row20, v_weight2) ;
156
- ( store2, store3) = m256dot ( store2, store3, item_row21, v_weight2) ;
157
- } else if bounds_size == 4 {
158
- let py = bounds. start ;
159
- let weights = weight. get_unchecked ( 0 ..4 ) ;
160
- let v_weight0 = _mm256_set1_epi16 ( weights[ 0 ] ) ;
161
- let v_weight1 = _mm256_set1_epi16 ( weights[ 1 ] ) ;
162
- let v_weight2 = _mm256_set1_epi16 ( weights[ 2 ] ) ;
163
- let v_weight3 = _mm256_set1_epi16 ( weights[ 3 ] ) ;
164
- let v_offset0 = src_stride * py + px;
165
- let src_ptr0 = src. get_unchecked ( v_offset0..) ;
166
- let v_offset1 = src_stride * ( py + 1 ) + px;
167
- let src_ptr1 = src. get_unchecked ( v_offset1..) ;
168
- let v_offset2 = src_stride * ( py + 2 ) + px;
169
- let src_ptr2 = src. get_unchecked ( v_offset2..) ;
170
- let v_offset3 = src_stride * ( py + 3 ) + px;
171
- let src_ptr3 = src. get_unchecked ( v_offset3..) ;
172
-
173
- let item_row0 = _mm256_loadu_si256 ( src_ptr0. as_ptr ( ) as * const __m256i ) ;
174
- let item_row1 =
175
- _mm256_loadu_si256 ( src_ptr0. get_unchecked ( 32 ..) . as_ptr ( ) as * const __m256i ) ;
176
-
177
- ( store0, store1) = m256dot ( store0, store1, item_row0, v_weight0) ;
178
- ( store2, store3) = m256dot ( store2, store3, item_row1, v_weight0) ;
179
-
180
- let item_row10 = _mm256_loadu_si256 ( src_ptr1. as_ptr ( ) as * const __m256i ) ;
181
- let item_row11 =
182
- _mm256_loadu_si256 ( src_ptr1. get_unchecked ( 32 ..) . as_ptr ( ) as * const __m256i ) ;
183
-
184
- ( store0, store1) = m256dot ( store0, store1, item_row10, v_weight1) ;
185
- ( store2, store3) = m256dot ( store2, store3, item_row11, v_weight1) ;
186
-
187
- let item_row20 = _mm256_loadu_si256 ( src_ptr2. as_ptr ( ) as * const __m256i ) ;
188
- let item_row21 =
189
- _mm256_loadu_si256 ( src_ptr2. get_unchecked ( 32 ..) . as_ptr ( ) as * const __m256i ) ;
190
-
191
- ( store0, store1) = m256dot ( store0, store1, item_row20, v_weight2) ;
192
- ( store2, store3) = m256dot ( store2, store3, item_row21, v_weight2) ;
193
-
194
- let item_row30 = _mm256_loadu_si256 ( src_ptr3. as_ptr ( ) as * const __m256i ) ;
195
- let item_row31 =
196
- _mm256_loadu_si256 ( src_ptr3. get_unchecked ( 32 ..) . as_ptr ( ) as * const __m256i ) ;
197
-
198
- ( store0, store1) = m256dot ( store0, store1, item_row30, v_weight3) ;
199
- ( store2, store3) = m256dot ( store2, store3, item_row31, v_weight3) ;
200
- } else {
201
- for j in 0 ..bounds_size {
202
- let py = bounds. start + j;
203
- let weight = weight. get_unchecked ( j..( j + 1 ) ) ;
204
- let v_weight = _mm256_set1_epi16 ( weight[ 0 ] ) ;
205
- let v_offset = src_stride * py + px;
206
- let src_ptr = src. get_unchecked ( v_offset..) ;
207
- let item_row0 = _mm256_loadu_si256 ( src_ptr. as_ptr ( ) as * const __m256i ) ;
208
- let item_row1 =
209
- _mm256_loadu_si256 ( src_ptr. get_unchecked ( 32 ..) . as_ptr ( ) as * const __m256i ) ;
210
-
211
- ( store0, store1) = m256dot ( store0, store1, item_row0, v_weight) ;
212
- ( store2, store3) = m256dot ( store2, store3, item_row1, v_weight) ;
213
- }
214
- }
89
+ let iter_64 = rem. chunks_exact_mut ( 64 ) ;
215
90
216
- let rebased0 = _mm256_srai_epi16 :: < R_SHR_SCALE > ( store0) ;
217
- let rebased1 = _mm256_srai_epi16 :: < R_SHR_SCALE > ( store1) ;
218
- let rebased2 = _mm256_srai_epi16 :: < R_SHR_SCALE > ( store2) ;
219
- let rebased3 = _mm256_srai_epi16 :: < R_SHR_SCALE > ( store3) ;
91
+ for dst in iter_64 {
92
+ let mut store0 = _mm256_set1_epi16 ( ROUNDING ) ;
93
+ let mut store1 = _mm256_set1_epi16 ( ROUNDING ) ;
94
+ let mut store2 = _mm256_set1_epi16 ( ROUNDING ) ;
95
+ let mut store3 = _mm256_set1_epi16 ( ROUNDING ) ;
220
96
221
- let shrank0 = _mm256_packus_epi16 ( rebased0, rebased1) ;
222
- let shrank1 = _mm256_packus_epi16 ( rebased2, rebased3) ;
97
+ let px = cx;
223
98
224
- _mm256_storeu_si256 ( dst. as_mut_ptr ( ) as * mut __m256i , shrank0) ;
225
- _mm256_storeu_si256 (
226
- dst. get_unchecked_mut ( 32 ..) . as_mut_ptr ( ) as * mut __m256i ,
227
- shrank1,
228
- ) ;
99
+ if bounds_size == 2 {
100
+ let py = bounds. start ;
101
+ let weights = weight. get_unchecked ( 0 ..2 ) ;
102
+ let v_weight0 = _mm256_set1_epi16 ( weights[ 0 ] ) ;
103
+ let v_weight1 = _mm256_set1_epi16 ( weights[ 1 ] ) ;
104
+ let v_offset0 = src_stride * py + px;
105
+ let src_ptr0 = src. get_unchecked ( v_offset0..) ;
106
+ let v_offset1 = src_stride * ( py + 1 ) + px;
107
+ let src_ptr1 = src. get_unchecked ( v_offset1..) ;
108
+
109
+ let item_row0 = _mm256_loadu_si256 ( src_ptr0. as_ptr ( ) as * const __m256i ) ;
110
+ let item_row1 =
111
+ _mm256_loadu_si256 ( src_ptr0. get_unchecked ( 32 ..) . as_ptr ( ) as * const __m256i ) ;
112
+
113
+ ( store0, store1) = m256dot ( store0, store1, item_row0, v_weight0) ;
114
+ ( store2, store3) = m256dot ( store2, store3, item_row1, v_weight0) ;
115
+
116
+ let item_row10 = _mm256_loadu_si256 ( src_ptr1. as_ptr ( ) as * const __m256i ) ;
117
+ let item_row11 =
118
+ _mm256_loadu_si256 ( src_ptr1. get_unchecked ( 32 ..) . as_ptr ( ) as * const __m256i ) ;
229
119
230
- cx += 64 ;
120
+ ( store0, store1) = m256dot ( store0, store1, item_row10, v_weight1) ;
121
+ ( store2, store3) = m256dot ( store2, store3, item_row11, v_weight1) ;
122
+ } else if bounds_size == 3 {
123
+ let py = bounds. start ;
124
+ let weights = weight. get_unchecked ( 0 ..3 ) ;
125
+ let v_weight0 = _mm256_set1_epi16 ( weights[ 0 ] ) ;
126
+ let v_weight1 = _mm256_set1_epi16 ( weights[ 1 ] ) ;
127
+ let v_weight2 = _mm256_set1_epi16 ( weights[ 2 ] ) ;
128
+ let v_offset0 = src_stride * py + px;
129
+ let src_ptr0 = src. get_unchecked ( v_offset0..) ;
130
+ let v_offset1 = src_stride * ( py + 1 ) + px;
131
+ let src_ptr1 = src. get_unchecked ( v_offset1..) ;
132
+ let v_offset2 = src_stride * ( py + 2 ) + px;
133
+ let src_ptr2 = src. get_unchecked ( v_offset2..) ;
134
+
135
+ let item_row0 = _mm256_loadu_si256 ( src_ptr0. as_ptr ( ) as * const __m256i ) ;
136
+ let item_row1 =
137
+ _mm256_loadu_si256 ( src_ptr0. get_unchecked ( 32 ..) . as_ptr ( ) as * const __m256i ) ;
138
+
139
+ ( store0, store1) = m256dot ( store0, store1, item_row0, v_weight0) ;
140
+ ( store2, store3) = m256dot ( store2, store3, item_row1, v_weight0) ;
141
+
142
+ let item_row10 = _mm256_loadu_si256 ( src_ptr1. as_ptr ( ) as * const __m256i ) ;
143
+ let item_row11 =
144
+ _mm256_loadu_si256 ( src_ptr1. get_unchecked ( 32 ..) . as_ptr ( ) as * const __m256i ) ;
145
+
146
+ ( store0, store1) = m256dot ( store0, store1, item_row10, v_weight1) ;
147
+ ( store2, store3) = m256dot ( store2, store3, item_row11, v_weight1) ;
148
+
149
+ let item_row20 = _mm256_loadu_si256 ( src_ptr2. as_ptr ( ) as * const __m256i ) ;
150
+ let item_row21 =
151
+ _mm256_loadu_si256 ( src_ptr2. get_unchecked ( 32 ..) . as_ptr ( ) as * const __m256i ) ;
152
+
153
+ ( store0, store1) = m256dot ( store0, store1, item_row20, v_weight2) ;
154
+ ( store2, store3) = m256dot ( store2, store3, item_row21, v_weight2) ;
155
+ } else if bounds_size == 4 {
156
+ let py = bounds. start ;
157
+ let weights = weight. get_unchecked ( 0 ..4 ) ;
158
+ let v_weight0 = _mm256_set1_epi16 ( weights[ 0 ] ) ;
159
+ let v_weight1 = _mm256_set1_epi16 ( weights[ 1 ] ) ;
160
+ let v_weight2 = _mm256_set1_epi16 ( weights[ 2 ] ) ;
161
+ let v_weight3 = _mm256_set1_epi16 ( weights[ 3 ] ) ;
162
+ let v_offset0 = src_stride * py + px;
163
+ let src_ptr0 = src. get_unchecked ( v_offset0..) ;
164
+ let v_offset1 = src_stride * ( py + 1 ) + px;
165
+ let src_ptr1 = src. get_unchecked ( v_offset1..) ;
166
+ let v_offset2 = src_stride * ( py + 2 ) + px;
167
+ let src_ptr2 = src. get_unchecked ( v_offset2..) ;
168
+ let v_offset3 = src_stride * ( py + 3 ) + px;
169
+ let src_ptr3 = src. get_unchecked ( v_offset3..) ;
170
+
171
+ let item_row0 = _mm256_loadu_si256 ( src_ptr0. as_ptr ( ) as * const __m256i ) ;
172
+ let item_row1 =
173
+ _mm256_loadu_si256 ( src_ptr0. get_unchecked ( 32 ..) . as_ptr ( ) as * const __m256i ) ;
174
+
175
+ ( store0, store1) = m256dot ( store0, store1, item_row0, v_weight0) ;
176
+ ( store2, store3) = m256dot ( store2, store3, item_row1, v_weight0) ;
177
+
178
+ let item_row10 = _mm256_loadu_si256 ( src_ptr1. as_ptr ( ) as * const __m256i ) ;
179
+ let item_row11 =
180
+ _mm256_loadu_si256 ( src_ptr1. get_unchecked ( 32 ..) . as_ptr ( ) as * const __m256i ) ;
181
+
182
+ ( store0, store1) = m256dot ( store0, store1, item_row10, v_weight1) ;
183
+ ( store2, store3) = m256dot ( store2, store3, item_row11, v_weight1) ;
184
+
185
+ let item_row20 = _mm256_loadu_si256 ( src_ptr2. as_ptr ( ) as * const __m256i ) ;
186
+ let item_row21 =
187
+ _mm256_loadu_si256 ( src_ptr2. get_unchecked ( 32 ..) . as_ptr ( ) as * const __m256i ) ;
188
+
189
+ ( store0, store1) = m256dot ( store0, store1, item_row20, v_weight2) ;
190
+ ( store2, store3) = m256dot ( store2, store3, item_row21, v_weight2) ;
191
+
192
+ let item_row30 = _mm256_loadu_si256 ( src_ptr3. as_ptr ( ) as * const __m256i ) ;
193
+ let item_row31 =
194
+ _mm256_loadu_si256 ( src_ptr3. get_unchecked ( 32 ..) . as_ptr ( ) as * const __m256i ) ;
195
+
196
+ ( store0, store1) = m256dot ( store0, store1, item_row30, v_weight3) ;
197
+ ( store2, store3) = m256dot ( store2, store3, item_row31, v_weight3) ;
198
+ } else {
199
+ for j in 0 ..bounds_size {
200
+ let py = bounds. start + j;
201
+ let weight = weight. get_unchecked ( j..( j + 1 ) ) ;
202
+ let v_weight = _mm256_set1_epi16 ( weight[ 0 ] ) ;
203
+ let v_offset = src_stride * py + px;
204
+ let src_ptr = src. get_unchecked ( v_offset..) ;
205
+ let item_row0 = _mm256_loadu_si256 ( src_ptr. as_ptr ( ) as * const __m256i ) ;
206
+ let item_row1 =
207
+ _mm256_loadu_si256 ( src_ptr. get_unchecked ( 32 ..) . as_ptr ( ) as * const __m256i ) ;
208
+
209
+ ( store0, store1) = m256dot ( store0, store1, item_row0, v_weight) ;
210
+ ( store2, store3) = m256dot ( store2, store3, item_row1, v_weight) ;
211
+ }
231
212
}
232
213
233
- rem = rem. chunks_exact_mut ( 64 ) . into_remainder ( ) ;
214
+ let rebased0 = _mm256_srai_epi16 :: < R_SHR_SCALE > ( store0) ;
215
+ let rebased1 = _mm256_srai_epi16 :: < R_SHR_SCALE > ( store1) ;
216
+ let rebased2 = _mm256_srai_epi16 :: < R_SHR_SCALE > ( store2) ;
217
+ let rebased3 = _mm256_srai_epi16 :: < R_SHR_SCALE > ( store3) ;
218
+
219
+ let shrank0 = _mm256_packus_epi16 ( rebased0, rebased1) ;
220
+ let shrank1 = _mm256_packus_epi16 ( rebased2, rebased3) ;
221
+
222
+ _mm256_storeu_si256 ( dst. as_mut_ptr ( ) as * mut __m256i , shrank0) ;
223
+ _mm256_storeu_si256 (
224
+ dst. get_unchecked_mut ( 32 ..) . as_mut_ptr ( ) as * mut __m256i ,
225
+ shrank1,
226
+ ) ;
227
+
228
+ cx += 64 ;
234
229
}
235
230
231
+ rem = rem. chunks_exact_mut ( 64 ) . into_remainder ( ) ;
232
+
236
233
let iter_32 = rem. chunks_exact_mut ( 32 ) ;
237
234
238
235
for dst in iter_32 {
0 commit comments