Skip to content

Commit b62ba8e

Browse files
committed
Build fixes
1 parent 6e19729 commit b62ba8e

File tree

2 files changed

+136
-140
lines changed

2 files changed

+136
-140
lines changed

src/avx2/vertical_u8.rs

-1
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,6 @@ unsafe fn convolve_vertical_part_avx_64(
8686

8787
let mut jj = 0usize;
8888

89-
#[cfg(target_arch = "x86_64")]
9089
while jj < bounds_size.saturating_sub(2) {
9190
let py = start_y + jj;
9291
let f_ptr = filter.get_unchecked(jj..).as_ptr() as *const i32;

src/avx2/vertical_u8_lp.rs

+136-139
Original file line numberDiff line numberDiff line change
@@ -86,153 +86,150 @@ unsafe fn convolve_vertical_avx2_row_impl(
8686

8787
let mut rem = dst;
8888

89-
#[cfg(target_arch = "x86_64")]
90-
{
91-
let iter_64 = rem.chunks_exact_mut(64);
92-
93-
for dst in iter_64 {
94-
let mut store0 = _mm256_set1_epi16(ROUNDING);
95-
let mut store1 = _mm256_set1_epi16(ROUNDING);
96-
let mut store2 = _mm256_set1_epi16(ROUNDING);
97-
let mut store3 = _mm256_set1_epi16(ROUNDING);
98-
99-
let px = cx;
100-
101-
if bounds_size == 2 {
102-
let py = bounds.start;
103-
let weights = weight.get_unchecked(0..2);
104-
let v_weight0 = _mm256_set1_epi16(weights[0]);
105-
let v_weight1 = _mm256_set1_epi16(weights[1]);
106-
let v_offset0 = src_stride * py + px;
107-
let src_ptr0 = src.get_unchecked(v_offset0..);
108-
let v_offset1 = src_stride * (py + 1) + px;
109-
let src_ptr1 = src.get_unchecked(v_offset1..);
110-
111-
let item_row0 = _mm256_loadu_si256(src_ptr0.as_ptr() as *const __m256i);
112-
let item_row1 =
113-
_mm256_loadu_si256(src_ptr0.get_unchecked(32..).as_ptr() as *const __m256i);
114-
115-
(store0, store1) = m256dot(store0, store1, item_row0, v_weight0);
116-
(store2, store3) = m256dot(store2, store3, item_row1, v_weight0);
117-
118-
let item_row10 = _mm256_loadu_si256(src_ptr1.as_ptr() as *const __m256i);
119-
let item_row11 =
120-
_mm256_loadu_si256(src_ptr1.get_unchecked(32..).as_ptr() as *const __m256i);
121-
122-
(store0, store1) = m256dot(store0, store1, item_row10, v_weight1);
123-
(store2, store3) = m256dot(store2, store3, item_row11, v_weight1);
124-
} else if bounds_size == 3 {
125-
let py = bounds.start;
126-
let weights = weight.get_unchecked(0..3);
127-
let v_weight0 = _mm256_set1_epi16(weights[0]);
128-
let v_weight1 = _mm256_set1_epi16(weights[1]);
129-
let v_weight2 = _mm256_set1_epi16(weights[2]);
130-
let v_offset0 = src_stride * py + px;
131-
let src_ptr0 = src.get_unchecked(v_offset0..);
132-
let v_offset1 = src_stride * (py + 1) + px;
133-
let src_ptr1 = src.get_unchecked(v_offset1..);
134-
let v_offset2 = src_stride * (py + 2) + px;
135-
let src_ptr2 = src.get_unchecked(v_offset2..);
136-
137-
let item_row0 = _mm256_loadu_si256(src_ptr0.as_ptr() as *const __m256i);
138-
let item_row1 =
139-
_mm256_loadu_si256(src_ptr0.get_unchecked(32..).as_ptr() as *const __m256i);
140-
141-
(store0, store1) = m256dot(store0, store1, item_row0, v_weight0);
142-
(store2, store3) = m256dot(store2, store3, item_row1, v_weight0);
143-
144-
let item_row10 = _mm256_loadu_si256(src_ptr1.as_ptr() as *const __m256i);
145-
let item_row11 =
146-
_mm256_loadu_si256(src_ptr1.get_unchecked(32..).as_ptr() as *const __m256i);
147-
148-
(store0, store1) = m256dot(store0, store1, item_row10, v_weight1);
149-
(store2, store3) = m256dot(store2, store3, item_row11, v_weight1);
150-
151-
let item_row20 = _mm256_loadu_si256(src_ptr2.as_ptr() as *const __m256i);
152-
let item_row21 =
153-
_mm256_loadu_si256(src_ptr2.get_unchecked(32..).as_ptr() as *const __m256i);
154-
155-
(store0, store1) = m256dot(store0, store1, item_row20, v_weight2);
156-
(store2, store3) = m256dot(store2, store3, item_row21, v_weight2);
157-
} else if bounds_size == 4 {
158-
let py = bounds.start;
159-
let weights = weight.get_unchecked(0..4);
160-
let v_weight0 = _mm256_set1_epi16(weights[0]);
161-
let v_weight1 = _mm256_set1_epi16(weights[1]);
162-
let v_weight2 = _mm256_set1_epi16(weights[2]);
163-
let v_weight3 = _mm256_set1_epi16(weights[3]);
164-
let v_offset0 = src_stride * py + px;
165-
let src_ptr0 = src.get_unchecked(v_offset0..);
166-
let v_offset1 = src_stride * (py + 1) + px;
167-
let src_ptr1 = src.get_unchecked(v_offset1..);
168-
let v_offset2 = src_stride * (py + 2) + px;
169-
let src_ptr2 = src.get_unchecked(v_offset2..);
170-
let v_offset3 = src_stride * (py + 3) + px;
171-
let src_ptr3 = src.get_unchecked(v_offset3..);
172-
173-
let item_row0 = _mm256_loadu_si256(src_ptr0.as_ptr() as *const __m256i);
174-
let item_row1 =
175-
_mm256_loadu_si256(src_ptr0.get_unchecked(32..).as_ptr() as *const __m256i);
176-
177-
(store0, store1) = m256dot(store0, store1, item_row0, v_weight0);
178-
(store2, store3) = m256dot(store2, store3, item_row1, v_weight0);
179-
180-
let item_row10 = _mm256_loadu_si256(src_ptr1.as_ptr() as *const __m256i);
181-
let item_row11 =
182-
_mm256_loadu_si256(src_ptr1.get_unchecked(32..).as_ptr() as *const __m256i);
183-
184-
(store0, store1) = m256dot(store0, store1, item_row10, v_weight1);
185-
(store2, store3) = m256dot(store2, store3, item_row11, v_weight1);
186-
187-
let item_row20 = _mm256_loadu_si256(src_ptr2.as_ptr() as *const __m256i);
188-
let item_row21 =
189-
_mm256_loadu_si256(src_ptr2.get_unchecked(32..).as_ptr() as *const __m256i);
190-
191-
(store0, store1) = m256dot(store0, store1, item_row20, v_weight2);
192-
(store2, store3) = m256dot(store2, store3, item_row21, v_weight2);
193-
194-
let item_row30 = _mm256_loadu_si256(src_ptr3.as_ptr() as *const __m256i);
195-
let item_row31 =
196-
_mm256_loadu_si256(src_ptr3.get_unchecked(32..).as_ptr() as *const __m256i);
197-
198-
(store0, store1) = m256dot(store0, store1, item_row30, v_weight3);
199-
(store2, store3) = m256dot(store2, store3, item_row31, v_weight3);
200-
} else {
201-
for j in 0..bounds_size {
202-
let py = bounds.start + j;
203-
let weight = weight.get_unchecked(j..(j + 1));
204-
let v_weight = _mm256_set1_epi16(weight[0]);
205-
let v_offset = src_stride * py + px;
206-
let src_ptr = src.get_unchecked(v_offset..);
207-
let item_row0 = _mm256_loadu_si256(src_ptr.as_ptr() as *const __m256i);
208-
let item_row1 =
209-
_mm256_loadu_si256(src_ptr.get_unchecked(32..).as_ptr() as *const __m256i);
210-
211-
(store0, store1) = m256dot(store0, store1, item_row0, v_weight);
212-
(store2, store3) = m256dot(store2, store3, item_row1, v_weight);
213-
}
214-
}
89+
let iter_64 = rem.chunks_exact_mut(64);
21590

216-
let rebased0 = _mm256_srai_epi16::<R_SHR_SCALE>(store0);
217-
let rebased1 = _mm256_srai_epi16::<R_SHR_SCALE>(store1);
218-
let rebased2 = _mm256_srai_epi16::<R_SHR_SCALE>(store2);
219-
let rebased3 = _mm256_srai_epi16::<R_SHR_SCALE>(store3);
91+
for dst in iter_64 {
92+
let mut store0 = _mm256_set1_epi16(ROUNDING);
93+
let mut store1 = _mm256_set1_epi16(ROUNDING);
94+
let mut store2 = _mm256_set1_epi16(ROUNDING);
95+
let mut store3 = _mm256_set1_epi16(ROUNDING);
22096

221-
let shrank0 = _mm256_packus_epi16(rebased0, rebased1);
222-
let shrank1 = _mm256_packus_epi16(rebased2, rebased3);
97+
let px = cx;
22398

224-
_mm256_storeu_si256(dst.as_mut_ptr() as *mut __m256i, shrank0);
225-
_mm256_storeu_si256(
226-
dst.get_unchecked_mut(32..).as_mut_ptr() as *mut __m256i,
227-
shrank1,
228-
);
99+
if bounds_size == 2 {
100+
let py = bounds.start;
101+
let weights = weight.get_unchecked(0..2);
102+
let v_weight0 = _mm256_set1_epi16(weights[0]);
103+
let v_weight1 = _mm256_set1_epi16(weights[1]);
104+
let v_offset0 = src_stride * py + px;
105+
let src_ptr0 = src.get_unchecked(v_offset0..);
106+
let v_offset1 = src_stride * (py + 1) + px;
107+
let src_ptr1 = src.get_unchecked(v_offset1..);
108+
109+
let item_row0 = _mm256_loadu_si256(src_ptr0.as_ptr() as *const __m256i);
110+
let item_row1 =
111+
_mm256_loadu_si256(src_ptr0.get_unchecked(32..).as_ptr() as *const __m256i);
112+
113+
(store0, store1) = m256dot(store0, store1, item_row0, v_weight0);
114+
(store2, store3) = m256dot(store2, store3, item_row1, v_weight0);
115+
116+
let item_row10 = _mm256_loadu_si256(src_ptr1.as_ptr() as *const __m256i);
117+
let item_row11 =
118+
_mm256_loadu_si256(src_ptr1.get_unchecked(32..).as_ptr() as *const __m256i);
229119

230-
cx += 64;
120+
(store0, store1) = m256dot(store0, store1, item_row10, v_weight1);
121+
(store2, store3) = m256dot(store2, store3, item_row11, v_weight1);
122+
} else if bounds_size == 3 {
123+
let py = bounds.start;
124+
let weights = weight.get_unchecked(0..3);
125+
let v_weight0 = _mm256_set1_epi16(weights[0]);
126+
let v_weight1 = _mm256_set1_epi16(weights[1]);
127+
let v_weight2 = _mm256_set1_epi16(weights[2]);
128+
let v_offset0 = src_stride * py + px;
129+
let src_ptr0 = src.get_unchecked(v_offset0..);
130+
let v_offset1 = src_stride * (py + 1) + px;
131+
let src_ptr1 = src.get_unchecked(v_offset1..);
132+
let v_offset2 = src_stride * (py + 2) + px;
133+
let src_ptr2 = src.get_unchecked(v_offset2..);
134+
135+
let item_row0 = _mm256_loadu_si256(src_ptr0.as_ptr() as *const __m256i);
136+
let item_row1 =
137+
_mm256_loadu_si256(src_ptr0.get_unchecked(32..).as_ptr() as *const __m256i);
138+
139+
(store0, store1) = m256dot(store0, store1, item_row0, v_weight0);
140+
(store2, store3) = m256dot(store2, store3, item_row1, v_weight0);
141+
142+
let item_row10 = _mm256_loadu_si256(src_ptr1.as_ptr() as *const __m256i);
143+
let item_row11 =
144+
_mm256_loadu_si256(src_ptr1.get_unchecked(32..).as_ptr() as *const __m256i);
145+
146+
(store0, store1) = m256dot(store0, store1, item_row10, v_weight1);
147+
(store2, store3) = m256dot(store2, store3, item_row11, v_weight1);
148+
149+
let item_row20 = _mm256_loadu_si256(src_ptr2.as_ptr() as *const __m256i);
150+
let item_row21 =
151+
_mm256_loadu_si256(src_ptr2.get_unchecked(32..).as_ptr() as *const __m256i);
152+
153+
(store0, store1) = m256dot(store0, store1, item_row20, v_weight2);
154+
(store2, store3) = m256dot(store2, store3, item_row21, v_weight2);
155+
} else if bounds_size == 4 {
156+
let py = bounds.start;
157+
let weights = weight.get_unchecked(0..4);
158+
let v_weight0 = _mm256_set1_epi16(weights[0]);
159+
let v_weight1 = _mm256_set1_epi16(weights[1]);
160+
let v_weight2 = _mm256_set1_epi16(weights[2]);
161+
let v_weight3 = _mm256_set1_epi16(weights[3]);
162+
let v_offset0 = src_stride * py + px;
163+
let src_ptr0 = src.get_unchecked(v_offset0..);
164+
let v_offset1 = src_stride * (py + 1) + px;
165+
let src_ptr1 = src.get_unchecked(v_offset1..);
166+
let v_offset2 = src_stride * (py + 2) + px;
167+
let src_ptr2 = src.get_unchecked(v_offset2..);
168+
let v_offset3 = src_stride * (py + 3) + px;
169+
let src_ptr3 = src.get_unchecked(v_offset3..);
170+
171+
let item_row0 = _mm256_loadu_si256(src_ptr0.as_ptr() as *const __m256i);
172+
let item_row1 =
173+
_mm256_loadu_si256(src_ptr0.get_unchecked(32..).as_ptr() as *const __m256i);
174+
175+
(store0, store1) = m256dot(store0, store1, item_row0, v_weight0);
176+
(store2, store3) = m256dot(store2, store3, item_row1, v_weight0);
177+
178+
let item_row10 = _mm256_loadu_si256(src_ptr1.as_ptr() as *const __m256i);
179+
let item_row11 =
180+
_mm256_loadu_si256(src_ptr1.get_unchecked(32..).as_ptr() as *const __m256i);
181+
182+
(store0, store1) = m256dot(store0, store1, item_row10, v_weight1);
183+
(store2, store3) = m256dot(store2, store3, item_row11, v_weight1);
184+
185+
let item_row20 = _mm256_loadu_si256(src_ptr2.as_ptr() as *const __m256i);
186+
let item_row21 =
187+
_mm256_loadu_si256(src_ptr2.get_unchecked(32..).as_ptr() as *const __m256i);
188+
189+
(store0, store1) = m256dot(store0, store1, item_row20, v_weight2);
190+
(store2, store3) = m256dot(store2, store3, item_row21, v_weight2);
191+
192+
let item_row30 = _mm256_loadu_si256(src_ptr3.as_ptr() as *const __m256i);
193+
let item_row31 =
194+
_mm256_loadu_si256(src_ptr3.get_unchecked(32..).as_ptr() as *const __m256i);
195+
196+
(store0, store1) = m256dot(store0, store1, item_row30, v_weight3);
197+
(store2, store3) = m256dot(store2, store3, item_row31, v_weight3);
198+
} else {
199+
for j in 0..bounds_size {
200+
let py = bounds.start + j;
201+
let weight = weight.get_unchecked(j..(j + 1));
202+
let v_weight = _mm256_set1_epi16(weight[0]);
203+
let v_offset = src_stride * py + px;
204+
let src_ptr = src.get_unchecked(v_offset..);
205+
let item_row0 = _mm256_loadu_si256(src_ptr.as_ptr() as *const __m256i);
206+
let item_row1 =
207+
_mm256_loadu_si256(src_ptr.get_unchecked(32..).as_ptr() as *const __m256i);
208+
209+
(store0, store1) = m256dot(store0, store1, item_row0, v_weight);
210+
(store2, store3) = m256dot(store2, store3, item_row1, v_weight);
211+
}
231212
}
232213

233-
rem = rem.chunks_exact_mut(64).into_remainder();
214+
let rebased0 = _mm256_srai_epi16::<R_SHR_SCALE>(store0);
215+
let rebased1 = _mm256_srai_epi16::<R_SHR_SCALE>(store1);
216+
let rebased2 = _mm256_srai_epi16::<R_SHR_SCALE>(store2);
217+
let rebased3 = _mm256_srai_epi16::<R_SHR_SCALE>(store3);
218+
219+
let shrank0 = _mm256_packus_epi16(rebased0, rebased1);
220+
let shrank1 = _mm256_packus_epi16(rebased2, rebased3);
221+
222+
_mm256_storeu_si256(dst.as_mut_ptr() as *mut __m256i, shrank0);
223+
_mm256_storeu_si256(
224+
dst.get_unchecked_mut(32..).as_mut_ptr() as *mut __m256i,
225+
shrank1,
226+
);
227+
228+
cx += 64;
234229
}
235230

231+
rem = rem.chunks_exact_mut(64).into_remainder();
232+
236233
let iter_32 = rem.chunks_exact_mut(32);
237234

238235
for dst in iter_32 {

0 commit comments

Comments
 (0)