From ab50ceb9725fe532318928b54e26ed2c74a83ee1 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Wed, 1 Jan 2025 11:18:31 +0000 Subject: [PATCH] AVX, SSE refactor and improvements --- app/benches/resize_rgba/main.rs | 2 +- app/src/main.rs | 2 +- src/avx2/alpha_f16.rs | 246 ++++++++-------- src/avx2/alpha_f32.rs | 50 ++-- src/avx2/alpha_u16.rs | 255 +++-------------- src/avx2/alpha_u8.rs | 276 +++++++++--------- src/avx2/rgba_f16.rs | 9 +- src/avx2/rgba_u8_lb.rs | 398 +++++++++++++------------- src/sse/alpha_f16.rs | 134 +++++---- src/sse/alpha_u16.rs | 300 ++++++++++---------- src/sse/alpha_u8.rs | 135 +++++---- src/sse/plane_f32.rs | 264 +++++++++--------- src/sse/rgb_f16.rs | 6 +- src/sse/rgb_u8.rs | 409 +++++++++++++-------------- src/sse/rgba_f16.rs | 11 +- src/sse/rgba_f32.rs | 122 ++++---- src/sse/rgba_u8.rs | 284 ++++++++++--------- src/sse/rgba_u8_lb.rs | 478 ++++++++++++++++---------------- 18 files changed, 1567 insertions(+), 1814 deletions(-) diff --git a/app/benches/resize_rgba/main.rs b/app/benches/resize_rgba/main.rs index effd560..93b8f04 100644 --- a/app/benches/resize_rgba/main.rs +++ b/app/benches/resize_rgba/main.rs @@ -166,7 +166,7 @@ pub fn criterion_benchmark(c: &mut Criterion) { dimensions.0 as usize, dimensions.1 as usize, ) - .unwrap(); + .unwrap(); let mut target = ImageStoreMut::alloc_with_depth( dimensions.0 as usize / 4, dimensions.1 as usize / 4, diff --git a/app/src/main.rs b/app/src/main.rs index 7eac02e..25367ac 100644 --- a/app/src/main.rs +++ b/app/src/main.rs @@ -42,7 +42,7 @@ fn resize_plane( fn main() { // test_fast_image(); - let img = ImageReader::open("./assets/abstract_alpha.png") + let img = ImageReader::open("./assets/asset_4.png") .unwrap() .decode() .unwrap(); diff --git a/src/avx2/alpha_f16.rs b/src/avx2/alpha_f16.rs index 62679de..8cbca73 100644 --- a/src/avx2/alpha_f16.rs +++ b/src/avx2/alpha_f16.rs @@ -55,61 +55,59 @@ unsafe fn avx_premultiply_alpha_rgba_f16_row_impl(dst: &mut [half::f16], src: &[ let mut rem = dst; let mut src_rem = src; - unsafe { - for (dst, src) in rem - .chunks_exact_mut(16 * 4) - .zip(src_rem.chunks_exact(16 * 4)) - { - let src_ptr = src.as_ptr(); - let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i); - let lane1 = _mm256_loadu_si256(src_ptr.add(16) as *const __m256i); - let lane2 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); - let lane3 = _mm256_loadu_si256(src_ptr.add(48) as *const __m256i); - let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); + for (dst, src) in rem + .chunks_exact_mut(16 * 4) + .zip(src_rem.chunks_exact(16 * 4)) + { + let src_ptr = src.as_ptr(); + let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i); + let lane1 = _mm256_loadu_si256(src_ptr.add(16) as *const __m256i); + let lane2 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); + let lane3 = _mm256_loadu_si256(src_ptr.add(48) as *const __m256i); + let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); - let low_alpha = _mm256_cvtph_ps(_mm256_castsi256_si128(pixel.3)); - let low_r = _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.0)), low_alpha); - let low_g = _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.1)), low_alpha); - let low_b = _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.2)), low_alpha); + let low_alpha = _mm256_cvtph_ps(_mm256_castsi256_si128(pixel.3)); + let low_r = _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.0)), low_alpha); + let low_g = _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.1)), low_alpha); + 
let low_b = _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.2)), low_alpha); - let high_alpha = _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.3)); - let high_r = _mm256_mul_ps( - _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.0)), - high_alpha, - ); - let high_g = _mm256_mul_ps( - _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.1)), - high_alpha, - ); - let high_b = _mm256_mul_ps( - _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.2)), - high_alpha, - ); - let r_values = avx_combine_epi( - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_r), - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_r), - ); - let g_values = avx_combine_epi( - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_g), - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_g), - ); - let b_values = avx_combine_epi( - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_b), - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_b), - ); - let dst_ptr = dst.as_mut_ptr(); - let (d_lane0, d_lane1, d_lane2, d_lane3) = - avx_interleave_rgba_epi16(r_values, g_values, b_values, pixel.3); - _mm256_storeu_si256(dst_ptr as *mut __m256i, d_lane0); - _mm256_storeu_si256(dst_ptr.add(16) as *mut __m256i, d_lane1); - _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, d_lane2); - _mm256_storeu_si256(dst_ptr.add(48) as *mut __m256i, d_lane3); - } - - rem = rem.chunks_exact_mut(16 * 4).into_remainder(); - src_rem = src_rem.chunks_exact(16 * 4).remainder(); + let high_alpha = _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.3)); + let high_r = _mm256_mul_ps( + _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.0)), + high_alpha, + ); + let high_g = _mm256_mul_ps( + _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.1)), + high_alpha, + ); + let high_b = _mm256_mul_ps( + _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.2)), + high_alpha, + ); + let r_values = avx_combine_epi( + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_r), + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_r), + ); + let g_values = avx_combine_epi( + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_g), + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_g), + ); + let b_values = avx_combine_epi( + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_b), + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_b), + ); + let dst_ptr = dst.as_mut_ptr(); + let (d_lane0, d_lane1, d_lane2, d_lane3) = + avx_interleave_rgba_epi16(r_values, g_values, b_values, pixel.3); + _mm256_storeu_si256(dst_ptr as *mut __m256i, d_lane0); + _mm256_storeu_si256(dst_ptr.add(16) as *mut __m256i, d_lane1); + _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, d_lane2); + _mm256_storeu_si256(dst_ptr.add(48) as *mut __m256i, d_lane3); } + rem = rem.chunks_exact_mut(16 * 4).into_remainder(); + src_rem = src_rem.chunks_exact(16 * 4).remainder(); + premultiply_pixel_f16_row(rem, src_rem); } @@ -155,84 +153,82 @@ pub(crate) fn avx_unpremultiply_alpha_rgba_f16( unsafe fn avx_unpremultiply_alpha_rgba_f16_row_impl(in_place: &mut [half::f16]) { let mut rem = in_place; - unsafe { - for dst in rem.chunks_exact_mut(16 * 4) { - let src_ptr = dst.as_ptr(); - let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i); - let lane1 = _mm256_loadu_si256(src_ptr.add(16) as *const __m256i); - let lane2 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); - let lane3 = _mm256_loadu_si256(src_ptr.add(48) as *const __m256i); - let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); - - let low_alpha = _mm256_cvtph_ps(_mm256_castsi256_si128(pixel.3)); - let zeros = 
_mm256_setzero_ps(); - let low_alpha_zero_mask = _mm256_cmp_ps::<_CMP_EQ_OS>(low_alpha, zeros); - let low_r = _mm256_blendv_ps( - _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.0)), low_alpha), - zeros, - low_alpha_zero_mask, - ); - let low_g = _mm256_blendv_ps( - _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.1)), low_alpha), - zeros, - low_alpha_zero_mask, - ); - let low_b = _mm256_blendv_ps( - _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.2)), low_alpha), - zeros, - low_alpha_zero_mask, - ); + for dst in rem.chunks_exact_mut(16 * 4) { + let src_ptr = dst.as_ptr(); + let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i); + let lane1 = _mm256_loadu_si256(src_ptr.add(16) as *const __m256i); + let lane2 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); + let lane3 = _mm256_loadu_si256(src_ptr.add(48) as *const __m256i); + let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); - let high_alpha = _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.3)); - let high_alpha_zero_mask = _mm256_cmp_ps::<_CMP_EQ_OS>(high_alpha, zeros); - let high_r = _mm256_blendv_ps( - _mm256_mul_ps( - _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.0)), - high_alpha, - ), - zeros, - high_alpha_zero_mask, - ); - let high_g = _mm256_blendv_ps( - _mm256_mul_ps( - _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.1)), - high_alpha, - ), - zeros, - high_alpha_zero_mask, - ); - let high_b = _mm256_blendv_ps( - _mm256_mul_ps( - _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.2)), - high_alpha, - ), - zeros, - high_alpha_zero_mask, - ); - let r_values = avx_combine_epi( - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_r), - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_r), - ); - let g_values = avx_combine_epi( - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_g), - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_g), - ); - let b_values = avx_combine_epi( - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_b), - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_b), - ); - let dst_ptr = dst.as_mut_ptr(); - let (d_lane0, d_lane1, d_lane2, d_lane3) = - avx_interleave_rgba_epi16(r_values, g_values, b_values, pixel.3); - _mm256_storeu_si256(dst_ptr as *mut __m256i, d_lane0); - _mm256_storeu_si256(dst_ptr.add(16) as *mut __m256i, d_lane1); - _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, d_lane2); - _mm256_storeu_si256(dst_ptr.add(48) as *mut __m256i, d_lane3); - } + let low_alpha = _mm256_cvtph_ps(_mm256_castsi256_si128(pixel.3)); + let zeros = _mm256_setzero_ps(); + let low_alpha_zero_mask = _mm256_cmp_ps::<_CMP_EQ_OS>(low_alpha, zeros); + let low_r = _mm256_blendv_ps( + _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.0)), low_alpha), + zeros, + low_alpha_zero_mask, + ); + let low_g = _mm256_blendv_ps( + _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.1)), low_alpha), + zeros, + low_alpha_zero_mask, + ); + let low_b = _mm256_blendv_ps( + _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.2)), low_alpha), + zeros, + low_alpha_zero_mask, + ); - rem = rem.chunks_exact_mut(16 * 4).into_remainder(); + let high_alpha = _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.3)); + let high_alpha_zero_mask = _mm256_cmp_ps::<_CMP_EQ_OS>(high_alpha, zeros); + let high_r = _mm256_blendv_ps( + _mm256_mul_ps( + _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.0)), + high_alpha, + ), + zeros, + high_alpha_zero_mask, + ); + let high_g = _mm256_blendv_ps( + _mm256_mul_ps( + 
_mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.1)), + high_alpha, + ), + zeros, + high_alpha_zero_mask, + ); + let high_b = _mm256_blendv_ps( + _mm256_mul_ps( + _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.2)), + high_alpha, + ), + zeros, + high_alpha_zero_mask, + ); + let r_values = avx_combine_epi( + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_r), + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_r), + ); + let g_values = avx_combine_epi( + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_g), + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_g), + ); + let b_values = avx_combine_epi( + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_b), + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_b), + ); + let dst_ptr = dst.as_mut_ptr(); + let (d_lane0, d_lane1, d_lane2, d_lane3) = + avx_interleave_rgba_epi16(r_values, g_values, b_values, pixel.3); + _mm256_storeu_si256(dst_ptr as *mut __m256i, d_lane0); + _mm256_storeu_si256(dst_ptr.add(16) as *mut __m256i, d_lane1); + _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, d_lane2); + _mm256_storeu_si256(dst_ptr.add(48) as *mut __m256i, d_lane3); } + rem = rem.chunks_exact_mut(16 * 4).into_remainder(); + unpremultiply_pixel_f16_row(rem); } diff --git a/src/avx2/alpha_f32.rs b/src/avx2/alpha_f32.rs index f136208..f5b4d40 100644 --- a/src/avx2/alpha_f32.rs +++ b/src/avx2/alpha_f32.rs @@ -129,34 +129,32 @@ unsafe fn avx_premultiply_alpha_rgba_f32_row_impl(dst: &mut [f32], src: &[f32]) let mut rem = dst; let mut src_rem = src; - unsafe { - for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { - let src_ptr = src.as_ptr(); - let rgba0 = _mm256_loadu_ps(src_ptr); - let rgba1 = _mm256_loadu_ps(src_ptr.add(8)); - let rgba2 = _mm256_loadu_ps(src_ptr.add(16)); - let rgba3 = _mm256_loadu_ps(src_ptr.add(24)); - let (rrr, ggg, bbb, aaa) = avx_deinterleave_rgba_ps(rgba0, rgba1, rgba2, rgba3); - - let rrr = _mm256_mul_ps(rrr, aaa); - let ggg = _mm256_mul_ps(ggg, aaa); - let bbb = _mm256_mul_ps(bbb, aaa); - - let (rgba0, rgba1, rgba2, rgba3) = avx_interleave_rgba_ps(rrr, ggg, bbb, aaa); - - let dst_ptr = dst.as_mut_ptr(); - _mm256_storeu_ps(dst_ptr, rgba0); - _mm256_storeu_ps(dst_ptr.add(8), rgba1); - _mm256_storeu_ps(dst_ptr.add(16), rgba2); - _mm256_storeu_ps(dst_ptr.add(24), rgba3); - - _cx += 8; - } - - rem = rem.chunks_exact_mut(8 * 4).into_remainder(); - src_rem = src_rem.chunks_exact(8 * 4).remainder(); + for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { + let src_ptr = src.as_ptr(); + let rgba0 = _mm256_loadu_ps(src_ptr); + let rgba1 = _mm256_loadu_ps(src_ptr.add(8)); + let rgba2 = _mm256_loadu_ps(src_ptr.add(16)); + let rgba3 = _mm256_loadu_ps(src_ptr.add(24)); + let (rrr, ggg, bbb, aaa) = avx_deinterleave_rgba_ps(rgba0, rgba1, rgba2, rgba3); + + let rrr = _mm256_mul_ps(rrr, aaa); + let ggg = _mm256_mul_ps(ggg, aaa); + let bbb = _mm256_mul_ps(bbb, aaa); + + let (rgba0, rgba1, rgba2, rgba3) = avx_interleave_rgba_ps(rrr, ggg, bbb, aaa); + + let dst_ptr = dst.as_mut_ptr(); + _mm256_storeu_ps(dst_ptr, rgba0); + _mm256_storeu_ps(dst_ptr.add(8), rgba1); + _mm256_storeu_ps(dst_ptr.add(16), rgba2); + _mm256_storeu_ps(dst_ptr.add(24), rgba3); + + _cx += 8; } + rem = rem.chunks_exact_mut(8 * 4).into_remainder(); + src_rem = src_rem.chunks_exact(8 * 4).remainder(); + premultiply_pixel_f32_row(rem, src_rem); } diff --git a/src/avx2/alpha_u16.rs b/src/avx2/alpha_u16.rs index e34ee1a..a77b32d 100644 --- a/src/avx2/alpha_u16.rs +++ b/src/avx2/alpha_u16.rs @@ -75,6 +75,17 @@ pub(crate) 
unsafe fn _mm256_div_by_65535_epi32(v: __m256i) -> __m256i { _mm256_srli_epi32::(_mm256_add_epi32(v, _mm256_srli_epi32::(v))) } +#[inline(always)] +unsafe fn _mm256_div_by_epi32(v: __m256i) -> __m256i { + if BIT_DEPTH == 10 { + _mm256_div_by_1023_epi32(v) + } else if BIT_DEPTH == 12 { + _mm256_div_by_4095_epi32(v) + } else { + _mm256_div_by_65535_epi32(v) + } +} + pub(crate) fn avx_premultiply_alpha_rgba_u16( dst: &mut [u16], src: &[u16], @@ -92,21 +103,10 @@ trait Avx2PremultiplyExecutor { unsafe fn premultiply(&self, dst: &mut [u16], src: &[u16], bit_depth: usize); } -struct Avx2PremultiplyExecutor10Bit {} - -impl Default for Avx2PremultiplyExecutor10Bit { - fn default() -> Self { - Avx2PremultiplyExecutor10Bit {} - } -} - -impl Avx2PremultiplyExecutor10Bit { - fn create() -> impl Avx2PremultiplyExecutor { - Avx2PremultiplyExecutor10Bit::default() - } -} +#[derive(Default)] +struct Avx2PremultiplyExecutorDefault {} -impl Avx2PremultiplyExecutor for Avx2PremultiplyExecutor10Bit { +impl Avx2PremultiplyExecutor for Avx2PremultiplyExecutorDefault { #[target_feature(enable = "avx2")] unsafe fn premultiply(&self, dst: &mut [u16], src: &[u16], bit_depth: usize) { let max_colors = (1 << bit_depth) - 1; @@ -131,205 +131,31 @@ impl Avx2PremultiplyExecutor for Avx2PremultiplyExecutor10Bit { let high_alpha = _mm256_unpackhi_epi16(pixel.3, zeros); let new_rrr = _mm256_packus_epi32( - _mm256_div_by_1023_epi32(_mm256_madd_epi16( + _mm256_div_by_epi32::(_mm256_madd_epi16( _mm256_unpacklo_epi16(pixel.0, zeros), low_alpha, )), - _mm256_div_by_1023_epi32(_mm256_madd_epi16( + _mm256_div_by_epi32::(_mm256_madd_epi16( _mm256_unpackhi_epi16(pixel.0, zeros), high_alpha, )), ); let new_ggg = _mm256_packus_epi32( - _mm256_div_by_1023_epi32(_mm256_madd_epi16( + _mm256_div_by_epi32::(_mm256_madd_epi16( _mm256_unpacklo_epi16(pixel.1, zeros), low_alpha, )), - _mm256_div_by_1023_epi32(_mm256_madd_epi16( + _mm256_div_by_epi32::(_mm256_madd_epi16( _mm256_unpackhi_epi16(pixel.1, zeros), high_alpha, )), ); let new_bbb = _mm256_packus_epi32( - _mm256_div_by_1023_epi32(_mm256_madd_epi16( + _mm256_div_by_epi32::(_mm256_madd_epi16( _mm256_unpacklo_epi16(pixel.2, zeros), low_alpha, )), - _mm256_div_by_1023_epi32(_mm256_madd_epi16( - _mm256_unpackhi_epi16(pixel.2, zeros), - high_alpha, - )), - ); - - let dst_ptr = dst.as_mut_ptr(); - - let (d_lane0, d_lane1, d_lane2, d_lane3) = - avx_interleave_rgba_epi16(new_rrr, new_ggg, new_bbb, pixel.3); - - _mm256_storeu_si256(dst_ptr as *mut __m256i, d_lane0); - _mm256_storeu_si256(dst_ptr.add(16) as *mut __m256i, d_lane1); - _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, d_lane2); - _mm256_storeu_si256(dst_ptr.add(48) as *mut __m256i, d_lane3); - } - - rem = rem.chunks_exact_mut(16 * 4).into_remainder(); - src_rem = src_rem.chunks_exact(16 * 4).remainder(); - - premultiply_alpha_rgba_row(rem, src_rem, max_colors); - } -} - -struct Avx2PremultiplyExecutor12Bit {} - -impl Default for Avx2PremultiplyExecutor12Bit { - fn default() -> Self { - Avx2PremultiplyExecutor12Bit {} - } -} - -impl Avx2PremultiplyExecutor12Bit { - fn create() -> impl Avx2PremultiplyExecutor { - Avx2PremultiplyExecutor12Bit::default() - } -} - -impl Avx2PremultiplyExecutor for Avx2PremultiplyExecutor12Bit { - #[target_feature(enable = "avx2")] - unsafe fn premultiply(&self, dst: &mut [u16], src: &[u16], bit_depth: usize) { - let max_colors = (1 << bit_depth) - 1; - - let mut rem = dst; - let mut src_rem = src; - - for (dst, src) in rem - .chunks_exact_mut(16 * 4) - .zip(src_rem.chunks_exact(16 * 4)) - { - let 
src_ptr = src.as_ptr(); - let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i); - let lane1 = _mm256_loadu_si256(src_ptr.add(16) as *const __m256i); - let lane2 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); - let lane3 = _mm256_loadu_si256(src_ptr.add(48) as *const __m256i); - - let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); - - let zeros = _mm256_setzero_si256(); - let low_alpha = _mm256_unpacklo_epi16(pixel.3, zeros); - let high_alpha = _mm256_unpackhi_epi16(pixel.3, zeros); - - let new_rrr = _mm256_packus_epi32( - _mm256_div_by_4095_epi32(_mm256_madd_epi16( - _mm256_unpacklo_epi16(pixel.0, zeros), - low_alpha, - )), - _mm256_div_by_4095_epi32(_mm256_madd_epi16( - _mm256_unpackhi_epi16(pixel.0, zeros), - high_alpha, - )), - ); - let new_ggg = _mm256_packus_epi32( - _mm256_div_by_4095_epi32(_mm256_madd_epi16( - _mm256_unpacklo_epi16(pixel.1, zeros), - low_alpha, - )), - _mm256_div_by_4095_epi32(_mm256_madd_epi16( - _mm256_unpackhi_epi16(pixel.1, zeros), - high_alpha, - )), - ); - let new_bbb = _mm256_packus_epi32( - _mm256_div_by_4095_epi32(_mm256_madd_epi16( - _mm256_unpacklo_epi16(pixel.2, zeros), - low_alpha, - )), - _mm256_div_by_4095_epi32(_mm256_madd_epi16( - _mm256_unpackhi_epi16(pixel.2, zeros), - high_alpha, - )), - ); - - let dst_ptr = dst.as_mut_ptr(); - - let (d_lane0, d_lane1, d_lane2, d_lane3) = - avx_interleave_rgba_epi16(new_rrr, new_ggg, new_bbb, pixel.3); - - _mm256_storeu_si256(dst_ptr as *mut __m256i, d_lane0); - _mm256_storeu_si256(dst_ptr.add(16) as *mut __m256i, d_lane1); - _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, d_lane2); - _mm256_storeu_si256(dst_ptr.add(48) as *mut __m256i, d_lane3); - } - - rem = rem.chunks_exact_mut(16 * 4).into_remainder(); - src_rem = src_rem.chunks_exact(16 * 4).remainder(); - - premultiply_alpha_rgba_row(rem, src_rem, max_colors); - } -} - -struct Avx2PremultiplyExecutor16Bit {} - -impl Default for Avx2PremultiplyExecutor16Bit { - fn default() -> Self { - Avx2PremultiplyExecutor16Bit {} - } -} - -impl Avx2PremultiplyExecutor16Bit { - fn create() -> impl Avx2PremultiplyExecutor { - Avx2PremultiplyExecutor16Bit::default() - } -} - -impl Avx2PremultiplyExecutor for Avx2PremultiplyExecutor16Bit { - #[target_feature(enable = "avx2")] - unsafe fn premultiply(&self, dst: &mut [u16], src: &[u16], bit_depth: usize) { - let max_colors = (1 << bit_depth) - 1; - - let mut rem = dst; - let mut src_rem = src; - - for (dst, src) in rem - .chunks_exact_mut(16 * 4) - .zip(src_rem.chunks_exact(16 * 4)) - { - let src_ptr = src.as_ptr(); - let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i); - let lane1 = _mm256_loadu_si256(src_ptr.add(16) as *const __m256i); - let lane2 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); - let lane3 = _mm256_loadu_si256(src_ptr.add(48) as *const __m256i); - - let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); - - let zeros = _mm256_setzero_si256(); - let low_alpha = _mm256_unpacklo_epi16(pixel.3, zeros); - let high_alpha = _mm256_unpackhi_epi16(pixel.3, zeros); - - let new_rrr = _mm256_packus_epi32( - _mm256_div_by_65535_epi32(_mm256_mullo_epi32( - _mm256_unpacklo_epi16(pixel.0, zeros), - low_alpha, - )), - _mm256_div_by_65535_epi32(_mm256_mullo_epi32( - _mm256_unpackhi_epi16(pixel.0, zeros), - high_alpha, - )), - ); - let new_ggg = _mm256_packus_epi32( - _mm256_div_by_65535_epi32(_mm256_mullo_epi32( - _mm256_unpacklo_epi16(pixel.1, zeros), - low_alpha, - )), - _mm256_div_by_65535_epi32(_mm256_mullo_epi32( - _mm256_unpackhi_epi16(pixel.1, zeros), 
- high_alpha, - )), - ); - let new_bbb = _mm256_packus_epi32( - _mm256_div_by_65535_epi32(_mm256_mullo_epi32( - _mm256_unpacklo_epi16(pixel.2, zeros), - low_alpha, - )), - _mm256_div_by_65535_epi32(_mm256_mullo_epi32( + _mm256_div_by_epi32::(_mm256_madd_epi16( _mm256_unpackhi_epi16(pixel.2, zeros), high_alpha, )), @@ -353,20 +179,9 @@ impl Avx2PremultiplyExecutor for Avx2PremultiplyExecutor16Bit { } } +#[derive(Default)] struct Avx2PremultiplyExecutorAnyBit {} -impl Default for Avx2PremultiplyExecutorAnyBit { - fn default() -> Self { - Avx2PremultiplyExecutorAnyBit {} - } -} - -impl Avx2PremultiplyExecutorAnyBit { - fn create() -> impl Avx2PremultiplyExecutor { - Avx2PremultiplyExecutorAnyBit {} - } -} - impl Avx2PremultiplyExecutor for Avx2PremultiplyExecutorAnyBit { #[target_feature(enable = "avx2")] unsafe fn premultiply(&self, dst: &mut [u16], src: &[u16], bit_depth: usize) { @@ -425,13 +240,33 @@ impl Avx2PremultiplyExecutor for Avx2PremultiplyExecutorAnyBit { /// This inlining is required to activate all features for runtime dispatch unsafe fn avx_premultiply_alpha_rgba_u16_row(dst: &mut [u16], src: &[u16], bit_depth: usize) { if bit_depth == 10 { - avx_pa_dispatch(dst, src, bit_depth, Avx2PremultiplyExecutor10Bit::create()); + avx_pa_dispatch( + dst, + src, + bit_depth, + Avx2PremultiplyExecutorDefault::<10>::default(), + ); } else if bit_depth == 12 { - avx_pa_dispatch(dst, src, bit_depth, Avx2PremultiplyExecutor12Bit::create()); + avx_pa_dispatch( + dst, + src, + bit_depth, + Avx2PremultiplyExecutorDefault::<12>::default(), + ); } else if bit_depth == 16 { - avx_pa_dispatch(dst, src, bit_depth, Avx2PremultiplyExecutor16Bit::create()); + avx_pa_dispatch( + dst, + src, + bit_depth, + Avx2PremultiplyExecutorDefault::<16>::default(), + ); } else { - avx_pa_dispatch(dst, src, bit_depth, Avx2PremultiplyExecutorAnyBit::create()); + avx_pa_dispatch( + dst, + src, + bit_depth, + Avx2PremultiplyExecutorAnyBit::default(), + ); }; } diff --git a/src/avx2/alpha_u8.rs b/src/avx2/alpha_u8.rs index 35c8f94..4ecff54 100644 --- a/src/avx2/alpha_u8.rs +++ b/src/avx2/alpha_u8.rs @@ -110,102 +110,100 @@ unsafe fn avx_premultiply_alpha_rgba_impl_row(dst: &mut [u8], src: &[u8]) { let mut rem = dst; let mut src_rem = src; - unsafe { - for (dst, src) in rem - .chunks_exact_mut(32 * 4) - .zip(src_rem.chunks_exact(32 * 4)) - { - let src_ptr = src.as_ptr(); - let rgba0 = _mm256_loadu_si256(src_ptr as *const __m256i); - let rgba1 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); - let rgba2 = _mm256_loadu_si256(src_ptr.add(64) as *const __m256i); - let rgba3 = _mm256_loadu_si256(src_ptr.add(96) as *const __m256i); - let (rrr, ggg, bbb, aaa) = avx2_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); - - let zeros = _mm256_setzero_si256(); - - let mut rrr_low = _mm256_unpacklo_epi8(rrr, zeros); - let mut rrr_high = _mm256_unpackhi_epi8(rrr, zeros); - - let mut ggg_low = _mm256_unpacklo_epi8(ggg, zeros); - let mut ggg_high = _mm256_unpackhi_epi8(ggg, zeros); - - let mut bbb_low = _mm256_unpacklo_epi8(bbb, zeros); - let mut bbb_high = _mm256_unpackhi_epi8(bbb, zeros); - - let aaa_low = _mm256_unpacklo_epi8(aaa, zeros); - let aaa_high = _mm256_unpackhi_epi8(aaa, zeros); - - rrr_low = avx2_div_by255(_mm256_mullo_epi16(rrr_low, aaa_low)); - rrr_high = avx2_div_by255(_mm256_mullo_epi16(rrr_high, aaa_high)); - ggg_low = avx2_div_by255(_mm256_mullo_epi16(ggg_low, aaa_low)); - ggg_high = avx2_div_by255(_mm256_mullo_epi16(ggg_high, aaa_high)); - bbb_low = avx2_div_by255(_mm256_mullo_epi16(bbb_low, aaa_low)); - bbb_high 
= avx2_div_by255(_mm256_mullo_epi16(bbb_high, aaa_high)); - - let rrr = _mm256_packus_epi16(rrr_low, rrr_high); - let ggg = _mm256_packus_epi16(ggg_low, ggg_high); - let bbb = _mm256_packus_epi16(bbb_low, bbb_high); - - let (rgba0, rgba1, rgba2, rgba3) = avx2_interleave_rgba(rrr, ggg, bbb, aaa); - let dst_ptr = dst.as_mut_ptr(); - _mm256_storeu_si256(dst_ptr as *mut __m256i, rgba0); - _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, rgba1); - _mm256_storeu_si256(dst_ptr.add(64) as *mut __m256i, rgba2); - _mm256_storeu_si256(dst_ptr.add(96) as *mut __m256i, rgba3); - } - - rem = rem.chunks_exact_mut(32 * 4).into_remainder(); - src_rem = src_rem.chunks_exact(32 * 4).remainder(); - - let zeros = _mm_setzero_si128(); - for (dst, src) in rem - .chunks_exact_mut(16 * 4) - .zip(src_rem.chunks_exact(16 * 4)) - { - let src_ptr = src.as_ptr(); - let rgba0 = _mm_loadu_si128(src_ptr as *const __m128i); - let rgba1 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); - let rgba2 = _mm_loadu_si128(src_ptr.add(32) as *const __m128i); - let rgba3 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i); - let (rrr, ggg, bbb, aaa) = sse_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); - - let mut rrr_low = _mm_unpacklo_epi8(rrr, zeros); - let mut rrr_high = _mm_unpackhi_epi8(rrr, zeros); - - let mut ggg_low = _mm_unpacklo_epi8(ggg, zeros); - let mut ggg_high = _mm_unpackhi_epi8(ggg, zeros); - - let mut bbb_low = _mm_unpacklo_epi8(bbb, zeros); - let mut bbb_high = _mm_unpackhi_epi8(bbb, zeros); - - let aaa_low = _mm_unpacklo_epi8(aaa, zeros); - let aaa_high = _mm_unpackhi_epi8(aaa, zeros); - - rrr_low = _mm_div_by_255_epi16(_mm_mullo_epi16(rrr_low, aaa_low)); - rrr_high = _mm_div_by_255_epi16(_mm_mullo_epi16(rrr_high, aaa_high)); - ggg_low = _mm_div_by_255_epi16(_mm_mullo_epi16(ggg_low, aaa_low)); - ggg_high = _mm_div_by_255_epi16(_mm_mullo_epi16(ggg_high, aaa_high)); - bbb_low = _mm_div_by_255_epi16(_mm_mullo_epi16(bbb_low, aaa_low)); - bbb_high = _mm_div_by_255_epi16(_mm_mullo_epi16(bbb_high, aaa_high)); - - let rrr = _mm_packus_epi16(rrr_low, rrr_high); - let ggg = _mm_packus_epi16(ggg_low, ggg_high); - let bbb = _mm_packus_epi16(bbb_low, bbb_high); - - let (rgba0, rgba1, rgba2, rgba3) = sse_interleave_rgba(rrr, ggg, bbb, aaa); - - let dst_ptr = dst.as_mut_ptr(); - _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba1); - _mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, rgba2); - _mm_storeu_si128(dst_ptr.add(48) as *mut __m128i, rgba3); - } - - rem = rem.chunks_exact_mut(16 * 4).into_remainder(); - src_rem = src_rem.chunks_exact(16 * 4).remainder(); + for (dst, src) in rem + .chunks_exact_mut(32 * 4) + .zip(src_rem.chunks_exact(32 * 4)) + { + let src_ptr = src.as_ptr(); + let rgba0 = _mm256_loadu_si256(src_ptr as *const __m256i); + let rgba1 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); + let rgba2 = _mm256_loadu_si256(src_ptr.add(64) as *const __m256i); + let rgba3 = _mm256_loadu_si256(src_ptr.add(96) as *const __m256i); + let (rrr, ggg, bbb, aaa) = avx2_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); + + let zeros = _mm256_setzero_si256(); + + let mut rrr_low = _mm256_unpacklo_epi8(rrr, zeros); + let mut rrr_high = _mm256_unpackhi_epi8(rrr, zeros); + + let mut ggg_low = _mm256_unpacklo_epi8(ggg, zeros); + let mut ggg_high = _mm256_unpackhi_epi8(ggg, zeros); + + let mut bbb_low = _mm256_unpacklo_epi8(bbb, zeros); + let mut bbb_high = _mm256_unpackhi_epi8(bbb, zeros); + + let aaa_low = _mm256_unpacklo_epi8(aaa, zeros); + let aaa_high = 
_mm256_unpackhi_epi8(aaa, zeros); + + rrr_low = avx2_div_by255(_mm256_mullo_epi16(rrr_low, aaa_low)); + rrr_high = avx2_div_by255(_mm256_mullo_epi16(rrr_high, aaa_high)); + ggg_low = avx2_div_by255(_mm256_mullo_epi16(ggg_low, aaa_low)); + ggg_high = avx2_div_by255(_mm256_mullo_epi16(ggg_high, aaa_high)); + bbb_low = avx2_div_by255(_mm256_mullo_epi16(bbb_low, aaa_low)); + bbb_high = avx2_div_by255(_mm256_mullo_epi16(bbb_high, aaa_high)); + + let rrr = _mm256_packus_epi16(rrr_low, rrr_high); + let ggg = _mm256_packus_epi16(ggg_low, ggg_high); + let bbb = _mm256_packus_epi16(bbb_low, bbb_high); + + let (rgba0, rgba1, rgba2, rgba3) = avx2_interleave_rgba(rrr, ggg, bbb, aaa); + let dst_ptr = dst.as_mut_ptr(); + _mm256_storeu_si256(dst_ptr as *mut __m256i, rgba0); + _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, rgba1); + _mm256_storeu_si256(dst_ptr.add(64) as *mut __m256i, rgba2); + _mm256_storeu_si256(dst_ptr.add(96) as *mut __m256i, rgba3); + } + + rem = rem.chunks_exact_mut(32 * 4).into_remainder(); + src_rem = src_rem.chunks_exact(32 * 4).remainder(); + + let zeros = _mm_setzero_si128(); + for (dst, src) in rem + .chunks_exact_mut(16 * 4) + .zip(src_rem.chunks_exact(16 * 4)) + { + let src_ptr = src.as_ptr(); + let rgba0 = _mm_loadu_si128(src_ptr as *const __m128i); + let rgba1 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); + let rgba2 = _mm_loadu_si128(src_ptr.add(32) as *const __m128i); + let rgba3 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i); + let (rrr, ggg, bbb, aaa) = sse_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); + + let mut rrr_low = _mm_unpacklo_epi8(rrr, zeros); + let mut rrr_high = _mm_unpackhi_epi8(rrr, zeros); + + let mut ggg_low = _mm_unpacklo_epi8(ggg, zeros); + let mut ggg_high = _mm_unpackhi_epi8(ggg, zeros); + + let mut bbb_low = _mm_unpacklo_epi8(bbb, zeros); + let mut bbb_high = _mm_unpackhi_epi8(bbb, zeros); + + let aaa_low = _mm_unpacklo_epi8(aaa, zeros); + let aaa_high = _mm_unpackhi_epi8(aaa, zeros); + + rrr_low = _mm_div_by_255_epi16(_mm_mullo_epi16(rrr_low, aaa_low)); + rrr_high = _mm_div_by_255_epi16(_mm_mullo_epi16(rrr_high, aaa_high)); + ggg_low = _mm_div_by_255_epi16(_mm_mullo_epi16(ggg_low, aaa_low)); + ggg_high = _mm_div_by_255_epi16(_mm_mullo_epi16(ggg_high, aaa_high)); + bbb_low = _mm_div_by_255_epi16(_mm_mullo_epi16(bbb_low, aaa_low)); + bbb_high = _mm_div_by_255_epi16(_mm_mullo_epi16(bbb_high, aaa_high)); + + let rrr = _mm_packus_epi16(rrr_low, rrr_high); + let ggg = _mm_packus_epi16(ggg_low, ggg_high); + let bbb = _mm_packus_epi16(bbb_low, bbb_high); + + let (rgba0, rgba1, rgba2, rgba3) = sse_interleave_rgba(rrr, ggg, bbb, aaa); + + let dst_ptr = dst.as_mut_ptr(); + _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); + _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba1); + _mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, rgba2); + _mm_storeu_si128(dst_ptr.add(48) as *mut __m128i, rgba3); } + rem = rem.chunks_exact_mut(16 * 4).into_remainder(); + src_rem = src_rem.chunks_exact(16 * 4).remainder(); + premultiply_alpha_rgba_row_impl(rem, src_rem); } @@ -249,54 +247,52 @@ pub(crate) fn avx_unpremultiply_alpha_rgba( unsafe fn avx_unpremultiply_alpha_rgba_impl_row(in_place: &mut [u8]) { let mut rem = in_place; - unsafe { - for dst in rem.chunks_exact_mut(32 * 4) { - let src_ptr = dst.as_ptr(); - let rgba0 = _mm256_loadu_si256(src_ptr as *const __m256i); - let rgba1 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); - let rgba2 = _mm256_loadu_si256(src_ptr.add(64) as *const __m256i); - let rgba3 = 
_mm256_loadu_si256(src_ptr.add(96) as *const __m256i); - let (rrr, ggg, bbb, aaa) = avx2_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); - - let rrr = avx2_unpremultiply_row(rrr, aaa); - let ggg = avx2_unpremultiply_row(ggg, aaa); - let bbb = avx2_unpremultiply_row(bbb, aaa); - - let (rgba0, rgba1, rgba2, rgba3) = avx2_interleave_rgba(rrr, ggg, bbb, aaa); - - let dst_ptr = dst.as_mut_ptr(); - _mm256_storeu_si256(dst_ptr as *mut __m256i, rgba0); - _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, rgba1); - _mm256_storeu_si256(dst_ptr.add(64) as *mut __m256i, rgba2); - _mm256_storeu_si256(dst_ptr.add(96) as *mut __m256i, rgba3); - } - - rem = rem.chunks_exact_mut(32 * 4).into_remainder(); - - for dst in rem.chunks_exact_mut(16 * 4) { - let src_ptr = dst.as_ptr(); - let rgba0 = _mm_loadu_si128(src_ptr as *const __m128i); - let rgba1 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); - let rgba2 = _mm_loadu_si128(src_ptr.add(32) as *const __m128i); - let rgba3 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i); - let (rrr, ggg, bbb, aaa) = sse_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); - - let rrr = sse_unpremultiply_row(rrr, aaa); - let ggg = sse_unpremultiply_row(ggg, aaa); - let bbb = sse_unpremultiply_row(bbb, aaa); - - let (rgba0, rgba1, rgba2, rgba3) = sse_interleave_rgba(rrr, ggg, bbb, aaa); - - let dst_ptr = dst.as_mut_ptr(); - _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba1); - _mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, rgba2); - _mm_storeu_si128(dst_ptr.add(48) as *mut __m128i, rgba3); - } - - rem = rem.chunks_exact_mut(16 * 4).into_remainder(); + for dst in rem.chunks_exact_mut(32 * 4) { + let src_ptr = dst.as_ptr(); + let rgba0 = _mm256_loadu_si256(src_ptr as *const __m256i); + let rgba1 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); + let rgba2 = _mm256_loadu_si256(src_ptr.add(64) as *const __m256i); + let rgba3 = _mm256_loadu_si256(src_ptr.add(96) as *const __m256i); + let (rrr, ggg, bbb, aaa) = avx2_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); + + let rrr = avx2_unpremultiply_row(rrr, aaa); + let ggg = avx2_unpremultiply_row(ggg, aaa); + let bbb = avx2_unpremultiply_row(bbb, aaa); + + let (rgba0, rgba1, rgba2, rgba3) = avx2_interleave_rgba(rrr, ggg, bbb, aaa); + + let dst_ptr = dst.as_mut_ptr(); + _mm256_storeu_si256(dst_ptr as *mut __m256i, rgba0); + _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, rgba1); + _mm256_storeu_si256(dst_ptr.add(64) as *mut __m256i, rgba2); + _mm256_storeu_si256(dst_ptr.add(96) as *mut __m256i, rgba3); + } + + rem = rem.chunks_exact_mut(32 * 4).into_remainder(); + + for dst in rem.chunks_exact_mut(16 * 4) { + let src_ptr = dst.as_ptr(); + let rgba0 = _mm_loadu_si128(src_ptr as *const __m128i); + let rgba1 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); + let rgba2 = _mm_loadu_si128(src_ptr.add(32) as *const __m128i); + let rgba3 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i); + let (rrr, ggg, bbb, aaa) = sse_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); + + let rrr = sse_unpremultiply_row(rrr, aaa); + let ggg = sse_unpremultiply_row(ggg, aaa); + let bbb = sse_unpremultiply_row(bbb, aaa); + + let (rgba0, rgba1, rgba2, rgba3) = sse_interleave_rgba(rrr, ggg, bbb, aaa); + + let dst_ptr = dst.as_mut_ptr(); + _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); + _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba1); + _mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, rgba2); + _mm_storeu_si128(dst_ptr.add(48) as *mut __m128i, rgba3); } + rem = 
rem.chunks_exact_mut(16 * 4).into_remainder(); + unpremultiply_alpha_rgba_row_impl(rem); } diff --git a/src/avx2/rgba_f16.rs b/src/avx2/rgba_f16.rs index 105b2f2..b2bc4d5 100644 --- a/src/avx2/rgba_f16.rs +++ b/src/avx2/rgba_f16.rs @@ -49,8 +49,7 @@ unsafe fn convolve_horizontal_parts_one_rgba_f16( let src_ptr = src.add(start_x * COMPONENTS); let rgb_pixel = _mm_loadu_si64(src_ptr as *const u8); let pixels = avx_combine_ps(_mm_cvtph_ps(rgb_pixel), _mm_setzero_ps()); - let acc = _mm256_fma_ps::(store_0, pixels, weight0); - acc + _mm256_fma_ps::(store_0, pixels, weight0) } #[inline(always)] @@ -70,8 +69,7 @@ unsafe fn convolve_horizontal_parts_4_rgba_f16( let rgb_pixel_1 = _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(rgb_pixels_row_0)); let acc = _mm256_fma_ps::(store_0, rgb_pixel_0, weight0); - let acc = _mm256_fma_ps::(acc, rgb_pixel_1, weight1); - acc + _mm256_fma_ps::(acc, rgb_pixel_1, weight1) } #[inline(always)] @@ -112,8 +110,7 @@ unsafe fn convolve_horizontal_parts_2_rgba_f16( const COMPONENTS: usize = 4; let src_ptr = src.add(start_x * COMPONENTS); let rgb_pixels = _mm_loadu_si128(src_ptr as *const __m128i); - let acc = _mm256_fma_ps::(store_0, _mm256_cvtph_ps(rgb_pixels), weight0); - acc + _mm256_fma_ps::(store_0, _mm256_cvtph_ps(rgb_pixels), weight0) } pub(crate) fn convolve_horizontal_rgba_avx_row_one_f16( diff --git a/src/avx2/rgba_u8_lb.rs b/src/avx2/rgba_u8_lb.rs index b38ad25..d9663d3 100644 --- a/src/avx2/rgba_u8_lb.rs +++ b/src/avx2/rgba_u8_lb.rs @@ -98,220 +98,214 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_impl( dst_stride: usize, filter_weights: &FilterWeights, ) { - unsafe { - const CHANNELS: usize = 4; + const CHANNELS: usize = 4; - const SCALE: i32 = 6; - const V_SHR: i32 = SCALE; - const ROUNDING: i16 = 1 << (V_SHR - 1); + const SCALE: i32 = 6; + const V_SHR: i32 = SCALE; + const ROUNDING: i16 = 1 << (V_SHR - 1); - let vld = _mm256_setr_epi16( - ROUNDING, ROUNDING, ROUNDING, ROUNDING, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ); + let vld = _mm256_setr_epi16( + ROUNDING, ROUNDING, ROUNDING, ROUNDING, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ); + + let shuffle_weights = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3); + + let (row0_ref, rest) = dst.split_at_mut(dst_stride); + let (row1_ref, rest) = rest.split_at_mut(dst_stride); + let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); + + let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS); + let iter_row1 = row1_ref.chunks_exact_mut(CHANNELS); + let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS); + let iter_row3 = row3_ref.chunks_exact_mut(CHANNELS); + + for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 + .zip(iter_row1) + .zip(iter_row2) + .zip(iter_row3) + .zip(filter_weights.bounds.iter()) + .zip( + filter_weights + .weights + .chunks_exact(filter_weights.aligned_size), + ) + { + let mut jx = 0usize; + let mut store_0 = vld; + let mut store_1 = vld; + let mut store_2 = vld; + let mut store_3 = vld; + + let src0 = src; + let src1 = src0.get_unchecked(src_stride..); + let src2 = src1.get_unchecked(src_stride..); + let src3 = src2.get_unchecked(src_stride..); + + while jx + 8 < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 8)); - let shuffle_weights = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3); - - let (row0_ref, rest) = dst.split_at_mut(dst_stride); - let (row1_ref, rest) = rest.split_at_mut(dst_stride); - let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); - - let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS); - let iter_row1 = 
row1_ref.chunks_exact_mut(CHANNELS); - let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS); - let iter_row3 = row3_ref.chunks_exact_mut(CHANNELS); - - for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 - .zip(iter_row1) - .zip(iter_row2) - .zip(iter_row3) - .zip(filter_weights.bounds.iter()) - .zip( - filter_weights - .weights - .chunks_exact(filter_weights.aligned_size), - ) - { - let mut jx = 0usize; - let mut store_0 = vld; - let mut store_1 = vld; - let mut store_2 = vld; - let mut store_3 = vld; - - let src0 = src; - let src1 = src0.get_unchecked(src_stride..); - let src2 = src1.get_unchecked(src_stride..); - let src3 = src2.get_unchecked(src_stride..); - - while jx + 8 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 8)); - - let w0 = w_ptr[0]; - let w1 = w_ptr[1]; - let w2 = w_ptr[2]; - let w3 = w_ptr[3]; - let w4 = w_ptr[4]; - let w5 = w_ptr[5]; - let w6 = w_ptr[6]; - let w7 = w_ptr[7]; - - let weights0 = _mm256_setr_epi16( - w0, w0, w0, w0, w1, w1, w1, w1, w4, w4, w4, w5, w5, w5, w5, w5, - ); - let weights1 = _mm256_setr_epi16( - w2, w2, w2, w2, w3, w3, w3, w3, w6, w6, w6, w6, w7, w7, w7, w7, - ); - - let start_bounds = bounds.start + jx; - - let rgb_pixel_0 = _mm256_loadu_si256( - src0.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m256i, - ); - let rgb_pixel_1 = _mm256_loadu_si256( - src1.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m256i, - ); - let rgb_pixel_2 = _mm256_loadu_si256( - src2.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m256i, - ); - let rgb_pixel_3 = _mm256_loadu_si256( - src3.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m256i, - ); - - store_0 = hdot4(store_0, rgb_pixel_0, weights0, weights1); - store_1 = hdot4(store_1, rgb_pixel_1, weights0, weights1); - store_2 = hdot4(store_2, rgb_pixel_2, weights0, weights1); - store_3 = hdot4(store_3, rgb_pixel_3, weights0, weights1); - - jx += 8; - } - - while jx + 4 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 4)); - - let w0 = w_ptr[0]; - let w1 = w_ptr[1]; - let w2 = w_ptr[2]; - let w3 = w_ptr[3]; - - let weights = _mm256_setr_epi16( - w0, w0, w0, w0, w1, w1, w1, w1, w2, w2, w2, w2, w3, w3, w3, w3, - ); - - let start_bounds = bounds.start + jx; - - let rgb_pixel_0 = - _mm256_permute4x64_epi64::<0x50>(_mm256_castsi128_si256(_mm_loadu_si128( - src0.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ))); - - let rgb_pixel_1 = - _mm256_permute4x64_epi64::<0x50>(_mm256_castsi128_si256(_mm_loadu_si128( - src1.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ))); - let rgb_pixel_2 = - _mm256_permute4x64_epi64::<0x50>(_mm256_castsi128_si256(_mm_loadu_si128( - src2.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ))); - let rgb_pixel_3 = - _mm256_permute4x64_epi64::<0x50>(_mm256_castsi128_si256(_mm_loadu_si128( - src3.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ))); - - store_0 = hdot2(store_0, rgb_pixel_0, weights); - store_1 = hdot2(store_1, rgb_pixel_1, weights); - store_2 = hdot2(store_2, rgb_pixel_2, weights); - store_3 = hdot2(store_3, rgb_pixel_3, weights); - - jx += 4; - } - - let mut store_0 = _mm_add_epi16( - _mm256_castsi256_si128(store_0), - _mm256_extracti128_si256::<1>(store_0), + let w0 = w_ptr[0]; + let w1 = w_ptr[1]; + let w2 = w_ptr[2]; + let w3 = w_ptr[3]; + let w4 = w_ptr[4]; + let w5 = w_ptr[5]; + let w6 = w_ptr[6]; + let w7 = w_ptr[7]; + + let weights0 = _mm256_setr_epi16( + w0, w0, 
w0, w0, w1, w1, w1, w1, w4, w4, w4, w5, w5, w5, w5, w5, ); - let mut store_1 = _mm_add_epi16( - _mm256_castsi256_si128(store_1), - _mm256_extracti128_si256::<1>(store_1), + let weights1 = _mm256_setr_epi16( + w2, w2, w2, w2, w3, w3, w3, w3, w6, w6, w6, w6, w7, w7, w7, w7, ); - let mut store_2 = _mm_add_epi16( - _mm256_castsi256_si128(store_2), - _mm256_extracti128_si256::<1>(store_2), + + let start_bounds = bounds.start + jx; + + let rgb_pixel_0 = _mm256_loadu_si256( + src0.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m256i, ); - let mut store_3 = _mm_add_epi16( - _mm256_castsi256_si128(store_3), - _mm256_extracti128_si256::<1>(store_3), + let rgb_pixel_1 = _mm256_loadu_si256( + src1.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m256i, ); - - while jx + 2 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 2)); - let bounds_start = bounds.start + jx; - - let weight01 = _mm_shuffle_epi8( - _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()), - shuffle_weights, - ); - - let rgb_pixel_0 = - _mm_loadu_si64(src0.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - let rgb_pixel_1 = - _mm_loadu_si64(src1.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - let rgb_pixel_2 = - _mm_loadu_si64(src2.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - let rgb_pixel_3 = - _mm_loadu_si64(src3.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - - store_0 = hdot(store_0, rgb_pixel_0, weight01); - store_1 = hdot(store_1, rgb_pixel_1, weight01); - store_2 = hdot(store_2, rgb_pixel_2, weight01); - store_3 = hdot(store_3, rgb_pixel_3, weight01); - - jx += 2; - } - - while jx < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 1)); - - let weight0 = _mm_set1_epi16(w_ptr[0]); - - let start_bounds = bounds.start + jx; - - store_0 = - convolve_horizontal_parts_one_rgba_sse(start_bounds, src0, weight0, store_0); - store_1 = - convolve_horizontal_parts_one_rgba_sse(start_bounds, src1, weight0, store_1); - store_2 = - convolve_horizontal_parts_one_rgba_sse(start_bounds, src2, weight0, store_2); - store_3 = - convolve_horizontal_parts_one_rgba_sse(start_bounds, src3, weight0, store_3); - jx += 1; - } - - store_0 = _mm_add_hi_lo_epi16(store_0); - store_1 = _mm_add_hi_lo_epi16(store_1); - store_2 = _mm_add_hi_lo_epi16(store_2); - store_3 = _mm_add_hi_lo_epi16(store_3); - - let store_16_8_0 = _mm_srai_epi16::(store_0); - let store_16_8_1 = _mm_srai_epi16::(store_1); - let store_16_8_2 = _mm_srai_epi16::(store_2); - let store_16_8_3 = _mm_srai_epi16::(store_3); - - _mm_storeu_si32( - chunk0.as_mut_ptr() as *mut _, - _mm_packus_epi16(store_16_8_0, store_16_8_0), + let rgb_pixel_2 = _mm256_loadu_si256( + src2.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m256i, ); - _mm_storeu_si32( - chunk1.as_mut_ptr() as *mut _, - _mm_packus_epi16(store_16_8_1, store_16_8_1), + let rgb_pixel_3 = _mm256_loadu_si256( + src3.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m256i, ); - _mm_storeu_si32( - chunk2.as_mut_ptr() as *mut _, - _mm_packus_epi16(store_16_8_2, store_16_8_2), + + store_0 = hdot4(store_0, rgb_pixel_0, weights0, weights1); + store_1 = hdot4(store_1, rgb_pixel_1, weights0, weights1); + store_2 = hdot4(store_2, rgb_pixel_2, weights0, weights1); + store_3 = hdot4(store_3, rgb_pixel_3, weights0, weights1); + + jx += 8; + } + + while jx + 4 < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 4)); + + let w0 = w_ptr[0]; + let w1 = w_ptr[1]; + let w2 = w_ptr[2]; + let w3 = w_ptr[3]; + + let weights = 
_mm256_setr_epi16( + w0, w0, w0, w0, w1, w1, w1, w1, w2, w2, w2, w2, w3, w3, w3, w3, ); - _mm_storeu_si32( - chunk3.as_mut_ptr() as *mut _, - _mm_packus_epi16(store_16_8_3, store_16_8_3), + + let start_bounds = bounds.start + jx; + + let rgb_pixel_0 = + _mm256_permute4x64_epi64::<0x50>(_mm256_castsi128_si256(_mm_loadu_si128( + src0.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, + ))); + + let rgb_pixel_1 = + _mm256_permute4x64_epi64::<0x50>(_mm256_castsi128_si256(_mm_loadu_si128( + src1.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, + ))); + let rgb_pixel_2 = + _mm256_permute4x64_epi64::<0x50>(_mm256_castsi128_si256(_mm_loadu_si128( + src2.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, + ))); + let rgb_pixel_3 = + _mm256_permute4x64_epi64::<0x50>(_mm256_castsi128_si256(_mm_loadu_si128( + src3.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, + ))); + + store_0 = hdot2(store_0, rgb_pixel_0, weights); + store_1 = hdot2(store_1, rgb_pixel_1, weights); + store_2 = hdot2(store_2, rgb_pixel_2, weights); + store_3 = hdot2(store_3, rgb_pixel_3, weights); + + jx += 4; + } + + let mut store_0 = _mm_add_epi16( + _mm256_castsi256_si128(store_0), + _mm256_extracti128_si256::<1>(store_0), + ); + let mut store_1 = _mm_add_epi16( + _mm256_castsi256_si128(store_1), + _mm256_extracti128_si256::<1>(store_1), + ); + let mut store_2 = _mm_add_epi16( + _mm256_castsi256_si128(store_2), + _mm256_extracti128_si256::<1>(store_2), + ); + let mut store_3 = _mm_add_epi16( + _mm256_castsi256_si128(store_3), + _mm256_extracti128_si256::<1>(store_3), + ); + + while jx + 2 < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 2)); + let bounds_start = bounds.start + jx; + + let weight01 = _mm_shuffle_epi8( + _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()), + shuffle_weights, ); + + let rgb_pixel_0 = + _mm_loadu_si64(src0.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); + let rgb_pixel_1 = + _mm_loadu_si64(src1.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); + let rgb_pixel_2 = + _mm_loadu_si64(src2.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); + let rgb_pixel_3 = + _mm_loadu_si64(src3.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); + + store_0 = hdot(store_0, rgb_pixel_0, weight01); + store_1 = hdot(store_1, rgb_pixel_1, weight01); + store_2 = hdot(store_2, rgb_pixel_2, weight01); + store_3 = hdot(store_3, rgb_pixel_3, weight01); + + jx += 2; } + + while jx < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 1)); + + let weight0 = _mm_set1_epi16(w_ptr[0]); + + let start_bounds = bounds.start + jx; + + store_0 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src0, weight0, store_0); + store_1 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src1, weight0, store_1); + store_2 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src2, weight0, store_2); + store_3 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src3, weight0, store_3); + jx += 1; + } + + store_0 = _mm_add_hi_lo_epi16(store_0); + store_1 = _mm_add_hi_lo_epi16(store_1); + store_2 = _mm_add_hi_lo_epi16(store_2); + store_3 = _mm_add_hi_lo_epi16(store_3); + + let store_16_8_0 = _mm_srai_epi16::(store_0); + let store_16_8_1 = _mm_srai_epi16::(store_1); + let store_16_8_2 = _mm_srai_epi16::(store_2); + let store_16_8_3 = _mm_srai_epi16::(store_3); + + _mm_storeu_si32( + chunk0.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_0, store_16_8_0), + ); + _mm_storeu_si32( + chunk1.as_mut_ptr() as 
*mut _, + _mm_packus_epi16(store_16_8_1, store_16_8_1), + ); + _mm_storeu_si32( + chunk2.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_2, store_16_8_2), + ); + _mm_storeu_si32( + chunk3.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_3, store_16_8_3), + ); } } diff --git a/src/sse/alpha_f16.rs b/src/sse/alpha_f16.rs index b5ed21d..43f8bd0 100644 --- a/src/sse/alpha_f16.rs +++ b/src/sse/alpha_f16.rs @@ -195,78 +195,76 @@ unsafe fn sse_unpremultiply_alpha_rgba_f16c( unsafe fn sse_unpremultiply_alpha_rgba_f16_row_impl(in_place: &mut [half::f16]) { let mut rem = in_place; - unsafe { - for dst in rem.chunks_exact_mut(8 * 4) { - let src_ptr = dst.as_ptr(); - let lane0 = _mm_loadu_si128(src_ptr as *const __m128i); - let lane1 = _mm_loadu_si128(src_ptr.add(8) as *const __m128i); - let lane2 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); - let lane3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); - let pixel = sse_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); - - let low_alpha = _mm_cvtph_psx::(pixel.3); - let zeros = _mm_setzero_ps(); - let low_alpha_zero_mask = _mm_cmpeq_ps(low_alpha, zeros); - let low_r = _mm_blendv_ps( - _mm_mul_ps(_mm_cvtph_psx::(pixel.0), low_alpha), - zeros, - low_alpha_zero_mask, - ); - let low_g = _mm_blendv_ps( - _mm_mul_ps(_mm_cvtph_psx::(pixel.1), low_alpha), - zeros, - low_alpha_zero_mask, - ); - let low_b = _mm_blendv_ps( - _mm_mul_ps(_mm_cvtph_psx::(pixel.2), low_alpha), - zeros, - low_alpha_zero_mask, - ); + for dst in rem.chunks_exact_mut(8 * 4) { + let src_ptr = dst.as_ptr(); + let lane0 = _mm_loadu_si128(src_ptr as *const __m128i); + let lane1 = _mm_loadu_si128(src_ptr.add(8) as *const __m128i); + let lane2 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); + let lane3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); + let pixel = sse_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); - let high_alpha = _mm_cvtph_psx::(_mm_srli_si128::<8>(pixel.3)); - let high_alpha_zero_mask = _mm_cmpeq_ps(high_alpha, zeros); - let high_r = _mm_blendv_ps( - _mm_mul_ps( - _mm_cvtph_psx::(_mm_srli_si128::<8>(pixel.0)), - high_alpha, - ), - zeros, - high_alpha_zero_mask, - ); - let high_g = _mm_blendv_ps( - _mm_mul_ps( - _mm_cvtph_psx::(_mm_srli_si128::<8>(pixel.1)), - high_alpha, - ), - zeros, - high_alpha_zero_mask, - ); - let high_b = _mm_blendv_ps( - _mm_mul_ps( - _mm_cvtph_psx::(_mm_srli_si128::<8>(pixel.2)), - high_alpha, - ), - zeros, - high_alpha_zero_mask, - ); - let r_values = - _mm_unpacklo_epi64(_mm_cvtps_phx::(low_r), _mm_cvtps_phx::(high_r)); - let g_values = - _mm_unpacklo_epi64(_mm_cvtps_phx::(low_g), _mm_cvtps_phx::(high_g)); - let b_values = - _mm_unpacklo_epi64(_mm_cvtps_phx::(low_b), _mm_cvtps_phx::(high_b)); - let dst_ptr = dst.as_mut_ptr(); - let (d_lane0, d_lane1, d_lane2, d_lane3) = - sse_interleave_rgba_epi16(r_values, g_values, b_values, pixel.3); - _mm_storeu_si128(dst_ptr as *mut __m128i, d_lane0); - _mm_storeu_si128(dst_ptr.add(8) as *mut __m128i, d_lane1); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, d_lane2); - _mm_storeu_si128(dst_ptr.add(24) as *mut __m128i, d_lane3); - } + let low_alpha = _mm_cvtph_psx::(pixel.3); + let zeros = _mm_setzero_ps(); + let low_alpha_zero_mask = _mm_cmpeq_ps(low_alpha, zeros); + let low_r = _mm_blendv_ps( + _mm_mul_ps(_mm_cvtph_psx::(pixel.0), low_alpha), + zeros, + low_alpha_zero_mask, + ); + let low_g = _mm_blendv_ps( + _mm_mul_ps(_mm_cvtph_psx::(pixel.1), low_alpha), + zeros, + low_alpha_zero_mask, + ); + let low_b = _mm_blendv_ps( + 
_mm_mul_ps(_mm_cvtph_psx::(pixel.2), low_alpha), + zeros, + low_alpha_zero_mask, + ); - rem = rem.chunks_exact_mut(8 * 4).into_remainder(); + let high_alpha = _mm_cvtph_psx::(_mm_srli_si128::<8>(pixel.3)); + let high_alpha_zero_mask = _mm_cmpeq_ps(high_alpha, zeros); + let high_r = _mm_blendv_ps( + _mm_mul_ps( + _mm_cvtph_psx::(_mm_srli_si128::<8>(pixel.0)), + high_alpha, + ), + zeros, + high_alpha_zero_mask, + ); + let high_g = _mm_blendv_ps( + _mm_mul_ps( + _mm_cvtph_psx::(_mm_srli_si128::<8>(pixel.1)), + high_alpha, + ), + zeros, + high_alpha_zero_mask, + ); + let high_b = _mm_blendv_ps( + _mm_mul_ps( + _mm_cvtph_psx::(_mm_srli_si128::<8>(pixel.2)), + high_alpha, + ), + zeros, + high_alpha_zero_mask, + ); + let r_values = + _mm_unpacklo_epi64(_mm_cvtps_phx::(low_r), _mm_cvtps_phx::(high_r)); + let g_values = + _mm_unpacklo_epi64(_mm_cvtps_phx::(low_g), _mm_cvtps_phx::(high_g)); + let b_values = + _mm_unpacklo_epi64(_mm_cvtps_phx::(low_b), _mm_cvtps_phx::(high_b)); + let dst_ptr = dst.as_mut_ptr(); + let (d_lane0, d_lane1, d_lane2, d_lane3) = + sse_interleave_rgba_epi16(r_values, g_values, b_values, pixel.3); + _mm_storeu_si128(dst_ptr as *mut __m128i, d_lane0); + _mm_storeu_si128(dst_ptr.add(8) as *mut __m128i, d_lane1); + _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, d_lane2); + _mm_storeu_si128(dst_ptr.add(24) as *mut __m128i, d_lane3); } + rem = rem.chunks_exact_mut(8 * 4).into_remainder(); + unpremultiply_pixel_f16_row(rem); } diff --git a/src/sse/alpha_u16.rs b/src/sse/alpha_u16.rs index 38e79e4..5910361 100644 --- a/src/sse/alpha_u16.rs +++ b/src/sse/alpha_u16.rs @@ -88,6 +88,17 @@ pub(crate) unsafe fn _mm_div_by_65535_epi32(v: __m128i) -> __m128i { _mm_srli_epi32::(_mm_add_epi32(v, _mm_srli_epi32::(v))) } +#[inline(always)] +unsafe fn _mm_div_by(v: __m128i) -> __m128i { + if BIT_DEPTH == 10 { + _mm_div_by_1023_epi32(v) + } else if BIT_DEPTH == 12 { + _mm_div_by_4095_epi32(v) + } else { + _mm_div_by_65535_epi32(v) + } +} + pub(crate) fn unpremultiply_alpha_sse_rgba_u16( in_place: &mut [u16], width: usize, @@ -226,169 +237,144 @@ pub(crate) fn premultiply_alpha_sse_rgba_u16( } } -#[target_feature(enable = "sse4.1")] -unsafe fn premultiply_alpha_sse_rgba_u16_row_impl(dst: &mut [u16], src: &[u16], bit_depth: usize) { - let max_colors = (1 << bit_depth) - 1; +trait Sse41PremultiplyExecutor { + unsafe fn premultiply(&self, dst: &mut [u16], src: &[u16], bit_depth: usize); +} - let mut rem = dst; - let mut src_rem = src; +#[derive(Default)] +struct Sse41PremultiplyExecutorDefault {} - unsafe { - if bit_depth == 10 { - let zeros = _mm_setzero_si128(); - for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { - let src_ptr = src.as_ptr(); - let row0 = _mm_loadu_si128(src_ptr as *const __m128i); - let row1 = _mm_loadu_si128(src_ptr.add(8) as *const __m128i); - let row2 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); - let row3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); - let (rrrr, gggg, bbbb, aaaa) = sse_deinterleave_rgba_epi16(row0, row1, row2, row3); - - let a_lo_f = _mm_unpacklo_epi16(aaaa, zeros); - let a_hi_f = _mm_unpackhi_epi16(aaaa, zeros); - - let new_rrrr = _mm_packus_epi32( - _mm_div_by_1023_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(rrrr, zeros), a_lo_f)), - _mm_div_by_1023_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(rrrr, zeros), a_hi_f)), - ); - let new_gggg = _mm_packus_epi32( - _mm_div_by_1023_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(gggg, zeros), a_lo_f)), - _mm_div_by_1023_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(gggg, zeros), 
a_hi_f)), - ); - let new_bbbb = _mm_packus_epi32( - _mm_div_by_1023_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(bbbb, zeros), a_lo_f)), - _mm_div_by_1023_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(bbbb, zeros), a_hi_f)), - ); - - let (rgba0, rgba1, rgba2, rgba3) = - sse_interleave_rgba_epi16(new_rrrr, new_gggg, new_bbbb, aaaa); - - let dst_ptr = dst.as_mut_ptr(); - _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); - _mm_storeu_si128(dst_ptr.add(8) as *mut __m128i, rgba1); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba2); - _mm_storeu_si128(dst_ptr.add(24) as *mut __m128i, rgba3); - } - } else if bit_depth == 12 { - let zeros = _mm_setzero_si128(); - for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { - let src_ptr = src.as_ptr(); - let row0 = _mm_loadu_si128(src_ptr as *const __m128i); - let row1 = _mm_loadu_si128(src_ptr.add(8) as *const __m128i); - let row2 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); - let row3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); - let (rrrr, gggg, bbbb, aaaa) = sse_deinterleave_rgba_epi16(row0, row1, row2, row3); - - let a_lo_f = _mm_unpacklo_epi16(aaaa, zeros); - let a_hi_f = _mm_unpackhi_epi16(aaaa, zeros); - - let new_rrrr = _mm_packus_epi32( - _mm_div_by_4095_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(rrrr, zeros), a_lo_f)), - _mm_div_by_4095_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(rrrr, zeros), a_hi_f)), - ); - let new_gggg = _mm_packus_epi32( - _mm_div_by_4095_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(gggg, zeros), a_lo_f)), - _mm_div_by_4095_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(gggg, zeros), a_hi_f)), - ); - let new_bbbb = _mm_packus_epi32( - _mm_div_by_4095_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(bbbb, zeros), a_lo_f)), - _mm_div_by_4095_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(bbbb, zeros), a_hi_f)), - ); - - let (rgba0, rgba1, rgba2, rgba3) = - sse_interleave_rgba_epi16(new_rrrr, new_gggg, new_bbbb, aaaa); - - let dst_ptr = dst.as_mut_ptr(); - _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); - _mm_storeu_si128(dst_ptr.add(8) as *mut __m128i, rgba1); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba2); - _mm_storeu_si128(dst_ptr.add(24) as *mut __m128i, rgba3); - } - } else if bit_depth == 16 { - let zeros = _mm_setzero_si128(); - for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { - let src_ptr = src.as_ptr(); - let row0 = _mm_loadu_si128(src_ptr as *const __m128i); - let row1 = _mm_loadu_si128(src_ptr.add(8) as *const __m128i); - let row2 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); - let row3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); - let (rrrr, gggg, bbbb, aaaa) = sse_deinterleave_rgba_epi16(row0, row1, row2, row3); - - let a_lo_f = _mm_unpacklo_epi16(aaaa, zeros); - let a_hi_f = _mm_unpackhi_epi16(aaaa, zeros); - - let new_rrrr = _mm_packus_epi32( - _mm_div_by_65535_epi32(_mm_mullo_epi32( - _mm_unpacklo_epi16(rrrr, zeros), - a_lo_f, - )), - _mm_div_by_65535_epi32(_mm_mullo_epi32( - _mm_unpackhi_epi16(rrrr, zeros), - a_hi_f, - )), - ); - let new_gggg = _mm_packus_epi32( - _mm_div_by_65535_epi32(_mm_mullo_epi32( - _mm_unpacklo_epi16(gggg, zeros), - a_lo_f, - )), - _mm_div_by_65535_epi32(_mm_mullo_epi32( - _mm_unpackhi_epi16(gggg, zeros), - a_hi_f, - )), - ); - let new_bbbb = _mm_packus_epi32( - _mm_div_by_65535_epi32(_mm_mullo_epi32( - _mm_unpacklo_epi16(bbbb, zeros), - a_lo_f, - )), - _mm_div_by_65535_epi32(_mm_mullo_epi32( - _mm_unpackhi_epi16(bbbb, zeros), - a_hi_f, - )), - ); - - let (rgba0, rgba1, rgba2, rgba3) = - 
sse_interleave_rgba_epi16(new_rrrr, new_gggg, new_bbbb, aaaa); - - let dst_ptr = dst.as_mut_ptr(); - _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); - _mm_storeu_si128(dst_ptr.add(8) as *mut __m128i, rgba1); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba2); - _mm_storeu_si128(dst_ptr.add(24) as *mut __m128i, rgba3); - } - } else { - let v_max_colors_scale = - _mm_div_ps(_mm_set1_ps(1.), _mm_cvtepi32_ps(_mm_set1_epi32(max_colors))); - for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { - let src_ptr = src.as_ptr(); - let row0 = _mm_loadu_si128(src_ptr as *const __m128i); - let row1 = _mm_loadu_si128(src_ptr.add(8) as *const __m128i); - let row2 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); - let row3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); - let (rrrr, gggg, bbbb, aaaa) = sse_deinterleave_rgba_epi16(row0, row1, row2, row3); - - let a_lo_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(aaaa, _mm_setzero_si128())); - let a_hi_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(aaaa, _mm_setzero_si128())); - - let new_rrrr = sse_premultiply_row_u16(rrrr, a_lo_f, a_hi_f, v_max_colors_scale); - let new_gggg = sse_premultiply_row_u16(gggg, a_lo_f, a_hi_f, v_max_colors_scale); - let new_bbbb = sse_premultiply_row_u16(bbbb, a_lo_f, a_hi_f, v_max_colors_scale); - - let (rgba0, rgba1, rgba2, rgba3) = - sse_interleave_rgba_epi16(new_rrrr, new_gggg, new_bbbb, aaaa); - - let dst_ptr = dst.as_mut_ptr(); - _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); - _mm_storeu_si128(dst_ptr.add(8) as *mut __m128i, rgba1); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba2); - _mm_storeu_si128(dst_ptr.add(24) as *mut __m128i, rgba3); - } +impl Sse41PremultiplyExecutor + for Sse41PremultiplyExecutorDefault +{ + #[target_feature(enable = "sse4.1")] + unsafe fn premultiply(&self, dst: &mut [u16], src: &[u16], bit_depth: usize) { + let max_colors = (1 << bit_depth) - 1; + + let mut rem = dst; + let mut src_rem = src; + + let zeros = _mm_setzero_si128(); + for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { + let src_ptr = src.as_ptr(); + let row0 = _mm_loadu_si128(src_ptr as *const __m128i); + let row1 = _mm_loadu_si128(src_ptr.add(8) as *const __m128i); + let row2 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); + let row3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); + let (rrrr, gggg, bbbb, aaaa) = sse_deinterleave_rgba_epi16(row0, row1, row2, row3); + + let a_lo_f = _mm_unpacklo_epi16(aaaa, zeros); + let a_hi_f = _mm_unpackhi_epi16(aaaa, zeros); + + let new_rrrr = _mm_packus_epi32( + _mm_div_by::(_mm_madd_epi16(_mm_unpacklo_epi16(rrrr, zeros), a_lo_f)), + _mm_div_by::(_mm_madd_epi16(_mm_unpackhi_epi16(rrrr, zeros), a_hi_f)), + ); + let new_gggg = _mm_packus_epi32( + _mm_div_by::(_mm_madd_epi16(_mm_unpacklo_epi16(gggg, zeros), a_lo_f)), + _mm_div_by::(_mm_madd_epi16(_mm_unpackhi_epi16(gggg, zeros), a_hi_f)), + ); + let new_bbbb = _mm_packus_epi32( + _mm_div_by::(_mm_madd_epi16(_mm_unpacklo_epi16(bbbb, zeros), a_lo_f)), + _mm_div_by::(_mm_madd_epi16(_mm_unpackhi_epi16(bbbb, zeros), a_hi_f)), + ); + + let (rgba0, rgba1, rgba2, rgba3) = + sse_interleave_rgba_epi16(new_rrrr, new_gggg, new_bbbb, aaaa); + + let dst_ptr = dst.as_mut_ptr(); + _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); + _mm_storeu_si128(dst_ptr.add(8) as *mut __m128i, rgba1); + _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba2); + _mm_storeu_si128(dst_ptr.add(24) as *mut __m128i, rgba3); } rem = rem.chunks_exact_mut(8 * 4).into_remainder(); src_rem = 
src_rem.chunks_exact(8 * 4).remainder(); + + premultiply_alpha_rgba_row(rem, src_rem, max_colors as u32); } +} + +#[derive(Default)] +struct Sse41PremultiplyExecutorAny {} + +impl Sse41PremultiplyExecutor for Sse41PremultiplyExecutorAny { + #[target_feature(enable = "sse4.1")] + unsafe fn premultiply(&self, dst: &mut [u16], src: &[u16], bit_depth: usize) { + let max_colors = (1 << bit_depth) - 1; + + let mut rem = dst; + let mut src_rem = src; - premultiply_alpha_rgba_row(rem, src_rem, max_colors as u32); + let v_max_colors_scale = + _mm_div_ps(_mm_set1_ps(1.), _mm_cvtepi32_ps(_mm_set1_epi32(max_colors))); + for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { + let src_ptr = src.as_ptr(); + let row0 = _mm_loadu_si128(src_ptr as *const __m128i); + let row1 = _mm_loadu_si128(src_ptr.add(8) as *const __m128i); + let row2 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); + let row3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); + let (rrrr, gggg, bbbb, aaaa) = sse_deinterleave_rgba_epi16(row0, row1, row2, row3); + + let a_lo_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(aaaa, _mm_setzero_si128())); + let a_hi_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(aaaa, _mm_setzero_si128())); + + let new_rrrr = sse_premultiply_row_u16(rrrr, a_lo_f, a_hi_f, v_max_colors_scale); + let new_gggg = sse_premultiply_row_u16(gggg, a_lo_f, a_hi_f, v_max_colors_scale); + let new_bbbb = sse_premultiply_row_u16(bbbb, a_lo_f, a_hi_f, v_max_colors_scale); + + let (rgba0, rgba1, rgba2, rgba3) = + sse_interleave_rgba_epi16(new_rrrr, new_gggg, new_bbbb, aaaa); + + let dst_ptr = dst.as_mut_ptr(); + _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); + _mm_storeu_si128(dst_ptr.add(8) as *mut __m128i, rgba1); + _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba2); + _mm_storeu_si128(dst_ptr.add(24) as *mut __m128i, rgba3); + } + + rem = rem.chunks_exact_mut(8 * 4).into_remainder(); + src_rem = src_rem.chunks_exact(8 * 4).remainder(); + + premultiply_alpha_rgba_row(rem, src_rem, max_colors as u32); + } +} + +#[target_feature(enable = "sse4.1")] +unsafe fn pma_sse41_rgba16_dispatch( + dst: &mut [u16], + src: &[u16], + bit_depth: usize, + executor: impl Sse41PremultiplyExecutor, +) { + executor.premultiply(dst, src, bit_depth); +} + +#[target_feature(enable = "sse4.1")] +unsafe fn premultiply_alpha_sse_rgba_u16_row_impl(dst: &mut [u16], src: &[u16], bit_depth: usize) { + if bit_depth == 10 { + pma_sse41_rgba16_dispatch( + dst, + src, + bit_depth, + Sse41PremultiplyExecutorDefault::<10>::default(), + ) + } else if bit_depth == 12 { + pma_sse41_rgba16_dispatch( + dst, + src, + bit_depth, + Sse41PremultiplyExecutorDefault::<12>::default(), + ) + } else if bit_depth == 16 { + pma_sse41_rgba16_dispatch( + dst, + src, + bit_depth, + Sse41PremultiplyExecutorDefault::<16>::default(), + ) + } else { + pma_sse41_rgba16_dispatch(dst, src, bit_depth, Sse41PremultiplyExecutorAny::default()) + } } diff --git a/src/sse/alpha_u8.rs b/src/sse/alpha_u8.rs index 32f01cd..53a75ab 100644 --- a/src/sse/alpha_u8.rs +++ b/src/sse/alpha_u8.rs @@ -111,55 +111,53 @@ unsafe fn sse_premultiply_alpha_rgba_impl_row(dst: &mut [u8], src: &[u8]) { let mut rem = dst; let mut src_rem = src; - unsafe { - let zeros = _mm_setzero_si128(); - for (dst, src) in rem - .chunks_exact_mut(16 * 4) - .zip(src_rem.chunks_exact(16 * 4)) - { - let src_ptr = src.as_ptr(); - let rgba0 = _mm_loadu_si128(src_ptr as *const __m128i); - let rgba1 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); - let rgba2 = _mm_loadu_si128(src_ptr.add(32) as *const 
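
// --- Reference sketch: the const-generic executor dispatch used above ---
// The refactor replaces one large `if bit_depth == …` body with per-depth executors:
// the bit depth becomes a const generic, so each specialisation is monomorphised with
// its own division constant, while `premultiply_alpha_sse_rgba_u16_row_impl` picks an
// executor at run time. In the patch the executors are `unsafe` and carry
// `#[target_feature(enable = "sse4.1")]`, and the dispatcher is gated the same way;
// this scalar sketch drops the intrinsics, and the names `Premultiply`, `FixedDepth`
// and `AnyDepth` are illustrative, not the crate's.
trait Premultiply {
    fn premultiply_row(&self, dst: &mut [u16], src: &[u16], bit_depth: u32);
}

struct FixedDepth<const BIT_DEPTH: u32>;

impl<const BIT_DEPTH: u32> Premultiply for FixedDepth<BIT_DEPTH> {
    fn premultiply_row(&self, dst: &mut [u16], src: &[u16], _bit_depth: u32) {
        for (d, s) in dst.chunks_exact_mut(4).zip(src.chunks_exact(4)) {
            let a = s[3] as u32;
            for c in 0..3 {
                // 2^BIT_DEPTH - 1 is known at compile time, so the division
                // collapses into the shift-add form sketched earlier.
                let t = s[c] as u32 * a + (1 << (BIT_DEPTH - 1));
                d[c] = ((t + (t >> BIT_DEPTH)) >> BIT_DEPTH) as u16;
            }
            d[3] = s[3];
        }
    }
}

struct AnyDepth;

impl Premultiply for AnyDepth {
    fn premultiply_row(&self, dst: &mut [u16], src: &[u16], bit_depth: u32) {
        // Arbitrary depths fall back to a reciprocal multiply, like the float path above.
        let scale = 1.0 / ((1u32 << bit_depth) - 1) as f32;
        for (d, s) in dst.chunks_exact_mut(4).zip(src.chunks_exact(4)) {
            let a = s[3] as f32;
            for c in 0..3 {
                d[c] = (s[c] as f32 * a * scale).round() as u16;
            }
            d[3] = s[3];
        }
    }
}

fn premultiply_row_dispatch(dst: &mut [u16], src: &[u16], bit_depth: u32) {
    match bit_depth {
        10 => FixedDepth::<10>.premultiply_row(dst, src, bit_depth),
        12 => FixedDepth::<12>.premultiply_row(dst, src, bit_depth),
        16 => FixedDepth::<16>.premultiply_row(dst, src, bit_depth),
        _ => AnyDepth.premultiply_row(dst, src, bit_depth),
    }
}
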
__m128i); - let rgba3 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i); - let (rrr, ggg, bbb, aaa) = sse_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); - - let mut rrr_low = _mm_unpacklo_epi8(rrr, zeros); - let mut rrr_high = _mm_unpackhi_epi8(rrr, zeros); - - let mut ggg_low = _mm_unpacklo_epi8(ggg, zeros); - let mut ggg_high = _mm_unpackhi_epi8(ggg, zeros); - - let mut bbb_low = _mm_unpacklo_epi8(bbb, zeros); - let mut bbb_high = _mm_unpackhi_epi8(bbb, zeros); - - let aaa_low = _mm_unpacklo_epi8(aaa, zeros); - let aaa_high = _mm_unpackhi_epi8(aaa, zeros); - - rrr_low = _mm_div_by_255_epi16(_mm_mullo_epi16(rrr_low, aaa_low)); - rrr_high = _mm_div_by_255_epi16(_mm_mullo_epi16(rrr_high, aaa_high)); - ggg_low = _mm_div_by_255_epi16(_mm_mullo_epi16(ggg_low, aaa_low)); - ggg_high = _mm_div_by_255_epi16(_mm_mullo_epi16(ggg_high, aaa_high)); - bbb_low = _mm_div_by_255_epi16(_mm_mullo_epi16(bbb_low, aaa_low)); - bbb_high = _mm_div_by_255_epi16(_mm_mullo_epi16(bbb_high, aaa_high)); - - let rrr = _mm_packus_epi16(rrr_low, rrr_high); - let ggg = _mm_packus_epi16(ggg_low, ggg_high); - let bbb = _mm_packus_epi16(bbb_low, bbb_high); - - let (rgba0, rgba1, rgba2, rgba3) = sse_interleave_rgba(rrr, ggg, bbb, aaa); - - let dst_ptr = dst.as_mut_ptr(); - _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba1); - _mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, rgba2); - _mm_storeu_si128(dst_ptr.add(48) as *mut __m128i, rgba3); - } - - rem = rem.chunks_exact_mut(16 * 4).into_remainder(); - src_rem = src_rem.chunks_exact(16 * 4).remainder(); + let zeros = _mm_setzero_si128(); + for (dst, src) in rem + .chunks_exact_mut(16 * 4) + .zip(src_rem.chunks_exact(16 * 4)) + { + let src_ptr = src.as_ptr(); + let rgba0 = _mm_loadu_si128(src_ptr as *const __m128i); + let rgba1 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); + let rgba2 = _mm_loadu_si128(src_ptr.add(32) as *const __m128i); + let rgba3 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i); + let (rrr, ggg, bbb, aaa) = sse_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); + + let mut rrr_low = _mm_unpacklo_epi8(rrr, zeros); + let mut rrr_high = _mm_unpackhi_epi8(rrr, zeros); + + let mut ggg_low = _mm_unpacklo_epi8(ggg, zeros); + let mut ggg_high = _mm_unpackhi_epi8(ggg, zeros); + + let mut bbb_low = _mm_unpacklo_epi8(bbb, zeros); + let mut bbb_high = _mm_unpackhi_epi8(bbb, zeros); + + let aaa_low = _mm_unpacklo_epi8(aaa, zeros); + let aaa_high = _mm_unpackhi_epi8(aaa, zeros); + + rrr_low = _mm_div_by_255_epi16(_mm_mullo_epi16(rrr_low, aaa_low)); + rrr_high = _mm_div_by_255_epi16(_mm_mullo_epi16(rrr_high, aaa_high)); + ggg_low = _mm_div_by_255_epi16(_mm_mullo_epi16(ggg_low, aaa_low)); + ggg_high = _mm_div_by_255_epi16(_mm_mullo_epi16(ggg_high, aaa_high)); + bbb_low = _mm_div_by_255_epi16(_mm_mullo_epi16(bbb_low, aaa_low)); + bbb_high = _mm_div_by_255_epi16(_mm_mullo_epi16(bbb_high, aaa_high)); + + let rrr = _mm_packus_epi16(rrr_low, rrr_high); + let ggg = _mm_packus_epi16(ggg_low, ggg_high); + let bbb = _mm_packus_epi16(bbb_low, bbb_high); + + let (rgba0, rgba1, rgba2, rgba3) = sse_interleave_rgba(rrr, ggg, bbb, aaa); + + let dst_ptr = dst.as_mut_ptr(); + _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); + _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba1); + _mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, rgba2); + _mm_storeu_si128(dst_ptr.add(48) as *mut __m128i, rgba3); } + rem = rem.chunks_exact_mut(16 * 4).into_remainder(); + src_rem = src_rem.chunks_exact(16 * 4).remainder(); + 
premultiply_alpha_rgba_row_impl(rem, src_rem); } @@ -203,31 +201,30 @@ pub(crate) fn sse_unpremultiply_alpha_rgba( #[target_feature(enable = "sse4.1")] unsafe fn sse_unpremultiply_alpha_rgba_impl_row(in_place: &mut [u8]) { let mut rem = in_place; - unsafe { - for dst in rem.chunks_exact_mut(16 * 4) { - let src_ptr = dst.as_ptr(); - let rgba0 = _mm_loadu_si128(src_ptr as *const __m128i); - let rgba1 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); - let rgba2 = _mm_loadu_si128(src_ptr.add(32) as *const __m128i); - let rgba3 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i); - let (rrr, ggg, bbb, aaa) = sse_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); - - let rrr = sse_unpremultiply_row(rrr, aaa); - let ggg = sse_unpremultiply_row(ggg, aaa); - let bbb = sse_unpremultiply_row(bbb, aaa); - - let (rgba0, rgba1, rgba2, rgba3) = sse_interleave_rgba(rrr, ggg, bbb, aaa); - - let dst_ptr = dst.as_mut_ptr(); - _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba1); - _mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, rgba2); - _mm_storeu_si128(dst_ptr.add(48) as *mut __m128i, rgba3); - } - rem = rem.chunks_exact_mut(8 * 4).into_remainder(); + for dst in rem.chunks_exact_mut(16 * 4) { + let src_ptr = dst.as_ptr(); + let rgba0 = _mm_loadu_si128(src_ptr as *const __m128i); + let rgba1 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); + let rgba2 = _mm_loadu_si128(src_ptr.add(32) as *const __m128i); + let rgba3 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i); + let (rrr, ggg, bbb, aaa) = sse_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); + + let rrr = sse_unpremultiply_row(rrr, aaa); + let ggg = sse_unpremultiply_row(ggg, aaa); + let bbb = sse_unpremultiply_row(bbb, aaa); + + let (rgba0, rgba1, rgba2, rgba3) = sse_interleave_rgba(rrr, ggg, bbb, aaa); + + let dst_ptr = dst.as_mut_ptr(); + _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); + _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba1); + _mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, rgba2); + _mm_storeu_si128(dst_ptr.add(48) as *mut __m128i, rgba3); } + rem = rem.chunks_exact_mut(8 * 4).into_remainder(); + for dst in rem.chunks_exact_mut(4) { let a = dst[3]; if a != 0 { diff --git a/src/sse/plane_f32.rs b/src/sse/plane_f32.rs index 99ed16a..7c83a9a 100644 --- a/src/sse/plane_f32.rs +++ b/src/sse/plane_f32.rs @@ -316,7 +316,7 @@ unsafe fn convolve_horizontal_plane_sse_rows_4_fma( ); } -#[inline] +#[inline(always)] unsafe fn convolve_horizontal_plane_sse_rows_4_impl( dst_width: usize, _: usize, @@ -326,139 +326,135 @@ unsafe fn convolve_horizontal_plane_sse_rows_4_impl( dst: &mut [f32], dst_stride: usize, ) { - unsafe { - let mut filter_offset = 0usize; - let zeros = _mm_setzero_ps(); - let weights_ptr = filter_weights.weights.as_ptr(); - - for x in 0..dst_width { - let bounds = filter_weights.bounds.get_unchecked(x); - let mut jx = 0usize; - let mut store_0 = zeros; - let mut store_1 = zeros; - let mut store_2 = zeros; - let mut store_3 = zeros; - - while jx + 16 < bounds.size { - let ptr = weights_ptr.add(jx + filter_offset); - let read_weights0 = _mm_loadu_ps(ptr); - let read_weights1 = _mm_loadu_ps(ptr.add(4)); - let read_weights2 = _mm_loadu_ps(ptr.add(8)); - let read_weights3 = _mm_loadu_ps(ptr.add(12)); - let weights = (read_weights0, read_weights1, read_weights2, read_weights3); - let bounds_start = bounds.start + jx; - store_0 = conv_horiz_plane_16_f32!(bounds_start, src, weights, store_0, FMA); - let s_ptr_1 = src.get_unchecked(src_stride..); - store_1 = 
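
// --- Reference sketch: block body plus scalar tail ---
// Every row kernel in this patch follows the same shape: the vector body consumes
// full blocks through `chunks_exact` / `chunks_exact_mut`, and whatever is left over
// is handed to the scalar per-pixel routine (here `premultiply_alpha_rgba_row_impl`).
// This safe sketch mirrors that structure; the block body below is plain scalar code
// standing in for the SSE4.1 deinterleave → widen → multiply → pack path, and
// `premultiply_tail` is an illustrative name.
const BLOCK: usize = 16 * 4; // 16 RGBA pixels per iteration, as in the loop above

fn premultiply_tail(dst: &mut [u8], src: &[u8]) {
    for (d, s) in dst.chunks_exact_mut(4).zip(src.chunks_exact(4)) {
        let a = s[3] as u16;
        for c in 0..3 {
            // (x * a) / 255 with rounding; `_mm_div_by_255_epi16` plays this role in the SSE body
            let t = s[c] as u16 * a + 128;
            d[c] = ((t + (t >> 8)) >> 8) as u8;
        }
        d[3] = s[3];
    }
}

fn premultiply_row(dst: &mut [u8], src: &[u8]) {
    let mut dst_blocks = dst.chunks_exact_mut(BLOCK);
    let mut src_blocks = src.chunks_exact(BLOCK);
    for (d, s) in (&mut dst_blocks).zip(&mut src_blocks) {
        premultiply_tail(d, s); // the vector body would go here
    }
    // Pixels that do not fill a whole block go to the scalar fallback.
    premultiply_tail(dst_blocks.into_remainder(), src_blocks.remainder());
}
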
conv_horiz_plane_16_f32!(bounds_start, s_ptr_1, weights, store_1, FMA); - let s_ptr2 = src.get_unchecked(src_stride * 2..); - store_2 = conv_horiz_plane_16_f32!(bounds_start, s_ptr2, weights, store_2, FMA); - let s_ptr3 = src.get_unchecked(src_stride * 3..); - store_3 = conv_horiz_plane_16_f32!(bounds_start, s_ptr3, weights, store_3, FMA); - jx += 16; - } - - while jx + 8 < bounds.size { - let ptr = weights_ptr.add(jx + filter_offset); - let read_weights0 = _mm_loadu_ps(ptr); - let read_weights1 = _mm_loadu_ps(ptr.add(4)); - let read_weights = (read_weights0, read_weights1); - let bounds_start = bounds.start + jx; - store_0 = conv_horiz_plane_8_f32!( - bounds_start, - src, - read_weights.0, - read_weights.1, - store_0, - FMA - ); - let s_ptr_1 = src.get_unchecked(src_stride..); - store_1 = conv_horiz_plane_8_f32!( - bounds_start, - s_ptr_1, - read_weights.0, - read_weights.1, - store_1, - FMA - ); - let s_ptr2 = src.get_unchecked(src_stride * 2..); - store_2 = conv_horiz_plane_8_f32!( - bounds_start, - s_ptr2, - read_weights.0, - read_weights.1, - store_2, - FMA - ); - let s_ptr3 = src.get_unchecked(src_stride * 3..); - store_3 = conv_horiz_plane_8_f32!( - bounds_start, - s_ptr3, - read_weights.0, - read_weights.1, - store_3, - FMA - ); - jx += 8; - } - - while jx + 4 < bounds.size { - let ptr = weights_ptr.add(jx + filter_offset); - let read_weights = _mm_loadu_ps(ptr); - let bounds_start = bounds.start + jx; - store_0 = conv_horiz_plane_4_f32!(bounds_start, src, read_weights, store_0, FMA); - let s_ptr_1 = src.get_unchecked(src_stride..); - store_1 = - conv_horiz_plane_4_f32!(bounds_start, s_ptr_1, read_weights, store_1, FMA); - let s_ptr2 = src.get_unchecked(src_stride * 2..); - store_2 = conv_horiz_plane_4_f32!(bounds_start, s_ptr2, read_weights, store_2, FMA); - let s_ptr3 = src.get_unchecked(src_stride * 3..); - store_3 = conv_horiz_plane_4_f32!(bounds_start, s_ptr3, read_weights, store_3, FMA); - jx += 4; - } - - while jx + 2 < bounds.size { - let ptr = weights_ptr.add(jx + filter_offset); - let weights = - _mm_setr_ps(ptr.read_unaligned(), ptr.add(1).read_unaligned(), 0., 0.); - let bounds_start = bounds.start + jx; - store_0 = conv_horiz_plane_2_f32!(bounds_start, src, weights, store_0, FMA); - let ptr_1 = src.get_unchecked(src_stride..); - store_1 = conv_horiz_plane_2_f32!(bounds_start, ptr_1, weights, store_1, FMA); - let ptr_2 = src.get_unchecked(src_stride * 2..); - store_2 = conv_horiz_plane_2_f32!(bounds_start, ptr_2, weights, store_2, FMA); - let ptr_3 = src.get_unchecked(src_stride * 3..); - store_3 = conv_horiz_plane_2_f32!(bounds_start, ptr_3, weights, store_3, FMA); - jx += 2; - } - - while jx < bounds.size { - let ptr = weights_ptr.add(jx + filter_offset); - let weight0 = _mm_set1_ps(ptr.read_unaligned()); - let bounds_start = bounds.start + jx; - store_0 = conv_horiz_plane_1_f32!(bounds_start, src, weight0, store_0, FMA); - let ptr_1 = src.get_unchecked(src_stride..); - store_1 = conv_horiz_plane_1_f32!(bounds_start, ptr_1, weight0, store_1, FMA); - let ptr_2 = src.get_unchecked(src_stride * 2..); - store_2 = conv_horiz_plane_1_f32!(bounds_start, ptr_2, weight0, store_2, FMA); - let ptr_3 = src.get_unchecked(src_stride * 3..); - store_3 = conv_horiz_plane_1_f32!(bounds_start, ptr_3, weight0, store_3, FMA); - jx += 1; - } - - let px = x; - let dest_ptr = dst.get_unchecked_mut(px); - *dest_ptr = _mm_hsum_ps(store_0); - - let dest_ptr = dst.get_unchecked_mut(px + dst_stride); - *dest_ptr = _mm_hsum_ps(store_1); - - let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 
2); - *dest_ptr = _mm_hsum_ps(store_2); - - let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 3); - *dest_ptr = _mm_hsum_ps(store_3); - - filter_offset += filter_weights.aligned_size; + let mut filter_offset = 0usize; + let zeros = _mm_setzero_ps(); + let weights_ptr = filter_weights.weights.as_ptr(); + + for x in 0..dst_width { + let bounds = filter_weights.bounds.get_unchecked(x); + let mut jx = 0usize; + let mut store_0 = zeros; + let mut store_1 = zeros; + let mut store_2 = zeros; + let mut store_3 = zeros; + + while jx + 16 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let read_weights0 = _mm_loadu_ps(ptr); + let read_weights1 = _mm_loadu_ps(ptr.add(4)); + let read_weights2 = _mm_loadu_ps(ptr.add(8)); + let read_weights3 = _mm_loadu_ps(ptr.add(12)); + let weights = (read_weights0, read_weights1, read_weights2, read_weights3); + let bounds_start = bounds.start + jx; + store_0 = conv_horiz_plane_16_f32!(bounds_start, src, weights, store_0, FMA); + let s_ptr_1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_plane_16_f32!(bounds_start, s_ptr_1, weights, store_1, FMA); + let s_ptr2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_plane_16_f32!(bounds_start, s_ptr2, weights, store_2, FMA); + let s_ptr3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_plane_16_f32!(bounds_start, s_ptr3, weights, store_3, FMA); + jx += 16; } + + while jx + 8 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let read_weights0 = _mm_loadu_ps(ptr); + let read_weights1 = _mm_loadu_ps(ptr.add(4)); + let read_weights = (read_weights0, read_weights1); + let bounds_start = bounds.start + jx; + store_0 = conv_horiz_plane_8_f32!( + bounds_start, + src, + read_weights.0, + read_weights.1, + store_0, + FMA + ); + let s_ptr_1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_plane_8_f32!( + bounds_start, + s_ptr_1, + read_weights.0, + read_weights.1, + store_1, + FMA + ); + let s_ptr2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_plane_8_f32!( + bounds_start, + s_ptr2, + read_weights.0, + read_weights.1, + store_2, + FMA + ); + let s_ptr3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_plane_8_f32!( + bounds_start, + s_ptr3, + read_weights.0, + read_weights.1, + store_3, + FMA + ); + jx += 8; + } + + while jx + 4 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let read_weights = _mm_loadu_ps(ptr); + let bounds_start = bounds.start + jx; + store_0 = conv_horiz_plane_4_f32!(bounds_start, src, read_weights, store_0, FMA); + let s_ptr_1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_plane_4_f32!(bounds_start, s_ptr_1, read_weights, store_1, FMA); + let s_ptr2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_plane_4_f32!(bounds_start, s_ptr2, read_weights, store_2, FMA); + let s_ptr3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_plane_4_f32!(bounds_start, s_ptr3, read_weights, store_3, FMA); + jx += 4; + } + + while jx + 2 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weights = _mm_setr_ps(ptr.read_unaligned(), ptr.add(1).read_unaligned(), 0., 0.); + let bounds_start = bounds.start + jx; + store_0 = conv_horiz_plane_2_f32!(bounds_start, src, weights, store_0, FMA); + let ptr_1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_plane_2_f32!(bounds_start, ptr_1, weights, store_1, FMA); + let ptr_2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_plane_2_f32!(bounds_start, ptr_2, weights, store_2, FMA); + let 
ptr_3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_plane_2_f32!(bounds_start, ptr_3, weights, store_3, FMA); + jx += 2; + } + + while jx < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = _mm_set1_ps(ptr.read_unaligned()); + let bounds_start = bounds.start + jx; + store_0 = conv_horiz_plane_1_f32!(bounds_start, src, weight0, store_0, FMA); + let ptr_1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_plane_1_f32!(bounds_start, ptr_1, weight0, store_1, FMA); + let ptr_2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_plane_1_f32!(bounds_start, ptr_2, weight0, store_2, FMA); + let ptr_3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_plane_1_f32!(bounds_start, ptr_3, weight0, store_3, FMA); + jx += 1; + } + + let px = x; + let dest_ptr = dst.get_unchecked_mut(px); + *dest_ptr = _mm_hsum_ps(store_0); + + let dest_ptr = dst.get_unchecked_mut(px + dst_stride); + *dest_ptr = _mm_hsum_ps(store_1); + + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 2); + *dest_ptr = _mm_hsum_ps(store_2); + + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 3); + *dest_ptr = _mm_hsum_ps(store_3); + + filter_offset += filter_weights.aligned_size; } } diff --git a/src/sse/rgb_f16.rs b/src/sse/rgb_f16.rs index ede3433..637b774 100644 --- a/src/sse/rgb_f16.rs +++ b/src/sse/rgb_f16.rs @@ -82,8 +82,7 @@ unsafe fn convolve_horizontal_parts_4_rgb_f16 let acc = _mm_prefer_fma_ps::(store_0, rgb_pixel_0, weight0); let acc = _mm_prefer_fma_ps::(acc, rgb_pixel_1, weight1); let acc = _mm_prefer_fma_ps::(acc, rgb_pixel_2, weight2); - let acc = _mm_prefer_fma_ps::(acc, rgb_pixel_3, weight3); - acc + _mm_prefer_fma_ps::(acc, rgb_pixel_3, weight3) } #[inline(always)] @@ -142,8 +141,7 @@ unsafe fn convolve_horizontal_parts_one_rgb_f16(store_0, rgb_pixel, weight0); - acc + _mm_prefer_fma_ps::(store_0, rgb_pixel, weight0) } pub(crate) fn convolve_horizontal_rgb_sse_row_one_f16( diff --git a/src/sse/rgb_u8.rs b/src/sse/rgb_u8.rs index 227eb7e..cc1b041 100644 --- a/src/sse/rgb_u8.rs +++ b/src/sse/rgb_u8.rs @@ -77,10 +77,9 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_impl( dst_stride: usize, filter_weights: &FilterWeights, ) { - unsafe { - const CHANNELS: usize = 3; + const CHANNELS: usize = 3; - #[rustfmt::skip] + #[rustfmt::skip] let shuffle_lo = _mm_setr_epi8(0, -1, 3, -1, 1, -1, @@ -90,7 +89,7 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_impl( -1, -1, -1, -1); - #[rustfmt::skip] + #[rustfmt::skip] let shuffle_hi = _mm_setr_epi8(6, -1, 9, -1, 7, -1, @@ -100,147 +99,142 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_impl( -1, -1, -1, -1); - let vld = _mm_set1_epi32(ROUNDING_CONST); - - let (row0_ref, rest) = dst.split_at_mut(dst_stride); - let (row1_ref, rest) = rest.split_at_mut(dst_stride); - let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); - - let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS); - let iter_row1 = row1_ref.chunks_exact_mut(CHANNELS); - let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS); - let iter_row3 = row3_ref.chunks_exact_mut(CHANNELS); - - for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 - .zip(iter_row1) - .zip(iter_row2) - .zip(iter_row3) - .zip(filter_weights.bounds.iter()) - .zip( - filter_weights - .weights - .chunks_exact(filter_weights.aligned_size), - ) - { - let mut jx = 0usize; - let mut store_0 = vld; - let mut store_1 = vld; - let mut store_2 = vld; - let mut store_3 = vld; - - let src0 = src; - let src1 = src0.get_unchecked(src_stride..); - let src2 = 
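
// --- Reference sketch: what the four-row horizontal plane pass computes ---
// For every destination x, the horizontal window selected by that x's bounds entry is
// convolved with that x's weights in four consecutive source rows at once, each row
// keeping its own accumulator. The SSE version above peels the weight loop into
// 16/8/4/2/1-tap blocks and uses FMA when available, but the per-sample arithmetic is
// the one below. `Bounds` stands in for the crate's FilterWeights bounds entry.
struct Bounds {
    start: usize,
    size: usize,
}

fn convolve_four_rows_scalar(
    src: &[f32],
    src_stride: usize,
    dst: &mut [f32],
    dst_stride: usize,
    dst_width: usize,
    bounds: &[Bounds],
    weights: &[f32],
    aligned_size: usize,
) {
    for x in 0..dst_width {
        let b = &bounds[x];
        let w = &weights[x * aligned_size..][..b.size];
        let mut acc = [0f32; 4];
        for (j, &wj) in w.iter().enumerate() {
            let sx = b.start + j;
            for r in 0..4 {
                acc[r] += src[r * src_stride + sx] * wj;
            }
        }
        for r in 0..4 {
            dst[r * dst_stride + x] = acc[r];
        }
    }
}
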
src1.get_unchecked(src_stride..); - let src3 = src2.get_unchecked(src_stride..); - - // Will make step in 4 items however since it is RGB it is necessary to make a safe offset - while jx + 4 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 4)); - let weights = _mm_loadu_si64(w_ptr.as_ptr() as *const u8); - const SHUFFLE_01: i32 = shuffle(0, 0, 0, 0); - let weight01 = _mm_shuffle_epi32::(weights); - const SHUFFLE_23: i32 = shuffle(1, 1, 1, 1); - let weight23 = _mm_shuffle_epi32::(weights); - let bounds_start = (bounds.start + jx) * CHANNELS; - - let rgb_pixel_0 = load_rgb_x4(src0.get_unchecked(bounds_start..)); - let rgb_pixel_1 = load_rgb_x4(src1.get_unchecked(bounds_start..)); - let rgb_pixel_2 = load_rgb_x4(src2.get_unchecked(bounds_start..)); - let rgb_pixel_4 = load_rgb_x4(src3.get_unchecked(bounds_start..)); - - let hi_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_hi); - let lo_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_lo); - let hi_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_hi); - let lo_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_lo); - let hi_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_hi); - let lo_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_lo); - let hi_3 = _mm_shuffle_epi8(rgb_pixel_4, shuffle_hi); - let lo_3 = _mm_shuffle_epi8(rgb_pixel_4, shuffle_lo); - - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(lo_0, weight01)); - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(hi_0, weight23)); - - store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo_1, weight01)); - store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(hi_1, weight23)); - - store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo_2, weight01)); - store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(hi_2, weight23)); - - store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo_3, weight01)); - store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(hi_3, weight23)); - jx += 4; - } - - while jx + 2 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 2)); - let bounds_start = (bounds.start + jx) * CHANNELS; - let weight01 = _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()); - - let rgb_pixel_0 = load_rgb_x2(src0.get_unchecked(bounds_start..)); - let rgb_pixel_1 = load_rgb_x2(src1.get_unchecked(bounds_start..)); - let rgb_pixel_2 = load_rgb_x2(src2.get_unchecked(bounds_start..)); - let rgb_pixel_4 = load_rgb_x2(src3.get_unchecked(bounds_start..)); - - let lo_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_lo); - let lo_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_lo); - let lo_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_lo); - let lo_3 = _mm_shuffle_epi8(rgb_pixel_4, shuffle_lo); - - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(lo_0, weight01)); - store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo_1, weight01)); - store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo_2, weight01)); - store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo_3, weight01)); - - jx += 2; - } - - while jx < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 1)); - let bounds_start = bounds.start + jx; - - let weight0 = _mm_set1_epi32(w_ptr.as_ptr().read_unaligned() as i32); - - store_0 = - convolve_horizontal_parts_one_sse_rgb(bounds_start, src0, weight0, store_0); - store_1 = - convolve_horizontal_parts_one_sse_rgb(bounds_start, src1, weight0, store_1); - store_2 = - convolve_horizontal_parts_one_sse_rgb(bounds_start, src2, weight0, store_2); - store_3 = - convolve_horizontal_parts_one_sse_rgb(bounds_start, src3, weight0, store_3); - jx += 1; - } - - let store_0_8 = compress_i32(store_0); - let store_1_8 = compress_i32(store_1); - let store_2_8 = 
compress_i32(store_2); - let store_3_8 = compress_i32(store_3); - - let element_0 = _mm_extract_epi32::<0>(store_0_8); - let element_1 = _mm_extract_epi32::<0>(store_1_8); - let element_2 = _mm_extract_epi32::<0>(store_2_8); - let element_3 = _mm_extract_epi32::<0>(store_3_8); - - let bytes = element_0.to_le_bytes(); - let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); - (chunk0.as_mut_ptr() as *mut u16).write_unaligned(first_byte); - *chunk0.get_unchecked_mut(2) = bytes[2]; - - let bytes = element_1.to_le_bytes(); - let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); - (chunk1.as_mut_ptr() as *mut u16).write_unaligned(first_byte); - *chunk1.get_unchecked_mut(2) = bytes[2]; - - let bytes = element_2.to_le_bytes(); - let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); - (chunk2.as_mut_ptr() as *mut u16).write_unaligned(first_byte); - *chunk2.get_unchecked_mut(2) = bytes[2]; - - let bytes = element_3.to_le_bytes(); - let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); - (chunk3.as_mut_ptr() as *mut u16).write_unaligned(first_byte); - *chunk3.get_unchecked_mut(2) = bytes[2]; + let vld = _mm_set1_epi32(ROUNDING_CONST); + + let (row0_ref, rest) = dst.split_at_mut(dst_stride); + let (row1_ref, rest) = rest.split_at_mut(dst_stride); + let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); + + let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS); + let iter_row1 = row1_ref.chunks_exact_mut(CHANNELS); + let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS); + let iter_row3 = row3_ref.chunks_exact_mut(CHANNELS); + + for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 + .zip(iter_row1) + .zip(iter_row2) + .zip(iter_row3) + .zip(filter_weights.bounds.iter()) + .zip( + filter_weights + .weights + .chunks_exact(filter_weights.aligned_size), + ) + { + let mut jx = 0usize; + let mut store_0 = vld; + let mut store_1 = vld; + let mut store_2 = vld; + let mut store_3 = vld; + + let src0 = src; + let src1 = src0.get_unchecked(src_stride..); + let src2 = src1.get_unchecked(src_stride..); + let src3 = src2.get_unchecked(src_stride..); + + // Will make step in 4 items however since it is RGB it is necessary to make a safe offset + while jx + 4 < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 4)); + let weights = _mm_loadu_si64(w_ptr.as_ptr() as *const u8); + const SHUFFLE_01: i32 = shuffle(0, 0, 0, 0); + let weight01 = _mm_shuffle_epi32::(weights); + const SHUFFLE_23: i32 = shuffle(1, 1, 1, 1); + let weight23 = _mm_shuffle_epi32::(weights); + let bounds_start = (bounds.start + jx) * CHANNELS; + + let rgb_pixel_0 = load_rgb_x4(src0.get_unchecked(bounds_start..)); + let rgb_pixel_1 = load_rgb_x4(src1.get_unchecked(bounds_start..)); + let rgb_pixel_2 = load_rgb_x4(src2.get_unchecked(bounds_start..)); + let rgb_pixel_4 = load_rgb_x4(src3.get_unchecked(bounds_start..)); + + let hi_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_hi); + let lo_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_lo); + let hi_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_hi); + let lo_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_lo); + let hi_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_hi); + let lo_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_lo); + let hi_3 = _mm_shuffle_epi8(rgb_pixel_4, shuffle_hi); + let lo_3 = _mm_shuffle_epi8(rgb_pixel_4, shuffle_lo); + + store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(lo_0, weight01)); + store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(hi_0, weight23)); + + store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo_1, weight01)); + store_1 = _mm_add_epi32(store_1, 
_mm_madd_epi16(hi_1, weight23)); + + store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo_2, weight01)); + store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(hi_2, weight23)); + + store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo_3, weight01)); + store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(hi_3, weight23)); + jx += 4; + } + + while jx + 2 < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 2)); + let bounds_start = (bounds.start + jx) * CHANNELS; + let weight01 = _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()); + + let rgb_pixel_0 = load_rgb_x2(src0.get_unchecked(bounds_start..)); + let rgb_pixel_1 = load_rgb_x2(src1.get_unchecked(bounds_start..)); + let rgb_pixel_2 = load_rgb_x2(src2.get_unchecked(bounds_start..)); + let rgb_pixel_4 = load_rgb_x2(src3.get_unchecked(bounds_start..)); + + let lo_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_lo); + let lo_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_lo); + let lo_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_lo); + let lo_3 = _mm_shuffle_epi8(rgb_pixel_4, shuffle_lo); + + store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(lo_0, weight01)); + store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo_1, weight01)); + store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo_2, weight01)); + store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo_3, weight01)); + + jx += 2; } + + while jx < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 1)); + let bounds_start = bounds.start + jx; + + let weight0 = _mm_set1_epi32(w_ptr.as_ptr().read_unaligned() as i32); + + store_0 = convolve_horizontal_parts_one_sse_rgb(bounds_start, src0, weight0, store_0); + store_1 = convolve_horizontal_parts_one_sse_rgb(bounds_start, src1, weight0, store_1); + store_2 = convolve_horizontal_parts_one_sse_rgb(bounds_start, src2, weight0, store_2); + store_3 = convolve_horizontal_parts_one_sse_rgb(bounds_start, src3, weight0, store_3); + jx += 1; + } + + let store_0_8 = compress_i32(store_0); + let store_1_8 = compress_i32(store_1); + let store_2_8 = compress_i32(store_2); + let store_3_8 = compress_i32(store_3); + + let element_0 = _mm_extract_epi32::<0>(store_0_8); + let element_1 = _mm_extract_epi32::<0>(store_1_8); + let element_2 = _mm_extract_epi32::<0>(store_2_8); + let element_3 = _mm_extract_epi32::<0>(store_3_8); + + let bytes = element_0.to_le_bytes(); + let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); + (chunk0.as_mut_ptr() as *mut u16).write_unaligned(first_byte); + *chunk0.get_unchecked_mut(2) = bytes[2]; + + let bytes = element_1.to_le_bytes(); + let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); + (chunk1.as_mut_ptr() as *mut u16).write_unaligned(first_byte); + *chunk1.get_unchecked_mut(2) = bytes[2]; + + let bytes = element_2.to_le_bytes(); + let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); + (chunk2.as_mut_ptr() as *mut u16).write_unaligned(first_byte); + *chunk2.get_unchecked_mut(2) = bytes[2]; + + let bytes = element_3.to_le_bytes(); + let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); + (chunk3.as_mut_ptr() as *mut u16).write_unaligned(first_byte); + *chunk3.get_unchecked_mut(2) = bytes[2]; } } @@ -261,71 +255,68 @@ unsafe fn convolve_horizontal_rgb_sse_row_one_impl( dst: &mut [u8], filter_weights: &FilterWeights, ) { - unsafe { - const CHANNELS: usize = 3; - - let shuffle_lo = _mm_setr_epi8(0, -1, 3, -1, 1, -1, 4, -1, 2, -1, 5, -1, -1, -1, -1, -1); - - let shuffle_hi = _mm_setr_epi8(6, -1, 9, -1, 7, -1, 10, -1, 8, -1, 11, -1, -1, -1, -1, -1); - - for ((dst, bounds), weights) in dst - .chunks_exact_mut(CHANNELS) - 
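
// --- Reference sketch: the three-byte RGB store above ---
// Each destination chunk is CHANNELS = 3 bytes wide, so a full 4-byte store of the
// packed lane would spill into the next pixel (or past the end of the row). The
// kernel therefore extracts the low 32-bit lane and writes the first two bytes as one
// u16 plus the third byte on its own. The same idea in safe code, with `store_rgb3`
// as an illustrative name (`chunk` must be at least 3 bytes long):
fn store_rgb3(chunk: &mut [u8], packed: i32) {
    let bytes = packed.to_le_bytes();
    chunk[..2].copy_from_slice(&bytes[..2]); // two-byte write
    chunk[2] = bytes[2];                     // plus one byte: touches exactly chunk[0..3]
}
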
.zip(filter_weights.bounds.iter()) - .zip( - filter_weights - .weights - .chunks_exact(filter_weights.aligned_size), - ) - { - let bounds_size = bounds.size; - let mut jx = 0usize; - let mut store = _mm_setzero_si128(); - - while jx + 4 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 4)); - let weights = _mm_loadu_si64(w_ptr.as_ptr() as *const u8); - const SHUFFLE_01: i32 = shuffle(0, 0, 0, 0); - let weight01 = _mm_shuffle_epi32::(weights); - const SHUFFLE_23: i32 = shuffle(1, 1, 1, 1); - let weight23 = _mm_shuffle_epi32::(weights); - let bounds_start = bounds.start + jx; - let src_ptr_0 = src.get_unchecked((bounds_start * CHANNELS)..); - - let rgb_pixel = load_rgb_x4(src_ptr_0); - - let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi); - let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); - - store = _mm_add_epi32(store, _mm_madd_epi16(lo, weight01)); - store = _mm_add_epi32(store, _mm_madd_epi16(hi, weight23)); - jx += 4; - } - - while jx + 2 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 2)); - let weight0 = _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()); - let src_ptr = src.get_unchecked(((bounds.start + jx) * 3)..); - let rgb_pixel = load_rgb_x2(src_ptr); - let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); - store = _mm_add_epi32(store, _mm_madd_epi16(lo, weight0)); - jx += 2; - } - - while jx < bounds_size { - let w_ptr = weights.get_unchecked(jx..(jx + 1)); - let weight0 = _mm_set1_epi32(w_ptr.as_ptr().read_unaligned() as i32); - store = - convolve_horizontal_parts_one_sse_rgb(bounds.start + jx, src, weight0, store); - jx += 1; - } - - let store_16_8 = compress_i32(store); - - let element = _mm_extract_epi32::<0>(store_16_8); - let bytes = element.to_le_bytes(); - let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); - (dst.as_mut_ptr() as *mut u16).write_unaligned(first_byte); - *dst.get_unchecked_mut(2) = bytes[2]; + const CHANNELS: usize = 3; + + let shuffle_lo = _mm_setr_epi8(0, -1, 3, -1, 1, -1, 4, -1, 2, -1, 5, -1, -1, -1, -1, -1); + + let shuffle_hi = _mm_setr_epi8(6, -1, 9, -1, 7, -1, 10, -1, 8, -1, 11, -1, -1, -1, -1, -1); + + for ((dst, bounds), weights) in dst + .chunks_exact_mut(CHANNELS) + .zip(filter_weights.bounds.iter()) + .zip( + filter_weights + .weights + .chunks_exact(filter_weights.aligned_size), + ) + { + let bounds_size = bounds.size; + let mut jx = 0usize; + let mut store = _mm_setzero_si128(); + + while jx + 4 < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 4)); + let weights = _mm_loadu_si64(w_ptr.as_ptr() as *const u8); + const SHUFFLE_01: i32 = shuffle(0, 0, 0, 0); + let weight01 = _mm_shuffle_epi32::(weights); + const SHUFFLE_23: i32 = shuffle(1, 1, 1, 1); + let weight23 = _mm_shuffle_epi32::(weights); + let bounds_start = bounds.start + jx; + let src_ptr_0 = src.get_unchecked((bounds_start * CHANNELS)..); + + let rgb_pixel = load_rgb_x4(src_ptr_0); + + let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi); + let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); + + store = _mm_add_epi32(store, _mm_madd_epi16(lo, weight01)); + store = _mm_add_epi32(store, _mm_madd_epi16(hi, weight23)); + jx += 4; + } + + while jx + 2 < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 2)); + let weight0 = _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()); + let src_ptr = src.get_unchecked(((bounds.start + jx) * 3)..); + let rgb_pixel = load_rgb_x2(src_ptr); + let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); + store = _mm_add_epi32(store, _mm_madd_epi16(lo, weight0)); + jx += 2; } + + while jx < 
bounds_size { + let w_ptr = weights.get_unchecked(jx..(jx + 1)); + let weight0 = _mm_set1_epi32(w_ptr.as_ptr().read_unaligned() as i32); + store = convolve_horizontal_parts_one_sse_rgb(bounds.start + jx, src, weight0, store); + jx += 1; + } + + let store_16_8 = compress_i32(store); + + let element = _mm_extract_epi32::<0>(store_16_8); + let bytes = element.to_le_bytes(); + let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); + (dst.as_mut_ptr() as *mut u16).write_unaligned(first_byte); + *dst.get_unchecked_mut(2) = bytes[2]; } } diff --git a/src/sse/rgba_f16.rs b/src/sse/rgba_f16.rs index d032a21..5e367d2 100644 --- a/src/sse/rgba_f16.rs +++ b/src/sse/rgba_f16.rs @@ -49,8 +49,7 @@ unsafe fn convolve_horizontal_parts_one_rgba_f16(rgb_pixel); - let acc = _mm_prefer_fma_ps::(store_0, pixels, weight0); - acc + _mm_prefer_fma_ps::(store_0, pixels, weight0) } #[inline(always)] @@ -77,8 +76,7 @@ unsafe fn convolve_horizontal_parts_4_rgba_f16(store_0, rgb_pixel_0, weight0); let acc = _mm_prefer_fma_ps::(acc, rgb_pixel_1, weight1); let acc = _mm_prefer_fma_ps::(acc, rgb_pixel_2, weight2); - let acc = _mm_prefer_fma_ps::(acc, rgb_pixel_3, weight3); - acc + _mm_prefer_fma_ps::(acc, rgb_pixel_3, weight3) } #[inline(always)] @@ -95,12 +93,11 @@ unsafe fn convolve_horizontal_parts_2_rgba_f16(store_0, _mm_cvtph_psx::(rgb_pixels), weight0); - let acc = _mm_prefer_fma_ps::( + _mm_prefer_fma_ps::( acc, _mm_cvtph_psx::(_mm_srli_si128::<8>(rgb_pixels)), weight1, - ); - acc + ) } pub(crate) fn convolve_horizontal_rgba_sse_row_one_f16( diff --git a/src/sse/rgba_f32.rs b/src/sse/rgba_f32.rs index 34beb03..b5dc978 100644 --- a/src/sse/rgba_f32.rs +++ b/src/sse/rgba_f32.rs @@ -119,71 +119,65 @@ unsafe fn convolve_horizontal_rgba_sse_row_one_f32_impl( src: &[f32], dst: &mut [f32], ) { - unsafe { - const CHANNELS: usize = 4; - let mut filter_offset = 0usize; - let weights_ptr = filter_weights.weights.as_ptr(); - - for x in 0..dst_width { - let bounds = filter_weights.bounds.get_unchecked(x); - let mut jx = 0usize; - let mut store = _mm_setzero_ps(); - - while jx + 4 < bounds.size { - let ptr = weights_ptr.add(jx + filter_offset); - let (weight0, weight1, weight2, weight3) = load_4_weights!(ptr); - let filter_start = jx + bounds.start; - store = convolve_horizontal_parts_4_rgba_f32::( - filter_start, - src, - weight0, - weight1, - weight2, - weight3, - store, - ); - jx += 4; - } - - while jx + 2 < bounds.size { - let ptr = weights_ptr.add(jx + filter_offset); - let weights = _mm_castsi128_ps(_mm_loadu_si64(ptr as *const u8)); - const SHUFFLE_0: i32 = shuffle(0, 0, 0, 0); - let weight0 = - _mm_castsi128_ps(_mm_shuffle_epi32::(_mm_castps_si128(weights))); - const SHUFFLE_1: i32 = shuffle(1, 1, 1, 1); - let weight1 = - _mm_castsi128_ps(_mm_shuffle_epi32::(_mm_castps_si128(weights))); - let filter_start = jx + bounds.start; - store = convolve_horizontal_parts_2_rgba_f32::( - filter_start, - src, - weight0, - weight1, - store, - ); - jx += 2 - } - - while jx < bounds.size { - let ptr = weights_ptr.add(jx + filter_offset); - let weight0 = _mm_load1_ps(ptr); - let filter_start = jx + bounds.start; - store = convolve_horizontal_parts_one_rgba_f32::( - filter_start, - src, - weight0, - store, - ); - jx += 1; - } - - let px = x * CHANNELS; - let dest_ptr = dst.get_unchecked_mut(px..); - _mm_storeu_ps(dest_ptr.as_mut_ptr(), store); - - filter_offset += filter_weights.aligned_size; + const CHANNELS: usize = 4; + let mut filter_offset = 0usize; + let weights_ptr = filter_weights.weights.as_ptr(); + + for x in 0..dst_width { 
+ let bounds = filter_weights.bounds.get_unchecked(x); + let mut jx = 0usize; + let mut store = _mm_setzero_ps(); + + while jx + 4 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let (weight0, weight1, weight2, weight3) = load_4_weights!(ptr); + let filter_start = jx + bounds.start; + store = convolve_horizontal_parts_4_rgba_f32::( + filter_start, + src, + weight0, + weight1, + weight2, + weight3, + store, + ); + jx += 4; } + + while jx + 2 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weights = _mm_castsi128_ps(_mm_loadu_si64(ptr as *const u8)); + const SHUFFLE_0: i32 = shuffle(0, 0, 0, 0); + let weight0 = + _mm_castsi128_ps(_mm_shuffle_epi32::(_mm_castps_si128(weights))); + const SHUFFLE_1: i32 = shuffle(1, 1, 1, 1); + let weight1 = + _mm_castsi128_ps(_mm_shuffle_epi32::(_mm_castps_si128(weights))); + let filter_start = jx + bounds.start; + store = convolve_horizontal_parts_2_rgba_f32::( + filter_start, + src, + weight0, + weight1, + store, + ); + jx += 2 + } + + while jx < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = _mm_load1_ps(ptr); + let filter_start = jx + bounds.start; + store = + convolve_horizontal_parts_one_rgba_f32::(filter_start, src, weight0, store); + jx += 1; + } + + let px = x * CHANNELS; + let dest_ptr = dst.get_unchecked_mut(px..); + _mm_storeu_ps(dest_ptr.as_mut_ptr(), store); + + filter_offset += filter_weights.aligned_size; } } diff --git a/src/sse/rgba_u8.rs b/src/sse/rgba_u8.rs index c746c33..1f3dc99 100644 --- a/src/sse/rgba_u8.rs +++ b/src/sse/rgba_u8.rs @@ -72,10 +72,9 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_impl( dst_stride: usize, filter_weights: &FilterWeights, ) { - unsafe { - const CHANNELS: usize = 4; + const CHANNELS: usize = 4; - #[rustfmt::skip] + #[rustfmt::skip] let shuffle_lo = _mm_setr_epi8(0, -1, 4, -1, 1, -1, @@ -85,7 +84,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_impl( 3, -1, 7, -1); - #[rustfmt::skip] + #[rustfmt::skip] let shuffle_hi = _mm_setr_epi8(8, -1, 12, -1, 9, -1, @@ -95,152 +94,147 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_impl( 11, -1, 15, -1); - let vld = _mm_set1_epi32(ROUNDING_CONST); - - let (row0_ref, rest) = dst.split_at_mut(dst_stride); - let (row1_ref, rest) = rest.split_at_mut(dst_stride); - let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); - - let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS); - let iter_row1 = row1_ref.chunks_exact_mut(CHANNELS); - let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS); - let iter_row3 = row3_ref.chunks_exact_mut(CHANNELS); - - for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 - .zip(iter_row1) - .zip(iter_row2) - .zip(iter_row3) - .zip(filter_weights.bounds.iter()) - .zip( - filter_weights - .weights - .chunks_exact(filter_weights.aligned_size), - ) - { - let mut jx = 0usize; - let mut store_0 = vld; - let mut store_1 = vld; - let mut store_2 = vld; - let mut store_3 = vld; - - let src0 = src; - let src1 = src0.get_unchecked(src_stride..); - let src2 = src1.get_unchecked(src_stride..); - let src3 = src2.get_unchecked(src_stride..); - - while jx + 4 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 4)); - let weights = _mm_loadu_si64(w_ptr.as_ptr() as *const u8); - const SHUFFLE_01: i32 = shuffle(0, 0, 0, 0); - let weight01 = _mm_shuffle_epi32::(weights); - const SHUFFLE_23: i32 = shuffle(1, 1, 1, 1); - let weight23 = _mm_shuffle_epi32::(weights); - let start_bounds = bounds.start + jx; - - let rgb_pixel_0 = _mm_loadu_si128( - 
src0.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - let rgb_pixel_1 = _mm_loadu_si128( - src1.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - let rgb_pixel_2 = _mm_loadu_si128( - src2.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - let rgb_pixel_3 = _mm_loadu_si128( - src3.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - - let hi_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_hi); - let lo_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_lo); - let hi_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_hi); - let lo_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_lo); - let hi_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_hi); - let lo_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_lo); - let hi_3 = _mm_shuffle_epi8(rgb_pixel_3, shuffle_hi); - let lo_3 = _mm_shuffle_epi8(rgb_pixel_3, shuffle_lo); - - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(lo_0, weight01)); - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(hi_0, weight23)); - - store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo_1, weight01)); - store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(hi_1, weight23)); - - store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo_2, weight01)); - store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(hi_2, weight23)); - - store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo_3, weight01)); - store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(hi_3, weight23)); - jx += 4; - } - - while jx + 2 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 2)); - let bounds_start = bounds.start + jx; - - let weight01 = _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()); - - let rgb_pixel_0 = - _mm_loadu_si64(src0.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - let rgb_pixel_1 = - _mm_loadu_si64(src1.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - let rgb_pixel_2 = - _mm_loadu_si64(src2.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - let rgb_pixel_3 = - _mm_loadu_si64(src3.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - - let lo_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_lo); - let lo_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_lo); - let lo_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_lo); - let lo_3 = _mm_shuffle_epi8(rgb_pixel_3, shuffle_lo); - - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(lo_0, weight01)); - store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo_1, weight01)); - store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo_2, weight01)); - store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo_3, weight01)); - - jx += 2; - } - - while jx < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 1)); - - let weight0 = _mm_set1_epi32(w_ptr[0] as i32); - - let start_bounds = bounds.start + jx; - - store_0 = - convolve_horizontal_parts_one_rgba_sse(start_bounds, src0, weight0, store_0); - store_1 = - convolve_horizontal_parts_one_rgba_sse(start_bounds, src1, weight0, store_1); - store_2 = - convolve_horizontal_parts_one_rgba_sse(start_bounds, src2, weight0, store_2); - store_3 = - convolve_horizontal_parts_one_rgba_sse(start_bounds, src3, weight0, store_3); - jx += 1; - } - - let store_16_8_0 = compress_i32(store_0); - let store_16_8_1 = compress_i32(store_1); - let store_16_8_2 = compress_i32(store_2); - let store_16_8_3 = compress_i32(store_3); - - _mm_storeu_si32( - chunk0.as_mut_ptr() as *mut _, - _mm_packus_epi16(store_16_8_0, store_16_8_0), + let vld = _mm_set1_epi32(ROUNDING_CONST); + + let (row0_ref, rest) = dst.split_at_mut(dst_stride); + let (row1_ref, rest) = 
rest.split_at_mut(dst_stride); + let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); + + let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS); + let iter_row1 = row1_ref.chunks_exact_mut(CHANNELS); + let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS); + let iter_row3 = row3_ref.chunks_exact_mut(CHANNELS); + + for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 + .zip(iter_row1) + .zip(iter_row2) + .zip(iter_row3) + .zip(filter_weights.bounds.iter()) + .zip( + filter_weights + .weights + .chunks_exact(filter_weights.aligned_size), + ) + { + let mut jx = 0usize; + let mut store_0 = vld; + let mut store_1 = vld; + let mut store_2 = vld; + let mut store_3 = vld; + + let src0 = src; + let src1 = src0.get_unchecked(src_stride..); + let src2 = src1.get_unchecked(src_stride..); + let src3 = src2.get_unchecked(src_stride..); + + while jx + 4 < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 4)); + let weights = _mm_loadu_si64(w_ptr.as_ptr() as *const u8); + const SHUFFLE_01: i32 = shuffle(0, 0, 0, 0); + let weight01 = _mm_shuffle_epi32::(weights); + const SHUFFLE_23: i32 = shuffle(1, 1, 1, 1); + let weight23 = _mm_shuffle_epi32::(weights); + let start_bounds = bounds.start + jx; + + let rgb_pixel_0 = _mm_loadu_si128( + src0.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, ); - _mm_storeu_si32( - chunk1.as_mut_ptr() as *mut _, - _mm_packus_epi16(store_16_8_1, store_16_8_1), + let rgb_pixel_1 = _mm_loadu_si128( + src1.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, ); - _mm_storeu_si32( - chunk2.as_mut_ptr() as *mut _, - _mm_packus_epi16(store_16_8_2, store_16_8_2), + let rgb_pixel_2 = _mm_loadu_si128( + src2.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, ); - _mm_storeu_si32( - chunk3.as_mut_ptr() as *mut _, - _mm_packus_epi16(store_16_8_3, store_16_8_3), + let rgb_pixel_3 = _mm_loadu_si128( + src3.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, ); + + let hi_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_hi); + let lo_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_lo); + let hi_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_hi); + let lo_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_lo); + let hi_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_hi); + let lo_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_lo); + let hi_3 = _mm_shuffle_epi8(rgb_pixel_3, shuffle_hi); + let lo_3 = _mm_shuffle_epi8(rgb_pixel_3, shuffle_lo); + + store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(lo_0, weight01)); + store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(hi_0, weight23)); + + store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo_1, weight01)); + store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(hi_1, weight23)); + + store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo_2, weight01)); + store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(hi_2, weight23)); + + store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo_3, weight01)); + store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(hi_3, weight23)); + jx += 4; + } + + while jx + 2 < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 2)); + let bounds_start = bounds.start + jx; + + let weight01 = _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()); + + let rgb_pixel_0 = + _mm_loadu_si64(src0.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); + let rgb_pixel_1 = + _mm_loadu_si64(src1.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); + let rgb_pixel_2 = + _mm_loadu_si64(src2.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); + let rgb_pixel_3 = + 
_mm_loadu_si64(src3.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); + + let lo_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_lo); + let lo_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_lo); + let lo_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_lo); + let lo_3 = _mm_shuffle_epi8(rgb_pixel_3, shuffle_lo); + + store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(lo_0, weight01)); + store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo_1, weight01)); + store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo_2, weight01)); + store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo_3, weight01)); + + jx += 2; + } + + while jx < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 1)); + + let weight0 = _mm_set1_epi32(w_ptr[0] as i32); + + let start_bounds = bounds.start + jx; + + store_0 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src0, weight0, store_0); + store_1 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src1, weight0, store_1); + store_2 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src2, weight0, store_2); + store_3 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src3, weight0, store_3); + jx += 1; } + + let store_16_8_0 = compress_i32(store_0); + let store_16_8_1 = compress_i32(store_1); + let store_16_8_2 = compress_i32(store_2); + let store_16_8_3 = compress_i32(store_3); + + _mm_storeu_si32( + chunk0.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_0, store_16_8_0), + ); + _mm_storeu_si32( + chunk1.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_1, store_16_8_1), + ); + _mm_storeu_si32( + chunk2.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_2, store_16_8_2), + ); + _mm_storeu_si32( + chunk3.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_3, store_16_8_3), + ); } } diff --git a/src/sse/rgba_u8_lb.rs b/src/sse/rgba_u8_lb.rs index e5b1934..8590618 100644 --- a/src/sse/rgba_u8_lb.rs +++ b/src/sse/rgba_u8_lb.rs @@ -112,254 +112,240 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_impl( dst_stride: usize, filter_weights: &FilterWeights, ) { - unsafe { - const CHANNELS: usize = 4; - - const SCALE: i32 = 6; - const V_SHR: i32 = SCALE; - const ROUNDING: i16 = 1 << (V_SHR - 1); - - let vld = _mm_setr_epi16(ROUNDING, ROUNDING, ROUNDING, ROUNDING, 0, 0, 0, 0); - - let shuffle_weights = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3); - - let (row0_ref, rest) = dst.split_at_mut(dst_stride); - let (row1_ref, rest) = rest.split_at_mut(dst_stride); - let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); - - let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS); - let iter_row1 = row1_ref.chunks_exact_mut(CHANNELS); - let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS); - let iter_row3 = row3_ref.chunks_exact_mut(CHANNELS); - - for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 - .zip(iter_row1) - .zip(iter_row2) - .zip(iter_row3) - .zip(filter_weights.bounds.iter()) - .zip( - filter_weights - .weights - .chunks_exact(filter_weights.aligned_size), - ) - { - let mut jx = 0usize; - let mut store_0 = vld; - let mut store_1 = vld; - let mut store_2 = vld; - let mut store_3 = vld; - - let src0 = src; - let src1 = src0.get_unchecked(src_stride..); - let src2 = src1.get_unchecked(src_stride..); - let src3 = src2.get_unchecked(src_stride..); - - while jx + 8 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 8)); - - let weight01 = _mm_shuffle_epi8( - _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()), - shuffle_weights, - ); - let weight23 = _mm_shuffle_epi8( - _mm_set1_epi32( - 
(w_ptr.get_unchecked(2..).as_ptr() as *const i32).read_unaligned(), - ), - shuffle_weights, - ); - - let weight45 = _mm_shuffle_epi8( - _mm_set1_epi32( - (w_ptr.get_unchecked(4..).as_ptr() as *const i32).read_unaligned(), - ), - shuffle_weights, - ); - - let weight67 = _mm_shuffle_epi8( - _mm_set1_epi32( - (w_ptr.get_unchecked(6..).as_ptr() as *const i32).read_unaligned(), - ), - shuffle_weights, - ); - - let start_bounds = bounds.start + jx; - - let rgb_pixel_0 = _mm_loadu_si128( - src0.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - let rgb_pixel_0_1 = _mm_loadu_si128( - src0.get_unchecked((start_bounds * CHANNELS + 16)..) - .as_ptr() as *const __m128i, - ); - let rgb_pixel_1 = _mm_loadu_si128( - src1.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - let rgb_pixel_1_0 = _mm_loadu_si128( - src1.get_unchecked((start_bounds * CHANNELS + 16)..) - .as_ptr() as *const __m128i, - ); - let rgb_pixel_2 = _mm_loadu_si128( - src2.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - let rgb_pixel_2_1 = _mm_loadu_si128( - src2.get_unchecked((start_bounds * CHANNELS + 16)..) - .as_ptr() as *const __m128i, - ); - let rgb_pixel_3 = _mm_loadu_si128( - src3.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - let rgb_pixel_3_1 = _mm_loadu_si128( - src3.get_unchecked((start_bounds * CHANNELS + 16)..) - .as_ptr() as *const __m128i, - ); - - store_0 = hdot4( - store_0, - rgb_pixel_0, - rgb_pixel_0_1, - weight01, - weight23, - weight45, - weight67, - ); - store_1 = hdot4( - store_1, - rgb_pixel_1, - rgb_pixel_1_0, - weight01, - weight23, - weight45, - weight67, - ); - store_2 = hdot4( - store_2, - rgb_pixel_2, - rgb_pixel_2_1, - weight01, - weight23, - weight45, - weight67, - ); - store_3 = hdot4( - store_3, - rgb_pixel_3, - rgb_pixel_3_1, - weight01, - weight23, - weight45, - weight67, - ); - - jx += 8; - } - - while jx + 4 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 4)); - - let weight01 = _mm_shuffle_epi8( - _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()), - shuffle_weights, - ); - let weight23 = _mm_shuffle_epi8( - _mm_set1_epi32( - (w_ptr.get_unchecked(2..).as_ptr() as *const i32).read_unaligned(), - ), - shuffle_weights, - ); - let start_bounds = bounds.start + jx; - - let rgb_pixel_0 = _mm_loadu_si128( - src0.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - let rgb_pixel_1 = _mm_loadu_si128( - src1.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - let rgb_pixel_2 = _mm_loadu_si128( - src2.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - let rgb_pixel_3 = _mm_loadu_si128( - src3.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - - store_0 = hdot2(store_0, rgb_pixel_0, weight01, weight23); - store_1 = hdot2(store_1, rgb_pixel_1, weight01, weight23); - store_2 = hdot2(store_2, rgb_pixel_2, weight01, weight23); - store_3 = hdot2(store_3, rgb_pixel_3, weight01, weight23); - - jx += 4; - } - - while jx + 2 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 2)); - let bounds_start = bounds.start + jx; - - let weight01 = _mm_shuffle_epi8( - _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()), - shuffle_weights, - ); - - let rgb_pixel_0 = - _mm_loadu_si64(src0.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - let rgb_pixel_1 = - _mm_loadu_si64(src1.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - let rgb_pixel_2 = - 
-                    _mm_loadu_si64(src2.get_unchecked((bounds_start * CHANNELS)..).as_ptr());
-                let rgb_pixel_3 =
-                    _mm_loadu_si64(src3.get_unchecked((bounds_start * CHANNELS)..).as_ptr());
-
-                store_0 = hdot(store_0, rgb_pixel_0, weight01);
-                store_1 = hdot(store_1, rgb_pixel_1, weight01);
-                store_2 = hdot(store_2, rgb_pixel_2, weight01);
-                store_3 = hdot(store_3, rgb_pixel_3, weight01);
-
-                jx += 2;
-            }
-
-            while jx < bounds.size {
-                let w_ptr = weights.get_unchecked(jx..(jx + 1));
-
-                let weight0 = _mm_set1_epi16(w_ptr[0]);
-
-                let start_bounds = bounds.start + jx;
-
-                store_0 =
-                    convolve_horizontal_parts_one_rgba_sse(start_bounds, src0, weight0, store_0);
-                store_1 =
-                    convolve_horizontal_parts_one_rgba_sse(start_bounds, src1, weight0, store_1);
-                store_2 =
-                    convolve_horizontal_parts_one_rgba_sse(start_bounds, src2, weight0, store_2);
-                store_3 =
-                    convolve_horizontal_parts_one_rgba_sse(start_bounds, src3, weight0, store_3);
-                jx += 1;
-            }
-
-            store_0 = _mm_add_hi_lo_epi16(store_0);
-            store_1 = _mm_add_hi_lo_epi16(store_1);
-            store_2 = _mm_add_hi_lo_epi16(store_2);
-            store_3 = _mm_add_hi_lo_epi16(store_3);
-
-            let store_16_8_0 = _mm_srai_epi16::<V_SHR>(store_0);
-            let store_16_8_1 = _mm_srai_epi16::<V_SHR>(store_1);
-            let store_16_8_2 = _mm_srai_epi16::<V_SHR>(store_2);
-            let store_16_8_3 = _mm_srai_epi16::<V_SHR>(store_3);
-
-            _mm_storeu_si32(
-                chunk0.as_mut_ptr() as *mut _,
-                _mm_packus_epi16(store_16_8_0, store_16_8_0),
-            );
-            _mm_storeu_si32(
-                chunk1.as_mut_ptr() as *mut _,
-                _mm_packus_epi16(store_16_8_1, store_16_8_1),
-            );
-            _mm_storeu_si32(
-                chunk2.as_mut_ptr() as *mut _,
-                _mm_packus_epi16(store_16_8_2, store_16_8_2),
-            );
-            _mm_storeu_si32(
-                chunk3.as_mut_ptr() as *mut _,
-                _mm_packus_epi16(store_16_8_3, store_16_8_3),
-            );
-        }
+    const CHANNELS: usize = 4;
+
+    const SCALE: i32 = 6;
+    const V_SHR: i32 = SCALE;
+    const ROUNDING: i16 = 1 << (V_SHR - 1);
+
+    let vld = _mm_setr_epi16(ROUNDING, ROUNDING, ROUNDING, ROUNDING, 0, 0, 0, 0);
+
+    let shuffle_weights = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3);
+
+    let (row0_ref, rest) = dst.split_at_mut(dst_stride);
+    let (row1_ref, rest) = rest.split_at_mut(dst_stride);
+    let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride);
+
+    let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS);
+    let iter_row1 = row1_ref.chunks_exact_mut(CHANNELS);
+    let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS);
+    let iter_row3 = row3_ref.chunks_exact_mut(CHANNELS);
+
+    for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0
+        .zip(iter_row1)
+        .zip(iter_row2)
+        .zip(iter_row3)
+        .zip(filter_weights.bounds.iter())
+        .zip(
+            filter_weights
+                .weights
+                .chunks_exact(filter_weights.aligned_size),
+        )
+    {
+        let mut jx = 0usize;
+        let mut store_0 = vld;
+        let mut store_1 = vld;
+        let mut store_2 = vld;
+        let mut store_3 = vld;
+
+        let src0 = src;
+        let src1 = src0.get_unchecked(src_stride..);
+        let src2 = src1.get_unchecked(src_stride..);
+        let src3 = src2.get_unchecked(src_stride..);
+
+        while jx + 8 < bounds.size {
+            let w_ptr = weights.get_unchecked(jx..(jx + 8));
+
+            let weight01 = _mm_shuffle_epi8(
+                _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()),
+                shuffle_weights,
+            );
+            let weight23 = _mm_shuffle_epi8(
+                _mm_set1_epi32((w_ptr.get_unchecked(2..).as_ptr() as *const i32).read_unaligned()),
+                shuffle_weights,
+            );
+
+            let weight45 = _mm_shuffle_epi8(
+                _mm_set1_epi32((w_ptr.get_unchecked(4..).as_ptr() as *const i32).read_unaligned()),
+                shuffle_weights,
+            );
+
+            let weight67 = _mm_shuffle_epi8(
+                _mm_set1_epi32((w_ptr.get_unchecked(6..).as_ptr() as *const i32).read_unaligned()),
+                shuffle_weights,
+            );
+
+            let start_bounds = bounds.start + jx;
+
+            let rgb_pixel_0 = _mm_loadu_si128(
+                src0.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i,
+            );
+            let rgb_pixel_0_1 = _mm_loadu_si128(
+                src0.get_unchecked((start_bounds * CHANNELS + 16)..)
+                    .as_ptr() as *const __m128i,
+            );
+            let rgb_pixel_1 = _mm_loadu_si128(
+                src1.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i,
+            );
+            let rgb_pixel_1_0 = _mm_loadu_si128(
+                src1.get_unchecked((start_bounds * CHANNELS + 16)..)
+                    .as_ptr() as *const __m128i,
+            );
+            let rgb_pixel_2 = _mm_loadu_si128(
+                src2.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i,
+            );
+            let rgb_pixel_2_1 = _mm_loadu_si128(
+                src2.get_unchecked((start_bounds * CHANNELS + 16)..)
+                    .as_ptr() as *const __m128i,
+            );
+            let rgb_pixel_3 = _mm_loadu_si128(
+                src3.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i,
+            );
+            let rgb_pixel_3_1 = _mm_loadu_si128(
+                src3.get_unchecked((start_bounds * CHANNELS + 16)..)
+                    .as_ptr() as *const __m128i,
+            );
+
+            store_0 = hdot4(
+                store_0,
+                rgb_pixel_0,
+                rgb_pixel_0_1,
+                weight01,
+                weight23,
+                weight45,
+                weight67,
+            );
+            store_1 = hdot4(
+                store_1,
+                rgb_pixel_1,
+                rgb_pixel_1_0,
+                weight01,
+                weight23,
+                weight45,
+                weight67,
+            );
+            store_2 = hdot4(
+                store_2,
+                rgb_pixel_2,
+                rgb_pixel_2_1,
+                weight01,
+                weight23,
+                weight45,
+                weight67,
+            );
+            store_3 = hdot4(
+                store_3,
+                rgb_pixel_3,
+                rgb_pixel_3_1,
+                weight01,
+                weight23,
+                weight45,
+                weight67,
+            );
+
+            jx += 8;
+        }
+
+        while jx + 4 < bounds.size {
+            let w_ptr = weights.get_unchecked(jx..(jx + 4));
+
+            let weight01 = _mm_shuffle_epi8(
+                _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()),
+                shuffle_weights,
+            );
+            let weight23 = _mm_shuffle_epi8(
+                _mm_set1_epi32((w_ptr.get_unchecked(2..).as_ptr() as *const i32).read_unaligned()),
+                shuffle_weights,
+            );
+            let start_bounds = bounds.start + jx;
+
+            let rgb_pixel_0 = _mm_loadu_si128(
+                src0.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i,
+            );
+            let rgb_pixel_1 = _mm_loadu_si128(
+                src1.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i,
+            );
+            let rgb_pixel_2 = _mm_loadu_si128(
+                src2.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i,
+            );
+            let rgb_pixel_3 = _mm_loadu_si128(
+                src3.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i,
+            );
+
+            store_0 = hdot2(store_0, rgb_pixel_0, weight01, weight23);
+            store_1 = hdot2(store_1, rgb_pixel_1, weight01, weight23);
+            store_2 = hdot2(store_2, rgb_pixel_2, weight01, weight23);
+            store_3 = hdot2(store_3, rgb_pixel_3, weight01, weight23);
+
+            jx += 4;
+        }
+
+        while jx + 2 < bounds.size {
+            let w_ptr = weights.get_unchecked(jx..(jx + 2));
+            let bounds_start = bounds.start + jx;
+
+            let weight01 = _mm_shuffle_epi8(
+                _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()),
+                shuffle_weights,
+            );
+
+            let rgb_pixel_0 =
+                _mm_loadu_si64(src0.get_unchecked((bounds_start * CHANNELS)..).as_ptr());
+            let rgb_pixel_1 =
+                _mm_loadu_si64(src1.get_unchecked((bounds_start * CHANNELS)..).as_ptr());
+            let rgb_pixel_2 =
+                _mm_loadu_si64(src2.get_unchecked((bounds_start * CHANNELS)..).as_ptr());
+            let rgb_pixel_3 =
+                _mm_loadu_si64(src3.get_unchecked((bounds_start * CHANNELS)..).as_ptr());
+
+            store_0 = hdot(store_0, rgb_pixel_0, weight01);
+            store_1 = hdot(store_1, rgb_pixel_1, weight01);
+            store_2 = hdot(store_2, rgb_pixel_2, weight01);
+            store_3 = hdot(store_3, rgb_pixel_3, weight01);
+
+            jx += 2;
+        }
+
+        while jx < bounds.size {
+            let w_ptr = weights.get_unchecked(jx..(jx + 1));
+
+            let weight0 = _mm_set1_epi16(w_ptr[0]);
+
+            let start_bounds = bounds.start + jx;
+
+            store_0 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src0, weight0, store_0);
+            store_1 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src1, weight0, store_1);
+            store_2 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src2, weight0, store_2);
+            store_3 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src3, weight0, store_3);
+            jx += 1;
+        }
+
+        store_0 = _mm_add_hi_lo_epi16(store_0);
+        store_1 = _mm_add_hi_lo_epi16(store_1);
+        store_2 = _mm_add_hi_lo_epi16(store_2);
+        store_3 = _mm_add_hi_lo_epi16(store_3);
+
+        let store_16_8_0 = _mm_srai_epi16::<V_SHR>(store_0);
+        let store_16_8_1 = _mm_srai_epi16::<V_SHR>(store_1);
+        let store_16_8_2 = _mm_srai_epi16::<V_SHR>(store_2);
+        let store_16_8_3 = _mm_srai_epi16::<V_SHR>(store_3);
+
+        _mm_storeu_si32(
+            chunk0.as_mut_ptr() as *mut _,
+            _mm_packus_epi16(store_16_8_0, store_16_8_0),
+        );
+        _mm_storeu_si32(
+            chunk1.as_mut_ptr() as *mut _,
+            _mm_packus_epi16(store_16_8_1, store_16_8_1),
+        );
+        _mm_storeu_si32(
+            chunk2.as_mut_ptr() as *mut _,
+            _mm_packus_epi16(store_16_8_2, store_16_8_2),
+        );
+        _mm_storeu_si32(
+            chunk3.as_mut_ptr() as *mut _,
+            _mm_packus_epi16(store_16_8_3, store_16_8_3),
+        );
+    }
+}
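
Note for reviewers: both the SSE and AVX u8 paths touched here use the same fixed-point scheme that the constants in the hunk suggest — i16 weights with SCALE = 6 fractional bits, an accumulator pre-seeded with the rounding bias ROUNDING = 1 << (V_SHR - 1), and a final arithmetic shift right by V_SHR followed by an unsigned-saturating pack to u8. A scalar sketch of that pipeline follows; it is illustrative only (the function name and signature are not part of the crate) and it deliberately ignores the i16/i32 lane bookkeeping the SIMD code performs.

    const SCALE: i32 = 6;
    const V_SHR: i32 = SCALE;
    const ROUNDING: i32 = 1 << (V_SHR - 1);

    /// Scalar reference: one output RGBA pixel from `weights.len()` Q6 taps
    /// starting at source pixel `bounds_start`.
    fn convolve_rgba_pixel_scalar(src: &[u8], bounds_start: usize, weights: &[i16]) -> [u8; 4] {
        // Start from the rounding bias, exactly like `vld` seeds the SIMD accumulators.
        let mut acc = [ROUNDING; 4];
        for (j, &w) in weights.iter().enumerate() {
            let px = &src[(bounds_start + j) * 4..(bounds_start + j) * 4 + 4];
            for c in 0..4 {
                acc[c] += px[c] as i32 * w as i32;
            }
        }
        // `_mm_srai_epi16::<V_SHR>` plus `_mm_packus_epi16` do this shift-and-clamp in the SIMD code.
        acc.map(|v| (v >> V_SHR).clamp(0, 255) as u8)
    }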
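The rgba_u8.rs loop above folds two taps into the i32 accumulators with a single _mm_madd_epi16(lo_0, weight01). A minimal scalar model of that intrinsic (multiply the eight i16 lanes, then add adjacent pairs of 32-bit products) is below, for reference only:

    /// Scalar model of `_mm_madd_epi16`: out[k] = p[2k] * w[2k] + p[2k + 1] * w[2k + 1].
    fn madd_epi16_model(p: [i16; 8], w: [i16; 8]) -> [i32; 4] {
        let mut out = [0i32; 4];
        for k in 0..4 {
            out[k] = p[2 * k] as i32 * w[2 * k] as i32 + p[2 * k + 1] as i32 * w[2 * k + 1] as i32;
        }
        out
    }

Assuming `shuffle_lo` (defined elsewhere in rgba_u8.rs) interleaves the two neighbouring pixels channel by channel, one madd therefore yields the four per-channel partial sums for a pair of taps.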