From ab50ceb9725fe532318928b54e26ed2c74a83ee1 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Wed, 1 Jan 2025 11:18:31 +0000 Subject: [PATCH] AVX, SSE refactor and improvements --- app/benches/resize_rgba/main.rs | 2 +- app/src/main.rs | 2 +- src/avx2/alpha_f16.rs | 246 ++++++++-------- src/avx2/alpha_f32.rs | 50 ++-- src/avx2/alpha_u16.rs | 255 +++-------------- src/avx2/alpha_u8.rs | 276 +++++++++--------- src/avx2/rgba_f16.rs | 9 +- src/avx2/rgba_u8_lb.rs | 398 +++++++++++++------------- src/sse/alpha_f16.rs | 134 +++++---- src/sse/alpha_u16.rs | 300 ++++++++++---------- src/sse/alpha_u8.rs | 135 +++++---- src/sse/plane_f32.rs | 264 +++++++++--------- src/sse/rgb_f16.rs | 6 +- src/sse/rgb_u8.rs | 409 +++++++++++++-------------- src/sse/rgba_f16.rs | 11 +- src/sse/rgba_f32.rs | 122 ++++---- src/sse/rgba_u8.rs | 284 ++++++++++--------- src/sse/rgba_u8_lb.rs | 478 ++++++++++++++++---------------- 18 files changed, 1567 insertions(+), 1814 deletions(-) diff --git a/app/benches/resize_rgba/main.rs b/app/benches/resize_rgba/main.rs index effd560..93b8f04 100644 --- a/app/benches/resize_rgba/main.rs +++ b/app/benches/resize_rgba/main.rs @@ -166,7 +166,7 @@ pub fn criterion_benchmark(c: &mut Criterion) { dimensions.0 as usize, dimensions.1 as usize, ) - .unwrap(); + .unwrap(); let mut target = ImageStoreMut::alloc_with_depth( dimensions.0 as usize / 4, dimensions.1 as usize / 4, diff --git a/app/src/main.rs b/app/src/main.rs index 7eac02e..25367ac 100644 --- a/app/src/main.rs +++ b/app/src/main.rs @@ -42,7 +42,7 @@ fn resize_plane( fn main() { // test_fast_image(); - let img = ImageReader::open("./assets/abstract_alpha.png") + let img = ImageReader::open("./assets/asset_4.png") .unwrap() .decode() .unwrap(); diff --git a/src/avx2/alpha_f16.rs b/src/avx2/alpha_f16.rs index 62679de..8cbca73 100644 --- a/src/avx2/alpha_f16.rs +++ b/src/avx2/alpha_f16.rs @@ -55,61 +55,59 @@ unsafe fn avx_premultiply_alpha_rgba_f16_row_impl(dst: &mut [half::f16], src: &[ let mut rem = dst; let mut src_rem = src; - unsafe { - for (dst, src) in rem - .chunks_exact_mut(16 * 4) - .zip(src_rem.chunks_exact(16 * 4)) - { - let src_ptr = src.as_ptr(); - let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i); - let lane1 = _mm256_loadu_si256(src_ptr.add(16) as *const __m256i); - let lane2 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); - let lane3 = _mm256_loadu_si256(src_ptr.add(48) as *const __m256i); - let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); + for (dst, src) in rem + .chunks_exact_mut(16 * 4) + .zip(src_rem.chunks_exact(16 * 4)) + { + let src_ptr = src.as_ptr(); + let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i); + let lane1 = _mm256_loadu_si256(src_ptr.add(16) as *const __m256i); + let lane2 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); + let lane3 = _mm256_loadu_si256(src_ptr.add(48) as *const __m256i); + let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); - let low_alpha = _mm256_cvtph_ps(_mm256_castsi256_si128(pixel.3)); - let low_r = _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.0)), low_alpha); - let low_g = _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.1)), low_alpha); - let low_b = _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.2)), low_alpha); + let low_alpha = _mm256_cvtph_ps(_mm256_castsi256_si128(pixel.3)); + let low_r = _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.0)), low_alpha); + let low_g = _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.1)), low_alpha); + 
let low_b = _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.2)), low_alpha); - let high_alpha = _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.3)); - let high_r = _mm256_mul_ps( - _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.0)), - high_alpha, - ); - let high_g = _mm256_mul_ps( - _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.1)), - high_alpha, - ); - let high_b = _mm256_mul_ps( - _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.2)), - high_alpha, - ); - let r_values = avx_combine_epi( - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_r), - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_r), - ); - let g_values = avx_combine_epi( - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_g), - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_g), - ); - let b_values = avx_combine_epi( - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_b), - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_b), - ); - let dst_ptr = dst.as_mut_ptr(); - let (d_lane0, d_lane1, d_lane2, d_lane3) = - avx_interleave_rgba_epi16(r_values, g_values, b_values, pixel.3); - _mm256_storeu_si256(dst_ptr as *mut __m256i, d_lane0); - _mm256_storeu_si256(dst_ptr.add(16) as *mut __m256i, d_lane1); - _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, d_lane2); - _mm256_storeu_si256(dst_ptr.add(48) as *mut __m256i, d_lane3); - } - - rem = rem.chunks_exact_mut(16 * 4).into_remainder(); - src_rem = src_rem.chunks_exact(16 * 4).remainder(); + let high_alpha = _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.3)); + let high_r = _mm256_mul_ps( + _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.0)), + high_alpha, + ); + let high_g = _mm256_mul_ps( + _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.1)), + high_alpha, + ); + let high_b = _mm256_mul_ps( + _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.2)), + high_alpha, + ); + let r_values = avx_combine_epi( + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_r), + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_r), + ); + let g_values = avx_combine_epi( + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_g), + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_g), + ); + let b_values = avx_combine_epi( + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_b), + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_b), + ); + let dst_ptr = dst.as_mut_ptr(); + let (d_lane0, d_lane1, d_lane2, d_lane3) = + avx_interleave_rgba_epi16(r_values, g_values, b_values, pixel.3); + _mm256_storeu_si256(dst_ptr as *mut __m256i, d_lane0); + _mm256_storeu_si256(dst_ptr.add(16) as *mut __m256i, d_lane1); + _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, d_lane2); + _mm256_storeu_si256(dst_ptr.add(48) as *mut __m256i, d_lane3); } + rem = rem.chunks_exact_mut(16 * 4).into_remainder(); + src_rem = src_rem.chunks_exact(16 * 4).remainder(); + premultiply_pixel_f16_row(rem, src_rem); } @@ -155,84 +153,82 @@ pub(crate) fn avx_unpremultiply_alpha_rgba_f16( unsafe fn avx_unpremultiply_alpha_rgba_f16_row_impl(in_place: &mut [half::f16]) { let mut rem = in_place; - unsafe { - for dst in rem.chunks_exact_mut(16 * 4) { - let src_ptr = dst.as_ptr(); - let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i); - let lane1 = _mm256_loadu_si256(src_ptr.add(16) as *const __m256i); - let lane2 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); - let lane3 = _mm256_loadu_si256(src_ptr.add(48) as *const __m256i); - let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); - - let low_alpha = _mm256_cvtph_ps(_mm256_castsi256_si128(pixel.3)); - let zeros = 
_mm256_setzero_ps(); - let low_alpha_zero_mask = _mm256_cmp_ps::<_CMP_EQ_OS>(low_alpha, zeros); - let low_r = _mm256_blendv_ps( - _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.0)), low_alpha), - zeros, - low_alpha_zero_mask, - ); - let low_g = _mm256_blendv_ps( - _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.1)), low_alpha), - zeros, - low_alpha_zero_mask, - ); - let low_b = _mm256_blendv_ps( - _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.2)), low_alpha), - zeros, - low_alpha_zero_mask, - ); + for dst in rem.chunks_exact_mut(16 * 4) { + let src_ptr = dst.as_ptr(); + let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i); + let lane1 = _mm256_loadu_si256(src_ptr.add(16) as *const __m256i); + let lane2 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); + let lane3 = _mm256_loadu_si256(src_ptr.add(48) as *const __m256i); + let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); - let high_alpha = _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.3)); - let high_alpha_zero_mask = _mm256_cmp_ps::<_CMP_EQ_OS>(high_alpha, zeros); - let high_r = _mm256_blendv_ps( - _mm256_mul_ps( - _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.0)), - high_alpha, - ), - zeros, - high_alpha_zero_mask, - ); - let high_g = _mm256_blendv_ps( - _mm256_mul_ps( - _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.1)), - high_alpha, - ), - zeros, - high_alpha_zero_mask, - ); - let high_b = _mm256_blendv_ps( - _mm256_mul_ps( - _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.2)), - high_alpha, - ), - zeros, - high_alpha_zero_mask, - ); - let r_values = avx_combine_epi( - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_r), - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_r), - ); - let g_values = avx_combine_epi( - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_g), - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_g), - ); - let b_values = avx_combine_epi( - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_b), - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_b), - ); - let dst_ptr = dst.as_mut_ptr(); - let (d_lane0, d_lane1, d_lane2, d_lane3) = - avx_interleave_rgba_epi16(r_values, g_values, b_values, pixel.3); - _mm256_storeu_si256(dst_ptr as *mut __m256i, d_lane0); - _mm256_storeu_si256(dst_ptr.add(16) as *mut __m256i, d_lane1); - _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, d_lane2); - _mm256_storeu_si256(dst_ptr.add(48) as *mut __m256i, d_lane3); - } + let low_alpha = _mm256_cvtph_ps(_mm256_castsi256_si128(pixel.3)); + let zeros = _mm256_setzero_ps(); + let low_alpha_zero_mask = _mm256_cmp_ps::<_CMP_EQ_OS>(low_alpha, zeros); + let low_r = _mm256_blendv_ps( + _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.0)), low_alpha), + zeros, + low_alpha_zero_mask, + ); + let low_g = _mm256_blendv_ps( + _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.1)), low_alpha), + zeros, + low_alpha_zero_mask, + ); + let low_b = _mm256_blendv_ps( + _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.2)), low_alpha), + zeros, + low_alpha_zero_mask, + ); - rem = rem.chunks_exact_mut(16 * 4).into_remainder(); + let high_alpha = _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.3)); + let high_alpha_zero_mask = _mm256_cmp_ps::<_CMP_EQ_OS>(high_alpha, zeros); + let high_r = _mm256_blendv_ps( + _mm256_mul_ps( + _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.0)), + high_alpha, + ), + zeros, + high_alpha_zero_mask, + ); + let high_g = _mm256_blendv_ps( + _mm256_mul_ps( + 
_mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.1)), + high_alpha, + ), + zeros, + high_alpha_zero_mask, + ); + let high_b = _mm256_blendv_ps( + _mm256_mul_ps( + _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.2)), + high_alpha, + ), + zeros, + high_alpha_zero_mask, + ); + let r_values = avx_combine_epi( + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_r), + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_r), + ); + let g_values = avx_combine_epi( + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_g), + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_g), + ); + let b_values = avx_combine_epi( + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_b), + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_b), + ); + let dst_ptr = dst.as_mut_ptr(); + let (d_lane0, d_lane1, d_lane2, d_lane3) = + avx_interleave_rgba_epi16(r_values, g_values, b_values, pixel.3); + _mm256_storeu_si256(dst_ptr as *mut __m256i, d_lane0); + _mm256_storeu_si256(dst_ptr.add(16) as *mut __m256i, d_lane1); + _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, d_lane2); + _mm256_storeu_si256(dst_ptr.add(48) as *mut __m256i, d_lane3); } + rem = rem.chunks_exact_mut(16 * 4).into_remainder(); + unpremultiply_pixel_f16_row(rem); } diff --git a/src/avx2/alpha_f32.rs b/src/avx2/alpha_f32.rs index f136208..f5b4d40 100644 --- a/src/avx2/alpha_f32.rs +++ b/src/avx2/alpha_f32.rs @@ -129,34 +129,32 @@ unsafe fn avx_premultiply_alpha_rgba_f32_row_impl(dst: &mut [f32], src: &[f32]) let mut rem = dst; let mut src_rem = src; - unsafe { - for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { - let src_ptr = src.as_ptr(); - let rgba0 = _mm256_loadu_ps(src_ptr); - let rgba1 = _mm256_loadu_ps(src_ptr.add(8)); - let rgba2 = _mm256_loadu_ps(src_ptr.add(16)); - let rgba3 = _mm256_loadu_ps(src_ptr.add(24)); - let (rrr, ggg, bbb, aaa) = avx_deinterleave_rgba_ps(rgba0, rgba1, rgba2, rgba3); - - let rrr = _mm256_mul_ps(rrr, aaa); - let ggg = _mm256_mul_ps(ggg, aaa); - let bbb = _mm256_mul_ps(bbb, aaa); - - let (rgba0, rgba1, rgba2, rgba3) = avx_interleave_rgba_ps(rrr, ggg, bbb, aaa); - - let dst_ptr = dst.as_mut_ptr(); - _mm256_storeu_ps(dst_ptr, rgba0); - _mm256_storeu_ps(dst_ptr.add(8), rgba1); - _mm256_storeu_ps(dst_ptr.add(16), rgba2); - _mm256_storeu_ps(dst_ptr.add(24), rgba3); - - _cx += 8; - } - - rem = rem.chunks_exact_mut(8 * 4).into_remainder(); - src_rem = src_rem.chunks_exact(8 * 4).remainder(); + for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { + let src_ptr = src.as_ptr(); + let rgba0 = _mm256_loadu_ps(src_ptr); + let rgba1 = _mm256_loadu_ps(src_ptr.add(8)); + let rgba2 = _mm256_loadu_ps(src_ptr.add(16)); + let rgba3 = _mm256_loadu_ps(src_ptr.add(24)); + let (rrr, ggg, bbb, aaa) = avx_deinterleave_rgba_ps(rgba0, rgba1, rgba2, rgba3); + + let rrr = _mm256_mul_ps(rrr, aaa); + let ggg = _mm256_mul_ps(ggg, aaa); + let bbb = _mm256_mul_ps(bbb, aaa); + + let (rgba0, rgba1, rgba2, rgba3) = avx_interleave_rgba_ps(rrr, ggg, bbb, aaa); + + let dst_ptr = dst.as_mut_ptr(); + _mm256_storeu_ps(dst_ptr, rgba0); + _mm256_storeu_ps(dst_ptr.add(8), rgba1); + _mm256_storeu_ps(dst_ptr.add(16), rgba2); + _mm256_storeu_ps(dst_ptr.add(24), rgba3); + + _cx += 8; } + rem = rem.chunks_exact_mut(8 * 4).into_remainder(); + src_rem = src_rem.chunks_exact(8 * 4).remainder(); + premultiply_pixel_f32_row(rem, src_rem); } diff --git a/src/avx2/alpha_u16.rs b/src/avx2/alpha_u16.rs index e34ee1a..a77b32d 100644 --- a/src/avx2/alpha_u16.rs +++ b/src/avx2/alpha_u16.rs @@ -75,6 +75,17 @@ pub(crate) 
unsafe fn _mm256_div_by_65535_epi32(v: __m256i) -> __m256i { _mm256_srli_epi32::(_mm256_add_epi32(v, _mm256_srli_epi32::(v))) } +#[inline(always)] +unsafe fn _mm256_div_by_epi32(v: __m256i) -> __m256i { + if BIT_DEPTH == 10 { + _mm256_div_by_1023_epi32(v) + } else if BIT_DEPTH == 12 { + _mm256_div_by_4095_epi32(v) + } else { + _mm256_div_by_65535_epi32(v) + } +} + pub(crate) fn avx_premultiply_alpha_rgba_u16( dst: &mut [u16], src: &[u16], @@ -92,21 +103,10 @@ trait Avx2PremultiplyExecutor { unsafe fn premultiply(&self, dst: &mut [u16], src: &[u16], bit_depth: usize); } -struct Avx2PremultiplyExecutor10Bit {} - -impl Default for Avx2PremultiplyExecutor10Bit { - fn default() -> Self { - Avx2PremultiplyExecutor10Bit {} - } -} - -impl Avx2PremultiplyExecutor10Bit { - fn create() -> impl Avx2PremultiplyExecutor { - Avx2PremultiplyExecutor10Bit::default() - } -} +#[derive(Default)] +struct Avx2PremultiplyExecutorDefault {} -impl Avx2PremultiplyExecutor for Avx2PremultiplyExecutor10Bit { +impl Avx2PremultiplyExecutor for Avx2PremultiplyExecutorDefault { #[target_feature(enable = "avx2")] unsafe fn premultiply(&self, dst: &mut [u16], src: &[u16], bit_depth: usize) { let max_colors = (1 << bit_depth) - 1; @@ -131,205 +131,31 @@ impl Avx2PremultiplyExecutor for Avx2PremultiplyExecutor10Bit { let high_alpha = _mm256_unpackhi_epi16(pixel.3, zeros); let new_rrr = _mm256_packus_epi32( - _mm256_div_by_1023_epi32(_mm256_madd_epi16( + _mm256_div_by_epi32::(_mm256_madd_epi16( _mm256_unpacklo_epi16(pixel.0, zeros), low_alpha, )), - _mm256_div_by_1023_epi32(_mm256_madd_epi16( + _mm256_div_by_epi32::(_mm256_madd_epi16( _mm256_unpackhi_epi16(pixel.0, zeros), high_alpha, )), ); let new_ggg = _mm256_packus_epi32( - _mm256_div_by_1023_epi32(_mm256_madd_epi16( + _mm256_div_by_epi32::(_mm256_madd_epi16( _mm256_unpacklo_epi16(pixel.1, zeros), low_alpha, )), - _mm256_div_by_1023_epi32(_mm256_madd_epi16( + _mm256_div_by_epi32::(_mm256_madd_epi16( _mm256_unpackhi_epi16(pixel.1, zeros), high_alpha, )), ); let new_bbb = _mm256_packus_epi32( - _mm256_div_by_1023_epi32(_mm256_madd_epi16( + _mm256_div_by_epi32::(_mm256_madd_epi16( _mm256_unpacklo_epi16(pixel.2, zeros), low_alpha, )), - _mm256_div_by_1023_epi32(_mm256_madd_epi16( - _mm256_unpackhi_epi16(pixel.2, zeros), - high_alpha, - )), - ); - - let dst_ptr = dst.as_mut_ptr(); - - let (d_lane0, d_lane1, d_lane2, d_lane3) = - avx_interleave_rgba_epi16(new_rrr, new_ggg, new_bbb, pixel.3); - - _mm256_storeu_si256(dst_ptr as *mut __m256i, d_lane0); - _mm256_storeu_si256(dst_ptr.add(16) as *mut __m256i, d_lane1); - _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, d_lane2); - _mm256_storeu_si256(dst_ptr.add(48) as *mut __m256i, d_lane3); - } - - rem = rem.chunks_exact_mut(16 * 4).into_remainder(); - src_rem = src_rem.chunks_exact(16 * 4).remainder(); - - premultiply_alpha_rgba_row(rem, src_rem, max_colors); - } -} - -struct Avx2PremultiplyExecutor12Bit {} - -impl Default for Avx2PremultiplyExecutor12Bit { - fn default() -> Self { - Avx2PremultiplyExecutor12Bit {} - } -} - -impl Avx2PremultiplyExecutor12Bit { - fn create() -> impl Avx2PremultiplyExecutor { - Avx2PremultiplyExecutor12Bit::default() - } -} - -impl Avx2PremultiplyExecutor for Avx2PremultiplyExecutor12Bit { - #[target_feature(enable = "avx2")] - unsafe fn premultiply(&self, dst: &mut [u16], src: &[u16], bit_depth: usize) { - let max_colors = (1 << bit_depth) - 1; - - let mut rem = dst; - let mut src_rem = src; - - for (dst, src) in rem - .chunks_exact_mut(16 * 4) - .zip(src_rem.chunks_exact(16 * 4)) - { - let 
src_ptr = src.as_ptr(); - let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i); - let lane1 = _mm256_loadu_si256(src_ptr.add(16) as *const __m256i); - let lane2 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); - let lane3 = _mm256_loadu_si256(src_ptr.add(48) as *const __m256i); - - let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); - - let zeros = _mm256_setzero_si256(); - let low_alpha = _mm256_unpacklo_epi16(pixel.3, zeros); - let high_alpha = _mm256_unpackhi_epi16(pixel.3, zeros); - - let new_rrr = _mm256_packus_epi32( - _mm256_div_by_4095_epi32(_mm256_madd_epi16( - _mm256_unpacklo_epi16(pixel.0, zeros), - low_alpha, - )), - _mm256_div_by_4095_epi32(_mm256_madd_epi16( - _mm256_unpackhi_epi16(pixel.0, zeros), - high_alpha, - )), - ); - let new_ggg = _mm256_packus_epi32( - _mm256_div_by_4095_epi32(_mm256_madd_epi16( - _mm256_unpacklo_epi16(pixel.1, zeros), - low_alpha, - )), - _mm256_div_by_4095_epi32(_mm256_madd_epi16( - _mm256_unpackhi_epi16(pixel.1, zeros), - high_alpha, - )), - ); - let new_bbb = _mm256_packus_epi32( - _mm256_div_by_4095_epi32(_mm256_madd_epi16( - _mm256_unpacklo_epi16(pixel.2, zeros), - low_alpha, - )), - _mm256_div_by_4095_epi32(_mm256_madd_epi16( - _mm256_unpackhi_epi16(pixel.2, zeros), - high_alpha, - )), - ); - - let dst_ptr = dst.as_mut_ptr(); - - let (d_lane0, d_lane1, d_lane2, d_lane3) = - avx_interleave_rgba_epi16(new_rrr, new_ggg, new_bbb, pixel.3); - - _mm256_storeu_si256(dst_ptr as *mut __m256i, d_lane0); - _mm256_storeu_si256(dst_ptr.add(16) as *mut __m256i, d_lane1); - _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, d_lane2); - _mm256_storeu_si256(dst_ptr.add(48) as *mut __m256i, d_lane3); - } - - rem = rem.chunks_exact_mut(16 * 4).into_remainder(); - src_rem = src_rem.chunks_exact(16 * 4).remainder(); - - premultiply_alpha_rgba_row(rem, src_rem, max_colors); - } -} - -struct Avx2PremultiplyExecutor16Bit {} - -impl Default for Avx2PremultiplyExecutor16Bit { - fn default() -> Self { - Avx2PremultiplyExecutor16Bit {} - } -} - -impl Avx2PremultiplyExecutor16Bit { - fn create() -> impl Avx2PremultiplyExecutor { - Avx2PremultiplyExecutor16Bit::default() - } -} - -impl Avx2PremultiplyExecutor for Avx2PremultiplyExecutor16Bit { - #[target_feature(enable = "avx2")] - unsafe fn premultiply(&self, dst: &mut [u16], src: &[u16], bit_depth: usize) { - let max_colors = (1 << bit_depth) - 1; - - let mut rem = dst; - let mut src_rem = src; - - for (dst, src) in rem - .chunks_exact_mut(16 * 4) - .zip(src_rem.chunks_exact(16 * 4)) - { - let src_ptr = src.as_ptr(); - let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i); - let lane1 = _mm256_loadu_si256(src_ptr.add(16) as *const __m256i); - let lane2 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); - let lane3 = _mm256_loadu_si256(src_ptr.add(48) as *const __m256i); - - let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); - - let zeros = _mm256_setzero_si256(); - let low_alpha = _mm256_unpacklo_epi16(pixel.3, zeros); - let high_alpha = _mm256_unpackhi_epi16(pixel.3, zeros); - - let new_rrr = _mm256_packus_epi32( - _mm256_div_by_65535_epi32(_mm256_mullo_epi32( - _mm256_unpacklo_epi16(pixel.0, zeros), - low_alpha, - )), - _mm256_div_by_65535_epi32(_mm256_mullo_epi32( - _mm256_unpackhi_epi16(pixel.0, zeros), - high_alpha, - )), - ); - let new_ggg = _mm256_packus_epi32( - _mm256_div_by_65535_epi32(_mm256_mullo_epi32( - _mm256_unpacklo_epi16(pixel.1, zeros), - low_alpha, - )), - _mm256_div_by_65535_epi32(_mm256_mullo_epi32( - _mm256_unpackhi_epi16(pixel.1, zeros), 
- high_alpha, - )), - ); - let new_bbb = _mm256_packus_epi32( - _mm256_div_by_65535_epi32(_mm256_mullo_epi32( - _mm256_unpacklo_epi16(pixel.2, zeros), - low_alpha, - )), - _mm256_div_by_65535_epi32(_mm256_mullo_epi32( + _mm256_div_by_epi32::(_mm256_madd_epi16( _mm256_unpackhi_epi16(pixel.2, zeros), high_alpha, )), @@ -353,20 +179,9 @@ impl Avx2PremultiplyExecutor for Avx2PremultiplyExecutor16Bit { } } +#[derive(Default)] struct Avx2PremultiplyExecutorAnyBit {} -impl Default for Avx2PremultiplyExecutorAnyBit { - fn default() -> Self { - Avx2PremultiplyExecutorAnyBit {} - } -} - -impl Avx2PremultiplyExecutorAnyBit { - fn create() -> impl Avx2PremultiplyExecutor { - Avx2PremultiplyExecutorAnyBit {} - } -} - impl Avx2PremultiplyExecutor for Avx2PremultiplyExecutorAnyBit { #[target_feature(enable = "avx2")] unsafe fn premultiply(&self, dst: &mut [u16], src: &[u16], bit_depth: usize) { @@ -425,13 +240,33 @@ impl Avx2PremultiplyExecutor for Avx2PremultiplyExecutorAnyBit { /// This inlining is required to activate all features for runtime dispatch unsafe fn avx_premultiply_alpha_rgba_u16_row(dst: &mut [u16], src: &[u16], bit_depth: usize) { if bit_depth == 10 { - avx_pa_dispatch(dst, src, bit_depth, Avx2PremultiplyExecutor10Bit::create()); + avx_pa_dispatch( + dst, + src, + bit_depth, + Avx2PremultiplyExecutorDefault::<10>::default(), + ); } else if bit_depth == 12 { - avx_pa_dispatch(dst, src, bit_depth, Avx2PremultiplyExecutor12Bit::create()); + avx_pa_dispatch( + dst, + src, + bit_depth, + Avx2PremultiplyExecutorDefault::<12>::default(), + ); } else if bit_depth == 16 { - avx_pa_dispatch(dst, src, bit_depth, Avx2PremultiplyExecutor16Bit::create()); + avx_pa_dispatch( + dst, + src, + bit_depth, + Avx2PremultiplyExecutorDefault::<16>::default(), + ); } else { - avx_pa_dispatch(dst, src, bit_depth, Avx2PremultiplyExecutorAnyBit::create()); + avx_pa_dispatch( + dst, + src, + bit_depth, + Avx2PremultiplyExecutorAnyBit::default(), + ); }; } diff --git a/src/avx2/alpha_u8.rs b/src/avx2/alpha_u8.rs index 35c8f94..4ecff54 100644 --- a/src/avx2/alpha_u8.rs +++ b/src/avx2/alpha_u8.rs @@ -110,102 +110,100 @@ unsafe fn avx_premultiply_alpha_rgba_impl_row(dst: &mut [u8], src: &[u8]) { let mut rem = dst; let mut src_rem = src; - unsafe { - for (dst, src) in rem - .chunks_exact_mut(32 * 4) - .zip(src_rem.chunks_exact(32 * 4)) - { - let src_ptr = src.as_ptr(); - let rgba0 = _mm256_loadu_si256(src_ptr as *const __m256i); - let rgba1 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); - let rgba2 = _mm256_loadu_si256(src_ptr.add(64) as *const __m256i); - let rgba3 = _mm256_loadu_si256(src_ptr.add(96) as *const __m256i); - let (rrr, ggg, bbb, aaa) = avx2_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); - - let zeros = _mm256_setzero_si256(); - - let mut rrr_low = _mm256_unpacklo_epi8(rrr, zeros); - let mut rrr_high = _mm256_unpackhi_epi8(rrr, zeros); - - let mut ggg_low = _mm256_unpacklo_epi8(ggg, zeros); - let mut ggg_high = _mm256_unpackhi_epi8(ggg, zeros); - - let mut bbb_low = _mm256_unpacklo_epi8(bbb, zeros); - let mut bbb_high = _mm256_unpackhi_epi8(bbb, zeros); - - let aaa_low = _mm256_unpacklo_epi8(aaa, zeros); - let aaa_high = _mm256_unpackhi_epi8(aaa, zeros); - - rrr_low = avx2_div_by255(_mm256_mullo_epi16(rrr_low, aaa_low)); - rrr_high = avx2_div_by255(_mm256_mullo_epi16(rrr_high, aaa_high)); - ggg_low = avx2_div_by255(_mm256_mullo_epi16(ggg_low, aaa_low)); - ggg_high = avx2_div_by255(_mm256_mullo_epi16(ggg_high, aaa_high)); - bbb_low = avx2_div_by255(_mm256_mullo_epi16(bbb_low, aaa_low)); - bbb_high 
= avx2_div_by255(_mm256_mullo_epi16(bbb_high, aaa_high)); - - let rrr = _mm256_packus_epi16(rrr_low, rrr_high); - let ggg = _mm256_packus_epi16(ggg_low, ggg_high); - let bbb = _mm256_packus_epi16(bbb_low, bbb_high); - - let (rgba0, rgba1, rgba2, rgba3) = avx2_interleave_rgba(rrr, ggg, bbb, aaa); - let dst_ptr = dst.as_mut_ptr(); - _mm256_storeu_si256(dst_ptr as *mut __m256i, rgba0); - _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, rgba1); - _mm256_storeu_si256(dst_ptr.add(64) as *mut __m256i, rgba2); - _mm256_storeu_si256(dst_ptr.add(96) as *mut __m256i, rgba3); - } - - rem = rem.chunks_exact_mut(32 * 4).into_remainder(); - src_rem = src_rem.chunks_exact(32 * 4).remainder(); - - let zeros = _mm_setzero_si128(); - for (dst, src) in rem - .chunks_exact_mut(16 * 4) - .zip(src_rem.chunks_exact(16 * 4)) - { - let src_ptr = src.as_ptr(); - let rgba0 = _mm_loadu_si128(src_ptr as *const __m128i); - let rgba1 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); - let rgba2 = _mm_loadu_si128(src_ptr.add(32) as *const __m128i); - let rgba3 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i); - let (rrr, ggg, bbb, aaa) = sse_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); - - let mut rrr_low = _mm_unpacklo_epi8(rrr, zeros); - let mut rrr_high = _mm_unpackhi_epi8(rrr, zeros); - - let mut ggg_low = _mm_unpacklo_epi8(ggg, zeros); - let mut ggg_high = _mm_unpackhi_epi8(ggg, zeros); - - let mut bbb_low = _mm_unpacklo_epi8(bbb, zeros); - let mut bbb_high = _mm_unpackhi_epi8(bbb, zeros); - - let aaa_low = _mm_unpacklo_epi8(aaa, zeros); - let aaa_high = _mm_unpackhi_epi8(aaa, zeros); - - rrr_low = _mm_div_by_255_epi16(_mm_mullo_epi16(rrr_low, aaa_low)); - rrr_high = _mm_div_by_255_epi16(_mm_mullo_epi16(rrr_high, aaa_high)); - ggg_low = _mm_div_by_255_epi16(_mm_mullo_epi16(ggg_low, aaa_low)); - ggg_high = _mm_div_by_255_epi16(_mm_mullo_epi16(ggg_high, aaa_high)); - bbb_low = _mm_div_by_255_epi16(_mm_mullo_epi16(bbb_low, aaa_low)); - bbb_high = _mm_div_by_255_epi16(_mm_mullo_epi16(bbb_high, aaa_high)); - - let rrr = _mm_packus_epi16(rrr_low, rrr_high); - let ggg = _mm_packus_epi16(ggg_low, ggg_high); - let bbb = _mm_packus_epi16(bbb_low, bbb_high); - - let (rgba0, rgba1, rgba2, rgba3) = sse_interleave_rgba(rrr, ggg, bbb, aaa); - - let dst_ptr = dst.as_mut_ptr(); - _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba1); - _mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, rgba2); - _mm_storeu_si128(dst_ptr.add(48) as *mut __m128i, rgba3); - } - - rem = rem.chunks_exact_mut(16 * 4).into_remainder(); - src_rem = src_rem.chunks_exact(16 * 4).remainder(); + for (dst, src) in rem + .chunks_exact_mut(32 * 4) + .zip(src_rem.chunks_exact(32 * 4)) + { + let src_ptr = src.as_ptr(); + let rgba0 = _mm256_loadu_si256(src_ptr as *const __m256i); + let rgba1 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); + let rgba2 = _mm256_loadu_si256(src_ptr.add(64) as *const __m256i); + let rgba3 = _mm256_loadu_si256(src_ptr.add(96) as *const __m256i); + let (rrr, ggg, bbb, aaa) = avx2_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); + + let zeros = _mm256_setzero_si256(); + + let mut rrr_low = _mm256_unpacklo_epi8(rrr, zeros); + let mut rrr_high = _mm256_unpackhi_epi8(rrr, zeros); + + let mut ggg_low = _mm256_unpacklo_epi8(ggg, zeros); + let mut ggg_high = _mm256_unpackhi_epi8(ggg, zeros); + + let mut bbb_low = _mm256_unpacklo_epi8(bbb, zeros); + let mut bbb_high = _mm256_unpackhi_epi8(bbb, zeros); + + let aaa_low = _mm256_unpacklo_epi8(aaa, zeros); + let aaa_high = 
_mm256_unpackhi_epi8(aaa, zeros); + + rrr_low = avx2_div_by255(_mm256_mullo_epi16(rrr_low, aaa_low)); + rrr_high = avx2_div_by255(_mm256_mullo_epi16(rrr_high, aaa_high)); + ggg_low = avx2_div_by255(_mm256_mullo_epi16(ggg_low, aaa_low)); + ggg_high = avx2_div_by255(_mm256_mullo_epi16(ggg_high, aaa_high)); + bbb_low = avx2_div_by255(_mm256_mullo_epi16(bbb_low, aaa_low)); + bbb_high = avx2_div_by255(_mm256_mullo_epi16(bbb_high, aaa_high)); + + let rrr = _mm256_packus_epi16(rrr_low, rrr_high); + let ggg = _mm256_packus_epi16(ggg_low, ggg_high); + let bbb = _mm256_packus_epi16(bbb_low, bbb_high); + + let (rgba0, rgba1, rgba2, rgba3) = avx2_interleave_rgba(rrr, ggg, bbb, aaa); + let dst_ptr = dst.as_mut_ptr(); + _mm256_storeu_si256(dst_ptr as *mut __m256i, rgba0); + _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, rgba1); + _mm256_storeu_si256(dst_ptr.add(64) as *mut __m256i, rgba2); + _mm256_storeu_si256(dst_ptr.add(96) as *mut __m256i, rgba3); + } + + rem = rem.chunks_exact_mut(32 * 4).into_remainder(); + src_rem = src_rem.chunks_exact(32 * 4).remainder(); + + let zeros = _mm_setzero_si128(); + for (dst, src) in rem + .chunks_exact_mut(16 * 4) + .zip(src_rem.chunks_exact(16 * 4)) + { + let src_ptr = src.as_ptr(); + let rgba0 = _mm_loadu_si128(src_ptr as *const __m128i); + let rgba1 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); + let rgba2 = _mm_loadu_si128(src_ptr.add(32) as *const __m128i); + let rgba3 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i); + let (rrr, ggg, bbb, aaa) = sse_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); + + let mut rrr_low = _mm_unpacklo_epi8(rrr, zeros); + let mut rrr_high = _mm_unpackhi_epi8(rrr, zeros); + + let mut ggg_low = _mm_unpacklo_epi8(ggg, zeros); + let mut ggg_high = _mm_unpackhi_epi8(ggg, zeros); + + let mut bbb_low = _mm_unpacklo_epi8(bbb, zeros); + let mut bbb_high = _mm_unpackhi_epi8(bbb, zeros); + + let aaa_low = _mm_unpacklo_epi8(aaa, zeros); + let aaa_high = _mm_unpackhi_epi8(aaa, zeros); + + rrr_low = _mm_div_by_255_epi16(_mm_mullo_epi16(rrr_low, aaa_low)); + rrr_high = _mm_div_by_255_epi16(_mm_mullo_epi16(rrr_high, aaa_high)); + ggg_low = _mm_div_by_255_epi16(_mm_mullo_epi16(ggg_low, aaa_low)); + ggg_high = _mm_div_by_255_epi16(_mm_mullo_epi16(ggg_high, aaa_high)); + bbb_low = _mm_div_by_255_epi16(_mm_mullo_epi16(bbb_low, aaa_low)); + bbb_high = _mm_div_by_255_epi16(_mm_mullo_epi16(bbb_high, aaa_high)); + + let rrr = _mm_packus_epi16(rrr_low, rrr_high); + let ggg = _mm_packus_epi16(ggg_low, ggg_high); + let bbb = _mm_packus_epi16(bbb_low, bbb_high); + + let (rgba0, rgba1, rgba2, rgba3) = sse_interleave_rgba(rrr, ggg, bbb, aaa); + + let dst_ptr = dst.as_mut_ptr(); + _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); + _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba1); + _mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, rgba2); + _mm_storeu_si128(dst_ptr.add(48) as *mut __m128i, rgba3); } + rem = rem.chunks_exact_mut(16 * 4).into_remainder(); + src_rem = src_rem.chunks_exact(16 * 4).remainder(); + premultiply_alpha_rgba_row_impl(rem, src_rem); } @@ -249,54 +247,52 @@ pub(crate) fn avx_unpremultiply_alpha_rgba( unsafe fn avx_unpremultiply_alpha_rgba_impl_row(in_place: &mut [u8]) { let mut rem = in_place; - unsafe { - for dst in rem.chunks_exact_mut(32 * 4) { - let src_ptr = dst.as_ptr(); - let rgba0 = _mm256_loadu_si256(src_ptr as *const __m256i); - let rgba1 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); - let rgba2 = _mm256_loadu_si256(src_ptr.add(64) as *const __m256i); - let rgba3 = 
_mm256_loadu_si256(src_ptr.add(96) as *const __m256i); - let (rrr, ggg, bbb, aaa) = avx2_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); - - let rrr = avx2_unpremultiply_row(rrr, aaa); - let ggg = avx2_unpremultiply_row(ggg, aaa); - let bbb = avx2_unpremultiply_row(bbb, aaa); - - let (rgba0, rgba1, rgba2, rgba3) = avx2_interleave_rgba(rrr, ggg, bbb, aaa); - - let dst_ptr = dst.as_mut_ptr(); - _mm256_storeu_si256(dst_ptr as *mut __m256i, rgba0); - _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, rgba1); - _mm256_storeu_si256(dst_ptr.add(64) as *mut __m256i, rgba2); - _mm256_storeu_si256(dst_ptr.add(96) as *mut __m256i, rgba3); - } - - rem = rem.chunks_exact_mut(32 * 4).into_remainder(); - - for dst in rem.chunks_exact_mut(16 * 4) { - let src_ptr = dst.as_ptr(); - let rgba0 = _mm_loadu_si128(src_ptr as *const __m128i); - let rgba1 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); - let rgba2 = _mm_loadu_si128(src_ptr.add(32) as *const __m128i); - let rgba3 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i); - let (rrr, ggg, bbb, aaa) = sse_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); - - let rrr = sse_unpremultiply_row(rrr, aaa); - let ggg = sse_unpremultiply_row(ggg, aaa); - let bbb = sse_unpremultiply_row(bbb, aaa); - - let (rgba0, rgba1, rgba2, rgba3) = sse_interleave_rgba(rrr, ggg, bbb, aaa); - - let dst_ptr = dst.as_mut_ptr(); - _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba1); - _mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, rgba2); - _mm_storeu_si128(dst_ptr.add(48) as *mut __m128i, rgba3); - } - - rem = rem.chunks_exact_mut(16 * 4).into_remainder(); + for dst in rem.chunks_exact_mut(32 * 4) { + let src_ptr = dst.as_ptr(); + let rgba0 = _mm256_loadu_si256(src_ptr as *const __m256i); + let rgba1 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); + let rgba2 = _mm256_loadu_si256(src_ptr.add(64) as *const __m256i); + let rgba3 = _mm256_loadu_si256(src_ptr.add(96) as *const __m256i); + let (rrr, ggg, bbb, aaa) = avx2_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); + + let rrr = avx2_unpremultiply_row(rrr, aaa); + let ggg = avx2_unpremultiply_row(ggg, aaa); + let bbb = avx2_unpremultiply_row(bbb, aaa); + + let (rgba0, rgba1, rgba2, rgba3) = avx2_interleave_rgba(rrr, ggg, bbb, aaa); + + let dst_ptr = dst.as_mut_ptr(); + _mm256_storeu_si256(dst_ptr as *mut __m256i, rgba0); + _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, rgba1); + _mm256_storeu_si256(dst_ptr.add(64) as *mut __m256i, rgba2); + _mm256_storeu_si256(dst_ptr.add(96) as *mut __m256i, rgba3); + } + + rem = rem.chunks_exact_mut(32 * 4).into_remainder(); + + for dst in rem.chunks_exact_mut(16 * 4) { + let src_ptr = dst.as_ptr(); + let rgba0 = _mm_loadu_si128(src_ptr as *const __m128i); + let rgba1 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); + let rgba2 = _mm_loadu_si128(src_ptr.add(32) as *const __m128i); + let rgba3 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i); + let (rrr, ggg, bbb, aaa) = sse_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); + + let rrr = sse_unpremultiply_row(rrr, aaa); + let ggg = sse_unpremultiply_row(ggg, aaa); + let bbb = sse_unpremultiply_row(bbb, aaa); + + let (rgba0, rgba1, rgba2, rgba3) = sse_interleave_rgba(rrr, ggg, bbb, aaa); + + let dst_ptr = dst.as_mut_ptr(); + _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); + _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba1); + _mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, rgba2); + _mm_storeu_si128(dst_ptr.add(48) as *mut __m128i, rgba3); } + rem = 
rem.chunks_exact_mut(16 * 4).into_remainder(); + unpremultiply_alpha_rgba_row_impl(rem); } diff --git a/src/avx2/rgba_f16.rs b/src/avx2/rgba_f16.rs index 105b2f2..b2bc4d5 100644 --- a/src/avx2/rgba_f16.rs +++ b/src/avx2/rgba_f16.rs @@ -49,8 +49,7 @@ unsafe fn convolve_horizontal_parts_one_rgba_f16( let src_ptr = src.add(start_x * COMPONENTS); let rgb_pixel = _mm_loadu_si64(src_ptr as *const u8); let pixels = avx_combine_ps(_mm_cvtph_ps(rgb_pixel), _mm_setzero_ps()); - let acc = _mm256_fma_ps::(store_0, pixels, weight0); - acc + _mm256_fma_ps::(store_0, pixels, weight0) } #[inline(always)] @@ -70,8 +69,7 @@ unsafe fn convolve_horizontal_parts_4_rgba_f16( let rgb_pixel_1 = _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(rgb_pixels_row_0)); let acc = _mm256_fma_ps::(store_0, rgb_pixel_0, weight0); - let acc = _mm256_fma_ps::(acc, rgb_pixel_1, weight1); - acc + _mm256_fma_ps::(acc, rgb_pixel_1, weight1) } #[inline(always)] @@ -112,8 +110,7 @@ unsafe fn convolve_horizontal_parts_2_rgba_f16( const COMPONENTS: usize = 4; let src_ptr = src.add(start_x * COMPONENTS); let rgb_pixels = _mm_loadu_si128(src_ptr as *const __m128i); - let acc = _mm256_fma_ps::(store_0, _mm256_cvtph_ps(rgb_pixels), weight0); - acc + _mm256_fma_ps::(store_0, _mm256_cvtph_ps(rgb_pixels), weight0) } pub(crate) fn convolve_horizontal_rgba_avx_row_one_f16( diff --git a/src/avx2/rgba_u8_lb.rs b/src/avx2/rgba_u8_lb.rs index b38ad25..d9663d3 100644 --- a/src/avx2/rgba_u8_lb.rs +++ b/src/avx2/rgba_u8_lb.rs @@ -98,220 +98,214 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_impl( dst_stride: usize, filter_weights: &FilterWeights, ) { - unsafe { - const CHANNELS: usize = 4; + const CHANNELS: usize = 4; - const SCALE: i32 = 6; - const V_SHR: i32 = SCALE; - const ROUNDING: i16 = 1 << (V_SHR - 1); + const SCALE: i32 = 6; + const V_SHR: i32 = SCALE; + const ROUNDING: i16 = 1 << (V_SHR - 1); - let vld = _mm256_setr_epi16( - ROUNDING, ROUNDING, ROUNDING, ROUNDING, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ); + let vld = _mm256_setr_epi16( + ROUNDING, ROUNDING, ROUNDING, ROUNDING, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ); + + let shuffle_weights = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3); + + let (row0_ref, rest) = dst.split_at_mut(dst_stride); + let (row1_ref, rest) = rest.split_at_mut(dst_stride); + let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); + + let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS); + let iter_row1 = row1_ref.chunks_exact_mut(CHANNELS); + let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS); + let iter_row3 = row3_ref.chunks_exact_mut(CHANNELS); + + for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 + .zip(iter_row1) + .zip(iter_row2) + .zip(iter_row3) + .zip(filter_weights.bounds.iter()) + .zip( + filter_weights + .weights + .chunks_exact(filter_weights.aligned_size), + ) + { + let mut jx = 0usize; + let mut store_0 = vld; + let mut store_1 = vld; + let mut store_2 = vld; + let mut store_3 = vld; + + let src0 = src; + let src1 = src0.get_unchecked(src_stride..); + let src2 = src1.get_unchecked(src_stride..); + let src3 = src2.get_unchecked(src_stride..); + + while jx + 8 < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 8)); - let shuffle_weights = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3); - - let (row0_ref, rest) = dst.split_at_mut(dst_stride); - let (row1_ref, rest) = rest.split_at_mut(dst_stride); - let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); - - let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS); - let iter_row1 = 
row1_ref.chunks_exact_mut(CHANNELS); - let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS); - let iter_row3 = row3_ref.chunks_exact_mut(CHANNELS); - - for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 - .zip(iter_row1) - .zip(iter_row2) - .zip(iter_row3) - .zip(filter_weights.bounds.iter()) - .zip( - filter_weights - .weights - .chunks_exact(filter_weights.aligned_size), - ) - { - let mut jx = 0usize; - let mut store_0 = vld; - let mut store_1 = vld; - let mut store_2 = vld; - let mut store_3 = vld; - - let src0 = src; - let src1 = src0.get_unchecked(src_stride..); - let src2 = src1.get_unchecked(src_stride..); - let src3 = src2.get_unchecked(src_stride..); - - while jx + 8 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 8)); - - let w0 = w_ptr[0]; - let w1 = w_ptr[1]; - let w2 = w_ptr[2]; - let w3 = w_ptr[3]; - let w4 = w_ptr[4]; - let w5 = w_ptr[5]; - let w6 = w_ptr[6]; - let w7 = w_ptr[7]; - - let weights0 = _mm256_setr_epi16( - w0, w0, w0, w0, w1, w1, w1, w1, w4, w4, w4, w5, w5, w5, w5, w5, - ); - let weights1 = _mm256_setr_epi16( - w2, w2, w2, w2, w3, w3, w3, w3, w6, w6, w6, w6, w7, w7, w7, w7, - ); - - let start_bounds = bounds.start + jx; - - let rgb_pixel_0 = _mm256_loadu_si256( - src0.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m256i, - ); - let rgb_pixel_1 = _mm256_loadu_si256( - src1.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m256i, - ); - let rgb_pixel_2 = _mm256_loadu_si256( - src2.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m256i, - ); - let rgb_pixel_3 = _mm256_loadu_si256( - src3.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m256i, - ); - - store_0 = hdot4(store_0, rgb_pixel_0, weights0, weights1); - store_1 = hdot4(store_1, rgb_pixel_1, weights0, weights1); - store_2 = hdot4(store_2, rgb_pixel_2, weights0, weights1); - store_3 = hdot4(store_3, rgb_pixel_3, weights0, weights1); - - jx += 8; - } - - while jx + 4 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 4)); - - let w0 = w_ptr[0]; - let w1 = w_ptr[1]; - let w2 = w_ptr[2]; - let w3 = w_ptr[3]; - - let weights = _mm256_setr_epi16( - w0, w0, w0, w0, w1, w1, w1, w1, w2, w2, w2, w2, w3, w3, w3, w3, - ); - - let start_bounds = bounds.start + jx; - - let rgb_pixel_0 = - _mm256_permute4x64_epi64::<0x50>(_mm256_castsi128_si256(_mm_loadu_si128( - src0.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ))); - - let rgb_pixel_1 = - _mm256_permute4x64_epi64::<0x50>(_mm256_castsi128_si256(_mm_loadu_si128( - src1.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ))); - let rgb_pixel_2 = - _mm256_permute4x64_epi64::<0x50>(_mm256_castsi128_si256(_mm_loadu_si128( - src2.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ))); - let rgb_pixel_3 = - _mm256_permute4x64_epi64::<0x50>(_mm256_castsi128_si256(_mm_loadu_si128( - src3.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ))); - - store_0 = hdot2(store_0, rgb_pixel_0, weights); - store_1 = hdot2(store_1, rgb_pixel_1, weights); - store_2 = hdot2(store_2, rgb_pixel_2, weights); - store_3 = hdot2(store_3, rgb_pixel_3, weights); - - jx += 4; - } - - let mut store_0 = _mm_add_epi16( - _mm256_castsi256_si128(store_0), - _mm256_extracti128_si256::<1>(store_0), + let w0 = w_ptr[0]; + let w1 = w_ptr[1]; + let w2 = w_ptr[2]; + let w3 = w_ptr[3]; + let w4 = w_ptr[4]; + let w5 = w_ptr[5]; + let w6 = w_ptr[6]; + let w7 = w_ptr[7]; + + let weights0 = _mm256_setr_epi16( + w0, w0, 
w0, w0, w1, w1, w1, w1, w4, w4, w4, w5, w5, w5, w5, w5, ); - let mut store_1 = _mm_add_epi16( - _mm256_castsi256_si128(store_1), - _mm256_extracti128_si256::<1>(store_1), + let weights1 = _mm256_setr_epi16( + w2, w2, w2, w2, w3, w3, w3, w3, w6, w6, w6, w6, w7, w7, w7, w7, ); - let mut store_2 = _mm_add_epi16( - _mm256_castsi256_si128(store_2), - _mm256_extracti128_si256::<1>(store_2), + + let start_bounds = bounds.start + jx; + + let rgb_pixel_0 = _mm256_loadu_si256( + src0.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m256i, ); - let mut store_3 = _mm_add_epi16( - _mm256_castsi256_si128(store_3), - _mm256_extracti128_si256::<1>(store_3), + let rgb_pixel_1 = _mm256_loadu_si256( + src1.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m256i, ); - - while jx + 2 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 2)); - let bounds_start = bounds.start + jx; - - let weight01 = _mm_shuffle_epi8( - _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()), - shuffle_weights, - ); - - let rgb_pixel_0 = - _mm_loadu_si64(src0.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - let rgb_pixel_1 = - _mm_loadu_si64(src1.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - let rgb_pixel_2 = - _mm_loadu_si64(src2.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - let rgb_pixel_3 = - _mm_loadu_si64(src3.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - - store_0 = hdot(store_0, rgb_pixel_0, weight01); - store_1 = hdot(store_1, rgb_pixel_1, weight01); - store_2 = hdot(store_2, rgb_pixel_2, weight01); - store_3 = hdot(store_3, rgb_pixel_3, weight01); - - jx += 2; - } - - while jx < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 1)); - - let weight0 = _mm_set1_epi16(w_ptr[0]); - - let start_bounds = bounds.start + jx; - - store_0 = - convolve_horizontal_parts_one_rgba_sse(start_bounds, src0, weight0, store_0); - store_1 = - convolve_horizontal_parts_one_rgba_sse(start_bounds, src1, weight0, store_1); - store_2 = - convolve_horizontal_parts_one_rgba_sse(start_bounds, src2, weight0, store_2); - store_3 = - convolve_horizontal_parts_one_rgba_sse(start_bounds, src3, weight0, store_3); - jx += 1; - } - - store_0 = _mm_add_hi_lo_epi16(store_0); - store_1 = _mm_add_hi_lo_epi16(store_1); - store_2 = _mm_add_hi_lo_epi16(store_2); - store_3 = _mm_add_hi_lo_epi16(store_3); - - let store_16_8_0 = _mm_srai_epi16::(store_0); - let store_16_8_1 = _mm_srai_epi16::(store_1); - let store_16_8_2 = _mm_srai_epi16::(store_2); - let store_16_8_3 = _mm_srai_epi16::(store_3); - - _mm_storeu_si32( - chunk0.as_mut_ptr() as *mut _, - _mm_packus_epi16(store_16_8_0, store_16_8_0), + let rgb_pixel_2 = _mm256_loadu_si256( + src2.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m256i, ); - _mm_storeu_si32( - chunk1.as_mut_ptr() as *mut _, - _mm_packus_epi16(store_16_8_1, store_16_8_1), + let rgb_pixel_3 = _mm256_loadu_si256( + src3.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m256i, ); - _mm_storeu_si32( - chunk2.as_mut_ptr() as *mut _, - _mm_packus_epi16(store_16_8_2, store_16_8_2), + + store_0 = hdot4(store_0, rgb_pixel_0, weights0, weights1); + store_1 = hdot4(store_1, rgb_pixel_1, weights0, weights1); + store_2 = hdot4(store_2, rgb_pixel_2, weights0, weights1); + store_3 = hdot4(store_3, rgb_pixel_3, weights0, weights1); + + jx += 8; + } + + while jx + 4 < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 4)); + + let w0 = w_ptr[0]; + let w1 = w_ptr[1]; + let w2 = w_ptr[2]; + let w3 = w_ptr[3]; + + let weights = 
_mm256_setr_epi16( + w0, w0, w0, w0, w1, w1, w1, w1, w2, w2, w2, w2, w3, w3, w3, w3, ); - _mm_storeu_si32( - chunk3.as_mut_ptr() as *mut _, - _mm_packus_epi16(store_16_8_3, store_16_8_3), + + let start_bounds = bounds.start + jx; + + let rgb_pixel_0 = + _mm256_permute4x64_epi64::<0x50>(_mm256_castsi128_si256(_mm_loadu_si128( + src0.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, + ))); + + let rgb_pixel_1 = + _mm256_permute4x64_epi64::<0x50>(_mm256_castsi128_si256(_mm_loadu_si128( + src1.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, + ))); + let rgb_pixel_2 = + _mm256_permute4x64_epi64::<0x50>(_mm256_castsi128_si256(_mm_loadu_si128( + src2.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, + ))); + let rgb_pixel_3 = + _mm256_permute4x64_epi64::<0x50>(_mm256_castsi128_si256(_mm_loadu_si128( + src3.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, + ))); + + store_0 = hdot2(store_0, rgb_pixel_0, weights); + store_1 = hdot2(store_1, rgb_pixel_1, weights); + store_2 = hdot2(store_2, rgb_pixel_2, weights); + store_3 = hdot2(store_3, rgb_pixel_3, weights); + + jx += 4; + } + + let mut store_0 = _mm_add_epi16( + _mm256_castsi256_si128(store_0), + _mm256_extracti128_si256::<1>(store_0), + ); + let mut store_1 = _mm_add_epi16( + _mm256_castsi256_si128(store_1), + _mm256_extracti128_si256::<1>(store_1), + ); + let mut store_2 = _mm_add_epi16( + _mm256_castsi256_si128(store_2), + _mm256_extracti128_si256::<1>(store_2), + ); + let mut store_3 = _mm_add_epi16( + _mm256_castsi256_si128(store_3), + _mm256_extracti128_si256::<1>(store_3), + ); + + while jx + 2 < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 2)); + let bounds_start = bounds.start + jx; + + let weight01 = _mm_shuffle_epi8( + _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()), + shuffle_weights, ); + + let rgb_pixel_0 = + _mm_loadu_si64(src0.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); + let rgb_pixel_1 = + _mm_loadu_si64(src1.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); + let rgb_pixel_2 = + _mm_loadu_si64(src2.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); + let rgb_pixel_3 = + _mm_loadu_si64(src3.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); + + store_0 = hdot(store_0, rgb_pixel_0, weight01); + store_1 = hdot(store_1, rgb_pixel_1, weight01); + store_2 = hdot(store_2, rgb_pixel_2, weight01); + store_3 = hdot(store_3, rgb_pixel_3, weight01); + + jx += 2; } + + while jx < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 1)); + + let weight0 = _mm_set1_epi16(w_ptr[0]); + + let start_bounds = bounds.start + jx; + + store_0 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src0, weight0, store_0); + store_1 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src1, weight0, store_1); + store_2 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src2, weight0, store_2); + store_3 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src3, weight0, store_3); + jx += 1; + } + + store_0 = _mm_add_hi_lo_epi16(store_0); + store_1 = _mm_add_hi_lo_epi16(store_1); + store_2 = _mm_add_hi_lo_epi16(store_2); + store_3 = _mm_add_hi_lo_epi16(store_3); + + let store_16_8_0 = _mm_srai_epi16::(store_0); + let store_16_8_1 = _mm_srai_epi16::(store_1); + let store_16_8_2 = _mm_srai_epi16::(store_2); + let store_16_8_3 = _mm_srai_epi16::(store_3); + + _mm_storeu_si32( + chunk0.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_0, store_16_8_0), + ); + _mm_storeu_si32( + chunk1.as_mut_ptr() as 
*mut _, + _mm_packus_epi16(store_16_8_1, store_16_8_1), + ); + _mm_storeu_si32( + chunk2.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_2, store_16_8_2), + ); + _mm_storeu_si32( + chunk3.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_3, store_16_8_3), + ); } } diff --git a/src/sse/alpha_f16.rs b/src/sse/alpha_f16.rs index b5ed21d..43f8bd0 100644 --- a/src/sse/alpha_f16.rs +++ b/src/sse/alpha_f16.rs @@ -195,78 +195,76 @@ unsafe fn sse_unpremultiply_alpha_rgba_f16c( unsafe fn sse_unpremultiply_alpha_rgba_f16_row_impl(in_place: &mut [half::f16]) { let mut rem = in_place; - unsafe { - for dst in rem.chunks_exact_mut(8 * 4) { - let src_ptr = dst.as_ptr(); - let lane0 = _mm_loadu_si128(src_ptr as *const __m128i); - let lane1 = _mm_loadu_si128(src_ptr.add(8) as *const __m128i); - let lane2 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); - let lane3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); - let pixel = sse_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); - - let low_alpha = _mm_cvtph_psx::(pixel.3); - let zeros = _mm_setzero_ps(); - let low_alpha_zero_mask = _mm_cmpeq_ps(low_alpha, zeros); - let low_r = _mm_blendv_ps( - _mm_mul_ps(_mm_cvtph_psx::(pixel.0), low_alpha), - zeros, - low_alpha_zero_mask, - ); - let low_g = _mm_blendv_ps( - _mm_mul_ps(_mm_cvtph_psx::(pixel.1), low_alpha), - zeros, - low_alpha_zero_mask, - ); - let low_b = _mm_blendv_ps( - _mm_mul_ps(_mm_cvtph_psx::(pixel.2), low_alpha), - zeros, - low_alpha_zero_mask, - ); + for dst in rem.chunks_exact_mut(8 * 4) { + let src_ptr = dst.as_ptr(); + let lane0 = _mm_loadu_si128(src_ptr as *const __m128i); + let lane1 = _mm_loadu_si128(src_ptr.add(8) as *const __m128i); + let lane2 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); + let lane3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); + let pixel = sse_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); - let high_alpha = _mm_cvtph_psx::(_mm_srli_si128::<8>(pixel.3)); - let high_alpha_zero_mask = _mm_cmpeq_ps(high_alpha, zeros); - let high_r = _mm_blendv_ps( - _mm_mul_ps( - _mm_cvtph_psx::(_mm_srli_si128::<8>(pixel.0)), - high_alpha, - ), - zeros, - high_alpha_zero_mask, - ); - let high_g = _mm_blendv_ps( - _mm_mul_ps( - _mm_cvtph_psx::(_mm_srli_si128::<8>(pixel.1)), - high_alpha, - ), - zeros, - high_alpha_zero_mask, - ); - let high_b = _mm_blendv_ps( - _mm_mul_ps( - _mm_cvtph_psx::(_mm_srli_si128::<8>(pixel.2)), - high_alpha, - ), - zeros, - high_alpha_zero_mask, - ); - let r_values = - _mm_unpacklo_epi64(_mm_cvtps_phx::(low_r), _mm_cvtps_phx::(high_r)); - let g_values = - _mm_unpacklo_epi64(_mm_cvtps_phx::(low_g), _mm_cvtps_phx::(high_g)); - let b_values = - _mm_unpacklo_epi64(_mm_cvtps_phx::(low_b), _mm_cvtps_phx::(high_b)); - let dst_ptr = dst.as_mut_ptr(); - let (d_lane0, d_lane1, d_lane2, d_lane3) = - sse_interleave_rgba_epi16(r_values, g_values, b_values, pixel.3); - _mm_storeu_si128(dst_ptr as *mut __m128i, d_lane0); - _mm_storeu_si128(dst_ptr.add(8) as *mut __m128i, d_lane1); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, d_lane2); - _mm_storeu_si128(dst_ptr.add(24) as *mut __m128i, d_lane3); - } + let low_alpha = _mm_cvtph_psx::(pixel.3); + let zeros = _mm_setzero_ps(); + let low_alpha_zero_mask = _mm_cmpeq_ps(low_alpha, zeros); + let low_r = _mm_blendv_ps( + _mm_mul_ps(_mm_cvtph_psx::(pixel.0), low_alpha), + zeros, + low_alpha_zero_mask, + ); + let low_g = _mm_blendv_ps( + _mm_mul_ps(_mm_cvtph_psx::(pixel.1), low_alpha), + zeros, + low_alpha_zero_mask, + ); + let low_b = _mm_blendv_ps( + 
_mm_mul_ps(_mm_cvtph_psx::(pixel.2), low_alpha), + zeros, + low_alpha_zero_mask, + ); - rem = rem.chunks_exact_mut(8 * 4).into_remainder(); + let high_alpha = _mm_cvtph_psx::(_mm_srli_si128::<8>(pixel.3)); + let high_alpha_zero_mask = _mm_cmpeq_ps(high_alpha, zeros); + let high_r = _mm_blendv_ps( + _mm_mul_ps( + _mm_cvtph_psx::(_mm_srli_si128::<8>(pixel.0)), + high_alpha, + ), + zeros, + high_alpha_zero_mask, + ); + let high_g = _mm_blendv_ps( + _mm_mul_ps( + _mm_cvtph_psx::(_mm_srli_si128::<8>(pixel.1)), + high_alpha, + ), + zeros, + high_alpha_zero_mask, + ); + let high_b = _mm_blendv_ps( + _mm_mul_ps( + _mm_cvtph_psx::(_mm_srli_si128::<8>(pixel.2)), + high_alpha, + ), + zeros, + high_alpha_zero_mask, + ); + let r_values = + _mm_unpacklo_epi64(_mm_cvtps_phx::(low_r), _mm_cvtps_phx::(high_r)); + let g_values = + _mm_unpacklo_epi64(_mm_cvtps_phx::(low_g), _mm_cvtps_phx::(high_g)); + let b_values = + _mm_unpacklo_epi64(_mm_cvtps_phx::(low_b), _mm_cvtps_phx::(high_b)); + let dst_ptr = dst.as_mut_ptr(); + let (d_lane0, d_lane1, d_lane2, d_lane3) = + sse_interleave_rgba_epi16(r_values, g_values, b_values, pixel.3); + _mm_storeu_si128(dst_ptr as *mut __m128i, d_lane0); + _mm_storeu_si128(dst_ptr.add(8) as *mut __m128i, d_lane1); + _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, d_lane2); + _mm_storeu_si128(dst_ptr.add(24) as *mut __m128i, d_lane3); } + rem = rem.chunks_exact_mut(8 * 4).into_remainder(); + unpremultiply_pixel_f16_row(rem); } diff --git a/src/sse/alpha_u16.rs b/src/sse/alpha_u16.rs index 38e79e4..5910361 100644 --- a/src/sse/alpha_u16.rs +++ b/src/sse/alpha_u16.rs @@ -88,6 +88,17 @@ pub(crate) unsafe fn _mm_div_by_65535_epi32(v: __m128i) -> __m128i { _mm_srli_epi32::(_mm_add_epi32(v, _mm_srli_epi32::(v))) } +#[inline(always)] +unsafe fn _mm_div_by(v: __m128i) -> __m128i { + if BIT_DEPTH == 10 { + _mm_div_by_1023_epi32(v) + } else if BIT_DEPTH == 12 { + _mm_div_by_4095_epi32(v) + } else { + _mm_div_by_65535_epi32(v) + } +} + pub(crate) fn unpremultiply_alpha_sse_rgba_u16( in_place: &mut [u16], width: usize, @@ -226,169 +237,144 @@ pub(crate) fn premultiply_alpha_sse_rgba_u16( } } -#[target_feature(enable = "sse4.1")] -unsafe fn premultiply_alpha_sse_rgba_u16_row_impl(dst: &mut [u16], src: &[u16], bit_depth: usize) { - let max_colors = (1 << bit_depth) - 1; +trait Sse41PremultiplyExecutor { + unsafe fn premultiply(&self, dst: &mut [u16], src: &[u16], bit_depth: usize); +} - let mut rem = dst; - let mut src_rem = src; +#[derive(Default)] +struct Sse41PremultiplyExecutorDefault {} - unsafe { - if bit_depth == 10 { - let zeros = _mm_setzero_si128(); - for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { - let src_ptr = src.as_ptr(); - let row0 = _mm_loadu_si128(src_ptr as *const __m128i); - let row1 = _mm_loadu_si128(src_ptr.add(8) as *const __m128i); - let row2 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); - let row3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); - let (rrrr, gggg, bbbb, aaaa) = sse_deinterleave_rgba_epi16(row0, row1, row2, row3); - - let a_lo_f = _mm_unpacklo_epi16(aaaa, zeros); - let a_hi_f = _mm_unpackhi_epi16(aaaa, zeros); - - let new_rrrr = _mm_packus_epi32( - _mm_div_by_1023_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(rrrr, zeros), a_lo_f)), - _mm_div_by_1023_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(rrrr, zeros), a_hi_f)), - ); - let new_gggg = _mm_packus_epi32( - _mm_div_by_1023_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(gggg, zeros), a_lo_f)), - _mm_div_by_1023_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(gggg, zeros), 
a_hi_f)), - ); - let new_bbbb = _mm_packus_epi32( - _mm_div_by_1023_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(bbbb, zeros), a_lo_f)), - _mm_div_by_1023_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(bbbb, zeros), a_hi_f)), - ); - - let (rgba0, rgba1, rgba2, rgba3) = - sse_interleave_rgba_epi16(new_rrrr, new_gggg, new_bbbb, aaaa); - - let dst_ptr = dst.as_mut_ptr(); - _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); - _mm_storeu_si128(dst_ptr.add(8) as *mut __m128i, rgba1); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba2); - _mm_storeu_si128(dst_ptr.add(24) as *mut __m128i, rgba3); - } - } else if bit_depth == 12 { - let zeros = _mm_setzero_si128(); - for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { - let src_ptr = src.as_ptr(); - let row0 = _mm_loadu_si128(src_ptr as *const __m128i); - let row1 = _mm_loadu_si128(src_ptr.add(8) as *const __m128i); - let row2 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); - let row3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); - let (rrrr, gggg, bbbb, aaaa) = sse_deinterleave_rgba_epi16(row0, row1, row2, row3); - - let a_lo_f = _mm_unpacklo_epi16(aaaa, zeros); - let a_hi_f = _mm_unpackhi_epi16(aaaa, zeros); - - let new_rrrr = _mm_packus_epi32( - _mm_div_by_4095_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(rrrr, zeros), a_lo_f)), - _mm_div_by_4095_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(rrrr, zeros), a_hi_f)), - ); - let new_gggg = _mm_packus_epi32( - _mm_div_by_4095_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(gggg, zeros), a_lo_f)), - _mm_div_by_4095_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(gggg, zeros), a_hi_f)), - ); - let new_bbbb = _mm_packus_epi32( - _mm_div_by_4095_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(bbbb, zeros), a_lo_f)), - _mm_div_by_4095_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(bbbb, zeros), a_hi_f)), - ); - - let (rgba0, rgba1, rgba2, rgba3) = - sse_interleave_rgba_epi16(new_rrrr, new_gggg, new_bbbb, aaaa); - - let dst_ptr = dst.as_mut_ptr(); - _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); - _mm_storeu_si128(dst_ptr.add(8) as *mut __m128i, rgba1); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba2); - _mm_storeu_si128(dst_ptr.add(24) as *mut __m128i, rgba3); - } - } else if bit_depth == 16 { - let zeros = _mm_setzero_si128(); - for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { - let src_ptr = src.as_ptr(); - let row0 = _mm_loadu_si128(src_ptr as *const __m128i); - let row1 = _mm_loadu_si128(src_ptr.add(8) as *const __m128i); - let row2 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); - let row3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); - let (rrrr, gggg, bbbb, aaaa) = sse_deinterleave_rgba_epi16(row0, row1, row2, row3); - - let a_lo_f = _mm_unpacklo_epi16(aaaa, zeros); - let a_hi_f = _mm_unpackhi_epi16(aaaa, zeros); - - let new_rrrr = _mm_packus_epi32( - _mm_div_by_65535_epi32(_mm_mullo_epi32( - _mm_unpacklo_epi16(rrrr, zeros), - a_lo_f, - )), - _mm_div_by_65535_epi32(_mm_mullo_epi32( - _mm_unpackhi_epi16(rrrr, zeros), - a_hi_f, - )), - ); - let new_gggg = _mm_packus_epi32( - _mm_div_by_65535_epi32(_mm_mullo_epi32( - _mm_unpacklo_epi16(gggg, zeros), - a_lo_f, - )), - _mm_div_by_65535_epi32(_mm_mullo_epi32( - _mm_unpackhi_epi16(gggg, zeros), - a_hi_f, - )), - ); - let new_bbbb = _mm_packus_epi32( - _mm_div_by_65535_epi32(_mm_mullo_epi32( - _mm_unpacklo_epi16(bbbb, zeros), - a_lo_f, - )), - _mm_div_by_65535_epi32(_mm_mullo_epi32( - _mm_unpackhi_epi16(bbbb, zeros), - a_hi_f, - )), - ); - - let (rgba0, rgba1, rgba2, rgba3) = - 
sse_interleave_rgba_epi16(new_rrrr, new_gggg, new_bbbb, aaaa); - - let dst_ptr = dst.as_mut_ptr(); - _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); - _mm_storeu_si128(dst_ptr.add(8) as *mut __m128i, rgba1); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba2); - _mm_storeu_si128(dst_ptr.add(24) as *mut __m128i, rgba3); - } - } else { - let v_max_colors_scale = - _mm_div_ps(_mm_set1_ps(1.), _mm_cvtepi32_ps(_mm_set1_epi32(max_colors))); - for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { - let src_ptr = src.as_ptr(); - let row0 = _mm_loadu_si128(src_ptr as *const __m128i); - let row1 = _mm_loadu_si128(src_ptr.add(8) as *const __m128i); - let row2 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); - let row3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); - let (rrrr, gggg, bbbb, aaaa) = sse_deinterleave_rgba_epi16(row0, row1, row2, row3); - - let a_lo_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(aaaa, _mm_setzero_si128())); - let a_hi_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(aaaa, _mm_setzero_si128())); - - let new_rrrr = sse_premultiply_row_u16(rrrr, a_lo_f, a_hi_f, v_max_colors_scale); - let new_gggg = sse_premultiply_row_u16(gggg, a_lo_f, a_hi_f, v_max_colors_scale); - let new_bbbb = sse_premultiply_row_u16(bbbb, a_lo_f, a_hi_f, v_max_colors_scale); - - let (rgba0, rgba1, rgba2, rgba3) = - sse_interleave_rgba_epi16(new_rrrr, new_gggg, new_bbbb, aaaa); - - let dst_ptr = dst.as_mut_ptr(); - _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); - _mm_storeu_si128(dst_ptr.add(8) as *mut __m128i, rgba1); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba2); - _mm_storeu_si128(dst_ptr.add(24) as *mut __m128i, rgba3); - } +impl Sse41PremultiplyExecutor + for Sse41PremultiplyExecutorDefault +{ + #[target_feature(enable = "sse4.1")] + unsafe fn premultiply(&self, dst: &mut [u16], src: &[u16], bit_depth: usize) { + let max_colors = (1 << bit_depth) - 1; + + let mut rem = dst; + let mut src_rem = src; + + let zeros = _mm_setzero_si128(); + for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { + let src_ptr = src.as_ptr(); + let row0 = _mm_loadu_si128(src_ptr as *const __m128i); + let row1 = _mm_loadu_si128(src_ptr.add(8) as *const __m128i); + let row2 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); + let row3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); + let (rrrr, gggg, bbbb, aaaa) = sse_deinterleave_rgba_epi16(row0, row1, row2, row3); + + let a_lo_f = _mm_unpacklo_epi16(aaaa, zeros); + let a_hi_f = _mm_unpackhi_epi16(aaaa, zeros); + + let new_rrrr = _mm_packus_epi32( + _mm_div_by::(_mm_madd_epi16(_mm_unpacklo_epi16(rrrr, zeros), a_lo_f)), + _mm_div_by::(_mm_madd_epi16(_mm_unpackhi_epi16(rrrr, zeros), a_hi_f)), + ); + let new_gggg = _mm_packus_epi32( + _mm_div_by::(_mm_madd_epi16(_mm_unpacklo_epi16(gggg, zeros), a_lo_f)), + _mm_div_by::(_mm_madd_epi16(_mm_unpackhi_epi16(gggg, zeros), a_hi_f)), + ); + let new_bbbb = _mm_packus_epi32( + _mm_div_by::(_mm_madd_epi16(_mm_unpacklo_epi16(bbbb, zeros), a_lo_f)), + _mm_div_by::(_mm_madd_epi16(_mm_unpackhi_epi16(bbbb, zeros), a_hi_f)), + ); + + let (rgba0, rgba1, rgba2, rgba3) = + sse_interleave_rgba_epi16(new_rrrr, new_gggg, new_bbbb, aaaa); + + let dst_ptr = dst.as_mut_ptr(); + _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); + _mm_storeu_si128(dst_ptr.add(8) as *mut __m128i, rgba1); + _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba2); + _mm_storeu_si128(dst_ptr.add(24) as *mut __m128i, rgba3); } rem = rem.chunks_exact_mut(8 * 4).into_remainder(); src_rem = 
src_rem.chunks_exact(8 * 4).remainder(); + + premultiply_alpha_rgba_row(rem, src_rem, max_colors as u32); } +} + +#[derive(Default)] +struct Sse41PremultiplyExecutorAny {} + +impl Sse41PremultiplyExecutor for Sse41PremultiplyExecutorAny { + #[target_feature(enable = "sse4.1")] + unsafe fn premultiply(&self, dst: &mut [u16], src: &[u16], bit_depth: usize) { + let max_colors = (1 << bit_depth) - 1; + + let mut rem = dst; + let mut src_rem = src; - premultiply_alpha_rgba_row(rem, src_rem, max_colors as u32); + let v_max_colors_scale = + _mm_div_ps(_mm_set1_ps(1.), _mm_cvtepi32_ps(_mm_set1_epi32(max_colors))); + for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { + let src_ptr = src.as_ptr(); + let row0 = _mm_loadu_si128(src_ptr as *const __m128i); + let row1 = _mm_loadu_si128(src_ptr.add(8) as *const __m128i); + let row2 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); + let row3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); + let (rrrr, gggg, bbbb, aaaa) = sse_deinterleave_rgba_epi16(row0, row1, row2, row3); + + let a_lo_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(aaaa, _mm_setzero_si128())); + let a_hi_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(aaaa, _mm_setzero_si128())); + + let new_rrrr = sse_premultiply_row_u16(rrrr, a_lo_f, a_hi_f, v_max_colors_scale); + let new_gggg = sse_premultiply_row_u16(gggg, a_lo_f, a_hi_f, v_max_colors_scale); + let new_bbbb = sse_premultiply_row_u16(bbbb, a_lo_f, a_hi_f, v_max_colors_scale); + + let (rgba0, rgba1, rgba2, rgba3) = + sse_interleave_rgba_epi16(new_rrrr, new_gggg, new_bbbb, aaaa); + + let dst_ptr = dst.as_mut_ptr(); + _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); + _mm_storeu_si128(dst_ptr.add(8) as *mut __m128i, rgba1); + _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba2); + _mm_storeu_si128(dst_ptr.add(24) as *mut __m128i, rgba3); + } + + rem = rem.chunks_exact_mut(8 * 4).into_remainder(); + src_rem = src_rem.chunks_exact(8 * 4).remainder(); + + premultiply_alpha_rgba_row(rem, src_rem, max_colors as u32); + } +} + +#[target_feature(enable = "sse4.1")] +unsafe fn pma_sse41_rgba16_dispatch( + dst: &mut [u16], + src: &[u16], + bit_depth: usize, + executor: impl Sse41PremultiplyExecutor, +) { + executor.premultiply(dst, src, bit_depth); +} + +#[target_feature(enable = "sse4.1")] +unsafe fn premultiply_alpha_sse_rgba_u16_row_impl(dst: &mut [u16], src: &[u16], bit_depth: usize) { + if bit_depth == 10 { + pma_sse41_rgba16_dispatch( + dst, + src, + bit_depth, + Sse41PremultiplyExecutorDefault::<10>::default(), + ) + } else if bit_depth == 12 { + pma_sse41_rgba16_dispatch( + dst, + src, + bit_depth, + Sse41PremultiplyExecutorDefault::<12>::default(), + ) + } else if bit_depth == 16 { + pma_sse41_rgba16_dispatch( + dst, + src, + bit_depth, + Sse41PremultiplyExecutorDefault::<16>::default(), + ) + } else { + pma_sse41_rgba16_dispatch(dst, src, bit_depth, Sse41PremultiplyExecutorAny::default()) + } } diff --git a/src/sse/alpha_u8.rs b/src/sse/alpha_u8.rs index 32f01cd..53a75ab 100644 --- a/src/sse/alpha_u8.rs +++ b/src/sse/alpha_u8.rs @@ -111,55 +111,53 @@ unsafe fn sse_premultiply_alpha_rgba_impl_row(dst: &mut [u8], src: &[u8]) { let mut rem = dst; let mut src_rem = src; - unsafe { - let zeros = _mm_setzero_si128(); - for (dst, src) in rem - .chunks_exact_mut(16 * 4) - .zip(src_rem.chunks_exact(16 * 4)) - { - let src_ptr = src.as_ptr(); - let rgba0 = _mm_loadu_si128(src_ptr as *const __m128i); - let rgba1 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); - let rgba2 = _mm_loadu_si128(src_ptr.add(32) as *const 
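
// --- Reference sketch: the const-generic executor dispatch used above ---
// The refactor replaces one large `if bit_depth == …` body with per-depth executors:
// the bit depth becomes a const generic, so each specialisation is monomorphised with
// its own division constant, while `premultiply_alpha_sse_rgba_u16_row_impl` picks an
// executor at run time. In the patch the executors are `unsafe` and carry
// `#[target_feature(enable = "sse4.1")]`, and the dispatcher is gated the same way;
// this scalar sketch drops the intrinsics, and the names `Premultiply`, `FixedDepth`
// and `AnyDepth` are illustrative, not the crate's.
trait Premultiply {
    fn premultiply_row(&self, dst: &mut [u16], src: &[u16], bit_depth: u32);
}

struct FixedDepth<const BIT_DEPTH: u32>;

impl<const BIT_DEPTH: u32> Premultiply for FixedDepth<BIT_DEPTH> {
    fn premultiply_row(&self, dst: &mut [u16], src: &[u16], _bit_depth: u32) {
        for (d, s) in dst.chunks_exact_mut(4).zip(src.chunks_exact(4)) {
            let a = s[3] as u32;
            for c in 0..3 {
                // 2^BIT_DEPTH - 1 is known at compile time, so the division
                // collapses into the shift-add form sketched earlier.
                let t = s[c] as u32 * a + (1 << (BIT_DEPTH - 1));
                d[c] = ((t + (t >> BIT_DEPTH)) >> BIT_DEPTH) as u16;
            }
            d[3] = s[3];
        }
    }
}

struct AnyDepth;

impl Premultiply for AnyDepth {
    fn premultiply_row(&self, dst: &mut [u16], src: &[u16], bit_depth: u32) {
        // Arbitrary depths fall back to a reciprocal multiply, like the float path above.
        let scale = 1.0 / ((1u32 << bit_depth) - 1) as f32;
        for (d, s) in dst.chunks_exact_mut(4).zip(src.chunks_exact(4)) {
            let a = s[3] as f32;
            for c in 0..3 {
                d[c] = (s[c] as f32 * a * scale).round() as u16;
            }
            d[3] = s[3];
        }
    }
}

fn premultiply_row_dispatch(dst: &mut [u16], src: &[u16], bit_depth: u32) {
    match bit_depth {
        10 => FixedDepth::<10>.premultiply_row(dst, src, bit_depth),
        12 => FixedDepth::<12>.premultiply_row(dst, src, bit_depth),
        16 => FixedDepth::<16>.premultiply_row(dst, src, bit_depth),
        _ => AnyDepth.premultiply_row(dst, src, bit_depth),
    }
}
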
__m128i); - let rgba3 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i); - let (rrr, ggg, bbb, aaa) = sse_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); - - let mut rrr_low = _mm_unpacklo_epi8(rrr, zeros); - let mut rrr_high = _mm_unpackhi_epi8(rrr, zeros); - - let mut ggg_low = _mm_unpacklo_epi8(ggg, zeros); - let mut ggg_high = _mm_unpackhi_epi8(ggg, zeros); - - let mut bbb_low = _mm_unpacklo_epi8(bbb, zeros); - let mut bbb_high = _mm_unpackhi_epi8(bbb, zeros); - - let aaa_low = _mm_unpacklo_epi8(aaa, zeros); - let aaa_high = _mm_unpackhi_epi8(aaa, zeros); - - rrr_low = _mm_div_by_255_epi16(_mm_mullo_epi16(rrr_low, aaa_low)); - rrr_high = _mm_div_by_255_epi16(_mm_mullo_epi16(rrr_high, aaa_high)); - ggg_low = _mm_div_by_255_epi16(_mm_mullo_epi16(ggg_low, aaa_low)); - ggg_high = _mm_div_by_255_epi16(_mm_mullo_epi16(ggg_high, aaa_high)); - bbb_low = _mm_div_by_255_epi16(_mm_mullo_epi16(bbb_low, aaa_low)); - bbb_high = _mm_div_by_255_epi16(_mm_mullo_epi16(bbb_high, aaa_high)); - - let rrr = _mm_packus_epi16(rrr_low, rrr_high); - let ggg = _mm_packus_epi16(ggg_low, ggg_high); - let bbb = _mm_packus_epi16(bbb_low, bbb_high); - - let (rgba0, rgba1, rgba2, rgba3) = sse_interleave_rgba(rrr, ggg, bbb, aaa); - - let dst_ptr = dst.as_mut_ptr(); - _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba1); - _mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, rgba2); - _mm_storeu_si128(dst_ptr.add(48) as *mut __m128i, rgba3); - } - - rem = rem.chunks_exact_mut(16 * 4).into_remainder(); - src_rem = src_rem.chunks_exact(16 * 4).remainder(); + let zeros = _mm_setzero_si128(); + for (dst, src) in rem + .chunks_exact_mut(16 * 4) + .zip(src_rem.chunks_exact(16 * 4)) + { + let src_ptr = src.as_ptr(); + let rgba0 = _mm_loadu_si128(src_ptr as *const __m128i); + let rgba1 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); + let rgba2 = _mm_loadu_si128(src_ptr.add(32) as *const __m128i); + let rgba3 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i); + let (rrr, ggg, bbb, aaa) = sse_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); + + let mut rrr_low = _mm_unpacklo_epi8(rrr, zeros); + let mut rrr_high = _mm_unpackhi_epi8(rrr, zeros); + + let mut ggg_low = _mm_unpacklo_epi8(ggg, zeros); + let mut ggg_high = _mm_unpackhi_epi8(ggg, zeros); + + let mut bbb_low = _mm_unpacklo_epi8(bbb, zeros); + let mut bbb_high = _mm_unpackhi_epi8(bbb, zeros); + + let aaa_low = _mm_unpacklo_epi8(aaa, zeros); + let aaa_high = _mm_unpackhi_epi8(aaa, zeros); + + rrr_low = _mm_div_by_255_epi16(_mm_mullo_epi16(rrr_low, aaa_low)); + rrr_high = _mm_div_by_255_epi16(_mm_mullo_epi16(rrr_high, aaa_high)); + ggg_low = _mm_div_by_255_epi16(_mm_mullo_epi16(ggg_low, aaa_low)); + ggg_high = _mm_div_by_255_epi16(_mm_mullo_epi16(ggg_high, aaa_high)); + bbb_low = _mm_div_by_255_epi16(_mm_mullo_epi16(bbb_low, aaa_low)); + bbb_high = _mm_div_by_255_epi16(_mm_mullo_epi16(bbb_high, aaa_high)); + + let rrr = _mm_packus_epi16(rrr_low, rrr_high); + let ggg = _mm_packus_epi16(ggg_low, ggg_high); + let bbb = _mm_packus_epi16(bbb_low, bbb_high); + + let (rgba0, rgba1, rgba2, rgba3) = sse_interleave_rgba(rrr, ggg, bbb, aaa); + + let dst_ptr = dst.as_mut_ptr(); + _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); + _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba1); + _mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, rgba2); + _mm_storeu_si128(dst_ptr.add(48) as *mut __m128i, rgba3); } + rem = rem.chunks_exact_mut(16 * 4).into_remainder(); + src_rem = src_rem.chunks_exact(16 * 4).remainder(); + 
premultiply_alpha_rgba_row_impl(rem, src_rem); } @@ -203,31 +201,30 @@ pub(crate) fn sse_unpremultiply_alpha_rgba( #[target_feature(enable = "sse4.1")] unsafe fn sse_unpremultiply_alpha_rgba_impl_row(in_place: &mut [u8]) { let mut rem = in_place; - unsafe { - for dst in rem.chunks_exact_mut(16 * 4) { - let src_ptr = dst.as_ptr(); - let rgba0 = _mm_loadu_si128(src_ptr as *const __m128i); - let rgba1 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); - let rgba2 = _mm_loadu_si128(src_ptr.add(32) as *const __m128i); - let rgba3 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i); - let (rrr, ggg, bbb, aaa) = sse_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); - - let rrr = sse_unpremultiply_row(rrr, aaa); - let ggg = sse_unpremultiply_row(ggg, aaa); - let bbb = sse_unpremultiply_row(bbb, aaa); - - let (rgba0, rgba1, rgba2, rgba3) = sse_interleave_rgba(rrr, ggg, bbb, aaa); - - let dst_ptr = dst.as_mut_ptr(); - _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba1); - _mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, rgba2); - _mm_storeu_si128(dst_ptr.add(48) as *mut __m128i, rgba3); - } - rem = rem.chunks_exact_mut(8 * 4).into_remainder(); + for dst in rem.chunks_exact_mut(16 * 4) { + let src_ptr = dst.as_ptr(); + let rgba0 = _mm_loadu_si128(src_ptr as *const __m128i); + let rgba1 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); + let rgba2 = _mm_loadu_si128(src_ptr.add(32) as *const __m128i); + let rgba3 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i); + let (rrr, ggg, bbb, aaa) = sse_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); + + let rrr = sse_unpremultiply_row(rrr, aaa); + let ggg = sse_unpremultiply_row(ggg, aaa); + let bbb = sse_unpremultiply_row(bbb, aaa); + + let (rgba0, rgba1, rgba2, rgba3) = sse_interleave_rgba(rrr, ggg, bbb, aaa); + + let dst_ptr = dst.as_mut_ptr(); + _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); + _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba1); + _mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, rgba2); + _mm_storeu_si128(dst_ptr.add(48) as *mut __m128i, rgba3); } + rem = rem.chunks_exact_mut(8 * 4).into_remainder(); + for dst in rem.chunks_exact_mut(4) { let a = dst[3]; if a != 0 { diff --git a/src/sse/plane_f32.rs b/src/sse/plane_f32.rs index 99ed16a..7c83a9a 100644 --- a/src/sse/plane_f32.rs +++ b/src/sse/plane_f32.rs @@ -316,7 +316,7 @@ unsafe fn convolve_horizontal_plane_sse_rows_4_fma( ); } -#[inline] +#[inline(always)] unsafe fn convolve_horizontal_plane_sse_rows_4_impl( dst_width: usize, _: usize, @@ -326,139 +326,135 @@ unsafe fn convolve_horizontal_plane_sse_rows_4_impl( dst: &mut [f32], dst_stride: usize, ) { - unsafe { - let mut filter_offset = 0usize; - let zeros = _mm_setzero_ps(); - let weights_ptr = filter_weights.weights.as_ptr(); - - for x in 0..dst_width { - let bounds = filter_weights.bounds.get_unchecked(x); - let mut jx = 0usize; - let mut store_0 = zeros; - let mut store_1 = zeros; - let mut store_2 = zeros; - let mut store_3 = zeros; - - while jx + 16 < bounds.size { - let ptr = weights_ptr.add(jx + filter_offset); - let read_weights0 = _mm_loadu_ps(ptr); - let read_weights1 = _mm_loadu_ps(ptr.add(4)); - let read_weights2 = _mm_loadu_ps(ptr.add(8)); - let read_weights3 = _mm_loadu_ps(ptr.add(12)); - let weights = (read_weights0, read_weights1, read_weights2, read_weights3); - let bounds_start = bounds.start + jx; - store_0 = conv_horiz_plane_16_f32!(bounds_start, src, weights, store_0, FMA); - let s_ptr_1 = src.get_unchecked(src_stride..); - store_1 = 
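
// --- Reference sketch: block body plus scalar tail ---
// Every row kernel in this patch follows the same shape: the vector body consumes
// full blocks through `chunks_exact` / `chunks_exact_mut`, and whatever is left over
// is handed to the scalar per-pixel routine (here `premultiply_alpha_rgba_row_impl`).
// This safe sketch mirrors that structure; the block body below is plain scalar code
// standing in for the SSE4.1 deinterleave → widen → multiply → pack path, and
// `premultiply_tail` is an illustrative name.
const BLOCK: usize = 16 * 4; // 16 RGBA pixels per iteration, as in the loop above

fn premultiply_tail(dst: &mut [u8], src: &[u8]) {
    for (d, s) in dst.chunks_exact_mut(4).zip(src.chunks_exact(4)) {
        let a = s[3] as u16;
        for c in 0..3 {
            // (x * a) / 255 with rounding; `_mm_div_by_255_epi16` plays this role in the SSE body
            let t = s[c] as u16 * a + 128;
            d[c] = ((t + (t >> 8)) >> 8) as u8;
        }
        d[3] = s[3];
    }
}

fn premultiply_row(dst: &mut [u8], src: &[u8]) {
    let mut dst_blocks = dst.chunks_exact_mut(BLOCK);
    let mut src_blocks = src.chunks_exact(BLOCK);
    for (d, s) in (&mut dst_blocks).zip(&mut src_blocks) {
        premultiply_tail(d, s); // the vector body would go here
    }
    // Pixels that do not fill a whole block go to the scalar fallback.
    premultiply_tail(dst_blocks.into_remainder(), src_blocks.remainder());
}
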
conv_horiz_plane_16_f32!(bounds_start, s_ptr_1, weights, store_1, FMA); - let s_ptr2 = src.get_unchecked(src_stride * 2..); - store_2 = conv_horiz_plane_16_f32!(bounds_start, s_ptr2, weights, store_2, FMA); - let s_ptr3 = src.get_unchecked(src_stride * 3..); - store_3 = conv_horiz_plane_16_f32!(bounds_start, s_ptr3, weights, store_3, FMA); - jx += 16; - } - - while jx + 8 < bounds.size { - let ptr = weights_ptr.add(jx + filter_offset); - let read_weights0 = _mm_loadu_ps(ptr); - let read_weights1 = _mm_loadu_ps(ptr.add(4)); - let read_weights = (read_weights0, read_weights1); - let bounds_start = bounds.start + jx; - store_0 = conv_horiz_plane_8_f32!( - bounds_start, - src, - read_weights.0, - read_weights.1, - store_0, - FMA - ); - let s_ptr_1 = src.get_unchecked(src_stride..); - store_1 = conv_horiz_plane_8_f32!( - bounds_start, - s_ptr_1, - read_weights.0, - read_weights.1, - store_1, - FMA - ); - let s_ptr2 = src.get_unchecked(src_stride * 2..); - store_2 = conv_horiz_plane_8_f32!( - bounds_start, - s_ptr2, - read_weights.0, - read_weights.1, - store_2, - FMA - ); - let s_ptr3 = src.get_unchecked(src_stride * 3..); - store_3 = conv_horiz_plane_8_f32!( - bounds_start, - s_ptr3, - read_weights.0, - read_weights.1, - store_3, - FMA - ); - jx += 8; - } - - while jx + 4 < bounds.size { - let ptr = weights_ptr.add(jx + filter_offset); - let read_weights = _mm_loadu_ps(ptr); - let bounds_start = bounds.start + jx; - store_0 = conv_horiz_plane_4_f32!(bounds_start, src, read_weights, store_0, FMA); - let s_ptr_1 = src.get_unchecked(src_stride..); - store_1 = - conv_horiz_plane_4_f32!(bounds_start, s_ptr_1, read_weights, store_1, FMA); - let s_ptr2 = src.get_unchecked(src_stride * 2..); - store_2 = conv_horiz_plane_4_f32!(bounds_start, s_ptr2, read_weights, store_2, FMA); - let s_ptr3 = src.get_unchecked(src_stride * 3..); - store_3 = conv_horiz_plane_4_f32!(bounds_start, s_ptr3, read_weights, store_3, FMA); - jx += 4; - } - - while jx + 2 < bounds.size { - let ptr = weights_ptr.add(jx + filter_offset); - let weights = - _mm_setr_ps(ptr.read_unaligned(), ptr.add(1).read_unaligned(), 0., 0.); - let bounds_start = bounds.start + jx; - store_0 = conv_horiz_plane_2_f32!(bounds_start, src, weights, store_0, FMA); - let ptr_1 = src.get_unchecked(src_stride..); - store_1 = conv_horiz_plane_2_f32!(bounds_start, ptr_1, weights, store_1, FMA); - let ptr_2 = src.get_unchecked(src_stride * 2..); - store_2 = conv_horiz_plane_2_f32!(bounds_start, ptr_2, weights, store_2, FMA); - let ptr_3 = src.get_unchecked(src_stride * 3..); - store_3 = conv_horiz_plane_2_f32!(bounds_start, ptr_3, weights, store_3, FMA); - jx += 2; - } - - while jx < bounds.size { - let ptr = weights_ptr.add(jx + filter_offset); - let weight0 = _mm_set1_ps(ptr.read_unaligned()); - let bounds_start = bounds.start + jx; - store_0 = conv_horiz_plane_1_f32!(bounds_start, src, weight0, store_0, FMA); - let ptr_1 = src.get_unchecked(src_stride..); - store_1 = conv_horiz_plane_1_f32!(bounds_start, ptr_1, weight0, store_1, FMA); - let ptr_2 = src.get_unchecked(src_stride * 2..); - store_2 = conv_horiz_plane_1_f32!(bounds_start, ptr_2, weight0, store_2, FMA); - let ptr_3 = src.get_unchecked(src_stride * 3..); - store_3 = conv_horiz_plane_1_f32!(bounds_start, ptr_3, weight0, store_3, FMA); - jx += 1; - } - - let px = x; - let dest_ptr = dst.get_unchecked_mut(px); - *dest_ptr = _mm_hsum_ps(store_0); - - let dest_ptr = dst.get_unchecked_mut(px + dst_stride); - *dest_ptr = _mm_hsum_ps(store_1); - - let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 
2); - *dest_ptr = _mm_hsum_ps(store_2); - - let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 3); - *dest_ptr = _mm_hsum_ps(store_3); - - filter_offset += filter_weights.aligned_size; + let mut filter_offset = 0usize; + let zeros = _mm_setzero_ps(); + let weights_ptr = filter_weights.weights.as_ptr(); + + for x in 0..dst_width { + let bounds = filter_weights.bounds.get_unchecked(x); + let mut jx = 0usize; + let mut store_0 = zeros; + let mut store_1 = zeros; + let mut store_2 = zeros; + let mut store_3 = zeros; + + while jx + 16 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let read_weights0 = _mm_loadu_ps(ptr); + let read_weights1 = _mm_loadu_ps(ptr.add(4)); + let read_weights2 = _mm_loadu_ps(ptr.add(8)); + let read_weights3 = _mm_loadu_ps(ptr.add(12)); + let weights = (read_weights0, read_weights1, read_weights2, read_weights3); + let bounds_start = bounds.start + jx; + store_0 = conv_horiz_plane_16_f32!(bounds_start, src, weights, store_0, FMA); + let s_ptr_1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_plane_16_f32!(bounds_start, s_ptr_1, weights, store_1, FMA); + let s_ptr2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_plane_16_f32!(bounds_start, s_ptr2, weights, store_2, FMA); + let s_ptr3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_plane_16_f32!(bounds_start, s_ptr3, weights, store_3, FMA); + jx += 16; } + + while jx + 8 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let read_weights0 = _mm_loadu_ps(ptr); + let read_weights1 = _mm_loadu_ps(ptr.add(4)); + let read_weights = (read_weights0, read_weights1); + let bounds_start = bounds.start + jx; + store_0 = conv_horiz_plane_8_f32!( + bounds_start, + src, + read_weights.0, + read_weights.1, + store_0, + FMA + ); + let s_ptr_1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_plane_8_f32!( + bounds_start, + s_ptr_1, + read_weights.0, + read_weights.1, + store_1, + FMA + ); + let s_ptr2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_plane_8_f32!( + bounds_start, + s_ptr2, + read_weights.0, + read_weights.1, + store_2, + FMA + ); + let s_ptr3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_plane_8_f32!( + bounds_start, + s_ptr3, + read_weights.0, + read_weights.1, + store_3, + FMA + ); + jx += 8; + } + + while jx + 4 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let read_weights = _mm_loadu_ps(ptr); + let bounds_start = bounds.start + jx; + store_0 = conv_horiz_plane_4_f32!(bounds_start, src, read_weights, store_0, FMA); + let s_ptr_1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_plane_4_f32!(bounds_start, s_ptr_1, read_weights, store_1, FMA); + let s_ptr2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_plane_4_f32!(bounds_start, s_ptr2, read_weights, store_2, FMA); + let s_ptr3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_plane_4_f32!(bounds_start, s_ptr3, read_weights, store_3, FMA); + jx += 4; + } + + while jx + 2 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weights = _mm_setr_ps(ptr.read_unaligned(), ptr.add(1).read_unaligned(), 0., 0.); + let bounds_start = bounds.start + jx; + store_0 = conv_horiz_plane_2_f32!(bounds_start, src, weights, store_0, FMA); + let ptr_1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_plane_2_f32!(bounds_start, ptr_1, weights, store_1, FMA); + let ptr_2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_plane_2_f32!(bounds_start, ptr_2, weights, store_2, FMA); + let 
ptr_3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_plane_2_f32!(bounds_start, ptr_3, weights, store_3, FMA); + jx += 2; + } + + while jx < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = _mm_set1_ps(ptr.read_unaligned()); + let bounds_start = bounds.start + jx; + store_0 = conv_horiz_plane_1_f32!(bounds_start, src, weight0, store_0, FMA); + let ptr_1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_plane_1_f32!(bounds_start, ptr_1, weight0, store_1, FMA); + let ptr_2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_plane_1_f32!(bounds_start, ptr_2, weight0, store_2, FMA); + let ptr_3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_plane_1_f32!(bounds_start, ptr_3, weight0, store_3, FMA); + jx += 1; + } + + let px = x; + let dest_ptr = dst.get_unchecked_mut(px); + *dest_ptr = _mm_hsum_ps(store_0); + + let dest_ptr = dst.get_unchecked_mut(px + dst_stride); + *dest_ptr = _mm_hsum_ps(store_1); + + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 2); + *dest_ptr = _mm_hsum_ps(store_2); + + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 3); + *dest_ptr = _mm_hsum_ps(store_3); + + filter_offset += filter_weights.aligned_size; } } diff --git a/src/sse/rgb_f16.rs b/src/sse/rgb_f16.rs index ede3433..637b774 100644 --- a/src/sse/rgb_f16.rs +++ b/src/sse/rgb_f16.rs @@ -82,8 +82,7 @@ unsafe fn convolve_horizontal_parts_4_rgb_f16 let acc = _mm_prefer_fma_ps::(store_0, rgb_pixel_0, weight0); let acc = _mm_prefer_fma_ps::(acc, rgb_pixel_1, weight1); let acc = _mm_prefer_fma_ps::(acc, rgb_pixel_2, weight2); - let acc = _mm_prefer_fma_ps::(acc, rgb_pixel_3, weight3); - acc + _mm_prefer_fma_ps::(acc, rgb_pixel_3, weight3) } #[inline(always)] @@ -142,8 +141,7 @@ unsafe fn convolve_horizontal_parts_one_rgb_f16(store_0, rgb_pixel, weight0); - acc + _mm_prefer_fma_ps::(store_0, rgb_pixel, weight0) } pub(crate) fn convolve_horizontal_rgb_sse_row_one_f16( diff --git a/src/sse/rgb_u8.rs b/src/sse/rgb_u8.rs index 227eb7e..cc1b041 100644 --- a/src/sse/rgb_u8.rs +++ b/src/sse/rgb_u8.rs @@ -77,10 +77,9 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_impl( dst_stride: usize, filter_weights: &FilterWeights, ) { - unsafe { - const CHANNELS: usize = 3; + const CHANNELS: usize = 3; - #[rustfmt::skip] + #[rustfmt::skip] let shuffle_lo = _mm_setr_epi8(0, -1, 3, -1, 1, -1, @@ -90,7 +89,7 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_impl( -1, -1, -1, -1); - #[rustfmt::skip] + #[rustfmt::skip] let shuffle_hi = _mm_setr_epi8(6, -1, 9, -1, 7, -1, @@ -100,147 +99,142 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_impl( -1, -1, -1, -1); - let vld = _mm_set1_epi32(ROUNDING_CONST); - - let (row0_ref, rest) = dst.split_at_mut(dst_stride); - let (row1_ref, rest) = rest.split_at_mut(dst_stride); - let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); - - let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS); - let iter_row1 = row1_ref.chunks_exact_mut(CHANNELS); - let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS); - let iter_row3 = row3_ref.chunks_exact_mut(CHANNELS); - - for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 - .zip(iter_row1) - .zip(iter_row2) - .zip(iter_row3) - .zip(filter_weights.bounds.iter()) - .zip( - filter_weights - .weights - .chunks_exact(filter_weights.aligned_size), - ) - { - let mut jx = 0usize; - let mut store_0 = vld; - let mut store_1 = vld; - let mut store_2 = vld; - let mut store_3 = vld; - - let src0 = src; - let src1 = src0.get_unchecked(src_stride..); - let src2 = 
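
// --- Reference sketch: what the four-row horizontal plane pass computes ---
// For every destination x, the horizontal window selected by that x's bounds entry is
// convolved with that x's weights in four consecutive source rows at once, each row
// keeping its own accumulator. The SSE version above peels the weight loop into
// 16/8/4/2/1-tap blocks and uses FMA when available, but the per-sample arithmetic is
// the one below. `Bounds` stands in for the crate's FilterWeights bounds entry.
struct Bounds {
    start: usize,
    size: usize,
}

fn convolve_four_rows_scalar(
    src: &[f32],
    src_stride: usize,
    dst: &mut [f32],
    dst_stride: usize,
    dst_width: usize,
    bounds: &[Bounds],
    weights: &[f32],
    aligned_size: usize,
) {
    for x in 0..dst_width {
        let b = &bounds[x];
        let w = &weights[x * aligned_size..][..b.size];
        let mut acc = [0f32; 4];
        for (j, &wj) in w.iter().enumerate() {
            let sx = b.start + j;
            for r in 0..4 {
                acc[r] += src[r * src_stride + sx] * wj;
            }
        }
        for r in 0..4 {
            dst[r * dst_stride + x] = acc[r];
        }
    }
}
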
src1.get_unchecked(src_stride..); - let src3 = src2.get_unchecked(src_stride..); - - // Will make step in 4 items however since it is RGB it is necessary to make a safe offset - while jx + 4 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 4)); - let weights = _mm_loadu_si64(w_ptr.as_ptr() as *const u8); - const SHUFFLE_01: i32 = shuffle(0, 0, 0, 0); - let weight01 = _mm_shuffle_epi32::(weights); - const SHUFFLE_23: i32 = shuffle(1, 1, 1, 1); - let weight23 = _mm_shuffle_epi32::(weights); - let bounds_start = (bounds.start + jx) * CHANNELS; - - let rgb_pixel_0 = load_rgb_x4(src0.get_unchecked(bounds_start..)); - let rgb_pixel_1 = load_rgb_x4(src1.get_unchecked(bounds_start..)); - let rgb_pixel_2 = load_rgb_x4(src2.get_unchecked(bounds_start..)); - let rgb_pixel_4 = load_rgb_x4(src3.get_unchecked(bounds_start..)); - - let hi_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_hi); - let lo_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_lo); - let hi_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_hi); - let lo_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_lo); - let hi_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_hi); - let lo_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_lo); - let hi_3 = _mm_shuffle_epi8(rgb_pixel_4, shuffle_hi); - let lo_3 = _mm_shuffle_epi8(rgb_pixel_4, shuffle_lo); - - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(lo_0, weight01)); - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(hi_0, weight23)); - - store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo_1, weight01)); - store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(hi_1, weight23)); - - store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo_2, weight01)); - store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(hi_2, weight23)); - - store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo_3, weight01)); - store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(hi_3, weight23)); - jx += 4; - } - - while jx + 2 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 2)); - let bounds_start = (bounds.start + jx) * CHANNELS; - let weight01 = _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()); - - let rgb_pixel_0 = load_rgb_x2(src0.get_unchecked(bounds_start..)); - let rgb_pixel_1 = load_rgb_x2(src1.get_unchecked(bounds_start..)); - let rgb_pixel_2 = load_rgb_x2(src2.get_unchecked(bounds_start..)); - let rgb_pixel_4 = load_rgb_x2(src3.get_unchecked(bounds_start..)); - - let lo_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_lo); - let lo_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_lo); - let lo_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_lo); - let lo_3 = _mm_shuffle_epi8(rgb_pixel_4, shuffle_lo); - - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(lo_0, weight01)); - store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo_1, weight01)); - store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo_2, weight01)); - store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo_3, weight01)); - - jx += 2; - } - - while jx < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 1)); - let bounds_start = bounds.start + jx; - - let weight0 = _mm_set1_epi32(w_ptr.as_ptr().read_unaligned() as i32); - - store_0 = - convolve_horizontal_parts_one_sse_rgb(bounds_start, src0, weight0, store_0); - store_1 = - convolve_horizontal_parts_one_sse_rgb(bounds_start, src1, weight0, store_1); - store_2 = - convolve_horizontal_parts_one_sse_rgb(bounds_start, src2, weight0, store_2); - store_3 = - convolve_horizontal_parts_one_sse_rgb(bounds_start, src3, weight0, store_3); - jx += 1; - } - - let store_0_8 = compress_i32(store_0); - let store_1_8 = compress_i32(store_1); - let store_2_8 = 
compress_i32(store_2); - let store_3_8 = compress_i32(store_3); - - let element_0 = _mm_extract_epi32::<0>(store_0_8); - let element_1 = _mm_extract_epi32::<0>(store_1_8); - let element_2 = _mm_extract_epi32::<0>(store_2_8); - let element_3 = _mm_extract_epi32::<0>(store_3_8); - - let bytes = element_0.to_le_bytes(); - let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); - (chunk0.as_mut_ptr() as *mut u16).write_unaligned(first_byte); - *chunk0.get_unchecked_mut(2) = bytes[2]; - - let bytes = element_1.to_le_bytes(); - let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); - (chunk1.as_mut_ptr() as *mut u16).write_unaligned(first_byte); - *chunk1.get_unchecked_mut(2) = bytes[2]; - - let bytes = element_2.to_le_bytes(); - let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); - (chunk2.as_mut_ptr() as *mut u16).write_unaligned(first_byte); - *chunk2.get_unchecked_mut(2) = bytes[2]; - - let bytes = element_3.to_le_bytes(); - let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); - (chunk3.as_mut_ptr() as *mut u16).write_unaligned(first_byte); - *chunk3.get_unchecked_mut(2) = bytes[2]; + let vld = _mm_set1_epi32(ROUNDING_CONST); + + let (row0_ref, rest) = dst.split_at_mut(dst_stride); + let (row1_ref, rest) = rest.split_at_mut(dst_stride); + let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); + + let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS); + let iter_row1 = row1_ref.chunks_exact_mut(CHANNELS); + let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS); + let iter_row3 = row3_ref.chunks_exact_mut(CHANNELS); + + for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 + .zip(iter_row1) + .zip(iter_row2) + .zip(iter_row3) + .zip(filter_weights.bounds.iter()) + .zip( + filter_weights + .weights + .chunks_exact(filter_weights.aligned_size), + ) + { + let mut jx = 0usize; + let mut store_0 = vld; + let mut store_1 = vld; + let mut store_2 = vld; + let mut store_3 = vld; + + let src0 = src; + let src1 = src0.get_unchecked(src_stride..); + let src2 = src1.get_unchecked(src_stride..); + let src3 = src2.get_unchecked(src_stride..); + + // Will make step in 4 items however since it is RGB it is necessary to make a safe offset + while jx + 4 < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 4)); + let weights = _mm_loadu_si64(w_ptr.as_ptr() as *const u8); + const SHUFFLE_01: i32 = shuffle(0, 0, 0, 0); + let weight01 = _mm_shuffle_epi32::(weights); + const SHUFFLE_23: i32 = shuffle(1, 1, 1, 1); + let weight23 = _mm_shuffle_epi32::(weights); + let bounds_start = (bounds.start + jx) * CHANNELS; + + let rgb_pixel_0 = load_rgb_x4(src0.get_unchecked(bounds_start..)); + let rgb_pixel_1 = load_rgb_x4(src1.get_unchecked(bounds_start..)); + let rgb_pixel_2 = load_rgb_x4(src2.get_unchecked(bounds_start..)); + let rgb_pixel_4 = load_rgb_x4(src3.get_unchecked(bounds_start..)); + + let hi_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_hi); + let lo_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_lo); + let hi_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_hi); + let lo_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_lo); + let hi_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_hi); + let lo_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_lo); + let hi_3 = _mm_shuffle_epi8(rgb_pixel_4, shuffle_hi); + let lo_3 = _mm_shuffle_epi8(rgb_pixel_4, shuffle_lo); + + store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(lo_0, weight01)); + store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(hi_0, weight23)); + + store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo_1, weight01)); + store_1 = _mm_add_epi32(store_1, 
_mm_madd_epi16(hi_1, weight23)); + + store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo_2, weight01)); + store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(hi_2, weight23)); + + store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo_3, weight01)); + store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(hi_3, weight23)); + jx += 4; + } + + while jx + 2 < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 2)); + let bounds_start = (bounds.start + jx) * CHANNELS; + let weight01 = _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()); + + let rgb_pixel_0 = load_rgb_x2(src0.get_unchecked(bounds_start..)); + let rgb_pixel_1 = load_rgb_x2(src1.get_unchecked(bounds_start..)); + let rgb_pixel_2 = load_rgb_x2(src2.get_unchecked(bounds_start..)); + let rgb_pixel_4 = load_rgb_x2(src3.get_unchecked(bounds_start..)); + + let lo_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_lo); + let lo_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_lo); + let lo_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_lo); + let lo_3 = _mm_shuffle_epi8(rgb_pixel_4, shuffle_lo); + + store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(lo_0, weight01)); + store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo_1, weight01)); + store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo_2, weight01)); + store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo_3, weight01)); + + jx += 2; } + + while jx < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 1)); + let bounds_start = bounds.start + jx; + + let weight0 = _mm_set1_epi32(w_ptr.as_ptr().read_unaligned() as i32); + + store_0 = convolve_horizontal_parts_one_sse_rgb(bounds_start, src0, weight0, store_0); + store_1 = convolve_horizontal_parts_one_sse_rgb(bounds_start, src1, weight0, store_1); + store_2 = convolve_horizontal_parts_one_sse_rgb(bounds_start, src2, weight0, store_2); + store_3 = convolve_horizontal_parts_one_sse_rgb(bounds_start, src3, weight0, store_3); + jx += 1; + } + + let store_0_8 = compress_i32(store_0); + let store_1_8 = compress_i32(store_1); + let store_2_8 = compress_i32(store_2); + let store_3_8 = compress_i32(store_3); + + let element_0 = _mm_extract_epi32::<0>(store_0_8); + let element_1 = _mm_extract_epi32::<0>(store_1_8); + let element_2 = _mm_extract_epi32::<0>(store_2_8); + let element_3 = _mm_extract_epi32::<0>(store_3_8); + + let bytes = element_0.to_le_bytes(); + let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); + (chunk0.as_mut_ptr() as *mut u16).write_unaligned(first_byte); + *chunk0.get_unchecked_mut(2) = bytes[2]; + + let bytes = element_1.to_le_bytes(); + let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); + (chunk1.as_mut_ptr() as *mut u16).write_unaligned(first_byte); + *chunk1.get_unchecked_mut(2) = bytes[2]; + + let bytes = element_2.to_le_bytes(); + let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); + (chunk2.as_mut_ptr() as *mut u16).write_unaligned(first_byte); + *chunk2.get_unchecked_mut(2) = bytes[2]; + + let bytes = element_3.to_le_bytes(); + let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); + (chunk3.as_mut_ptr() as *mut u16).write_unaligned(first_byte); + *chunk3.get_unchecked_mut(2) = bytes[2]; } } @@ -261,71 +255,68 @@ unsafe fn convolve_horizontal_rgb_sse_row_one_impl( dst: &mut [u8], filter_weights: &FilterWeights, ) { - unsafe { - const CHANNELS: usize = 3; - - let shuffle_lo = _mm_setr_epi8(0, -1, 3, -1, 1, -1, 4, -1, 2, -1, 5, -1, -1, -1, -1, -1); - - let shuffle_hi = _mm_setr_epi8(6, -1, 9, -1, 7, -1, 10, -1, 8, -1, 11, -1, -1, -1, -1, -1); - - for ((dst, bounds), weights) in dst - .chunks_exact_mut(CHANNELS) - 
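
// --- Reference sketch: the three-byte RGB store above ---
// Each destination chunk is CHANNELS = 3 bytes wide, so a full 4-byte store of the
// packed lane would spill into the next pixel (or past the end of the row). The
// kernel therefore extracts the low 32-bit lane and writes the first two bytes as one
// u16 plus the third byte on its own. The same idea in safe code, with `store_rgb3`
// as an illustrative name (`chunk` must be at least 3 bytes long):
fn store_rgb3(chunk: &mut [u8], packed: i32) {
    let bytes = packed.to_le_bytes();
    chunk[..2].copy_from_slice(&bytes[..2]); // two-byte write
    chunk[2] = bytes[2];                     // plus one byte: touches exactly chunk[0..3]
}
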
.zip(filter_weights.bounds.iter()) - .zip( - filter_weights - .weights - .chunks_exact(filter_weights.aligned_size), - ) - { - let bounds_size = bounds.size; - let mut jx = 0usize; - let mut store = _mm_setzero_si128(); - - while jx + 4 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 4)); - let weights = _mm_loadu_si64(w_ptr.as_ptr() as *const u8); - const SHUFFLE_01: i32 = shuffle(0, 0, 0, 0); - let weight01 = _mm_shuffle_epi32::(weights); - const SHUFFLE_23: i32 = shuffle(1, 1, 1, 1); - let weight23 = _mm_shuffle_epi32::(weights); - let bounds_start = bounds.start + jx; - let src_ptr_0 = src.get_unchecked((bounds_start * CHANNELS)..); - - let rgb_pixel = load_rgb_x4(src_ptr_0); - - let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi); - let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); - - store = _mm_add_epi32(store, _mm_madd_epi16(lo, weight01)); - store = _mm_add_epi32(store, _mm_madd_epi16(hi, weight23)); - jx += 4; - } - - while jx + 2 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 2)); - let weight0 = _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()); - let src_ptr = src.get_unchecked(((bounds.start + jx) * 3)..); - let rgb_pixel = load_rgb_x2(src_ptr); - let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); - store = _mm_add_epi32(store, _mm_madd_epi16(lo, weight0)); - jx += 2; - } - - while jx < bounds_size { - let w_ptr = weights.get_unchecked(jx..(jx + 1)); - let weight0 = _mm_set1_epi32(w_ptr.as_ptr().read_unaligned() as i32); - store = - convolve_horizontal_parts_one_sse_rgb(bounds.start + jx, src, weight0, store); - jx += 1; - } - - let store_16_8 = compress_i32(store); - - let element = _mm_extract_epi32::<0>(store_16_8); - let bytes = element.to_le_bytes(); - let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); - (dst.as_mut_ptr() as *mut u16).write_unaligned(first_byte); - *dst.get_unchecked_mut(2) = bytes[2]; + const CHANNELS: usize = 3; + + let shuffle_lo = _mm_setr_epi8(0, -1, 3, -1, 1, -1, 4, -1, 2, -1, 5, -1, -1, -1, -1, -1); + + let shuffle_hi = _mm_setr_epi8(6, -1, 9, -1, 7, -1, 10, -1, 8, -1, 11, -1, -1, -1, -1, -1); + + for ((dst, bounds), weights) in dst + .chunks_exact_mut(CHANNELS) + .zip(filter_weights.bounds.iter()) + .zip( + filter_weights + .weights + .chunks_exact(filter_weights.aligned_size), + ) + { + let bounds_size = bounds.size; + let mut jx = 0usize; + let mut store = _mm_setzero_si128(); + + while jx + 4 < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 4)); + let weights = _mm_loadu_si64(w_ptr.as_ptr() as *const u8); + const SHUFFLE_01: i32 = shuffle(0, 0, 0, 0); + let weight01 = _mm_shuffle_epi32::(weights); + const SHUFFLE_23: i32 = shuffle(1, 1, 1, 1); + let weight23 = _mm_shuffle_epi32::(weights); + let bounds_start = bounds.start + jx; + let src_ptr_0 = src.get_unchecked((bounds_start * CHANNELS)..); + + let rgb_pixel = load_rgb_x4(src_ptr_0); + + let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi); + let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); + + store = _mm_add_epi32(store, _mm_madd_epi16(lo, weight01)); + store = _mm_add_epi32(store, _mm_madd_epi16(hi, weight23)); + jx += 4; + } + + while jx + 2 < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 2)); + let weight0 = _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()); + let src_ptr = src.get_unchecked(((bounds.start + jx) * 3)..); + let rgb_pixel = load_rgb_x2(src_ptr); + let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); + store = _mm_add_epi32(store, _mm_madd_epi16(lo, weight0)); + jx += 2; } + + while jx < 
bounds_size { + let w_ptr = weights.get_unchecked(jx..(jx + 1)); + let weight0 = _mm_set1_epi32(w_ptr.as_ptr().read_unaligned() as i32); + store = convolve_horizontal_parts_one_sse_rgb(bounds.start + jx, src, weight0, store); + jx += 1; + } + + let store_16_8 = compress_i32(store); + + let element = _mm_extract_epi32::<0>(store_16_8); + let bytes = element.to_le_bytes(); + let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); + (dst.as_mut_ptr() as *mut u16).write_unaligned(first_byte); + *dst.get_unchecked_mut(2) = bytes[2]; } } diff --git a/src/sse/rgba_f16.rs b/src/sse/rgba_f16.rs index d032a21..5e367d2 100644 --- a/src/sse/rgba_f16.rs +++ b/src/sse/rgba_f16.rs @@ -49,8 +49,7 @@ unsafe fn convolve_horizontal_parts_one_rgba_f16(rgb_pixel); - let acc = _mm_prefer_fma_ps::(store_0, pixels, weight0); - acc + _mm_prefer_fma_ps::(store_0, pixels, weight0) } #[inline(always)] @@ -77,8 +76,7 @@ unsafe fn convolve_horizontal_parts_4_rgba_f16(store_0, rgb_pixel_0, weight0); let acc = _mm_prefer_fma_ps::(acc, rgb_pixel_1, weight1); let acc = _mm_prefer_fma_ps::(acc, rgb_pixel_2, weight2); - let acc = _mm_prefer_fma_ps::(acc, rgb_pixel_3, weight3); - acc + _mm_prefer_fma_ps::(acc, rgb_pixel_3, weight3) } #[inline(always)] @@ -95,12 +93,11 @@ unsafe fn convolve_horizontal_parts_2_rgba_f16(store_0, _mm_cvtph_psx::(rgb_pixels), weight0); - let acc = _mm_prefer_fma_ps::( + _mm_prefer_fma_ps::( acc, _mm_cvtph_psx::(_mm_srli_si128::<8>(rgb_pixels)), weight1, - ); - acc + ) } pub(crate) fn convolve_horizontal_rgba_sse_row_one_f16( diff --git a/src/sse/rgba_f32.rs b/src/sse/rgba_f32.rs index 34beb03..b5dc978 100644 --- a/src/sse/rgba_f32.rs +++ b/src/sse/rgba_f32.rs @@ -119,71 +119,65 @@ unsafe fn convolve_horizontal_rgba_sse_row_one_f32_impl( src: &[f32], dst: &mut [f32], ) { - unsafe { - const CHANNELS: usize = 4; - let mut filter_offset = 0usize; - let weights_ptr = filter_weights.weights.as_ptr(); - - for x in 0..dst_width { - let bounds = filter_weights.bounds.get_unchecked(x); - let mut jx = 0usize; - let mut store = _mm_setzero_ps(); - - while jx + 4 < bounds.size { - let ptr = weights_ptr.add(jx + filter_offset); - let (weight0, weight1, weight2, weight3) = load_4_weights!(ptr); - let filter_start = jx + bounds.start; - store = convolve_horizontal_parts_4_rgba_f32::( - filter_start, - src, - weight0, - weight1, - weight2, - weight3, - store, - ); - jx += 4; - } - - while jx + 2 < bounds.size { - let ptr = weights_ptr.add(jx + filter_offset); - let weights = _mm_castsi128_ps(_mm_loadu_si64(ptr as *const u8)); - const SHUFFLE_0: i32 = shuffle(0, 0, 0, 0); - let weight0 = - _mm_castsi128_ps(_mm_shuffle_epi32::(_mm_castps_si128(weights))); - const SHUFFLE_1: i32 = shuffle(1, 1, 1, 1); - let weight1 = - _mm_castsi128_ps(_mm_shuffle_epi32::(_mm_castps_si128(weights))); - let filter_start = jx + bounds.start; - store = convolve_horizontal_parts_2_rgba_f32::( - filter_start, - src, - weight0, - weight1, - store, - ); - jx += 2 - } - - while jx < bounds.size { - let ptr = weights_ptr.add(jx + filter_offset); - let weight0 = _mm_load1_ps(ptr); - let filter_start = jx + bounds.start; - store = convolve_horizontal_parts_one_rgba_f32::( - filter_start, - src, - weight0, - store, - ); - jx += 1; - } - - let px = x * CHANNELS; - let dest_ptr = dst.get_unchecked_mut(px..); - _mm_storeu_ps(dest_ptr.as_mut_ptr(), store); - - filter_offset += filter_weights.aligned_size; + const CHANNELS: usize = 4; + let mut filter_offset = 0usize; + let weights_ptr = filter_weights.weights.as_ptr(); + + for x in 0..dst_width { 
+ let bounds = filter_weights.bounds.get_unchecked(x); + let mut jx = 0usize; + let mut store = _mm_setzero_ps(); + + while jx + 4 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let (weight0, weight1, weight2, weight3) = load_4_weights!(ptr); + let filter_start = jx + bounds.start; + store = convolve_horizontal_parts_4_rgba_f32::( + filter_start, + src, + weight0, + weight1, + weight2, + weight3, + store, + ); + jx += 4; } + + while jx + 2 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weights = _mm_castsi128_ps(_mm_loadu_si64(ptr as *const u8)); + const SHUFFLE_0: i32 = shuffle(0, 0, 0, 0); + let weight0 = + _mm_castsi128_ps(_mm_shuffle_epi32::(_mm_castps_si128(weights))); + const SHUFFLE_1: i32 = shuffle(1, 1, 1, 1); + let weight1 = + _mm_castsi128_ps(_mm_shuffle_epi32::(_mm_castps_si128(weights))); + let filter_start = jx + bounds.start; + store = convolve_horizontal_parts_2_rgba_f32::( + filter_start, + src, + weight0, + weight1, + store, + ); + jx += 2 + } + + while jx < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = _mm_load1_ps(ptr); + let filter_start = jx + bounds.start; + store = + convolve_horizontal_parts_one_rgba_f32::(filter_start, src, weight0, store); + jx += 1; + } + + let px = x * CHANNELS; + let dest_ptr = dst.get_unchecked_mut(px..); + _mm_storeu_ps(dest_ptr.as_mut_ptr(), store); + + filter_offset += filter_weights.aligned_size; } } diff --git a/src/sse/rgba_u8.rs b/src/sse/rgba_u8.rs index c746c33..1f3dc99 100644 --- a/src/sse/rgba_u8.rs +++ b/src/sse/rgba_u8.rs @@ -72,10 +72,9 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_impl( dst_stride: usize, filter_weights: &FilterWeights, ) { - unsafe { - const CHANNELS: usize = 4; + const CHANNELS: usize = 4; - #[rustfmt::skip] + #[rustfmt::skip] let shuffle_lo = _mm_setr_epi8(0, -1, 4, -1, 1, -1, @@ -85,7 +84,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_impl( 3, -1, 7, -1); - #[rustfmt::skip] + #[rustfmt::skip] let shuffle_hi = _mm_setr_epi8(8, -1, 12, -1, 9, -1, @@ -95,152 +94,147 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_impl( 11, -1, 15, -1); - let vld = _mm_set1_epi32(ROUNDING_CONST); - - let (row0_ref, rest) = dst.split_at_mut(dst_stride); - let (row1_ref, rest) = rest.split_at_mut(dst_stride); - let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); - - let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS); - let iter_row1 = row1_ref.chunks_exact_mut(CHANNELS); - let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS); - let iter_row3 = row3_ref.chunks_exact_mut(CHANNELS); - - for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 - .zip(iter_row1) - .zip(iter_row2) - .zip(iter_row3) - .zip(filter_weights.bounds.iter()) - .zip( - filter_weights - .weights - .chunks_exact(filter_weights.aligned_size), - ) - { - let mut jx = 0usize; - let mut store_0 = vld; - let mut store_1 = vld; - let mut store_2 = vld; - let mut store_3 = vld; - - let src0 = src; - let src1 = src0.get_unchecked(src_stride..); - let src2 = src1.get_unchecked(src_stride..); - let src3 = src2.get_unchecked(src_stride..); - - while jx + 4 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 4)); - let weights = _mm_loadu_si64(w_ptr.as_ptr() as *const u8); - const SHUFFLE_01: i32 = shuffle(0, 0, 0, 0); - let weight01 = _mm_shuffle_epi32::(weights); - const SHUFFLE_23: i32 = shuffle(1, 1, 1, 1); - let weight23 = _mm_shuffle_epi32::(weights); - let start_bounds = bounds.start + jx; - - let rgb_pixel_0 = _mm_loadu_si128( - 
src0.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - let rgb_pixel_1 = _mm_loadu_si128( - src1.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - let rgb_pixel_2 = _mm_loadu_si128( - src2.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - let rgb_pixel_3 = _mm_loadu_si128( - src3.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - - let hi_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_hi); - let lo_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_lo); - let hi_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_hi); - let lo_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_lo); - let hi_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_hi); - let lo_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_lo); - let hi_3 = _mm_shuffle_epi8(rgb_pixel_3, shuffle_hi); - let lo_3 = _mm_shuffle_epi8(rgb_pixel_3, shuffle_lo); - - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(lo_0, weight01)); - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(hi_0, weight23)); - - store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo_1, weight01)); - store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(hi_1, weight23)); - - store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo_2, weight01)); - store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(hi_2, weight23)); - - store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo_3, weight01)); - store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(hi_3, weight23)); - jx += 4; - } - - while jx + 2 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 2)); - let bounds_start = bounds.start + jx; - - let weight01 = _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()); - - let rgb_pixel_0 = - _mm_loadu_si64(src0.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - let rgb_pixel_1 = - _mm_loadu_si64(src1.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - let rgb_pixel_2 = - _mm_loadu_si64(src2.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - let rgb_pixel_3 = - _mm_loadu_si64(src3.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - - let lo_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_lo); - let lo_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_lo); - let lo_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_lo); - let lo_3 = _mm_shuffle_epi8(rgb_pixel_3, shuffle_lo); - - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(lo_0, weight01)); - store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo_1, weight01)); - store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo_2, weight01)); - store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo_3, weight01)); - - jx += 2; - } - - while jx < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 1)); - - let weight0 = _mm_set1_epi32(w_ptr[0] as i32); - - let start_bounds = bounds.start + jx; - - store_0 = - convolve_horizontal_parts_one_rgba_sse(start_bounds, src0, weight0, store_0); - store_1 = - convolve_horizontal_parts_one_rgba_sse(start_bounds, src1, weight0, store_1); - store_2 = - convolve_horizontal_parts_one_rgba_sse(start_bounds, src2, weight0, store_2); - store_3 = - convolve_horizontal_parts_one_rgba_sse(start_bounds, src3, weight0, store_3); - jx += 1; - } - - let store_16_8_0 = compress_i32(store_0); - let store_16_8_1 = compress_i32(store_1); - let store_16_8_2 = compress_i32(store_2); - let store_16_8_3 = compress_i32(store_3); - - _mm_storeu_si32( - chunk0.as_mut_ptr() as *mut _, - _mm_packus_epi16(store_16_8_0, store_16_8_0), + let vld = _mm_set1_epi32(ROUNDING_CONST); + + let (row0_ref, rest) = dst.split_at_mut(dst_stride); + let (row1_ref, rest) = 
rest.split_at_mut(dst_stride); + let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); + + let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS); + let iter_row1 = row1_ref.chunks_exact_mut(CHANNELS); + let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS); + let iter_row3 = row3_ref.chunks_exact_mut(CHANNELS); + + for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 + .zip(iter_row1) + .zip(iter_row2) + .zip(iter_row3) + .zip(filter_weights.bounds.iter()) + .zip( + filter_weights + .weights + .chunks_exact(filter_weights.aligned_size), + ) + { + let mut jx = 0usize; + let mut store_0 = vld; + let mut store_1 = vld; + let mut store_2 = vld; + let mut store_3 = vld; + + let src0 = src; + let src1 = src0.get_unchecked(src_stride..); + let src2 = src1.get_unchecked(src_stride..); + let src3 = src2.get_unchecked(src_stride..); + + while jx + 4 < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 4)); + let weights = _mm_loadu_si64(w_ptr.as_ptr() as *const u8); + const SHUFFLE_01: i32 = shuffle(0, 0, 0, 0); + let weight01 = _mm_shuffle_epi32::(weights); + const SHUFFLE_23: i32 = shuffle(1, 1, 1, 1); + let weight23 = _mm_shuffle_epi32::(weights); + let start_bounds = bounds.start + jx; + + let rgb_pixel_0 = _mm_loadu_si128( + src0.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, ); - _mm_storeu_si32( - chunk1.as_mut_ptr() as *mut _, - _mm_packus_epi16(store_16_8_1, store_16_8_1), + let rgb_pixel_1 = _mm_loadu_si128( + src1.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, ); - _mm_storeu_si32( - chunk2.as_mut_ptr() as *mut _, - _mm_packus_epi16(store_16_8_2, store_16_8_2), + let rgb_pixel_2 = _mm_loadu_si128( + src2.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, ); - _mm_storeu_si32( - chunk3.as_mut_ptr() as *mut _, - _mm_packus_epi16(store_16_8_3, store_16_8_3), + let rgb_pixel_3 = _mm_loadu_si128( + src3.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, ); + + let hi_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_hi); + let lo_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_lo); + let hi_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_hi); + let lo_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_lo); + let hi_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_hi); + let lo_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_lo); + let hi_3 = _mm_shuffle_epi8(rgb_pixel_3, shuffle_hi); + let lo_3 = _mm_shuffle_epi8(rgb_pixel_3, shuffle_lo); + + store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(lo_0, weight01)); + store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(hi_0, weight23)); + + store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo_1, weight01)); + store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(hi_1, weight23)); + + store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo_2, weight01)); + store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(hi_2, weight23)); + + store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo_3, weight01)); + store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(hi_3, weight23)); + jx += 4; + } + + while jx + 2 < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 2)); + let bounds_start = bounds.start + jx; + + let weight01 = _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()); + + let rgb_pixel_0 = + _mm_loadu_si64(src0.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); + let rgb_pixel_1 = + _mm_loadu_si64(src1.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); + let rgb_pixel_2 = + _mm_loadu_si64(src2.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); + let rgb_pixel_3 = + 
_mm_loadu_si64(src3.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); + + let lo_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_lo); + let lo_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_lo); + let lo_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_lo); + let lo_3 = _mm_shuffle_epi8(rgb_pixel_3, shuffle_lo); + + store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(lo_0, weight01)); + store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo_1, weight01)); + store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo_2, weight01)); + store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo_3, weight01)); + + jx += 2; + } + + while jx < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 1)); + + let weight0 = _mm_set1_epi32(w_ptr[0] as i32); + + let start_bounds = bounds.start + jx; + + store_0 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src0, weight0, store_0); + store_1 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src1, weight0, store_1); + store_2 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src2, weight0, store_2); + store_3 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src3, weight0, store_3); + jx += 1; } + + let store_16_8_0 = compress_i32(store_0); + let store_16_8_1 = compress_i32(store_1); + let store_16_8_2 = compress_i32(store_2); + let store_16_8_3 = compress_i32(store_3); + + _mm_storeu_si32( + chunk0.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_0, store_16_8_0), + ); + _mm_storeu_si32( + chunk1.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_1, store_16_8_1), + ); + _mm_storeu_si32( + chunk2.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_2, store_16_8_2), + ); + _mm_storeu_si32( + chunk3.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_3, store_16_8_3), + ); } } diff --git a/src/sse/rgba_u8_lb.rs b/src/sse/rgba_u8_lb.rs index e5b1934..8590618 100644 --- a/src/sse/rgba_u8_lb.rs +++ b/src/sse/rgba_u8_lb.rs @@ -112,254 +112,240 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_impl( dst_stride: usize, filter_weights: &FilterWeights, ) { - unsafe { - const CHANNELS: usize = 4; - - const SCALE: i32 = 6; - const V_SHR: i32 = SCALE; - const ROUNDING: i16 = 1 << (V_SHR - 1); - - let vld = _mm_setr_epi16(ROUNDING, ROUNDING, ROUNDING, ROUNDING, 0, 0, 0, 0); - - let shuffle_weights = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3); - - let (row0_ref, rest) = dst.split_at_mut(dst_stride); - let (row1_ref, rest) = rest.split_at_mut(dst_stride); - let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); - - let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS); - let iter_row1 = row1_ref.chunks_exact_mut(CHANNELS); - let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS); - let iter_row3 = row3_ref.chunks_exact_mut(CHANNELS); - - for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 - .zip(iter_row1) - .zip(iter_row2) - .zip(iter_row3) - .zip(filter_weights.bounds.iter()) - .zip( - filter_weights - .weights - .chunks_exact(filter_weights.aligned_size), - ) - { - let mut jx = 0usize; - let mut store_0 = vld; - let mut store_1 = vld; - let mut store_2 = vld; - let mut store_3 = vld; - - let src0 = src; - let src1 = src0.get_unchecked(src_stride..); - let src2 = src1.get_unchecked(src_stride..); - let src3 = src2.get_unchecked(src_stride..); - - while jx + 8 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 8)); - - let weight01 = _mm_shuffle_epi8( - _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()), - shuffle_weights, - ); - let weight23 = _mm_shuffle_epi8( - _mm_set1_epi32( - 
(w_ptr.get_unchecked(2..).as_ptr() as *const i32).read_unaligned(), - ), - shuffle_weights, - ); - - let weight45 = _mm_shuffle_epi8( - _mm_set1_epi32( - (w_ptr.get_unchecked(4..).as_ptr() as *const i32).read_unaligned(), - ), - shuffle_weights, - ); - - let weight67 = _mm_shuffle_epi8( - _mm_set1_epi32( - (w_ptr.get_unchecked(6..).as_ptr() as *const i32).read_unaligned(), - ), - shuffle_weights, - ); - - let start_bounds = bounds.start + jx; - - let rgb_pixel_0 = _mm_loadu_si128( - src0.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - let rgb_pixel_0_1 = _mm_loadu_si128( - src0.get_unchecked((start_bounds * CHANNELS + 16)..) - .as_ptr() as *const __m128i, - ); - let rgb_pixel_1 = _mm_loadu_si128( - src1.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - let rgb_pixel_1_0 = _mm_loadu_si128( - src1.get_unchecked((start_bounds * CHANNELS + 16)..) - .as_ptr() as *const __m128i, - ); - let rgb_pixel_2 = _mm_loadu_si128( - src2.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - let rgb_pixel_2_1 = _mm_loadu_si128( - src2.get_unchecked((start_bounds * CHANNELS + 16)..) - .as_ptr() as *const __m128i, - ); - let rgb_pixel_3 = _mm_loadu_si128( - src3.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - let rgb_pixel_3_1 = _mm_loadu_si128( - src3.get_unchecked((start_bounds * CHANNELS + 16)..) - .as_ptr() as *const __m128i, - ); - - store_0 = hdot4( - store_0, - rgb_pixel_0, - rgb_pixel_0_1, - weight01, - weight23, - weight45, - weight67, - ); - store_1 = hdot4( - store_1, - rgb_pixel_1, - rgb_pixel_1_0, - weight01, - weight23, - weight45, - weight67, - ); - store_2 = hdot4( - store_2, - rgb_pixel_2, - rgb_pixel_2_1, - weight01, - weight23, - weight45, - weight67, - ); - store_3 = hdot4( - store_3, - rgb_pixel_3, - rgb_pixel_3_1, - weight01, - weight23, - weight45, - weight67, - ); - - jx += 8; - } - - while jx + 4 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 4)); - - let weight01 = _mm_shuffle_epi8( - _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()), - shuffle_weights, - ); - let weight23 = _mm_shuffle_epi8( - _mm_set1_epi32( - (w_ptr.get_unchecked(2..).as_ptr() as *const i32).read_unaligned(), - ), - shuffle_weights, - ); - let start_bounds = bounds.start + jx; - - let rgb_pixel_0 = _mm_loadu_si128( - src0.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - let rgb_pixel_1 = _mm_loadu_si128( - src1.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - let rgb_pixel_2 = _mm_loadu_si128( - src2.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - let rgb_pixel_3 = _mm_loadu_si128( - src3.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - - store_0 = hdot2(store_0, rgb_pixel_0, weight01, weight23); - store_1 = hdot2(store_1, rgb_pixel_1, weight01, weight23); - store_2 = hdot2(store_2, rgb_pixel_2, weight01, weight23); - store_3 = hdot2(store_3, rgb_pixel_3, weight01, weight23); - - jx += 4; - } - - while jx + 2 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 2)); - let bounds_start = bounds.start + jx; - - let weight01 = _mm_shuffle_epi8( - _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()), - shuffle_weights, - ); - - let rgb_pixel_0 = - _mm_loadu_si64(src0.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - let rgb_pixel_1 = - _mm_loadu_si64(src1.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - let rgb_pixel_2 = - 
-                    _mm_loadu_si64(src2.get_unchecked((bounds_start * CHANNELS)..).as_ptr());
-                let rgb_pixel_3 =
-                    _mm_loadu_si64(src3.get_unchecked((bounds_start * CHANNELS)..).as_ptr());
-
-                store_0 = hdot(store_0, rgb_pixel_0, weight01);
-                store_1 = hdot(store_1, rgb_pixel_1, weight01);
-                store_2 = hdot(store_2, rgb_pixel_2, weight01);
-                store_3 = hdot(store_3, rgb_pixel_3, weight01);
-
-                jx += 2;
-            }
-
-            while jx < bounds.size {
-                let w_ptr = weights.get_unchecked(jx..(jx + 1));
-
-                let weight0 = _mm_set1_epi16(w_ptr[0]);
-
-                let start_bounds = bounds.start + jx;
-
-                store_0 =
-                    convolve_horizontal_parts_one_rgba_sse(start_bounds, src0, weight0, store_0);
-                store_1 =
-                    convolve_horizontal_parts_one_rgba_sse(start_bounds, src1, weight0, store_1);
-                store_2 =
-                    convolve_horizontal_parts_one_rgba_sse(start_bounds, src2, weight0, store_2);
-                store_3 =
-                    convolve_horizontal_parts_one_rgba_sse(start_bounds, src3, weight0, store_3);
-                jx += 1;
-            }
-
-            store_0 = _mm_add_hi_lo_epi16(store_0);
-            store_1 = _mm_add_hi_lo_epi16(store_1);
-            store_2 = _mm_add_hi_lo_epi16(store_2);
-            store_3 = _mm_add_hi_lo_epi16(store_3);
-
-            let store_16_8_0 = _mm_srai_epi16::<V_SHR>(store_0);
-            let store_16_8_1 = _mm_srai_epi16::<V_SHR>(store_1);
-            let store_16_8_2 = _mm_srai_epi16::<V_SHR>(store_2);
-            let store_16_8_3 = _mm_srai_epi16::<V_SHR>(store_3);
-
-            _mm_storeu_si32(
-                chunk0.as_mut_ptr() as *mut _,
-                _mm_packus_epi16(store_16_8_0, store_16_8_0),
-            );
-            _mm_storeu_si32(
-                chunk1.as_mut_ptr() as *mut _,
-                _mm_packus_epi16(store_16_8_1, store_16_8_1),
-            );
-            _mm_storeu_si32(
-                chunk2.as_mut_ptr() as *mut _,
-                _mm_packus_epi16(store_16_8_2, store_16_8_2),
-            );
-            _mm_storeu_si32(
-                chunk3.as_mut_ptr() as *mut _,
-                _mm_packus_epi16(store_16_8_3, store_16_8_3),
-            );
-        }
+    const CHANNELS: usize = 4;
+
+    const SCALE: i32 = 6;
+    const V_SHR: i32 = SCALE;
+    const ROUNDING: i16 = 1 << (V_SHR - 1);
+
+    let vld = _mm_setr_epi16(ROUNDING, ROUNDING, ROUNDING, ROUNDING, 0, 0, 0, 0);
+
+    let shuffle_weights = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3);
+
+    let (row0_ref, rest) = dst.split_at_mut(dst_stride);
+    let (row1_ref, rest) = rest.split_at_mut(dst_stride);
+    let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride);
+
+    let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS);
+    let iter_row1 = row1_ref.chunks_exact_mut(CHANNELS);
+    let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS);
+    let iter_row3 = row3_ref.chunks_exact_mut(CHANNELS);
+
+    for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0
+        .zip(iter_row1)
+        .zip(iter_row2)
+        .zip(iter_row3)
+        .zip(filter_weights.bounds.iter())
+        .zip(
+            filter_weights
+                .weights
+                .chunks_exact(filter_weights.aligned_size),
+        )
+    {
+        let mut jx = 0usize;
+        let mut store_0 = vld;
+        let mut store_1 = vld;
+        let mut store_2 = vld;
+        let mut store_3 = vld;
+
+        let src0 = src;
+        let src1 = src0.get_unchecked(src_stride..);
+        let src2 = src1.get_unchecked(src_stride..);
+        let src3 = src2.get_unchecked(src_stride..);
+
+        while jx + 8 < bounds.size {
+            let w_ptr = weights.get_unchecked(jx..(jx + 8));
+
+            let weight01 = _mm_shuffle_epi8(
+                _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()),
+                shuffle_weights,
+            );
+            let weight23 = _mm_shuffle_epi8(
+                _mm_set1_epi32((w_ptr.get_unchecked(2..).as_ptr() as *const i32).read_unaligned()),
+                shuffle_weights,
+            );
+
+            let weight45 = _mm_shuffle_epi8(
+                _mm_set1_epi32((w_ptr.get_unchecked(4..).as_ptr() as *const i32).read_unaligned()),
+                shuffle_weights,
+            );
+
+            let weight67 = _mm_shuffle_epi8(
+                _mm_set1_epi32((w_ptr.get_unchecked(6..).as_ptr() as *const i32).read_unaligned()),
+                shuffle_weights,
+            );
+
+            let start_bounds = bounds.start + jx;
+
+            let rgb_pixel_0 = _mm_loadu_si128(
+                src0.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i,
+            );
+            let rgb_pixel_0_1 = _mm_loadu_si128(
+                src0.get_unchecked((start_bounds * CHANNELS + 16)..)
+                    .as_ptr() as *const __m128i,
+            );
+            let rgb_pixel_1 = _mm_loadu_si128(
+                src1.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i,
+            );
+            let rgb_pixel_1_0 = _mm_loadu_si128(
+                src1.get_unchecked((start_bounds * CHANNELS + 16)..)
+                    .as_ptr() as *const __m128i,
+            );
+            let rgb_pixel_2 = _mm_loadu_si128(
+                src2.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i,
+            );
+            let rgb_pixel_2_1 = _mm_loadu_si128(
+                src2.get_unchecked((start_bounds * CHANNELS + 16)..)
+                    .as_ptr() as *const __m128i,
+            );
+            let rgb_pixel_3 = _mm_loadu_si128(
+                src3.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i,
+            );
+            let rgb_pixel_3_1 = _mm_loadu_si128(
+                src3.get_unchecked((start_bounds * CHANNELS + 16)..)
+                    .as_ptr() as *const __m128i,
+            );
+
+            store_0 = hdot4(
+                store_0,
+                rgb_pixel_0,
+                rgb_pixel_0_1,
+                weight01,
+                weight23,
+                weight45,
+                weight67,
+            );
+            store_1 = hdot4(
+                store_1,
+                rgb_pixel_1,
+                rgb_pixel_1_0,
+                weight01,
+                weight23,
+                weight45,
+                weight67,
+            );
+            store_2 = hdot4(
+                store_2,
+                rgb_pixel_2,
+                rgb_pixel_2_1,
+                weight01,
+                weight23,
+                weight45,
+                weight67,
+            );
+            store_3 = hdot4(
+                store_3,
+                rgb_pixel_3,
+                rgb_pixel_3_1,
+                weight01,
+                weight23,
+                weight45,
+                weight67,
+            );
+
+            jx += 8;
+        }
+
+        while jx + 4 < bounds.size {
+            let w_ptr = weights.get_unchecked(jx..(jx + 4));
+
+            let weight01 = _mm_shuffle_epi8(
+                _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()),
+                shuffle_weights,
+            );
+            let weight23 = _mm_shuffle_epi8(
+                _mm_set1_epi32((w_ptr.get_unchecked(2..).as_ptr() as *const i32).read_unaligned()),
+                shuffle_weights,
+            );
+            let start_bounds = bounds.start + jx;
+
+            let rgb_pixel_0 = _mm_loadu_si128(
+                src0.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i,
+            );
+            let rgb_pixel_1 = _mm_loadu_si128(
+                src1.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i,
+            );
+            let rgb_pixel_2 = _mm_loadu_si128(
+                src2.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i,
+            );
+            let rgb_pixel_3 = _mm_loadu_si128(
+                src3.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i,
+            );
+
+            store_0 = hdot2(store_0, rgb_pixel_0, weight01, weight23);
+            store_1 = hdot2(store_1, rgb_pixel_1, weight01, weight23);
+            store_2 = hdot2(store_2, rgb_pixel_2, weight01, weight23);
+            store_3 = hdot2(store_3, rgb_pixel_3, weight01, weight23);
+
+            jx += 4;
+        }
+
+        while jx + 2 < bounds.size {
+            let w_ptr = weights.get_unchecked(jx..(jx + 2));
+            let bounds_start = bounds.start + jx;
+
+            let weight01 = _mm_shuffle_epi8(
+                _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()),
+                shuffle_weights,
+            );
+
+            let rgb_pixel_0 =
+                _mm_loadu_si64(src0.get_unchecked((bounds_start * CHANNELS)..).as_ptr());
+            let rgb_pixel_1 =
+                _mm_loadu_si64(src1.get_unchecked((bounds_start * CHANNELS)..).as_ptr());
+            let rgb_pixel_2 =
+                _mm_loadu_si64(src2.get_unchecked((bounds_start * CHANNELS)..).as_ptr());
+            let rgb_pixel_3 =
+                _mm_loadu_si64(src3.get_unchecked((bounds_start * CHANNELS)..).as_ptr());
+
+            store_0 = hdot(store_0, rgb_pixel_0, weight01);
+            store_1 = hdot(store_1, rgb_pixel_1, weight01);
+            store_2 = hdot(store_2, rgb_pixel_2, weight01);
+            store_3 = hdot(store_3, rgb_pixel_3, weight01);
+
+            jx += 2;
+        }
+
+        while jx < bounds.size {
+            let w_ptr = weights.get_unchecked(jx..(jx + 1));
+
+            let weight0 = _mm_set1_epi16(w_ptr[0]);
+
+            let start_bounds = bounds.start + jx;
+
+            store_0 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src0, weight0, store_0);
+            store_1 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src1, weight0, store_1);
+            store_2 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src2, weight0, store_2);
+            store_3 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src3, weight0, store_3);
+            jx += 1;
+        }
+
+        store_0 = _mm_add_hi_lo_epi16(store_0);
+        store_1 = _mm_add_hi_lo_epi16(store_1);
+        store_2 = _mm_add_hi_lo_epi16(store_2);
+        store_3 = _mm_add_hi_lo_epi16(store_3);
+
+        let store_16_8_0 = _mm_srai_epi16::<V_SHR>(store_0);
+        let store_16_8_1 = _mm_srai_epi16::<V_SHR>(store_1);
+        let store_16_8_2 = _mm_srai_epi16::<V_SHR>(store_2);
+        let store_16_8_3 = _mm_srai_epi16::<V_SHR>(store_3);
+
+        _mm_storeu_si32(
+            chunk0.as_mut_ptr() as *mut _,
+            _mm_packus_epi16(store_16_8_0, store_16_8_0),
+        );
+        _mm_storeu_si32(
+            chunk1.as_mut_ptr() as *mut _,
+            _mm_packus_epi16(store_16_8_1, store_16_8_1),
+        );
+        _mm_storeu_si32(
+            chunk2.as_mut_ptr() as *mut _,
+            _mm_packus_epi16(store_16_8_2, store_16_8_2),
+        );
+        _mm_storeu_si32(
+            chunk3.as_mut_ptr() as *mut _,
+            _mm_packus_epi16(store_16_8_3, store_16_8_3),
+        );
+    }
+}
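
Note for reviewers: both the SSE and AVX u8 paths touched here use the same fixed-point scheme that the constants in the hunk suggest — i16 weights with SCALE = 6 fractional bits, an accumulator pre-seeded with the rounding bias ROUNDING = 1 << (V_SHR - 1), and a final arithmetic shift right by V_SHR followed by an unsigned-saturating pack to u8. A scalar sketch of that pipeline follows; it is illustrative only (the function name and signature are not part of the crate) and it deliberately ignores the i16/i32 lane bookkeeping the SIMD code performs.

    const SCALE: i32 = 6;
    const V_SHR: i32 = SCALE;
    const ROUNDING: i32 = 1 << (V_SHR - 1);

    /// Scalar reference: one output RGBA pixel from `weights.len()` Q6 taps
    /// starting at source pixel `bounds_start`.
    fn convolve_rgba_pixel_scalar(src: &[u8], bounds_start: usize, weights: &[i16]) -> [u8; 4] {
        // Start from the rounding bias, exactly like `vld` seeds the SIMD accumulators.
        let mut acc = [ROUNDING; 4];
        for (j, &w) in weights.iter().enumerate() {
            let px = &src[(bounds_start + j) * 4..(bounds_start + j) * 4 + 4];
            for c in 0..4 {
                acc[c] += px[c] as i32 * w as i32;
            }
        }
        // `_mm_srai_epi16::<V_SHR>` plus `_mm_packus_epi16` do this shift-and-clamp in the SIMD code.
        acc.map(|v| (v >> V_SHR).clamp(0, 255) as u8)
    }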
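The rgba_u8.rs loop above folds two taps into the i32 accumulators with a single _mm_madd_epi16(lo_0, weight01). A minimal scalar model of that intrinsic (multiply the eight i16 lanes, then add adjacent pairs of 32-bit products) is below, for reference only:

    /// Scalar model of `_mm_madd_epi16`: out[k] = p[2k] * w[2k] + p[2k + 1] * w[2k + 1].
    fn madd_epi16_model(p: [i16; 8], w: [i16; 8]) -> [i32; 4] {
        let mut out = [0i32; 4];
        for k in 0..4 {
            out[k] = p[2 * k] as i32 * w[2 * k] as i32 + p[2 * k + 1] as i32 * w[2 * k + 1] as i32;
        }
        out
    }

Assuming `shuffle_lo` (defined elsewhere in rgba_u8.rs) interleaves the two neighbouring pixels channel by channel, one madd therefore yields the four per-channel partial sums for a pair of taps.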