diff --git a/.github/workflows/build_push.yml b/.github/workflows/build_push.yml index a72a34b..2fdcdcc 100644 --- a/.github/workflows/build_push.yml +++ b/.github/workflows/build_push.yml @@ -44,6 +44,17 @@ jobs: - uses: dtolnay/rust-toolchain@stable - run: cargo clippy + tests: + name: Testing + strategy: + matrix: + os: [ ubuntu-latest, macos-latest ] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@nightly + - run: cargo test + fuzz_rgba_8bit: name: Fuzzing 8bit strategy: diff --git a/app/benches/resize_rgba/main.rs b/app/benches/resize_rgba/main.rs index 3ac926c..93b8f04 100644 --- a/app/benches/resize_rgba/main.rs +++ b/app/benches/resize_rgba/main.rs @@ -4,7 +4,8 @@ use fast_image_resize::FilterType::Lanczos3; use fast_image_resize::{CpuExtensions, PixelType, ResizeAlg, ResizeOptions, Resizer}; use image::{GenericImageView, ImageReader}; use pic_scale::{ - ImageStore, ImageStoreMut, ResamplingFunction, Scaler, Scaling, ScalingF32, ThreadingPolicy, + ImageStore, ImageStoreMut, ResamplingFunction, Scaler, Scaling, ScalingF32, ScalingU16, + ThreadingPolicy, }; pub fn criterion_benchmark(c: &mut Criterion) { @@ -14,114 +15,164 @@ pub fn criterion_benchmark(c: &mut Criterion) { .unwrap(); let dimensions = img.dimensions(); let src_bytes = img.as_bytes(); - c.bench_function("Pic scale RGBA with alpha: Lanczos 3", |b| { - let mut copied: Vec = Vec::from(src_bytes); - b.iter(|| { - let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); - scaler.set_threading_policy(ThreadingPolicy::Single); - let store = ImageStore::::from_slice( - &mut copied, - dimensions.0 as usize, - dimensions.1 as usize, - ) - .unwrap(); - let mut target = - ImageStoreMut::alloc(dimensions.0 as usize / 4, dimensions.1 as usize / 4); - _ = scaler.resize_rgba(&store, &mut target, true); - }) - }); - - let f32_image: Vec = src_bytes.iter().map(|&x| x as f32 / 255f32).collect(); + // c.bench_function("Pic scale RGBA with alpha: Lanczos 3", |b| { + // let mut copied: Vec = Vec::from(src_bytes); + // b.iter(|| { + // let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); + // scaler.set_threading_policy(ThreadingPolicy::Single); + // let store = ImageStore::::from_slice( + // &mut copied, + // dimensions.0 as usize, + // dimensions.1 as usize, + // ) + // .unwrap(); + // let mut target = + // ImageStoreMut::alloc(dimensions.0 as usize / 4, dimensions.1 as usize / 4); + // _ = scaler.resize_rgba(&store, &mut target, true); + // }) + // }); + // + // let f32_image: Vec = src_bytes.iter().map(|&x| x as f32 / 255f32).collect(); + // + // c.bench_function("Pic scale RGBA with alpha f32: Lanczos 3", |b| { + // let mut copied: Vec = Vec::from(f32_image.clone()); + // b.iter(|| { + // let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); + // scaler.set_threading_policy(ThreadingPolicy::Single); + // let store = ImageStore::::from_slice( + // &mut copied, + // dimensions.0 as usize, + // dimensions.1 as usize, + // ) + // .unwrap(); + // let mut target = + // ImageStoreMut::alloc(dimensions.0 as usize / 4, dimensions.1 as usize / 4); + // _ = scaler.resize_rgba_f32(&store, &mut target, false); + // }) + // }); + // + // c.bench_function("Fast image resize RGBA with alpha: Lanczos 3", |b| { + // let mut vc = Vec::from(img.as_bytes()); + // b.iter(|| { + // let pixel_type: PixelType = PixelType::U8x4; + // let src_image = + // Image::from_slice_u8(dimensions.0, dimensions.1, &mut vc, pixel_type).unwrap(); + // let mut dst_image = Image::new(dimensions.0 / 2, 
dimensions.1 / 2, pixel_type); + // + // let mut resizer = Resizer::new(); + // #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + // unsafe { + // resizer.set_cpu_extensions(CpuExtensions::Neon); + // } + // #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + // unsafe { + // resizer.set_cpu_extensions(CpuExtensions::Avx2); + // } + // resizer + // .resize( + // &src_image, + // &mut dst_image, + // &ResizeOptions::new() + // .resize_alg(ResizeAlg::Convolution(Lanczos3)) + // .use_alpha(true), + // ) + // .unwrap(); + // }) + // }); + // + // c.bench_function("Pic scale RGBA without alpha: Lanczos 3", |b| { + // let mut copied: Vec = Vec::from(src_bytes); + // b.iter(|| { + // let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); + // scaler.set_threading_policy(ThreadingPolicy::Single); + // let store = ImageStore::::from_slice( + // &mut copied, + // dimensions.0 as usize, + // dimensions.1 as usize, + // ) + // .unwrap(); + // let mut target = + // ImageStoreMut::alloc(dimensions.0 as usize / 4, dimensions.1 as usize / 4); + // _ = scaler.resize_rgba(&store, &mut target, false); + // }) + // }); + // + // c.bench_function("Fast image resize RGBA without alpha: Lanczos 3", |b| { + // let mut vc = Vec::from(img.as_bytes()); + // b.iter(|| { + // let pixel_type: PixelType = PixelType::U8x4; + // let src_image = + // Image::from_slice_u8(dimensions.0, dimensions.1, &mut vc, pixel_type).unwrap(); + // let mut dst_image = Image::new(dimensions.0 / 2, dimensions.1 / 2, pixel_type); + // + // let mut resizer = Resizer::new(); + // #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + // unsafe { + // resizer.set_cpu_extensions(CpuExtensions::Neon); + // } + // #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + // unsafe { + // resizer.set_cpu_extensions(CpuExtensions::Avx2); + // } + // resizer + // .resize( + // &src_image, + // &mut dst_image, + // &ResizeOptions::new() + // .resize_alg(ResizeAlg::Convolution(Lanczos3)) + // .use_alpha(false), + // ) + // .unwrap(); + // }) + // }); - c.bench_function("Pic scale RGBA with alpha f32: Lanczos 3", |b| { - let mut copied: Vec = Vec::from(f32_image.clone()); + c.bench_function("Pic scale RGBA10 with alpha: Lanczos 3", |b| { + let mut copied: Vec = Vec::from( + src_bytes + .iter() + .map(|&x| ((x as u16) << 2) | ((x as u16) >> 6)) + .collect::>(), + ); b.iter(|| { let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); scaler.set_threading_policy(ThreadingPolicy::Single); - let store = ImageStore::::from_slice( + let store = ImageStore::::from_slice( &mut copied, dimensions.0 as usize, dimensions.1 as usize, ) .unwrap(); - let mut target = - ImageStoreMut::alloc(dimensions.0 as usize / 4, dimensions.1 as usize / 4); - _ = scaler.resize_rgba_f32(&store, &mut target, false); + let mut target = ImageStoreMut::alloc_with_depth( + dimensions.0 as usize / 4, + dimensions.1 as usize / 4, + 10, + ); + _ = scaler.resize_rgba_u16(&store, &mut target, true); }) }); - c.bench_function("Fast image resize RGBA with alpha: Lanczos 3", |b| { - let mut vc = Vec::from(img.as_bytes()); - b.iter(|| { - let pixel_type: PixelType = PixelType::U8x4; - let src_image = - Image::from_slice_u8(dimensions.0, dimensions.1, &mut vc, pixel_type).unwrap(); - let mut dst_image = Image::new(dimensions.0 / 2, dimensions.1 / 2, pixel_type); - - let mut resizer = Resizer::new(); - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - unsafe { - resizer.set_cpu_extensions(CpuExtensions::Neon); - } - #[cfg(any(target_arch = 
"x86_64", target_arch = "x86"))] - unsafe { - resizer.set_cpu_extensions(CpuExtensions::Avx2); - } - resizer - .resize( - &src_image, - &mut dst_image, - &ResizeOptions::new() - .resize_alg(ResizeAlg::Convolution(Lanczos3)) - .use_alpha(true), - ) - .unwrap(); - }) - }); - - c.bench_function("Pic scale RGBA without alpha: Lanczos 3", |b| { - let mut copied: Vec = Vec::from(src_bytes); + c.bench_function("Pic scale RGBA10 without alpha: Lanczos 3", |b| { + let mut copied: Vec = Vec::from( + src_bytes + .iter() + .map(|&x| ((x as u16) << 2) | ((x as u16) >> 6)) + .collect::>(), + ); b.iter(|| { let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); scaler.set_threading_policy(ThreadingPolicy::Single); - let store = ImageStore::::from_slice( + let store = ImageStore::::from_slice( &mut copied, dimensions.0 as usize, dimensions.1 as usize, ) .unwrap(); - let mut target = - ImageStoreMut::alloc(dimensions.0 as usize / 4, dimensions.1 as usize / 4); - _ = scaler.resize_rgba(&store, &mut target, false); - }) - }); - - c.bench_function("Fast image resize RGBA without alpha: Lanczos 3", |b| { - let mut vc = Vec::from(img.as_bytes()); - b.iter(|| { - let pixel_type: PixelType = PixelType::U8x4; - let src_image = - Image::from_slice_u8(dimensions.0, dimensions.1, &mut vc, pixel_type).unwrap(); - let mut dst_image = Image::new(dimensions.0 / 2, dimensions.1 / 2, pixel_type); - - let mut resizer = Resizer::new(); - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - unsafe { - resizer.set_cpu_extensions(CpuExtensions::Neon); - } - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - unsafe { - resizer.set_cpu_extensions(CpuExtensions::Avx2); - } - resizer - .resize( - &src_image, - &mut dst_image, - &ResizeOptions::new() - .resize_alg(ResizeAlg::Convolution(Lanczos3)) - .use_alpha(false), - ) - .unwrap(); + let mut target = ImageStoreMut::alloc_with_depth( + dimensions.0 as usize / 4, + dimensions.1 as usize / 4, + 10, + ); + _ = scaler.resize_rgba_u16(&store, &mut target, false); }) }); } diff --git a/app/src/main.rs b/app/src/main.rs index 52a7faf..25367ac 100644 --- a/app/src/main.rs +++ b/app/src/main.rs @@ -3,17 +3,14 @@ mod split; use std::time::Instant; -use crate::merge::merge_channels_3; -use crate::split::split_channels_3; use fast_image_resize::images::Image; use fast_image_resize::{ CpuExtensions, FilterType, IntoImageView, PixelType, ResizeAlg, ResizeOptions, Resizer, }; use image::{EncodableLayout, GenericImageView, ImageReader}; use pic_scale::{ - Ar30ByteOrder, ImageSize, ImageStore, ImageStoreMut, JzazbzScaler, LChScaler, LabScaler, - LinearApproxScaler, LinearScaler, LuvScaler, OklabScaler, ResamplingFunction, Scaler, Scaling, - ScalingU16, SigmoidalScaler, ThreadingPolicy, TransferFunction, XYZScaler, + ImageSize, ImageStore, ImageStoreMut, ResamplingFunction, Scaler, Scaling, ScalingU16, + ThreadingPolicy, }; fn resize_plane( @@ -53,15 +50,16 @@ fn main() { let transient = img.to_rgba8(); let mut bytes = Vec::from(transient.as_bytes()); - let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); + let mut scaler = Scaler::new(ResamplingFunction::Bilinear); scaler.set_threading_policy(ThreadingPolicy::Single); // resize_plane(378, 257, 257, 257, ResamplingFunction::Bilinear); let mut choke: Vec = bytes.iter().map(|&x| (x as u16) << 2).collect(); + // let store = - ImageStore::::from_slice(&mut choke, dimensions.0 as usize, dimensions.1 as usize) + ImageStore::::from_slice(&choke, dimensions.0 as usize, dimensions.1 as usize) .unwrap(); let dst_size = 
ImageSize::new(dimensions.0 as usize / 4, dimensions.1 as usize / 4); @@ -78,13 +76,13 @@ fn main() { // .unwrap(); let mut dst_store = ImageStoreMut::::alloc_with_depth( - dimensions.0 as usize / 3, - dimensions.1 as usize / 3, + dimensions.0 as usize, + dimensions.1 as usize / 2, 10, ); scaler - .resize_rgba_u16(&store, &mut dst_store, false) + .resize_rgba_u16(&store, &mut dst_store, true) .unwrap(); let elapsed_time = start_time.elapsed(); @@ -167,7 +165,7 @@ fn main() { .iter() .map(|&x| (x >> 2) as u8) .collect(); - // + // let dst = dst_store.as_bytes(); // let dst = resized; // image::save_buffer( diff --git a/src/alpha_check.rs b/src/alpha_check.rs index 8f253b9..2accea8 100644 --- a/src/alpha_check.rs +++ b/src/alpha_check.rs @@ -30,10 +30,12 @@ use num_traits::AsPrimitive; use std::ops::{AddAssign, BitXor}; +#[allow(dead_code)] pub(crate) fn has_non_constant_cap_alpha_rgba8(store: &[u8], width: usize) -> bool { has_non_constant_cap_alpha::(store, width) } +#[allow(dead_code)] pub(crate) fn has_non_constant_cap_alpha_rgba16(store: &[u16], width: usize) -> bool { has_non_constant_cap_alpha::(store, width) } @@ -61,7 +63,7 @@ where if store.is_empty() { return false; } - let first = store[0]; + let first = store[ALPHA_CHANNEL_INDEX]; let mut row_sums: J = 0u32.as_(); for row in store.chunks_exact(width * CHANNELS) { for color in row.chunks_exact(CHANNELS) { @@ -87,7 +89,7 @@ fn has_non_constant_cap_alpha_f32_impl(pixel.3)); - let high_r = _mm256_mul_ps( - _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.0)), - high_alpha, - ); - let high_g = _mm256_mul_ps( - _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.1)), - high_alpha, - ); - let high_b = _mm256_mul_ps( - _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.2)), - high_alpha, - ); - let r_values = avx_combine_epi( - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_r), - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_r), - ); - let g_values = avx_combine_epi( - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_g), - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_g), - ); - let b_values = avx_combine_epi( - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_b), - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_b), - ); - let dst_ptr = dst.as_mut_ptr(); - let (d_lane0, d_lane1, d_lane2, d_lane3) = - avx_interleave_rgba_epi16(r_values, g_values, b_values, pixel.3); - _mm256_storeu_si256(dst_ptr as *mut __m256i, d_lane0); - _mm256_storeu_si256(dst_ptr.add(16) as *mut __m256i, d_lane1); - _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, d_lane2); - _mm256_storeu_si256(dst_ptr.add(48) as *mut __m256i, d_lane3); - } - - rem = rem.chunks_exact_mut(16 * 4).into_remainder(); - src_rem = src_rem.chunks_exact(16 * 4).remainder(); + let high_alpha = _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.3)); + let high_r = _mm256_mul_ps( + _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.0)), + high_alpha, + ); + let high_g = _mm256_mul_ps( + _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.1)), + high_alpha, + ); + let high_b = _mm256_mul_ps( + _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.2)), + high_alpha, + ); + let r_values = avx_combine_epi( + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_r), + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_r), + ); + let g_values = avx_combine_epi( + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_g), + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_g), + ); + let b_values = avx_combine_epi( + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_b), + 
_mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_b), + ); + let dst_ptr = dst.as_mut_ptr(); + let (d_lane0, d_lane1, d_lane2, d_lane3) = + avx_interleave_rgba_epi16(r_values, g_values, b_values, pixel.3); + _mm256_storeu_si256(dst_ptr as *mut __m256i, d_lane0); + _mm256_storeu_si256(dst_ptr.add(16) as *mut __m256i, d_lane1); + _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, d_lane2); + _mm256_storeu_si256(dst_ptr.add(48) as *mut __m256i, d_lane3); } + rem = rem.chunks_exact_mut(16 * 4).into_remainder(); + src_rem = src_rem.chunks_exact(16 * 4).remainder(); + premultiply_pixel_f16_row(rem, src_rem); } @@ -155,84 +153,82 @@ pub(crate) fn avx_unpremultiply_alpha_rgba_f16( unsafe fn avx_unpremultiply_alpha_rgba_f16_row_impl(in_place: &mut [half::f16]) { let mut rem = in_place; - unsafe { - for dst in rem.chunks_exact_mut(16 * 4) { - let src_ptr = dst.as_ptr(); - let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i); - let lane1 = _mm256_loadu_si256(src_ptr.add(16) as *const __m256i); - let lane2 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); - let lane3 = _mm256_loadu_si256(src_ptr.add(48) as *const __m256i); - let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); - - let low_alpha = _mm256_cvtph_ps(_mm256_castsi256_si128(pixel.3)); - let zeros = _mm256_setzero_ps(); - let low_alpha_zero_mask = _mm256_cmp_ps::<_CMP_EQ_OS>(low_alpha, zeros); - let low_r = _mm256_blendv_ps( - _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.0)), low_alpha), - zeros, - low_alpha_zero_mask, - ); - let low_g = _mm256_blendv_ps( - _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.1)), low_alpha), - zeros, - low_alpha_zero_mask, - ); - let low_b = _mm256_blendv_ps( - _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.2)), low_alpha), - zeros, - low_alpha_zero_mask, - ); + for dst in rem.chunks_exact_mut(16 * 4) { + let src_ptr = dst.as_ptr(); + let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i); + let lane1 = _mm256_loadu_si256(src_ptr.add(16) as *const __m256i); + let lane2 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); + let lane3 = _mm256_loadu_si256(src_ptr.add(48) as *const __m256i); + let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); - let high_alpha = _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.3)); - let high_alpha_zero_mask = _mm256_cmp_ps::<_CMP_EQ_OS>(high_alpha, zeros); - let high_r = _mm256_blendv_ps( - _mm256_mul_ps( - _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.0)), - high_alpha, - ), - zeros, - high_alpha_zero_mask, - ); - let high_g = _mm256_blendv_ps( - _mm256_mul_ps( - _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.1)), - high_alpha, - ), - zeros, - high_alpha_zero_mask, - ); - let high_b = _mm256_blendv_ps( - _mm256_mul_ps( - _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.2)), - high_alpha, - ), - zeros, - high_alpha_zero_mask, - ); - let r_values = avx_combine_epi( - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_r), - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_r), - ); - let g_values = avx_combine_epi( - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_g), - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_g), - ); - let b_values = avx_combine_epi( - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_b), - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_b), - ); - let dst_ptr = dst.as_mut_ptr(); - let (d_lane0, d_lane1, d_lane2, d_lane3) = - avx_interleave_rgba_epi16(r_values, g_values, b_values, pixel.3); - _mm256_storeu_si256(dst_ptr as *mut __m256i, d_lane0); - 
_mm256_storeu_si256(dst_ptr.add(16) as *mut __m256i, d_lane1); - _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, d_lane2); - _mm256_storeu_si256(dst_ptr.add(48) as *mut __m256i, d_lane3); - } + let low_alpha = _mm256_cvtph_ps(_mm256_castsi256_si128(pixel.3)); + let zeros = _mm256_setzero_ps(); + let low_alpha_zero_mask = _mm256_cmp_ps::<_CMP_EQ_OS>(low_alpha, zeros); + let low_r = _mm256_blendv_ps( + _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.0)), low_alpha), + zeros, + low_alpha_zero_mask, + ); + let low_g = _mm256_blendv_ps( + _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.1)), low_alpha), + zeros, + low_alpha_zero_mask, + ); + let low_b = _mm256_blendv_ps( + _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.2)), low_alpha), + zeros, + low_alpha_zero_mask, + ); - rem = rem.chunks_exact_mut(16 * 4).into_remainder(); + let high_alpha = _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.3)); + let high_alpha_zero_mask = _mm256_cmp_ps::<_CMP_EQ_OS>(high_alpha, zeros); + let high_r = _mm256_blendv_ps( + _mm256_mul_ps( + _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.0)), + high_alpha, + ), + zeros, + high_alpha_zero_mask, + ); + let high_g = _mm256_blendv_ps( + _mm256_mul_ps( + _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.1)), + high_alpha, + ), + zeros, + high_alpha_zero_mask, + ); + let high_b = _mm256_blendv_ps( + _mm256_mul_ps( + _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.2)), + high_alpha, + ), + zeros, + high_alpha_zero_mask, + ); + let r_values = avx_combine_epi( + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_r), + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_r), + ); + let g_values = avx_combine_epi( + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_g), + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_g), + ); + let b_values = avx_combine_epi( + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_b), + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_b), + ); + let dst_ptr = dst.as_mut_ptr(); + let (d_lane0, d_lane1, d_lane2, d_lane3) = + avx_interleave_rgba_epi16(r_values, g_values, b_values, pixel.3); + _mm256_storeu_si256(dst_ptr as *mut __m256i, d_lane0); + _mm256_storeu_si256(dst_ptr.add(16) as *mut __m256i, d_lane1); + _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, d_lane2); + _mm256_storeu_si256(dst_ptr.add(48) as *mut __m256i, d_lane3); } + rem = rem.chunks_exact_mut(16 * 4).into_remainder(); + unpremultiply_pixel_f16_row(rem); } diff --git a/src/avx2/alpha_f32.rs b/src/avx2/alpha_f32.rs index f136208..f5b4d40 100644 --- a/src/avx2/alpha_f32.rs +++ b/src/avx2/alpha_f32.rs @@ -129,34 +129,32 @@ unsafe fn avx_premultiply_alpha_rgba_f32_row_impl(dst: &mut [f32], src: &[f32]) let mut rem = dst; let mut src_rem = src; - unsafe { - for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { - let src_ptr = src.as_ptr(); - let rgba0 = _mm256_loadu_ps(src_ptr); - let rgba1 = _mm256_loadu_ps(src_ptr.add(8)); - let rgba2 = _mm256_loadu_ps(src_ptr.add(16)); - let rgba3 = _mm256_loadu_ps(src_ptr.add(24)); - let (rrr, ggg, bbb, aaa) = avx_deinterleave_rgba_ps(rgba0, rgba1, rgba2, rgba3); - - let rrr = _mm256_mul_ps(rrr, aaa); - let ggg = _mm256_mul_ps(ggg, aaa); - let bbb = _mm256_mul_ps(bbb, aaa); - - let (rgba0, rgba1, rgba2, rgba3) = avx_interleave_rgba_ps(rrr, ggg, bbb, aaa); - - let dst_ptr = dst.as_mut_ptr(); - _mm256_storeu_ps(dst_ptr, rgba0); - _mm256_storeu_ps(dst_ptr.add(8), rgba1); - _mm256_storeu_ps(dst_ptr.add(16), rgba2); - _mm256_storeu_ps(dst_ptr.add(24), rgba3); - - _cx += 
8; - } - - rem = rem.chunks_exact_mut(8 * 4).into_remainder(); - src_rem = src_rem.chunks_exact(8 * 4).remainder(); + for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { + let src_ptr = src.as_ptr(); + let rgba0 = _mm256_loadu_ps(src_ptr); + let rgba1 = _mm256_loadu_ps(src_ptr.add(8)); + let rgba2 = _mm256_loadu_ps(src_ptr.add(16)); + let rgba3 = _mm256_loadu_ps(src_ptr.add(24)); + let (rrr, ggg, bbb, aaa) = avx_deinterleave_rgba_ps(rgba0, rgba1, rgba2, rgba3); + + let rrr = _mm256_mul_ps(rrr, aaa); + let ggg = _mm256_mul_ps(ggg, aaa); + let bbb = _mm256_mul_ps(bbb, aaa); + + let (rgba0, rgba1, rgba2, rgba3) = avx_interleave_rgba_ps(rrr, ggg, bbb, aaa); + + let dst_ptr = dst.as_mut_ptr(); + _mm256_storeu_ps(dst_ptr, rgba0); + _mm256_storeu_ps(dst_ptr.add(8), rgba1); + _mm256_storeu_ps(dst_ptr.add(16), rgba2); + _mm256_storeu_ps(dst_ptr.add(24), rgba3); + + _cx += 8; } + rem = rem.chunks_exact_mut(8 * 4).into_remainder(); + src_rem = src_rem.chunks_exact(8 * 4).remainder(); + premultiply_pixel_f32_row(rem, src_rem); } diff --git a/src/avx2/alpha_u16.rs b/src/avx2/alpha_u16.rs index 83f6550..a77b32d 100644 --- a/src/avx2/alpha_u16.rs +++ b/src/avx2/alpha_u16.rs @@ -75,6 +75,17 @@ pub(crate) unsafe fn _mm256_div_by_65535_epi32(v: __m256i) -> __m256i { _mm256_srli_epi32::(_mm256_add_epi32(v, _mm256_srli_epi32::(v))) } +#[inline(always)] +unsafe fn _mm256_div_by_epi32(v: __m256i) -> __m256i { + if BIT_DEPTH == 10 { + _mm256_div_by_1023_epi32(v) + } else if BIT_DEPTH == 12 { + _mm256_div_by_4095_epi32(v) + } else { + _mm256_div_by_65535_epi32(v) + } +} + pub(crate) fn avx_premultiply_alpha_rgba_u16( dst: &mut [u16], src: &[u16], @@ -88,234 +99,187 @@ pub(crate) fn avx_premultiply_alpha_rgba_u16( } } -#[target_feature(enable = "avx2")] -/// This inlining is required to activate all features for runtime dispatch -unsafe fn avx_premultiply_alpha_rgba_u16_row(dst: &mut [u16], src: &[u16], bit_depth: usize) { - let max_colors = (1 << bit_depth) - 1; +trait Avx2PremultiplyExecutor { + unsafe fn premultiply(&self, dst: &mut [u16], src: &[u16], bit_depth: usize); +} - let mut rem = dst; - let mut src_rem = src; +#[derive(Default)] +struct Avx2PremultiplyExecutorDefault {} + +impl Avx2PremultiplyExecutor for Avx2PremultiplyExecutorDefault { + #[target_feature(enable = "avx2")] + unsafe fn premultiply(&self, dst: &mut [u16], src: &[u16], bit_depth: usize) { + let max_colors = (1 << bit_depth) - 1; + + let mut rem = dst; + let mut src_rem = src; + + for (dst, src) in rem + .chunks_exact_mut(16 * 4) + .zip(src_rem.chunks_exact(16 * 4)) + { + let src_ptr = src.as_ptr(); + let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i); + let lane1 = _mm256_loadu_si256(src_ptr.add(16) as *const __m256i); + let lane2 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); + let lane3 = _mm256_loadu_si256(src_ptr.add(48) as *const __m256i); + + let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); + + let zeros = _mm256_setzero_si256(); + let low_alpha = _mm256_unpacklo_epi16(pixel.3, zeros); + let high_alpha = _mm256_unpackhi_epi16(pixel.3, zeros); + + let new_rrr = _mm256_packus_epi32( + _mm256_div_by_epi32::(_mm256_madd_epi16( + _mm256_unpacklo_epi16(pixel.0, zeros), + low_alpha, + )), + _mm256_div_by_epi32::(_mm256_madd_epi16( + _mm256_unpackhi_epi16(pixel.0, zeros), + high_alpha, + )), + ); + let new_ggg = _mm256_packus_epi32( + _mm256_div_by_epi32::(_mm256_madd_epi16( + _mm256_unpacklo_epi16(pixel.1, zeros), + low_alpha, + )), + 
_mm256_div_by_epi32::(_mm256_madd_epi16( + _mm256_unpackhi_epi16(pixel.1, zeros), + high_alpha, + )), + ); + let new_bbb = _mm256_packus_epi32( + _mm256_div_by_epi32::(_mm256_madd_epi16( + _mm256_unpacklo_epi16(pixel.2, zeros), + low_alpha, + )), + _mm256_div_by_epi32::(_mm256_madd_epi16( + _mm256_unpackhi_epi16(pixel.2, zeros), + high_alpha, + )), + ); + + let dst_ptr = dst.as_mut_ptr(); + + let (d_lane0, d_lane1, d_lane2, d_lane3) = + avx_interleave_rgba_epi16(new_rrr, new_ggg, new_bbb, pixel.3); + + _mm256_storeu_si256(dst_ptr as *mut __m256i, d_lane0); + _mm256_storeu_si256(dst_ptr.add(16) as *mut __m256i, d_lane1); + _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, d_lane2); + _mm256_storeu_si256(dst_ptr.add(48) as *mut __m256i, d_lane3); + } - unsafe { - if bit_depth == 10 { - for (dst, src) in rem - .chunks_exact_mut(16 * 4) - .zip(src_rem.chunks_exact(16 * 4)) - { - let src_ptr = src.as_ptr(); - let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i); - let lane1 = _mm256_loadu_si256(src_ptr.add(16) as *const __m256i); - let lane2 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); - let lane3 = _mm256_loadu_si256(src_ptr.add(48) as *const __m256i); - - let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); - - let zeros = _mm256_setzero_si256(); - let low_alpha = _mm256_unpacklo_epi16(pixel.3, zeros); - let high_alpha = _mm256_unpackhi_epi16(pixel.3, zeros); - - let new_rrr = _mm256_packus_epi32( - _mm256_div_by_1023_epi32(_mm256_madd_epi16( - _mm256_unpacklo_epi16(pixel.0, zeros), - low_alpha, - )), - _mm256_div_by_1023_epi32(_mm256_madd_epi16( - _mm256_unpackhi_epi16(pixel.0, zeros), - high_alpha, - )), - ); - let new_ggg = _mm256_packus_epi32( - _mm256_div_by_1023_epi32(_mm256_madd_epi16( - _mm256_unpacklo_epi16(pixel.1, zeros), - low_alpha, - )), - _mm256_div_by_1023_epi32(_mm256_madd_epi16( - _mm256_unpackhi_epi16(pixel.1, zeros), - high_alpha, - )), - ); - let new_bbb = _mm256_packus_epi32( - _mm256_div_by_1023_epi32(_mm256_madd_epi16( - _mm256_unpacklo_epi16(pixel.2, zeros), - low_alpha, - )), - _mm256_div_by_1023_epi32(_mm256_madd_epi16( - _mm256_unpackhi_epi16(pixel.2, zeros), - high_alpha, - )), - ); - - let dst_ptr = dst.as_mut_ptr(); - - let (d_lane0, d_lane1, d_lane2, d_lane3) = - avx_interleave_rgba_epi16(new_rrr, new_ggg, new_bbb, pixel.3); - - _mm256_storeu_si256(dst_ptr as *mut __m256i, d_lane0); - _mm256_storeu_si256(dst_ptr.add(16) as *mut __m256i, d_lane1); - _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, d_lane2); - _mm256_storeu_si256(dst_ptr.add(48) as *mut __m256i, d_lane3); - } - } else if bit_depth == 12 { - for (dst, src) in rem - .chunks_exact_mut(16 * 4) - .zip(src_rem.chunks_exact(16 * 4)) - { - let src_ptr = src.as_ptr(); - let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i); - let lane1 = _mm256_loadu_si256(src_ptr.add(16) as *const __m256i); - let lane2 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); - let lane3 = _mm256_loadu_si256(src_ptr.add(48) as *const __m256i); - - let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); - - let zeros = _mm256_setzero_si256(); - let low_alpha = _mm256_unpacklo_epi16(pixel.3, zeros); - let high_alpha = _mm256_unpackhi_epi16(pixel.3, zeros); - - let new_rrr = _mm256_packus_epi32( - _mm256_div_by_4095_epi32(_mm256_madd_epi16( - _mm256_unpacklo_epi16(pixel.0, zeros), - low_alpha, - )), - _mm256_div_by_4095_epi32(_mm256_madd_epi16( - _mm256_unpackhi_epi16(pixel.0, zeros), - high_alpha, - )), - ); - let new_ggg = _mm256_packus_epi32( - 
_mm256_div_by_4095_epi32(_mm256_madd_epi16( - _mm256_unpacklo_epi16(pixel.1, zeros), - low_alpha, - )), - _mm256_div_by_4095_epi32(_mm256_madd_epi16( - _mm256_unpackhi_epi16(pixel.1, zeros), - high_alpha, - )), - ); - let new_bbb = _mm256_packus_epi32( - _mm256_div_by_4095_epi32(_mm256_madd_epi16( - _mm256_unpacklo_epi16(pixel.2, zeros), - low_alpha, - )), - _mm256_div_by_4095_epi32(_mm256_madd_epi16( - _mm256_unpackhi_epi16(pixel.2, zeros), - high_alpha, - )), - ); - - let dst_ptr = dst.as_mut_ptr(); - - let (d_lane0, d_lane1, d_lane2, d_lane3) = - avx_interleave_rgba_epi16(new_rrr, new_ggg, new_bbb, pixel.3); - - _mm256_storeu_si256(dst_ptr as *mut __m256i, d_lane0); - _mm256_storeu_si256(dst_ptr.add(16) as *mut __m256i, d_lane1); - _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, d_lane2); - _mm256_storeu_si256(dst_ptr.add(48) as *mut __m256i, d_lane3); - } - } else if bit_depth == 16 { - for (dst, src) in rem - .chunks_exact_mut(16 * 4) - .zip(src_rem.chunks_exact(16 * 4)) - { - let src_ptr = src.as_ptr(); - let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i); - let lane1 = _mm256_loadu_si256(src_ptr.add(16) as *const __m256i); - let lane2 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); - let lane3 = _mm256_loadu_si256(src_ptr.add(48) as *const __m256i); - - let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); - - let zeros = _mm256_setzero_si256(); - let low_alpha = _mm256_unpacklo_epi16(pixel.3, zeros); - let high_alpha = _mm256_unpackhi_epi16(pixel.3, zeros); - - let new_rrr = _mm256_packus_epi32( - _mm256_div_by_65535_epi32(_mm256_mullo_epi32( - _mm256_unpacklo_epi16(pixel.0, zeros), - low_alpha, - )), - _mm256_div_by_65535_epi32(_mm256_mullo_epi32( - _mm256_unpackhi_epi16(pixel.0, zeros), - high_alpha, - )), - ); - let new_ggg = _mm256_packus_epi32( - _mm256_div_by_65535_epi32(_mm256_mullo_epi32( - _mm256_unpacklo_epi16(pixel.1, zeros), - low_alpha, - )), - _mm256_div_by_65535_epi32(_mm256_mullo_epi32( - _mm256_unpackhi_epi16(pixel.1, zeros), - high_alpha, - )), - ); - let new_bbb = _mm256_packus_epi32( - _mm256_div_by_65535_epi32(_mm256_mullo_epi32( - _mm256_unpacklo_epi16(pixel.2, zeros), - low_alpha, - )), - _mm256_div_by_65535_epi32(_mm256_mullo_epi32( - _mm256_unpackhi_epi16(pixel.2, zeros), - high_alpha, - )), - ); - - let dst_ptr = dst.as_mut_ptr(); - - let (d_lane0, d_lane1, d_lane2, d_lane3) = - avx_interleave_rgba_epi16(new_rrr, new_ggg, new_bbb, pixel.3); - - _mm256_storeu_si256(dst_ptr as *mut __m256i, d_lane0); - _mm256_storeu_si256(dst_ptr.add(16) as *mut __m256i, d_lane1); - _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, d_lane2); - _mm256_storeu_si256(dst_ptr.add(48) as *mut __m256i, d_lane3); - } - } else { - let v_scale_colors = _mm256_set1_ps((1. 
/ max_colors as f64) as f32); - for (dst, src) in rem - .chunks_exact_mut(16 * 4) - .zip(src_rem.chunks_exact(16 * 4)) - { - let src_ptr = src.as_ptr(); - let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i); - let lane1 = _mm256_loadu_si256(src_ptr.add(16) as *const __m256i); - let lane2 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); - let lane3 = _mm256_loadu_si256(src_ptr.add(48) as *const __m256i); - - let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); - - let zeros = _mm256_setzero_si256(); - - let low_alpha = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_unpacklo_epi16(pixel.3, zeros)), - v_scale_colors, - ); - let high_alpha = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(pixel.3, zeros)), - v_scale_colors, - ); - - let new_rrr = _mm256_scale_by_alpha(pixel.0, low_alpha, high_alpha); - let new_ggg = _mm256_scale_by_alpha(pixel.1, low_alpha, high_alpha); - let new_bbb = _mm256_scale_by_alpha(pixel.2, low_alpha, high_alpha); - - let dst_ptr = dst.as_mut_ptr(); - - let (d_lane0, d_lane1, d_lane2, d_lane3) = - avx_interleave_rgba_epi16(new_rrr, new_ggg, new_bbb, pixel.3); - - _mm256_storeu_si256(dst_ptr as *mut __m256i, d_lane0); - _mm256_storeu_si256(dst_ptr.add(16) as *mut __m256i, d_lane1); - _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, d_lane2); - _mm256_storeu_si256(dst_ptr.add(48) as *mut __m256i, d_lane3); - } + rem = rem.chunks_exact_mut(16 * 4).into_remainder(); + src_rem = src_rem.chunks_exact(16 * 4).remainder(); + + premultiply_alpha_rgba_row(rem, src_rem, max_colors); + } +} + +#[derive(Default)] +struct Avx2PremultiplyExecutorAnyBit {} + +impl Avx2PremultiplyExecutor for Avx2PremultiplyExecutorAnyBit { + #[target_feature(enable = "avx2")] + unsafe fn premultiply(&self, dst: &mut [u16], src: &[u16], bit_depth: usize) { + let max_colors = (1 << bit_depth) - 1; + + let mut rem = dst; + let mut src_rem = src; + + let v_scale_colors = _mm256_set1_ps((1. 
/ max_colors as f64) as f32); + for (dst, src) in rem + .chunks_exact_mut(16 * 4) + .zip(src_rem.chunks_exact(16 * 4)) + { + let src_ptr = src.as_ptr(); + let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i); + let lane1 = _mm256_loadu_si256(src_ptr.add(16) as *const __m256i); + let lane2 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); + let lane3 = _mm256_loadu_si256(src_ptr.add(48) as *const __m256i); + + let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); + + let zeros = _mm256_setzero_si256(); + + let low_alpha = _mm256_mul_ps( + _mm256_cvtepi32_ps(_mm256_unpacklo_epi16(pixel.3, zeros)), + v_scale_colors, + ); + let high_alpha = _mm256_mul_ps( + _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(pixel.3, zeros)), + v_scale_colors, + ); + + let new_rrr = _mm256_scale_by_alpha(pixel.0, low_alpha, high_alpha); + let new_ggg = _mm256_scale_by_alpha(pixel.1, low_alpha, high_alpha); + let new_bbb = _mm256_scale_by_alpha(pixel.2, low_alpha, high_alpha); + + let dst_ptr = dst.as_mut_ptr(); + + let (d_lane0, d_lane1, d_lane2, d_lane3) = + avx_interleave_rgba_epi16(new_rrr, new_ggg, new_bbb, pixel.3); + + _mm256_storeu_si256(dst_ptr as *mut __m256i, d_lane0); + _mm256_storeu_si256(dst_ptr.add(16) as *mut __m256i, d_lane1); + _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, d_lane2); + _mm256_storeu_si256(dst_ptr.add(48) as *mut __m256i, d_lane3); } + rem = rem.chunks_exact_mut(16 * 4).into_remainder(); src_rem = src_rem.chunks_exact(16 * 4).remainder(); + + premultiply_alpha_rgba_row(rem, src_rem, max_colors); } +} + +#[target_feature(enable = "avx2")] +/// This inlining is required to activate all features for runtime dispatch +unsafe fn avx_premultiply_alpha_rgba_u16_row(dst: &mut [u16], src: &[u16], bit_depth: usize) { + if bit_depth == 10 { + avx_pa_dispatch( + dst, + src, + bit_depth, + Avx2PremultiplyExecutorDefault::<10>::default(), + ); + } else if bit_depth == 12 { + avx_pa_dispatch( + dst, + src, + bit_depth, + Avx2PremultiplyExecutorDefault::<12>::default(), + ); + } else if bit_depth == 16 { + avx_pa_dispatch( + dst, + src, + bit_depth, + Avx2PremultiplyExecutorDefault::<16>::default(), + ); + } else { + avx_pa_dispatch( + dst, + src, + bit_depth, + Avx2PremultiplyExecutorAnyBit::default(), + ); + }; +} - premultiply_alpha_rgba_row(rem, src_rem, max_colors); +#[target_feature(enable = "avx2")] +/// This inlining is required to activate all features for runtime dispatch +#[inline] +unsafe fn avx_pa_dispatch( + dst: &mut [u16], + src: &[u16], + bit_depth: usize, + dispatch: impl Avx2PremultiplyExecutor, +) { + dispatch.premultiply(dst, src, bit_depth); } #[target_feature(enable = "avx2")] diff --git a/src/avx2/alpha_u8.rs b/src/avx2/alpha_u8.rs index 35c8f94..4ecff54 100644 --- a/src/avx2/alpha_u8.rs +++ b/src/avx2/alpha_u8.rs @@ -110,102 +110,100 @@ unsafe fn avx_premultiply_alpha_rgba_impl_row(dst: &mut [u8], src: &[u8]) { let mut rem = dst; let mut src_rem = src; - unsafe { - for (dst, src) in rem - .chunks_exact_mut(32 * 4) - .zip(src_rem.chunks_exact(32 * 4)) - { - let src_ptr = src.as_ptr(); - let rgba0 = _mm256_loadu_si256(src_ptr as *const __m256i); - let rgba1 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); - let rgba2 = _mm256_loadu_si256(src_ptr.add(64) as *const __m256i); - let rgba3 = _mm256_loadu_si256(src_ptr.add(96) as *const __m256i); - let (rrr, ggg, bbb, aaa) = avx2_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); - - let zeros = _mm256_setzero_si256(); - - let mut rrr_low = _mm256_unpacklo_epi8(rrr, zeros); - let mut rrr_high = 
_mm256_unpackhi_epi8(rrr, zeros); - - let mut ggg_low = _mm256_unpacklo_epi8(ggg, zeros); - let mut ggg_high = _mm256_unpackhi_epi8(ggg, zeros); - - let mut bbb_low = _mm256_unpacklo_epi8(bbb, zeros); - let mut bbb_high = _mm256_unpackhi_epi8(bbb, zeros); - - let aaa_low = _mm256_unpacklo_epi8(aaa, zeros); - let aaa_high = _mm256_unpackhi_epi8(aaa, zeros); - - rrr_low = avx2_div_by255(_mm256_mullo_epi16(rrr_low, aaa_low)); - rrr_high = avx2_div_by255(_mm256_mullo_epi16(rrr_high, aaa_high)); - ggg_low = avx2_div_by255(_mm256_mullo_epi16(ggg_low, aaa_low)); - ggg_high = avx2_div_by255(_mm256_mullo_epi16(ggg_high, aaa_high)); - bbb_low = avx2_div_by255(_mm256_mullo_epi16(bbb_low, aaa_low)); - bbb_high = avx2_div_by255(_mm256_mullo_epi16(bbb_high, aaa_high)); - - let rrr = _mm256_packus_epi16(rrr_low, rrr_high); - let ggg = _mm256_packus_epi16(ggg_low, ggg_high); - let bbb = _mm256_packus_epi16(bbb_low, bbb_high); - - let (rgba0, rgba1, rgba2, rgba3) = avx2_interleave_rgba(rrr, ggg, bbb, aaa); - let dst_ptr = dst.as_mut_ptr(); - _mm256_storeu_si256(dst_ptr as *mut __m256i, rgba0); - _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, rgba1); - _mm256_storeu_si256(dst_ptr.add(64) as *mut __m256i, rgba2); - _mm256_storeu_si256(dst_ptr.add(96) as *mut __m256i, rgba3); - } - - rem = rem.chunks_exact_mut(32 * 4).into_remainder(); - src_rem = src_rem.chunks_exact(32 * 4).remainder(); - - let zeros = _mm_setzero_si128(); - for (dst, src) in rem - .chunks_exact_mut(16 * 4) - .zip(src_rem.chunks_exact(16 * 4)) - { - let src_ptr = src.as_ptr(); - let rgba0 = _mm_loadu_si128(src_ptr as *const __m128i); - let rgba1 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); - let rgba2 = _mm_loadu_si128(src_ptr.add(32) as *const __m128i); - let rgba3 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i); - let (rrr, ggg, bbb, aaa) = sse_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); - - let mut rrr_low = _mm_unpacklo_epi8(rrr, zeros); - let mut rrr_high = _mm_unpackhi_epi8(rrr, zeros); - - let mut ggg_low = _mm_unpacklo_epi8(ggg, zeros); - let mut ggg_high = _mm_unpackhi_epi8(ggg, zeros); - - let mut bbb_low = _mm_unpacklo_epi8(bbb, zeros); - let mut bbb_high = _mm_unpackhi_epi8(bbb, zeros); - - let aaa_low = _mm_unpacklo_epi8(aaa, zeros); - let aaa_high = _mm_unpackhi_epi8(aaa, zeros); - - rrr_low = _mm_div_by_255_epi16(_mm_mullo_epi16(rrr_low, aaa_low)); - rrr_high = _mm_div_by_255_epi16(_mm_mullo_epi16(rrr_high, aaa_high)); - ggg_low = _mm_div_by_255_epi16(_mm_mullo_epi16(ggg_low, aaa_low)); - ggg_high = _mm_div_by_255_epi16(_mm_mullo_epi16(ggg_high, aaa_high)); - bbb_low = _mm_div_by_255_epi16(_mm_mullo_epi16(bbb_low, aaa_low)); - bbb_high = _mm_div_by_255_epi16(_mm_mullo_epi16(bbb_high, aaa_high)); - - let rrr = _mm_packus_epi16(rrr_low, rrr_high); - let ggg = _mm_packus_epi16(ggg_low, ggg_high); - let bbb = _mm_packus_epi16(bbb_low, bbb_high); - - let (rgba0, rgba1, rgba2, rgba3) = sse_interleave_rgba(rrr, ggg, bbb, aaa); - - let dst_ptr = dst.as_mut_ptr(); - _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba1); - _mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, rgba2); - _mm_storeu_si128(dst_ptr.add(48) as *mut __m128i, rgba3); - } - - rem = rem.chunks_exact_mut(16 * 4).into_remainder(); - src_rem = src_rem.chunks_exact(16 * 4).remainder(); + for (dst, src) in rem + .chunks_exact_mut(32 * 4) + .zip(src_rem.chunks_exact(32 * 4)) + { + let src_ptr = src.as_ptr(); + let rgba0 = _mm256_loadu_si256(src_ptr as *const __m256i); + let rgba1 = 
_mm256_loadu_si256(src_ptr.add(32) as *const __m256i); + let rgba2 = _mm256_loadu_si256(src_ptr.add(64) as *const __m256i); + let rgba3 = _mm256_loadu_si256(src_ptr.add(96) as *const __m256i); + let (rrr, ggg, bbb, aaa) = avx2_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); + + let zeros = _mm256_setzero_si256(); + + let mut rrr_low = _mm256_unpacklo_epi8(rrr, zeros); + let mut rrr_high = _mm256_unpackhi_epi8(rrr, zeros); + + let mut ggg_low = _mm256_unpacklo_epi8(ggg, zeros); + let mut ggg_high = _mm256_unpackhi_epi8(ggg, zeros); + + let mut bbb_low = _mm256_unpacklo_epi8(bbb, zeros); + let mut bbb_high = _mm256_unpackhi_epi8(bbb, zeros); + + let aaa_low = _mm256_unpacklo_epi8(aaa, zeros); + let aaa_high = _mm256_unpackhi_epi8(aaa, zeros); + + rrr_low = avx2_div_by255(_mm256_mullo_epi16(rrr_low, aaa_low)); + rrr_high = avx2_div_by255(_mm256_mullo_epi16(rrr_high, aaa_high)); + ggg_low = avx2_div_by255(_mm256_mullo_epi16(ggg_low, aaa_low)); + ggg_high = avx2_div_by255(_mm256_mullo_epi16(ggg_high, aaa_high)); + bbb_low = avx2_div_by255(_mm256_mullo_epi16(bbb_low, aaa_low)); + bbb_high = avx2_div_by255(_mm256_mullo_epi16(bbb_high, aaa_high)); + + let rrr = _mm256_packus_epi16(rrr_low, rrr_high); + let ggg = _mm256_packus_epi16(ggg_low, ggg_high); + let bbb = _mm256_packus_epi16(bbb_low, bbb_high); + + let (rgba0, rgba1, rgba2, rgba3) = avx2_interleave_rgba(rrr, ggg, bbb, aaa); + let dst_ptr = dst.as_mut_ptr(); + _mm256_storeu_si256(dst_ptr as *mut __m256i, rgba0); + _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, rgba1); + _mm256_storeu_si256(dst_ptr.add(64) as *mut __m256i, rgba2); + _mm256_storeu_si256(dst_ptr.add(96) as *mut __m256i, rgba3); + } + + rem = rem.chunks_exact_mut(32 * 4).into_remainder(); + src_rem = src_rem.chunks_exact(32 * 4).remainder(); + + let zeros = _mm_setzero_si128(); + for (dst, src) in rem + .chunks_exact_mut(16 * 4) + .zip(src_rem.chunks_exact(16 * 4)) + { + let src_ptr = src.as_ptr(); + let rgba0 = _mm_loadu_si128(src_ptr as *const __m128i); + let rgba1 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); + let rgba2 = _mm_loadu_si128(src_ptr.add(32) as *const __m128i); + let rgba3 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i); + let (rrr, ggg, bbb, aaa) = sse_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); + + let mut rrr_low = _mm_unpacklo_epi8(rrr, zeros); + let mut rrr_high = _mm_unpackhi_epi8(rrr, zeros); + + let mut ggg_low = _mm_unpacklo_epi8(ggg, zeros); + let mut ggg_high = _mm_unpackhi_epi8(ggg, zeros); + + let mut bbb_low = _mm_unpacklo_epi8(bbb, zeros); + let mut bbb_high = _mm_unpackhi_epi8(bbb, zeros); + + let aaa_low = _mm_unpacklo_epi8(aaa, zeros); + let aaa_high = _mm_unpackhi_epi8(aaa, zeros); + + rrr_low = _mm_div_by_255_epi16(_mm_mullo_epi16(rrr_low, aaa_low)); + rrr_high = _mm_div_by_255_epi16(_mm_mullo_epi16(rrr_high, aaa_high)); + ggg_low = _mm_div_by_255_epi16(_mm_mullo_epi16(ggg_low, aaa_low)); + ggg_high = _mm_div_by_255_epi16(_mm_mullo_epi16(ggg_high, aaa_high)); + bbb_low = _mm_div_by_255_epi16(_mm_mullo_epi16(bbb_low, aaa_low)); + bbb_high = _mm_div_by_255_epi16(_mm_mullo_epi16(bbb_high, aaa_high)); + + let rrr = _mm_packus_epi16(rrr_low, rrr_high); + let ggg = _mm_packus_epi16(ggg_low, ggg_high); + let bbb = _mm_packus_epi16(bbb_low, bbb_high); + + let (rgba0, rgba1, rgba2, rgba3) = sse_interleave_rgba(rrr, ggg, bbb, aaa); + + let dst_ptr = dst.as_mut_ptr(); + _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); + _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba1); + _mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, 
rgba2); + _mm_storeu_si128(dst_ptr.add(48) as *mut __m128i, rgba3); } + rem = rem.chunks_exact_mut(16 * 4).into_remainder(); + src_rem = src_rem.chunks_exact(16 * 4).remainder(); + premultiply_alpha_rgba_row_impl(rem, src_rem); } @@ -249,54 +247,52 @@ pub(crate) fn avx_unpremultiply_alpha_rgba( unsafe fn avx_unpremultiply_alpha_rgba_impl_row(in_place: &mut [u8]) { let mut rem = in_place; - unsafe { - for dst in rem.chunks_exact_mut(32 * 4) { - let src_ptr = dst.as_ptr(); - let rgba0 = _mm256_loadu_si256(src_ptr as *const __m256i); - let rgba1 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); - let rgba2 = _mm256_loadu_si256(src_ptr.add(64) as *const __m256i); - let rgba3 = _mm256_loadu_si256(src_ptr.add(96) as *const __m256i); - let (rrr, ggg, bbb, aaa) = avx2_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); - - let rrr = avx2_unpremultiply_row(rrr, aaa); - let ggg = avx2_unpremultiply_row(ggg, aaa); - let bbb = avx2_unpremultiply_row(bbb, aaa); - - let (rgba0, rgba1, rgba2, rgba3) = avx2_interleave_rgba(rrr, ggg, bbb, aaa); - - let dst_ptr = dst.as_mut_ptr(); - _mm256_storeu_si256(dst_ptr as *mut __m256i, rgba0); - _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, rgba1); - _mm256_storeu_si256(dst_ptr.add(64) as *mut __m256i, rgba2); - _mm256_storeu_si256(dst_ptr.add(96) as *mut __m256i, rgba3); - } - - rem = rem.chunks_exact_mut(32 * 4).into_remainder(); - - for dst in rem.chunks_exact_mut(16 * 4) { - let src_ptr = dst.as_ptr(); - let rgba0 = _mm_loadu_si128(src_ptr as *const __m128i); - let rgba1 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); - let rgba2 = _mm_loadu_si128(src_ptr.add(32) as *const __m128i); - let rgba3 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i); - let (rrr, ggg, bbb, aaa) = sse_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); - - let rrr = sse_unpremultiply_row(rrr, aaa); - let ggg = sse_unpremultiply_row(ggg, aaa); - let bbb = sse_unpremultiply_row(bbb, aaa); - - let (rgba0, rgba1, rgba2, rgba3) = sse_interleave_rgba(rrr, ggg, bbb, aaa); - - let dst_ptr = dst.as_mut_ptr(); - _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba1); - _mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, rgba2); - _mm_storeu_si128(dst_ptr.add(48) as *mut __m128i, rgba3); - } - - rem = rem.chunks_exact_mut(16 * 4).into_remainder(); + for dst in rem.chunks_exact_mut(32 * 4) { + let src_ptr = dst.as_ptr(); + let rgba0 = _mm256_loadu_si256(src_ptr as *const __m256i); + let rgba1 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); + let rgba2 = _mm256_loadu_si256(src_ptr.add(64) as *const __m256i); + let rgba3 = _mm256_loadu_si256(src_ptr.add(96) as *const __m256i); + let (rrr, ggg, bbb, aaa) = avx2_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); + + let rrr = avx2_unpremultiply_row(rrr, aaa); + let ggg = avx2_unpremultiply_row(ggg, aaa); + let bbb = avx2_unpremultiply_row(bbb, aaa); + + let (rgba0, rgba1, rgba2, rgba3) = avx2_interleave_rgba(rrr, ggg, bbb, aaa); + + let dst_ptr = dst.as_mut_ptr(); + _mm256_storeu_si256(dst_ptr as *mut __m256i, rgba0); + _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, rgba1); + _mm256_storeu_si256(dst_ptr.add(64) as *mut __m256i, rgba2); + _mm256_storeu_si256(dst_ptr.add(96) as *mut __m256i, rgba3); + } + + rem = rem.chunks_exact_mut(32 * 4).into_remainder(); + + for dst in rem.chunks_exact_mut(16 * 4) { + let src_ptr = dst.as_ptr(); + let rgba0 = _mm_loadu_si128(src_ptr as *const __m128i); + let rgba1 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); + let rgba2 = 
_mm_loadu_si128(src_ptr.add(32) as *const __m128i); + let rgba3 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i); + let (rrr, ggg, bbb, aaa) = sse_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); + + let rrr = sse_unpremultiply_row(rrr, aaa); + let ggg = sse_unpremultiply_row(ggg, aaa); + let bbb = sse_unpremultiply_row(bbb, aaa); + + let (rgba0, rgba1, rgba2, rgba3) = sse_interleave_rgba(rrr, ggg, bbb, aaa); + + let dst_ptr = dst.as_mut_ptr(); + _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); + _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba1); + _mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, rgba2); + _mm_storeu_si128(dst_ptr.add(48) as *mut __m128i, rgba3); } + rem = rem.chunks_exact_mut(16 * 4).into_remainder(); + unpremultiply_alpha_rgba_row_impl(rem); } diff --git a/src/avx2/check_alpha.rs b/src/avx2/check_alpha.rs new file mode 100644 index 0000000..fe6ac32 --- /dev/null +++ b/src/avx2/check_alpha.rs @@ -0,0 +1,247 @@ +/* + * Copyright (c) Radzivon Bartoshyk 12/2024. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +use crate::sse::_mm_hsum_epi32; +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +/// Checks if image has constant alpha by xor rows +pub(crate) fn avx_has_non_constant_cap_alpha_rgba8( + store: &[u8], + width: usize, + stride: usize, +) -> bool { + unsafe { avx_has_non_constant_cap_alpha_rgba8_impl(store, width, stride) } +} + +#[target_feature(enable = "avx2")] +unsafe fn avx_has_non_constant_cap_alpha_rgba8_impl( + store: &[u8], + width: usize, + stride: usize, +) -> bool { + if store.is_empty() { + return true; + } + + let ash0 = _mm256_setr_epi8( + 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1, 3, -1, -1, -1, 7, -1, -1, -1, + 11, -1, -1, -1, 15, -1, -1, -1, + ); + + let sh0 = _mm_setr_epi8(3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1); + + let first_alpha = store[3]; + let def_alpha = _mm256_set1_epi32(first_alpha as i32); + + for row in store.chunks_exact(stride) { + let row = &row[0..width * 4]; + let mut sums = _mm256_set1_epi32(0); + + for chunk in row.chunks_exact(32 * 4) { + let mut r0 = _mm256_loadu_si256(chunk.as_ptr() as *const __m256i); + let mut r1 = _mm256_loadu_si256(chunk.get_unchecked(32..).as_ptr() as *const __m256i); + let mut r2 = _mm256_loadu_si256(chunk.get_unchecked(64..).as_ptr() as *const __m256i); + let mut r3 = _mm256_loadu_si256(chunk.get_unchecked(96..).as_ptr() as *const __m256i); + + r0 = _mm256_xor_si256(_mm256_shuffle_epi8(r0, ash0), def_alpha); + r1 = _mm256_xor_si256(_mm256_shuffle_epi8(r1, ash0), def_alpha); + r2 = _mm256_xor_si256(_mm256_shuffle_epi8(r2, ash0), def_alpha); + r3 = _mm256_xor_si256(_mm256_shuffle_epi8(r3, ash0), def_alpha); + + sums = _mm256_add_epi32(sums, r0); + sums = _mm256_add_epi32(sums, r1); + sums = _mm256_add_epi32(sums, r2); + sums = _mm256_add_epi32(sums, r3); + } + + let row = row.chunks_exact(32 * 4).remainder(); + + for chunk in row.chunks_exact(32) { + let mut r0 = _mm256_loadu_si256(chunk.as_ptr() as *const __m256i); + + r0 = _mm256_xor_si256(_mm256_shuffle_epi8(r0, ash0), def_alpha); + + sums = _mm256_add_epi32(sums, r0); + } + + let row = row.chunks_exact(32).remainder(); + + let mut sums = _mm_add_epi32( + _mm256_castsi256_si128(sums), + _mm256_extracti128_si256::<1>(sums), + ); + let def_alpha = _mm_set1_epi32(first_alpha as i32); + + for chunk in row.chunks_exact(16) { + let mut r0 = _mm_loadu_si128(chunk.as_ptr() as *const __m128i); + + r0 = _mm_shuffle_epi8(r0, sh0); + + let alphas = _mm_xor_si128(r0, def_alpha); + + sums = _mm_add_epi32(sums, alphas); + } + + let row = row.chunks_exact(16).remainder(); + + let mut h_sum = _mm_hsum_epi32(sums); + + for chunk in row.chunks_exact(4) { + h_sum += chunk[3] as i32 ^ first_alpha as i32; + } + + if h_sum != 0 { + return true; + } + } + false +} + +/// Checks if image has constant alpha by xor rows for image 16bits +pub(crate) fn avx_has_non_constant_cap_alpha_rgba16( + store: &[u16], + width: usize, + stride: usize, +) -> bool { + unsafe { avx_has_non_constant_cap_alpha_rgba16_impl(store, width, stride) } +} + +#[target_feature(enable = "avx2")] +unsafe fn avx_has_non_constant_cap_alpha_rgba16_impl( + store: &[u16], + width: usize, + stride: usize, +) -> bool { + if store.is_empty() { + return true; + } + + let ash0 = _mm256_setr_epi8( + 6, 7, -1, -1, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 6, 7, -1, -1, 14, 15, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + ); + + let first_alpha = store[3]; + let def_alpha = _mm256_set1_epi32(first_alpha as i32); + + for row in 
store.chunks_exact(stride) { + let row = &row[0..width * 4]; + let mut sums = _mm256_set1_epi32(0); + for chunk in row.chunks_exact(16 * 4) { + let mut r0 = _mm256_loadu_si256(chunk.as_ptr() as *const __m256i); + let mut r1 = _mm256_loadu_si256(chunk.get_unchecked(16..).as_ptr() as *const __m256i); + let mut r2 = _mm256_loadu_si256(chunk.get_unchecked(32..).as_ptr() as *const __m256i); + let mut r3 = _mm256_loadu_si256(chunk.get_unchecked(48..).as_ptr() as *const __m256i); + + r0 = _mm256_shuffle_epi8(r0, ash0); + r1 = _mm256_shuffle_epi8(r1, ash0); + r2 = _mm256_shuffle_epi8(r2, ash0); + r3 = _mm256_shuffle_epi8(r3, ash0); + + let r01 = _mm256_xor_si256(_mm256_unpacklo_epi32(r0, r1), def_alpha); + let r23 = _mm256_xor_si256(_mm256_unpacklo_epi32(r2, r3), def_alpha); + + sums = _mm256_add_epi32(sums, r01); + sums = _mm256_add_epi32(sums, r23); + } + + let row = row.chunks_exact(16 * 4).remainder(); + + for chunk in row.chunks_exact(16) { + let mut r0 = _mm256_loadu_si256(chunk.as_ptr() as *const __m256i); + + r0 = _mm256_shuffle_epi8(r0, ash0); + + let alphas = _mm256_xor_si256(_mm256_unpacklo_epi32(r0, r0), def_alpha); + + sums = _mm256_add_epi32(sums, alphas); + } + + let row = row.chunks_exact(16).remainder(); + + let mut h_sum = _mm_hsum_epi32(_mm_add_epi32( + _mm256_castsi256_si128(sums), + _mm256_extracti128_si256::<1>(sums), + )); + + for chunk in row.chunks_exact(4) { + h_sum += chunk[3] as i32 ^ first_alpha as i32; + } + + if h_sum != 0 { + return true; + } + } + false +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn check_alpha_exists_rgba8() { + let image_size = 256usize; + let mut image = vec![0u8; image_size * image_size * 4]; + image[3 + 150 * 4] = 75; + let has_alpha = avx_has_non_constant_cap_alpha_rgba8(&image, image_size, image_size * 4); + assert_eq!(true, has_alpha); + } + + #[test] + fn check_alpha_not_exists_rgba8() { + let image_size = 256usize; + let image = vec![255u8; image_size * image_size * 4]; + let has_alpha = avx_has_non_constant_cap_alpha_rgba8(&image, image_size, image_size * 4); + assert_eq!(false, has_alpha); + } + + #[test] + fn check_alpha_exists_rgba16() { + let image_size = 256usize; + let mut image = vec![0u16; image_size * image_size * 4]; + image[3] = 715; + image[7] = 715; + image[11] = 715; + image[15] = 715; + let has_alpha = avx_has_non_constant_cap_alpha_rgba16(&image, image_size, image_size * 4); + assert_eq!(true, has_alpha); + } + + #[test] + fn check_alpha_not_exists_rgba16() { + let image_size = 256usize; + let image = vec![255u16; image_size * image_size * 4]; + let has_alpha = avx_has_non_constant_cap_alpha_rgba16(&image, image_size, image_size * 4); + assert_eq!(false, has_alpha); + } +} diff --git a/src/avx2/mod.rs b/src/avx2/mod.rs index 39fb038..9cb38ad 100644 --- a/src/avx2/mod.rs +++ b/src/avx2/mod.rs @@ -32,6 +32,7 @@ mod alpha_f16; mod alpha_f32; mod alpha_u16; mod alpha_u8; +mod check_alpha; #[cfg(feature = "half")] mod rgba_f16; mod rgba_f32; @@ -51,6 +52,9 @@ pub(crate) use alpha_f32::avx_unpremultiply_alpha_rgba_f32; pub(crate) use alpha_u16::{avx_premultiply_alpha_rgba_u16, avx_unpremultiply_alpha_rgba_u16}; pub(crate) use alpha_u8::avx_premultiply_alpha_rgba; pub(crate) use alpha_u8::avx_unpremultiply_alpha_rgba; +pub(crate) use check_alpha::{ + avx_has_non_constant_cap_alpha_rgba16, avx_has_non_constant_cap_alpha_rgba8, +}; #[cfg(feature = "half")] pub(crate) use rgba_f16::{ convolve_horizontal_rgba_avx_row_one_f16, convolve_horizontal_rgba_avx_rows_4_f16, diff --git a/src/avx2/rgba_f16.rs 
b/src/avx2/rgba_f16.rs index 105b2f2..b2bc4d5 100644 --- a/src/avx2/rgba_f16.rs +++ b/src/avx2/rgba_f16.rs @@ -49,8 +49,7 @@ unsafe fn convolve_horizontal_parts_one_rgba_f16( let src_ptr = src.add(start_x * COMPONENTS); let rgb_pixel = _mm_loadu_si64(src_ptr as *const u8); let pixels = avx_combine_ps(_mm_cvtph_ps(rgb_pixel), _mm_setzero_ps()); - let acc = _mm256_fma_ps::(store_0, pixels, weight0); - acc + _mm256_fma_ps::(store_0, pixels, weight0) } #[inline(always)] @@ -70,8 +69,7 @@ unsafe fn convolve_horizontal_parts_4_rgba_f16( let rgb_pixel_1 = _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(rgb_pixels_row_0)); let acc = _mm256_fma_ps::(store_0, rgb_pixel_0, weight0); - let acc = _mm256_fma_ps::(acc, rgb_pixel_1, weight1); - acc + _mm256_fma_ps::(acc, rgb_pixel_1, weight1) } #[inline(always)] @@ -112,8 +110,7 @@ unsafe fn convolve_horizontal_parts_2_rgba_f16( const COMPONENTS: usize = 4; let src_ptr = src.add(start_x * COMPONENTS); let rgb_pixels = _mm_loadu_si128(src_ptr as *const __m128i); - let acc = _mm256_fma_ps::(store_0, _mm256_cvtph_ps(rgb_pixels), weight0); - acc + _mm256_fma_ps::(store_0, _mm256_cvtph_ps(rgb_pixels), weight0) } pub(crate) fn convolve_horizontal_rgba_avx_row_one_f16( diff --git a/src/avx2/rgba_u8_lb.rs b/src/avx2/rgba_u8_lb.rs index b38ad25..d9663d3 100644 --- a/src/avx2/rgba_u8_lb.rs +++ b/src/avx2/rgba_u8_lb.rs @@ -98,220 +98,214 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_impl( dst_stride: usize, filter_weights: &FilterWeights, ) { - unsafe { - const CHANNELS: usize = 4; + const CHANNELS: usize = 4; - const SCALE: i32 = 6; - const V_SHR: i32 = SCALE; - const ROUNDING: i16 = 1 << (V_SHR - 1); + const SCALE: i32 = 6; + const V_SHR: i32 = SCALE; + const ROUNDING: i16 = 1 << (V_SHR - 1); - let vld = _mm256_setr_epi16( - ROUNDING, ROUNDING, ROUNDING, ROUNDING, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ); + let vld = _mm256_setr_epi16( + ROUNDING, ROUNDING, ROUNDING, ROUNDING, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ); + + let shuffle_weights = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3); + + let (row0_ref, rest) = dst.split_at_mut(dst_stride); + let (row1_ref, rest) = rest.split_at_mut(dst_stride); + let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); + + let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS); + let iter_row1 = row1_ref.chunks_exact_mut(CHANNELS); + let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS); + let iter_row3 = row3_ref.chunks_exact_mut(CHANNELS); + + for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 + .zip(iter_row1) + .zip(iter_row2) + .zip(iter_row3) + .zip(filter_weights.bounds.iter()) + .zip( + filter_weights + .weights + .chunks_exact(filter_weights.aligned_size), + ) + { + let mut jx = 0usize; + let mut store_0 = vld; + let mut store_1 = vld; + let mut store_2 = vld; + let mut store_3 = vld; + + let src0 = src; + let src1 = src0.get_unchecked(src_stride..); + let src2 = src1.get_unchecked(src_stride..); + let src3 = src2.get_unchecked(src_stride..); + + while jx + 8 < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 8)); - let shuffle_weights = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3); - - let (row0_ref, rest) = dst.split_at_mut(dst_stride); - let (row1_ref, rest) = rest.split_at_mut(dst_stride); - let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); - - let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS); - let iter_row1 = row1_ref.chunks_exact_mut(CHANNELS); - let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS); - let iter_row3 = 
row3_ref.chunks_exact_mut(CHANNELS); - - for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 - .zip(iter_row1) - .zip(iter_row2) - .zip(iter_row3) - .zip(filter_weights.bounds.iter()) - .zip( - filter_weights - .weights - .chunks_exact(filter_weights.aligned_size), - ) - { - let mut jx = 0usize; - let mut store_0 = vld; - let mut store_1 = vld; - let mut store_2 = vld; - let mut store_3 = vld; - - let src0 = src; - let src1 = src0.get_unchecked(src_stride..); - let src2 = src1.get_unchecked(src_stride..); - let src3 = src2.get_unchecked(src_stride..); - - while jx + 8 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 8)); - - let w0 = w_ptr[0]; - let w1 = w_ptr[1]; - let w2 = w_ptr[2]; - let w3 = w_ptr[3]; - let w4 = w_ptr[4]; - let w5 = w_ptr[5]; - let w6 = w_ptr[6]; - let w7 = w_ptr[7]; - - let weights0 = _mm256_setr_epi16( - w0, w0, w0, w0, w1, w1, w1, w1, w4, w4, w4, w5, w5, w5, w5, w5, - ); - let weights1 = _mm256_setr_epi16( - w2, w2, w2, w2, w3, w3, w3, w3, w6, w6, w6, w6, w7, w7, w7, w7, - ); - - let start_bounds = bounds.start + jx; - - let rgb_pixel_0 = _mm256_loadu_si256( - src0.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m256i, - ); - let rgb_pixel_1 = _mm256_loadu_si256( - src1.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m256i, - ); - let rgb_pixel_2 = _mm256_loadu_si256( - src2.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m256i, - ); - let rgb_pixel_3 = _mm256_loadu_si256( - src3.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m256i, - ); - - store_0 = hdot4(store_0, rgb_pixel_0, weights0, weights1); - store_1 = hdot4(store_1, rgb_pixel_1, weights0, weights1); - store_2 = hdot4(store_2, rgb_pixel_2, weights0, weights1); - store_3 = hdot4(store_3, rgb_pixel_3, weights0, weights1); - - jx += 8; - } - - while jx + 4 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 4)); - - let w0 = w_ptr[0]; - let w1 = w_ptr[1]; - let w2 = w_ptr[2]; - let w3 = w_ptr[3]; - - let weights = _mm256_setr_epi16( - w0, w0, w0, w0, w1, w1, w1, w1, w2, w2, w2, w2, w3, w3, w3, w3, - ); - - let start_bounds = bounds.start + jx; - - let rgb_pixel_0 = - _mm256_permute4x64_epi64::<0x50>(_mm256_castsi128_si256(_mm_loadu_si128( - src0.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ))); - - let rgb_pixel_1 = - _mm256_permute4x64_epi64::<0x50>(_mm256_castsi128_si256(_mm_loadu_si128( - src1.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ))); - let rgb_pixel_2 = - _mm256_permute4x64_epi64::<0x50>(_mm256_castsi128_si256(_mm_loadu_si128( - src2.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ))); - let rgb_pixel_3 = - _mm256_permute4x64_epi64::<0x50>(_mm256_castsi128_si256(_mm_loadu_si128( - src3.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ))); - - store_0 = hdot2(store_0, rgb_pixel_0, weights); - store_1 = hdot2(store_1, rgb_pixel_1, weights); - store_2 = hdot2(store_2, rgb_pixel_2, weights); - store_3 = hdot2(store_3, rgb_pixel_3, weights); - - jx += 4; - } - - let mut store_0 = _mm_add_epi16( - _mm256_castsi256_si128(store_0), - _mm256_extracti128_si256::<1>(store_0), + let w0 = w_ptr[0]; + let w1 = w_ptr[1]; + let w2 = w_ptr[2]; + let w3 = w_ptr[3]; + let w4 = w_ptr[4]; + let w5 = w_ptr[5]; + let w6 = w_ptr[6]; + let w7 = w_ptr[7]; + + let weights0 = _mm256_setr_epi16( + w0, w0, w0, w0, w1, w1, w1, w1, w4, w4, w4, w5, w5, w5, w5, w5, ); - let mut store_1 = _mm_add_epi16( - 
_mm256_castsi256_si128(store_1), - _mm256_extracti128_si256::<1>(store_1), + let weights1 = _mm256_setr_epi16( + w2, w2, w2, w2, w3, w3, w3, w3, w6, w6, w6, w6, w7, w7, w7, w7, ); - let mut store_2 = _mm_add_epi16( - _mm256_castsi256_si128(store_2), - _mm256_extracti128_si256::<1>(store_2), + + let start_bounds = bounds.start + jx; + + let rgb_pixel_0 = _mm256_loadu_si256( + src0.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m256i, ); - let mut store_3 = _mm_add_epi16( - _mm256_castsi256_si128(store_3), - _mm256_extracti128_si256::<1>(store_3), + let rgb_pixel_1 = _mm256_loadu_si256( + src1.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m256i, ); - - while jx + 2 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 2)); - let bounds_start = bounds.start + jx; - - let weight01 = _mm_shuffle_epi8( - _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()), - shuffle_weights, - ); - - let rgb_pixel_0 = - _mm_loadu_si64(src0.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - let rgb_pixel_1 = - _mm_loadu_si64(src1.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - let rgb_pixel_2 = - _mm_loadu_si64(src2.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - let rgb_pixel_3 = - _mm_loadu_si64(src3.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - - store_0 = hdot(store_0, rgb_pixel_0, weight01); - store_1 = hdot(store_1, rgb_pixel_1, weight01); - store_2 = hdot(store_2, rgb_pixel_2, weight01); - store_3 = hdot(store_3, rgb_pixel_3, weight01); - - jx += 2; - } - - while jx < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 1)); - - let weight0 = _mm_set1_epi16(w_ptr[0]); - - let start_bounds = bounds.start + jx; - - store_0 = - convolve_horizontal_parts_one_rgba_sse(start_bounds, src0, weight0, store_0); - store_1 = - convolve_horizontal_parts_one_rgba_sse(start_bounds, src1, weight0, store_1); - store_2 = - convolve_horizontal_parts_one_rgba_sse(start_bounds, src2, weight0, store_2); - store_3 = - convolve_horizontal_parts_one_rgba_sse(start_bounds, src3, weight0, store_3); - jx += 1; - } - - store_0 = _mm_add_hi_lo_epi16(store_0); - store_1 = _mm_add_hi_lo_epi16(store_1); - store_2 = _mm_add_hi_lo_epi16(store_2); - store_3 = _mm_add_hi_lo_epi16(store_3); - - let store_16_8_0 = _mm_srai_epi16::(store_0); - let store_16_8_1 = _mm_srai_epi16::(store_1); - let store_16_8_2 = _mm_srai_epi16::(store_2); - let store_16_8_3 = _mm_srai_epi16::(store_3); - - _mm_storeu_si32( - chunk0.as_mut_ptr() as *mut _, - _mm_packus_epi16(store_16_8_0, store_16_8_0), + let rgb_pixel_2 = _mm256_loadu_si256( + src2.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m256i, ); - _mm_storeu_si32( - chunk1.as_mut_ptr() as *mut _, - _mm_packus_epi16(store_16_8_1, store_16_8_1), + let rgb_pixel_3 = _mm256_loadu_si256( + src3.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m256i, ); - _mm_storeu_si32( - chunk2.as_mut_ptr() as *mut _, - _mm_packus_epi16(store_16_8_2, store_16_8_2), + + store_0 = hdot4(store_0, rgb_pixel_0, weights0, weights1); + store_1 = hdot4(store_1, rgb_pixel_1, weights0, weights1); + store_2 = hdot4(store_2, rgb_pixel_2, weights0, weights1); + store_3 = hdot4(store_3, rgb_pixel_3, weights0, weights1); + + jx += 8; + } + + while jx + 4 < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 4)); + + let w0 = w_ptr[0]; + let w1 = w_ptr[1]; + let w2 = w_ptr[2]; + let w3 = w_ptr[3]; + + let weights = _mm256_setr_epi16( + w0, w0, w0, w0, w1, w1, w1, w1, w2, w2, w2, w2, w3, w3, w3, w3, ); - 
_mm_storeu_si32( - chunk3.as_mut_ptr() as *mut _, - _mm_packus_epi16(store_16_8_3, store_16_8_3), + + let start_bounds = bounds.start + jx; + + let rgb_pixel_0 = + _mm256_permute4x64_epi64::<0x50>(_mm256_castsi128_si256(_mm_loadu_si128( + src0.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, + ))); + + let rgb_pixel_1 = + _mm256_permute4x64_epi64::<0x50>(_mm256_castsi128_si256(_mm_loadu_si128( + src1.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, + ))); + let rgb_pixel_2 = + _mm256_permute4x64_epi64::<0x50>(_mm256_castsi128_si256(_mm_loadu_si128( + src2.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, + ))); + let rgb_pixel_3 = + _mm256_permute4x64_epi64::<0x50>(_mm256_castsi128_si256(_mm_loadu_si128( + src3.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, + ))); + + store_0 = hdot2(store_0, rgb_pixel_0, weights); + store_1 = hdot2(store_1, rgb_pixel_1, weights); + store_2 = hdot2(store_2, rgb_pixel_2, weights); + store_3 = hdot2(store_3, rgb_pixel_3, weights); + + jx += 4; + } + + let mut store_0 = _mm_add_epi16( + _mm256_castsi256_si128(store_0), + _mm256_extracti128_si256::<1>(store_0), + ); + let mut store_1 = _mm_add_epi16( + _mm256_castsi256_si128(store_1), + _mm256_extracti128_si256::<1>(store_1), + ); + let mut store_2 = _mm_add_epi16( + _mm256_castsi256_si128(store_2), + _mm256_extracti128_si256::<1>(store_2), + ); + let mut store_3 = _mm_add_epi16( + _mm256_castsi256_si128(store_3), + _mm256_extracti128_si256::<1>(store_3), + ); + + while jx + 2 < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 2)); + let bounds_start = bounds.start + jx; + + let weight01 = _mm_shuffle_epi8( + _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()), + shuffle_weights, ); + + let rgb_pixel_0 = + _mm_loadu_si64(src0.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); + let rgb_pixel_1 = + _mm_loadu_si64(src1.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); + let rgb_pixel_2 = + _mm_loadu_si64(src2.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); + let rgb_pixel_3 = + _mm_loadu_si64(src3.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); + + store_0 = hdot(store_0, rgb_pixel_0, weight01); + store_1 = hdot(store_1, rgb_pixel_1, weight01); + store_2 = hdot(store_2, rgb_pixel_2, weight01); + store_3 = hdot(store_3, rgb_pixel_3, weight01); + + jx += 2; } + + while jx < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 1)); + + let weight0 = _mm_set1_epi16(w_ptr[0]); + + let start_bounds = bounds.start + jx; + + store_0 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src0, weight0, store_0); + store_1 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src1, weight0, store_1); + store_2 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src2, weight0, store_2); + store_3 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src3, weight0, store_3); + jx += 1; + } + + store_0 = _mm_add_hi_lo_epi16(store_0); + store_1 = _mm_add_hi_lo_epi16(store_1); + store_2 = _mm_add_hi_lo_epi16(store_2); + store_3 = _mm_add_hi_lo_epi16(store_3); + + let store_16_8_0 = _mm_srai_epi16::(store_0); + let store_16_8_1 = _mm_srai_epi16::(store_1); + let store_16_8_2 = _mm_srai_epi16::(store_2); + let store_16_8_3 = _mm_srai_epi16::(store_3); + + _mm_storeu_si32( + chunk0.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_0, store_16_8_0), + ); + _mm_storeu_si32( + chunk1.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_1, store_16_8_1), + ); + _mm_storeu_si32( + 
chunk2.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_2, store_16_8_2), + ); + _mm_storeu_si32( + chunk3.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_3, store_16_8_3), + ); } } diff --git a/src/avx2/vertical_u16_lb.rs b/src/avx2/vertical_u16_lb.rs index c083c57..2622766 100644 --- a/src/avx2/vertical_u16_lb.rs +++ b/src/avx2/vertical_u16_lb.rs @@ -59,7 +59,7 @@ unsafe fn convolve_column_lb_avx_u16_impl( weight: &[i16], bit_depth: u32, ) { - assert!(bit_depth >= 1 && bit_depth <= 16); + assert!((1..=16).contains(&bit_depth)); let max_colors = (1 << bit_depth) - 1; let mut cx = 0usize; @@ -217,8 +217,8 @@ unsafe fn convolve_column_lb_avx_u16_impl( if bounds_size == 2 { let weights = weight.get_unchecked(0..2); - let v_weight0 = _mm_set1_epi32(weights[0] as i32); - let v_weight1 = _mm_set1_epi32(weights[1] as i32); + let v_weight0 = _mm_set1_epi16(weights[0]); + let v_weight1 = _mm_set1_epi16(weights[1]); let py = bounds.start; let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); @@ -238,9 +238,9 @@ unsafe fn convolve_column_lb_avx_u16_impl( } else if bounds_size == 3 { let weights = weight.get_unchecked(0..3); - let v_weight0 = _mm_set1_epi32(weights[0] as i32); - let v_weight1 = _mm_set1_epi32(weights[1] as i32); - let v_weight2 = _mm_set1_epi32(weights[2] as i32); + let v_weight0 = _mm_set1_epi16(weights[0]); + let v_weight1 = _mm_set1_epi16(weights[1]); + let v_weight2 = _mm_set1_epi16(weights[2]); let py = bounds.start; let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); @@ -267,10 +267,10 @@ unsafe fn convolve_column_lb_avx_u16_impl( } else if bounds_size == 4 { let weights = weight.get_unchecked(0..4); - let v_weight0 = _mm_set1_epi32(weights[0] as i32); - let v_weight1 = _mm_set1_epi32(weights[1] as i32); - let v_weight2 = _mm_set1_epi32(weights[2] as i32); - let v_weight3 = _mm_set1_epi32(weights[3] as i32); + let v_weight0 = _mm_set1_epi16(weights[0]); + let v_weight1 = _mm_set1_epi16(weights[1]); + let v_weight2 = _mm_set1_epi16(weights[2]); + let v_weight3 = _mm_set1_epi16(weights[3]); let py = bounds.start; let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); @@ -306,7 +306,7 @@ unsafe fn convolve_column_lb_avx_u16_impl( let py = bounds.start + j; let src_ptr = src.get_unchecked((src_stride * py + v_dx)..); - let v_weight = _mm_set1_epi32(k_weight as i32); + let v_weight = _mm_set1_epi16(k_weight); let item_row = _mm_loadu_si64(src_ptr.as_ptr() as *const u8); diff --git a/src/avx2/vertical_u8.rs b/src/avx2/vertical_u8.rs index 31fb2e6..cb1ddee 100644 --- a/src/avx2/vertical_u8.rs +++ b/src/avx2/vertical_u8.rs @@ -70,6 +70,7 @@ unsafe fn convolve_vertical_part_avx_64( ) { let zeros = _mm256_setzero_si256(); let vld = _mm256_set1_epi32(ROUNDING_CONST); + let mut store_0 = vld; let mut store_1 = vld; let mut store_2 = vld; @@ -129,7 +130,7 @@ unsafe fn convolve_vertical_part_avx_64( for j in jj..bounds_size { let py = start_y + j; let weight = *filter.get_unchecked(j); - let v_weight = _mm256_set1_epi32(weight as i32); + let v_weight = _mm256_set1_epi16(weight); let src_ptr = src.get_unchecked((src_stride * py + px)..); let item_row_0 = _mm256_loadu_si256(src_ptr.as_ptr() as *const __m256i); @@ -188,7 +189,7 @@ unsafe fn convolve_vertical_part_avx_32( for j in 0..bounds_size { let py = start_y + j; let weight = *filter.get_unchecked(j); - let v_weight = _mm256_set1_epi32(weight as i32); + let v_weight = _mm256_set1_epi16(weight); let src_ptr = src.get_unchecked((src_stride * py + px)..); let item_row = _mm256_loadu_si256(src_ptr.as_ptr() 
as *const __m256i); @@ -232,14 +233,14 @@ unsafe fn convolve_vertical_part_8_avx( for j in 0..bounds_size { let py = start_y + j; let weight = *filter.get_unchecked(j); - let v_weight = _mm256_set1_epi32(weight as i32); + let v_weight = _mm256_set1_epi16(weight); let src_ptr = src.get_unchecked((src_stride * py + px)..); let item_row = _mm256_cvtepu16_epi32(_mm_unpacklo_epi8( _mm_loadu_si64(src_ptr.as_ptr()), _mm_setzero_si128(), )); - store_0 = _mm256_add_epi32(store_0, _mm256_mullo_epi32(item_row, v_weight)); + store_0 = _mm256_add_epi32(store_0, _mm256_madd_epi16(item_row, v_weight)); } const MASK: i32 = shuffle(3, 1, 2, 0); @@ -278,8 +279,8 @@ unsafe fn convolve_vertical_part_avx( if bounds_size == 2 { let py = start_y; let weight = filter.get_unchecked(0..2); - let v_weight0 = _mm256_set1_epi32(weight[0] as i32); - let v_weight1 = _mm256_set1_epi32(weight[1] as i32); + let v_weight0 = _mm256_set1_epi16(weight[0]); + let v_weight1 = _mm256_set1_epi16(weight[1]); let src_ptr0 = src.get_unchecked(src_stride * py + px); let src_ptr1 = src.get_unchecked(src_stride * (py + 1) + px); let item_row0 = _mm256_insert_epi8::<0>(_mm256_setzero_si256(), *src_ptr0 as i8); @@ -290,9 +291,9 @@ unsafe fn convolve_vertical_part_avx( } else if bounds_size == 3 { let py = start_y; let weight = filter.get_unchecked(0..3); - let v_weight0 = _mm256_set1_epi32(weight[0] as i32); - let v_weight1 = _mm256_set1_epi32(weight[1] as i32); - let v_weight2 = _mm256_set1_epi32(weight[2] as i32); + let v_weight0 = _mm256_set1_epi16(weight[0]); + let v_weight1 = _mm256_set1_epi16(weight[1]); + let v_weight2 = _mm256_set1_epi16(weight[2]); let src_ptr0 = src.get_unchecked(src_stride * py + px); let src_ptr1 = src.get_unchecked(src_stride * (py + 1) + px); let src_ptr2 = src.get_unchecked(src_stride * (py + 2) + px); @@ -306,10 +307,10 @@ unsafe fn convolve_vertical_part_avx( } else if bounds_size == 4 { let py = start_y; let weight = filter.get_unchecked(0..4); - let v_weight0 = _mm256_set1_epi32(weight[0] as i32); - let v_weight1 = _mm256_set1_epi32(weight[1] as i32); - let v_weight2 = _mm256_set1_epi32(weight[2] as i32); - let v_weight3 = _mm256_set1_epi32(weight[3] as i32); + let v_weight0 = _mm256_set1_epi16(weight[0]); + let v_weight1 = _mm256_set1_epi16(weight[1]); + let v_weight2 = _mm256_set1_epi16(weight[2]); + let v_weight3 = _mm256_set1_epi16(weight[3]); let src_ptr0 = src.get_unchecked(src_stride * py + px); let src_ptr1 = src.get_unchecked(src_stride * (py + 1) + px); let src_ptr2 = src.get_unchecked(src_stride * (py + 2) + px); @@ -327,7 +328,7 @@ unsafe fn convolve_vertical_part_avx( for j in 0..bounds.size { let py = start_y + j; let weight = *filter.get_unchecked(j); - let v_weight = _mm256_set1_epi32(weight as i32); + let v_weight = _mm256_set1_epi16(weight); let src_ptr = src.get_unchecked(src_stride * py + px); let item_row = _mm256_setr_epi32(*src_ptr as i32, 0, 0, 0, 0, 0, 0, 0); diff --git a/src/avx2/vertical_u8_lp.rs b/src/avx2/vertical_u8_lp.rs index 7139282..f6dcda0 100644 --- a/src/avx2/vertical_u8_lp.rs +++ b/src/avx2/vertical_u8_lp.rs @@ -85,6 +85,7 @@ unsafe fn convolve_vertical_avx2_row_impl( let mut cx = 0usize; let mut rem = dst; + let iter_64 = rem.chunks_exact_mut(64); for dst in iter_64 { @@ -228,6 +229,7 @@ unsafe fn convolve_vertical_avx2_row_impl( } rem = rem.chunks_exact_mut(64).into_remainder(); + let iter_32 = rem.chunks_exact_mut(32); for dst in iter_32 { diff --git a/src/cpu_features.rs b/src/cpu_features.rs index 975fd4a..e110a04 100644 --- a/src/cpu_features.rs +++ 
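The switch from _mm256_set1_epi32/_mm256_mullo_epi32 to _mm256_set1_epi16/_mm256_madd_epi16 in the vertical kernels above keeps the same fixed-point arithmetic but folds the widening multiply and the adjacent-pair add into a single instruction (the upper half of each 32-bit lane is zero, so the pair add contributes nothing extra). A scalar sketch of the accumulation being vectorized, assuming i16 weights with PRECISION fractional bits and an illustrative tap layout:

fn convolve_column_fixed(taps: &[(u8, i16)], precision: u32) -> u8 {
    // Rounding offset seeded into the accumulator, like the ROUNDING_CONST splat.
    let mut acc: i32 = 1 << (precision - 1);
    for &(pixel, weight) in taps {
        // _mm256_madd_epi16 performs this multiply-accumulate per 32-bit lane.
        acc += pixel as i32 * weight as i32;
    }
    (acc >> precision).clamp(0, 255) as u8
}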
b/src/cpu_features.rs @@ -64,7 +64,7 @@ fn apple_has_cpu_feature(_feature_name: &str) -> bool { /// Test aarch64 cpu with *fp16* check, /// on *Apple* platform [libc](https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics) be used -#[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "half"))] +#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] pub(crate) fn is_aarch_f16_supported() -> bool { #[cfg(any(target_os = "macos", target_os = "ios"))] { diff --git a/src/dispatch_group_f16.rs b/src/dispatch_group_f16.rs index 03bd2d2..27ede3b 100644 --- a/src/dispatch_group_f16.rs +++ b/src/dispatch_group_f16.rs @@ -35,6 +35,7 @@ use rayon::iter::{IndexedParallelIterator, ParallelIterator}; use rayon::prelude::{ParallelSlice, ParallelSliceMut}; use rayon::ThreadPool; +#[allow(clippy::type_complexity)] pub(crate) fn convolve_vertical_dispatch_f16( image_store: &ImageStore, filter_weights: FilterWeights, @@ -78,6 +79,7 @@ pub(crate) fn convolve_vertical_dispatch_f16( } } +#[allow(clippy::type_complexity)] pub(crate) fn convolve_horizontal_dispatch_f16( image_store: &ImageStore, filter_weights: FilterWeights, diff --git a/src/dispatch_group_u16.rs b/src/dispatch_group_u16.rs index 0beee63..af33b6a 100644 --- a/src/dispatch_group_u16.rs +++ b/src/dispatch_group_u16.rs @@ -27,7 +27,11 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -use crate::filter_weights::FilterWeights; +#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +use crate::cpu_features::is_aarch_f16_supported; +use crate::filter_weights::{ + DefaultWeightsConverter, FilterBounds, FilterWeights, WeightsConverter, +}; use crate::handler_provider::{ ColumnHandlerFixedPoint, ColumnHandlerFloatingPoint, RowHandlerFixedPoint, RowHandlerFloatingPoint, @@ -189,25 +193,50 @@ pub(crate) fn convolve_vertical_dispatch_u16( ); }); } else { - let approx = filter_weights.numerical_approximation_i16::(0); - destination_image - .par_chunks_exact_mut(dst_stride) - .enumerate() - .for_each(|(y, row)| { - let bounds = filter_weights.bounds[y]; - let filter_offset = y * filter_weights.aligned_size; - let weights = &approx.weights[filter_offset..]; - let source_buffer = image_store.buffer.as_ref(); - u16::handle_fixed_column::( + #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + { + if is_aarch_f16_supported() { + use crate::filter_weights::WeightFloat16Converter; + execute_low_precision_row( + true, + image_store, + &filter_weights, + src_stride, + dst_stride, + bit_depth, dst_width, - &bounds, - source_buffer, - row, + destination_image, + HighBitDepthFloat16LowerHandler::default(), + WeightFloat16Converter::default(), + ); + } else { + execute_low_precision_row( + true, + image_store, + &filter_weights, src_stride, - weights, - bit_depth as u32, + dst_stride, + bit_depth, + dst_width, + destination_image, + DefaultHighBitDepthLowerHandler::default(), + DefaultWeightsConverter::default(), ); - }); + } + } + #[cfg(not(all(target_arch = "aarch64", target_feature = "neon")))] + execute_low_precision_row( + true, + image_store, + &filter_weights, + src_stride, + dst_stride, + bit_depth, + dst_width, + destination_image, + DefaultHighBitDepthLowerHandler::default(), + DefaultWeightsConverter::default(), + ); } }); } else if bit_depth > 12 { @@ -231,8 +260,145 @@ pub(crate) fn convolve_vertical_dispatch_u16( ); }); } else { - let destination_image = destination.buffer.borrow_mut(); - let approx = 
filter_weights.numerical_approximation_i16::(0); + #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + { + if is_aarch_f16_supported() { + use crate::filter_weights::WeightFloat16Converter; + execute_low_precision_row( + false, + image_store, + &filter_weights, + src_stride, + dst_stride, + bit_depth, + dst_width, + destination.buffer.borrow_mut(), + HighBitDepthFloat16LowerHandler::default(), + WeightFloat16Converter::default(), + ); + } else { + execute_low_precision_row( + false, + image_store, + &filter_weights, + src_stride, + dst_stride, + bit_depth, + dst_width, + destination.buffer.borrow_mut(), + DefaultHighBitDepthLowerHandler::default(), + DefaultWeightsConverter::default(), + ); + } + } + #[cfg(not(all(target_arch = "aarch64", target_feature = "neon")))] + execute_low_precision_row( + false, + image_store, + &filter_weights, + src_stride, + dst_stride, + bit_depth, + dst_width, + destination.buffer.borrow_mut(), + DefaultHighBitDepthLowerHandler::default(), + DefaultWeightsConverter::default(), + ); + } +} + +trait HandleHighBitDepthLower { + fn handle_fixed_column( + &self, + dst_width: usize, + bounds: &FilterBounds, + src: &[u16], + dst: &mut [u16], + src_stride: usize, + weight: &[i16], + bit_depth: u32, + ); +} + +#[derive(Default)] +struct DefaultHighBitDepthLowerHandler {} + +impl HandleHighBitDepthLower + for DefaultHighBitDepthLowerHandler +{ + fn handle_fixed_column( + &self, + dst_width: usize, + bounds: &FilterBounds, + src: &[u16], + dst: &mut [u16], + src_stride: usize, + weight: &[i16], + bit_depth: u32, + ) { + u16::handle_fixed_column::( + dst_width, bounds, src, dst, src_stride, weight, bit_depth, + ); + } +} + +#[derive(Default)] +#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +struct HighBitDepthFloat16LowerHandler {} + +#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +impl HandleHighBitDepthLower + for HighBitDepthFloat16LowerHandler +{ + fn handle_fixed_column( + &self, + dst_width: usize, + bounds: &FilterBounds, + src: &[u16], + dst: &mut [u16], + src_stride: usize, + weight: &[i16], + bit_depth: u32, + ) { + use crate::neon::convolve_column_lb_u16_f16; + convolve_column_lb_u16_f16(dst_width, bounds, src, dst, src_stride, weight, bit_depth); + } +} + +#[inline] +fn execute_low_precision_row( + is_parallel: bool, + image_store: &ImageStore, + filter_weights: &FilterWeights, + src_stride: usize, + dst_stride: usize, + bit_depth: usize, + dst_width: usize, + destination_image: &mut [u16], + handler: impl HandleHighBitDepthLower + Sync, + weights: impl WeightsConverter, +) { + let approx = weights.prepare_weights(filter_weights); + if is_parallel { + destination_image + .par_chunks_exact_mut(dst_stride) + .enumerate() + .for_each(|(y, row)| { + let bounds = filter_weights.bounds[y]; + let filter_offset = y * filter_weights.aligned_size; + let weights = &approx.weights[filter_offset..]; + let source_buffer = image_store.buffer.as_ref(); + handler.handle_fixed_column( + dst_width, + &bounds, + source_buffer, + row, + src_stride, + weights, + bit_depth as u32, + ); + }); + } else { destination_image .chunks_exact_mut(dst_stride) .enumerate() @@ -241,7 +407,7 @@ pub(crate) fn convolve_vertical_dispatch_u16( let filter_offset = y * filter_weights.aligned_size; let weights = &approx.weights[filter_offset..]; let source_buffer = image_store.buffer.as_ref(); - u16::handle_fixed_column::( + handler.handle_fixed_column( dst_width, &bounds, source_buffer, diff --git a/src/dispatch_group_u8.rs b/src/dispatch_group_u8.rs index 
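The refactor above collapses the duplicated parallel and serial column loops into one generic execute_low_precision_row, parameterized by a column handler and a weights converter, so the NEON f16 path and the default fixed-point path share a single driver. A self-contained miniature of that shape; the traits here are simplified stand-ins, and the rayon branch and per-row filter bounds are omitted:

trait Column {
    fn run(&self, src: &[u16], dst: &mut [u16], weights: &[i16]);
}

trait Weights {
    fn prepare(&self, weights: &[f32]) -> Vec<i16>;
}

fn execute_rows(
    rows: &mut [u16],
    row_len: usize,
    src: &[u16],
    taps: &[f32],
    handler: &impl Column,
    converter: &impl Weights,
) {
    // The weights are lowered once; every destination row then reuses them.
    let approx = converter.prepare(taps);
    for row in rows.chunks_exact_mut(row_len) {
        handler.run(src, row, &approx);
    }
}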
89d9eaa..bcdb3cf 100644 --- a/src/dispatch_group_u8.rs +++ b/src/dispatch_group_u8.rs @@ -27,7 +27,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -use crate::filter_weights::{FilterBounds, FilterWeights}; +use crate::filter_weights::{FilterBounds, FilterWeights, WeightsConverter}; use crate::image_store::ImageStoreMut; use crate::support::PRECISION; use crate::ImageStore; @@ -44,8 +44,9 @@ pub(crate) fn convolve_horizontal_dispatch_u8( pool: &Option, dispatcher_4_rows: Option)>, dispatcher_1_row: fn(&[u8], &mut [u8], &FilterWeights), + weights_converter: impl WeightsConverter, ) { - let approx_weights = filter_weights.numerical_approximation_i16::(0); + let approx_weights = weights_converter.prepare_weights(&filter_weights); let src = image_store.buffer.as_ref(); let dst = destination.buffer.borrow_mut(); diff --git a/src/f16.rs b/src/f16.rs index a445a9e..6573ace 100644 --- a/src/f16.rs +++ b/src/f16.rs @@ -95,16 +95,18 @@ fn convolve_horizontal_rgb_native_row_f16( convolve_row_handler_floating_point::(src, dst, filter_weights, 8) } -impl<'a> HorizontalConvolutionPass for ImageStore<'a, f16, 4> { +impl HorizontalConvolutionPass for ImageStore<'_, f16, 4> { fn convolve_horizontal( &self, filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, ) { + #[allow(clippy::type_complexity)] let mut _dispatcher_4_rows: Option< fn(usize, usize, &FilterWeights, &[f16], usize, &mut [f16], usize), > = Some(convolve_horizontal_rgba_4_row_f16::<4>); + #[allow(clippy::type_complexity)] let mut _dispatcher_row: fn(usize, usize, &FilterWeights, &[f16], &mut [f16]) = convolve_horizontal_rgb_native_row_f16::<4>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] @@ -167,13 +169,14 @@ fn convolve_vertical_rgb_native_row_f16( column_handler_floating_point::(bounds, src, dst, src_stride, weight, 8); } -impl<'a> VerticalConvolutionPass for ImageStore<'a, f16, 4> { +impl VerticalConvolutionPass for ImageStore<'_, f16, 4> { fn convolve_vertical( &self, filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, ) { + #[allow(clippy::type_complexity)] let mut _dispatcher: fn(usize, &FilterBounds, &[f16], &mut [f16], usize, &[f32]) = convolve_vertical_rgb_native_row_f16::<4>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] @@ -211,16 +214,18 @@ impl<'a> VerticalConvolutionPass for ImageStore<'a, f16, 4> { } } -impl<'a> HorizontalConvolutionPass for ImageStore<'a, f16, 3> { +impl HorizontalConvolutionPass for ImageStore<'_, f16, 3> { fn convolve_horizontal( &self, filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, ) { + #[allow(clippy::type_complexity)] let mut _dispatcher_4_rows: Option< fn(usize, usize, &FilterWeights, &[f16], usize, &mut [f16], usize), > = Some(convolve_horizontal_rgba_4_row_f16::<3>); + #[allow(clippy::type_complexity)] let mut _dispatcher_row: fn(usize, usize, &FilterWeights, &[f16], &mut [f16]) = convolve_horizontal_rgb_native_row_f16::<3>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] @@ -263,13 +268,14 @@ impl<'a> HorizontalConvolutionPass for ImageStore<'a, f16, 3> { } } -impl<'a> VerticalConvolutionPass for ImageStore<'a, f16, 3> { +impl VerticalConvolutionPass for ImageStore<'_, f16, 3> { fn convolve_vertical( &self, filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, ) { + #[allow(clippy::type_complexity)] let mut _dispatcher: fn(usize, &FilterBounds, &[f16], &mut [f16], usize, &[f32]) = 
convolve_vertical_rgb_native_row_f16::<3>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] @@ -307,13 +313,14 @@ impl<'a> VerticalConvolutionPass for ImageStore<'a, f16, 3> { } } -impl<'a> HorizontalConvolutionPass for ImageStore<'a, f16, 1> { +impl HorizontalConvolutionPass for ImageStore<'_, f16, 1> { fn convolve_horizontal( &self, filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, ) { + #[allow(clippy::type_complexity)] let _dispatcher_4_rows: Option< fn(usize, usize, &FilterWeights, &[f16], usize, &mut [f16], usize), > = Some(convolve_horizontal_rgba_4_row_f16::<1>); @@ -330,13 +337,14 @@ impl<'a> HorizontalConvolutionPass for ImageStore<'a, f16, 1> { } } -impl<'a> VerticalConvolutionPass for ImageStore<'a, f16, 1> { +impl VerticalConvolutionPass for ImageStore<'_, f16, 1> { fn convolve_vertical( &self, filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, ) { + #[allow(clippy::type_complexity)] let mut _dispatcher: fn(usize, &FilterBounds, &[f16], &mut [f16], usize, &[f32]) = convolve_vertical_rgb_native_row_f16::<1>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] diff --git a/src/filter_weights.rs b/src/filter_weights.rs index 6e2823d..b1694b7 100644 --- a/src/filter_weights.rs +++ b/src/filter_weights.rs @@ -109,3 +109,40 @@ impl FilterWeights { ) } } + +pub(crate) trait WeightsConverter { + fn prepare_weights(&self, weights: &FilterWeights) -> FilterWeights; +} + +#[derive(Default)] +pub(crate) struct DefaultWeightsConverter {} + +impl WeightsConverter for DefaultWeightsConverter { + fn prepare_weights(&self, weights: &FilterWeights) -> FilterWeights { + use crate::support::PRECISION; + weights.numerical_approximation_i16::(0) + } +} + +#[derive(Default)] +#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +pub(crate) struct WeightFloat16Converter {} + +#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +impl WeightsConverter for WeightFloat16Converter { + fn prepare_weights(&self, weights: &FilterWeights) -> FilterWeights { + use crate::neon::convert_weights_to_f16; + let converted_weights = convert_weights_to_f16(&weights.weights); + + let new_bounds = weights.bounds.to_vec(); + + FilterWeights::new( + converted_weights, + weights.kernel_size, + weights.kernel_size, + weights.distinct_elements, + weights.coeffs_size, + new_bounds, + ) + } +} diff --git a/src/image_store.rs b/src/image_store.rs index 2f7f490..beff353 100644 --- a/src/image_store.rs +++ b/src/image_store.rs @@ -26,10 +26,7 @@ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
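Of the two converters introduced in filter_weights.rs above, DefaultWeightsConverter keeps the usual fixed-point quantization, while WeightFloat16Converter presumably stores the half-precision bit patterns of the taps in the same i16 slots for the fp16 NEON column kernel (the generic parameters are not visible in this rendering of the patch). A crude stand-in for the fixed-point side; the crate's numerical_approximation_i16 is the authoritative version of this step:

fn quantize_weights_fixed(weights: &[f32], precision: u32) -> Vec<i16> {
    // Nearest-integer quantization to `precision` fractional bits.
    weights
        .iter()
        .map(|&w| (w * (1 << precision) as f32).round() as i16)
        .collect()
}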
*/ -use crate::alpha_check::{ - has_non_constant_cap_alpha_rgba16, has_non_constant_cap_alpha_rgba8, - has_non_constant_cap_alpha_rgba_f32, -}; +use crate::alpha_check::has_non_constant_cap_alpha_rgba_f32; #[cfg(feature = "half")] use crate::alpha_handle_f16::{premultiply_alpha_rgba_f16, unpremultiply_alpha_rgba_f16}; use crate::alpha_handle_f32::{premultiply_alpha_rgba_f32, unpremultiply_alpha_rgba_f32}; @@ -64,7 +61,7 @@ where /// Image height pub height: usize, /// This is private field, currently used only for u16, will be automatically passed from upper func - pub(crate) bit_depth: usize, + pub bit_depth: usize, } #[derive(Debug)] @@ -398,7 +395,39 @@ impl AssociateAlpha for ImageStore<'_, u8, 4> { premultiply_alpha_rgba(dst, src, self.width, self.height, pool); } + #[cfg(not(any( + any(target_arch = "x86_64", target_arch = "x86"), + all(target_arch = "aarch64", target_feature = "neon") + )))] + fn is_alpha_premultiplication_needed(&self) -> bool { + use crate::alpha_check::has_non_constant_cap_alpha_rgba8; + has_non_constant_cap_alpha_rgba8(self.buffer.as_ref(), self.width) + } + + #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] fn is_alpha_premultiplication_needed(&self) -> bool { + use crate::neon::neon_has_non_constant_cap_alpha_rgba8; + neon_has_non_constant_cap_alpha_rgba8(self.buffer.as_ref(), self.width, self.width * 4) + } + + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + fn is_alpha_premultiplication_needed(&self) -> bool { + use crate::alpha_check::has_non_constant_cap_alpha_rgba8; + use crate::avx2::avx_has_non_constant_cap_alpha_rgba8; + use crate::sse::sse_has_non_constant_cap_alpha_rgba8; + if std::arch::is_x86_feature_detected!("avx2") { + return avx_has_non_constant_cap_alpha_rgba8( + self.buffer.as_ref(), + self.width, + self.width * 4, + ); + } else if std::arch::is_x86_feature_detected!("sse4.1") { + return sse_has_non_constant_cap_alpha_rgba8( + self.buffer.as_ref(), + self.width, + self.width * 4, + ); + } has_non_constant_cap_alpha_rgba8(self.buffer.as_ref(), self.width) } } @@ -414,10 +443,42 @@ impl AssociateAlpha for ImageStore<'_, u16, 4> { fn premultiply_alpha(&self, into: &mut ImageStoreMut<'_, u16, 4>, pool: &Option) { let dst = into.buffer.borrow_mut(); let src = self.buffer.as_ref(); - premultiply_alpha_rgba_u16(dst, src, self.width, self.height, self.bit_depth, pool); + premultiply_alpha_rgba_u16(dst, src, self.width, self.height, into.bit_depth, pool); + } + + #[cfg(not(any( + any(target_arch = "x86_64", target_arch = "x86"), + all(target_arch = "aarch64", target_feature = "neon") + )))] + fn is_alpha_premultiplication_needed(&self) -> bool { + use crate::alpha_check::has_non_constant_cap_alpha_rgba16; + has_non_constant_cap_alpha_rgba16(self.buffer.as_ref(), self.width) + } + + #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + fn is_alpha_premultiplication_needed(&self) -> bool { + use crate::neon::neon_has_non_constant_cap_alpha_rgba16; + neon_has_non_constant_cap_alpha_rgba16(self.buffer.as_ref(), self.width, self.width * 4) } + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] fn is_alpha_premultiplication_needed(&self) -> bool { + use crate::alpha_check::has_non_constant_cap_alpha_rgba16; + use crate::avx2::avx_has_non_constant_cap_alpha_rgba16; + use crate::sse::sse_has_non_constant_cap_alpha_rgba16; + if std::arch::is_x86_feature_detected!("avx2") { + return avx_has_non_constant_cap_alpha_rgba16( + self.buffer.as_ref(), + self.width, + self.width * 4, + ); + } else if 
std::arch::is_x86_feature_detected!("sse4.1") { + return sse_has_non_constant_cap_alpha_rgba16( + self.buffer.as_ref(), + self.width, + self.width * 4, + ); + } has_non_constant_cap_alpha_rgba16(self.buffer.as_ref(), self.width) } } @@ -472,3 +533,27 @@ impl UnassociateAlpha for ImageStoreMut<'_, half::f16, 4> { unpremultiply_alpha_rgba_f16(dst, self.width, self.height, pool); } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn image_store_alpha_test_rgba8() { + let image_size = 256usize; + let mut image = vec![0u8; image_size * image_size * 4]; + image[3 + 150 * 4] = 75; + let store = ImageStore::::from_slice(&image, image_size, image_size).unwrap(); + let has_alpha = store.is_alpha_premultiplication_needed(); + assert_eq!(true, has_alpha); + } + + #[test] + fn check_alpha_not_exists_rgba8() { + let image_size = 256usize; + let image = vec![255u8; image_size * image_size * 4]; + let store = ImageStore::::from_slice(&image, image_size, image_size).unwrap(); + let has_alpha = store.is_alpha_premultiplication_needed(); + assert_eq!(false, has_alpha); + } +} diff --git a/src/neon/alpha_u16.rs b/src/neon/alpha_u16.rs index bc7048f..084f989 100644 --- a/src/neon/alpha_u16.rs +++ b/src/neon/alpha_u16.rs @@ -26,7 +26,9 @@ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -use crate::alpha_handle_u16::{premultiply_alpha_rgba_row, unpremultiply_alpha_rgba_row}; +use crate::cpu_features::is_aarch_f16_supported; +use crate::neon::f16_utils::{xvcvtaq_u16_f16, xvcvtq_f16_u16, xvdivq_f16, xvmulq_f16}; +use crate::neon::xreinterpretq_f16_u16; use rayon::iter::{IndexedParallelIterator, ParallelIterator}; use rayon::prelude::{ParallelSlice, ParallelSliceMut}; use rayon::ThreadPool; @@ -47,118 +49,199 @@ unsafe fn neon_div_by_65535_n(v: uint32x4_t) -> uint16x4_t { vqrshrn_n_u32::<16>(vrsraq_n_u32::<16>(v, v)) } -pub fn neon_premultiply_alpha_rgba_row_u16(dst: &mut [u16], src: &[u16], bit_depth: usize) { - assert_ne!(bit_depth, 0, "Something goes wrong!"); - let max_colors = (1 << bit_depth) - 1; +#[inline(always)] +unsafe fn neon_div_by(v: uint32x4_t) -> uint16x4_t { + match BIT_DEPTH { + 10 => neon_div_by_1023_n(v), + 12 => neon_div_by_4095_n(v), + 16 => neon_div_by_65535_n(v), + _ => neon_div_by_1023_n(v), + } +} - let v_max_colors_scale = unsafe { vdupq_n_f32((1. 
/ max_colors as f64) as f32) }; +trait NeonPremultiplyExecutor { + unsafe fn premultiply(&self, dst: &mut [u16], src: &[u16], bit_depth: usize); +} - let mut rem = dst; - let mut src_rem = src; +#[derive(Default)] +struct NeonPremultiplyExecutorDefault {} - unsafe { - if bit_depth == 10 { - for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { - let pixel = vld4q_u16(src.as_ptr()); +impl NeonPremultiplyExecutor for NeonPremultiplyExecutorDefault { + unsafe fn premultiply(&self, dst: &mut [u16], src: &[u16], bit_depth: usize) { + assert_ne!(bit_depth, 0, "Something goes wrong!"); + assert!((1..=16).contains(&bit_depth)); - let low_a = vget_low_u16(pixel.3); + let mut rem = dst; + let mut src_rem = src; + for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { + let pixel = vld4q_u16(src.as_ptr()); + + let low_a = vget_low_u16(pixel.3); + + let new_r = vcombine_u16( + neon_div_by::(vmull_u16(vget_low_u16(pixel.0), low_a)), + neon_div_by::(vmull_high_u16(pixel.0, pixel.3)), + ); + + let new_g = vcombine_u16( + neon_div_by::(vmull_u16(vget_low_u16(pixel.1), low_a)), + neon_div_by::(vmull_high_u16(pixel.1, pixel.3)), + ); - let new_r = vcombine_u16( - neon_div_by_1023_n(vmull_u16(vget_low_u16(pixel.0), low_a)), - neon_div_by_1023_n(vmull_high_u16(pixel.0, pixel.3)), - ); + let new_b = vcombine_u16( + neon_div_by::(vmull_u16(vget_low_u16(pixel.2), low_a)), + neon_div_by::(vmull_high_u16(pixel.2, pixel.3)), + ); - let new_g = vcombine_u16( - neon_div_by_1023_n(vmull_u16(vget_low_u16(pixel.1), low_a)), - neon_div_by_1023_n(vmull_high_u16(pixel.1, pixel.3)), - ); + let new_px = uint16x8x4_t(new_r, new_g, new_b, pixel.3); - let new_b = vcombine_u16( - neon_div_by_1023_n(vmull_u16(vget_low_u16(pixel.2), low_a)), - neon_div_by_1023_n(vmull_high_u16(pixel.2, pixel.3)), - ); + vst4q_u16(dst.as_mut_ptr(), new_px); + } - let new_px = uint16x8x4_t(new_r, new_g, new_b, pixel.3); + rem = rem.chunks_exact_mut(8 * 4).into_remainder(); + src_rem = src_rem.chunks_exact(8 * 4).remainder(); - vst4q_u16(dst.as_mut_ptr(), new_px); - } - } else if bit_depth == 12 { - for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { - let pixel = vld4q_u16(src.as_ptr()); + if !rem.is_empty() { + assert!(src_rem.len() < 8 * 4); + assert!(rem.len() < 8 * 4); + let mut buffer: [u16; 8 * 4] = [0u16; 8 * 4]; + std::ptr::copy_nonoverlapping(rem.as_ptr(), buffer.as_mut_ptr(), src_rem.len()); - let low_a = vget_low_u16(pixel.3); + let pixel = vld4q_u16(buffer.as_ptr()); - let new_r = vcombine_u16( - neon_div_by_4095_n(vmull_u16(vget_low_u16(pixel.0), low_a)), - neon_div_by_4095_n(vmull_high_u16(pixel.0, pixel.3)), - ); + let low_a = vget_low_u16(pixel.3); - let new_g = vcombine_u16( - neon_div_by_4095_n(vmull_u16(vget_low_u16(pixel.1), low_a)), - neon_div_by_4095_n(vmull_high_u16(pixel.1, pixel.3)), - ); + let new_r = vcombine_u16( + neon_div_by::(vmull_u16(vget_low_u16(pixel.0), low_a)), + neon_div_by::(vmull_high_u16(pixel.0, pixel.3)), + ); - let new_b = vcombine_u16( - neon_div_by_4095_n(vmull_u16(vget_low_u16(pixel.2), low_a)), - neon_div_by_4095_n(vmull_high_u16(pixel.2, pixel.3)), - ); + let new_g = vcombine_u16( + neon_div_by::(vmull_u16(vget_low_u16(pixel.1), low_a)), + neon_div_by::(vmull_high_u16(pixel.1, pixel.3)), + ); - let new_px = uint16x8x4_t(new_r, new_g, new_b, pixel.3); + let new_b = vcombine_u16( + neon_div_by::(vmull_u16(vget_low_u16(pixel.2), low_a)), + neon_div_by::(vmull_high_u16(pixel.2, pixel.3)), + ); - vst4q_u16(dst.as_mut_ptr(), 
new_px); - } - } else if bit_depth == 16 { - for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { - let pixel = vld4q_u16(src.as_ptr()); + let new_px = uint16x8x4_t(new_r, new_g, new_b, pixel.3); - let low_a = vget_low_u16(pixel.3); + vst4q_u16(buffer.as_mut_ptr(), new_px); - let new_r = vcombine_u16( - neon_div_by_65535_n(vmull_u16(vget_low_u16(pixel.0), low_a)), - neon_div_by_65535_n(vmull_high_u16(pixel.0, pixel.3)), - ); + std::ptr::copy_nonoverlapping(buffer.as_ptr(), rem.as_mut_ptr(), rem.len()); + } + } +} - let new_g = vcombine_u16( - neon_div_by_65535_n(vmull_u16(vget_low_u16(pixel.1), low_a)), - neon_div_by_65535_n(vmull_high_u16(pixel.1, pixel.3)), - ); +#[derive(Default)] +struct NeonPremultiplyExecutorAnyBitDepth {} - let new_b = vcombine_u16( - neon_div_by_65535_n(vmull_u16(vget_low_u16(pixel.2), low_a)), - neon_div_by_65535_n(vmull_high_u16(pixel.2, pixel.3)), - ); +impl NeonPremultiplyExecutor for NeonPremultiplyExecutorAnyBitDepth { + unsafe fn premultiply(&self, dst: &mut [u16], src: &[u16], bit_depth: usize) { + assert_ne!(bit_depth, 0, "Something goes wrong!"); + assert!((1..=16).contains(&bit_depth)); + let max_colors = (1 << bit_depth) - 1; + let mut rem = dst; + let mut src_rem = src; - let new_px = uint16x8x4_t(new_r, new_g, new_b, pixel.3); + let v_max_colors_scale = vdupq_n_f32((1. / max_colors as f64) as f32); - vst4q_u16(dst.as_mut_ptr(), new_px); - } - } else { - for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { - let pixel = vld4q_u16(src.as_ptr()); + for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { + let pixel = vld4q_u16(src.as_ptr()); - let low_a = vmovl_u16(vget_low_u16(pixel.3)); - let high_a = vmovl_high_u16(pixel.3); + let low_a = vmovl_u16(vget_low_u16(pixel.3)); + let high_a = vmovl_high_u16(pixel.3); - let low_a = vmulq_f32(vcvtq_f32_u32(low_a), v_max_colors_scale); - let hi_a = vmulq_f32(vcvtq_f32_u32(high_a), v_max_colors_scale); + let low_a = vmulq_f32(vcvtq_f32_u32(low_a), v_max_colors_scale); + let hi_a = vmulq_f32(vcvtq_f32_u32(high_a), v_max_colors_scale); - let new_r = v_scale_by_alpha(pixel.0, low_a, hi_a); + let new_r = v_scale_by_alpha(pixel.0, low_a, hi_a); - let new_g = v_scale_by_alpha(pixel.1, low_a, hi_a); + let new_g = v_scale_by_alpha(pixel.1, low_a, hi_a); - let new_b = v_scale_by_alpha(pixel.2, low_a, hi_a); + let new_b = v_scale_by_alpha(pixel.2, low_a, hi_a); - let new_px = uint16x8x4_t(new_r, new_g, new_b, pixel.3); + let new_px = uint16x8x4_t(new_r, new_g, new_b, pixel.3); - vst4q_u16(dst.as_mut_ptr(), new_px); - } + vst4q_u16(dst.as_mut_ptr(), new_px); } rem = rem.chunks_exact_mut(8 * 4).into_remainder(); src_rem = src_rem.chunks_exact(8 * 4).remainder(); + + if !rem.is_empty() { + assert!(src_rem.len() < 8 * 4); + assert!(rem.len() < 8 * 4); + let mut buffer: [u16; 8 * 4] = [0u16; 8 * 4]; + std::ptr::copy_nonoverlapping(rem.as_ptr(), buffer.as_mut_ptr(), src_rem.len()); + + let pixel = vld4q_u16(buffer.as_ptr()); + + let low_a = vmovl_u16(vget_low_u16(pixel.3)); + let high_a = vmovl_high_u16(pixel.3); + + let low_a = vmulq_f32(vcvtq_f32_u32(low_a), v_max_colors_scale); + let hi_a = vmulq_f32(vcvtq_f32_u32(high_a), v_max_colors_scale); + + let new_r = v_scale_by_alpha(pixel.0, low_a, hi_a); + + let new_g = v_scale_by_alpha(pixel.1, low_a, hi_a); + + let new_b = v_scale_by_alpha(pixel.2, low_a, hi_a); + + let new_px = uint16x8x4_t(new_r, new_g, new_b, pixel.3); + + vst4q_u16(buffer.as_mut_ptr(), new_px); + + 
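The premultiply executors above compute the same product; the const-generic NeonPremultiplyExecutorDefault only specializes the division by the bit-depth maximum so the shift-based neon_div_by_* kernels are chosen at compile time, while the any-bit-depth fallback scales through f32. The plain scalar form they approximate, assuming color and alpha share one bit depth (the helper name is illustrative):

fn premultiply_u16(color: u16, alpha: u16, bit_depth: u32) -> u16 {
    let max = (1u32 << bit_depth) - 1;
    // color * alpha / max, rounded to nearest; the NEON kernels reach this
    // result (up to rounding) with vrsraq/vqrshrn shifts instead of dividing.
    ((color as u32 * alpha as u32 + max / 2) / max) as u16
}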
std::ptr::copy_nonoverlapping(buffer.as_ptr(), rem.as_mut_ptr(), rem.len()); + } + } +} + +fn neon_premultiply_alpha_rgba_row_u16(dst: &mut [u16], src: &[u16], bit_depth: usize) { + assert_ne!(bit_depth, 0, "Something goes wrong!"); + + if bit_depth == 10 { + neon_pa_dispatch( + dst, + src, + bit_depth, + NeonPremultiplyExecutorDefault::<10>::default(), + ) + } else if bit_depth == 12 { + neon_pa_dispatch( + dst, + src, + bit_depth, + NeonPremultiplyExecutorDefault::<12>::default(), + ) + } else if bit_depth == 16 { + neon_pa_dispatch( + dst, + src, + bit_depth, + NeonPremultiplyExecutorDefault::<16>::default(), + ) + } else { + neon_pa_dispatch( + dst, + src, + bit_depth, + NeonPremultiplyExecutorAnyBitDepth::default(), + ) } +} - premultiply_alpha_rgba_row(rem, src_rem, max_colors); +#[inline] +fn neon_pa_dispatch( + dst: &mut [u16], + src: &[u16], + bit_depth: usize, + dispatch: impl NeonPremultiplyExecutor, +) { + unsafe { dispatch.premultiply(dst, src, bit_depth) } } pub(crate) fn neon_premultiply_alpha_rgba_u16( @@ -204,12 +287,19 @@ unsafe fn v_scale_by_alpha( vcombine_u16(vmovn_u32(new_ll), vmovn_u32(new_lh)) } -fn neon_unpremultiply_alpha_rgba_row_u16(in_place: &mut [u16], bit_depth: usize) { - let max_colors = (1 << bit_depth) - 1; +trait DisassociateAlpha { + unsafe fn disassociate(&self, in_place: &mut [u16], bit_depth: usize); +} - let mut rem = in_place; +#[derive(Default)] +struct NeonDisassociateAlpha {} + +impl DisassociateAlpha for NeonDisassociateAlpha { + unsafe fn disassociate(&self, in_place: &mut [u16], bit_depth: usize) { + let max_colors = (1 << bit_depth) - 1; + + let mut rem = in_place; - unsafe { let v_max_colors_f = vdupq_n_f32(max_colors as f32); let ones = vdupq_n_f32(1.); for dst in rem.chunks_exact_mut(8 * 4) { @@ -245,11 +335,165 @@ fn neon_unpremultiply_alpha_rgba_row_u16(in_place: &mut [u16], bit_depth: usize) vst4q_u16(dst.as_mut_ptr(), new_px); } + rem = rem.chunks_exact_mut(8 * 4).into_remainder(); + + if !rem.is_empty() { + assert!(rem.len() < 8 * 4); + let mut buffer: [u16; 8 * 4] = [0u16; 8 * 4]; + std::ptr::copy_nonoverlapping(rem.as_ptr(), buffer.as_mut_ptr(), rem.len()); + + let pixel = vld4q_u16(buffer.as_ptr()); + + let is_alpha_zero_mask = vceqzq_u16(pixel.3); + + let low_a = vmovl_u16(vget_low_u16(pixel.3)); + let high_a = vmovl_high_u16(pixel.3); + + let low_a = vmulq_f32(vdivq_f32(ones, vcvtq_f32_u32(low_a)), v_max_colors_f); + let hi_a = vmulq_f32(vdivq_f32(ones, vcvtq_f32_u32(high_a)), v_max_colors_f); + + let new_r = vbslq_u16( + is_alpha_zero_mask, + pixel.0, + v_scale_by_alpha(pixel.0, low_a, hi_a), + ); + + let new_g = vbslq_u16( + is_alpha_zero_mask, + pixel.1, + v_scale_by_alpha(pixel.1, low_a, hi_a), + ); + + let new_b = vbslq_u16( + is_alpha_zero_mask, + pixel.2, + v_scale_by_alpha(pixel.2, low_a, hi_a), + ); + + let new_px = uint16x8x4_t(new_r, new_g, new_b, pixel.3); + + vst4q_u16(buffer.as_mut_ptr(), new_px); + + std::ptr::copy_nonoverlapping(buffer.as_ptr(), rem.as_mut_ptr(), rem.len()); + } + } +} + +#[derive(Default)] +struct NeonDisassociateAlphaFloat16 {} + +impl DisassociateAlpha for NeonDisassociateAlphaFloat16 { + #[target_feature(enable = "fp16")] + unsafe fn disassociate(&self, in_place: &mut [u16], bit_depth: usize) { + let max_colors = (1 << bit_depth) - 1; + + let mut rem = in_place; + + let v_max_colors_f = xvcvtq_f16_u16(vdupq_n_u16(max_colors as u16)); + + let ones = xreinterpretq_f16_u16(vdupq_n_u16(15360)); // 15360 = 1f16 + + for dst in rem.chunks_exact_mut(8 * 4) { + let pixel = vld4q_u16(dst.as_ptr()); + + 
let is_alpha_zero_mask = vceqzq_u16(pixel.3); + + let a_vals = xvdivq_f16(ones, xvcvtq_f16_u16(pixel.3)); + + let new_r = vbslq_u16( + is_alpha_zero_mask, + pixel.0, + xvcvtaq_u16_f16(xvmulq_f16( + xvmulq_f16(xvcvtq_f16_u16(pixel.0), a_vals), + v_max_colors_f, + )), + ); + + let new_g = vbslq_u16( + is_alpha_zero_mask, + pixel.1, + xvcvtaq_u16_f16(xvmulq_f16( + xvmulq_f16(xvcvtq_f16_u16(pixel.1), a_vals), + v_max_colors_f, + )), + ); + + let new_b = vbslq_u16( + is_alpha_zero_mask, + pixel.2, + xvcvtaq_u16_f16(xvmulq_f16( + xvmulq_f16(xvcvtq_f16_u16(pixel.2), a_vals), + v_max_colors_f, + )), + ); + + let new_px = uint16x8x4_t(new_r, new_g, new_b, pixel.3); + + vst4q_u16(dst.as_mut_ptr(), new_px); + } rem = rem.chunks_exact_mut(8 * 4).into_remainder(); + + if !rem.is_empty() { + assert!(rem.len() < 8 * 4); + let mut buffer: [u16; 8 * 4] = [0u16; 8 * 4]; + std::ptr::copy_nonoverlapping(rem.as_ptr(), buffer.as_mut_ptr(), rem.len()); + + let pixel = vld4q_u16(buffer.as_ptr()); + + let is_alpha_zero_mask = vceqzq_u16(pixel.3); + + let a_vals = xvdivq_f16(ones, xvcvtq_f16_u16(pixel.3)); + + let new_r = vbslq_u16( + is_alpha_zero_mask, + pixel.0, + xvcvtaq_u16_f16(xvmulq_f16( + xvmulq_f16(xvcvtq_f16_u16(pixel.0), a_vals), + v_max_colors_f, + )), + ); + + let new_g = vbslq_u16( + is_alpha_zero_mask, + pixel.1, + xvcvtaq_u16_f16(xvmulq_f16( + xvmulq_f16(xvcvtq_f16_u16(pixel.1), a_vals), + v_max_colors_f, + )), + ); + + let new_b = vbslq_u16( + is_alpha_zero_mask, + pixel.2, + xvcvtaq_u16_f16(xvmulq_f16( + xvmulq_f16(xvcvtq_f16_u16(pixel.2), a_vals), + v_max_colors_f, + )), + ); + + let new_px = uint16x8x4_t(new_r, new_g, new_b, pixel.3); + + vst4q_u16(buffer.as_mut_ptr(), new_px); + + std::ptr::copy_nonoverlapping(buffer.as_ptr(), rem.as_mut_ptr(), rem.len()); + } } +} + +#[inline] +unsafe fn neon_un_row(in_place: &mut [u16], bit_depth: usize, handler: impl DisassociateAlpha) { + handler.disassociate(in_place, bit_depth); +} - unpremultiply_alpha_rgba_row(rem, max_colors); +fn neon_unpremultiply_alpha_rgba_row_u16(in_place: &mut [u16], bit_depth: usize) { + unsafe { + if (bit_depth == 10 || bit_depth == 12) && is_aarch_f16_supported() { + neon_un_row(in_place, bit_depth, NeonDisassociateAlphaFloat16::default()); + } else { + neon_un_row(in_place, bit_depth, NeonDisassociateAlpha::default()); + } + } } pub(crate) fn neon_unpremultiply_alpha_rgba_u16( diff --git a/src/neon/alpha_u8.rs b/src/neon/alpha_u8.rs index c296a63..553d992 100644 --- a/src/neon/alpha_u8.rs +++ b/src/neon/alpha_u8.rs @@ -27,7 +27,10 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -use crate::alpha_handle_u8::{premultiply_alpha_rgba_row_impl, unpremultiply_alpha_rgba_row_impl}; +use crate::alpha_handle_u8::premultiply_alpha_rgba_row_impl; +use crate::cpu_features::is_aarch_f16_supported; +use crate::neon::f16_utils::{xvcvtaq_u16_f16, xvcvtq_f16_u16, xvmulq_f16, xvrecpeq_f16}; +use crate::neon::xreinterpretq_f16_u16; use rayon::iter::{IndexedParallelIterator, ParallelIterator}; use rayon::prelude::{ParallelSlice, ParallelSliceMut}; use rayon::ThreadPool; @@ -48,33 +51,6 @@ macro_rules! premultiply_vec { }}; } -macro_rules! 
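The fp16 disassociate path above is only selected for 10- and 12-bit buffers when is_aarch_f16_supported() reports the extension; it swaps the 32-bit float divide for a half-precision reciprocal, which is faster at the cost of a little precision, and 16-bit data stays on the f32 path, not least because 65535 cannot be represented exactly in f16. Both variants approximate this scalar operation, where fully transparent pixels are passed through unchanged to match the vbslq_u16 masking (the helper name is illustrative):

fn unpremultiply_u16(color: u16, alpha: u16, bit_depth: u32) -> u16 {
    if alpha == 0 {
        return color;
    }
    let max = (1u32 << bit_depth) - 1;
    ((color as u32 * max + alpha as u32 / 2) / alpha as u32).min(max) as u16
}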
unpremultiply_vec { - ($v: expr, $a_values: expr) => {{ - let scale = vdupq_n_u8(255); - let hi = vmull_high_u8($v, scale); - let lo = vmull_u8(vget_low_u8($v), vget_low_u8(scale)); - let lo_lo = vcvtq_f32_u32(vmovl_u16(vget_low_u16(lo))); - let lo_hi = vcvtq_f32_u32(vmovl_high_u16(lo)); - let hi_lo = vcvtq_f32_u32(vmovl_u16(vget_low_u16(hi))); - let hi_hi = vcvtq_f32_u32(vmovl_high_u16(hi)); - let zero_mask = vmvnq_u8(vceqzq_u8($a_values)); - let a_hi = vmovl_high_u8($a_values); - let a_lo = vmovl_u8(vget_low_u8($a_values)); - let a_lo_lo = vrecpeq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_lo)))); - let a_lo_hi = vrecpeq_f32(vcvtq_f32_u32(vmovl_high_u16(a_lo))); - let a_hi_lo = vrecpeq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_hi)))); - let a_hi_ho = vrecpeq_f32(vcvtq_f32_u32(vmovl_high_u16(a_hi))); - - let lo_lo = vcvtaq_u32_f32(vmulq_f32(lo_lo, a_lo_lo)); - let lo_hi = vcvtaq_u32_f32(vmulq_f32(lo_hi, a_lo_hi)); - let hi_lo = vcvtaq_u32_f32(vmulq_f32(hi_lo, a_hi_lo)); - let hi_hi = vcvtaq_u32_f32(vmulq_f32(hi_hi, a_hi_ho)); - let lo = vcombine_u16(vmovn_u32(lo_lo), vmovn_u32(lo_hi)); - let hi = vcombine_u16(vmovn_u32(hi_lo), vmovn_u32(hi_hi)); - vandq_u8(vcombine_u8(vqmovn_u16(lo), vqmovn_u16(hi)), zero_mask) - }}; -} - unsafe fn neon_premultiply_alpha_rgba_impl_row(dst: &mut [u8], src: &[u8]) { let mut rem = dst; let mut src_rem = src; @@ -158,24 +134,192 @@ pub(crate) fn neon_premultiply_alpha_rgba( } } -unsafe fn neon_unpremultiply_alpha_rgba_impl_row(in_place: &mut [u8]) { - let mut rem = in_place; +trait DisassociateAlpha { + unsafe fn disassociate(&self, in_place: &mut [u8]); +} + +#[derive(Default)] +struct NeonDisassociateAlpha {} + +impl NeonDisassociateAlpha { + #[inline(always)] + unsafe fn unpremultiply_vec(v: uint8x16_t, a_values: uint8x16_t) -> uint8x16_t { + let scale = vdupq_n_u8(255); + let hi = vmull_high_u8(v, scale); + let lo = vmull_u8(vget_low_u8(v), vget_low_u8(scale)); + let lo_lo = vcvtq_f32_u32(vmovl_u16(vget_low_u16(lo))); + let lo_hi = vcvtq_f32_u32(vmovl_high_u16(lo)); + let hi_lo = vcvtq_f32_u32(vmovl_u16(vget_low_u16(hi))); + let hi_hi = vcvtq_f32_u32(vmovl_high_u16(hi)); + let zero_mask = vmvnq_u8(vceqzq_u8(a_values)); + let a_hi = vmovl_high_u8(a_values); + let a_lo = vmovl_u8(vget_low_u8(a_values)); + let a_lo_lo = vrecpeq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_lo)))); + let a_lo_hi = vrecpeq_f32(vcvtq_f32_u32(vmovl_high_u16(a_lo))); + let a_hi_lo = vrecpeq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_hi)))); + let a_hi_ho = vrecpeq_f32(vcvtq_f32_u32(vmovl_high_u16(a_hi))); + + let lo_lo = vcvtaq_u32_f32(vmulq_f32(lo_lo, a_lo_lo)); + let lo_hi = vcvtaq_u32_f32(vmulq_f32(lo_hi, a_lo_hi)); + let hi_lo = vcvtaq_u32_f32(vmulq_f32(hi_lo, a_hi_lo)); + let hi_hi = vcvtaq_u32_f32(vmulq_f32(hi_hi, a_hi_ho)); + let lo = vcombine_u16(vmovn_u32(lo_lo), vmovn_u32(lo_hi)); + let hi = vcombine_u16(vmovn_u32(hi_lo), vmovn_u32(hi_hi)); + vandq_u8(vcombine_u8(vqmovn_u16(lo), vqmovn_u16(hi)), zero_mask) + } + + #[inline(always)] + unsafe fn unpremultiply_vech(v: uint8x8_t, a_values: uint8x8_t) -> uint8x8_t { + let scale = vdupq_n_u8(255); + let lo = vmull_u8(v, vget_low_u8(scale)); + let lo_lo = vcvtq_f32_u32(vmovl_u16(vget_low_u16(lo))); + let lo_hi = vcvtq_f32_u32(vmovl_high_u16(lo)); + let zero_mask = vmvn_u8(vceqz_u8(a_values)); + let a_lo = vmovl_u8(a_values); + let a_lo_lo = vrecpeq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_lo)))); + let a_lo_hi = vrecpeq_f32(vcvtq_f32_u32(vmovl_high_u16(a_lo))); + + let lo_lo = vcvtaq_u32_f32(vmulq_f32(lo_lo, a_lo_lo)); + let 
lo_hi = vcvtaq_u32_f32(vmulq_f32(lo_hi, a_lo_hi)); + let lo = vcombine_u16(vmovn_u32(lo_lo), vmovn_u32(lo_hi)); + vand_u8(vqmovn_u16(lo), zero_mask) + } +} + +impl DisassociateAlpha for NeonDisassociateAlpha { + unsafe fn disassociate(&self, in_place: &mut [u8]) { + let mut rem = in_place; - unsafe { for dst in rem.chunks_exact_mut(16 * 4) { let src_ptr = dst.as_ptr(); let mut pixel = vld4q_u8(src_ptr); - pixel.0 = unpremultiply_vec!(pixel.0, pixel.3); - pixel.1 = unpremultiply_vec!(pixel.1, pixel.3); - pixel.2 = unpremultiply_vec!(pixel.2, pixel.3); + pixel.0 = Self::unpremultiply_vec(pixel.0, pixel.3); + pixel.1 = Self::unpremultiply_vec(pixel.1, pixel.3); + pixel.2 = Self::unpremultiply_vec(pixel.2, pixel.3); + let dst_ptr = dst.as_mut_ptr(); + vst4q_u8(dst_ptr, pixel); + } + + rem = rem.chunks_exact_mut(16 * 4).into_remainder(); + + for dst in rem.chunks_exact_mut(8 * 4) { + let src_ptr = dst.as_ptr(); + let mut pixel = vld4_u8(src_ptr); + pixel.0 = Self::unpremultiply_vech(pixel.0, pixel.3); + pixel.1 = Self::unpremultiply_vech(pixel.1, pixel.3); + pixel.2 = Self::unpremultiply_vech(pixel.2, pixel.3); + vst4_u8(dst.as_mut_ptr(), pixel); + } + + rem = rem.chunks_exact_mut(8 * 4).into_remainder(); + + if !rem.is_empty() { + assert!(rem.len() < 8 * 4); + let mut buffer: [u8; 8 * 4] = [0u8; 8 * 4]; + std::ptr::copy_nonoverlapping(rem.as_ptr(), buffer.as_mut_ptr(), rem.len()); + + let mut pixel = vld4_u8(buffer.as_ptr()); + pixel.0 = Self::unpremultiply_vech(pixel.0, pixel.3); + pixel.1 = Self::unpremultiply_vech(pixel.1, pixel.3); + pixel.2 = Self::unpremultiply_vech(pixel.2, pixel.3); + vst4_u8(buffer.as_mut_ptr(), pixel); + + std::ptr::copy_nonoverlapping(buffer.as_ptr(), rem.as_mut_ptr(), rem.len()); + } + } +} + +#[derive(Default)] +struct NeonDisassociateAlphaFloat16 {} + +impl NeonDisassociateAlphaFloat16 { + #[inline] + #[target_feature(enable = "fp16")] + /// Float16 feature un pre-multiplication, it's faster with a little slower precision + unsafe fn unpremultiply_vec_f16(v: uint8x16_t, a_values: uint8x16_t) -> uint8x16_t { + // 23544 = 255f16 + let v_scale = xreinterpretq_f16_u16(vdupq_n_u16(23544)); + + let lo = xvmulq_f16(xvcvtq_f16_u16(vmovl_u8(vget_low_u8(v))), v_scale); + let hi = xvmulq_f16(xvcvtq_f16_u16(vmovl_high_u8(v)), v_scale); + + let lo_a = xvrecpeq_f16(xvcvtq_f16_u16(vmovl_u8(vget_low_u8(a_values)))); + let hi_a = xvrecpeq_f16(xvcvtq_f16_u16(vmovl_high_u8(a_values))); + + let zero_mask = vmvnq_u8(vceqzq_u8(a_values)); + + let lo = xvcvtaq_u16_f16(xvmulq_f16(lo, lo_a)); + let hi = xvcvtaq_u16_f16(xvmulq_f16(hi, hi_a)); + vandq_u8(vcombine_u8(vqmovn_u16(lo), vqmovn_u16(hi)), zero_mask) + } + + #[inline] + #[target_feature(enable = "fp16")] + /// Float16 feature un pre-multiplication, it's faster with a little slower precision + unsafe fn unpremultiply_vec_f16h(v: uint8x8_t, a_values: uint8x8_t) -> uint8x8_t { + // 23544 = 255f16 + let v_scale = xreinterpretq_f16_u16(vdupq_n_u16(23544)); + let lo = xvmulq_f16(xvcvtq_f16_u16(vmovl_u8(v)), v_scale); + let lo_a = xvrecpeq_f16(xvcvtq_f16_u16(vmovl_u8(a_values))); + let zero_mask = vmvn_u8(vceqz_u8(a_values)); + let lo = xvcvtaq_u16_f16(xvmulq_f16(lo, lo_a)); + vand_u8(vqmovn_u16(lo), zero_mask) + } +} + +impl DisassociateAlpha for NeonDisassociateAlphaFloat16 { + #[target_feature(enable = "fp16")] + unsafe fn disassociate(&self, in_place: &mut [u8]) { + let mut rem = in_place; + + for dst in rem.chunks_exact_mut(16 * 4) { + let mut pixel = vld4q_u8(dst.as_ptr()); + pixel.0 = Self::unpremultiply_vec_f16(pixel.0, pixel.3); 
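// Illustrative note, not part of this patch: unpremultiply_vec_f16 above widens each u8 channel
// to f16 and multiplies by 255.0; the 23544 literal is simply the bit pattern of 255.0 in binary16
// (0x5BF8, e.g. half::f16::from_f32(255.0).to_bits() == 23544 with the optional `half` crate).
// It then multiplies by a reciprocal estimate of alpha (FRECPE) instead of performing a true
// divide, and finally masks zero-alpha lanes to zero. That is the speed versus precision
// trade-off its doc comment refers to.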
+ pixel.1 = Self::unpremultiply_vec_f16(pixel.1, pixel.3); + pixel.2 = Self::unpremultiply_vec_f16(pixel.2, pixel.3); let dst_ptr = dst.as_mut_ptr(); vst4q_u8(dst_ptr, pixel); } rem = rem.chunks_exact_mut(16 * 4).into_remainder(); + + for dst in rem.chunks_exact_mut(8 * 4) { + let mut pixel = vld4_u8(dst.as_ptr()); + pixel.0 = Self::unpremultiply_vec_f16h(pixel.0, pixel.3); + pixel.1 = Self::unpremultiply_vec_f16h(pixel.1, pixel.3); + pixel.2 = Self::unpremultiply_vec_f16h(pixel.2, pixel.3); + let dst_ptr = dst.as_mut_ptr(); + vst4_u8(dst_ptr, pixel); + } + + rem = rem.chunks_exact_mut(8 * 4).into_remainder(); + + if !rem.is_empty() { + assert!(rem.len() < 8 * 4); + let mut buffer: [u8; 8 * 4] = [0u8; 8 * 4]; + std::ptr::copy_nonoverlapping(rem.as_ptr(), buffer.as_mut_ptr(), rem.len()); + + let mut pixel = vld4_u8(buffer.as_ptr()); + pixel.0 = Self::unpremultiply_vec_f16h(pixel.0, pixel.3); + pixel.1 = Self::unpremultiply_vec_f16h(pixel.1, pixel.3); + pixel.2 = Self::unpremultiply_vec_f16h(pixel.2, pixel.3); + vst4_u8(buffer.as_mut_ptr(), pixel); + + std::ptr::copy_nonoverlapping(buffer.as_ptr(), rem.as_mut_ptr(), rem.len()); + } } +} + +unsafe fn neon_dis_dispatch(in_place: &mut [u8], handler: impl DisassociateAlpha) { + handler.disassociate(in_place); +} - unpremultiply_alpha_rgba_row_impl(rem); +unsafe fn neon_unpremultiply_alpha_rgba_impl_row(in_place: &mut [u8]) { + if is_aarch_f16_supported() { + neon_dis_dispatch(in_place, NeonDisassociateAlphaFloat16::default()); + } else { + neon_dis_dispatch(in_place, NeonDisassociateAlpha::default()); + } } pub(crate) fn neon_unpremultiply_alpha_rgba( diff --git a/src/neon/check_alpha.rs b/src/neon/check_alpha.rs new file mode 100644 index 0000000..a2944ff --- /dev/null +++ b/src/neon/check_alpha.rs @@ -0,0 +1,172 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +use std::arch::aarch64::*; + +/// Checks if image has constant alpha by xor rows +pub(crate) fn neon_has_non_constant_cap_alpha_rgba8( + store: &[u8], + width: usize, + stride: usize, +) -> bool { + unsafe { + if store.is_empty() { + return true; + } + + let first_alpha = store[3]; + + let v_first_alpha = vdupq_n_u8(first_alpha); + + for row in store.chunks_exact(stride) { + let row = &row[0..width * 4]; + let mut sums = vdupq_n_u32(0); + for chunk in row.chunks_exact(16 * 4) { + let loaded = vld4q_u8(chunk.as_ptr()); + let blend_result = veorq_u8(loaded.3, v_first_alpha); + let blend32 = vpaddlq_u16(vpaddlq_u8(blend_result)); + sums = vaddq_u32(sums, blend32); + } + + let row = row.chunks_exact(16 * 4).remainder(); + + for chunk in row.chunks_exact(8 * 4) { + let loaded = vld4_u8(chunk.as_ptr()); + let blend_result = veor_u8(loaded.3, vget_low_u8(v_first_alpha)); + let blend32 = vpaddl_u16(vpaddl_u8(blend_result)); + sums = vaddq_u32(sums, vcombine_u32(blend32, blend32)); + } + + let row = row.chunks_exact(8 * 4).remainder(); + + let mut h_sum = vaddvq_u32(sums); + + for chunk in row.chunks_exact(4) { + h_sum += chunk[3] as u32 ^ first_alpha as u32; + } + + if h_sum != 0 { + return true; + } + } + + false + } +} + +/// Checks if image has constant alpha by xor rows for image 16bits +pub(crate) fn neon_has_non_constant_cap_alpha_rgba16( + store: &[u16], + width: usize, + stride: usize, +) -> bool { + unsafe { + if store.is_empty() { + return true; + } + + let first_alpha = store[3]; + let def_alpha = vdupq_n_u16(first_alpha); + + for row in store.chunks_exact(stride) { + let row = &row[0..width * 4]; + let mut sums = vdupq_n_u32(0); + for chunk in row.chunks_exact(8 * 4) { + let r0 = vld4q_u16(chunk.as_ptr()); + + let pxor = veorq_u16(r0.3, def_alpha); + sums = vaddq_u32(sums, vpaddlq_u16(pxor)); + } + + let row = row.chunks_exact(8 * 4).remainder(); + + for chunk in row.chunks_exact(4 * 4) { + let r0 = vld4_u16(chunk.as_ptr()); + + let pxor = veor_u16(r0.3, vget_low_u16(def_alpha)); + let pw = vpaddl_u16(pxor); + sums = vaddq_u32(sums, vcombine_u32(pw, pw)); + } + + let row = row.chunks_exact(4 * 4).remainder(); + + let mut h_sum = vaddvq_u32(sums); + + for chunk in row.chunks_exact(4) { + h_sum += chunk[3] as u32 ^ first_alpha as u32; + } + + if h_sum != 0 { + return true; + } + } + false + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn check_alpha_exists_rgba8() { + let image_size = 256usize; + let mut image = vec![0u8; image_size * image_size * 4]; + image[3 + 150 * 4] = 75; + let has_alpha = neon_has_non_constant_cap_alpha_rgba8(&image, image_size, image_size * 4); + assert_eq!(true, has_alpha); + } + + #[test] + fn check_alpha_not_exists_rgba8() { + let image_size = 256usize; + let image = vec![255u8; image_size * image_size * 4]; + let has_alpha = neon_has_non_constant_cap_alpha_rgba8(&image, image_size, image_size * 4); + assert_eq!(false, has_alpha); + } + + #[test] + fn check_alpha_not_exists_rgba16() { + let image_size = 256usize; + let image = vec![255u16; image_size * image_size * 4]; + let has_alpha = neon_has_non_constant_cap_alpha_rgba16(&image, image_size, image_size * 4); + assert_eq!(false, has_alpha); + } + + #[test] + fn check_alpha_exists_rgba16() { + let image_size = 256usize; + let mut image = vec![0u16; image_size * image_size * 4]; + image[3] = 715; + image[7] = 715; + image[11] = 715; + image[15] = 715; + let has_alpha = neon_has_non_constant_cap_alpha_rgba16(&image, image_size, image_size * 4); + assert_eq!(true, has_alpha); + } +} diff 
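The helpers above fold an XOR of every alpha sample against the first one into a per-row horizontal sum and report true as soon as any row differs, presumably so alpha handling can be skipped when it would be a no-op. A scalar sketch of the same predicate for the 8-bit case (illustrative only; the function name is mine and it assumes stride >= width * 4):

    // Returns true when at least one alpha value differs from the first one.
    fn has_non_constant_alpha_rgba8(store: &[u8], width: usize, stride: usize) -> bool {
        if store.len() < 4 {
            return true; // mirrors the NEON helper, which reports "non-constant" for an empty store
        }
        let first_alpha = store[3];
        store
            .chunks_exact(stride)
            .any(|row| row[..width * 4].chunks_exact(4).any(|px| px[3] != first_alpha))
    }

The 16-bit variant is identical apart from the element type, which is what the tests at the end of the new file exercise.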
--git a/src/neon/f16_utils.rs b/src/neon/f16_utils.rs index 0a94d05..4b4f9ee 100644 --- a/src/neon/f16_utils.rs +++ b/src/neon/f16_utils.rs @@ -32,7 +32,6 @@ use std::arch::aarch64::*; use std::arch::asm; /// Provides basic support for f16 - #[allow(unused)] macro_rules! static_assert { ($e:expr) => { @@ -90,18 +89,21 @@ pub(crate) struct x_float16x8x4_t( ); #[inline] +#[cfg(feature = "half")] pub(crate) unsafe fn xvld_f16(ptr: *const half::f16) -> x_float16x4_t { - let store: uint16x4_t = vld1_u16(std::mem::transmute(ptr)); + let store: uint16x4_t = vld1_u16(ptr as *const _); std::mem::transmute(store) } #[inline] +#[cfg(feature = "half")] pub(crate) unsafe fn xvldq_f16(ptr: *const half::f16) -> x_float16x8_t { - let store: uint16x8_t = vld1q_u16(std::mem::transmute(ptr)); + let store: uint16x8_t = vld1q_u16(ptr as *const _); std::mem::transmute(store) } #[inline] +#[cfg(feature = "half")] pub(crate) unsafe fn xvldq_f16_x2(ptr: *const half::f16) -> x_float16x8x2_t { let ptr_u16 = ptr as *const u16; x_float16x8x2_t( @@ -111,6 +113,7 @@ pub(crate) unsafe fn xvldq_f16_x2(ptr: *const half::f16) -> x_float16x8x2_t { } #[inline] +#[cfg(feature = "half")] pub(crate) unsafe fn xvldq_f16_x4(ptr: *const half::f16) -> x_float16x8x4_t { let ptr_u16 = ptr as *const u16; x_float16x8x4_t( @@ -123,19 +126,25 @@ pub(crate) unsafe fn xvldq_f16_x4(ptr: *const half::f16) -> x_float16x8x4_t { #[inline] pub(crate) unsafe fn xvget_low_f16(x: x_float16x8_t) -> x_float16x4_t { - std::mem::transmute(vget_low_u16(std::mem::transmute(x))) + std::mem::transmute::(vget_low_u16(std::mem::transmute::< + x_float16x8_t, + uint16x8_t, + >(x))) } #[inline] pub(crate) unsafe fn xvget_high_f16(x: x_float16x8_t) -> x_float16x4_t { - std::mem::transmute(vget_high_u16(std::mem::transmute(x))) + std::mem::transmute::(vget_high_u16(std::mem::transmute::< + x_float16x8_t, + uint16x8_t, + >(x))) } #[inline] pub(crate) unsafe fn xcombine_f16(low: x_float16x4_t, high: x_float16x4_t) -> x_float16x8_t { - std::mem::transmute(vcombine_u16( - std::mem::transmute(low), - std::mem::transmute(high), + std::mem::transmute::(vcombine_u16( + std::mem::transmute::(low), + std::mem::transmute::(high), )) } @@ -159,12 +168,14 @@ pub(crate) unsafe fn xreinterpretq_f16_u16(x: uint16x8_t) -> x_float16x8_t { std::mem::transmute(x) } -#[inline] +/// Sets register to f16 zero +#[inline(always)] pub(super) unsafe fn xvzerosq_f16() -> x_float16x8_t { xreinterpretq_f16_u16(vdupq_n_u16(0)) } -#[inline] +/// Sets register to f16 zero +#[inline(always)] pub(super) unsafe fn xvzeros_f16() -> x_float16x4_t { xreinterpret_f16_u16(vdup_n_u16(0)) } @@ -192,6 +203,94 @@ pub(super) unsafe fn xvcvt_f16_f32(v: float32x4_t) -> x_float16x4_t { xreinterpret_f16_u16(result) } +/// This instruction converts each element in a vector from fixed-point to floating-point +/// using the rounding mode that is specified by the FPCR, and writes the result +/// to the SIMD&FP destination register. 
+/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtq_f16_u16) +#[inline] +#[target_feature(enable = "fp16")] +pub(super) unsafe fn xvcvtq_f16_u16(v: uint16x8_t) -> x_float16x8_t { + let result: uint16x8_t; + asm!( + "ucvtf {0:v}.8h, {1:v}.8h", + out(vreg) result, + in(vreg) v, + options(pure, nomem, nostack)); + xreinterpretq_f16_u16(result) +} + +/// This instruction converts each element in a vector from fixed-point to floating-point +/// using the rounding mode that is specified by the FPCR, and writes the result +/// to the SIMD&FP destination register. +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_f16_u16) +#[inline] +#[target_feature(enable = "fp16")] +pub(super) unsafe fn xvcvt_f16_u16(v: uint16x4_t) -> x_float16x4_t { + let result: uint16x4_t; + asm!( + "ucvtf {0:v}.4h, {1:v}.4h", + out(vreg) result, + in(vreg) v, + options(pure, nomem, nostack)); + xreinterpret_f16_u16(result) +} + +/// Floating-point Convert to Unsigned integer, rounding to nearest with ties to Away (vector). +/// This instruction converts each element in a vector from a floating-point value to an unsigned +/// integer value using the Round to Nearest with Ties to Away rounding mode and writes the result +/// to the SIMD&FP destination register. +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtaq_u16_f16) +#[inline] +#[target_feature(enable = "fp16")] +pub(super) unsafe fn xvcvtaq_u16_f16(v: x_float16x8_t) -> uint16x8_t { + let result: uint16x8_t; + asm!( + "fcvtau {0:v}.8h, {1:v}.8h", + out(vreg) result, + in(vreg) xreinterpretq_u16_f16(v), + options(pure, nomem, nostack)); + result +} + +/// Floating-point Convert to Unsigned integer, rounding to nearest with ties to Away (vector). +/// This instruction converts each element in a vector from a floating-point value to an unsigned +/// integer value using the Round to Nearest with Ties to Away rounding mode and writes the result +/// to the SIMD&FP destination register. +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvta_u16_f16) +#[inline] +#[target_feature(enable = "fp16")] +pub(super) unsafe fn xvcvta_u16_f16(v: x_float16x4_t) -> uint16x4_t { + let result: uint16x4_t; + asm!( + "fcvtau {0:v}.4h, {1:v}.4h", + out(vreg) result, + in(vreg) xreinterpret_u16_f16(v), + options(pure, nomem, nostack)); + result +} + +/// Floating-point Reciprocal Estimate. +/// This instruction finds an approximate reciprocal estimate for each vector element +/// in the source SIMD&FP register, places the result in a vector, +/// and writes the vector to the destination SIMD&FP register. +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrecpeq_f16) +#[inline] +#[target_feature(enable = "fp16")] +pub(super) unsafe fn xvrecpeq_f16(v: x_float16x8_t) -> x_float16x8_t { + let result: uint16x8_t; + asm!( + "frecpe {0:v}.8h, {1:v}.8h", + out(vreg) result, + in(vreg) xreinterpretq_u16_f16(v), + options(pure, nomem, nostack)); + xreinterpretq_f16_u16(result) +} + // #[inline] // pub(super) unsafe fn xvadd_f16(v1: x_float16x4_t, v2: x_float16x4_t) -> x_float16x4_t { // let result: uint16x4_t; @@ -239,6 +338,12 @@ pub(super) unsafe fn xvcombine_f16(v1: x_float16x4_t, v2: x_float16x4_t) -> x_fl // xreinterpret_f16_u16(result) // } +/// Floating-point fused Multiply-Add to accumulator (vector). 
+/// This instruction multiplies corresponding floating-point values in the vectors +/// in the two source SIMD&FP registers, adds the product to the corresponding vector element +/// of the destination SIMD&FP register, and writes the result to the destination SIMD&FP register. +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfma_f16) #[target_feature(enable = "fp16")] #[inline] pub(super) unsafe fn xvfmla_f16( @@ -257,6 +362,13 @@ pub(super) unsafe fn xvfmla_f16( xreinterpret_f16_u16(result) } +/// Floating-point fused Multiply-Add to accumulator (vector). +/// This instruction multiplies corresponding floating-point values in the vectors +/// in the two source SIMD&FP registers, adds the product to the corresponding +/// vector element of the destination SIMD&FP register, +/// and writes the result to the destination SIMD&FP register. +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfma_laneq_f16) #[target_feature(enable = "fp16")] #[inline] pub(super) unsafe fn xvfmla_laneq_f16( @@ -335,6 +447,13 @@ pub(super) unsafe fn xvfmla_laneq_f16( xreinterpret_f16_u16(result) } +/// Floating-point fused Multiply-Add to accumulator (vector). +/// This instruction multiplies corresponding floating-point values in the vectors +/// in the two source SIMD&FP registers, adds the product to the corresponding +/// vector element of the destination SIMD&FP register, +/// and writes the result to the destination SIMD&FP register. +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfma_lane_f16) #[target_feature(enable = "fp16")] #[inline] pub(super) unsafe fn xvfmla_lane_f16( @@ -381,6 +500,13 @@ pub(super) unsafe fn xvfmla_lane_f16( xreinterpret_f16_u16(result) } +/// Floating-point fused Multiply-Add to accumulator (vector). +/// This instruction multiplies corresponding floating-point values in the vectors +/// in the two source SIMD&FP registers, adds the product to the corresponding +/// vector element of the destination SIMD&FP register, +/// and writes the result to the destination SIMD&FP register. +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmaq_f16) #[target_feature(enable = "fp16")] #[inline] pub(super) unsafe fn xvfmlaq_f16( @@ -427,6 +553,12 @@ pub(super) unsafe fn xvfmlaq_f16( // xvadd_f16(a, xvmul_f16(b, c)) // } +/// Floating-point Multiply (vector). +/// This instruction multiplies corresponding floating-point values in the vectors in the two +/// source SIMD&FP registers, +/// places the result in a vector, and writes the vector to the destination SIMD&FP register. +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_f16) #[target_feature(enable = "fp16")] #[inline] pub(super) unsafe fn xvmulq_f16(v1: x_float16x8_t, v2: x_float16x8_t) -> x_float16x8_t { @@ -441,6 +573,33 @@ pub(super) unsafe fn xvmulq_f16(v1: x_float16x8_t, v2: x_float16x8_t) -> x_float xreinterpretq_f16_u16(result) } +/// Floating-point Multiply (vector). +/// This instruction multiplies corresponding floating-point values in the vectors +/// in the two source SIMD&FP registers, places the result in a vector, +/// and writes the vector to the destination SIMD&FP register. 
+/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_f16) +#[target_feature(enable = "fp16")] +#[inline] +pub(super) unsafe fn xvmul_f16(v1: x_float16x4_t, v2: x_float16x4_t) -> x_float16x4_t { + let result: uint16x4_t; + asm!( + "fmul {0:v}.4h, {1:v}.4h, {2:v}.4h", + out(vreg) result, + in(vreg) xreinterpret_u16_f16(v1), + in(vreg) xreinterpret_u16_f16(v2), + options(pure, nomem, nostack) + ); + xreinterpret_f16_u16(result) +} + +/// Floating-point Divide (vector). +/// This instruction divides the floating-point values in the elements +/// in the first source SIMD&FP register, by the floating-point values +/// in the corresponding elements in the second source SIMD&FP register, +/// places the results in a vector, and writes the vector to the destination SIMD&FP register. +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdivq_f16) #[target_feature(enable = "fp16")] #[inline] pub(super) unsafe fn xvdivq_f16(v1: x_float16x8_t, v2: x_float16x8_t) -> x_float16x8_t { @@ -455,6 +614,12 @@ pub(super) unsafe fn xvdivq_f16(v1: x_float16x8_t, v2: x_float16x8_t) -> x_float xreinterpretq_f16_u16(result) } +/// Bitwise Select. +/// This instruction sets each bit in the destination SIMD&FP register +/// to the corresponding bit from the first source SIMD&FP register when the +/// original destination bit was 1, otherwise from the second source SIMD&FP register. +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbslq_f16) #[target_feature(enable = "fp16")] #[inline] pub(super) unsafe fn xvbslq_f16( @@ -474,16 +639,19 @@ pub(super) unsafe fn xvbslq_f16( } #[inline] +#[cfg(feature = "half")] pub(crate) unsafe fn xvst_f16(ptr: *mut half::f16, x: x_float16x4_t) { - vst1_u16(std::mem::transmute(ptr), xreinterpret_u16_f16(x)) + vst1_u16(ptr as *mut u16, xreinterpret_u16_f16(x)) } #[inline] +#[cfg(feature = "half")] pub(crate) unsafe fn xvstq_f16(ptr: *mut half::f16, x: x_float16x8_t) { - vst1q_u16(std::mem::transmute(ptr), xreinterpretq_u16_f16(x)) + vst1q_u16(ptr as *mut u16, xreinterpretq_u16_f16(x)) } #[inline] +#[cfg(feature = "half")] pub(crate) unsafe fn xvstq_f16_x2(ptr: *mut half::f16, x: x_float16x8x2_t) { let ptr_u16 = ptr as *mut u16; vst1q_u16(ptr_u16, xreinterpretq_u16_f16(x.0)); @@ -491,6 +659,7 @@ pub(crate) unsafe fn xvstq_f16_x2(ptr: *mut half::f16, x: x_float16x8x2_t) { } #[inline] +#[cfg(feature = "half")] pub(crate) unsafe fn xvstq_f16_x4(ptr: *const half::f16, x: x_float16x8x4_t) { let ptr_u16 = ptr as *mut u16; vst1q_u16(ptr_u16, xreinterpretq_u16_f16(x.0)); @@ -510,6 +679,7 @@ pub(crate) unsafe fn xvdup_laneq_f16(a: x_float16x8_t) -> x_float1 } #[inline] +#[cfg(feature = "half")] pub(crate) unsafe fn xvld1q_lane_f16( ptr: *const half::f16, src: x_float16x8_t, @@ -521,6 +691,7 @@ pub(crate) unsafe fn xvld1q_lane_f16( } #[inline] +#[cfg(feature = "half")] pub(crate) unsafe fn xvsetq_lane_f16( v: half::f16, r: x_float16x8_t, @@ -531,6 +702,13 @@ pub(crate) unsafe fn xvsetq_lane_f16( )) } +/// Floating-point Compare Equal to zero (vector). +/// This instruction reads each floating-point value in the source SIMD&FP register +/// and if the value is equal to zero sets every bit of the corresponding vector element +/// in the destination SIMD&FP register to one, otherwise sets every bit of the +/// corresponding vector element in the destination SIMD&FP register to zero. 
+/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqzq_f16) #[target_feature(enable = "fp16")] #[inline] pub(crate) unsafe fn vceqzq_f16(a: x_float16x8_t) -> uint16x8_t { diff --git a/src/neon/mod.rs b/src/neon/mod.rs index 00c0c9d..c537f9e 100644 --- a/src/neon/mod.rs +++ b/src/neon/mod.rs @@ -34,9 +34,9 @@ mod alpha_f32; mod alpha_u16; mod alpha_u8; mod ar30; +mod check_alpha; #[cfg(feature = "half")] mod convolve_f16; -#[cfg(feature = "half")] mod f16_utils; mod horizontal_ar30; mod plane_f32; @@ -63,7 +63,9 @@ mod vertical_f16_full; mod vertical_f32; mod vertical_u16; mod vertical_u16_lb; +mod vertical_u16_lb_f16; mod vertical_u8; +mod weights; #[cfg(feature = "half")] pub(crate) use alpha_f16::{neon_premultiply_alpha_rgba_f16, neon_unpremultiply_alpha_rgba_f16}; @@ -76,7 +78,9 @@ pub(crate) use alpha_f32::neon_unpremultiply_alpha_rgba_f32; pub(crate) use alpha_u16::{neon_premultiply_alpha_rgba_u16, neon_unpremultiply_alpha_rgba_u16}; pub(crate) use alpha_u8::neon_premultiply_alpha_rgba; pub(crate) use alpha_u8::neon_unpremultiply_alpha_rgba; -#[cfg(feature = "half")] +pub(crate) use check_alpha::{ + neon_has_non_constant_cap_alpha_rgba16, neon_has_non_constant_cap_alpha_rgba8, +}; pub(crate) use f16_utils::*; pub(crate) use horizontal_ar30::neon_convolve_horizontal_rgba_rows_4_ar30; pub(crate) use plane_f32::convolve_horizontal_plane_neon_row_one; @@ -122,6 +126,8 @@ pub(crate) use vertical_f16_full::xconvolve_vertical_rgb_neon_row_f16; pub(crate) use vertical_f32::convolve_vertical_rgb_neon_row_f32; pub(crate) use vertical_u16::convolve_column_u16; pub(crate) use vertical_u16_lb::convolve_column_lb_u16; +pub(crate) use vertical_u16_lb_f16::convolve_column_lb_u16_f16; pub(crate) use vertical_u8::{ convolve_vertical_neon_i16_precision, convolve_vertical_neon_i32_precision, }; +pub(crate) use weights::convert_weights_to_f16; diff --git a/src/neon/rgba_u8.rs b/src/neon/rgba_u8.rs index aa8e265..183097c 100644 --- a/src/neon/rgba_u8.rs +++ b/src/neon/rgba_u8.rs @@ -195,6 +195,13 @@ unsafe fn conv_horiz_rgba_1_u8_i16( vqrdmlah_s16(store, lo, w0) } +/// Checking NEON `rdm` availability is required before a call. +/// +/// RDM feature has slightly lower precision and won't work really well on huge kernel which +/// edges fades out fast. Therefore, it would be reasonable to avoid using feature for huge downscaling. +/// +/// # Safety +/// - Check `rdm` availability before the call. 
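Because the doc comment above requires the caller to verify `rdm` before invoking this kernel, a hypothetical caller-side gate could look like the sketch below (illustrative only, not part of this patch; everything except the standard detection macro is a placeholder):

    // Hypothetical runtime gate: only select the `_i16` (RDM) kernels when the CPU supports
    // the extension, keeping the full-precision path as the fallback.
    #[cfg(target_arch = "aarch64")]
    fn rdm_available() -> bool {
        std::arch::is_aarch64_feature_detected!("rdm")
    }

Combined with the precision caveat above, a dispatcher would also keep the full-precision kernels for very large downscales, where the RDM rounding error is most visible.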
pub(crate) fn convolve_horizontal_rgba_neon_rows_4_u8_i16( src: &[u8], src_stride: usize, @@ -316,21 +323,22 @@ unsafe fn convolve_horizontal_rgba_neon_rows_4_u8_i16_impl( let store_16_8_2 = vqmovun_s16(vcombine_s16(store_16_2, store_16_2)); let store_16_8 = vqmovun_s16(vcombine_s16(store_16_3, store_16_3)); - let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8_0)); - let dest_ptr_32 = chunk0.as_mut_ptr() as *mut u32; - dest_ptr_32.write_unaligned(pixel); - - let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8_1)); - let dest_ptr_32 = chunk1.as_mut_ptr() as *mut u32; - dest_ptr_32.write_unaligned(pixel); - - let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8_2)); - let dest_ptr_32 = chunk2.as_mut_ptr() as *mut u32; - dest_ptr_32.write_unaligned(pixel); - - let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8)); - let dest_ptr_32 = chunk3.as_mut_ptr() as *mut u32; - dest_ptr_32.write_unaligned(pixel); + vst1_lane_u32::<0>( + chunk0.as_mut_ptr() as *mut u32, + vreinterpret_u32_u8(store_16_8_0), + ); + vst1_lane_u32::<0>( + chunk1.as_mut_ptr() as *mut u32, + vreinterpret_u32_u8(store_16_8_1), + ); + vst1_lane_u32::<0>( + chunk2.as_mut_ptr() as *mut u32, + vreinterpret_u32_u8(store_16_8_2), + ); + vst1_lane_u32::<0>( + chunk3.as_mut_ptr() as *mut u32, + vreinterpret_u32_u8(store_16_8), + ); } } @@ -510,14 +518,21 @@ pub(crate) fn convolve_horizontal_rgba_neon_row( let store_16 = vqshrun_n_s32::(store); let store_16_8 = vqmovn_u16(vcombine_u16(store_16, store_16)); - let value = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8)); - let dest_ptr_32 = dst.as_mut_ptr() as *mut u32; - dest_ptr_32.write_unaligned(value); + vst1_lane_u32::<0>( + dst.as_mut_ptr() as *mut u32, + vreinterpret_u32_u8(store_16_8), + ); } } } -/// Checking NEON `rdm` availability is required before a call +/// Checking NEON `rdm` availability is required before a call. +/// +/// RDM feature has slightly lower precision and won't work really well on huge kernel which +/// edges fades out fast. Therefore, it would be reasonable to avoid using feature for huge downscaling. +/// +/// # Safety +/// - Check `rdm` availability before the call. 
pub(crate) fn convolve_horizontal_rgba_neon_row_i16( src: &[u8], dst: &mut [u8], diff --git a/src/neon/utils.rs b/src/neon/utils.rs index 76c2381..a435d96 100644 --- a/src/neon/utils.rs +++ b/src/neon/utils.rs @@ -138,19 +138,13 @@ pub(crate) unsafe fn prefer_vfmaq_lane_f32( #[inline(always)] pub(crate) unsafe fn load_3b_as_u16x4(src_ptr: *const u8) -> uint16x4_t { - let v_new_value1 = u16::from_le_bytes([src_ptr.read_unaligned(), 0]); - let v_new_value2 = u16::from_le_bytes([src_ptr.add(1).read_unaligned(), 0]); - let v_new_value3 = u16::from_le_bytes([src_ptr.add(2).read_unaligned(), 0]); - let arr = [v_new_value1, v_new_value2, v_new_value3, 0]; - vld1_u16(arr.as_ptr()) + let mut v = vreinterpret_u8_u16(vld1_lane_u16::<0>(src_ptr as *const u16, vdup_n_u16(0))); + v = vld1_lane_u8::<2>(src_ptr.add(2), v); + vget_low_u16(vmovl_u8(v)) } #[inline(always)] pub(crate) unsafe fn load_4b_as_u16x4(src_ptr: *const u8) -> uint16x4_t { - let v_new_value1 = u16::from_le_bytes([src_ptr.read_unaligned(), 0]); - let v_new_value2 = u16::from_le_bytes([src_ptr.add(1).read_unaligned(), 0]); - let v_new_value3 = u16::from_le_bytes([src_ptr.add(2).read_unaligned(), 0]); - let v_new_value4 = u16::from_le_bytes([src_ptr.add(3).read_unaligned(), 0]); - let arr = [v_new_value1, v_new_value2, v_new_value3, v_new_value4]; - vld1_u16(arr.as_ptr()) + let j = vreinterpret_u8_u32(vld1_lane_u32::<0>(src_ptr as *const u32, vdup_n_u32(0))); + vget_low_u16(vmovl_u8(j)) } diff --git a/src/neon/vertical_f32.rs b/src/neon/vertical_f32.rs index 71226ef..d85d1d6 100644 --- a/src/neon/vertical_f32.rs +++ b/src/neon/vertical_f32.rs @@ -255,7 +255,7 @@ unsafe fn convolve_vertical_part_neon_1_f32( } let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); - dst_ptr.write_unaligned(vgetq_lane_f32::<0>(store_0)); + vst1q_lane_f32::<0>(dst_ptr, store_0); } pub(crate) fn convolve_vertical_rgb_neon_row_f32( diff --git a/src/neon/vertical_u16.rs b/src/neon/vertical_u16.rs index bf34f50..05aaba2 100644 --- a/src/neon/vertical_u16.rs +++ b/src/neon/vertical_u16.rs @@ -221,10 +221,10 @@ pub(crate) fn convolve_column_u16( store3 = prefer_vfmaq_f32(store3, hi1, v_weight); } } - let u_store0 = vminq_u32(vcvtaq_u32_f32(vmaxq_f32(store0, zeros)), v_max_colors); - let u_store1 = vminq_u32(vcvtaq_u32_f32(vmaxq_f32(store1, zeros)), v_max_colors); - let u_store2 = vminq_u32(vcvtaq_u32_f32(vmaxq_f32(store2, zeros)), v_max_colors); - let u_store3 = vminq_u32(vcvtaq_u32_f32(vmaxq_f32(store3, zeros)), v_max_colors); + let u_store0 = vminq_u32(vcvtaq_u32_f32(store0), v_max_colors); + let u_store1 = vminq_u32(vcvtaq_u32_f32(store1), v_max_colors); + let u_store2 = vminq_u32(vcvtaq_u32_f32(store2), v_max_colors); + let u_store3 = vminq_u32(vcvtaq_u32_f32(store3), v_max_colors); let item0 = vcombine_u16(vqmovn_u32(u_store0), vqmovn_u32(u_store1)); vst1q_u16(dst.as_mut_ptr(), item0); @@ -348,8 +348,8 @@ pub(crate) fn convolve_column_u16( } } - let u_store0 = vminq_u32(vcvtaq_u32_f32(vmaxq_f32(store0, zeros)), v_max_colors); - let u_store1 = vminq_u32(vcvtaq_u32_f32(vmaxq_f32(store1, zeros)), v_max_colors); + let u_store0 = vminq_u32(vcvtaq_u32_f32(store0), v_max_colors); + let u_store1 = vminq_u32(vcvtaq_u32_f32(store1), v_max_colors); let item = vcombine_u16(vqmovn_u32(u_store0), vqmovn_u32(u_store1)); vst1q_u16(dst.as_mut_ptr(), item); @@ -444,7 +444,7 @@ pub(crate) fn convolve_column_u16( } } - let u_store0 = vminq_u32(vcvtaq_u32_f32(vmaxq_f32(store0, zeros)), v_max_colors); + let u_store0 = vminq_u32(vcvtaq_u32_f32(store0), v_max_colors); 
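// Illustrative note, not part of this patch: removing the former vmaxq_f32(store, zeros) clamps
// before these conversions is harmless because FCVTAU (vcvtaq_u32_f32) saturates negative inputs
// to 0 when converting to an unsigned integer, so the explicit max-with-zero was redundant; only
// the upper clamp against v_max_colors is still needed.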
vst1_u16(dst.as_mut_ptr(), vqmovn_u32(u_store0)); diff --git a/src/neon/vertical_u16_lb_f16.rs b/src/neon/vertical_u16_lb_f16.rs new file mode 100644 index 0000000..4bc7f1c --- /dev/null +++ b/src/neon/vertical_u16_lb_f16.rs @@ -0,0 +1,188 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +use crate::filter_weights::FilterBounds; +use crate::neon::f16_utils::{ + xvcvt_f16_u16, xvcvta_u16_f16, xvcvtaq_u16_f16, xvcvtq_f16_u16, xvfmla_f16, xvfmlaq_f16, + xvzerosq_f16, +}; +use crate::neon::{xreinterpret_f16_u16, xreinterpretq_f16_u16, xvget_low_f16}; +use std::arch::aarch64::*; + +pub(crate) fn convolve_column_lb_u16_f16( + j0: usize, + bounds: &FilterBounds, + src: &[u16], + dst: &mut [u16], + src_stride: usize, + weight: &[i16], + bit_depth: u32, +) { + unsafe { + let transmuted_u16 = + std::slice::from_raw_parts(weight.as_ptr() as *const u16, weight.len()); + convolve_column_lb_u16_f16_impl( + j0, + bounds, + src, + dst, + src_stride, + transmuted_u16, + bit_depth, + ); + } +} + +#[target_feature(enable = "fp16")] +unsafe fn convolve_column_lb_u16_f16_impl( + _: usize, + bounds: &FilterBounds, + src: &[u16], + dst: &mut [u16], + src_stride: usize, + weight: &[u16], + bit_depth: u32, +) { + let max_colors = (1 << bit_depth) - 1; + let mut cx = 0usize; + + let bounds_size = bounds.size; + + let initial_store = xvzerosq_f16(); + + let v_max_colors = vdupq_n_u16(max_colors); + + let v_px = cx; + + let iter16 = dst.chunks_exact_mut(16); + + for (x, dst) in iter16.enumerate() { + let mut store0 = initial_store; + let mut store1 = initial_store; + + let v_dx = v_px + x * 16; + + for (j, &k_weight) in weight.iter().take(bounds_size).enumerate() { + let py = bounds.start + j; + let src_ptr = src.get_unchecked((src_stride * py + v_dx)..); + + let v_weight = xreinterpretq_f16_u16(vdupq_n_u16(k_weight)); + + let item_row0 = vld1q_u16(src_ptr.as_ptr()); + let item_row1 = vld1q_u16(src_ptr.as_ptr().add(8)); + + store0 = xvfmlaq_f16(store0, xvcvtq_f16_u16(item_row0), v_weight); + store1 = xvfmlaq_f16(store1, xvcvtq_f16_u16(item_row1), v_weight); + } + + let item0 = vminq_u16(xvcvtaq_u16_f16(store0), v_max_colors); + let item1 = vminq_u16(xvcvtaq_u16_f16(store1), v_max_colors); + + vst1q_u16(dst.as_mut_ptr(), item0); + vst1q_u16(dst.as_mut_ptr().add(8), item1); + + cx = v_dx; + } + + let tail16 = dst.chunks_exact_mut(16).into_remainder(); + let iter8 = tail16.chunks_exact_mut(8); + + let v_px = cx; + + for (x, dst) in iter8.enumerate() { + let mut store0 = initial_store; + + let v_dx = v_px + x * 8; + + for (j, &k_weight) in weight.iter().take(bounds_size).enumerate() { + let py = bounds.start + j; + let src_ptr = src.get_unchecked((src_stride * py + v_dx)..); + + let v_weight = xreinterpretq_f16_u16(vdupq_n_u16(k_weight)); + + let item_row = vld1q_u16(src_ptr.as_ptr()); + + store0 = xvfmlaq_f16(store0, xvcvtq_f16_u16(item_row), v_weight); + } + + let item = vminq_u16(xvcvtaq_u16_f16(store0), v_max_colors); + vst1q_u16(dst.as_mut_ptr(), item); + + cx = v_dx; + } + + let tail8 = tail16.chunks_exact_mut(8).into_remainder(); + let iter4 = tail8.chunks_exact_mut(4); + + let v_cx = cx; + + for (x, dst) in iter4.enumerate() { + let mut store0 = xvget_low_f16(initial_store); + + let v_dx = v_cx + x * 4; + + for (j, &k_weight) in weight.iter().take(bounds_size).enumerate() { + let py = bounds.start + j; + let src_ptr = src.get_unchecked((src_stride * py + v_dx)..); + + let v_weight = xreinterpret_f16_u16(vdup_n_u16(k_weight)); + + let item_row = vld1_u16(src_ptr.as_ptr()); + + store0 = xvfmla_f16(store0, xvcvt_f16_u16(item_row), v_weight); + } + + let u_store0 = vmin_u16(xvcvta_u16_f16(store0), vget_low_u16(v_max_colors)); + vst1_u16(dst.as_mut_ptr(), u_store0); + + cx = v_dx; + } + + let tail4 = tail8.chunks_exact_mut(4).into_remainder(); + + let a_px = cx; + + for (x, dst) in 
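// Illustrative note, not part of this patch: the weights arrive here as i16 values that are
// really binary16 bit patterns produced by convert_weights_to_f16 (src/neon/weights.rs), which
// is why the public wrapper reinterprets the slice as u16 and each lane weight is rebuilt with
// xreinterpretq_f16_u16(vdupq_n_u16(k_weight)). Each output column is then
// round(sum over j of weight_j * src[bounds.start + j][x]), accumulated in f16 and clamped to
// (1 << bit_depth) - 1. The loop below handles the final, fewer-than-four leftover columns one
// element at a time.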
tail4.iter_mut().enumerate() { + let mut store0 = xvget_low_f16(initial_store); + + let v_px = a_px + x; + + for (j, &k_weight) in weight.iter().take(bounds_size).enumerate() { + let py = bounds.start + j; + let offset = src_stride * py + v_px; + let src_ptr = src.get_unchecked(offset..(offset + 1)); + + let v_weight = xreinterpret_f16_u16(vdup_n_u16(k_weight)); + let item_row = xvcvt_f16_u16(vld1_lane_u16::<0>(src_ptr.as_ptr(), vdup_n_u16(0))); + store0 = xvfmla_f16(store0, item_row, v_weight); + } + + let u_store0 = vmin_u16(xvcvta_u16_f16(store0), vget_low_u16(v_max_colors)); + vst1_lane_u16::<0>(dst, u_store0); + } +} diff --git a/src/neon/vertical_u8.rs b/src/neon/vertical_u8.rs index 2a0f44a..62bb824 100644 --- a/src/neon/vertical_u8.rs +++ b/src/neon/vertical_u8.rs @@ -69,6 +69,13 @@ macro_rules! accumulate_4_into_lane { }}; } +/// Checking NEON `rdm` availability is required before a call. +/// +/// RDM feature has slightly lower precision and won't work really well on huge kernel which +/// edges fades out fast. Therefore, it would be reasonable to avoid using feature for huge downscaling. +/// +/// # Safety +/// - Check `rdm` availability before the call. pub(crate) fn convolve_vertical_neon_i16_precision( width: usize, bounds: &FilterBounds, @@ -133,7 +140,6 @@ unsafe fn convolve_vertical_neon_row_upper( let mut cx = 0usize; unsafe { - let zeros = vdupq_n_s16(0); let iter_64 = dst.chunks_exact_mut(64); let bounds_size = bounds.size; @@ -258,15 +264,6 @@ unsafe fn convolve_vertical_neon_row_upper( } } - store_0 = vmaxq_s16(store_0, zeros); - store_1 = vmaxq_s16(store_1, zeros); - store_2 = vmaxq_s16(store_2, zeros); - store_3 = vmaxq_s16(store_3, zeros); - store_4 = vmaxq_s16(store_4, zeros); - store_5 = vmaxq_s16(store_5, zeros); - store_6 = vmaxq_s16(store_6, zeros); - store_7 = vmaxq_s16(store_7, zeros); - let item00 = vqshrun_n_s16::(store_0); let item01 = vqshrun_n_s16::(store_1); let item10 = vqshrun_n_s16::(store_2); @@ -380,11 +377,6 @@ unsafe fn convolve_vertical_neon_row_upper( } } - store_0 = vmaxq_s16(store_0, zeros); - store_1 = vmaxq_s16(store_1, zeros); - store_2 = vmaxq_s16(store_2, zeros); - store_3 = vmaxq_s16(store_3, zeros); - let item00 = vqshrun_n_s16::(store_0); let item01 = vqshrun_n_s16::(store_1); let item10 = vqshrun_n_s16::(store_2); @@ -475,9 +467,6 @@ unsafe fn convolve_vertical_neon_row_upper( } } - store_0 = vmaxq_s16(store_0, zeros); - store_1 = vmaxq_s16(store_1, zeros); - let item0 = vqshrun_n_s16::(store_0); let item1 = vqshrun_n_s16::(store_1); @@ -568,8 +557,6 @@ unsafe fn convolve_vertical_neon_row_upper( } } - store_0 = vmaxq_s16(store_0, zeros); - let item = vqshrun_n_s16::(store_0); vst1_u8(dst.as_mut_ptr(), item); @@ -658,8 +645,6 @@ unsafe fn convolve_vertical_neon_row_upper( } } - store = vmaxq_s16(store, zeros); - let shrinked_store = vqshrun_n_s16::(store); let value = vget_lane_u8::<0>(shrinked_store); *dst = value; @@ -1207,10 +1192,6 @@ fn convolve_vertical_neon_row_full( } } - let zeros = vdupq_n_s32(0); - - store = vmaxq_s32(store, zeros); - let shrinked_store = vqshrun_n_s32::(store); let low_16 = vcombine_u16(shrinked_store, shrinked_store); diff --git a/src/neon/weights.rs b/src/neon/weights.rs new file mode 100644 index 0000000..2f7e390 --- /dev/null +++ b/src/neon/weights.rs @@ -0,0 +1,70 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +use crate::neon::f16_utils::{xvcombine_f16, xvcvt_f16_f32}; +use crate::neon::utils::xvld1q_f32_x2; +use crate::neon::{xreinterpret_u16_f16, xreinterpretq_u16_f16}; +use std::arch::aarch64::*; + +pub(crate) fn convert_weights_to_f16(weights: &[f32]) -> Vec { + unsafe { convert_weights_to_f16_impl(weights) } +} + +#[target_feature(enable = "fp16")] +unsafe fn convert_weights_to_f16_impl(weights: &[f32]) -> Vec { + let mut new_weights = vec![0i16; weights.len()]; + + for (dst, src) in new_weights.chunks_exact_mut(8).zip(weights.chunks_exact(8)) { + let j = xvld1q_f32_x2(src.as_ptr()); + let cvt0 = xvcvt_f16_f32(j.0); + let cvt1 = xvcvt_f16_f32(j.1); + vst1q_u16( + dst.as_mut_ptr() as *mut u16, + xreinterpretq_u16_f16(xvcombine_f16(cvt0, cvt1)), + ); + } + + let dst = new_weights.chunks_exact_mut(8).into_remainder(); + let src = weights.chunks_exact(8).remainder(); + + for (dst, src) in dst.chunks_exact_mut(4).zip(src.chunks_exact(4)) { + let j = vld1q_f32(src.as_ptr()); + let cvt = xvcvt_f16_f32(j); + vst1_u16(dst.as_mut_ptr() as *mut u16, xreinterpret_u16_f16(cvt)); + } + + let dst = dst.chunks_exact_mut(4).into_remainder(); + let src = src.chunks_exact(4).remainder(); + + for (dst, src) in dst.chunks_exact_mut(1).zip(src.iter()) { + let j = xvcvt_f16_f32(vld1q_lane_f32::<0>(src, vdupq_n_f32(0.))); + vst1_lane_u16::<0>(dst.as_mut_ptr() as *mut u16, xreinterpret_u16_f16(j)); + } + + new_weights +} diff --git a/src/plane_u8.rs b/src/plane_u8.rs index b2d43b8..099febe 100644 --- a/src/plane_u8.rs +++ b/src/plane_u8.rs @@ -30,7 +30,7 @@ use crate::avx2::{convolve_vertical_avx_row, convolve_vertical_avx_row_lp}; use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::dispatch_group_u8::{convolve_horizontal_dispatch_u8, convolve_vertical_dispatch_u8}; -use crate::filter_weights::{FilterBounds, FilterWeights}; +use crate::filter_weights::{DefaultWeightsConverter, FilterBounds, FilterWeights}; use crate::handler_provider::{handle_fixed_column_u8, handle_fixed_row_u8}; use crate::image_store::ImageStoreMut; #[cfg(all(target_arch = "aarch64", target_feature = "neon",))] @@ -79,6 +79,7 @@ impl HorizontalConvolutionPass for 
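convert_weights_to_f16 above stores each f32 weight as the raw bit pattern of its binary16 value inside an i16 buffer, presumably so the converted weights can flow through the existing i16 weight plumbing unchanged. A scalar equivalent, up to rounding mode, looks like this (illustrative only; it assumes the optional `half` crate that this library already uses behind the `half` feature):

    // Scalar reference: f32 weights -> raw binary16 bit patterns carried in an i16 buffer.
    fn convert_weights_to_f16_scalar(weights: &[f32]) -> Vec<i16> {
        weights
            .iter()
            .map(|&w| half::f16::from_f32(w).to_bits() as i16)
            .collect()
    }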
ImageStore<'_, u8, 1> { _pool, _dispatcher_4_rows, _dispatcher_1_row, + DefaultWeightsConverter::default(), ); } } diff --git a/src/rgb_u8.rs b/src/rgb_u8.rs index 8aff370..dae1038 100644 --- a/src/rgb_u8.rs +++ b/src/rgb_u8.rs @@ -30,7 +30,7 @@ use crate::avx2::{convolve_vertical_avx_row, convolve_vertical_avx_row_lp}; use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::dispatch_group_u8::{convolve_horizontal_dispatch_u8, convolve_vertical_dispatch_u8}; -use crate::filter_weights::{FilterBounds, FilterWeights}; +use crate::filter_weights::{DefaultWeightsConverter, FilterBounds, FilterWeights}; use crate::handler_provider::{ handle_fixed_column_u8, handle_fixed_row_u8, handle_fixed_rows_4_u8, }; @@ -78,6 +78,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 3> { pool, _dispatcher_4_rows, _dispatcher_1_row, + DefaultWeightsConverter::default(), ); } } diff --git a/src/rgba_u8.rs b/src/rgba_u8.rs index b99b586..14d3a23 100644 --- a/src/rgba_u8.rs +++ b/src/rgba_u8.rs @@ -35,7 +35,7 @@ use crate::avx2::{ }; use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::dispatch_group_u8::{convolve_horizontal_dispatch_u8, convolve_vertical_dispatch_u8}; -use crate::filter_weights::{FilterBounds, FilterWeights}; +use crate::filter_weights::{DefaultWeightsConverter, FilterBounds, FilterWeights}; use crate::handler_provider::{ handle_fixed_column_u8, handle_fixed_row_u8, handle_fixed_rows_4_u8, }; @@ -79,7 +79,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 4> { } #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] { - if std::is_x86_feature_detected!("sse4.1") { + if std::arch::is_x86_feature_detected!("sse4.1") { if _scale_factor < 8. { _dispatcher_4_rows = Some(convolve_horizontal_rgba_sse_rows_4_lb); _dispatcher_1_row = convolve_horizontal_rgba_sse_rows_one_lb; @@ -88,7 +88,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 4> { _dispatcher_1_row = convolve_horizontal_rgba_sse_rows_one; } } - if std::is_x86_feature_detected!("avx2") || _scale_factor < 8. { + if std::arch::is_x86_feature_detected!("avx2") || _scale_factor < 8. { _dispatcher_4_rows = Some(convolve_horizontal_rgba_avx_rows_4_lb); _dispatcher_1_row = convolve_horizontal_rgba_avx_rows_one_lb; } @@ -100,6 +100,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 4> { _pool, _dispatcher_4_rows, _dispatcher_1_row, + DefaultWeightsConverter::default(), ); } } @@ -126,14 +127,14 @@ impl VerticalConvolutionPass for ImageStore<'_, u8, 4> { } #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] { - if std::is_x86_feature_detected!("sse4.1") { + if std::arch::is_x86_feature_detected!("sse4.1") { if _scale_factor < 8. { _dispatcher = convolve_vertical_sse_row_lp; } else { _dispatcher = convolve_vertical_sse_row; } } - if std::is_x86_feature_detected!("avx2") { + if std::arch::is_x86_feature_detected!("avx2") { if _scale_factor < 8. 
{ _dispatcher = convolve_vertical_avx_row_lp; } else { diff --git a/src/scaler.rs b/src/scaler.rs index e726d39..283ccf4 100644 --- a/src/scaler.rs +++ b/src/scaler.rs @@ -1044,3 +1044,196 @@ impl Scaler { } } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn check_rgba8_resizing_vertical() { + let image_width = 255; + let image_height = 512; + const CN: usize = 4; + let mut image = vec![0u8; image_height * image_width * CN]; + image[image_width * CN * (image_height.div_ceil(2)) + (image_width - 1) * CN] = 174; + let mut scaler = Scaler::new(ResamplingFunction::Bilinear); + scaler.set_threading_policy(ThreadingPolicy::Single); + let src_store = ImageStore::from_slice(&image, image_width, image_height).unwrap(); + let mut target_store = ImageStoreMut::alloc(image_width, image_height / 2); + scaler + .resize_rgba(&src_store, &mut target_store, false) + .unwrap(); + let target_data = target_store.buffer.borrow(); + + let resized = target_data + [image_width * CN * ((image_height / 2).div_ceil(2)) + (image_width - 1) * CN]; + assert_ne!(resized, 0); + } + + #[test] + fn check_rgba8_resizing_both() { + let image_width = 255; + let image_height = 512; + const CN: usize = 4; + let mut image = vec![0u8; image_height * image_width * CN]; + image[0] = 174; + let mut scaler = Scaler::new(ResamplingFunction::Bilinear); + scaler.set_threading_policy(ThreadingPolicy::Single); + let src_store = ImageStore::from_slice(&image, image_width, image_height).unwrap(); + let mut target_store = ImageStoreMut::alloc(image_width / 2, image_height / 2); + scaler + .resize_rgba(&src_store, &mut target_store, false) + .unwrap(); + let target_data = target_store.buffer.borrow(); + + let resized = target_data[0]; + assert_ne!(resized, 0); + } + + #[test] + fn check_rgba8_resizing_alpha() { + let image_width = 255; + let image_height = 512; + const CN: usize = 4; + let mut image = vec![0u8; image_height * image_width * CN]; + image[0] = 174; + image[7] = 1; + let mut scaler = Scaler::new(ResamplingFunction::Bilinear); + scaler.set_threading_policy(ThreadingPolicy::Single); + let src_store = ImageStore::from_slice(&image, image_width, image_height).unwrap(); + let mut target_store = ImageStoreMut::alloc(image_width / 2, image_height / 2); + scaler + .resize_rgba(&src_store, &mut target_store, true) + .unwrap(); + let target_data = target_store.buffer.borrow(); + + let resized = target_data[0]; + assert_eq!(resized, 0); + } + + #[test] + fn check_rgb8_resizing_vertical() { + let image_width = 255; + let image_height = 512; + const CN: usize = 3; + let mut image = vec![0u8; image_height * image_width * CN]; + image[image_width * CN * (image_height.div_ceil(2)) + (image_width - 1) * CN] = 174; + let mut scaler = Scaler::new(ResamplingFunction::Bilinear); + scaler.set_threading_policy(ThreadingPolicy::Single); + let src_store = ImageStore::from_slice(&image, image_width, image_height).unwrap(); + let mut target_store = ImageStoreMut::alloc(image_width, image_height / 2); + scaler.resize_rgb(&src_store, &mut target_store).unwrap(); + let target_data = target_store.buffer.borrow(); + + let resized = target_data + [image_width * CN * ((image_height / 2).div_ceil(2)) + (image_width - 1) * CN]; + assert_ne!(resized, 0); + } + + #[test] + fn check_rgba10_resizing_vertical() { + let image_width = 8; + let image_height = 8; + const CN: usize = 4; + let mut image = vec![0u16; image_height * image_width * CN]; + image[image_width * CN * (image_height.div_ceil(2)) + (image_width - 1) * CN] = 174; + let mut scaler = 
Scaler::new(ResamplingFunction::Lanczos3); + scaler.set_threading_policy(ThreadingPolicy::Single); + let mut src_store = ImageStore::from_slice(&image, image_width, image_height).unwrap(); + src_store.bit_depth = 10; + let mut target_store = ImageStoreMut::alloc_with_depth(image_width, image_height / 2, 10); + scaler + .resize_rgba_u16(&src_store, &mut target_store, false) + .unwrap(); + let target_data = target_store.buffer.borrow(); + + let resized = target_data + [image_width * CN * ((image_height / 2).div_ceil(2)) + (image_width - 1) * CN]; + assert_ne!(resized, 0); + } + + #[test] + fn check_rgb10_resizing_vertical() { + let image_width = 8; + let image_height = 4; + const CN: usize = 3; + let mut image = vec![0; image_height * image_width * CN]; + image[0] = 174; + let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); + scaler.set_threading_policy(ThreadingPolicy::Single); + let mut src_store = ImageStore::from_slice(&image, image_width, image_height).unwrap(); + src_store.bit_depth = 10; + let mut target_store = ImageStoreMut::alloc_with_depth(image_width, image_height / 2, 10); + scaler + .resize_rgb_u16(&src_store, &mut target_store) + .unwrap(); + let target_data = target_store.buffer.borrow(); + + let resized = target_data[0]; + assert_ne!(resized, 0); + } + + #[test] + fn check_rgb16_resizing_vertical() { + let image_width = 8; + let image_height = 8; + const CN: usize = 3; + let mut image = vec![164; image_height * image_width * CN]; + image[0] = 174; + let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); + scaler.set_threading_policy(ThreadingPolicy::Single); + let mut src_store = ImageStore::from_slice(&image, image_width, image_height).unwrap(); + src_store.bit_depth = 10; + let mut target_store = ImageStoreMut::alloc_with_depth(image_width, image_height / 2, 16); + scaler + .resize_rgb_u16(&src_store, &mut target_store) + .unwrap(); + let target_data = target_store.buffer.borrow(); + + let resized = target_data[0]; + assert_ne!(resized, 0); + } + + #[test] + fn check_rgba16_resizing_vertical() { + let image_width = 8; + let image_height = 8; + const CN: usize = 4; + let mut image = vec![0u16; image_height * image_width * CN]; + image[image_width * CN * (image_height.div_ceil(2)) + (image_width - 1) * CN] = 174; + let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); + scaler.set_threading_policy(ThreadingPolicy::Single); + let mut src_store = ImageStore::from_slice(&image, image_width, image_height).unwrap(); + src_store.bit_depth = 10; + let mut target_store = ImageStoreMut::alloc_with_depth(image_width, image_height / 2, 16); + scaler + .resize_rgba_u16(&src_store, &mut target_store, false) + .unwrap(); + let target_data = target_store.buffer.borrow(); + + let resized = target_data + [image_width * CN * ((image_height / 2).div_ceil(2)) + (image_width - 1) * CN]; + assert_ne!(resized, 0); + } + + #[test] + fn check_rgba8_nearest_vertical() { + let image_width = 255; + let image_height = 512; + const CN: usize = 4; + let mut image = vec![0u8; image_height * image_width * CN]; + image[image_width * CN * (image_height.div_ceil(2)) + (image_width - 1) * CN] = 174; + let mut scaler = Scaler::new(ResamplingFunction::Nearest); + scaler.set_threading_policy(ThreadingPolicy::Single); + let src_store = ImageStore::from_slice(&image, image_width, image_height).unwrap(); + let mut target_store = ImageStoreMut::alloc(image_width, image_height / 2); + scaler + .resize_rgba(&src_store, &mut target_store, false) + .unwrap(); + let target_data = 
target_store.buffer.borrow(); + + let resized = target_data + [image_width * CN * ((image_height / 2).div_ceil(2)) + (image_width - 1) * CN]; + assert_eq!(resized, 174); + } +} diff --git a/src/sse/alpha_f16.rs b/src/sse/alpha_f16.rs index b5ed21d..43f8bd0 100644 --- a/src/sse/alpha_f16.rs +++ b/src/sse/alpha_f16.rs @@ -195,78 +195,76 @@ unsafe fn sse_unpremultiply_alpha_rgba_f16c( unsafe fn sse_unpremultiply_alpha_rgba_f16_row_impl(in_place: &mut [half::f16]) { let mut rem = in_place; - unsafe { - for dst in rem.chunks_exact_mut(8 * 4) { - let src_ptr = dst.as_ptr(); - let lane0 = _mm_loadu_si128(src_ptr as *const __m128i); - let lane1 = _mm_loadu_si128(src_ptr.add(8) as *const __m128i); - let lane2 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); - let lane3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); - let pixel = sse_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); - - let low_alpha = _mm_cvtph_psx::(pixel.3); - let zeros = _mm_setzero_ps(); - let low_alpha_zero_mask = _mm_cmpeq_ps(low_alpha, zeros); - let low_r = _mm_blendv_ps( - _mm_mul_ps(_mm_cvtph_psx::(pixel.0), low_alpha), - zeros, - low_alpha_zero_mask, - ); - let low_g = _mm_blendv_ps( - _mm_mul_ps(_mm_cvtph_psx::(pixel.1), low_alpha), - zeros, - low_alpha_zero_mask, - ); - let low_b = _mm_blendv_ps( - _mm_mul_ps(_mm_cvtph_psx::(pixel.2), low_alpha), - zeros, - low_alpha_zero_mask, - ); + for dst in rem.chunks_exact_mut(8 * 4) { + let src_ptr = dst.as_ptr(); + let lane0 = _mm_loadu_si128(src_ptr as *const __m128i); + let lane1 = _mm_loadu_si128(src_ptr.add(8) as *const __m128i); + let lane2 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); + let lane3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); + let pixel = sse_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); - let high_alpha = _mm_cvtph_psx::(_mm_srli_si128::<8>(pixel.3)); - let high_alpha_zero_mask = _mm_cmpeq_ps(high_alpha, zeros); - let high_r = _mm_blendv_ps( - _mm_mul_ps( - _mm_cvtph_psx::(_mm_srli_si128::<8>(pixel.0)), - high_alpha, - ), - zeros, - high_alpha_zero_mask, - ); - let high_g = _mm_blendv_ps( - _mm_mul_ps( - _mm_cvtph_psx::(_mm_srli_si128::<8>(pixel.1)), - high_alpha, - ), - zeros, - high_alpha_zero_mask, - ); - let high_b = _mm_blendv_ps( - _mm_mul_ps( - _mm_cvtph_psx::(_mm_srli_si128::<8>(pixel.2)), - high_alpha, - ), - zeros, - high_alpha_zero_mask, - ); - let r_values = - _mm_unpacklo_epi64(_mm_cvtps_phx::(low_r), _mm_cvtps_phx::(high_r)); - let g_values = - _mm_unpacklo_epi64(_mm_cvtps_phx::(low_g), _mm_cvtps_phx::(high_g)); - let b_values = - _mm_unpacklo_epi64(_mm_cvtps_phx::(low_b), _mm_cvtps_phx::(high_b)); - let dst_ptr = dst.as_mut_ptr(); - let (d_lane0, d_lane1, d_lane2, d_lane3) = - sse_interleave_rgba_epi16(r_values, g_values, b_values, pixel.3); - _mm_storeu_si128(dst_ptr as *mut __m128i, d_lane0); - _mm_storeu_si128(dst_ptr.add(8) as *mut __m128i, d_lane1); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, d_lane2); - _mm_storeu_si128(dst_ptr.add(24) as *mut __m128i, d_lane3); - } + let low_alpha = _mm_cvtph_psx::(pixel.3); + let zeros = _mm_setzero_ps(); + let low_alpha_zero_mask = _mm_cmpeq_ps(low_alpha, zeros); + let low_r = _mm_blendv_ps( + _mm_mul_ps(_mm_cvtph_psx::(pixel.0), low_alpha), + zeros, + low_alpha_zero_mask, + ); + let low_g = _mm_blendv_ps( + _mm_mul_ps(_mm_cvtph_psx::(pixel.1), low_alpha), + zeros, + low_alpha_zero_mask, + ); + let low_b = _mm_blendv_ps( + _mm_mul_ps(_mm_cvtph_psx::(pixel.2), low_alpha), + zeros, + low_alpha_zero_mask, + ); - rem = rem.chunks_exact_mut(8 * 
4).into_remainder(); + let high_alpha = _mm_cvtph_psx::(_mm_srli_si128::<8>(pixel.3)); + let high_alpha_zero_mask = _mm_cmpeq_ps(high_alpha, zeros); + let high_r = _mm_blendv_ps( + _mm_mul_ps( + _mm_cvtph_psx::(_mm_srli_si128::<8>(pixel.0)), + high_alpha, + ), + zeros, + high_alpha_zero_mask, + ); + let high_g = _mm_blendv_ps( + _mm_mul_ps( + _mm_cvtph_psx::(_mm_srli_si128::<8>(pixel.1)), + high_alpha, + ), + zeros, + high_alpha_zero_mask, + ); + let high_b = _mm_blendv_ps( + _mm_mul_ps( + _mm_cvtph_psx::(_mm_srli_si128::<8>(pixel.2)), + high_alpha, + ), + zeros, + high_alpha_zero_mask, + ); + let r_values = + _mm_unpacklo_epi64(_mm_cvtps_phx::(low_r), _mm_cvtps_phx::(high_r)); + let g_values = + _mm_unpacklo_epi64(_mm_cvtps_phx::(low_g), _mm_cvtps_phx::(high_g)); + let b_values = + _mm_unpacklo_epi64(_mm_cvtps_phx::(low_b), _mm_cvtps_phx::(high_b)); + let dst_ptr = dst.as_mut_ptr(); + let (d_lane0, d_lane1, d_lane2, d_lane3) = + sse_interleave_rgba_epi16(r_values, g_values, b_values, pixel.3); + _mm_storeu_si128(dst_ptr as *mut __m128i, d_lane0); + _mm_storeu_si128(dst_ptr.add(8) as *mut __m128i, d_lane1); + _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, d_lane2); + _mm_storeu_si128(dst_ptr.add(24) as *mut __m128i, d_lane3); } + rem = rem.chunks_exact_mut(8 * 4).into_remainder(); + unpremultiply_pixel_f16_row(rem); } diff --git a/src/sse/alpha_u16.rs b/src/sse/alpha_u16.rs index 38e79e4..5910361 100644 --- a/src/sse/alpha_u16.rs +++ b/src/sse/alpha_u16.rs @@ -88,6 +88,17 @@ pub(crate) unsafe fn _mm_div_by_65535_epi32(v: __m128i) -> __m128i { _mm_srli_epi32::(_mm_add_epi32(v, _mm_srli_epi32::(v))) } +#[inline(always)] +unsafe fn _mm_div_by(v: __m128i) -> __m128i { + if BIT_DEPTH == 10 { + _mm_div_by_1023_epi32(v) + } else if BIT_DEPTH == 12 { + _mm_div_by_4095_epi32(v) + } else { + _mm_div_by_65535_epi32(v) + } +} + pub(crate) fn unpremultiply_alpha_sse_rgba_u16( in_place: &mut [u16], width: usize, @@ -226,169 +237,144 @@ pub(crate) fn premultiply_alpha_sse_rgba_u16( } } -#[target_feature(enable = "sse4.1")] -unsafe fn premultiply_alpha_sse_rgba_u16_row_impl(dst: &mut [u16], src: &[u16], bit_depth: usize) { - let max_colors = (1 << bit_depth) - 1; +trait Sse41PremultiplyExecutor { + unsafe fn premultiply(&self, dst: &mut [u16], src: &[u16], bit_depth: usize); +} - let mut rem = dst; - let mut src_rem = src; +#[derive(Default)] +struct Sse41PremultiplyExecutorDefault {} - unsafe { - if bit_depth == 10 { - let zeros = _mm_setzero_si128(); - for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { - let src_ptr = src.as_ptr(); - let row0 = _mm_loadu_si128(src_ptr as *const __m128i); - let row1 = _mm_loadu_si128(src_ptr.add(8) as *const __m128i); - let row2 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); - let row3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); - let (rrrr, gggg, bbbb, aaaa) = sse_deinterleave_rgba_epi16(row0, row1, row2, row3); - - let a_lo_f = _mm_unpacklo_epi16(aaaa, zeros); - let a_hi_f = _mm_unpackhi_epi16(aaaa, zeros); - - let new_rrrr = _mm_packus_epi32( - _mm_div_by_1023_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(rrrr, zeros), a_lo_f)), - _mm_div_by_1023_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(rrrr, zeros), a_hi_f)), - ); - let new_gggg = _mm_packus_epi32( - _mm_div_by_1023_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(gggg, zeros), a_lo_f)), - _mm_div_by_1023_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(gggg, zeros), a_hi_f)), - ); - let new_bbbb = _mm_packus_epi32( - _mm_div_by_1023_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(bbbb, zeros), 
a_lo_f)), - _mm_div_by_1023_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(bbbb, zeros), a_hi_f)), - ); - - let (rgba0, rgba1, rgba2, rgba3) = - sse_interleave_rgba_epi16(new_rrrr, new_gggg, new_bbbb, aaaa); - - let dst_ptr = dst.as_mut_ptr(); - _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); - _mm_storeu_si128(dst_ptr.add(8) as *mut __m128i, rgba1); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba2); - _mm_storeu_si128(dst_ptr.add(24) as *mut __m128i, rgba3); - } - } else if bit_depth == 12 { - let zeros = _mm_setzero_si128(); - for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { - let src_ptr = src.as_ptr(); - let row0 = _mm_loadu_si128(src_ptr as *const __m128i); - let row1 = _mm_loadu_si128(src_ptr.add(8) as *const __m128i); - let row2 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); - let row3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); - let (rrrr, gggg, bbbb, aaaa) = sse_deinterleave_rgba_epi16(row0, row1, row2, row3); - - let a_lo_f = _mm_unpacklo_epi16(aaaa, zeros); - let a_hi_f = _mm_unpackhi_epi16(aaaa, zeros); - - let new_rrrr = _mm_packus_epi32( - _mm_div_by_4095_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(rrrr, zeros), a_lo_f)), - _mm_div_by_4095_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(rrrr, zeros), a_hi_f)), - ); - let new_gggg = _mm_packus_epi32( - _mm_div_by_4095_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(gggg, zeros), a_lo_f)), - _mm_div_by_4095_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(gggg, zeros), a_hi_f)), - ); - let new_bbbb = _mm_packus_epi32( - _mm_div_by_4095_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(bbbb, zeros), a_lo_f)), - _mm_div_by_4095_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(bbbb, zeros), a_hi_f)), - ); - - let (rgba0, rgba1, rgba2, rgba3) = - sse_interleave_rgba_epi16(new_rrrr, new_gggg, new_bbbb, aaaa); - - let dst_ptr = dst.as_mut_ptr(); - _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); - _mm_storeu_si128(dst_ptr.add(8) as *mut __m128i, rgba1); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba2); - _mm_storeu_si128(dst_ptr.add(24) as *mut __m128i, rgba3); - } - } else if bit_depth == 16 { - let zeros = _mm_setzero_si128(); - for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { - let src_ptr = src.as_ptr(); - let row0 = _mm_loadu_si128(src_ptr as *const __m128i); - let row1 = _mm_loadu_si128(src_ptr.add(8) as *const __m128i); - let row2 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); - let row3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); - let (rrrr, gggg, bbbb, aaaa) = sse_deinterleave_rgba_epi16(row0, row1, row2, row3); - - let a_lo_f = _mm_unpacklo_epi16(aaaa, zeros); - let a_hi_f = _mm_unpackhi_epi16(aaaa, zeros); - - let new_rrrr = _mm_packus_epi32( - _mm_div_by_65535_epi32(_mm_mullo_epi32( - _mm_unpacklo_epi16(rrrr, zeros), - a_lo_f, - )), - _mm_div_by_65535_epi32(_mm_mullo_epi32( - _mm_unpackhi_epi16(rrrr, zeros), - a_hi_f, - )), - ); - let new_gggg = _mm_packus_epi32( - _mm_div_by_65535_epi32(_mm_mullo_epi32( - _mm_unpacklo_epi16(gggg, zeros), - a_lo_f, - )), - _mm_div_by_65535_epi32(_mm_mullo_epi32( - _mm_unpackhi_epi16(gggg, zeros), - a_hi_f, - )), - ); - let new_bbbb = _mm_packus_epi32( - _mm_div_by_65535_epi32(_mm_mullo_epi32( - _mm_unpacklo_epi16(bbbb, zeros), - a_lo_f, - )), - _mm_div_by_65535_epi32(_mm_mullo_epi32( - _mm_unpackhi_epi16(bbbb, zeros), - a_hi_f, - )), - ); - - let (rgba0, rgba1, rgba2, rgba3) = - sse_interleave_rgba_epi16(new_rrrr, new_gggg, new_bbbb, aaaa); - - let dst_ptr = dst.as_mut_ptr(); - _mm_storeu_si128(dst_ptr as *mut __m128i, 
rgba0); - _mm_storeu_si128(dst_ptr.add(8) as *mut __m128i, rgba1); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba2); - _mm_storeu_si128(dst_ptr.add(24) as *mut __m128i, rgba3); - } - } else { - let v_max_colors_scale = - _mm_div_ps(_mm_set1_ps(1.), _mm_cvtepi32_ps(_mm_set1_epi32(max_colors))); - for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { - let src_ptr = src.as_ptr(); - let row0 = _mm_loadu_si128(src_ptr as *const __m128i); - let row1 = _mm_loadu_si128(src_ptr.add(8) as *const __m128i); - let row2 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); - let row3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); - let (rrrr, gggg, bbbb, aaaa) = sse_deinterleave_rgba_epi16(row0, row1, row2, row3); - - let a_lo_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(aaaa, _mm_setzero_si128())); - let a_hi_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(aaaa, _mm_setzero_si128())); - - let new_rrrr = sse_premultiply_row_u16(rrrr, a_lo_f, a_hi_f, v_max_colors_scale); - let new_gggg = sse_premultiply_row_u16(gggg, a_lo_f, a_hi_f, v_max_colors_scale); - let new_bbbb = sse_premultiply_row_u16(bbbb, a_lo_f, a_hi_f, v_max_colors_scale); - - let (rgba0, rgba1, rgba2, rgba3) = - sse_interleave_rgba_epi16(new_rrrr, new_gggg, new_bbbb, aaaa); - - let dst_ptr = dst.as_mut_ptr(); - _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); - _mm_storeu_si128(dst_ptr.add(8) as *mut __m128i, rgba1); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba2); - _mm_storeu_si128(dst_ptr.add(24) as *mut __m128i, rgba3); - } +impl Sse41PremultiplyExecutor + for Sse41PremultiplyExecutorDefault +{ + #[target_feature(enable = "sse4.1")] + unsafe fn premultiply(&self, dst: &mut [u16], src: &[u16], bit_depth: usize) { + let max_colors = (1 << bit_depth) - 1; + + let mut rem = dst; + let mut src_rem = src; + + let zeros = _mm_setzero_si128(); + for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { + let src_ptr = src.as_ptr(); + let row0 = _mm_loadu_si128(src_ptr as *const __m128i); + let row1 = _mm_loadu_si128(src_ptr.add(8) as *const __m128i); + let row2 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); + let row3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); + let (rrrr, gggg, bbbb, aaaa) = sse_deinterleave_rgba_epi16(row0, row1, row2, row3); + + let a_lo_f = _mm_unpacklo_epi16(aaaa, zeros); + let a_hi_f = _mm_unpackhi_epi16(aaaa, zeros); + + let new_rrrr = _mm_packus_epi32( + _mm_div_by::(_mm_madd_epi16(_mm_unpacklo_epi16(rrrr, zeros), a_lo_f)), + _mm_div_by::(_mm_madd_epi16(_mm_unpackhi_epi16(rrrr, zeros), a_hi_f)), + ); + let new_gggg = _mm_packus_epi32( + _mm_div_by::(_mm_madd_epi16(_mm_unpacklo_epi16(gggg, zeros), a_lo_f)), + _mm_div_by::(_mm_madd_epi16(_mm_unpackhi_epi16(gggg, zeros), a_hi_f)), + ); + let new_bbbb = _mm_packus_epi32( + _mm_div_by::(_mm_madd_epi16(_mm_unpacklo_epi16(bbbb, zeros), a_lo_f)), + _mm_div_by::(_mm_madd_epi16(_mm_unpackhi_epi16(bbbb, zeros), a_hi_f)), + ); + + let (rgba0, rgba1, rgba2, rgba3) = + sse_interleave_rgba_epi16(new_rrrr, new_gggg, new_bbbb, aaaa); + + let dst_ptr = dst.as_mut_ptr(); + _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); + _mm_storeu_si128(dst_ptr.add(8) as *mut __m128i, rgba1); + _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba2); + _mm_storeu_si128(dst_ptr.add(24) as *mut __m128i, rgba3); } rem = rem.chunks_exact_mut(8 * 4).into_remainder(); src_rem = src_rem.chunks_exact(8 * 4).remainder(); + + premultiply_alpha_rgba_row(rem, src_rem, max_colors as u32); } +} + +#[derive(Default)] +struct 
Sse41PremultiplyExecutorAny {} + +impl Sse41PremultiplyExecutor for Sse41PremultiplyExecutorAny { + #[target_feature(enable = "sse4.1")] + unsafe fn premultiply(&self, dst: &mut [u16], src: &[u16], bit_depth: usize) { + let max_colors = (1 << bit_depth) - 1; + + let mut rem = dst; + let mut src_rem = src; - premultiply_alpha_rgba_row(rem, src_rem, max_colors as u32); + let v_max_colors_scale = + _mm_div_ps(_mm_set1_ps(1.), _mm_cvtepi32_ps(_mm_set1_epi32(max_colors))); + for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { + let src_ptr = src.as_ptr(); + let row0 = _mm_loadu_si128(src_ptr as *const __m128i); + let row1 = _mm_loadu_si128(src_ptr.add(8) as *const __m128i); + let row2 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); + let row3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); + let (rrrr, gggg, bbbb, aaaa) = sse_deinterleave_rgba_epi16(row0, row1, row2, row3); + + let a_lo_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(aaaa, _mm_setzero_si128())); + let a_hi_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(aaaa, _mm_setzero_si128())); + + let new_rrrr = sse_premultiply_row_u16(rrrr, a_lo_f, a_hi_f, v_max_colors_scale); + let new_gggg = sse_premultiply_row_u16(gggg, a_lo_f, a_hi_f, v_max_colors_scale); + let new_bbbb = sse_premultiply_row_u16(bbbb, a_lo_f, a_hi_f, v_max_colors_scale); + + let (rgba0, rgba1, rgba2, rgba3) = + sse_interleave_rgba_epi16(new_rrrr, new_gggg, new_bbbb, aaaa); + + let dst_ptr = dst.as_mut_ptr(); + _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); + _mm_storeu_si128(dst_ptr.add(8) as *mut __m128i, rgba1); + _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba2); + _mm_storeu_si128(dst_ptr.add(24) as *mut __m128i, rgba3); + } + + rem = rem.chunks_exact_mut(8 * 4).into_remainder(); + src_rem = src_rem.chunks_exact(8 * 4).remainder(); + + premultiply_alpha_rgba_row(rem, src_rem, max_colors as u32); + } +} + +#[target_feature(enable = "sse4.1")] +unsafe fn pma_sse41_rgba16_dispatch( + dst: &mut [u16], + src: &[u16], + bit_depth: usize, + executor: impl Sse41PremultiplyExecutor, +) { + executor.premultiply(dst, src, bit_depth); +} + +#[target_feature(enable = "sse4.1")] +unsafe fn premultiply_alpha_sse_rgba_u16_row_impl(dst: &mut [u16], src: &[u16], bit_depth: usize) { + if bit_depth == 10 { + pma_sse41_rgba16_dispatch( + dst, + src, + bit_depth, + Sse41PremultiplyExecutorDefault::<10>::default(), + ) + } else if bit_depth == 12 { + pma_sse41_rgba16_dispatch( + dst, + src, + bit_depth, + Sse41PremultiplyExecutorDefault::<12>::default(), + ) + } else if bit_depth == 16 { + pma_sse41_rgba16_dispatch( + dst, + src, + bit_depth, + Sse41PremultiplyExecutorDefault::<16>::default(), + ) + } else { + pma_sse41_rgba16_dispatch(dst, src, bit_depth, Sse41PremultiplyExecutorAny::default()) + } } diff --git a/src/sse/alpha_u8.rs b/src/sse/alpha_u8.rs index 32f01cd..53a75ab 100644 --- a/src/sse/alpha_u8.rs +++ b/src/sse/alpha_u8.rs @@ -111,55 +111,53 @@ unsafe fn sse_premultiply_alpha_rgba_impl_row(dst: &mut [u8], src: &[u8]) { let mut rem = dst; let mut src_rem = src; - unsafe { - let zeros = _mm_setzero_si128(); - for (dst, src) in rem - .chunks_exact_mut(16 * 4) - .zip(src_rem.chunks_exact(16 * 4)) - { - let src_ptr = src.as_ptr(); - let rgba0 = _mm_loadu_si128(src_ptr as *const __m128i); - let rgba1 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); - let rgba2 = _mm_loadu_si128(src_ptr.add(32) as *const __m128i); - let rgba3 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i); - let (rrr, ggg, bbb, aaa) = sse_deinterleave_rgba(rgba0, rgba1, 
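The refactor above moves the former 10/12/16-bit branches behind `Sse41PremultiplyExecutorDefault::<BIT_DEPTH>`, so the divisor is resolved at compile time instead of being re-checked in the hot loop. A scalar sketch of the same idea (illustrative only; the exact rounding of the SIMD helpers may differ):

// Premultiply one channel by alpha for a given bit depth; the divisor
// (2^BIT_DEPTH - 1) is picked at compile time, like `_mm_div_by::<BIT_DEPTH>` above.
fn premultiply_scalar<const BIT_DEPTH: usize>(v: u16, a: u16) -> u16 {
    let max = (1u32 << BIT_DEPTH) - 1; // 1023, 4095 or 65535
    ((v as u32 * a as u32 + max / 2) / max) as u16
}

// Dispatch once per row instead of branching per pixel.
fn premultiply_row(dst: &mut [u16], alpha: &[u16], bit_depth: usize) {
    match bit_depth {
        10 => dst.iter_mut().zip(alpha).for_each(|(v, &a)| *v = premultiply_scalar::<10>(*v, a)),
        12 => dst.iter_mut().zip(alpha).for_each(|(v, &a)| *v = premultiply_scalar::<12>(*v, a)),
        _ => dst.iter_mut().zip(alpha).for_each(|(v, &a)| *v = premultiply_scalar::<16>(*v, a)),
    }
}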
rgba2, rgba3); - - let mut rrr_low = _mm_unpacklo_epi8(rrr, zeros); - let mut rrr_high = _mm_unpackhi_epi8(rrr, zeros); - - let mut ggg_low = _mm_unpacklo_epi8(ggg, zeros); - let mut ggg_high = _mm_unpackhi_epi8(ggg, zeros); - - let mut bbb_low = _mm_unpacklo_epi8(bbb, zeros); - let mut bbb_high = _mm_unpackhi_epi8(bbb, zeros); - - let aaa_low = _mm_unpacklo_epi8(aaa, zeros); - let aaa_high = _mm_unpackhi_epi8(aaa, zeros); - - rrr_low = _mm_div_by_255_epi16(_mm_mullo_epi16(rrr_low, aaa_low)); - rrr_high = _mm_div_by_255_epi16(_mm_mullo_epi16(rrr_high, aaa_high)); - ggg_low = _mm_div_by_255_epi16(_mm_mullo_epi16(ggg_low, aaa_low)); - ggg_high = _mm_div_by_255_epi16(_mm_mullo_epi16(ggg_high, aaa_high)); - bbb_low = _mm_div_by_255_epi16(_mm_mullo_epi16(bbb_low, aaa_low)); - bbb_high = _mm_div_by_255_epi16(_mm_mullo_epi16(bbb_high, aaa_high)); - - let rrr = _mm_packus_epi16(rrr_low, rrr_high); - let ggg = _mm_packus_epi16(ggg_low, ggg_high); - let bbb = _mm_packus_epi16(bbb_low, bbb_high); - - let (rgba0, rgba1, rgba2, rgba3) = sse_interleave_rgba(rrr, ggg, bbb, aaa); - - let dst_ptr = dst.as_mut_ptr(); - _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba1); - _mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, rgba2); - _mm_storeu_si128(dst_ptr.add(48) as *mut __m128i, rgba3); - } - - rem = rem.chunks_exact_mut(16 * 4).into_remainder(); - src_rem = src_rem.chunks_exact(16 * 4).remainder(); + let zeros = _mm_setzero_si128(); + for (dst, src) in rem + .chunks_exact_mut(16 * 4) + .zip(src_rem.chunks_exact(16 * 4)) + { + let src_ptr = src.as_ptr(); + let rgba0 = _mm_loadu_si128(src_ptr as *const __m128i); + let rgba1 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); + let rgba2 = _mm_loadu_si128(src_ptr.add(32) as *const __m128i); + let rgba3 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i); + let (rrr, ggg, bbb, aaa) = sse_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); + + let mut rrr_low = _mm_unpacklo_epi8(rrr, zeros); + let mut rrr_high = _mm_unpackhi_epi8(rrr, zeros); + + let mut ggg_low = _mm_unpacklo_epi8(ggg, zeros); + let mut ggg_high = _mm_unpackhi_epi8(ggg, zeros); + + let mut bbb_low = _mm_unpacklo_epi8(bbb, zeros); + let mut bbb_high = _mm_unpackhi_epi8(bbb, zeros); + + let aaa_low = _mm_unpacklo_epi8(aaa, zeros); + let aaa_high = _mm_unpackhi_epi8(aaa, zeros); + + rrr_low = _mm_div_by_255_epi16(_mm_mullo_epi16(rrr_low, aaa_low)); + rrr_high = _mm_div_by_255_epi16(_mm_mullo_epi16(rrr_high, aaa_high)); + ggg_low = _mm_div_by_255_epi16(_mm_mullo_epi16(ggg_low, aaa_low)); + ggg_high = _mm_div_by_255_epi16(_mm_mullo_epi16(ggg_high, aaa_high)); + bbb_low = _mm_div_by_255_epi16(_mm_mullo_epi16(bbb_low, aaa_low)); + bbb_high = _mm_div_by_255_epi16(_mm_mullo_epi16(bbb_high, aaa_high)); + + let rrr = _mm_packus_epi16(rrr_low, rrr_high); + let ggg = _mm_packus_epi16(ggg_low, ggg_high); + let bbb = _mm_packus_epi16(bbb_low, bbb_high); + + let (rgba0, rgba1, rgba2, rgba3) = sse_interleave_rgba(rrr, ggg, bbb, aaa); + + let dst_ptr = dst.as_mut_ptr(); + _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); + _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba1); + _mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, rgba2); + _mm_storeu_si128(dst_ptr.add(48) as *mut __m128i, rgba3); } + rem = rem.chunks_exact_mut(16 * 4).into_remainder(); + src_rem = src_rem.chunks_exact(16 * 4).remainder(); + premultiply_alpha_rgba_row_impl(rem, src_rem); } @@ -203,31 +201,30 @@ pub(crate) fn sse_unpremultiply_alpha_rgba( #[target_feature(enable = 
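For reference, the `_mm_div_by_255_epi16(_mm_mullo_epi16(..))` step above corresponds to the usual integer approximation of `v * a / 255`; a scalar sketch (illustrative, and the SIMD helper's rounding may differ slightly):

// v * a / 255 with round-to-nearest, using the classic
// t = x + 128; (t + (t >> 8)) >> 8 trick instead of a real division.
fn premultiply_u8(v: u8, a: u8) -> u8 {
    let t = v as u32 * a as u32 + 128;
    ((t + (t >> 8)) >> 8) as u8
}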
"sse4.1")] unsafe fn sse_unpremultiply_alpha_rgba_impl_row(in_place: &mut [u8]) { let mut rem = in_place; - unsafe { - for dst in rem.chunks_exact_mut(16 * 4) { - let src_ptr = dst.as_ptr(); - let rgba0 = _mm_loadu_si128(src_ptr as *const __m128i); - let rgba1 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); - let rgba2 = _mm_loadu_si128(src_ptr.add(32) as *const __m128i); - let rgba3 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i); - let (rrr, ggg, bbb, aaa) = sse_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); - - let rrr = sse_unpremultiply_row(rrr, aaa); - let ggg = sse_unpremultiply_row(ggg, aaa); - let bbb = sse_unpremultiply_row(bbb, aaa); - - let (rgba0, rgba1, rgba2, rgba3) = sse_interleave_rgba(rrr, ggg, bbb, aaa); - - let dst_ptr = dst.as_mut_ptr(); - _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba1); - _mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, rgba2); - _mm_storeu_si128(dst_ptr.add(48) as *mut __m128i, rgba3); - } - rem = rem.chunks_exact_mut(8 * 4).into_remainder(); + for dst in rem.chunks_exact_mut(16 * 4) { + let src_ptr = dst.as_ptr(); + let rgba0 = _mm_loadu_si128(src_ptr as *const __m128i); + let rgba1 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); + let rgba2 = _mm_loadu_si128(src_ptr.add(32) as *const __m128i); + let rgba3 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i); + let (rrr, ggg, bbb, aaa) = sse_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); + + let rrr = sse_unpremultiply_row(rrr, aaa); + let ggg = sse_unpremultiply_row(ggg, aaa); + let bbb = sse_unpremultiply_row(bbb, aaa); + + let (rgba0, rgba1, rgba2, rgba3) = sse_interleave_rgba(rrr, ggg, bbb, aaa); + + let dst_ptr = dst.as_mut_ptr(); + _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); + _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba1); + _mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, rgba2); + _mm_storeu_si128(dst_ptr.add(48) as *mut __m128i, rgba3); } + rem = rem.chunks_exact_mut(8 * 4).into_remainder(); + for dst in rem.chunks_exact_mut(4) { let a = dst[3]; if a != 0 { diff --git a/src/sse/check_alpha.rs b/src/sse/check_alpha.rs new file mode 100644 index 0000000..85133e3 --- /dev/null +++ b/src/sse/check_alpha.rs @@ -0,0 +1,219 @@ +/* + * Copyright (c) Radzivon Bartoshyk 12/2024. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +use crate::sse::_mm_hsum_epi32; +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +/// Checks if image has constant alpha by xor rows +pub(crate) fn sse_has_non_constant_cap_alpha_rgba8( + store: &[u8], + width: usize, + stride: usize, +) -> bool { + unsafe { sse_has_non_constant_cap_alpha_rgba8_impl(store, width, stride) } +} + +#[target_feature(enable = "sse4.1")] +unsafe fn sse_has_non_constant_cap_alpha_rgba8_impl( + store: &[u8], + width: usize, + stride: usize, +) -> bool { + if store.is_empty() { + return true; + } + + let sh0 = _mm_setr_epi8(3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1); + + let first_alpha = store[3]; + let def_alpha = _mm_set1_epi32(first_alpha as i32); + + for row in store.chunks_exact(stride) { + let row = &row[0..width * 4]; + let mut sums = _mm_set1_epi32(0); + for chunk in row.chunks_exact(16 * 4) { + let mut r0 = _mm_loadu_si128(chunk.as_ptr() as *const __m128i); + let mut r1 = _mm_loadu_si128(chunk.get_unchecked(16..).as_ptr() as *const __m128i); + let mut r2 = _mm_loadu_si128(chunk.get_unchecked(32..).as_ptr() as *const __m128i); + let mut r3 = _mm_loadu_si128(chunk.get_unchecked(48..).as_ptr() as *const __m128i); + + r0 = _mm_xor_si128(_mm_shuffle_epi8(r0, sh0), def_alpha); + r1 = _mm_xor_si128(_mm_shuffle_epi8(r1, sh0), def_alpha); + r2 = _mm_xor_si128(_mm_shuffle_epi8(r2, sh0), def_alpha); + r3 = _mm_xor_si128(_mm_shuffle_epi8(r3, sh0), def_alpha); + + sums = _mm_add_epi32(sums, r0); + sums = _mm_add_epi32(sums, r1); + sums = _mm_add_epi32(sums, r2); + sums = _mm_add_epi32(sums, r3); + } + + let row = row.chunks_exact(16 * 4).remainder(); + + for chunk in row.chunks_exact(16) { + let mut r0 = _mm_loadu_si128(chunk.as_ptr() as *const __m128i); + + r0 = _mm_shuffle_epi8(r0, sh0); + + let alphas = _mm_xor_si128(r0, def_alpha); + + sums = _mm_add_epi32(sums, alphas); + } + + let row = row.chunks_exact(16).remainder(); + + let mut h_sum = _mm_hsum_epi32(sums); + + for chunk in row.chunks_exact(4) { + h_sum += chunk[3] as i32 ^ first_alpha as i32; + } + + if h_sum != 0 { + return true; + } + } + false +} + +/// Checks if image has constant alpha by xor rows for image 16bits +pub(crate) fn sse_has_non_constant_cap_alpha_rgba16( + store: &[u16], + width: usize, + stride: usize, +) -> bool { + unsafe { sse_has_non_constant_cap_alpha_rgba16_impl(store, width, stride) } +} + +#[target_feature(enable = "sse4.1")] +unsafe fn sse_has_non_constant_cap_alpha_rgba16_impl( + store: &[u16], + width: usize, + stride: usize, +) -> bool { + if store.is_empty() { + return true; + } + + let sh0 = _mm_setr_epi8(6, 7, -1, -1, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + + let first_alpha = store[3]; + let def_alpha = _mm_set1_epi32(first_alpha as i32); + + for row in store.chunks_exact(stride) { + let row = &row[0..width * 4]; + let mut sums = _mm_set1_epi32(0); + for chunk in row.chunks_exact(8 * 4) { + let mut r0 = _mm_loadu_si128(chunk.as_ptr() as *const 
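A scalar reference for the constant-alpha check introduced in this file (illustrative; the SSE version accumulates the XORed alphas with vector adds and a horizontal sum, but the decision it makes is the same):

// Returns true when at least one alpha value differs from the first one,
// i.e. alpha is not constant across the image.
fn has_non_constant_alpha_rgba8(store: &[u8], width: usize, stride: usize) -> bool {
    if store.is_empty() {
        return true;
    }
    let first_alpha = store[3];
    for row in store.chunks_exact(stride) {
        let row = &row[..width * 4];
        if row.chunks_exact(4).any(|px| px[3] != first_alpha) {
            return true;
        }
    }
    false
}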
__m128i); + let mut r1 = _mm_loadu_si128(chunk.get_unchecked(8..).as_ptr() as *const __m128i); + let mut r2 = _mm_loadu_si128(chunk.get_unchecked(16..).as_ptr() as *const __m128i); + let mut r3 = _mm_loadu_si128(chunk.get_unchecked(24..).as_ptr() as *const __m128i); + + r0 = _mm_shuffle_epi8(r0, sh0); + r1 = _mm_shuffle_epi8(r1, sh0); + r2 = _mm_shuffle_epi8(r2, sh0); + r3 = _mm_shuffle_epi8(r3, sh0); + + let r01 = _mm_xor_si128(_mm_unpacklo_epi32(r0, r1), def_alpha); + let r23 = _mm_xor_si128(_mm_unpacklo_epi32(r2, r3), def_alpha); + + sums = _mm_add_epi32(sums, r01); + sums = _mm_add_epi32(sums, r23); + } + + let row = row.chunks_exact(8 * 4).remainder(); + + for chunk in row.chunks_exact(8) { + let mut r0 = _mm_loadu_si128(chunk.as_ptr() as *const __m128i); + + r0 = _mm_shuffle_epi8(r0, sh0); + + let alphas = _mm_xor_si128(_mm_unpacklo_epi32(r0, r0), def_alpha); + + sums = _mm_add_epi32(sums, alphas); + } + + let row = row.chunks_exact(8).remainder(); + + let mut h_sum = _mm_hsum_epi32(sums); + + for chunk in row.chunks_exact(4) { + h_sum += chunk[3] as i32 ^ first_alpha as i32; + } + + if h_sum != 0 { + return true; + } + } + false +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn check_alpha_exists_rgba8() { + let image_size = 256usize; + let mut image = vec![0u8; image_size * image_size * 4]; + image[3 + 150 * 4] = 75; + let has_alpha = sse_has_non_constant_cap_alpha_rgba8(&image, image_size, image_size * 4); + assert_eq!(true, has_alpha); + } + + #[test] + fn check_alpha_exists_rgba16() { + let image_size = 256usize; + let mut image = vec![0u16; image_size * image_size * 4]; + image[3] = 715; + image[7] = 715; + image[11] = 715; + image[15] = 715; + let has_alpha = sse_has_non_constant_cap_alpha_rgba16(&image, image_size, image_size * 4); + assert_eq!(true, has_alpha); + } + + #[test] + fn check_alpha_not_exists_rgba8() { + let image_size = 256usize; + let image = vec![255u8; image_size * image_size * 4]; + let has_alpha = sse_has_non_constant_cap_alpha_rgba8(&image, image_size, image_size * 4); + assert_eq!(false, has_alpha); + } + + #[test] + fn check_alpha_not_exists_rgba16() { + let image_size = 256usize; + let image = vec![255u16; image_size * image_size * 4]; + let has_alpha = sse_has_non_constant_cap_alpha_rgba16(&image, image_size, image_size * 4); + assert_eq!(false, has_alpha); + } +} diff --git a/src/sse/mod.rs b/src/sse/mod.rs index 6726e82..45417dd 100644 --- a/src/sse/mod.rs +++ b/src/sse/mod.rs @@ -32,6 +32,7 @@ mod alpha_f16; mod alpha_f32; mod alpha_u16; mod alpha_u8; +mod check_alpha; #[cfg(feature = "half")] mod f16_utils; mod plane_f32; @@ -67,6 +68,9 @@ pub(crate) use alpha_u8::{ _mm_div_by_255_epi16, sse_premultiply_alpha_rgba, sse_unpremultiply_alpha_rgba, sse_unpremultiply_row, }; +pub(crate) use check_alpha::{ + sse_has_non_constant_cap_alpha_rgba16, sse_has_non_constant_cap_alpha_rgba8, +}; pub(crate) use plane_f32::convolve_horizontal_plane_sse_row_one; pub(crate) use plane_f32::convolve_horizontal_plane_sse_rows_4; pub(crate) use plane_u8::{ diff --git a/src/sse/plane_f32.rs b/src/sse/plane_f32.rs index 99ed16a..7c83a9a 100644 --- a/src/sse/plane_f32.rs +++ b/src/sse/plane_f32.rs @@ -316,7 +316,7 @@ unsafe fn convolve_horizontal_plane_sse_rows_4_fma( ); } -#[inline] +#[inline(always)] unsafe fn convolve_horizontal_plane_sse_rows_4_impl( dst_width: usize, _: usize, @@ -326,139 +326,135 @@ unsafe fn convolve_horizontal_plane_sse_rows_4_impl( dst: &mut [f32], dst_stride: usize, ) { - unsafe { - let mut filter_offset = 0usize; - let zeros = 
_mm_setzero_ps(); - let weights_ptr = filter_weights.weights.as_ptr(); - - for x in 0..dst_width { - let bounds = filter_weights.bounds.get_unchecked(x); - let mut jx = 0usize; - let mut store_0 = zeros; - let mut store_1 = zeros; - let mut store_2 = zeros; - let mut store_3 = zeros; - - while jx + 16 < bounds.size { - let ptr = weights_ptr.add(jx + filter_offset); - let read_weights0 = _mm_loadu_ps(ptr); - let read_weights1 = _mm_loadu_ps(ptr.add(4)); - let read_weights2 = _mm_loadu_ps(ptr.add(8)); - let read_weights3 = _mm_loadu_ps(ptr.add(12)); - let weights = (read_weights0, read_weights1, read_weights2, read_weights3); - let bounds_start = bounds.start + jx; - store_0 = conv_horiz_plane_16_f32!(bounds_start, src, weights, store_0, FMA); - let s_ptr_1 = src.get_unchecked(src_stride..); - store_1 = conv_horiz_plane_16_f32!(bounds_start, s_ptr_1, weights, store_1, FMA); - let s_ptr2 = src.get_unchecked(src_stride * 2..); - store_2 = conv_horiz_plane_16_f32!(bounds_start, s_ptr2, weights, store_2, FMA); - let s_ptr3 = src.get_unchecked(src_stride * 3..); - store_3 = conv_horiz_plane_16_f32!(bounds_start, s_ptr3, weights, store_3, FMA); - jx += 16; - } - - while jx + 8 < bounds.size { - let ptr = weights_ptr.add(jx + filter_offset); - let read_weights0 = _mm_loadu_ps(ptr); - let read_weights1 = _mm_loadu_ps(ptr.add(4)); - let read_weights = (read_weights0, read_weights1); - let bounds_start = bounds.start + jx; - store_0 = conv_horiz_plane_8_f32!( - bounds_start, - src, - read_weights.0, - read_weights.1, - store_0, - FMA - ); - let s_ptr_1 = src.get_unchecked(src_stride..); - store_1 = conv_horiz_plane_8_f32!( - bounds_start, - s_ptr_1, - read_weights.0, - read_weights.1, - store_1, - FMA - ); - let s_ptr2 = src.get_unchecked(src_stride * 2..); - store_2 = conv_horiz_plane_8_f32!( - bounds_start, - s_ptr2, - read_weights.0, - read_weights.1, - store_2, - FMA - ); - let s_ptr3 = src.get_unchecked(src_stride * 3..); - store_3 = conv_horiz_plane_8_f32!( - bounds_start, - s_ptr3, - read_weights.0, - read_weights.1, - store_3, - FMA - ); - jx += 8; - } - - while jx + 4 < bounds.size { - let ptr = weights_ptr.add(jx + filter_offset); - let read_weights = _mm_loadu_ps(ptr); - let bounds_start = bounds.start + jx; - store_0 = conv_horiz_plane_4_f32!(bounds_start, src, read_weights, store_0, FMA); - let s_ptr_1 = src.get_unchecked(src_stride..); - store_1 = - conv_horiz_plane_4_f32!(bounds_start, s_ptr_1, read_weights, store_1, FMA); - let s_ptr2 = src.get_unchecked(src_stride * 2..); - store_2 = conv_horiz_plane_4_f32!(bounds_start, s_ptr2, read_weights, store_2, FMA); - let s_ptr3 = src.get_unchecked(src_stride * 3..); - store_3 = conv_horiz_plane_4_f32!(bounds_start, s_ptr3, read_weights, store_3, FMA); - jx += 4; - } - - while jx + 2 < bounds.size { - let ptr = weights_ptr.add(jx + filter_offset); - let weights = - _mm_setr_ps(ptr.read_unaligned(), ptr.add(1).read_unaligned(), 0., 0.); - let bounds_start = bounds.start + jx; - store_0 = conv_horiz_plane_2_f32!(bounds_start, src, weights, store_0, FMA); - let ptr_1 = src.get_unchecked(src_stride..); - store_1 = conv_horiz_plane_2_f32!(bounds_start, ptr_1, weights, store_1, FMA); - let ptr_2 = src.get_unchecked(src_stride * 2..); - store_2 = conv_horiz_plane_2_f32!(bounds_start, ptr_2, weights, store_2, FMA); - let ptr_3 = src.get_unchecked(src_stride * 3..); - store_3 = conv_horiz_plane_2_f32!(bounds_start, ptr_3, weights, store_3, FMA); - jx += 2; - } - - while jx < bounds.size { - let ptr = weights_ptr.add(jx + filter_offset); - let weight0 = 
_mm_set1_ps(ptr.read_unaligned()); - let bounds_start = bounds.start + jx; - store_0 = conv_horiz_plane_1_f32!(bounds_start, src, weight0, store_0, FMA); - let ptr_1 = src.get_unchecked(src_stride..); - store_1 = conv_horiz_plane_1_f32!(bounds_start, ptr_1, weight0, store_1, FMA); - let ptr_2 = src.get_unchecked(src_stride * 2..); - store_2 = conv_horiz_plane_1_f32!(bounds_start, ptr_2, weight0, store_2, FMA); - let ptr_3 = src.get_unchecked(src_stride * 3..); - store_3 = conv_horiz_plane_1_f32!(bounds_start, ptr_3, weight0, store_3, FMA); - jx += 1; - } - - let px = x; - let dest_ptr = dst.get_unchecked_mut(px); - *dest_ptr = _mm_hsum_ps(store_0); - - let dest_ptr = dst.get_unchecked_mut(px + dst_stride); - *dest_ptr = _mm_hsum_ps(store_1); - - let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 2); - *dest_ptr = _mm_hsum_ps(store_2); - - let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 3); - *dest_ptr = _mm_hsum_ps(store_3); - - filter_offset += filter_weights.aligned_size; + let mut filter_offset = 0usize; + let zeros = _mm_setzero_ps(); + let weights_ptr = filter_weights.weights.as_ptr(); + + for x in 0..dst_width { + let bounds = filter_weights.bounds.get_unchecked(x); + let mut jx = 0usize; + let mut store_0 = zeros; + let mut store_1 = zeros; + let mut store_2 = zeros; + let mut store_3 = zeros; + + while jx + 16 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let read_weights0 = _mm_loadu_ps(ptr); + let read_weights1 = _mm_loadu_ps(ptr.add(4)); + let read_weights2 = _mm_loadu_ps(ptr.add(8)); + let read_weights3 = _mm_loadu_ps(ptr.add(12)); + let weights = (read_weights0, read_weights1, read_weights2, read_weights3); + let bounds_start = bounds.start + jx; + store_0 = conv_horiz_plane_16_f32!(bounds_start, src, weights, store_0, FMA); + let s_ptr_1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_plane_16_f32!(bounds_start, s_ptr_1, weights, store_1, FMA); + let s_ptr2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_plane_16_f32!(bounds_start, s_ptr2, weights, store_2, FMA); + let s_ptr3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_plane_16_f32!(bounds_start, s_ptr3, weights, store_3, FMA); + jx += 16; } + + while jx + 8 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let read_weights0 = _mm_loadu_ps(ptr); + let read_weights1 = _mm_loadu_ps(ptr.add(4)); + let read_weights = (read_weights0, read_weights1); + let bounds_start = bounds.start + jx; + store_0 = conv_horiz_plane_8_f32!( + bounds_start, + src, + read_weights.0, + read_weights.1, + store_0, + FMA + ); + let s_ptr_1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_plane_8_f32!( + bounds_start, + s_ptr_1, + read_weights.0, + read_weights.1, + store_1, + FMA + ); + let s_ptr2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_plane_8_f32!( + bounds_start, + s_ptr2, + read_weights.0, + read_weights.1, + store_2, + FMA + ); + let s_ptr3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_plane_8_f32!( + bounds_start, + s_ptr3, + read_weights.0, + read_weights.1, + store_3, + FMA + ); + jx += 8; + } + + while jx + 4 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let read_weights = _mm_loadu_ps(ptr); + let bounds_start = bounds.start + jx; + store_0 = conv_horiz_plane_4_f32!(bounds_start, src, read_weights, store_0, FMA); + let s_ptr_1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_plane_4_f32!(bounds_start, s_ptr_1, read_weights, store_1, FMA); + let s_ptr2 = 
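What the unrolled 16/8/4/2/1-weight loops above compute, written as a plain scalar reference (illustrative; the SIMD code accumulates four rows at once and reduces each accumulator with `_mm_hsum_ps`):

// One output sample of the horizontal plane convolution: the dot product of
// the filter weights with the source window starting at `bounds_start`.
fn convolve_plane_sample(src: &[f32], weights: &[f32], bounds_start: usize) -> f32 {
    weights
        .iter()
        .enumerate()
        .map(|(j, &w)| src[bounds_start + j] * w)
        .sum()
}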
src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_plane_4_f32!(bounds_start, s_ptr2, read_weights, store_2, FMA); + let s_ptr3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_plane_4_f32!(bounds_start, s_ptr3, read_weights, store_3, FMA); + jx += 4; + } + + while jx + 2 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weights = _mm_setr_ps(ptr.read_unaligned(), ptr.add(1).read_unaligned(), 0., 0.); + let bounds_start = bounds.start + jx; + store_0 = conv_horiz_plane_2_f32!(bounds_start, src, weights, store_0, FMA); + let ptr_1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_plane_2_f32!(bounds_start, ptr_1, weights, store_1, FMA); + let ptr_2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_plane_2_f32!(bounds_start, ptr_2, weights, store_2, FMA); + let ptr_3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_plane_2_f32!(bounds_start, ptr_3, weights, store_3, FMA); + jx += 2; + } + + while jx < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = _mm_set1_ps(ptr.read_unaligned()); + let bounds_start = bounds.start + jx; + store_0 = conv_horiz_plane_1_f32!(bounds_start, src, weight0, store_0, FMA); + let ptr_1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_plane_1_f32!(bounds_start, ptr_1, weight0, store_1, FMA); + let ptr_2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_plane_1_f32!(bounds_start, ptr_2, weight0, store_2, FMA); + let ptr_3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_plane_1_f32!(bounds_start, ptr_3, weight0, store_3, FMA); + jx += 1; + } + + let px = x; + let dest_ptr = dst.get_unchecked_mut(px); + *dest_ptr = _mm_hsum_ps(store_0); + + let dest_ptr = dst.get_unchecked_mut(px + dst_stride); + *dest_ptr = _mm_hsum_ps(store_1); + + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 2); + *dest_ptr = _mm_hsum_ps(store_2); + + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 3); + *dest_ptr = _mm_hsum_ps(store_3); + + filter_offset += filter_weights.aligned_size; } } diff --git a/src/sse/rgb_f16.rs b/src/sse/rgb_f16.rs index ede3433..637b774 100644 --- a/src/sse/rgb_f16.rs +++ b/src/sse/rgb_f16.rs @@ -82,8 +82,7 @@ unsafe fn convolve_horizontal_parts_4_rgb_f16 let acc = _mm_prefer_fma_ps::(store_0, rgb_pixel_0, weight0); let acc = _mm_prefer_fma_ps::(acc, rgb_pixel_1, weight1); let acc = _mm_prefer_fma_ps::(acc, rgb_pixel_2, weight2); - let acc = _mm_prefer_fma_ps::(acc, rgb_pixel_3, weight3); - acc + _mm_prefer_fma_ps::(acc, rgb_pixel_3, weight3) } #[inline(always)] @@ -142,8 +141,7 @@ unsafe fn convolve_horizontal_parts_one_rgb_f16(store_0, rgb_pixel, weight0); - acc + _mm_prefer_fma_ps::(store_0, rgb_pixel, weight0) } pub(crate) fn convolve_horizontal_rgb_sse_row_one_f16( diff --git a/src/sse/rgb_u8.rs b/src/sse/rgb_u8.rs index ee6e357..cc1b041 100644 --- a/src/sse/rgb_u8.rs +++ b/src/sse/rgb_u8.rs @@ -69,7 +69,6 @@ unsafe fn load_rgb_x4(src: &[u8]) -> __m128i { rgb_pixel } -#[inline] #[target_feature(enable = "sse4.1")] unsafe fn convolve_horizontal_rgb_sse_rows_4_impl( src: &[u8], @@ -78,10 +77,9 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_impl( dst_stride: usize, filter_weights: &FilterWeights, ) { - unsafe { - const CHANNELS: usize = 3; + const CHANNELS: usize = 3; - #[rustfmt::skip] + #[rustfmt::skip] let shuffle_lo = _mm_setr_epi8(0, -1, 3, -1, 1, -1, @@ -91,7 +89,7 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_impl( -1, -1, -1, -1); - #[rustfmt::skip] + #[rustfmt::skip] let shuffle_hi = _mm_setr_epi8(6, 
-1, 9, -1, 7, -1, @@ -101,147 +99,142 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_impl( -1, -1, -1, -1); - let vld = _mm_set1_epi32(ROUNDING_CONST); - - let (row0_ref, rest) = dst.split_at_mut(dst_stride); - let (row1_ref, rest) = rest.split_at_mut(dst_stride); - let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); - - let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS); - let iter_row1 = row1_ref.chunks_exact_mut(CHANNELS); - let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS); - let iter_row3 = row3_ref.chunks_exact_mut(CHANNELS); - - for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 - .zip(iter_row1) - .zip(iter_row2) - .zip(iter_row3) - .zip(filter_weights.bounds.iter()) - .zip( - filter_weights - .weights - .chunks_exact(filter_weights.aligned_size), - ) - { - let mut jx = 0usize; - let mut store_0 = vld; - let mut store_1 = vld; - let mut store_2 = vld; - let mut store_3 = vld; - - let src0 = src; - let src1 = src0.get_unchecked(src_stride..); - let src2 = src1.get_unchecked(src_stride..); - let src3 = src2.get_unchecked(src_stride..); - - // Will make step in 4 items however since it is RGB it is necessary to make a safe offset - while jx + 4 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 4)); - let weights = _mm_loadu_si64(w_ptr.as_ptr() as *const u8); - const SHUFFLE_01: i32 = shuffle(0, 0, 0, 0); - let weight01 = _mm_shuffle_epi32::(weights); - const SHUFFLE_23: i32 = shuffle(1, 1, 1, 1); - let weight23 = _mm_shuffle_epi32::(weights); - let bounds_start = (bounds.start + jx) * CHANNELS; - - let rgb_pixel_0 = load_rgb_x4(src0.get_unchecked(bounds_start..)); - let rgb_pixel_1 = load_rgb_x4(src1.get_unchecked(bounds_start..)); - let rgb_pixel_2 = load_rgb_x4(src2.get_unchecked(bounds_start..)); - let rgb_pixel_4 = load_rgb_x4(src3.get_unchecked(bounds_start..)); - - let hi_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_hi); - let lo_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_lo); - let hi_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_hi); - let lo_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_lo); - let hi_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_hi); - let lo_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_lo); - let hi_3 = _mm_shuffle_epi8(rgb_pixel_4, shuffle_hi); - let lo_3 = _mm_shuffle_epi8(rgb_pixel_4, shuffle_lo); - - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(lo_0, weight01)); - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(hi_0, weight23)); - - store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo_1, weight01)); - store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(hi_1, weight23)); - - store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo_2, weight01)); - store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(hi_2, weight23)); - - store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo_3, weight01)); - store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(hi_3, weight23)); - jx += 4; - } - - while jx + 2 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 2)); - let bounds_start = (bounds.start + jx) * CHANNELS; - let weight01 = _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()); - - let rgb_pixel_0 = load_rgb_x2(src0.get_unchecked(bounds_start..)); - let rgb_pixel_1 = load_rgb_x2(src1.get_unchecked(bounds_start..)); - let rgb_pixel_2 = load_rgb_x2(src2.get_unchecked(bounds_start..)); - let rgb_pixel_4 = load_rgb_x2(src3.get_unchecked(bounds_start..)); - - let lo_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_lo); - let lo_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_lo); - let lo_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_lo); - let lo_3 = 
_mm_shuffle_epi8(rgb_pixel_4, shuffle_lo); - - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(lo_0, weight01)); - store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo_1, weight01)); - store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo_2, weight01)); - store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo_3, weight01)); - - jx += 2; - } - - while jx < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 1)); - let bounds_start = bounds.start + jx; - - let weight0 = _mm_set1_epi32(w_ptr.as_ptr().read_unaligned() as i32); - - store_0 = - convolve_horizontal_parts_one_sse_rgb(bounds_start, src0, weight0, store_0); - store_1 = - convolve_horizontal_parts_one_sse_rgb(bounds_start, src1, weight0, store_1); - store_2 = - convolve_horizontal_parts_one_sse_rgb(bounds_start, src2, weight0, store_2); - store_3 = - convolve_horizontal_parts_one_sse_rgb(bounds_start, src3, weight0, store_3); - jx += 1; - } - - let store_0_8 = compress_i32(store_0); - let store_1_8 = compress_i32(store_1); - let store_2_8 = compress_i32(store_2); - let store_3_8 = compress_i32(store_3); - - let element_0 = _mm_extract_epi32::<0>(store_0_8); - let element_1 = _mm_extract_epi32::<0>(store_1_8); - let element_2 = _mm_extract_epi32::<0>(store_2_8); - let element_3 = _mm_extract_epi32::<0>(store_3_8); - - let bytes = element_0.to_le_bytes(); - let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); - (chunk0.as_mut_ptr() as *mut u16).write_unaligned(first_byte); - *chunk0.get_unchecked_mut(2) = bytes[2]; - - let bytes = element_1.to_le_bytes(); - let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); - (chunk1.as_mut_ptr() as *mut u16).write_unaligned(first_byte); - *chunk1.get_unchecked_mut(2) = bytes[2]; - - let bytes = element_2.to_le_bytes(); - let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); - (chunk2.as_mut_ptr() as *mut u16).write_unaligned(first_byte); - *chunk2.get_unchecked_mut(2) = bytes[2]; - - let bytes = element_3.to_le_bytes(); - let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); - (chunk3.as_mut_ptr() as *mut u16).write_unaligned(first_byte); - *chunk3.get_unchecked_mut(2) = bytes[2]; + let vld = _mm_set1_epi32(ROUNDING_CONST); + + let (row0_ref, rest) = dst.split_at_mut(dst_stride); + let (row1_ref, rest) = rest.split_at_mut(dst_stride); + let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); + + let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS); + let iter_row1 = row1_ref.chunks_exact_mut(CHANNELS); + let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS); + let iter_row3 = row3_ref.chunks_exact_mut(CHANNELS); + + for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 + .zip(iter_row1) + .zip(iter_row2) + .zip(iter_row3) + .zip(filter_weights.bounds.iter()) + .zip( + filter_weights + .weights + .chunks_exact(filter_weights.aligned_size), + ) + { + let mut jx = 0usize; + let mut store_0 = vld; + let mut store_1 = vld; + let mut store_2 = vld; + let mut store_3 = vld; + + let src0 = src; + let src1 = src0.get_unchecked(src_stride..); + let src2 = src1.get_unchecked(src_stride..); + let src3 = src2.get_unchecked(src_stride..); + + // Will make step in 4 items however since it is RGB it is necessary to make a safe offset + while jx + 4 < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 4)); + let weights = _mm_loadu_si64(w_ptr.as_ptr() as *const u8); + const SHUFFLE_01: i32 = shuffle(0, 0, 0, 0); + let weight01 = _mm_shuffle_epi32::(weights); + const SHUFFLE_23: i32 = shuffle(1, 1, 1, 1); + let weight23 = _mm_shuffle_epi32::(weights); + let 
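The `shuffle_lo`/`shuffle_hi` masks above widen pairs of neighbouring RGB pixels to zero-extended 16-bit lanes arranged so that `_mm_madd_epi16` with the duplicated weight pair produces one 32-bit per-channel sum; in scalar terms (illustrative only):

// One lane of the madd step: two taps of the same colour channel,
// multiplied by their i16 weights and summed into an i32 accumulator.
fn madd_channel_pair(c0: u8, c1: u8, w0: i16, w1: i16) -> i32 {
    c0 as i32 * w0 as i32 + c1 as i32 * w1 as i32
}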
bounds_start = (bounds.start + jx) * CHANNELS; + + let rgb_pixel_0 = load_rgb_x4(src0.get_unchecked(bounds_start..)); + let rgb_pixel_1 = load_rgb_x4(src1.get_unchecked(bounds_start..)); + let rgb_pixel_2 = load_rgb_x4(src2.get_unchecked(bounds_start..)); + let rgb_pixel_4 = load_rgb_x4(src3.get_unchecked(bounds_start..)); + + let hi_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_hi); + let lo_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_lo); + let hi_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_hi); + let lo_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_lo); + let hi_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_hi); + let lo_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_lo); + let hi_3 = _mm_shuffle_epi8(rgb_pixel_4, shuffle_hi); + let lo_3 = _mm_shuffle_epi8(rgb_pixel_4, shuffle_lo); + + store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(lo_0, weight01)); + store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(hi_0, weight23)); + + store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo_1, weight01)); + store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(hi_1, weight23)); + + store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo_2, weight01)); + store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(hi_2, weight23)); + + store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo_3, weight01)); + store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(hi_3, weight23)); + jx += 4; + } + + while jx + 2 < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 2)); + let bounds_start = (bounds.start + jx) * CHANNELS; + let weight01 = _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()); + + let rgb_pixel_0 = load_rgb_x2(src0.get_unchecked(bounds_start..)); + let rgb_pixel_1 = load_rgb_x2(src1.get_unchecked(bounds_start..)); + let rgb_pixel_2 = load_rgb_x2(src2.get_unchecked(bounds_start..)); + let rgb_pixel_4 = load_rgb_x2(src3.get_unchecked(bounds_start..)); + + let lo_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_lo); + let lo_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_lo); + let lo_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_lo); + let lo_3 = _mm_shuffle_epi8(rgb_pixel_4, shuffle_lo); + + store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(lo_0, weight01)); + store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo_1, weight01)); + store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo_2, weight01)); + store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo_3, weight01)); + + jx += 2; + } + + while jx < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 1)); + let bounds_start = bounds.start + jx; + + let weight0 = _mm_set1_epi32(w_ptr.as_ptr().read_unaligned() as i32); + + store_0 = convolve_horizontal_parts_one_sse_rgb(bounds_start, src0, weight0, store_0); + store_1 = convolve_horizontal_parts_one_sse_rgb(bounds_start, src1, weight0, store_1); + store_2 = convolve_horizontal_parts_one_sse_rgb(bounds_start, src2, weight0, store_2); + store_3 = convolve_horizontal_parts_one_sse_rgb(bounds_start, src3, weight0, store_3); + jx += 1; } + + let store_0_8 = compress_i32(store_0); + let store_1_8 = compress_i32(store_1); + let store_2_8 = compress_i32(store_2); + let store_3_8 = compress_i32(store_3); + + let element_0 = _mm_extract_epi32::<0>(store_0_8); + let element_1 = _mm_extract_epi32::<0>(store_1_8); + let element_2 = _mm_extract_epi32::<0>(store_2_8); + let element_3 = _mm_extract_epi32::<0>(store_3_8); + + let bytes = element_0.to_le_bytes(); + let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); + (chunk0.as_mut_ptr() as *mut u16).write_unaligned(first_byte); + *chunk0.get_unchecked_mut(2) = bytes[2]; + + let bytes = element_1.to_le_bytes(); + 
let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); + (chunk1.as_mut_ptr() as *mut u16).write_unaligned(first_byte); + *chunk1.get_unchecked_mut(2) = bytes[2]; + + let bytes = element_2.to_le_bytes(); + let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); + (chunk2.as_mut_ptr() as *mut u16).write_unaligned(first_byte); + *chunk2.get_unchecked_mut(2) = bytes[2]; + + let bytes = element_3.to_le_bytes(); + let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); + (chunk3.as_mut_ptr() as *mut u16).write_unaligned(first_byte); + *chunk3.get_unchecked_mut(2) = bytes[2]; } } @@ -262,71 +255,68 @@ unsafe fn convolve_horizontal_rgb_sse_row_one_impl( dst: &mut [u8], filter_weights: &FilterWeights, ) { - unsafe { - const CHANNELS: usize = 3; - - let shuffle_lo = _mm_setr_epi8(0, -1, 3, -1, 1, -1, 4, -1, 2, -1, 5, -1, -1, -1, -1, -1); - - let shuffle_hi = _mm_setr_epi8(6, -1, 9, -1, 7, -1, 10, -1, 8, -1, 11, -1, -1, -1, -1, -1); - - for ((dst, bounds), weights) in dst - .chunks_exact_mut(CHANNELS) - .zip(filter_weights.bounds.iter()) - .zip( - filter_weights - .weights - .chunks_exact(filter_weights.aligned_size), - ) - { - let bounds_size = bounds.size; - let mut jx = 0usize; - let mut store = _mm_setzero_si128(); - - while jx + 4 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 4)); - let weights = _mm_loadu_si64(w_ptr.as_ptr() as *const u8); - const SHUFFLE_01: i32 = shuffle(0, 0, 0, 0); - let weight01 = _mm_shuffle_epi32::(weights); - const SHUFFLE_23: i32 = shuffle(1, 1, 1, 1); - let weight23 = _mm_shuffle_epi32::(weights); - let bounds_start = bounds.start + jx; - let src_ptr_0 = src.get_unchecked((bounds_start * CHANNELS)..); - - let rgb_pixel = load_rgb_x4(src_ptr_0); - - let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi); - let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); - - store = _mm_add_epi32(store, _mm_madd_epi16(lo, weight01)); - store = _mm_add_epi32(store, _mm_madd_epi16(hi, weight23)); - jx += 4; - } - - while jx + 2 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 2)); - let weight0 = _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()); - let src_ptr = src.get_unchecked(((bounds.start + jx) * 3)..); - let rgb_pixel = load_rgb_x2(src_ptr); - let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); - store = _mm_add_epi32(store, _mm_madd_epi16(lo, weight0)); - jx += 2; - } - - while jx < bounds_size { - let w_ptr = weights.get_unchecked(jx..(jx + 1)); - let weight0 = _mm_set1_epi32(w_ptr.as_ptr().read_unaligned() as i32); - store = - convolve_horizontal_parts_one_sse_rgb(bounds.start + jx, src, weight0, store); - jx += 1; - } - - let store_16_8 = compress_i32(store); - - let element = _mm_extract_epi32::<0>(store_16_8); - let bytes = element.to_le_bytes(); - let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); - (dst.as_mut_ptr() as *mut u16).write_unaligned(first_byte); - *dst.get_unchecked_mut(2) = bytes[2]; + const CHANNELS: usize = 3; + + let shuffle_lo = _mm_setr_epi8(0, -1, 3, -1, 1, -1, 4, -1, 2, -1, 5, -1, -1, -1, -1, -1); + + let shuffle_hi = _mm_setr_epi8(6, -1, 9, -1, 7, -1, 10, -1, 8, -1, 11, -1, -1, -1, -1, -1); + + for ((dst, bounds), weights) in dst + .chunks_exact_mut(CHANNELS) + .zip(filter_weights.bounds.iter()) + .zip( + filter_weights + .weights + .chunks_exact(filter_weights.aligned_size), + ) + { + let bounds_size = bounds.size; + let mut jx = 0usize; + let mut store = _mm_setzero_si128(); + + while jx + 4 < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 4)); + let weights = 
_mm_loadu_si64(w_ptr.as_ptr() as *const u8); + const SHUFFLE_01: i32 = shuffle(0, 0, 0, 0); + let weight01 = _mm_shuffle_epi32::(weights); + const SHUFFLE_23: i32 = shuffle(1, 1, 1, 1); + let weight23 = _mm_shuffle_epi32::(weights); + let bounds_start = bounds.start + jx; + let src_ptr_0 = src.get_unchecked((bounds_start * CHANNELS)..); + + let rgb_pixel = load_rgb_x4(src_ptr_0); + + let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi); + let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); + + store = _mm_add_epi32(store, _mm_madd_epi16(lo, weight01)); + store = _mm_add_epi32(store, _mm_madd_epi16(hi, weight23)); + jx += 4; + } + + while jx + 2 < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 2)); + let weight0 = _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()); + let src_ptr = src.get_unchecked(((bounds.start + jx) * 3)..); + let rgb_pixel = load_rgb_x2(src_ptr); + let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); + store = _mm_add_epi32(store, _mm_madd_epi16(lo, weight0)); + jx += 2; + } + + while jx < bounds_size { + let w_ptr = weights.get_unchecked(jx..(jx + 1)); + let weight0 = _mm_set1_epi32(w_ptr.as_ptr().read_unaligned() as i32); + store = convolve_horizontal_parts_one_sse_rgb(bounds.start + jx, src, weight0, store); + jx += 1; } + + let store_16_8 = compress_i32(store); + + let element = _mm_extract_epi32::<0>(store_16_8); + let bytes = element.to_le_bytes(); + let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); + (dst.as_mut_ptr() as *mut u16).write_unaligned(first_byte); + *dst.get_unchecked_mut(2) = bytes[2]; } } diff --git a/src/sse/rgba_f16.rs b/src/sse/rgba_f16.rs index d032a21..5e367d2 100644 --- a/src/sse/rgba_f16.rs +++ b/src/sse/rgba_f16.rs @@ -49,8 +49,7 @@ unsafe fn convolve_horizontal_parts_one_rgba_f16(rgb_pixel); - let acc = _mm_prefer_fma_ps::(store_0, pixels, weight0); - acc + _mm_prefer_fma_ps::(store_0, pixels, weight0) } #[inline(always)] @@ -77,8 +76,7 @@ unsafe fn convolve_horizontal_parts_4_rgba_f16(store_0, rgb_pixel_0, weight0); let acc = _mm_prefer_fma_ps::(acc, rgb_pixel_1, weight1); let acc = _mm_prefer_fma_ps::(acc, rgb_pixel_2, weight2); - let acc = _mm_prefer_fma_ps::(acc, rgb_pixel_3, weight3); - acc + _mm_prefer_fma_ps::(acc, rgb_pixel_3, weight3) } #[inline(always)] @@ -95,12 +93,11 @@ unsafe fn convolve_horizontal_parts_2_rgba_f16(store_0, _mm_cvtph_psx::(rgb_pixels), weight0); - let acc = _mm_prefer_fma_ps::( + _mm_prefer_fma_ps::( acc, _mm_cvtph_psx::(_mm_srli_si128::<8>(rgb_pixels)), weight1, - ); - acc + ) } pub(crate) fn convolve_horizontal_rgba_sse_row_one_f16( diff --git a/src/sse/rgba_f32.rs b/src/sse/rgba_f32.rs index 64378a3..b5dc978 100644 --- a/src/sse/rgba_f32.rs +++ b/src/sse/rgba_f32.rs @@ -119,71 +119,65 @@ unsafe fn convolve_horizontal_rgba_sse_row_one_f32_impl( src: &[f32], dst: &mut [f32], ) { - unsafe { - const CHANNELS: usize = 4; - let mut filter_offset = 0usize; - let weights_ptr = filter_weights.weights.as_ptr(); - - for x in 0..dst_width { - let bounds = filter_weights.bounds.get_unchecked(x); - let mut jx = 0usize; - let mut store = _mm_setzero_ps(); - - while jx + 4 < bounds.size { - let ptr = weights_ptr.add(jx + filter_offset); - let (weight0, weight1, weight2, weight3) = load_4_weights!(ptr); - let filter_start = jx + bounds.start; - store = convolve_horizontal_parts_4_rgba_f32::( - filter_start, - src, - weight0, - weight1, - weight2, - weight3, - store, - ); - jx += 4; - } - - while jx + 2 < bounds.size { - let ptr = weights_ptr.add(jx + filter_offset); - let weights = 
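`_mm_prefer_fma_ps::<FMA>` in the f16/f32 paths above accumulates `acc + pixel * weight`, choosing a fused multiply-add when the const flag allows it; a scalar sketch of that contract (an assumption for illustration, not the crate's actual helper):

// Accumulate pixel * weight into acc, fused when FMA is requested.
fn prefer_fma<const FMA: bool>(acc: f32, pixel: f32, weight: f32) -> f32 {
    if FMA {
        pixel.mul_add(weight, acc) // single rounding step
    } else {
        acc + pixel * weight
    }
}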
_mm_castsi128_ps(_mm_loadu_si64(ptr as *const u8)); - const SHUFFLE_0: i32 = shuffle(0, 0, 0, 0); - let weight0 = - _mm_castsi128_ps(_mm_shuffle_epi32::(_mm_castps_si128(weights))); - const SHUFFLE_1: i32 = shuffle(1, 1, 1, 1); - let weight1 = - _mm_castsi128_ps(_mm_shuffle_epi32::(_mm_castps_si128(weights))); - let filter_start = jx + bounds.start; - store = convolve_horizontal_parts_2_rgba_f32::( - filter_start, - src, - weight0, - weight1, - store, - ); - jx += 2 - } - - while jx < bounds.size { - let ptr = weights_ptr.add(jx + filter_offset); - let weight0 = _mm_load1_ps(ptr); - let filter_start = jx + bounds.start; - store = convolve_horizontal_parts_one_rgba_f32::( - filter_start, - src, - weight0, - store, - ); - jx += 1; - } - - let px = x * CHANNELS; - let dest_ptr = dst.get_unchecked_mut(px..); - _mm_storeu_ps(dest_ptr.as_mut_ptr(), store); - - filter_offset += filter_weights.aligned_size; + const CHANNELS: usize = 4; + let mut filter_offset = 0usize; + let weights_ptr = filter_weights.weights.as_ptr(); + + for x in 0..dst_width { + let bounds = filter_weights.bounds.get_unchecked(x); + let mut jx = 0usize; + let mut store = _mm_setzero_ps(); + + while jx + 4 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let (weight0, weight1, weight2, weight3) = load_4_weights!(ptr); + let filter_start = jx + bounds.start; + store = convolve_horizontal_parts_4_rgba_f32::( + filter_start, + src, + weight0, + weight1, + weight2, + weight3, + store, + ); + jx += 4; + } + + while jx + 2 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weights = _mm_castsi128_ps(_mm_loadu_si64(ptr as *const u8)); + const SHUFFLE_0: i32 = shuffle(0, 0, 0, 0); + let weight0 = + _mm_castsi128_ps(_mm_shuffle_epi32::(_mm_castps_si128(weights))); + const SHUFFLE_1: i32 = shuffle(1, 1, 1, 1); + let weight1 = + _mm_castsi128_ps(_mm_shuffle_epi32::(_mm_castps_si128(weights))); + let filter_start = jx + bounds.start; + store = convolve_horizontal_parts_2_rgba_f32::( + filter_start, + src, + weight0, + weight1, + store, + ); + jx += 2 } + + while jx < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = _mm_load1_ps(ptr); + let filter_start = jx + bounds.start; + store = + convolve_horizontal_parts_one_rgba_f32::(filter_start, src, weight0, store); + jx += 1; + } + + let px = x * CHANNELS; + let dest_ptr = dst.get_unchecked_mut(px..); + _mm_storeu_ps(dest_ptr.as_mut_ptr(), store); + + filter_offset += filter_weights.aligned_size; } } @@ -329,6 +323,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_f32_impl( let mut store_1 = zeros; let mut store_2 = zeros; let mut store_3 = zeros; + while jx + 4 < bounds.size { let ptr = weights_ptr.add(jx + filter_offset); let (weight0, weight1, weight2, weight3) = load_4_weights!(ptr); diff --git a/src/sse/rgba_u16_lb.rs b/src/sse/rgba_u16_lb.rs index 6baed89..107fc78 100644 --- a/src/sse/rgba_u16_lb.rs +++ b/src/sse/rgba_u16_lb.rs @@ -192,7 +192,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_lb_u8_impl( filter_weights: &FilterWeights, bit_depth: u32, ) { - assert!(bit_depth >= 1 && bit_depth <= 16); + assert!((1..=16).contains(&bit_depth)); const CHANNELS: usize = 4; let init = _mm_set1_epi32(ROUNDING_CONST); @@ -269,8 +269,8 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_lb_u8_impl( while jx + 2 < bounds_size { let w_ptr = weights.get_unchecked(jx..(jx + 2)); let bounds_start = bounds.start + jx; - let w0 = _mm_set1_epi16(w_ptr[0] as i16); - let w1 = _mm_set1_epi16(w_ptr[1] as i16); + let w0 = 
_mm_set1_epi16(w_ptr[0]); + let w1 = _mm_set1_epi16(w_ptr[1]); store_0 = conv_horiz_rgba_2_u16(bounds_start, src0, w0, w1, store_0); store_1 = conv_horiz_rgba_2_u16(bounds_start, src1, w0, w1, store_1); store_2 = conv_horiz_rgba_2_u16(bounds_start, src2, w0, w1, store_2); @@ -344,14 +344,14 @@ unsafe fn convolve_horizontal_rgba_sse_u16_lb_row_impl( while jx + 8 < bounds_size { let bounds_start = bounds.start + jx; let w_ptr = weights.get_unchecked(jx..(jx + 8)); - let w0 = _mm_set1_epi32(w_ptr[0] as i32); - let w1 = _mm_set1_epi32(w_ptr[1] as i32); - let w2 = _mm_set1_epi32(w_ptr[2] as i32); - let w3 = _mm_set1_epi32(w_ptr[3] as i32); - let w4 = _mm_set1_epi32(w_ptr[4] as i32); - let w5 = _mm_set1_epi32(w_ptr[5] as i32); - let w6 = _mm_set1_epi32(w_ptr[6] as i32); - let w7 = _mm_set1_epi32(w_ptr[7] as i32); + let w0 = _mm_set1_epi16(w_ptr[0]); + let w1 = _mm_set1_epi16(w_ptr[1]); + let w2 = _mm_set1_epi16(w_ptr[2]); + let w3 = _mm_set1_epi16(w_ptr[3]); + let w4 = _mm_set1_epi16(w_ptr[4]); + let w5 = _mm_set1_epi16(w_ptr[5]); + let w6 = _mm_set1_epi16(w_ptr[6]); + let w7 = _mm_set1_epi16(w_ptr[7]); let set1 = (w0, w1, w2, w3); let set2 = (w4, w5, w6, w7); store = conv_horiz_rgba_8_u16(bounds_start, src, set1, set2, store); @@ -360,10 +360,10 @@ unsafe fn convolve_horizontal_rgba_sse_u16_lb_row_impl( while jx + 4 < bounds_size { let w_ptr = weights.get_unchecked(jx..(jx + 4)); - let w0 = _mm_set1_epi32(w_ptr[0] as i32); - let w1 = _mm_set1_epi32(w_ptr[1] as i32); - let w2 = _mm_set1_epi32(w_ptr[2] as i32); - let w3 = _mm_set1_epi32(w_ptr[3] as i32); + let w0 = _mm_set1_epi16(w_ptr[0]); + let w1 = _mm_set1_epi16(w_ptr[1]); + let w2 = _mm_set1_epi16(w_ptr[2]); + let w3 = _mm_set1_epi16(w_ptr[3]); let bounds_start = bounds.start + jx; store = conv_horiz_rgba_4_u16(bounds_start, src, w0, w1, w2, w3, store); jx += 4; @@ -372,15 +372,15 @@ unsafe fn convolve_horizontal_rgba_sse_u16_lb_row_impl( while jx + 2 < bounds_size { let w_ptr = weights.get_unchecked(jx..(jx + 2)); let bounds_start = bounds.start + jx; - let w0 = _mm_set1_epi32(w_ptr[0] as i32); - let w1 = _mm_set1_epi32(w_ptr[1] as i32); + let w0 = _mm_set1_epi16(w_ptr[0]); + let w1 = _mm_set1_epi16(w_ptr[1]); store = conv_horiz_rgba_2_u16(bounds_start, src, w0, w1, store); jx += 2; } while jx < bounds_size { let w_ptr = weights.get_unchecked(jx..(jx + 1)); - let w0 = _mm_set1_epi32(w_ptr[0] as i32); + let w0 = _mm_set1_epi16(w_ptr[0]); let bounds_start = bounds.start + jx; store = conv_horiz_rgba_1_u16(bounds_start, src, w0, store); jx += 1; diff --git a/src/sse/rgba_u8.rs b/src/sse/rgba_u8.rs index c746c33..1f3dc99 100644 --- a/src/sse/rgba_u8.rs +++ b/src/sse/rgba_u8.rs @@ -72,10 +72,9 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_impl( dst_stride: usize, filter_weights: &FilterWeights, ) { - unsafe { - const CHANNELS: usize = 4; + const CHANNELS: usize = 4; - #[rustfmt::skip] + #[rustfmt::skip] let shuffle_lo = _mm_setr_epi8(0, -1, 4, -1, 1, -1, @@ -85,7 +84,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_impl( 3, -1, 7, -1); - #[rustfmt::skip] + #[rustfmt::skip] let shuffle_hi = _mm_setr_epi8(8, -1, 12, -1, 9, -1, @@ -95,152 +94,147 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_impl( 11, -1, 15, -1); - let vld = _mm_set1_epi32(ROUNDING_CONST); - - let (row0_ref, rest) = dst.split_at_mut(dst_stride); - let (row1_ref, rest) = rest.split_at_mut(dst_stride); - let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); - - let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS); - let iter_row1 = row1_ref.chunks_exact_mut(CHANNELS); - 
let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS); - let iter_row3 = row3_ref.chunks_exact_mut(CHANNELS); - - for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 - .zip(iter_row1) - .zip(iter_row2) - .zip(iter_row3) - .zip(filter_weights.bounds.iter()) - .zip( - filter_weights - .weights - .chunks_exact(filter_weights.aligned_size), - ) - { - let mut jx = 0usize; - let mut store_0 = vld; - let mut store_1 = vld; - let mut store_2 = vld; - let mut store_3 = vld; - - let src0 = src; - let src1 = src0.get_unchecked(src_stride..); - let src2 = src1.get_unchecked(src_stride..); - let src3 = src2.get_unchecked(src_stride..); - - while jx + 4 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 4)); - let weights = _mm_loadu_si64(w_ptr.as_ptr() as *const u8); - const SHUFFLE_01: i32 = shuffle(0, 0, 0, 0); - let weight01 = _mm_shuffle_epi32::(weights); - const SHUFFLE_23: i32 = shuffle(1, 1, 1, 1); - let weight23 = _mm_shuffle_epi32::(weights); - let start_bounds = bounds.start + jx; - - let rgb_pixel_0 = _mm_loadu_si128( - src0.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - let rgb_pixel_1 = _mm_loadu_si128( - src1.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - let rgb_pixel_2 = _mm_loadu_si128( - src2.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - let rgb_pixel_3 = _mm_loadu_si128( - src3.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - - let hi_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_hi); - let lo_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_lo); - let hi_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_hi); - let lo_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_lo); - let hi_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_hi); - let lo_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_lo); - let hi_3 = _mm_shuffle_epi8(rgb_pixel_3, shuffle_hi); - let lo_3 = _mm_shuffle_epi8(rgb_pixel_3, shuffle_lo); - - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(lo_0, weight01)); - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(hi_0, weight23)); - - store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo_1, weight01)); - store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(hi_1, weight23)); - - store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo_2, weight01)); - store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(hi_2, weight23)); - - store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo_3, weight01)); - store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(hi_3, weight23)); - jx += 4; - } - - while jx + 2 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 2)); - let bounds_start = bounds.start + jx; - - let weight01 = _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()); - - let rgb_pixel_0 = - _mm_loadu_si64(src0.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - let rgb_pixel_1 = - _mm_loadu_si64(src1.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - let rgb_pixel_2 = - _mm_loadu_si64(src2.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - let rgb_pixel_3 = - _mm_loadu_si64(src3.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - - let lo_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_lo); - let lo_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_lo); - let lo_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_lo); - let lo_3 = _mm_shuffle_epi8(rgb_pixel_3, shuffle_lo); - - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(lo_0, weight01)); - store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo_1, weight01)); - store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo_2, weight01)); - 
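// Editor's note (illustrative sketch, not part of the patch): the horizontal RGBA
// kernels in this hunk fold two filter taps per channel with a single
// `_mm_madd_epi16`. The bytes of two adjacent source pixels are interleaved per
// channel into zero-extended i16 lanes (the same `shuffle_lo` mask as above), the
// two i16 weights are repeated as (w0, w1) pairs, and madd then yields the r, g,
// b, a accumulators as four i32 lanes. The helper name below is hypothetical.
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
unsafe fn weighted_sum_2_taps_rgba(px0: [u8; 4], px1: [u8; 4], w0: i16, w1: i16) -> [i32; 4] {
    // R0,-,R1,-, G0,-,G1,-, B0,-,B1,-, A0,-,A1,-  (the -1 lanes shuffle in zeros).
    let interleave = _mm_setr_epi8(0, -1, 4, -1, 1, -1, 5, -1, 2, -1, 6, -1, 3, -1, 7, -1);
    let mut bytes = [0u8; 16];
    bytes[..4].copy_from_slice(&px0);
    bytes[4..8].copy_from_slice(&px1);
    let pixels = _mm_shuffle_epi8(_mm_loadu_si128(bytes.as_ptr() as *const __m128i), interleave);
    // (w0, w1) in every 32-bit lane: w0 in even i16 lanes, w1 in odd ones.
    let packed = (((w1 as u16 as u32) << 16) | (w0 as u16 as u32)) as i32;
    let weights = _mm_set1_epi32(packed);
    // Each i32 lane becomes px0[c] * w0 + px1[c] * w1.
    let acc = _mm_madd_epi16(pixels, weights);
    let mut out = [0i32; 4];
    _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, acc);
    out
}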
store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo_3, weight01)); - - jx += 2; - } - - while jx < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 1)); - - let weight0 = _mm_set1_epi32(w_ptr[0] as i32); - - let start_bounds = bounds.start + jx; - - store_0 = - convolve_horizontal_parts_one_rgba_sse(start_bounds, src0, weight0, store_0); - store_1 = - convolve_horizontal_parts_one_rgba_sse(start_bounds, src1, weight0, store_1); - store_2 = - convolve_horizontal_parts_one_rgba_sse(start_bounds, src2, weight0, store_2); - store_3 = - convolve_horizontal_parts_one_rgba_sse(start_bounds, src3, weight0, store_3); - jx += 1; - } - - let store_16_8_0 = compress_i32(store_0); - let store_16_8_1 = compress_i32(store_1); - let store_16_8_2 = compress_i32(store_2); - let store_16_8_3 = compress_i32(store_3); - - _mm_storeu_si32( - chunk0.as_mut_ptr() as *mut _, - _mm_packus_epi16(store_16_8_0, store_16_8_0), + let vld = _mm_set1_epi32(ROUNDING_CONST); + + let (row0_ref, rest) = dst.split_at_mut(dst_stride); + let (row1_ref, rest) = rest.split_at_mut(dst_stride); + let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); + + let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS); + let iter_row1 = row1_ref.chunks_exact_mut(CHANNELS); + let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS); + let iter_row3 = row3_ref.chunks_exact_mut(CHANNELS); + + for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 + .zip(iter_row1) + .zip(iter_row2) + .zip(iter_row3) + .zip(filter_weights.bounds.iter()) + .zip( + filter_weights + .weights + .chunks_exact(filter_weights.aligned_size), + ) + { + let mut jx = 0usize; + let mut store_0 = vld; + let mut store_1 = vld; + let mut store_2 = vld; + let mut store_3 = vld; + + let src0 = src; + let src1 = src0.get_unchecked(src_stride..); + let src2 = src1.get_unchecked(src_stride..); + let src3 = src2.get_unchecked(src_stride..); + + while jx + 4 < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 4)); + let weights = _mm_loadu_si64(w_ptr.as_ptr() as *const u8); + const SHUFFLE_01: i32 = shuffle(0, 0, 0, 0); + let weight01 = _mm_shuffle_epi32::(weights); + const SHUFFLE_23: i32 = shuffle(1, 1, 1, 1); + let weight23 = _mm_shuffle_epi32::(weights); + let start_bounds = bounds.start + jx; + + let rgb_pixel_0 = _mm_loadu_si128( + src0.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, ); - _mm_storeu_si32( - chunk1.as_mut_ptr() as *mut _, - _mm_packus_epi16(store_16_8_1, store_16_8_1), + let rgb_pixel_1 = _mm_loadu_si128( + src1.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, ); - _mm_storeu_si32( - chunk2.as_mut_ptr() as *mut _, - _mm_packus_epi16(store_16_8_2, store_16_8_2), + let rgb_pixel_2 = _mm_loadu_si128( + src2.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, ); - _mm_storeu_si32( - chunk3.as_mut_ptr() as *mut _, - _mm_packus_epi16(store_16_8_3, store_16_8_3), + let rgb_pixel_3 = _mm_loadu_si128( + src3.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, ); + + let hi_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_hi); + let lo_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_lo); + let hi_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_hi); + let lo_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_lo); + let hi_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_hi); + let lo_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_lo); + let hi_3 = _mm_shuffle_epi8(rgb_pixel_3, shuffle_hi); + let lo_3 = _mm_shuffle_epi8(rgb_pixel_3, shuffle_lo); + + store_0 = _mm_add_epi32(store_0, 
_mm_madd_epi16(lo_0, weight01)); + store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(hi_0, weight23)); + + store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo_1, weight01)); + store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(hi_1, weight23)); + + store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo_2, weight01)); + store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(hi_2, weight23)); + + store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo_3, weight01)); + store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(hi_3, weight23)); + jx += 4; + } + + while jx + 2 < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 2)); + let bounds_start = bounds.start + jx; + + let weight01 = _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()); + + let rgb_pixel_0 = + _mm_loadu_si64(src0.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); + let rgb_pixel_1 = + _mm_loadu_si64(src1.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); + let rgb_pixel_2 = + _mm_loadu_si64(src2.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); + let rgb_pixel_3 = + _mm_loadu_si64(src3.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); + + let lo_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_lo); + let lo_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_lo); + let lo_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_lo); + let lo_3 = _mm_shuffle_epi8(rgb_pixel_3, shuffle_lo); + + store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(lo_0, weight01)); + store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo_1, weight01)); + store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo_2, weight01)); + store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo_3, weight01)); + + jx += 2; + } + + while jx < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 1)); + + let weight0 = _mm_set1_epi32(w_ptr[0] as i32); + + let start_bounds = bounds.start + jx; + + store_0 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src0, weight0, store_0); + store_1 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src1, weight0, store_1); + store_2 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src2, weight0, store_2); + store_3 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src3, weight0, store_3); + jx += 1; } + + let store_16_8_0 = compress_i32(store_0); + let store_16_8_1 = compress_i32(store_1); + let store_16_8_2 = compress_i32(store_2); + let store_16_8_3 = compress_i32(store_3); + + _mm_storeu_si32( + chunk0.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_0, store_16_8_0), + ); + _mm_storeu_si32( + chunk1.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_1, store_16_8_1), + ); + _mm_storeu_si32( + chunk2.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_2, store_16_8_2), + ); + _mm_storeu_si32( + chunk3.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_3, store_16_8_3), + ); } } diff --git a/src/sse/rgba_u8_lb.rs b/src/sse/rgba_u8_lb.rs index e5b1934..8590618 100644 --- a/src/sse/rgba_u8_lb.rs +++ b/src/sse/rgba_u8_lb.rs @@ -112,254 +112,240 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_impl( dst_stride: usize, filter_weights: &FilterWeights, ) { - unsafe { - const CHANNELS: usize = 4; - - const SCALE: i32 = 6; - const V_SHR: i32 = SCALE; - const ROUNDING: i16 = 1 << (V_SHR - 1); - - let vld = _mm_setr_epi16(ROUNDING, ROUNDING, ROUNDING, ROUNDING, 0, 0, 0, 0); - - let shuffle_weights = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3); - - let (row0_ref, rest) = dst.split_at_mut(dst_stride); - let (row1_ref, rest) = rest.split_at_mut(dst_stride); - let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); - - 
let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS); - let iter_row1 = row1_ref.chunks_exact_mut(CHANNELS); - let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS); - let iter_row3 = row3_ref.chunks_exact_mut(CHANNELS); - - for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 - .zip(iter_row1) - .zip(iter_row2) - .zip(iter_row3) - .zip(filter_weights.bounds.iter()) - .zip( - filter_weights - .weights - .chunks_exact(filter_weights.aligned_size), - ) - { - let mut jx = 0usize; - let mut store_0 = vld; - let mut store_1 = vld; - let mut store_2 = vld; - let mut store_3 = vld; - - let src0 = src; - let src1 = src0.get_unchecked(src_stride..); - let src2 = src1.get_unchecked(src_stride..); - let src3 = src2.get_unchecked(src_stride..); - - while jx + 8 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 8)); - - let weight01 = _mm_shuffle_epi8( - _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()), - shuffle_weights, - ); - let weight23 = _mm_shuffle_epi8( - _mm_set1_epi32( - (w_ptr.get_unchecked(2..).as_ptr() as *const i32).read_unaligned(), - ), - shuffle_weights, - ); - - let weight45 = _mm_shuffle_epi8( - _mm_set1_epi32( - (w_ptr.get_unchecked(4..).as_ptr() as *const i32).read_unaligned(), - ), - shuffle_weights, - ); - - let weight67 = _mm_shuffle_epi8( - _mm_set1_epi32( - (w_ptr.get_unchecked(6..).as_ptr() as *const i32).read_unaligned(), - ), - shuffle_weights, - ); - - let start_bounds = bounds.start + jx; - - let rgb_pixel_0 = _mm_loadu_si128( - src0.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - let rgb_pixel_0_1 = _mm_loadu_si128( - src0.get_unchecked((start_bounds * CHANNELS + 16)..) - .as_ptr() as *const __m128i, - ); - let rgb_pixel_1 = _mm_loadu_si128( - src1.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - let rgb_pixel_1_0 = _mm_loadu_si128( - src1.get_unchecked((start_bounds * CHANNELS + 16)..) - .as_ptr() as *const __m128i, - ); - let rgb_pixel_2 = _mm_loadu_si128( - src2.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - let rgb_pixel_2_1 = _mm_loadu_si128( - src2.get_unchecked((start_bounds * CHANNELS + 16)..) - .as_ptr() as *const __m128i, - ); - let rgb_pixel_3 = _mm_loadu_si128( - src3.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - let rgb_pixel_3_1 = _mm_loadu_si128( - src3.get_unchecked((start_bounds * CHANNELS + 16)..) 
- .as_ptr() as *const __m128i, - ); - - store_0 = hdot4( - store_0, - rgb_pixel_0, - rgb_pixel_0_1, - weight01, - weight23, - weight45, - weight67, - ); - store_1 = hdot4( - store_1, - rgb_pixel_1, - rgb_pixel_1_0, - weight01, - weight23, - weight45, - weight67, - ); - store_2 = hdot4( - store_2, - rgb_pixel_2, - rgb_pixel_2_1, - weight01, - weight23, - weight45, - weight67, - ); - store_3 = hdot4( - store_3, - rgb_pixel_3, - rgb_pixel_3_1, - weight01, - weight23, - weight45, - weight67, - ); - - jx += 8; - } - - while jx + 4 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 4)); - - let weight01 = _mm_shuffle_epi8( - _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()), - shuffle_weights, - ); - let weight23 = _mm_shuffle_epi8( - _mm_set1_epi32( - (w_ptr.get_unchecked(2..).as_ptr() as *const i32).read_unaligned(), - ), - shuffle_weights, - ); - let start_bounds = bounds.start + jx; - - let rgb_pixel_0 = _mm_loadu_si128( - src0.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - let rgb_pixel_1 = _mm_loadu_si128( - src1.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - let rgb_pixel_2 = _mm_loadu_si128( - src2.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - let rgb_pixel_3 = _mm_loadu_si128( - src3.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - - store_0 = hdot2(store_0, rgb_pixel_0, weight01, weight23); - store_1 = hdot2(store_1, rgb_pixel_1, weight01, weight23); - store_2 = hdot2(store_2, rgb_pixel_2, weight01, weight23); - store_3 = hdot2(store_3, rgb_pixel_3, weight01, weight23); - - jx += 4; - } - - while jx + 2 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 2)); - let bounds_start = bounds.start + jx; - - let weight01 = _mm_shuffle_epi8( - _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()), - shuffle_weights, - ); - - let rgb_pixel_0 = - _mm_loadu_si64(src0.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - let rgb_pixel_1 = - _mm_loadu_si64(src1.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - let rgb_pixel_2 = - _mm_loadu_si64(src2.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - let rgb_pixel_3 = - _mm_loadu_si64(src3.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - - store_0 = hdot(store_0, rgb_pixel_0, weight01); - store_1 = hdot(store_1, rgb_pixel_1, weight01); - store_2 = hdot(store_2, rgb_pixel_2, weight01); - store_3 = hdot(store_3, rgb_pixel_3, weight01); - - jx += 2; - } - - while jx < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 1)); - - let weight0 = _mm_set1_epi16(w_ptr[0]); - - let start_bounds = bounds.start + jx; - - store_0 = - convolve_horizontal_parts_one_rgba_sse(start_bounds, src0, weight0, store_0); - store_1 = - convolve_horizontal_parts_one_rgba_sse(start_bounds, src1, weight0, store_1); - store_2 = - convolve_horizontal_parts_one_rgba_sse(start_bounds, src2, weight0, store_2); - store_3 = - convolve_horizontal_parts_one_rgba_sse(start_bounds, src3, weight0, store_3); - jx += 1; - } - - store_0 = _mm_add_hi_lo_epi16(store_0); - store_1 = _mm_add_hi_lo_epi16(store_1); - store_2 = _mm_add_hi_lo_epi16(store_2); - store_3 = _mm_add_hi_lo_epi16(store_3); - - let store_16_8_0 = _mm_srai_epi16::(store_0); - let store_16_8_1 = _mm_srai_epi16::(store_1); - let store_16_8_2 = _mm_srai_epi16::(store_2); - let store_16_8_3 = _mm_srai_epi16::(store_3); - - _mm_storeu_si32( - chunk0.as_mut_ptr() as *mut _, - _mm_packus_epi16(store_16_8_0, store_16_8_0), - ); - 
_mm_storeu_si32( - chunk1.as_mut_ptr() as *mut _, - _mm_packus_epi16(store_16_8_1, store_16_8_1), - ); - _mm_storeu_si32( - chunk2.as_mut_ptr() as *mut _, - _mm_packus_epi16(store_16_8_2, store_16_8_2), - ); - _mm_storeu_si32( - chunk3.as_mut_ptr() as *mut _, - _mm_packus_epi16(store_16_8_3, store_16_8_3), + const CHANNELS: usize = 4; + + const SCALE: i32 = 6; + const V_SHR: i32 = SCALE; + const ROUNDING: i16 = 1 << (V_SHR - 1); + + let vld = _mm_setr_epi16(ROUNDING, ROUNDING, ROUNDING, ROUNDING, 0, 0, 0, 0); + + let shuffle_weights = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3); + + let (row0_ref, rest) = dst.split_at_mut(dst_stride); + let (row1_ref, rest) = rest.split_at_mut(dst_stride); + let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); + + let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS); + let iter_row1 = row1_ref.chunks_exact_mut(CHANNELS); + let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS); + let iter_row3 = row3_ref.chunks_exact_mut(CHANNELS); + + for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 + .zip(iter_row1) + .zip(iter_row2) + .zip(iter_row3) + .zip(filter_weights.bounds.iter()) + .zip( + filter_weights + .weights + .chunks_exact(filter_weights.aligned_size), + ) + { + let mut jx = 0usize; + let mut store_0 = vld; + let mut store_1 = vld; + let mut store_2 = vld; + let mut store_3 = vld; + + let src0 = src; + let src1 = src0.get_unchecked(src_stride..); + let src2 = src1.get_unchecked(src_stride..); + let src3 = src2.get_unchecked(src_stride..); + + while jx + 8 < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 8)); + + let weight01 = _mm_shuffle_epi8( + _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()), + shuffle_weights, + ); + let weight23 = _mm_shuffle_epi8( + _mm_set1_epi32((w_ptr.get_unchecked(2..).as_ptr() as *const i32).read_unaligned()), + shuffle_weights, + ); + + let weight45 = _mm_shuffle_epi8( + _mm_set1_epi32((w_ptr.get_unchecked(4..).as_ptr() as *const i32).read_unaligned()), + shuffle_weights, + ); + + let weight67 = _mm_shuffle_epi8( + _mm_set1_epi32((w_ptr.get_unchecked(6..).as_ptr() as *const i32).read_unaligned()), + shuffle_weights, + ); + + let start_bounds = bounds.start + jx; + + let rgb_pixel_0 = _mm_loadu_si128( + src0.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, + ); + let rgb_pixel_0_1 = _mm_loadu_si128( + src0.get_unchecked((start_bounds * CHANNELS + 16)..) + .as_ptr() as *const __m128i, + ); + let rgb_pixel_1 = _mm_loadu_si128( + src1.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, + ); + let rgb_pixel_1_0 = _mm_loadu_si128( + src1.get_unchecked((start_bounds * CHANNELS + 16)..) + .as_ptr() as *const __m128i, + ); + let rgb_pixel_2 = _mm_loadu_si128( + src2.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, ); + let rgb_pixel_2_1 = _mm_loadu_si128( + src2.get_unchecked((start_bounds * CHANNELS + 16)..) + .as_ptr() as *const __m128i, + ); + let rgb_pixel_3 = _mm_loadu_si128( + src3.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, + ); + let rgb_pixel_3_1 = _mm_loadu_si128( + src3.get_unchecked((start_bounds * CHANNELS + 16)..) 
+ .as_ptr() as *const __m128i, + ); + + store_0 = hdot4( + store_0, + rgb_pixel_0, + rgb_pixel_0_1, + weight01, + weight23, + weight45, + weight67, + ); + store_1 = hdot4( + store_1, + rgb_pixel_1, + rgb_pixel_1_0, + weight01, + weight23, + weight45, + weight67, + ); + store_2 = hdot4( + store_2, + rgb_pixel_2, + rgb_pixel_2_1, + weight01, + weight23, + weight45, + weight67, + ); + store_3 = hdot4( + store_3, + rgb_pixel_3, + rgb_pixel_3_1, + weight01, + weight23, + weight45, + weight67, + ); + + jx += 8; } + + while jx + 4 < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 4)); + + let weight01 = _mm_shuffle_epi8( + _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()), + shuffle_weights, + ); + let weight23 = _mm_shuffle_epi8( + _mm_set1_epi32((w_ptr.get_unchecked(2..).as_ptr() as *const i32).read_unaligned()), + shuffle_weights, + ); + let start_bounds = bounds.start + jx; + + let rgb_pixel_0 = _mm_loadu_si128( + src0.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, + ); + let rgb_pixel_1 = _mm_loadu_si128( + src1.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, + ); + let rgb_pixel_2 = _mm_loadu_si128( + src2.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, + ); + let rgb_pixel_3 = _mm_loadu_si128( + src3.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, + ); + + store_0 = hdot2(store_0, rgb_pixel_0, weight01, weight23); + store_1 = hdot2(store_1, rgb_pixel_1, weight01, weight23); + store_2 = hdot2(store_2, rgb_pixel_2, weight01, weight23); + store_3 = hdot2(store_3, rgb_pixel_3, weight01, weight23); + + jx += 4; + } + + while jx + 2 < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 2)); + let bounds_start = bounds.start + jx; + + let weight01 = _mm_shuffle_epi8( + _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()), + shuffle_weights, + ); + + let rgb_pixel_0 = + _mm_loadu_si64(src0.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); + let rgb_pixel_1 = + _mm_loadu_si64(src1.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); + let rgb_pixel_2 = + _mm_loadu_si64(src2.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); + let rgb_pixel_3 = + _mm_loadu_si64(src3.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); + + store_0 = hdot(store_0, rgb_pixel_0, weight01); + store_1 = hdot(store_1, rgb_pixel_1, weight01); + store_2 = hdot(store_2, rgb_pixel_2, weight01); + store_3 = hdot(store_3, rgb_pixel_3, weight01); + + jx += 2; + } + + while jx < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 1)); + + let weight0 = _mm_set1_epi16(w_ptr[0]); + + let start_bounds = bounds.start + jx; + + store_0 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src0, weight0, store_0); + store_1 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src1, weight0, store_1); + store_2 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src2, weight0, store_2); + store_3 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src3, weight0, store_3); + jx += 1; + } + + store_0 = _mm_add_hi_lo_epi16(store_0); + store_1 = _mm_add_hi_lo_epi16(store_1); + store_2 = _mm_add_hi_lo_epi16(store_2); + store_3 = _mm_add_hi_lo_epi16(store_3); + + let store_16_8_0 = _mm_srai_epi16::(store_0); + let store_16_8_1 = _mm_srai_epi16::(store_1); + let store_16_8_2 = _mm_srai_epi16::(store_2); + let store_16_8_3 = _mm_srai_epi16::(store_3); + + _mm_storeu_si32( + chunk0.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_0, store_16_8_0), + ); + _mm_storeu_si32( + 
chunk1.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_1, store_16_8_1), + ); + _mm_storeu_si32( + chunk2.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_2, store_16_8_2), + ); + _mm_storeu_si32( + chunk3.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_3, store_16_8_3), + ); } } diff --git a/src/sse/u8_utils.rs b/src/sse/u8_utils.rs index 7b6e5ec..c79518a 100644 --- a/src/sse/u8_utils.rs +++ b/src/sse/u8_utils.rs @@ -36,9 +36,11 @@ use crate::support::PRECISION; #[inline(always)] pub(crate) fn compress_i32(x: __m128i) -> __m128i { - let store_32 = unsafe { _mm_srai_epi32::(x) }; - let store_16 = unsafe { _mm_packus_epi32(store_32, store_32) }; - unsafe { _mm_packus_epi16(store_16, store_16) } + unsafe { + let store_32 = _mm_srai_epi32::(x); + let store_16 = _mm_packus_epi32(store_32, store_32); + _mm_packus_epi16(store_16, store_16) + } } #[inline] diff --git a/src/sse/vertical_u16.rs b/src/sse/vertical_u16.rs index a1ca78b..eb2de77 100644 --- a/src/sse/vertical_u16.rs +++ b/src/sse/vertical_u16.rs @@ -144,19 +144,19 @@ unsafe fn convolve_column_lb_u16_impl( } let v_st0 = _mm_min_epi32( - _mm_cvtps_epi32(_mm_round_ps::(_mm_max_ps(store0, zeros_ps))), + _mm_cvtps_epi32(_mm_round_ps::(store0)), v_max_colors, ); let v_st1 = _mm_min_epi32( - _mm_cvtps_epi32(_mm_round_ps::(_mm_max_ps(store1, zeros_ps))), + _mm_cvtps_epi32(_mm_round_ps::(store1)), v_max_colors, ); let v_st2 = _mm_min_epi32( - _mm_cvtps_epi32(_mm_round_ps::(_mm_max_ps(store2, zeros_ps))), + _mm_cvtps_epi32(_mm_round_ps::(store2)), v_max_colors, ); let v_st3 = _mm_min_epi32( - _mm_cvtps_epi32(_mm_round_ps::(_mm_max_ps(store3, zeros_ps))), + _mm_cvtps_epi32(_mm_round_ps::(store3)), v_max_colors, ); @@ -201,11 +201,11 @@ unsafe fn convolve_column_lb_u16_impl( } let v_st0 = _mm_min_epi32( - _mm_cvtps_epi32(_mm_round_ps::(_mm_max_ps(store0, zeros_ps))), + _mm_cvtps_epi32(_mm_round_ps::(store0)), v_max_colors, ); let v_st1 = _mm_min_epi32( - _mm_cvtps_epi32(_mm_round_ps::(_mm_max_ps(store1, zeros_ps))), + _mm_cvtps_epi32(_mm_round_ps::(store1)), v_max_colors, ); @@ -351,7 +351,7 @@ unsafe fn convolve_column_lb_u16_impl( } let v_st = _mm_min_epi32( - _mm_cvtps_epi32(_mm_round_ps::(_mm_max_ps(store0, zeros_ps))), + _mm_cvtps_epi32(_mm_round_ps::(store0)), v_max_colors, ); diff --git a/src/sse/vertical_u16_lb.rs b/src/sse/vertical_u16_lb.rs index 3f74c1e..1ee9d9b 100644 --- a/src/sse/vertical_u16_lb.rs +++ b/src/sse/vertical_u16_lb.rs @@ -58,7 +58,7 @@ unsafe fn convolve_column_lb_u16_impl( weight: &[i16], bit_depth: u32, ) { - assert!(bit_depth >= 1 && bit_depth <= 16); + assert!((1..=16).contains(&bit_depth)); let max_colors = (1 << bit_depth) - 1; let mut cx = 0usize; @@ -173,8 +173,8 @@ unsafe fn convolve_column_lb_u16_impl( if bounds_size == 2 { let weights = weight.get_unchecked(0..2); - let v_weight0 = _mm_set1_epi32(weights[0] as i32); - let v_weight1 = _mm_set1_epi32(weights[1] as i32); + let v_weight0 = _mm_set1_epi16(weights[0]); + let v_weight1 = _mm_set1_epi16(weights[1]); let py = bounds.start; let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); @@ -194,9 +194,9 @@ unsafe fn convolve_column_lb_u16_impl( } else if bounds_size == 3 { let weights = weight.get_unchecked(0..3); - let v_weight0 = _mm_set1_epi32(weights[0] as i32); - let v_weight1 = _mm_set1_epi32(weights[1] as i32); - let v_weight2 = _mm_set1_epi32(weights[2] as i32); + let v_weight0 = _mm_set1_epi16(weights[0]); + let v_weight1 = _mm_set1_epi16(weights[1]); + let v_weight2 = _mm_set1_epi16(weights[2]); let py = bounds.start; let 
src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); @@ -223,10 +223,10 @@ unsafe fn convolve_column_lb_u16_impl( } else if bounds_size == 4 { let weights = weight.get_unchecked(0..4); - let v_weight0 = _mm_set1_epi32(weights[0] as i32); - let v_weight1 = _mm_set1_epi32(weights[1] as i32); - let v_weight2 = _mm_set1_epi32(weights[2] as i32); - let v_weight3 = _mm_set1_epi32(weights[3] as i32); + let v_weight0 = _mm_set1_epi16(weights[0]); + let v_weight1 = _mm_set1_epi16(weights[1]); + let v_weight2 = _mm_set1_epi16(weights[2]); + let v_weight3 = _mm_set1_epi16(weights[3]); let py = bounds.start; let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); @@ -262,7 +262,7 @@ unsafe fn convolve_column_lb_u16_impl( let py = bounds.start + j; let src_ptr = src.get_unchecked((src_stride * py + v_dx)..); - let v_weight = _mm_set1_epi32(k_weight as i32); + let v_weight = _mm_set1_epi16(k_weight); let item_row = _mm_loadu_si64(src_ptr.as_ptr() as *const u8); diff --git a/src/sse/vertical_u8.rs b/src/sse/vertical_u8.rs index ab567d7..060901e 100644 --- a/src/sse/vertical_u8.rs +++ b/src/sse/vertical_u8.rs @@ -224,22 +224,28 @@ pub(crate) unsafe fn convolve_vertical_part_sse_8( if bounds_size == 2 { let py = start_y; let weight = filter.get_unchecked(0..2); - let v_weight0 = _mm_set1_epi32(weight[0] as i32); - let v_weight1 = _mm_set1_epi32(weight[1] as i32); + let v_weight0 = _mm_set1_epi16(weight[0]); + let v_weight1 = _mm_set1_epi16(weight[1]); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let item_row0 = _mm_loadu_si64(src_ptr0.as_ptr()); let item_row1 = _mm_loadu_si64(src_ptr1.as_ptr()); let low0 = _mm_unpacklo_epi8(item_row0, zeros); - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(low0), v_weight0)); + store_0 = _mm_add_epi32( + store_0, + _mm_madd_epi16(_mm_unpacklo_epi16(low0, zeros), v_weight0), + ); store_1 = _mm_add_epi32( store_1, _mm_madd_epi16(_mm_unpackhi_epi16(low0, zeros), v_weight0), ); let low1 = _mm_unpacklo_epi8(item_row1, zeros); - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(low1), v_weight1)); + store_0 = _mm_add_epi32( + store_0, + _mm_madd_epi16(_mm_unpacklo_epi16(low1, zeros), v_weight1), + ); store_1 = _mm_add_epi32( store_1, _mm_madd_epi16(_mm_unpackhi_epi16(low1, zeros), v_weight1), @@ -247,9 +253,9 @@ pub(crate) unsafe fn convolve_vertical_part_sse_8( } else if bounds_size == 3 { let py = start_y; let weight = filter.get_unchecked(0..3); - let v_weight0 = _mm_set1_epi32(weight[0] as i32); - let v_weight1 = _mm_set1_epi32(weight[1] as i32); - let v_weight2 = _mm_set1_epi32(weight[2] as i32); + let v_weight0 = _mm_set1_epi16(weight[0]); + let v_weight1 = _mm_set1_epi16(weight[1]); + let v_weight2 = _mm_set1_epi16(weight[2]); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); @@ -258,21 +264,30 @@ pub(crate) unsafe fn convolve_vertical_part_sse_8( let item_row2 = _mm_loadu_si64(src_ptr2.as_ptr()); let low0 = _mm_unpacklo_epi8(item_row0, zeros); - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(low0), v_weight0)); + store_0 = _mm_add_epi32( + store_0, + _mm_madd_epi16(_mm_unpacklo_epi16(low0, zeros), v_weight0), + ); store_1 = _mm_add_epi32( store_1, _mm_madd_epi16(_mm_unpackhi_epi16(low0, zeros), v_weight0), ); let low1 = _mm_unpacklo_epi8(item_row1, zeros); - store_0 = 
_mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(low1), v_weight1)); + store_0 = _mm_add_epi32( + store_0, + _mm_madd_epi16(_mm_unpacklo_epi16(low1, zeros), v_weight1), + ); store_1 = _mm_add_epi32( store_1, _mm_madd_epi16(_mm_unpackhi_epi16(low1, zeros), v_weight1), ); let low2 = _mm_unpacklo_epi8(item_row2, zeros); - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(low2), v_weight2)); + store_0 = _mm_add_epi32( + store_0, + _mm_madd_epi16(_mm_unpacklo_epi16(low2, zeros), v_weight2), + ); store_1 = _mm_add_epi32( store_1, _mm_madd_epi16(_mm_unpackhi_epi16(low2, zeros), v_weight2), @@ -280,10 +295,10 @@ pub(crate) unsafe fn convolve_vertical_part_sse_8( } else if bounds_size == 4 { let py = start_y; let weight = filter.get_unchecked(0..4); - let v_weight0 = _mm_set1_epi32(weight[0] as i32); - let v_weight1 = _mm_set1_epi32(weight[1] as i32); - let v_weight2 = _mm_set1_epi32(weight[2] as i32); - let v_weight3 = _mm_set1_epi32(weight[3] as i32); + let v_weight0 = _mm_set1_epi16(weight[0]); + let v_weight1 = _mm_set1_epi16(weight[1]); + let v_weight2 = _mm_set1_epi16(weight[2]); + let v_weight3 = _mm_set1_epi16(weight[3]); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); @@ -294,28 +309,40 @@ pub(crate) unsafe fn convolve_vertical_part_sse_8( let item_row3 = _mm_loadu_si64(src_ptr3.as_ptr()); let low0 = _mm_unpacklo_epi8(item_row0, zeros); - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(low0), v_weight0)); + store_0 = _mm_add_epi32( + store_0, + _mm_madd_epi16(_mm_unpacklo_epi16(low0, zeros), v_weight0), + ); store_1 = _mm_add_epi32( store_1, _mm_madd_epi16(_mm_unpackhi_epi16(low0, zeros), v_weight0), ); let low1 = _mm_unpacklo_epi8(item_row1, zeros); - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(low1), v_weight1)); + store_0 = _mm_add_epi32( + store_0, + _mm_madd_epi16(_mm_unpacklo_epi16(low1, zeros), v_weight1), + ); store_1 = _mm_add_epi32( store_1, _mm_madd_epi16(_mm_unpackhi_epi16(low1, zeros), v_weight1), ); let low2 = _mm_unpacklo_epi8(item_row2, zeros); - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(low2), v_weight2)); + store_0 = _mm_add_epi32( + store_0, + _mm_madd_epi16(_mm_unpacklo_epi16(low2, zeros), v_weight2), + ); store_1 = _mm_add_epi32( store_1, _mm_madd_epi16(_mm_unpackhi_epi16(low2, zeros), v_weight2), ); let low3 = _mm_unpacklo_epi8(item_row3, zeros); - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(low3), v_weight3)); + store_0 = _mm_add_epi32( + store_0, + _mm_madd_epi16(_mm_unpacklo_epi16(low3, zeros), v_weight3), + ); store_1 = _mm_add_epi32( store_1, _mm_madd_epi16(_mm_unpackhi_epi16(low3, zeros), v_weight3), @@ -324,12 +351,15 @@ pub(crate) unsafe fn convolve_vertical_part_sse_8( for j in 0..bounds_size { let py = start_y + j; let weight = *filter.get_unchecked(j); - let v_weight = _mm_set1_epi32(weight as i32); + let v_weight = _mm_set1_epi16(weight); let src_ptr = src.get_unchecked((src_stride * py + px)..); let item_row = _mm_loadu_si64(src_ptr.as_ptr()); let low = _mm_unpacklo_epi8(item_row, zeros); - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(low), v_weight)); + store_0 = _mm_add_epi32( + store_0, + _mm_madd_epi16(_mm_unpacklo_epi16(low, zeros), v_weight), + ); store_1 = _mm_add_epi32( store_1, _mm_madd_epi16(_mm_unpackhi_epi16(low, zeros), v_weight), @@ -368,8 +398,8 @@ pub(crate) 
unsafe fn convolve_vertical_part_sse( if bounds_size == 2 { let py = start_y; let weight = filter.get_unchecked(0..2); - let v_weight0 = _mm_set1_epi32(weight[0] as i32); - let v_weight1 = _mm_set1_epi32(weight[1] as i32); + let v_weight0 = _mm_set1_epi16(weight[0]); + let v_weight1 = _mm_set1_epi16(weight[1]); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let item_row0 = @@ -382,9 +412,9 @@ pub(crate) unsafe fn convolve_vertical_part_sse( } else if bounds_size == 3 { let py = start_y; let weight = filter.get_unchecked(0..3); - let v_weight0 = _mm_set1_epi32(weight[0] as i32); - let v_weight1 = _mm_set1_epi32(weight[1] as i32); - let v_weight2 = _mm_set1_epi32(weight[2] as i32); + let v_weight0 = _mm_set1_epi16(weight[0]); + let v_weight1 = _mm_set1_epi16(weight[1]); + let v_weight2 = _mm_set1_epi16(weight[2]); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); @@ -401,10 +431,10 @@ pub(crate) unsafe fn convolve_vertical_part_sse( } else if bounds_size == 4 { let py = start_y; let weight = filter.get_unchecked(0..4); - let v_weight0 = _mm_set1_epi32(weight[0] as i32); - let v_weight1 = _mm_set1_epi32(weight[1] as i32); - let v_weight2 = _mm_set1_epi32(weight[2] as i32); - let v_weight3 = _mm_set1_epi32(weight[3] as i32); + let v_weight0 = _mm_set1_epi16(weight[0]); + let v_weight1 = _mm_set1_epi16(weight[1]); + let v_weight2 = _mm_set1_epi16(weight[2]); + let v_weight3 = _mm_set1_epi16(weight[3]); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); diff --git a/src/sse/vertical_u8_lp.rs b/src/sse/vertical_u8_lp.rs index 4236b58..f10fdc8 100644 --- a/src/sse/vertical_u8_lp.rs +++ b/src/sse/vertical_u8_lp.rs @@ -77,6 +77,7 @@ unsafe fn convolve_vertical_sse_row_impl( let mut cx = 0usize; let mut rem = dst; + let iter_64 = rem.chunks_exact_mut(64); for dst in iter_64 { @@ -140,6 +141,7 @@ unsafe fn convolve_vertical_sse_row_impl( } rem = rem.chunks_exact_mut(64).into_remainder(); + let iter_32 = rem.chunks_exact_mut(32); for dst in iter_32 {
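// Editor's note (illustrative sketch, not part of the patch): the vertical u8 path
// above now widens each loaded row with `_mm_unpacklo/_mm_unpackhi_epi16(.., zeros)`
// and splats the i16 weight with `_mm_set1_epi16`, so `_mm_madd_epi16` degenerates
// to a plain 16x16 -> 32-bit multiply per lane (`pixel * w + 0 * w`). A minimal
// accumulation step in that style, with a hypothetical helper name:
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn accumulate_row_8(acc: (__m128i, __m128i), row: &[u8; 8], w: i16) -> (__m128i, __m128i) {
    let zeros = _mm_setzero_si128();
    let v_weight = _mm_set1_epi16(w);
    // Load 8 source bytes and widen them to 8 x i16.
    let item_row = _mm_loadl_epi64(row.as_ptr() as *const __m128i);
    let low = _mm_unpacklo_epi8(item_row, zeros);
    // Zero-interleave before madd: each i32 lane is pixel * w.
    let lo = _mm_add_epi32(acc.0, _mm_madd_epi16(_mm_unpacklo_epi16(low, zeros), v_weight));
    let hi = _mm_add_epi32(acc.1, _mm_madd_epi16(_mm_unpackhi_epi16(low, zeros), v_weight));
    (lo, hi)
}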