diff --git a/app/src/main.rs b/app/src/main.rs
index b63e78e..42ad481 100644
--- a/app/src/main.rs
+++ b/app/src/main.rs
@@ -13,8 +13,9 @@ use fast_image_resize::{
 use image::{EncodableLayout, GenericImageView, ImageReader};
 use pic_scale::{
     Ar30ByteOrder, ImageSize, ImageStore, ImageStoreMut, ImageStoreScaling, ResamplingFunction,
-    RgbF16ImageStore, RgbF16ImageStoreMut, Rgba16ImageStoreMut, RgbaF16ImageStore,
-    RgbaF16ImageStoreMut, Scaler, Scaling, ScalingU16, ThreadingPolicy, WorkloadStrategy,
+    RgbF16ImageStore, RgbF16ImageStoreMut, Rgba16ImageStore, Rgba16ImageStoreMut,
+    RgbaF16ImageStore, RgbaF16ImageStoreMut, Scaler, Scaling, ScalingU16, ThreadingPolicy,
+    WorkloadStrategy,
 };
 
 fn resize_plane(
@@ -51,26 +52,26 @@ fn main() {
         .decode()
         .unwrap();
     let dimensions = img.dimensions();
-    let transient = img.to_rgb8();
+    let transient = img.to_rgba8();
     let mut bytes = Vec::from(transient.as_bytes());
 
     let mut scaler = Scaler::new(ResamplingFunction::Lanczos3);
     scaler.set_threading_policy(ThreadingPolicy::Single);
-    scaler.set_workload_strategy(WorkloadStrategy::PreferSpeed);
+    scaler.set_workload_strategy(WorkloadStrategy::PreferQuality);
 
     // resize_plane(378, 257, 257, 257, ResamplingFunction::Bilinear);
 
-    // let mut choke: Vec<u16> = bytes.iter().map(|&x| (x as u16) << 2).collect();
+    let mut choke: Vec<u16> = bytes.iter().map(|&x| (x as u16) << 2).collect();
 
-    let rgb_feature16 = transient
-        .iter()
-        .map(|&x| (x as f32 / 255f32) as f16)
-        .collect::<Vec<_>>();
+    // let rgb_feature16 = transient
+    //     .iter()
+    //     .map(|&x| (x as f32 / 255f32) as f16)
+    //     .collect::<Vec<_>>();
     //
-    let store =
-        RgbF16ImageStore::from_slice(&rgb_feature16, dimensions.0 as usize, dimensions.1 as usize)
-            .unwrap();
+    let mut store =
+        Rgba16ImageStore::from_slice(&choke, dimensions.0 as usize, dimensions.1 as usize).unwrap();
+    store.bit_depth = 10;
 
     // let dst_size = ImageSize::new(dimensions.0 as usize / 4, dimensions.1 as usize / 4);
     //
     // let mut resized_ar = vec![0u32; dst_size.width * dst_size.height];
@@ -84,15 +85,17 @@ fn main() {
     // //     )
     // //     .unwrap();
     //
-    let mut dst_store = RgbF16ImageStoreMut::alloc_with_depth(
-        dimensions.0 as usize / 4,
-        dimensions.1 as usize / 4,
+    let mut dst_store = Rgba16ImageStoreMut::alloc_with_depth(
+        dimensions.0 as usize / 2,
+        dimensions.1 as usize / 2,
         10,
     );
     //
     // // for i in 0..25 {
     //     let start_time = Instant::now();
-    scaler.resize_rgb_f16(&store, &mut dst_store).unwrap();
+    scaler
+        .resize_rgba_u16(&store, &mut dst_store, false)
+        .unwrap();
     //
     //     let elapsed_time = start_time.elapsed();
     //     // Print the elapsed time in milliseconds
@@ -134,17 +137,17 @@ fn main() {
     // //     .map(|&x| (x * 255f32) as u8)
     // //     .collect();
     //
-    // // let dst: Vec<u8> = dst_store
-    // //     .as_bytes()
-    // //     .iter()
-    // //     .map(|&x| (x >> 2) as u8)
-    // //     .collect();
-    //
-    let dst = dst_store
+    let dst: Vec<u8> = dst_store
         .as_bytes()
         .iter()
-        .map(|&x| (x as f32 * 255.).round() as u8)
-        .collect::<Vec<_>>();
+        .map(|&x| (x >> 2) as u8)
+        .collect();
+
+    // let dst = dst_store
+    //     .as_bytes()
+    //     .iter()
+    //     .map(|&x| (x as f32 * 255.).round() as u8)
+    //     .collect::<Vec<_>>();
 
     // let dst = resized;
     // image::save_buffer(
     //     "converted.png",
diff --git a/fuzz/resize_cbcr_f16/resize_cbcr_f16.rs b/fuzz/resize_cbcr_f16/resize_cbcr_f16.rs
index f0422a1..db04a3d 100644
--- a/fuzz/resize_cbcr_f16/resize_cbcr_f16.rs
+++ b/fuzz/resize_cbcr_f16/resize_cbcr_f16.rs
@@ -29,11 +29,9 @@
 #![feature(f16)]
 #![no_main]
 
-use libfuzzer_sys::fuzz_target;
-use pic_scale::{
-    ImageStore, ImageStoreMut, ResamplingFunction, Scaler, WorkloadStrategy,
-};
 use core::f16;
+use libfuzzer_sys::fuzz_target;
+use pic_scale::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler, WorkloadStrategy};
 
 fuzz_target!(|data: (u16, u16, u16, u16, bool)| {
     let strategy = if data.4 {
diff --git a/fuzz/resize_rgb_f16/resize_rgb_f16.rs b/fuzz/resize_rgb_f16/resize_rgb_f16.rs
index 5d04a65..0446e31 100644
--- a/fuzz/resize_rgb_f16/resize_rgb_f16.rs
+++ b/fuzz/resize_rgb_f16/resize_rgb_f16.rs
@@ -31,9 +31,7 @@
 
 use core::f16;
 use libfuzzer_sys::fuzz_target;
-use pic_scale::{
-    ImageStore, ImageStoreMut, ResamplingFunction, Scaler, WorkloadStrategy,
-};
+use pic_scale::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler, WorkloadStrategy};
 
 fuzz_target!(|data: (u16, u16, u16, u16, bool)| {
     let strategy = if data.4 {
diff --git a/src/avx2/mod.rs b/src/avx2/mod.rs
index 72176a1..deb4bd1 100644
--- a/src/avx2/mod.rs
+++ b/src/avx2/mod.rs
@@ -45,6 +45,7 @@ mod vertical_f32;
 mod vertical_u16_lb;
 mod vertical_u8;
 mod vertical_u8_lp;
+mod rgba_u16_lb;
 
 #[cfg(feature = "nightly_f16")]
 pub(crate) use alpha_f16::{avx_premultiply_alpha_rgba_f16, avx_unpremultiply_alpha_rgba_f16};
@@ -73,3 +74,4 @@ pub(crate) use vertical_f32::convolve_vertical_avx_row_f32;
 pub(crate) use vertical_u16_lb::convolve_column_lb_avx2_u16;
 pub(crate) use vertical_u8::convolve_vertical_avx_row;
 pub(crate) use vertical_u8_lp::convolve_vertical_avx_row_lp;
+pub(crate) use rgba_u16_lb::{convolve_horizontal_rgba_avx_rows_4_u16, convolve_horizontal_rgba_avx_u16lp_row};
\ No newline at end of file
diff --git a/src/avx2/rgb_u8.rs b/src/avx2/rgb_u8.rs
index 3249ff9..93c3107 100644
--- a/src/avx2/rgb_u8.rs
+++ b/src/avx2/rgb_u8.rs
@@ -27,9 +27,9 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-use crate::avx2::utils::_mm256_dot16_avx_epi32;
+use crate::avx2::utils::{_mm256_dot16_avx_epi32, _mm_dot16_avx_epi32};
 use crate::filter_weights::FilterWeights;
-use crate::sse::{_mm_dot16_avx_epi32, compress_i32};
+use crate::sse::compress_i32;
 use crate::support::ROUNDING_CONST;
 #[cfg(target_arch = "x86")]
 use std::arch::x86::*;
diff --git a/src/avx512/rgba_u16_lp.rs b/src/avx2/rgba_u16_lb.rs
similarity index 73%
rename from src/avx512/rgba_u16_lp.rs
rename to src/avx2/rgba_u16_lb.rs
index 901f57d..62fb13a 100644
--- a/src/avx512/rgba_u16_lp.rs
+++ b/src/avx2/rgba_u16_lb.rs
@@ -27,6 +27,7 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+use crate::avx2::utils::{_mm256_dot16_avx_epi32, _mm_dot16_avx_epi32};
 use crate::filter_weights::FilterWeights;
 use crate::support::{PRECISION, ROUNDING_CONST};
 #[cfg(target_arch = "x86")]
@@ -35,7 +36,7 @@ use std::arch::x86::*;
 use std::arch::x86_64::*;
 
 #[inline(always)]
-unsafe fn acc_1_dot(
+unsafe fn acc_1_dot<const HAS_DOT: bool>(
     start_x: usize,
     src: &[u16],
     w0: __m128i,
@@ -45,11 +46,11 @@
     const COMPONENTS: usize = 4;
     let src_ptr = src.get_unchecked((start_x * COMPONENTS)..);
     let rgba_pixel = _mm_loadu_si64(src_ptr.as_ptr() as *const u8);
-    _mm_dpwssd_avx_epi32(store, _mm_shuffle_epi8(rgba_pixel, shuffle), w0)
+    _mm_dot16_avx_epi32::<HAS_DOT>(store, _mm_shuffle_epi8(rgba_pixel, shuffle), w0)
 }
 
 #[inline(always)]
-unsafe fn acc_2_dot(
+unsafe fn acc_2_dot<const HAS_DOT: bool>(
     start_x: usize,
     src: &[u16],
     w0: __m128i,
@@ -59,11 +60,11 @@
     const COMPONENTS: usize = 4;
     let src_ptr = src.get_unchecked((start_x * COMPONENTS)..);
     let rgba_pixel = _mm_loadu_si128(src_ptr.as_ptr() as *const _);
-    _mm_dpwssd_avx_epi32(store, _mm_shuffle_epi8(rgba_pixel, shuffle), w0)
+    _mm_dot16_avx_epi32::<HAS_DOT>(store, _mm_shuffle_epi8(rgba_pixel, shuffle), w0)
 }
 
 #[inline(always)]
-unsafe fn acc_4_dot(
+unsafe fn acc_4_dot<const HAS_DOT: bool>(
     start_x: usize,
     src: &[u16],
     w0: __m256i,
@@ -73,11 +74,11 @@
     const COMPONENTS: usize = 4;
     let src_ptr = src.get_unchecked((start_x * COMPONENTS)..);
     let rgba_pixel = _mm256_loadu_si256(src_ptr.as_ptr() as *const _);
-    _mm256_dpwssd_avx_epi32(store, _mm256_shuffle_epi8(rgba_pixel, shuffle), w0)
+    _mm256_dot16_avx_epi32::<HAS_DOT>(store, _mm256_shuffle_epi8(rgba_pixel, shuffle), w0)
 }
 
 #[inline(always)]
-unsafe fn acc_8_dot(
+unsafe fn acc_8_dot<const HAS_DOT: bool>(
     start_x: usize,
     src: &[u16],
     w0: __m256i,
@@ -90,11 +91,11 @@
     const COMPONENTS: usize = 4;
     let src_ptr = src.get_unchecked((start_x * COMPONENTS)..);
     let rgba_pixel0 = _mm256_loadu_si256(src_ptr.as_ptr() as *const _);
     let rgba_pixel1 = _mm256_loadu_si256(src_ptr.get_unchecked(16..).as_ptr() as *const _);
-    let p0 = _mm256_dpwssd_avx_epi32(store, _mm256_shuffle_epi8(rgba_pixel0, shuffle), w0);
-    _mm256_dpwssd_avx_epi32(p0, _mm256_shuffle_epi8(rgba_pixel1, shuffle), w1)
+    let p0 = _mm256_dot16_avx_epi32::<HAS_DOT>(store, _mm256_shuffle_epi8(rgba_pixel0, shuffle), w0);
+    _mm256_dot16_avx_epi32::<HAS_DOT>(p0, _mm256_shuffle_epi8(rgba_pixel1, shuffle), w1)
 }
 
-pub(crate) fn convolve_horizontal_rgba_vnni_rows_4_u16(
+pub(crate) fn convolve_horizontal_rgba_avx_rows_4_u16(
     src: &[u16],
     src_stride: usize,
     dst: &mut [u16],
@@ -103,7 +104,18 @@
     bit_depth: u32,
 ) {
     unsafe {
-        convolve_horizontal_rgba_vnni_rows_4_lb_impl(
+        #[cfg(feature = "nightly_avx512")]
+        if std::arch::is_x86_feature_detected!("avxvnni") {
+            return convolve_horizontal_rgba_avx_rows_4_lb_vn(
+                src,
+                src_stride,
+                dst,
+                dst_stride,
+                filter_weights,
+                bit_depth,
+            );
+        }
+        convolve_horizontal_rgba_avx_rows_4_lb_a(
             src,
             src_stride,
             dst,
@@ -114,8 +126,47 @@
     }
 }
 
+#[cfg(feature = "nightly_avx512")]
 #[target_feature(enable = "avxvnni", enable = "avx2")]
-unsafe fn convolve_horizontal_rgba_vnni_rows_4_lb_impl(
+unsafe fn convolve_horizontal_rgba_avx_rows_4_lb_vn(
+    src: &[u16],
+    src_stride: usize,
+    dst: &mut [u16],
+    dst_stride: usize,
+    filter_weights: &FilterWeights<i16>,
+    bit_depth: u32,
+) {
+    convolve_horizontal_rgba_avx_rows_4_lb_impl::<true>(
+        src,
+        src_stride,
+        dst,
+        dst_stride,
+        filter_weights,
+        bit_depth,
+    );
+}
+
+#[target_feature(enable = "avx2")]
+unsafe fn convolve_horizontal_rgba_avx_rows_4_lb_a(
+    src: &[u16],
+    src_stride: usize,
+    dst: &mut [u16],
+    dst_stride: usize,
+    filter_weights: &FilterWeights<i16>,
+    bit_depth: u32,
+) {
+    convolve_horizontal_rgba_avx_rows_4_lb_impl::<false>(
+        src,
+        src_stride,
+        dst,
+        dst_stride,
+        filter_weights,
+        bit_depth,
+    );
+}
+
+#[inline(always)]
+unsafe fn convolve_horizontal_rgba_avx_rows_4_lb_impl<const HAS_DOT: bool>(
     src: &[u16],
     src_stride: usize,
     dst: &mut [u16],
@@ -228,10 +279,10 @@ unsafe fn convolve_horizontal_rgba_vnni_rows_4_lb_impl(
                 a_shuffle_weights_table,
             );
             let bounds_start = bounds.start + jx;
-            store_0 = acc_8_dot(bounds_start, src0, w0, w1, store_0, a_shuffle_2_table);
-            store_1 = acc_8_dot(bounds_start, src1, w0, w1, store_1, a_shuffle_2_table);
-            store_2 = acc_8_dot(bounds_start, src2, w0, w1, store_2, a_shuffle_2_table);
-            store_3 = acc_8_dot(bounds_start, src3, w0, w1, store_3, a_shuffle_2_table);
+            store_0 = acc_8_dot::<HAS_DOT>(bounds_start, src0, w0, w1, store_0, a_shuffle_2_table);
+            store_1 = acc_8_dot::<HAS_DOT>(bounds_start, src1, w0, w1, store_1, a_shuffle_2_table);
+            store_2 = acc_8_dot::<HAS_DOT>(bounds_start, src2, w0, w1, store_2, a_shuffle_2_table);
+            store_3 = acc_8_dot::<HAS_DOT>(bounds_start, src3, w0, w1, store_3, a_shuffle_2_table);
             jx += 8;
         }
 
@@ -245,10 +296,10 @@ unsafe fn convolve_horizontal_rgba_vnni_rows_4_lb_impl(
                 ),
                 a_shuffle_weights_table,
             );
-            store_0 = acc_4_dot(bounds_start, src0, w0, store_0, a_shuffle_2_table);
-            store_1 = acc_4_dot(bounds_start, src1, w0, store_1, a_shuffle_2_table);
-            store_2 = acc_4_dot(bounds_start, src2, w0, store_2, a_shuffle_2_table);
-            store_3 = acc_4_dot(bounds_start, src3, w0, store_3, a_shuffle_2_table);
+            store_0 = acc_4_dot::<HAS_DOT>(bounds_start, src0, w0, store_0, a_shuffle_2_table);
+            store_1 = acc_4_dot::<HAS_DOT>(bounds_start, src1, w0, store_1, a_shuffle_2_table);
+            store_2 = acc_4_dot::<HAS_DOT>(bounds_start, src2, w0, store_2, a_shuffle_2_table);
+            store_3 = acc_4_dot::<HAS_DOT>(bounds_start, src3, w0, store_3, a_shuffle_2_table);
             jx += 4;
         }
 
@@ -276,10 +327,10 @@ unsafe fn convolve_horizontal_rgba_vnni_rows_4_lb_impl(
                 _mm_loadu_si32(w_ptr.as_ptr() as *const _),
                 shuffle_weights_table,
             );
-            store_0 = acc_2_dot(bounds_start, src0, w0, store_0, shuffle_2_table);
-            store_1 = acc_2_dot(bounds_start, src1, w0, store_1, shuffle_2_table);
-            store_2 = acc_2_dot(bounds_start, src2, w0, store_2, shuffle_2_table);
-            store_3 = acc_2_dot(bounds_start, src3, w0, store_3, shuffle_2_table);
+            store_0 = acc_2_dot::<HAS_DOT>(bounds_start, src0, w0, store_0, shuffle_2_table);
+            store_1 = acc_2_dot::<HAS_DOT>(bounds_start, src1, w0, store_1, shuffle_2_table);
+            store_2 = acc_2_dot::<HAS_DOT>(bounds_start, src2, w0, store_2, shuffle_2_table);
+            store_3 = acc_2_dot::<HAS_DOT>(bounds_start, src3, w0, store_3, shuffle_2_table);
             jx += 2;
         }
 
@@ -287,10 +338,10 @@ unsafe fn convolve_horizontal_rgba_vnni_rows_4_lb_impl(
             let w_ptr = weights.get_unchecked(jx..(jx + 1));
             let bounds_start = bounds.start + jx;
            let w0 = _mm_shuffle_epi8(_mm_set1_epi16(w_ptr[0]), shuffle_weights_table);
-            store_0 = acc_1_dot(bounds_start, src0, w0, store_0, shuffle_1_table);
-            store_1 = acc_1_dot(bounds_start, src1, w0, store_1, shuffle_1_table);
-            store_2 = acc_1_dot(bounds_start, src2, w0, store_2, shuffle_1_table);
-            store_3 = acc_1_dot(bounds_start, src3, w0, store_3, shuffle_1_table);
+            store_0 = acc_1_dot::<HAS_DOT>(bounds_start, src0, w0, store_0, shuffle_1_table);
+            store_1 = acc_1_dot::<HAS_DOT>(bounds_start, src1, w0, store_1, shuffle_1_table);
+            store_2 = acc_1_dot::<HAS_DOT>(bounds_start, src2, w0, store_2, shuffle_1_table);
+            store_3 = acc_1_dot::<HAS_DOT>(bounds_start, src3, w0, store_3, shuffle_1_table);
             jx += 1;
         }
 
@@ -311,19 +362,44 @@ unsafe fn convolve_horizontal_rgba_vnni_rows_4_lb_impl(
     }
 }
 
-pub(crate) fn convolve_horizontal_rgba_vnni_u16lp_row(
+pub(crate) fn convolve_horizontal_rgba_avx_u16lp_row(
     src: &[u16],
     dst: &mut [u16],
     filter_weights: &FilterWeights<i16>,
     bit_depth: u32,
 ) {
     unsafe {
-        convolve_horizontal_rgba_vnni_u16_row_impl(src, dst, filter_weights, bit_depth);
+        #[cfg(feature = "nightly_avx512")]
+        if std::arch::is_x86_feature_detected!("avxvnni") {
+            return convolve_horizontal_rgba_avx_u16_row_vn(src, dst, filter_weights, bit_depth);
+        }
+        convolve_horizontal_rgba_avx_u16_row_avx(src, dst, filter_weights, bit_depth);
     }
 }
 
+#[cfg(feature = "nightly_avx512")]
 #[target_feature(enable = "avxvnni", enable = "avx2")]
-unsafe fn convolve_horizontal_rgba_vnni_u16_row_impl(
+unsafe fn convolve_horizontal_rgba_avx_u16_row_vn(
+    src: &[u16],
+    dst: &mut [u16],
+    filter_weights: &FilterWeights<i16>,
+    bit_depth: u32,
+) {
+    convolve_horizontal_rgba_vnni_u16_row_impl::<true>(src, dst, filter_weights, bit_depth);
+}
+
+#[target_feature(enable = "avx2")]
+unsafe fn convolve_horizontal_rgba_avx_u16_row_avx(
+    src: &[u16],
+    dst: &mut [u16],
+    filter_weights: &FilterWeights<i16>,
+    bit_depth: u32,
+) {
+    convolve_horizontal_rgba_vnni_u16_row_impl::<false>(src, dst, filter_weights, bit_depth);
+}
+
+#[inline(always)]
+unsafe fn convolve_horizontal_rgba_vnni_u16_row_impl<const HAS_DOT: bool>(
     src: &[u16],
     dst: &mut [u16],
     filter_weights: &FilterWeights<i16>,
@@ -387,7 +463,7 @@ unsafe fn convolve_horizontal_rgba_vnni_u16_row_impl(
             a_shuffle_weights_table,
         );
         let bounds_start = bounds.start + jx;
-        store = acc_8_dot(bounds_start, src, w0, w1, store, a_shuffle_2_table);
+        store = acc_8_dot::<HAS_DOT>(bounds_start, src, w0, w1, store, a_shuffle_2_table);
         jx += 8;
     }
 
@@ -401,7 +477,7 @@ unsafe fn convolve_horizontal_rgba_vnni_u16_row_impl(
             a_shuffle_weights_table,
         );
         let bounds_start = bounds.start + jx;
-        store = acc_4_dot(bounds_start, src, w0, store, a_shuffle_2_table);
+        store = acc_4_dot::<HAS_DOT>(bounds_start, src, w0, store, a_shuffle_2_table);
         jx += 4;
     }
 
@@ -417,7 +493,7 @@ unsafe fn convolve_horizontal_rgba_vnni_u16_row_impl(
             _mm_loadu_si32(w_ptr.as_ptr() as *const _),
             shuffle_weights_table,
         );
-        store = acc_2_dot(bounds_start, src, w0, store, shuffle_2_table);
+        store = acc_2_dot::<HAS_DOT>(bounds_start, src, w0, store, shuffle_2_table);
         jx += 2;
     }
 
@@ -425,7 +501,7 @@ unsafe fn convolve_horizontal_rgba_vnni_u16_row_impl(
         let w_ptr = weights.get_unchecked(jx..(jx + 1));
         let w0 = _mm_shuffle_epi8(_mm_set1_epi16(w_ptr[0]), shuffle_weights_table);
         let bounds_start = bounds.start + jx;
-        store = acc_1_dot(bounds_start, src, w0, store, shuffle_1_table);
+        store = acc_1_dot::<HAS_DOT>(bounds_start, src, w0, store, shuffle_1_table);
         jx += 1;
     }
 
diff --git a/src/avx2/utils.rs b/src/avx2/utils.rs
index e2b9ebc..761484f 100644
--- a/src/avx2/utils.rs
+++ b/src/avx2/utils.rs
@@ -374,6 +374,27 @@ pub(crate) unsafe fn _mm256_dot16_avx_epi32<const HAS_DOT: bool>(
     }
 }
 
+#[allow(dead_code)]
+#[inline(always)]
+pub(crate) unsafe fn _mm_dot16_avx_epi32<const HAS_DOT: bool>(
+    a: __m128i,
+    b: __m128i,
+    c: __m128i,
+) -> __m128i {
+    #[cfg(feature = "nightly_avx512")]
+    {
+        if HAS_DOT {
+            _mm_dpwssd_avx_epi32(a, b, c)
+        } else {
+            _mm_add_epi32(a, _mm_madd_epi16(b, c))
+        }
+    }
+    #[cfg(not(feature = "nightly_avx512"))]
+    {
+        _mm_add_epi32(a, _mm_madd_epi16(b, c))
+    }
+}
+
 #[allow(dead_code)]
 #[inline(always)]
 pub(crate) unsafe fn _mm_udot8_epi16(
diff --git a/src/avx2/vertical_u16_lb.rs b/src/avx2/vertical_u16_lb.rs
index 72ac2c3..6319eb0 100644
--- a/src/avx2/vertical_u16_lb.rs
+++ b/src/avx2/vertical_u16_lb.rs
@@ -26,9 +26,8 @@
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
-use crate::avx2::utils::{_mm256_dot16_avx_epi32, avx2_pack_u32};
+use crate::avx2::utils::{_mm256_dot16_avx_epi32, _mm_dot16_avx_epi32, avx2_pack_u32};
 use crate::filter_weights::FilterBounds;
-use crate::sse::_mm_dot16_avx_epi32;
 use crate::support::{PRECISION, ROUNDING_CONST};
 #[cfg(target_arch = "x86")]
 use std::arch::x86::*;
diff --git a/src/handler_provider.rs b/src/handler_provider.rs
index e790204..a960df0 100644
--- a/src/handler_provider.rs
+++ b/src/handler_provider.rs
@@ -423,10 +423,9 @@ impl RowHandlerFixedPoint for u16 {
         i16: AsPrimitive,
         u16: AsPrimitive,
     {
-        #[cfg(feature = "nightly_avx512")]
-        if COMPONENTS == 4 && std::arch::is_x86_feature_detected!("avxvnni") {
-            use crate::avx512::convolve_horizontal_rgba_vnni_rows_4_u16;
-            return convolve_horizontal_rgba_vnni_rows_4_u16(
+        if COMPONENTS == 4 && std::arch::is_x86_feature_detected!("avx2") {
+            use crate::avx2::convolve_horizontal_rgba_avx_rows_4_u16;
+            return convolve_horizontal_rgba_avx_rows_4_u16(
                 src,
                 src_stride,
                 dst,
@@ -531,10 +530,9 @@ impl RowHandlerFixedPoint for u16 {
         i16: AsPrimitive,
         u16: AsPrimitive,
     {
-        #[cfg(feature = "nightly_avx512")]
-        if COMPONENTS == 4 && std::arch::is_x86_feature_detected!("avxvnni") {
-            use crate::avx512::convolve_horizontal_rgba_vnni_u16lp_row;
-            return convolve_horizontal_rgba_vnni_u16lp_row(src, dst, filter_weights, bit_depth);
+        if COMPONENTS == 4 && std::arch::is_x86_feature_detected!("avx2") {
+            use crate::avx2::convolve_horizontal_rgba_avx_u16lp_row;
+            return convolve_horizontal_rgba_avx_u16lp_row(src, dst, filter_weights, bit_depth);
         }
         if COMPONENTS == 4 && std::arch::is_x86_feature_detected!("sse4.1") {
             convolve_horizontal_rgba_sse_u16_lb_row(src, dst, filter_weights, bit_depth);
diff --git a/src/sse/utils.rs b/src/sse/utils.rs
index 1a98903..d22df09 100644
--- a/src/sse/utils.rs
+++ b/src/sse/utils.rs
@@ -280,26 +280,6 @@ pub(crate) unsafe fn _mm_store3_u16(ptr: *mut u16, a: __m128i) {
         .write_unaligned(_mm_extract_epi16::<2>(a) as i16);
 }
 
-#[inline(always)]
-pub(crate) unsafe fn _mm_dot16_avx_epi32<const HAS_DOT: bool>(
-    a: __m128i,
-    b: __m128i,
-    c: __m128i,
-) -> __m128i {
-    #[cfg(feature = "nightly_avx512")]
-    {
-        if HAS_DOT {
-            _mm_dpwssd_avx_epi32(a, b, c)
-        } else {
-            _mm_add_epi32(a, _mm_madd_epi16(b, c))
-        }
-    }
-    #[cfg(not(feature = "nightly_avx512"))]
-    {
-        _mm_add_epi32(a, _mm_madd_epi16(b, c))
-    }
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;