diff --git a/Cargo.toml b/Cargo.toml index 12abb08..b8a12a2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,4 +27,5 @@ libc = "0.2.158" [features] default = ["colorspaces"] -colorspaces = ["dep:colorutils-rs"] \ No newline at end of file +colorspaces = ["dep:colorutils-rs"] +nightly_avx512 = [] \ No newline at end of file diff --git a/README.md b/README.md index 5585736..a9ff1db 100644 --- a/README.md +++ b/README.md @@ -44,25 +44,27 @@ Despite all implementation are fast, not all the paths are implemented using SIM `~` - Partially implemented -| | NEON | SSE | AVX2 | WASM | -|----------------|------|-----|------|------| -| RGBA (8 bit) | x | x | x | ~ | -| RGB (8 bit) | x | x | ~ | ~ | -| Plane (8 bit) | x | x | ~ | ~ | -| RGBA (8+ bit) | x | x | ~ | - | -| RGB (8+ bit) | x | x | ~ | - | -| Plane (8+ bit) | ~ | ~ | ~ | - | -| RGBA (f32) | x | x | x | - | -| RGB (f32) | x | x | ~ | - | -| Plane (f32) | x | x | ~ | - | -| RGBA (f16) | x | x | x | - | -| RGB (f16) | x | ~ | ~ | - | -| Plane (f16) | ~ | ~ | ~ | - | -| AR30/RA30 | x | - | - | - | +| | NEON | SSE | AVX2 | AVX-512 | WASM | +|----------------|------|-----|------|---------|------| +| RGBA (8 bit) | x | x | x | ~ | ~ | +| RGB (8 bit) | x | x | ~ | ~ | ~ | +| Plane (8 bit) | x | x | ~ | ~ | ~ | +| RGBA (8+ bit) | x | x | ~ | - | - | +| RGB (8+ bit) | x | x | ~ | - | - | +| Plane (8+ bit) | ~ | ~ | ~ | - | - | +| RGBA (f32) | x | x | x | - | - | +| RGB (f32) | x | x | ~ | - | - | +| Plane (f32) | x | x | ~ | - | - | +| RGBA (f16) | x | x | x | - | - | +| RGB (f16) | x | ~ | ~ | - | - | +| Plane (f16) | ~ | ~ | ~ | - | - | +| AR30/RA30 | x | - | - | - | - | #### Features -To enable support of `f16` the feature `half` should be activated. +Features: + - To enable `f16` support, activate the `half` feature. + - `nightly_avx512` enables the AVX-512 code paths and requires the `nightly` compiler channel #### Target features with runtime dispatch @@ -72,6 +74,8 @@ For x86 and aarch64 NEON runtime dispatch is used. `avx2`, `fma`, `sse4.1`, `f16c` will be detected automatically if available, no additional actions need, and called the best path. +AVX-512 support requires the `nightly_avx512` feature and the `nightly` compiler channel; availability is detected at runtime, and the AVX-512 paths are used when present. + `fullfp16` NEON target detection performed in runtime, when available best the best paths for *f16* images are available on ARM. WASM `simd128` target feature activating is mandatory in build flags.
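Reviewer note: a minimal sketch of how the new `nightly_avx512` feature and the runtime checks added in this PR fit together. The helper name `use_avx512_paths` is illustrative only; the crate's dispatchers in `alpha_handle_u8.rs` and the `*_u8.rs` convolution passes perform the equivalent checks inline.

```rust
// Illustrative sketch (not the crate's public API) of the gating this PR adds:
// the AVX-512 kernels are compiled only when the `nightly_avx512` feature is
// enabled (which needs a nightly toolchain) and are selected at runtime only
// if the CPU reports both AVX-512F and AVX-512BW.
fn use_avx512_paths() -> bool {
    #[cfg(all(
        any(target_arch = "x86_64", target_arch = "x86"),
        feature = "nightly_avx512"
    ))]
    {
        if std::arch::is_x86_feature_detected!("avx512f")
            && std::arch::is_x86_feature_detected!("avx512bw")
        {
            return true;
        }
    }
    false
}

fn main() {
    println!("AVX-512 paths would be used: {}", use_avx512_paths());
}
```

A consumer opts in the way the `app/Cargo.toml` hunk below does and builds with a nightly toolchain, e.g. `cargo +nightly build --features nightly_avx512`.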
diff --git a/app/Cargo.toml b/app/Cargo.toml index 1c09c6d..71f0b9f 100644 --- a/app/Cargo.toml +++ b/app/Cargo.toml @@ -6,7 +6,7 @@ edition = "2021" [dependencies] image = { version = "0.25.5", features = ["default"] } #image = { path= "../../../RustroverProjects/image", features = ["default", "avif", "avif-native"] } -pic-scale = { path = "..", features = ["half"], default-features = true } +pic-scale = { path = "..", features = ["half", "nightly_avx512"], default-features = true } fast_image_resize = { version = "5.0.0", features = [] } half = { version = "2.4.1", default-features = true } diff --git a/app/src/main.rs b/app/src/main.rs index 25367ac..3f73175 100644 --- a/app/src/main.rs +++ b/app/src/main.rs @@ -42,7 +42,7 @@ fn resize_plane( fn main() { // test_fast_image(); - let img = ImageReader::open("./assets/asset_4.png") + let img = ImageReader::open("./assets/nasa-4928x3279-rgba.png") .unwrap() .decode() .unwrap(); @@ -59,7 +59,7 @@ fn main() { // let store = - ImageStore::::from_slice(&choke, dimensions.0 as usize, dimensions.1 as usize) + ImageStore::::from_slice(&bytes, dimensions.0 as usize, dimensions.1 as usize) .unwrap(); let dst_size = ImageSize::new(dimensions.0 as usize / 4, dimensions.1 as usize / 4); @@ -75,15 +75,13 @@ fn main() { // ) // .unwrap(); - let mut dst_store = ImageStoreMut::::alloc_with_depth( + let mut dst_store = ImageStoreMut::::alloc_with_depth( dimensions.0 as usize, dimensions.1 as usize / 2, 10, ); - scaler - .resize_rgba_u16(&store, &mut dst_store, true) - .unwrap(); + scaler.resize_rgba(&store, &mut dst_store, true).unwrap(); let elapsed_time = start_time.elapsed(); // Print the elapsed time in milliseconds @@ -160,13 +158,13 @@ fn main() { // .map(|&x| (x * 255f32) as u8) // .collect(); - let dst: Vec = dst_store - .as_bytes() - .iter() - .map(|&x| (x >> 2) as u8) - .collect(); + // let dst: Vec = dst_store + // .as_bytes() + // .iter() + // .map(|&x| (x >> 2) as u8) + // .collect(); - // let dst = dst_store.as_bytes(); + let dst = dst_store.as_bytes(); // let dst = resized; // image::save_buffer( // "converted.png", diff --git a/src/alpha_handle_u8.rs b/src/alpha_handle_u8.rs index c77e792..3c15e0d 100644 --- a/src/alpha_handle_u8.rs +++ b/src/alpha_handle_u8.rs @@ -137,6 +137,13 @@ pub(crate) fn premultiply_alpha_rgba( if is_x86_feature_detected!("avx2") { _dispatcher = avx_premultiply_alpha_rgba; } + #[cfg(feature = "nightly_avx512")] + if std::arch::is_x86_feature_detected!("avx512f") + && std::arch::is_x86_feature_detected!("avx512bw") + { + use crate::avx512::avx512_premultiply_alpha_rgba; + _dispatcher = avx512_premultiply_alpha_rgba; + } } #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] { @@ -159,15 +166,22 @@ pub(crate) fn unpremultiply_alpha_rgba( } #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] { - if is_x86_feature_detected!("sse4.1") { + if std::arch::is_x86_feature_detected!("sse4.1") { _dispatcher = sse_unpremultiply_alpha_rgba; } } #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] { - if is_x86_feature_detected!("avx2") { + if std::arch::is_x86_feature_detected!("avx2") { _dispatcher = avx_unpremultiply_alpha_rgba; } + #[cfg(feature = "nightly_avx512")] + if std::arch::is_x86_feature_detected!("avx512f") + && std::arch::is_x86_feature_detected!("avx512bw") + { + use crate::avx512::avx512_unpremultiply_alpha_rgba; + _dispatcher = avx512_unpremultiply_alpha_rgba; + } } #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] { diff --git a/src/avx2/alpha_u16.rs b/src/avx2/alpha_u16.rs index 
f2f9a06..c16fdb3 100644 --- a/src/avx2/alpha_u16.rs +++ b/src/avx2/alpha_u16.rs @@ -45,8 +45,8 @@ unsafe fn _mm256_scale_by_alpha(px: __m256i, low_low_a: __m256, low_high_a: __m2 let low_px = _mm256_cvtepi32_ps(_mm256_unpacklo_epi16(px, zeros)); let high_px = _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(px, zeros)); - let new_ll = _mm256_cvtps_epi32(_mm256_round_ps::<0x02>(_mm256_mul_ps(low_px, low_low_a))); - let new_lh = _mm256_cvtps_epi32(_mm256_round_ps::<0x02>(_mm256_mul_ps(high_px, low_high_a))); + let new_ll = _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(_mm256_mul_ps(low_px, low_low_a))); + let new_lh = _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(_mm256_mul_ps(high_px, low_high_a))); _mm256_packus_epi32(new_ll, new_lh) } @@ -110,8 +110,7 @@ trait Avx2PremultiplyExecutor { struct Avx2PremultiplyExecutorDefault {} impl Avx2PremultiplyExecutorDefault { - #[inline] - #[target_feature(enable = "avx2")] + #[inline(always)] unsafe fn premultiply_chunk(&self, dst: &mut [u16], src: &[u16]) { let src_ptr = src.as_ptr(); let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i); @@ -203,8 +202,7 @@ impl Avx2PremultiplyExecutor for Avx2PremultiplyExecutor struct Avx2PremultiplyExecutorAnyBit {} impl Avx2PremultiplyExecutorAnyBit { - #[inline] - #[target_feature(enable = "avx2")] + #[inline(always)] unsafe fn premultiply_chunk(&self, dst: &mut [u16], src: &[u16], scale: __m256) { let src_ptr = src.as_ptr(); let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i); diff --git a/src/avx2/alpha_u8.rs b/src/avx2/alpha_u8.rs index 4ecff54..2755d5d 100644 --- a/src/avx2/alpha_u8.rs +++ b/src/avx2/alpha_u8.rs @@ -27,13 +27,9 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -use crate::alpha_handle_u8::{premultiply_alpha_rgba_row_impl, unpremultiply_alpha_rgba_row_impl}; use crate::avx2::utils::{ _mm256_select_si256, avx2_deinterleave_rgba, avx2_div_by255, avx2_interleave_rgba, }; -use crate::sse::{ - _mm_div_by_255_epi16, sse_deinterleave_rgba, sse_interleave_rgba, sse_unpremultiply_row, -}; use rayon::iter::{IndexedParallelIterator, ParallelIterator}; use rayon::prelude::{ParallelSlice, ParallelSliceMut}; use rayon::ThreadPool; @@ -42,57 +38,6 @@ use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; -#[inline(always)] -unsafe fn avx2_unpremultiply_row(x: __m256i, a: __m256i) -> __m256i { - let zeros = _mm256_setzero_si256(); - let lo = _mm256_unpacklo_epi8(x, zeros); - let hi = _mm256_unpackhi_epi8(x, zeros); - - let scale = _mm256_set1_epi16(255); - - let is_zero_mask = _mm256_cmpeq_epi8(a, zeros); - let a = _mm256_select_si256(is_zero_mask, scale, a); - - let scale_ps = _mm256_set1_ps(255f32); - - let lo_lo = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_unpacklo_epi16(lo, zeros)), - scale_ps, - ); - let lo_hi = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(lo, zeros)), - scale_ps, - ); - let hi_lo = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_unpacklo_epi16(hi, zeros)), - scale_ps, - ); - let hi_hi = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(hi, zeros)), - scale_ps, - ); - let a_lo = _mm256_unpacklo_epi8(a, zeros); - let a_hi = _mm256_unpackhi_epi8(x, zeros); - let a_lo_lo = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpacklo_epi16(a_lo, zeros))); - let a_lo_hi = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpackhi_epi16(a_lo, zeros))); - let a_hi_lo = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpacklo_epi16(a_hi, zeros))); - let a_hi_hi = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpackhi_epi16(a_hi, zeros))); - - let lo_lo = 
_mm256_cvtps_epi32(_mm256_mul_ps(lo_lo, a_lo_lo)); - let lo_hi = _mm256_cvtps_epi32(_mm256_mul_ps(lo_hi, a_lo_hi)); - let hi_lo = _mm256_cvtps_epi32(_mm256_mul_ps(hi_lo, a_hi_lo)); - let hi_hi = _mm256_cvtps_epi32(_mm256_mul_ps(hi_hi, a_hi_hi)); - - _mm256_select_si256( - is_zero_mask, - zeros, - _mm256_packus_epi16( - _mm256_packus_epi32(lo_lo, lo_hi), - _mm256_packus_epi32(hi_lo, hi_hi), - ), - ) -} - pub(crate) fn avx_premultiply_alpha_rgba( dst: &mut [u8], src: &[u8], @@ -105,15 +50,16 @@ pub(crate) fn avx_premultiply_alpha_rgba( } } -#[target_feature(enable = "avx2")] -unsafe fn avx_premultiply_alpha_rgba_impl_row(dst: &mut [u8], src: &[u8]) { - let mut rem = dst; - let mut src_rem = src; - - for (dst, src) in rem - .chunks_exact_mut(32 * 4) - .zip(src_rem.chunks_exact(32 * 4)) - { +trait AssociateAlpha { + unsafe fn associate(&self, dst: &mut [u8], src: &[u8]); +} + +#[derive(Default)] +struct AssociateAlphaDefault {} + +impl AssociateAlphaDefault { + #[inline(always)] + unsafe fn associate_chunk(&self, dst: &mut [u8], src: &[u8]) { let src_ptr = src.as_ptr(); let rgba0 = _mm256_loadu_si256(src_ptr as *const __m256i); let rgba1 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); @@ -153,58 +99,48 @@ unsafe fn avx_premultiply_alpha_rgba_impl_row(dst: &mut [u8], src: &[u8]) { _mm256_storeu_si256(dst_ptr.add(64) as *mut __m256i, rgba2); _mm256_storeu_si256(dst_ptr.add(96) as *mut __m256i, rgba3); } +} - rem = rem.chunks_exact_mut(32 * 4).into_remainder(); - src_rem = src_rem.chunks_exact(32 * 4).remainder(); - - let zeros = _mm_setzero_si128(); - for (dst, src) in rem - .chunks_exact_mut(16 * 4) - .zip(src_rem.chunks_exact(16 * 4)) - { - let src_ptr = src.as_ptr(); - let rgba0 = _mm_loadu_si128(src_ptr as *const __m128i); - let rgba1 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); - let rgba2 = _mm_loadu_si128(src_ptr.add(32) as *const __m128i); - let rgba3 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i); - let (rrr, ggg, bbb, aaa) = sse_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); - - let mut rrr_low = _mm_unpacklo_epi8(rrr, zeros); - let mut rrr_high = _mm_unpackhi_epi8(rrr, zeros); - - let mut ggg_low = _mm_unpacklo_epi8(ggg, zeros); - let mut ggg_high = _mm_unpackhi_epi8(ggg, zeros); +impl AssociateAlpha for AssociateAlphaDefault { + #[target_feature(enable = "avx2")] + unsafe fn associate(&self, dst: &mut [u8], src: &[u8]) { + let mut rem = dst; + let mut src_rem = src; - let mut bbb_low = _mm_unpacklo_epi8(bbb, zeros); - let mut bbb_high = _mm_unpackhi_epi8(bbb, zeros); + for (dst, src) in rem + .chunks_exact_mut(32 * 4) + .zip(src_rem.chunks_exact(32 * 4)) + { + self.associate_chunk(dst, src); + } - let aaa_low = _mm_unpacklo_epi8(aaa, zeros); - let aaa_high = _mm_unpackhi_epi8(aaa, zeros); + rem = rem.chunks_exact_mut(32 * 4).into_remainder(); + src_rem = src_rem.chunks_exact(32 * 4).remainder(); - rrr_low = _mm_div_by_255_epi16(_mm_mullo_epi16(rrr_low, aaa_low)); - rrr_high = _mm_div_by_255_epi16(_mm_mullo_epi16(rrr_high, aaa_high)); - ggg_low = _mm_div_by_255_epi16(_mm_mullo_epi16(ggg_low, aaa_low)); - ggg_high = _mm_div_by_255_epi16(_mm_mullo_epi16(ggg_high, aaa_high)); - bbb_low = _mm_div_by_255_epi16(_mm_mullo_epi16(bbb_low, aaa_low)); - bbb_high = _mm_div_by_255_epi16(_mm_mullo_epi16(bbb_high, aaa_high)); + if !rem.is_empty() { + const PART_SIZE: usize = 32 * 4; + assert!(src_rem.len() < PART_SIZE); + assert!(rem.len() < PART_SIZE); + assert_eq!(src_rem.len(), rem.len()); - let rrr = _mm_packus_epi16(rrr_low, rrr_high); - let ggg = 
_mm_packus_epi16(ggg_low, ggg_high); - let bbb = _mm_packus_epi16(bbb_low, bbb_high); + let mut buffer: [u8; PART_SIZE] = [0u8; PART_SIZE]; + let mut dst_buffer: [u8; PART_SIZE] = [0u8; PART_SIZE]; + std::ptr::copy_nonoverlapping(src_rem.as_ptr(), buffer.as_mut_ptr(), src_rem.len()); - let (rgba0, rgba1, rgba2, rgba3) = sse_interleave_rgba(rrr, ggg, bbb, aaa); + self.associate_chunk(&mut dst_buffer, &buffer); - let dst_ptr = dst.as_mut_ptr(); - _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba1); - _mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, rgba2); - _mm_storeu_si128(dst_ptr.add(48) as *mut __m128i, rgba3); + std::ptr::copy_nonoverlapping(dst_buffer.as_ptr(), rem.as_mut_ptr(), rem.len()); + } } +} - rem = rem.chunks_exact_mut(16 * 4).into_remainder(); - src_rem = src_rem.chunks_exact(16 * 4).remainder(); - - premultiply_alpha_rgba_row_impl(rem, src_rem); +#[target_feature(enable = "avx2")] +unsafe fn avx_premultiply_alpha_rgba_impl_row( + dst: &mut [u8], + src: &[u8], + executor: impl AssociateAlpha, +) { + executor.associate(dst, src); } #[target_feature(enable = "avx2")] @@ -220,14 +156,14 @@ unsafe fn avx_premultiply_alpha_rgba_impl( dst.par_chunks_exact_mut(width * 4) .zip(src.par_chunks_exact(width * 4)) .for_each(|(dst, src)| unsafe { - avx_premultiply_alpha_rgba_impl_row(dst, src); + avx_premultiply_alpha_rgba_impl_row(dst, src, AssociateAlphaDefault::default()); }); }); } else { dst.chunks_exact_mut(width * 4) .zip(src.chunks_exact(width * 4)) .for_each(|(dst, src)| unsafe { - avx_premultiply_alpha_rgba_impl_row(dst, src); + avx_premultiply_alpha_rgba_impl_row(dst, src, AssociateAlphaDefault::default()); }); } } @@ -243,57 +179,117 @@ pub(crate) fn avx_unpremultiply_alpha_rgba( } } -#[target_feature(enable = "avx2")] -unsafe fn avx_unpremultiply_alpha_rgba_impl_row(in_place: &mut [u8]) { - let mut rem = in_place; +trait DisassociateAlpha { + unsafe fn disassociate(&self, in_place: &mut [u8]); +} - for dst in rem.chunks_exact_mut(32 * 4) { - let src_ptr = dst.as_ptr(); +#[derive(Default)] +struct Avx2DisassociateAlpha {} + +impl Avx2DisassociateAlpha { + #[inline(always)] + unsafe fn avx2_unpremultiply_row(&self, x: __m256i, a: __m256i) -> __m256i { + let zeros = _mm256_setzero_si256(); + let lo = _mm256_unpacklo_epi8(x, zeros); + let hi = _mm256_unpackhi_epi8(x, zeros); + + let is_zero_mask = _mm256_cmpeq_epi8(a, zeros); + + let scale_ps = _mm256_set1_ps(255f32); + + let lo_lo = _mm256_mul_ps( + _mm256_cvtepi32_ps(_mm256_unpacklo_epi16(lo, zeros)), + scale_ps, + ); + let lo_hi = _mm256_mul_ps( + _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(lo, zeros)), + scale_ps, + ); + let hi_lo = _mm256_mul_ps( + _mm256_cvtepi32_ps(_mm256_unpacklo_epi16(hi, zeros)), + scale_ps, + ); + let hi_hi = _mm256_mul_ps( + _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(hi, zeros)), + scale_ps, + ); + let a_lo = _mm256_unpacklo_epi8(a, zeros); + let a_hi = _mm256_unpackhi_epi8(a, zeros); + let a_lo_lo = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpacklo_epi16(a_lo, zeros))); + let a_lo_hi = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpackhi_epi16(a_lo, zeros))); + let a_hi_lo = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpacklo_epi16(a_hi, zeros))); + let a_hi_hi = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpackhi_epi16(a_hi, zeros))); + + let lo_lo = _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(_mm256_mul_ps(lo_lo, a_lo_lo))); + let lo_hi = _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(_mm256_mul_ps(lo_hi, a_lo_hi))); + let hi_lo = 
_mm256_cvtps_epi32(_mm256_round_ps::<0x00>(_mm256_mul_ps(hi_lo, a_hi_lo))); + let hi_hi = _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(_mm256_mul_ps(hi_hi, a_hi_hi))); + + _mm256_select_si256( + is_zero_mask, + zeros, + _mm256_packus_epi16( + _mm256_packus_epi32(lo_lo, lo_hi), + _mm256_packus_epi32(hi_lo, hi_hi), + ), + ) + } + + #[inline(always)] + unsafe fn disassociate_chunk(&self, in_place: &mut [u8]) { + let src_ptr = in_place.as_ptr(); let rgba0 = _mm256_loadu_si256(src_ptr as *const __m256i); let rgba1 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); let rgba2 = _mm256_loadu_si256(src_ptr.add(64) as *const __m256i); let rgba3 = _mm256_loadu_si256(src_ptr.add(96) as *const __m256i); let (rrr, ggg, bbb, aaa) = avx2_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); - let rrr = avx2_unpremultiply_row(rrr, aaa); - let ggg = avx2_unpremultiply_row(ggg, aaa); - let bbb = avx2_unpremultiply_row(bbb, aaa); + let rrr = self.avx2_unpremultiply_row(rrr, aaa); + let ggg = self.avx2_unpremultiply_row(ggg, aaa); + let bbb = self.avx2_unpremultiply_row(bbb, aaa); let (rgba0, rgba1, rgba2, rgba3) = avx2_interleave_rgba(rrr, ggg, bbb, aaa); - let dst_ptr = dst.as_mut_ptr(); + let dst_ptr = in_place.as_mut_ptr(); _mm256_storeu_si256(dst_ptr as *mut __m256i, rgba0); _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, rgba1); _mm256_storeu_si256(dst_ptr.add(64) as *mut __m256i, rgba2); _mm256_storeu_si256(dst_ptr.add(96) as *mut __m256i, rgba3); } +} - rem = rem.chunks_exact_mut(32 * 4).into_remainder(); +impl DisassociateAlpha for Avx2DisassociateAlpha { + #[target_feature(enable = "avx2")] + unsafe fn disassociate(&self, in_place: &mut [u8]) { + let mut rem = in_place; - for dst in rem.chunks_exact_mut(16 * 4) { - let src_ptr = dst.as_ptr(); - let rgba0 = _mm_loadu_si128(src_ptr as *const __m128i); - let rgba1 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); - let rgba2 = _mm_loadu_si128(src_ptr.add(32) as *const __m128i); - let rgba3 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i); - let (rrr, ggg, bbb, aaa) = sse_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); + for dst in rem.chunks_exact_mut(32 * 4) { + self.disassociate_chunk(dst); + } - let rrr = sse_unpremultiply_row(rrr, aaa); - let ggg = sse_unpremultiply_row(ggg, aaa); - let bbb = sse_unpremultiply_row(bbb, aaa); + rem = rem.chunks_exact_mut(32 * 4).into_remainder(); - let (rgba0, rgba1, rgba2, rgba3) = sse_interleave_rgba(rrr, ggg, bbb, aaa); + if !rem.is_empty() { + const PART_SIZE: usize = 32 * 4; + assert!(rem.len() < PART_SIZE); - let dst_ptr = dst.as_mut_ptr(); - _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba1); - _mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, rgba2); - _mm_storeu_si128(dst_ptr.add(48) as *mut __m128i, rgba3); - } + let mut buffer: [u8; PART_SIZE] = [0u8; PART_SIZE]; + + std::ptr::copy_nonoverlapping(rem.as_ptr(), buffer.as_mut_ptr(), rem.len()); - rem = rem.chunks_exact_mut(16 * 4).into_remainder(); + self.disassociate_chunk(&mut buffer); - unpremultiply_alpha_rgba_row_impl(rem); + std::ptr::copy_nonoverlapping(buffer.as_ptr(), rem.as_mut_ptr(), rem.len()); + } + } +} + +#[target_feature(enable = "avx2")] +unsafe fn avx_unpremultiply_alpha_rgba_impl_row( + in_place: &mut [u8], + executor: impl DisassociateAlpha, +) { + executor.disassociate(in_place); } #[target_feature(enable = "avx2")] @@ -308,12 +304,12 @@ unsafe fn avx_unpremultiply_alpha_rgba_impl( in_place .par_chunks_exact_mut(width * 4) .for_each(|row| unsafe { - 
avx_unpremultiply_alpha_rgba_impl_row(row); + avx_unpremultiply_alpha_rgba_impl_row(row, Avx2DisassociateAlpha::default()); }); }); } else { in_place.chunks_exact_mut(width * 4).for_each(|row| unsafe { - avx_unpremultiply_alpha_rgba_impl_row(row); + avx_unpremultiply_alpha_rgba_impl_row(row, Avx2DisassociateAlpha::default()); }); } } diff --git a/src/avx512/alpha_u8.rs b/src/avx512/alpha_u8.rs new file mode 100644 index 0000000..94d23cb --- /dev/null +++ b/src/avx512/alpha_u8.rs @@ -0,0 +1,373 @@ +/* + * Copyright (c) Radzivon Bartoshyk 01/2025. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +use crate::avx512::utils::{avx512_deinterleave_rgba, avx512_div_by255, avx512_interleave_rgba}; +use rayon::iter::{IndexedParallelIterator, ParallelIterator}; +use rayon::prelude::{ParallelSlice, ParallelSliceMut}; +use rayon::ThreadPool; +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +trait AssociateAlpha { + unsafe fn associate(&self, dst: &mut [u8], src: &[u8]); +} + +#[derive(Default)] +struct AssociateAlphaDefault {} + +impl AssociateAlphaDefault { + #[inline(always)] + unsafe fn associate_chunk(&self, dst: &mut [u8], src: &[u8]) { + let src_ptr = src.as_ptr(); + let rgba0 = _mm512_loadu_si512(src_ptr as *const _); + let rgba1 = _mm512_loadu_si512(src_ptr.add(64) as *const _); + let rgba2 = _mm512_loadu_si512(src_ptr.add(128) as *const _); + let rgba3 = _mm512_loadu_si512(src_ptr.add(128 + 64) as *const _); + let (rrr, ggg, bbb, aaa) = avx512_deinterleave_rgba::(rgba0, rgba1, rgba2, rgba3); + + let zeros = _mm512_setzero_si512(); + + let mut rrr_low = _mm512_unpacklo_epi8(rrr, zeros); + let mut rrr_high = _mm512_unpackhi_epi8(rrr, zeros); + + let mut ggg_low = _mm512_unpacklo_epi8(ggg, zeros); + let mut ggg_high = _mm512_unpackhi_epi8(ggg, zeros); + + let mut bbb_low = _mm512_unpacklo_epi8(bbb, zeros); + let mut bbb_high = _mm512_unpackhi_epi8(bbb, zeros); + + let aaa_low = _mm512_unpacklo_epi8(aaa, zeros); + let aaa_high = _mm512_unpackhi_epi8(aaa, zeros); + + rrr_low = avx512_div_by255(_mm512_mullo_epi16(rrr_low, aaa_low)); + rrr_high = avx512_div_by255(_mm512_mullo_epi16(rrr_high, aaa_high)); + ggg_low = avx512_div_by255(_mm512_mullo_epi16(ggg_low, aaa_low)); + ggg_high = avx512_div_by255(_mm512_mullo_epi16(ggg_high, aaa_high)); + bbb_low = avx512_div_by255(_mm512_mullo_epi16(bbb_low, aaa_low)); + bbb_high = avx512_div_by255(_mm512_mullo_epi16(bbb_high, aaa_high)); + + let rrr = _mm512_packus_epi16(rrr_low, rrr_high); + let ggg = _mm512_packus_epi16(ggg_low, ggg_high); + let bbb = _mm512_packus_epi16(bbb_low, bbb_high); + + let (rgba0, rgba1, rgba2, rgba3) = avx512_interleave_rgba::(rrr, ggg, bbb, aaa); + let dst_ptr = dst.as_mut_ptr(); + _mm512_storeu_si512(dst_ptr as *mut _, rgba0); + _mm512_storeu_si512(dst_ptr.add(64) as *mut _, rgba1); + _mm512_storeu_si512(dst_ptr.add(128) as *mut _, rgba2); + _mm512_storeu_si512(dst_ptr.add(128 + 64) as *mut _, rgba3); + } +} + +impl AssociateAlpha for AssociateAlphaDefault { + #[target_feature(enable = "avx512f", enable = "avx512bw")] + unsafe fn associate(&self, dst: &mut [u8], src: &[u8]) { + let mut rem = dst; + let mut src_rem = src; + + for (dst, src) in rem + .chunks_exact_mut(64 * 4) + .zip(src_rem.chunks_exact(64 * 4)) + { + self.associate_chunk(dst, src); + } + + rem = rem.chunks_exact_mut(64 * 4).into_remainder(); + src_rem = src_rem.chunks_exact(64 * 4).remainder(); + + if !rem.is_empty() { + const PART_SIZE: usize = 64 * 4; + assert!(src_rem.len() < PART_SIZE); + assert!(rem.len() < PART_SIZE); + assert_eq!(src_rem.len(), rem.len()); + + let mut buffer: [u8; PART_SIZE] = [0u8; PART_SIZE]; + let mut dst_buffer: [u8; PART_SIZE] = [0u8; PART_SIZE]; + std::ptr::copy_nonoverlapping(src_rem.as_ptr(), buffer.as_mut_ptr(), src_rem.len()); + + self.associate_chunk(&mut dst_buffer, &buffer); + + std::ptr::copy_nonoverlapping(dst_buffer.as_ptr(), rem.as_mut_ptr(), rem.len()); + } + } +} + +impl AssociateAlpha for AssociateAlphaDefault { + #[target_feature(enable = "avx512f", enable = "avx512bw", enable = "avx512vbmi")] + unsafe fn associate(&self, dst: &mut [u8], src: 
&[u8]) { + let mut rem = dst; + let mut src_rem = src; + + for (dst, src) in rem + .chunks_exact_mut(64 * 4) + .zip(src_rem.chunks_exact(64 * 4)) + { + self.associate_chunk(dst, src); + } + + rem = rem.chunks_exact_mut(64 * 4).into_remainder(); + src_rem = src_rem.chunks_exact(64 * 4).remainder(); + + if !rem.is_empty() { + const PART_SIZE: usize = 64 * 4; + assert!(src_rem.len() < PART_SIZE); + assert!(rem.len() < PART_SIZE); + assert_eq!(src_rem.len(), rem.len()); + + let mut buffer: [u8; PART_SIZE] = [0u8; PART_SIZE]; + let mut dst_buffer: [u8; PART_SIZE] = [0u8; PART_SIZE]; + std::ptr::copy_nonoverlapping(src_rem.as_ptr(), buffer.as_mut_ptr(), src_rem.len()); + + self.associate_chunk(&mut dst_buffer, &buffer); + + std::ptr::copy_nonoverlapping(dst_buffer.as_ptr(), rem.as_mut_ptr(), rem.len()); + } + } +} + +fn avx_premultiply_alpha_rgba_impl_row(dst: &mut [u8], src: &[u8], executor: impl AssociateAlpha) { + unsafe { + executor.associate(dst, src); + } +} + +pub(crate) fn avx512_premultiply_alpha_rgba( + dst: &mut [u8], + src: &[u8], + width: usize, + _: usize, + pool: &Option, +) { + let has_vbmi = std::arch::is_x86_feature_detected!("avx512vbmi"); + if let Some(pool) = pool { + pool.install(|| { + dst.par_chunks_exact_mut(width * 4) + .zip(src.par_chunks_exact(width * 4)) + .for_each(|(dst, src)| { + if has_vbmi { + avx_premultiply_alpha_rgba_impl_row( + dst, + src, + AssociateAlphaDefault::::default(), + ); + } else { + avx_premultiply_alpha_rgba_impl_row( + dst, + src, + AssociateAlphaDefault::::default(), + ); + } + }); + }); + } else { + dst.chunks_exact_mut(width * 4) + .zip(src.chunks_exact(width * 4)) + .for_each(|(dst, src)| { + if has_vbmi { + avx_premultiply_alpha_rgba_impl_row( + dst, + src, + AssociateAlphaDefault::::default(), + ); + } else { + avx_premultiply_alpha_rgba_impl_row( + dst, + src, + AssociateAlphaDefault::::default(), + ); + } + }); + } +} + +trait DisassociateAlpha { + unsafe fn disassociate(&self, in_place: &mut [u8]); +} + +#[derive(Default)] +struct Avx512DisassociateAlpha {} + +impl Avx512DisassociateAlpha { + #[inline(always)] + unsafe fn avx512_unpremultiply_row(&self, x: __m512i, a: __m512i) -> __m512i { + let zeros = _mm512_setzero_si512(); + let lo = _mm512_unpacklo_epi8(x, zeros); + let hi = _mm512_unpackhi_epi8(x, zeros); + + let is_zero_mask = _mm512_cmp_epi8_mask::<0>(a, zeros); + + let scale_ps = _mm512_set1_ps(255f32); + + let lo_lo = _mm512_mul_ps( + _mm512_cvtepi32_ps(_mm512_unpacklo_epi16(lo, zeros)), + scale_ps, + ); + let lo_hi = _mm512_mul_ps( + _mm512_cvtepi32_ps(_mm512_unpackhi_epi16(lo, zeros)), + scale_ps, + ); + let hi_lo = _mm512_mul_ps( + _mm512_cvtepi32_ps(_mm512_unpacklo_epi16(hi, zeros)), + scale_ps, + ); + let hi_hi = _mm512_mul_ps( + _mm512_cvtepi32_ps(_mm512_unpackhi_epi16(hi, zeros)), + scale_ps, + ); + let a_lo = _mm512_unpacklo_epi8(a, zeros); + let a_hi = _mm512_unpackhi_epi8(a, zeros); + + let a_lo_lo = _mm512_rcp14_ps(_mm512_cvtepi32_ps(_mm512_unpacklo_epi16(a_lo, zeros))); + let a_lo_hi = _mm512_rcp14_ps(_mm512_cvtepi32_ps(_mm512_unpackhi_epi16(a_lo, zeros))); + let a_hi_lo = _mm512_rcp14_ps(_mm512_cvtepi32_ps(_mm512_unpacklo_epi16(a_hi, zeros))); + let a_hi_hi = _mm512_rcp14_ps(_mm512_cvtepi32_ps(_mm512_unpackhi_epi16(a_hi, zeros))); + + const FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC; + let lo_lo = _mm512_cvtps_epi32(_mm512_mul_round_ps::(lo_lo, a_lo_lo)); + let lo_hi = _mm512_cvtps_epi32(_mm512_mul_round_ps::(lo_hi, a_lo_hi)); + let hi_lo = _mm512_cvtps_epi32(_mm512_mul_round_ps::(hi_lo, 
a_hi_lo)); + let hi_hi = _mm512_cvtps_epi32(_mm512_mul_round_ps::(hi_hi, a_hi_hi)); + + let packed = _mm512_packus_epi16( + _mm512_packus_epi32(lo_lo, lo_hi), + _mm512_packus_epi32(hi_lo, hi_hi), + ); + _mm512_mask_blend_epi8(is_zero_mask, packed, _mm512_setzero_si512()) + } + + #[inline(always)] + unsafe fn disassociate_chunk(&self, in_place: &mut [u8]) { + let src_ptr = in_place.as_ptr(); + let rgba0 = _mm512_loadu_si512(src_ptr as *const _); + let rgba1 = _mm512_loadu_si512(src_ptr.add(64) as *const _); + let rgba2 = _mm512_loadu_si512(src_ptr.add(128) as *const _); + let rgba3 = _mm512_loadu_si512(src_ptr.add(64 + 128) as *const _); + let (rrr, ggg, bbb, aaa) = avx512_deinterleave_rgba::(rgba0, rgba1, rgba2, rgba3); + + let rrr = self.avx512_unpremultiply_row(rrr, aaa); + let ggg = self.avx512_unpremultiply_row(ggg, aaa); + let bbb = self.avx512_unpremultiply_row(bbb, aaa); + + let (rgba0, rgba1, rgba2, rgba3) = avx512_interleave_rgba::(rrr, ggg, bbb, aaa); + + let dst_ptr = in_place.as_mut_ptr(); + _mm512_storeu_si512(dst_ptr as *mut _, rgba0); + _mm512_storeu_si512(dst_ptr.add(64) as *mut _, rgba1); + _mm512_storeu_si512(dst_ptr.add(128) as *mut _, rgba2); + _mm512_storeu_si512(dst_ptr.add(128 + 64) as *mut _, rgba3); + } +} + +impl DisassociateAlpha for Avx512DisassociateAlpha { + #[target_feature(enable = "avx512f", enable = "avx512bw")] + unsafe fn disassociate(&self, in_place: &mut [u8]) { + let mut rem = in_place; + + for dst in rem.chunks_exact_mut(64 * 4) { + self.disassociate_chunk(dst); + } + + rem = rem.chunks_exact_mut(64 * 4).into_remainder(); + + if !rem.is_empty() { + const PART_SIZE: usize = 64 * 4; + assert!(rem.len() < PART_SIZE); + + let mut buffer: [u8; PART_SIZE] = [0u8; PART_SIZE]; + + std::ptr::copy_nonoverlapping(rem.as_ptr(), buffer.as_mut_ptr(), rem.len()); + + self.disassociate_chunk(&mut buffer); + + std::ptr::copy_nonoverlapping(buffer.as_ptr(), rem.as_mut_ptr(), rem.len()); + } + } +} + +impl DisassociateAlpha for Avx512DisassociateAlpha { + #[target_feature(enable = "avx512f", enable = "avx512bw", enable = "avx512vbmi")] + unsafe fn disassociate(&self, in_place: &mut [u8]) { + let mut rem = in_place; + + for dst in rem.chunks_exact_mut(64 * 4) { + self.disassociate_chunk(dst); + } + + rem = rem.chunks_exact_mut(64 * 4).into_remainder(); + + if !rem.is_empty() { + const PART_SIZE: usize = 64 * 4; + assert!(rem.len() < PART_SIZE); + + let mut buffer: [u8; PART_SIZE] = [0u8; PART_SIZE]; + + std::ptr::copy_nonoverlapping(rem.as_ptr(), buffer.as_mut_ptr(), rem.len()); + + self.disassociate_chunk(&mut buffer); + + std::ptr::copy_nonoverlapping(buffer.as_ptr(), rem.as_mut_ptr(), rem.len()); + } + } +} + +#[target_feature(enable = "avx512f", enable = "avx512bw")] +unsafe fn avx512_unp_row(in_place: &mut [u8], executor: impl DisassociateAlpha) { + executor.disassociate(in_place); +} +pub(crate) fn avx512_unpremultiply_alpha_rgba( + in_place: &mut [u8], + width: usize, + _: usize, + pool: &Option, +) { + let has_vbmi = std::arch::is_x86_feature_detected!("avx512vbmi"); + + if let Some(pool) = pool { + pool.install(|| { + in_place + .par_chunks_exact_mut(width * 4) + .for_each(|row| unsafe { + if has_vbmi { + avx512_unp_row(row, Avx512DisassociateAlpha::::default()); + } else { + avx512_unp_row(row, Avx512DisassociateAlpha::::default()); + } + }); + }); + } else { + in_place.chunks_exact_mut(width * 4).for_each(|row| unsafe { + if has_vbmi { + avx512_unp_row(row, Avx512DisassociateAlpha::::default()); + } else { + avx512_unp_row(row, 
Avx512DisassociateAlpha::::default()); + } + }); + } +} diff --git a/src/avx512/avx512_setr.rs b/src/avx512/avx512_setr.rs new file mode 100644 index 0000000..92336bc --- /dev/null +++ b/src/avx512/avx512_setr.rs @@ -0,0 +1,295 @@ +/* + * Copyright (c) Radzivon Bartoshyk, 10/2024. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#![allow(dead_code)] + +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +#[inline(always)] +pub(crate) unsafe fn _v512_set_epu32( + a15: i64, + a14: i64, + a13: i64, + a12: i64, + a11: i64, + a10: i64, + a9: i64, + a8: i64, + a7: i64, + a6: i64, + a5: i64, + a4: i64, + a3: i64, + a2: i64, + a1: i64, + a0: i64, +) -> __m512i { + _mm512_set_epi64( + ((a15) << 32) | (a14), + ((a13) << 32) | (a12), + ((a11) << 32) | (a10), + ((a9) << 32) | (a8), + ((a7) << 32) | (a6), + ((a5) << 32) | (a4), + ((a3) << 32) | (a2), + ((a1) << 32) | (a0), + ) +} + +#[inline(always)] +pub(crate) unsafe fn _v512_set_epu16( + a31: i64, + a30: i64, + a29: i64, + a28: i64, + a27: i64, + a26: i64, + a25: i64, + a24: i64, + a23: i64, + a22: i64, + a21: i64, + a20: i64, + a19: i64, + a18: i64, + a17: i64, + a16: i64, + a15: i64, + a14: i64, + a13: i64, + a12: i64, + a11: i64, + a10: i64, + a9: i64, + a8: i64, + a7: i64, + a6: i64, + a5: i64, + a4: i64, + a3: i64, + a2: i64, + a1: i64, + a0: i64, +) -> __m512i { + _v512_set_epu32( + ((a31) << 16) | (a30), + ((a29) << 16) | (a28), + ((a27) << 16) | (a26), + ((a25) << 16) | (a24), + ((a23) << 16) | (a22), + ((a21) << 16) | (a20), + ((a19) << 16) | (a18), + ((a17) << 16) | (a16), + ((a15) << 16) | (a14), + ((a13) << 16) | (a12), + ((a11) << 16) | (a10), + ((a9) << 16) | (a8), + ((a7) << 16) | (a6), + ((a5) << 16) | (a4), + ((a3) << 16) | (a2), + ((a1) << 16) | (a0), + ) +} + +#[inline(always)] +pub(crate) unsafe fn _v512_set_epu8( + a63: i64, + a62: i64, + a61: i64, + a60: i64, + a59: i64, + a58: i64, + a57: i64, + a56: i64, + a55: i64, + a54: i64, + a53: i64, + a52: i64, + a51: i64, + a50: i64, + a49: i64, + a48: i64, + a47: i64, + a46: i64, + a45: i64, + a44: i64, + a43: i64, + a42: i64, + a41: i64, + a40: i64, + a39: i64, + a38: i64, + a37: i64, + a36: i64, + a35: i64, + a34: i64, + a33: i64, + a32: i64, + a31: i64, + a30: i64, + a29: i64, + a28: i64, + a27: i64, + a26: i64, + a25: i64, + a24: i64, + a23: i64, + a22: i64, + a21: i64, + a20: i64, + a19: i64, + a18: i64, + a17: i64, + a16: i64, + a15: i64, + a14: i64, + a13: i64, + a12: i64, + a11: i64, + a10: i64, + a9: i64, + a8: i64, + a7: i64, + a6: i64, + a5: i64, + a4: i64, + a3: i64, + a2: i64, + a1: i64, + a0: i64, +) -> __m512i { + _v512_set_epu32( + ((a63) << 24) | ((a62) << 16) | ((a61) << 8) | (a60), + ((a59) << 24) | ((a58) << 16) | ((a57) << 8) | (a56), + ((a55) << 24) | ((a54) << 16) | ((a53) << 8) | (a52), + ((a51) << 24) | ((a50) << 16) | ((a49) << 8) | (a48), + ((a47) << 24) | ((a46) << 16) | ((a45) << 8) | (a44), + ((a43) << 24) | ((a42) << 16) | ((a41) << 8) | (a40), + ((a39) << 24) | ((a38) << 16) | ((a37) << 8) | (a36), + ((a35) << 24) | ((a34) << 16) | ((a33) << 8) | (a32), + ((a31) << 24) | ((a30) << 16) | ((a29) << 8) | (a28), + ((a27) << 24) | ((a26) << 16) | ((a25) << 8) | (a24), + ((a23) << 24) | ((a22) << 16) | (((a21) << 8) | (a20)), + ((a19) << 24) | ((a18) << 16) | ((a17) << 8) | (a16), + ((a15) << 24) | ((a14) << 16) | ((a13) << 8) | (a12), + ((a11) << 24) | ((a10) << 16) | ((a9) << 8) | (a8), + ((a7) << 24) | ((a6) << 16) | ((a5) << 8) | (a4), + ((a3) << 24) | ((a2) << 16) | ((a1) << 8) | (a0), + ) +} + +#[inline(always)] +pub(crate) unsafe fn _v512_setr_epu8( + a0: i64, + a1: i64, + a2: i64, + a3: i64, + a4: i64, + a5: i64, + a6: i64, + a7: i64, + a8: i64, + a9: i64, + a10: i64, + a11: i64, + a12: i64, + a13: i64, + a14: i64, + a15: i64, + a16: i64, + a17: i64, + a18: i64, + a19: i64, + a20: i64, + a21: i64, + a22: 
i64, + a23: i64, + a24: i64, + a25: i64, + a26: i64, + a27: i64, + a28: i64, + a29: i64, + a30: i64, + a31: i64, + a32: i64, + a33: i64, + a34: i64, + a35: i64, + a36: i64, + a37: i64, + a38: i64, + a39: i64, + a40: i64, + a41: i64, + a42: i64, + a43: i64, + a44: i64, + a45: i64, + a46: i64, + a47: i64, + a48: i64, + a49: i64, + a50: i64, + a51: i64, + a52: i64, + a53: i64, + a54: i64, + a55: i64, + a56: i64, + a57: i64, + a58: i64, + a59: i64, + a60: i64, + a61: i64, + a62: i64, + a63: i64, +) -> __m512i { + _v512_set_epu32( + ((a63) << 24) | ((a62) << 16) | ((a61) << 8) | (a60), + ((a59) << 24) | ((a58) << 16) | ((a57) << 8) | (a56), + ((a55) << 24) | ((a54) << 16) | ((a53) << 8) | (a52), + ((a51) << 24) | ((a50) << 16) | ((a49) << 8) | (a48), + ((a47) << 24) | ((a46) << 16) | ((a45) << 8) | (a44), + ((a43) << 24) | ((a42) << 16) | ((a41) << 8) | (a40), + ((a39) << 24) | ((a38) << 16) | ((a37) << 8) | (a36), + ((a35) << 24) | ((a34) << 16) | ((a33) << 8) | (a32), + ((a31) << 24) | ((a30) << 16) | ((a29) << 8) | (a28), + ((a27) << 24) | ((a26) << 16) | ((a25) << 8) | (a24), + ((a23) << 24) | ((a22) << 16) | (((a21) << 8) | (a20)), + ((a19) << 24) | ((a18) << 16) | ((a17) << 8) | (a16), + ((a15) << 24) | ((a14) << 16) | ((a13) << 8) | (a12), + ((a11) << 24) | ((a10) << 16) | ((a9) << 8) | (a8), + ((a7) << 24) | ((a6) << 16) | ((a5) << 8) | (a4), + ((a3) << 24) | ((a2) << 16) | ((a1) << 8) | (a0), + ) +} diff --git a/src/avx512/mod.rs b/src/avx512/mod.rs new file mode 100644 index 0000000..191b3f8 --- /dev/null +++ b/src/avx512/mod.rs @@ -0,0 +1,35 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +mod alpha_u8; +mod avx512_setr; +mod utils; +mod vertical_u8_lp; + +pub(crate) use alpha_u8::{avx512_premultiply_alpha_rgba, avx512_unpremultiply_alpha_rgba}; +pub(crate) use vertical_u8_lp::convolve_vertical_avx512_row_lp; diff --git a/src/avx512/utils.rs b/src/avx512/utils.rs new file mode 100644 index 0000000..87acf14 --- /dev/null +++ b/src/avx512/utils.rs @@ -0,0 +1,141 @@ +/* + * Copyright (c) Radzivon Bartoshyk 01/2025. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +use crate::avx512::avx512_setr::{_v512_set_epu32, _v512_set_epu8}; +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +#[inline(always)] +pub(crate) unsafe fn avx512_deinterleave_rgba( + bgra0: __m512i, + bgra1: __m512i, + bgra2: __m512i, + bgra3: __m512i, +) -> (__m512i, __m512i, __m512i, __m512i) { + if HAS_VBMI { + let mask0 = _v512_set_epu8( + 126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96, 94, 92, + 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64, 62, 60, 58, 56, 54, 52, 50, 48, + 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, + 0, + ); + let mask1 = _v512_set_epu8( + 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97, 95, 93, + 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, 63, 61, 59, 57, 55, 53, 51, 49, + 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, + 1, + ); + + let br01 = _mm512_permutex2var_epi8(bgra0, mask0, bgra1); + let ga01 = _mm512_permutex2var_epi8(bgra0, mask1, bgra1); + let br23 = _mm512_permutex2var_epi8(bgra2, mask0, bgra3); + let ga23 = _mm512_permutex2var_epi8(bgra2, mask1, bgra3); + + let a = _mm512_permutex2var_epi8(br01, mask0, br23); + let c = _mm512_permutex2var_epi8(br01, mask1, br23); + let b = _mm512_permutex2var_epi8(ga01, mask0, ga23); + let d = _mm512_permutex2var_epi8(ga01, mask1, ga23); + (a, b, c, d) + } else { + let mask = _mm512_set4_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400); + let b0g0r0a0 = _mm512_shuffle_epi8(bgra0, mask); + let b1g1r1a1 = _mm512_shuffle_epi8(bgra1, mask); + let b2g2r2a2 = _mm512_shuffle_epi8(bgra2, mask); + let b3g3r3a3 = _mm512_shuffle_epi8(bgra3, mask); + + let mask0 = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + let mask1 = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); + + let br01 = _mm512_permutex2var_epi32(b0g0r0a0, mask0, b1g1r1a1); + let ga01 = _mm512_permutex2var_epi32(b0g0r0a0, mask1, b1g1r1a1); + let br23 = _mm512_permutex2var_epi32(b2g2r2a2, mask0, b3g3r3a3); + let ga23 = _mm512_permutex2var_epi32(b2g2r2a2, mask1, b3g3r3a3); + + let a = _mm512_permutex2var_epi32(br01, mask0, br23); + let c = _mm512_permutex2var_epi32(br01, mask1, br23); + let b = _mm512_permutex2var_epi32(ga01, mask0, ga23); + let d = _mm512_permutex2var_epi32(ga01, mask1, ga23); + (a, b, c, d) + } +} + +#[inline(always)] +pub(crate) unsafe fn avx512_zip_epi8( + a: __m512i, + b: __m512i, +) -> (__m512i, __m512i) { + if HAS_VBMI { + let mask0 = _v512_set_epu8( + 95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, 87, 23, 86, 22, 85, 21, + 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, 79, 15, 78, 14, 77, 13, 76, 12, 75, 11, 74, 10, + 73, 9, 72, 8, 71, 7, 70, 6, 69, 5, 68, 4, 67, 3, 66, 2, 65, 1, 64, 0, + ); + let ab0 = _mm512_permutex2var_epi8(a, mask0, b); + let mask1 = _v512_set_epu8( + 127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56, 119, 55, 118, + 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48, 111, 47, 110, 46, 109, 45, + 108, 44, 107, 43, 106, 42, 105, 41, 104, 40, 103, 39, 102, 38, 101, 37, 100, 36, 99, + 35, 98, 34, 97, 33, 96, 32, + ); + let ab1 = _mm512_permutex2var_epi8(a, mask1, b); + (ab0, ab1) + } else { + let low = _mm512_unpacklo_epi8(a, b); + let high = _mm512_unpackhi_epi8(a, b); + let ab0 = _mm512_permutex2var_epi64(low, _mm512_set_epi64(11, 10, 3, 2, 9, 8, 1, 0), high); + let ab1 = + 
_mm512_permutex2var_epi64(low, _mm512_set_epi64(15, 14, 7, 6, 13, 12, 5, 4), high); + (ab0, ab1) + } +} + +#[inline(always)] +pub(crate) unsafe fn avx512_interleave_rgba( + a: __m512i, + b: __m512i, + c: __m512i, + d: __m512i, +) -> (__m512i, __m512i, __m512i, __m512i) { + let (br01, br23) = avx512_zip_epi8::(a, c); + let (ga01, ga23) = avx512_zip_epi8::(b, d); + let (bgra0, bgra1) = avx512_zip_epi8::(br01, ga01); + let (bgra2, bgra3) = avx512_zip_epi8::(br23, ga23); + (bgra0, bgra1, bgra2, bgra3) +} + +/// Exact division by 255 with rounding to nearest +#[inline(always)] +pub(crate) unsafe fn avx512_div_by255(v: __m512i) -> __m512i { + let addition = _mm512_set1_epi16(127); + _mm512_srli_epi16::<8>(_mm512_add_epi16( + _mm512_add_epi16(v, addition), + _mm512_srli_epi16::<8>(v), + )) +} diff --git a/src/avx512/vertical_u8_lp.rs b/src/avx512/vertical_u8_lp.rs new file mode 100644 index 0000000..4fa8b74 --- /dev/null +++ b/src/avx512/vertical_u8_lp.rs @@ -0,0 +1,393 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +use crate::avx2::utils::avx2_pack_u16; +use crate::filter_weights::FilterBounds; +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +pub(crate) fn convolve_vertical_avx512_row_lp( + dst_width: usize, + bounds: &FilterBounds, + src: &[u8], + dst: &mut [u8], + src_stride: usize, + weights: &[i16], +) { + unsafe { + convolve_vertical_avx2_row_impl(dst_width, bounds, src, dst, src_stride, weights); + } +} + +#[inline(always)] +unsafe fn m256dot( + store0: __m256i, + store1: __m256i, + row: __m256i, + weight: __m256i, +) -> (__m256i, __m256i) { + let lo = _mm256_unpacklo_epi8(row, row); + let hi = _mm256_unpackhi_epi8(row, row); + + let store0 = _mm256_add_epi16( + store0, + _mm256_mulhrs_epi16(_mm256_srli_epi16::<2>(lo), weight), + ); + let store1 = _mm256_add_epi16( + store1, + _mm256_mulhrs_epi16(_mm256_srli_epi16::<2>(hi), weight), + ); + (store0, store1) +} + +#[inline(always)] +unsafe fn m512dot( + store0: __m512i, + store1: __m512i, + row: __m512i, + weight: __m512i, +) -> (__m512i, __m512i) { + let lo = _mm512_unpacklo_epi8(row, row); + let hi = _mm512_unpackhi_epi8(row, row); + + let store0 = _mm512_add_epi16( + store0, + _mm512_mulhrs_epi16(_mm512_srli_epi16::<2>(lo), weight), + ); + let store1 = _mm512_add_epi16( + store1, + _mm512_mulhrs_epi16(_mm512_srli_epi16::<2>(hi), weight), + ); + (store0, store1) +} + +#[target_feature(enable = "avx512f", enable = "avx512bw")] +/// This inlining is required to activate all features for runtime dispatch +unsafe fn convolve_vertical_avx2_row_impl( + _: usize, + bounds: &FilterBounds, + src: &[u8], + dst: &mut [u8], + src_stride: usize, + weight: &[i16], +) { + let bounds_size = bounds.size; + const SCALE: u32 = 6; + const R_SHR_SCALE: i32 = SCALE as i32; + const AR_SHR_SCALE: u32 = SCALE; + const ROUNDING: i16 = 1 << (R_SHR_SCALE - 1); + + let mut cx = 0usize; + + let mut rem = dst; + + let iter_64 = rem.chunks_exact_mut(64); + + for dst in iter_64 { + let mut store0 = _mm512_set1_epi16(ROUNDING); + let mut store1 = _mm512_set1_epi16(ROUNDING); + + let px = cx; + + if bounds_size == 2 { + let py = bounds.start; + let weights = weight.get_unchecked(0..2); + let v_weight0 = _mm512_set1_epi16(weights[0]); + let v_weight1 = _mm512_set1_epi16(weights[1]); + let v_offset0 = src_stride * py + px; + let src_ptr0 = src.get_unchecked(v_offset0..); + let v_offset1 = src_stride * (py + 1) + px; + let src_ptr1 = src.get_unchecked(v_offset1..); + + let item_row0 = _mm512_loadu_si512(src_ptr0.as_ptr() as *const _); + (store0, store1) = m512dot(store0, store1, item_row0, v_weight0); + + let item_row1 = _mm512_loadu_si512(src_ptr1.as_ptr() as *const _); + (store0, store1) = m512dot(store0, store1, item_row1, v_weight1); + } else if bounds_size == 3 { + let py = bounds.start; + let weights = weight.get_unchecked(0..3); + let v_weight0 = _mm512_set1_epi16(weights[0]); + let v_weight1 = _mm512_set1_epi16(weights[1]); + let v_weight2 = _mm512_set1_epi16(weights[2]); + let v_offset0 = src_stride * py + px; + let src_ptr0 = src.get_unchecked(v_offset0..); + let v_offset1 = src_stride * (py + 1) + px; + let src_ptr1 = src.get_unchecked(v_offset1..); + let v_offset2 = src_stride * (py + 2) + px; + let src_ptr2 = src.get_unchecked(v_offset2..); + + let item_row0 = _mm512_loadu_si512(src_ptr0.as_ptr() as *const _); + (store0, store1) = m512dot(store0, store1, item_row0, v_weight0); + + let item_row1 = _mm512_loadu_si512(src_ptr1.as_ptr() as *const _); + (store0, store1) = m512dot(store0, store1, 
item_row1, v_weight1); + + let item_row2 = _mm512_loadu_si512(src_ptr2.as_ptr() as *const _); + (store0, store1) = m512dot(store0, store1, item_row2, v_weight2); + } else if bounds_size == 4 { + let py = bounds.start; + let weights = weight.get_unchecked(0..4); + let v_weight0 = _mm512_set1_epi16(weights[0]); + let v_weight1 = _mm512_set1_epi16(weights[1]); + let v_weight2 = _mm512_set1_epi16(weights[2]); + let v_weight3 = _mm512_set1_epi16(weights[3]); + let v_offset0 = src_stride * py + px; + let src_ptr0 = src.get_unchecked(v_offset0..); + let v_offset1 = src_stride * (py + 1) + px; + let src_ptr1 = src.get_unchecked(v_offset1..); + let v_offset2 = src_stride * (py + 2) + px; + let src_ptr2 = src.get_unchecked(v_offset2..); + let v_offset3 = src_stride * (py + 3) + px; + let src_ptr3 = src.get_unchecked(v_offset3..); + + let item_row0 = _mm512_loadu_si512(src_ptr0.as_ptr() as *const _); + (store0, store1) = m512dot(store0, store1, item_row0, v_weight0); + + let item_row1 = _mm512_loadu_si512(src_ptr1.as_ptr() as *const _); + (store0, store1) = m512dot(store0, store1, item_row1, v_weight1); + + let item_row2 = _mm512_loadu_si512(src_ptr2.as_ptr() as *const _); + (store0, store1) = m512dot(store0, store1, item_row2, v_weight2); + + let item_row3 = _mm512_loadu_si512(src_ptr3.as_ptr() as *const _); + (store0, store1) = m512dot(store0, store1, item_row3, v_weight3); + } else { + for j in 0..bounds_size { + let py = bounds.start + j; + let weight = weight.get_unchecked(j..(j + 1)); + let v_weight = _mm512_set1_epi16(weight[0]); + let v_offset = src_stride * py + px; + let src_ptr = src.get_unchecked(v_offset..); + let item_row0 = _mm512_loadu_si512(src_ptr.as_ptr() as *const _); + + (store0, store1) = m512dot(store0, store1, item_row0, v_weight); + } + } + + let rebased0 = _mm512_srai_epi16::(store0); + let rebased1 = _mm512_srai_epi16::(store1); + + let shrank0 = _mm512_packus_epi16(rebased0, rebased1); + + _mm512_storeu_si512(dst.as_mut_ptr() as *mut _, shrank0); + + cx += 64; + } + + rem = rem.chunks_exact_mut(64).into_remainder(); + + let iter_32 = rem.chunks_exact_mut(32); + + for dst in iter_32 { + let mut store0 = _mm256_set1_epi16(ROUNDING); + let mut store1 = _mm256_set1_epi16(ROUNDING); + + let px = cx; + + if bounds_size == 2 { + let py = bounds.start; + let weights = weight.get_unchecked(0..2); + let v_weight0 = _mm256_set1_epi16(weights[0]); + let v_weight1 = _mm256_set1_epi16(weights[1]); + let v_offset0 = src_stride * py + px; + let src_ptr0 = src.get_unchecked(v_offset0..); + let v_offset1 = src_stride * (py + 1) + px; + let src_ptr1 = src.get_unchecked(v_offset1..); + + let item_row0 = _mm256_loadu_si256(src_ptr0.as_ptr() as *const __m256i); + (store0, store1) = m256dot(store0, store1, item_row0, v_weight0); + + let item_row1 = _mm256_loadu_si256(src_ptr1.as_ptr() as *const __m256i); + (store0, store1) = m256dot(store0, store1, item_row1, v_weight1); + } else if bounds_size == 3 { + let py = bounds.start; + let weights = weight.get_unchecked(0..3); + let v_weight0 = _mm256_set1_epi16(weights[0]); + let v_weight1 = _mm256_set1_epi16(weights[1]); + let v_weight2 = _mm256_set1_epi16(weights[2]); + let v_offset0 = src_stride * py + px; + let src_ptr0 = src.get_unchecked(v_offset0..); + let v_offset1 = src_stride * (py + 1) + px; + let src_ptr1 = src.get_unchecked(v_offset1..); + let v_offset2 = src_stride * (py + 2) + px; + let src_ptr2 = src.get_unchecked(v_offset2..); + + let item_row0 = _mm256_loadu_si256(src_ptr0.as_ptr() as *const __m256i); + (store0, store1) = 
m256dot(store0, store1, item_row0, v_weight0);
+
+            let item_row1 = _mm256_loadu_si256(src_ptr1.as_ptr() as *const __m256i);
+            (store0, store1) = m256dot(store0, store1, item_row1, v_weight1);
+
+            let item_row2 = _mm256_loadu_si256(src_ptr2.as_ptr() as *const __m256i);
+            (store0, store1) = m256dot(store0, store1, item_row2, v_weight2);
+        } else if bounds_size == 4 {
+            let py = bounds.start;
+            let weights = weight.get_unchecked(0..4);
+            let v_weight0 = _mm256_set1_epi16(weights[0]);
+            let v_weight1 = _mm256_set1_epi16(weights[1]);
+            let v_weight2 = _mm256_set1_epi16(weights[2]);
+            let v_weight3 = _mm256_set1_epi16(weights[3]);
+            let v_offset0 = src_stride * py + px;
+            let src_ptr0 = src.get_unchecked(v_offset0..);
+            let v_offset1 = src_stride * (py + 1) + px;
+            let src_ptr1 = src.get_unchecked(v_offset1..);
+            let v_offset2 = src_stride * (py + 2) + px;
+            let src_ptr2 = src.get_unchecked(v_offset2..);
+            let v_offset3 = src_stride * (py + 3) + px;
+            let src_ptr3 = src.get_unchecked(v_offset3..);
+
+            let item_row0 = _mm256_loadu_si256(src_ptr0.as_ptr() as *const __m256i);
+            (store0, store1) = m256dot(store0, store1, item_row0, v_weight0);
+
+            let item_row1 = _mm256_loadu_si256(src_ptr1.as_ptr() as *const __m256i);
+            (store0, store1) = m256dot(store0, store1, item_row1, v_weight1);
+
+            let item_row2 = _mm256_loadu_si256(src_ptr2.as_ptr() as *const __m256i);
+            (store0, store1) = m256dot(store0, store1, item_row2, v_weight2);
+
+            let item_row3 = _mm256_loadu_si256(src_ptr3.as_ptr() as *const __m256i);
+            (store0, store1) = m256dot(store0, store1, item_row3, v_weight3);
+        } else {
+            for j in 0..bounds_size {
+                let py = bounds.start + j;
+                let weight = weight.get_unchecked(j..(j + 1));
+                let v_weight = _mm256_set1_epi16(weight[0]);
+                let v_offset = src_stride * py + px;
+                let src_ptr = src.get_unchecked(v_offset..);
+                let item_row0 = _mm256_loadu_si256(src_ptr.as_ptr() as *const __m256i);
+
+                (store0, store1) = m256dot(store0, store1, item_row0, v_weight);
+            }
+        }
+
+        let rebased0 = _mm256_srai_epi16::(store0);
+        let rebased1 = _mm256_srai_epi16::(store1);
+
+        let shrank0 = _mm256_packus_epi16(rebased0, rebased1);
+        _mm256_storeu_si256(dst.as_mut_ptr() as *mut __m256i, shrank0);
+
+        cx += 32;
+    }
+
+    rem = rem.chunks_exact_mut(32).into_remainder();
+    let iter_16 = rem.chunks_exact_mut(16);
+
+    for dst in iter_16 {
+        let mut store0 = _mm256_set1_epi16(ROUNDING);
+
+        let px = cx;
+
+        for j in 0..bounds_size {
+            let py = bounds.start + j;
+            let weight = weight.get_unchecked(j..(j + 1));
+            let v_weight = _mm256_set1_epi16(weight[0]);
+            let v_offset = src_stride * py + px;
+            let src_ptr = src.get_unchecked(v_offset..);
+            let mut item_row = _mm256_permute4x64_epi64::<0x50>(_mm256_castsi128_si256(
+                _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i),
+            ));
+            item_row = _mm256_unpacklo_epi8(item_row, item_row);
+            store0 = _mm256_add_epi16(
+                store0,
+                _mm256_mulhrs_epi16(_mm256_srli_epi16::<2>(item_row), v_weight),
+            );
+        }
+
+        store0 = _mm256_srai_epi16::(store0);
+
+        let packed = avx2_pack_u16(store0, store0);
+
+        let rebased0 = _mm256_castsi256_si128(packed);
+        _mm_storeu_si128(dst.as_mut_ptr() as *mut __m128i, rebased0);
+
+        cx += 16;
+    }
+
+    rem = rem.chunks_exact_mut(16).into_remainder();
+    let iter_8 = rem.chunks_exact_mut(8);
+
+    for dst in iter_8 {
+        let mut store = _mm_set1_epi16(ROUNDING);
+
+        let px = cx;
+
+        for j in 0..bounds_size {
+            let py = bounds.start + j;
+            let weight = weight.get_unchecked(j..(j + 1));
+            let v_weight = _mm_set1_epi16(weight[0]);
+            let v_offset = src_stride * py + px;
+            let src_ptr = src.get_unchecked(v_offset..);
+            let mut item_row = _mm_loadu_si64(src_ptr.as_ptr());
+            item_row = _mm_unpacklo_epi8(item_row, item_row);
+
+            let low = _mm_srli_epi16::<2>(item_row);
+            store = _mm_add_epi16(store, _mm_mulhrs_epi16(low, v_weight));
+        }
+
+        let rebased = _mm_srai_epi16::(store);
+        let shrank = _mm_packus_epi16(rebased, rebased);
+        _mm_storeu_si64(dst.as_mut_ptr(), shrank);
+
+        cx += 8;
+    }
+
+    rem = rem.chunks_exact_mut(8).into_remainder();
+    let iter_1 = rem.iter_mut();
+
+    for dst in iter_1 {
+        let mut store = _mm_set1_epi16(ROUNDING);
+
+        let px = cx;
+
+        for j in 0..bounds_size {
+            let py = bounds.start + j;
+            let weight = weight.get_unchecked(j..(j + 1));
+            let v_weight = _mm_set1_epi16(weight[0]);
+            let v_offset = src_stride * py + px;
+            let src_ptr = src.get_unchecked(v_offset..(v_offset + 1));
+            let item_row = _mm_set1_epi8(src_ptr[0] as i8);
+
+            store = _mm_add_epi16(
+                store,
+                _mm_mulhrs_epi16(
+                    _mm_srli_epi16::<2>(_mm_unpacklo_epi8(item_row, item_row)),
+                    v_weight,
+                ),
+            );
+        }
+
+        let rebased = _mm_srai_epi16::(store);
+        let value = _mm_extract_epi8::<0>(_mm_packus_epi16(rebased, rebased));
+        *dst = value as u8;
+
+        cx += 1;
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 1824e9f..9e52411 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -29,6 +29,9 @@
 #![deny(deprecated)]
 // #![deny(unreachable_code, unused)]
 #![allow(clippy::too_many_arguments)]
+#![cfg_attr(feature = "nightly_avx512", feature(cfg_version))]
+#![cfg_attr(feature = "nightly_avx512", feature(avx512_target_feature))]
+#![cfg_attr(feature = "nightly_avx512", feature(stdarch_x86_avx512))]
 mod alpha_check;
 #[cfg(feature = "half")]
 mod alpha_handle_f16;
@@ -38,6 +41,11 @@ mod alpha_handle_u8;
 mod ar30;
 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 mod avx2;
+#[cfg(all(
+    any(target_arch = "x86_64", target_arch = "x86"),
+    feature = "nightly_avx512"
+))]
+mod avx512;
 mod color_group;
 #[cfg(feature = "colorspaces")]
 mod colors;
diff --git a/src/plane_u8.rs b/src/plane_u8.rs
index 099febe..dc57c41 100644
--- a/src/plane_u8.rs
+++ b/src/plane_u8.rs
@@ -120,6 +120,13 @@ impl VerticalConvolutionPass for ImageStore<'_, u8, 1> {
                     _dispatcher = convolve_vertical_avx_row;
                 }
             }
+            #[cfg(feature = "nightly_avx512")]
+            if std::arch::is_x86_feature_detected!("avx512bw") {
+                if _scale_factor < 8. {
+                    use crate::avx512::convolve_vertical_avx512_row_lp;
+                    _dispatcher = convolve_vertical_avx512_row_lp;
+                }
+            }
         }
         #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
         {
diff --git a/src/rgb_u8.rs b/src/rgb_u8.rs
index dae1038..29db8eb 100644
--- a/src/rgb_u8.rs
+++ b/src/rgb_u8.rs
@@ -119,6 +119,13 @@ impl VerticalConvolutionPass for ImageStore<'_, u8, 3> {
                     _dispatcher = convolve_vertical_avx_row;
                 }
             }
+            #[cfg(feature = "nightly_avx512")]
+            if std::arch::is_x86_feature_detected!("avx512bw") {
+                if _scale_factor < 8. {
+                    use crate::avx512::convolve_vertical_avx512_row_lp;
+                    _dispatcher = convolve_vertical_avx512_row_lp;
+                }
+            }
         }
         #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
         {
diff --git a/src/rgba_u8.rs b/src/rgba_u8.rs
index 14d3a23..0a3398e 100644
--- a/src/rgba_u8.rs
+++ b/src/rgba_u8.rs
@@ -141,6 +141,13 @@ impl VerticalConvolutionPass for ImageStore<'_, u8, 4> {
                     _dispatcher = convolve_vertical_avx_row;
                 }
             }
+            #[cfg(feature = "nightly_avx512")]
+            if std::arch::is_x86_feature_detected!("avx512bw") {
+                if _scale_factor < 8. {
+                    use crate::avx512::convolve_vertical_avx512_row_lp;
+                    _dispatcher = convolve_vertical_avx512_row_lp;
+                }
+            }
         }
         #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
         {
diff --git a/src/sse/alpha_u16.rs b/src/sse/alpha_u16.rs
index d5f5c33..efb1c01 100644
--- a/src/sse/alpha_u16.rs
+++ b/src/sse/alpha_u16.rs
@@ -121,8 +121,7 @@ trait DisassociateAlpha {
 struct DisassociateAlphaDefault {}
 
 impl DisassociateAlphaDefault {
-    #[inline]
-    #[target_feature(enable = "sse4.1")]
+    #[inline(always)]
     unsafe fn disassociate_chunk(&self, in_place: &mut [u16], v_max_colors: __m128) {
         let src_ptr = in_place.as_ptr();
         let row0 = _mm_loadu_si128(src_ptr as *const __m128i);
@@ -362,8 +361,7 @@ impl Sse41PremultiplyExecutor
 struct Sse41PremultiplyExecutorAny {}
 
 impl Sse41PremultiplyExecutorAny {
-    #[inline]
-    #[target_feature(enable = "sse4.1")]
+    #[inline(always)]
     unsafe fn premultiply_chunk(&self, dst: &mut [u16], src: &[u16], scale: __m128) {
         let src_ptr = src.as_ptr();
         let row0 = _mm_loadu_si128(src_ptr as *const __m128i);
diff --git a/src/sse/alpha_u8.rs b/src/sse/alpha_u8.rs
index 6e23821..8fba54c 100644
--- a/src/sse/alpha_u8.rs
+++ b/src/sse/alpha_u8.rs
@@ -64,10 +64,7 @@ pub(crate) unsafe fn sse_unpremultiply_row(x: __m128i, a: __m128i) -> __m128i {
     let lo = _mm_unpacklo_epi8(x, zeros);
     let hi = _mm_unpackhi_epi8(x, zeros);
 
-    let scale = _mm_set1_epi16(255);
-    let is_zero_mask = _mm_cmpeq_epi8(a, zeros);
-    let a = _mm_select_si128(is_zero_mask, scale, a);
 
     let scale_ps = _mm_set1_ps(255f32);
@@ -114,8 +111,7 @@ trait Sse41PremultiplyExecutorRgba8 {
 struct Sse41PremultiplyExecutor8Default {}
 
 impl Sse41PremultiplyExecutor8Default {
-    #[inline]
-    #[target_feature(enable = "sse4.1")]
+    #[inline(always)]
     unsafe fn premultiply_chunk(&self, dst: &mut [u8], src: &[u8]) {
         let zeros = _mm_setzero_si128();
         let src_ptr = src.as_ptr();
@@ -254,8 +250,7 @@ trait DisassociateAlpha {
 struct DisassociateAlphaDefault {}
 
 impl DisassociateAlphaDefault {
-    #[inline]
-    #[target_feature(enable = "sse4.1")]
+    #[inline(always)]
     unsafe fn disassociate_chunk(&self, in_place: &mut [u8]) {
         let src_ptr = in_place.as_ptr();
         let rgba0 = _mm_loadu_si128(src_ptr as *const __m128i);
diff --git a/src/sse/mod.rs b/src/sse/mod.rs
index 45417dd..30fd525 100644
--- a/src/sse/mod.rs
+++ b/src/sse/mod.rs
@@ -64,10 +64,7 @@ pub(crate) use alpha_f16::{sse_premultiply_alpha_rgba_f16, sse_unpremultiply_alp
 pub(crate) use alpha_f32::sse_premultiply_alpha_rgba_f32;
 pub(crate) use alpha_f32::sse_unpremultiply_alpha_rgba_f32;
 pub(crate) use alpha_u16::{premultiply_alpha_sse_rgba_u16, unpremultiply_alpha_sse_rgba_u16};
-pub(crate) use alpha_u8::{
-    _mm_div_by_255_epi16, sse_premultiply_alpha_rgba, sse_unpremultiply_alpha_rgba,
-    sse_unpremultiply_row,
-};
+pub(crate) use alpha_u8::{sse_premultiply_alpha_rgba, sse_unpremultiply_alpha_rgba};
 pub(crate) use check_alpha::{
     sse_has_non_constant_cap_alpha_rgba16, sse_has_non_constant_cap_alpha_rgba8,
 };
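// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch above): the runtime-dispatch
// pattern the new AVX-512 paths follow — compile-time gating on the
// `nightly_avx512` feature plus runtime CPU detection, layered on top of the
// existing scalar/AVX2 fallback chain. All names below (`RowHandler`,
// `scalar_row`, `avx2_row`, `avx512_row`, `pick_row_handler`) are hypothetical
// stand-ins, not pic-scale's real API; only the feature name, the detected
// CPU flags, and the `_scale_factor < 8.` guard come from the patch itself.
// ---------------------------------------------------------------------------

/// Signature shared by all row handlers in this sketch.
type RowHandler = fn(dst: &mut [u8], src: &[u8]);

fn scalar_row(dst: &mut [u8], src: &[u8]) {
    // Trivial placeholder body; a real handler would perform the convolution.
    dst.copy_from_slice(src);
}

fn avx2_row(dst: &mut [u8], src: &[u8]) {
    // Placeholder for the AVX2 path.
    dst.copy_from_slice(src);
}

#[cfg(feature = "nightly_avx512")]
fn avx512_row(dst: &mut [u8], src: &[u8]) {
    // Placeholder for the low-precision AVX-512 path.
    dst.copy_from_slice(src);
}

/// Picks the best available handler: scalar fallback first, AVX2 when the CPU
/// reports it, and AVX-512 only when the crate was built with
/// `nightly_avx512`, the CPU reports `avx512bw`, and the scale factor is
/// moderate (mirroring the `_scale_factor < 8.` guard in the patch).
fn pick_row_handler(_scale_factor: f32) -> RowHandler {
    let mut _dispatcher: RowHandler = scalar_row;
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    {
        if std::arch::is_x86_feature_detected!("avx2") {
            _dispatcher = avx2_row;
        }
        #[cfg(feature = "nightly_avx512")]
        if std::arch::is_x86_feature_detected!("avx512bw") && _scale_factor < 8. {
            _dispatcher = avx512_row;
        }
    }
    _dispatcher
}

fn main() {
    let src = vec![0u8; 64];
    let mut dst = vec![0u8; 64];
    // Detection happens once; the chosen function pointer is then reused per row.
    let handler = pick_row_handler(2.0);
    handler(&mut dst, &src);
}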