diff --git a/Cargo.toml b/Cargo.toml index 12abb08..b8a12a2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,4 +27,5 @@ libc = "0.2.158" [features] default = ["colorspaces"] -colorspaces = ["dep:colorutils-rs"] \ No newline at end of file +colorspaces = ["dep:colorutils-rs"] +nightly_avx512 = [] \ No newline at end of file diff --git a/README.md b/README.md index 5585736..a9ff1db 100644 --- a/README.md +++ b/README.md @@ -44,25 +44,27 @@ Despite all implementation are fast, not all the paths are implemented using SIM `~` - Partially implemented -| | NEON | SSE | AVX2 | WASM | -|----------------|------|-----|------|------| -| RGBA (8 bit) | x | x | x | ~ | -| RGB (8 bit) | x | x | ~ | ~ | -| Plane (8 bit) | x | x | ~ | ~ | -| RGBA (8+ bit) | x | x | ~ | - | -| RGB (8+ bit) | x | x | ~ | - | -| Plane (8+ bit) | ~ | ~ | ~ | - | -| RGBA (f32) | x | x | x | - | -| RGB (f32) | x | x | ~ | - | -| Plane (f32) | x | x | ~ | - | -| RGBA (f16) | x | x | x | - | -| RGB (f16) | x | ~ | ~ | - | -| Plane (f16) | ~ | ~ | ~ | - | -| AR30/RA30 | x | - | - | - | +| | NEON | SSE | AVX2 | AVX-512 | WASM | +|----------------|------|-----|------|---------|------| +| RGBA (8 bit) | x | x | x | ~ | ~ | +| RGB (8 bit) | x | x | ~ | ~ | ~ | +| Plane (8 bit) | x | x | ~ | ~ | ~ | +| RGBA (8+ bit) | x | x | ~ | - | - | +| RGB (8+ bit) | x | x | ~ | - | - | +| Plane (8+ bit) | ~ | ~ | ~ | - | - | +| RGBA (f32) | x | x | x | - | - | +| RGB (f32) | x | x | ~ | - | - | +| Plane (f32) | x | x | ~ | - | - | +| RGBA (f16) | x | x | x | - | - | +| RGB (f16) | x | ~ | ~ | - | - | +| Plane (f16) | ~ | ~ | ~ | - | - | +| AR30/RA30 | x | - | - | - | - | #### Features -To enable support of `f16` the feature `half` should be activated. +Features: + - To enable `f16` support, activate the `half` feature. + - `nightly_avx512` enables the AVX-512 code paths and requires the `nightly` compiler channel #### Target features with runtime dispatch @@ -72,6 +74,8 @@ For x86 and aarch64 NEON runtime dispatch is used. `avx2`, `fma`, `sse4.1`, `f16c` will be detected automatically if available, no additional actions need, and called the best path. +AVX-512 support requires the `nightly_avx512` feature and the `nightly` compiler channel; availability is detected at runtime, and the AVX-512 paths are used when present. + `fullfp16` NEON target detection performed in runtime, when available best the best paths for *f16* images are available on ARM. WASM `simd128` target feature activating is mandatory in build flags.
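Reviewer note: a minimal sketch of how the new `nightly_avx512` feature and the runtime checks added in this PR fit together. The helper name `use_avx512_paths` is illustrative only; the crate's dispatchers in `alpha_handle_u8.rs` and the `*_u8.rs` convolution passes perform the equivalent checks inline.

```rust
// Illustrative sketch (not the crate's public API) of the gating this PR adds:
// the AVX-512 kernels are compiled only when the `nightly_avx512` feature is
// enabled (which needs a nightly toolchain) and are selected at runtime only
// if the CPU reports both AVX-512F and AVX-512BW.
fn use_avx512_paths() -> bool {
    #[cfg(all(
        any(target_arch = "x86_64", target_arch = "x86"),
        feature = "nightly_avx512"
    ))]
    {
        if std::arch::is_x86_feature_detected!("avx512f")
            && std::arch::is_x86_feature_detected!("avx512bw")
        {
            return true;
        }
    }
    false
}

fn main() {
    println!("AVX-512 paths would be used: {}", use_avx512_paths());
}
```

A consumer opts in the way the `app/Cargo.toml` hunk below does and builds with a nightly toolchain, e.g. `cargo +nightly build --features nightly_avx512`.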
diff --git a/app/Cargo.toml b/app/Cargo.toml index 1c09c6d..71f0b9f 100644 --- a/app/Cargo.toml +++ b/app/Cargo.toml @@ -6,7 +6,7 @@ edition = "2021" [dependencies] image = { version = "0.25.5", features = ["default"] } #image = { path= "../../../RustroverProjects/image", features = ["default", "avif", "avif-native"] } -pic-scale = { path = "..", features = ["half"], default-features = true } +pic-scale = { path = "..", features = ["half", "nightly_avx512"], default-features = true } fast_image_resize = { version = "5.0.0", features = [] } half = { version = "2.4.1", default-features = true } diff --git a/app/src/main.rs b/app/src/main.rs index 25367ac..3f73175 100644 --- a/app/src/main.rs +++ b/app/src/main.rs @@ -42,7 +42,7 @@ fn resize_plane( fn main() { // test_fast_image(); - let img = ImageReader::open("./assets/asset_4.png") + let img = ImageReader::open("./assets/nasa-4928x3279-rgba.png") .unwrap() .decode() .unwrap(); @@ -59,7 +59,7 @@ fn main() { // let store = - ImageStore::::from_slice(&choke, dimensions.0 as usize, dimensions.1 as usize) + ImageStore::::from_slice(&bytes, dimensions.0 as usize, dimensions.1 as usize) .unwrap(); let dst_size = ImageSize::new(dimensions.0 as usize / 4, dimensions.1 as usize / 4); @@ -75,15 +75,13 @@ fn main() { // ) // .unwrap(); - let mut dst_store = ImageStoreMut::::alloc_with_depth( + let mut dst_store = ImageStoreMut::::alloc_with_depth( dimensions.0 as usize, dimensions.1 as usize / 2, 10, ); - scaler - .resize_rgba_u16(&store, &mut dst_store, true) - .unwrap(); + scaler.resize_rgba(&store, &mut dst_store, true).unwrap(); let elapsed_time = start_time.elapsed(); // Print the elapsed time in milliseconds @@ -160,13 +158,13 @@ fn main() { // .map(|&x| (x * 255f32) as u8) // .collect(); - let dst: Vec = dst_store - .as_bytes() - .iter() - .map(|&x| (x >> 2) as u8) - .collect(); + // let dst: Vec = dst_store + // .as_bytes() + // .iter() + // .map(|&x| (x >> 2) as u8) + // .collect(); - // let dst = dst_store.as_bytes(); + let dst = dst_store.as_bytes(); // let dst = resized; // image::save_buffer( // "converted.png", diff --git a/src/alpha_handle_u8.rs b/src/alpha_handle_u8.rs index c77e792..3c15e0d 100644 --- a/src/alpha_handle_u8.rs +++ b/src/alpha_handle_u8.rs @@ -137,6 +137,13 @@ pub(crate) fn premultiply_alpha_rgba( if is_x86_feature_detected!("avx2") { _dispatcher = avx_premultiply_alpha_rgba; } + #[cfg(feature = "nightly_avx512")] + if std::arch::is_x86_feature_detected!("avx512f") + && std::arch::is_x86_feature_detected!("avx512bw") + { + use crate::avx512::avx512_premultiply_alpha_rgba; + _dispatcher = avx512_premultiply_alpha_rgba; + } } #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] { @@ -159,15 +166,22 @@ pub(crate) fn unpremultiply_alpha_rgba( } #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] { - if is_x86_feature_detected!("sse4.1") { + if std::arch::is_x86_feature_detected!("sse4.1") { _dispatcher = sse_unpremultiply_alpha_rgba; } } #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] { - if is_x86_feature_detected!("avx2") { + if std::arch::is_x86_feature_detected!("avx2") { _dispatcher = avx_unpremultiply_alpha_rgba; } + #[cfg(feature = "nightly_avx512")] + if std::arch::is_x86_feature_detected!("avx512f") + && std::arch::is_x86_feature_detected!("avx512bw") + { + use crate::avx512::avx512_unpremultiply_alpha_rgba; + _dispatcher = avx512_unpremultiply_alpha_rgba; + } } #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] { diff --git a/src/avx2/alpha_u16.rs b/src/avx2/alpha_u16.rs index 
f2f9a06..c16fdb3 100644 --- a/src/avx2/alpha_u16.rs +++ b/src/avx2/alpha_u16.rs @@ -45,8 +45,8 @@ unsafe fn _mm256_scale_by_alpha(px: __m256i, low_low_a: __m256, low_high_a: __m2 let low_px = _mm256_cvtepi32_ps(_mm256_unpacklo_epi16(px, zeros)); let high_px = _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(px, zeros)); - let new_ll = _mm256_cvtps_epi32(_mm256_round_ps::<0x02>(_mm256_mul_ps(low_px, low_low_a))); - let new_lh = _mm256_cvtps_epi32(_mm256_round_ps::<0x02>(_mm256_mul_ps(high_px, low_high_a))); + let new_ll = _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(_mm256_mul_ps(low_px, low_low_a))); + let new_lh = _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(_mm256_mul_ps(high_px, low_high_a))); _mm256_packus_epi32(new_ll, new_lh) } @@ -110,8 +110,7 @@ trait Avx2PremultiplyExecutor { struct Avx2PremultiplyExecutorDefault {} impl Avx2PremultiplyExecutorDefault { - #[inline] - #[target_feature(enable = "avx2")] + #[inline(always)] unsafe fn premultiply_chunk(&self, dst: &mut [u16], src: &[u16]) { let src_ptr = src.as_ptr(); let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i); @@ -203,8 +202,7 @@ impl Avx2PremultiplyExecutor for Avx2PremultiplyExecutor struct Avx2PremultiplyExecutorAnyBit {} impl Avx2PremultiplyExecutorAnyBit { - #[inline] - #[target_feature(enable = "avx2")] + #[inline(always)] unsafe fn premultiply_chunk(&self, dst: &mut [u16], src: &[u16], scale: __m256) { let src_ptr = src.as_ptr(); let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i); diff --git a/src/avx2/alpha_u8.rs b/src/avx2/alpha_u8.rs index 4ecff54..2755d5d 100644 --- a/src/avx2/alpha_u8.rs +++ b/src/avx2/alpha_u8.rs @@ -27,13 +27,9 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -use crate::alpha_handle_u8::{premultiply_alpha_rgba_row_impl, unpremultiply_alpha_rgba_row_impl}; use crate::avx2::utils::{ _mm256_select_si256, avx2_deinterleave_rgba, avx2_div_by255, avx2_interleave_rgba, }; -use crate::sse::{ - _mm_div_by_255_epi16, sse_deinterleave_rgba, sse_interleave_rgba, sse_unpremultiply_row, -}; use rayon::iter::{IndexedParallelIterator, ParallelIterator}; use rayon::prelude::{ParallelSlice, ParallelSliceMut}; use rayon::ThreadPool; @@ -42,57 +38,6 @@ use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; -#[inline(always)] -unsafe fn avx2_unpremultiply_row(x: __m256i, a: __m256i) -> __m256i { - let zeros = _mm256_setzero_si256(); - let lo = _mm256_unpacklo_epi8(x, zeros); - let hi = _mm256_unpackhi_epi8(x, zeros); - - let scale = _mm256_set1_epi16(255); - - let is_zero_mask = _mm256_cmpeq_epi8(a, zeros); - let a = _mm256_select_si256(is_zero_mask, scale, a); - - let scale_ps = _mm256_set1_ps(255f32); - - let lo_lo = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_unpacklo_epi16(lo, zeros)), - scale_ps, - ); - let lo_hi = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(lo, zeros)), - scale_ps, - ); - let hi_lo = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_unpacklo_epi16(hi, zeros)), - scale_ps, - ); - let hi_hi = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(hi, zeros)), - scale_ps, - ); - let a_lo = _mm256_unpacklo_epi8(a, zeros); - let a_hi = _mm256_unpackhi_epi8(x, zeros); - let a_lo_lo = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpacklo_epi16(a_lo, zeros))); - let a_lo_hi = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpackhi_epi16(a_lo, zeros))); - let a_hi_lo = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpacklo_epi16(a_hi, zeros))); - let a_hi_hi = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpackhi_epi16(a_hi, zeros))); - - let lo_lo = 
_mm256_cvtps_epi32(_mm256_mul_ps(lo_lo, a_lo_lo)); - let lo_hi = _mm256_cvtps_epi32(_mm256_mul_ps(lo_hi, a_lo_hi)); - let hi_lo = _mm256_cvtps_epi32(_mm256_mul_ps(hi_lo, a_hi_lo)); - let hi_hi = _mm256_cvtps_epi32(_mm256_mul_ps(hi_hi, a_hi_hi)); - - _mm256_select_si256( - is_zero_mask, - zeros, - _mm256_packus_epi16( - _mm256_packus_epi32(lo_lo, lo_hi), - _mm256_packus_epi32(hi_lo, hi_hi), - ), - ) -} - pub(crate) fn avx_premultiply_alpha_rgba( dst: &mut [u8], src: &[u8], @@ -105,15 +50,16 @@ pub(crate) fn avx_premultiply_alpha_rgba( } } -#[target_feature(enable = "avx2")] -unsafe fn avx_premultiply_alpha_rgba_impl_row(dst: &mut [u8], src: &[u8]) { - let mut rem = dst; - let mut src_rem = src; - - for (dst, src) in rem - .chunks_exact_mut(32 * 4) - .zip(src_rem.chunks_exact(32 * 4)) - { +trait AssociateAlpha { + unsafe fn associate(&self, dst: &mut [u8], src: &[u8]); +} + +#[derive(Default)] +struct AssociateAlphaDefault {} + +impl AssociateAlphaDefault { + #[inline(always)] + unsafe fn associate_chunk(&self, dst: &mut [u8], src: &[u8]) { let src_ptr = src.as_ptr(); let rgba0 = _mm256_loadu_si256(src_ptr as *const __m256i); let rgba1 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); @@ -153,58 +99,48 @@ unsafe fn avx_premultiply_alpha_rgba_impl_row(dst: &mut [u8], src: &[u8]) { _mm256_storeu_si256(dst_ptr.add(64) as *mut __m256i, rgba2); _mm256_storeu_si256(dst_ptr.add(96) as *mut __m256i, rgba3); } +} - rem = rem.chunks_exact_mut(32 * 4).into_remainder(); - src_rem = src_rem.chunks_exact(32 * 4).remainder(); - - let zeros = _mm_setzero_si128(); - for (dst, src) in rem - .chunks_exact_mut(16 * 4) - .zip(src_rem.chunks_exact(16 * 4)) - { - let src_ptr = src.as_ptr(); - let rgba0 = _mm_loadu_si128(src_ptr as *const __m128i); - let rgba1 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); - let rgba2 = _mm_loadu_si128(src_ptr.add(32) as *const __m128i); - let rgba3 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i); - let (rrr, ggg, bbb, aaa) = sse_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); - - let mut rrr_low = _mm_unpacklo_epi8(rrr, zeros); - let mut rrr_high = _mm_unpackhi_epi8(rrr, zeros); - - let mut ggg_low = _mm_unpacklo_epi8(ggg, zeros); - let mut ggg_high = _mm_unpackhi_epi8(ggg, zeros); +impl AssociateAlpha for AssociateAlphaDefault { + #[target_feature(enable = "avx2")] + unsafe fn associate(&self, dst: &mut [u8], src: &[u8]) { + let mut rem = dst; + let mut src_rem = src; - let mut bbb_low = _mm_unpacklo_epi8(bbb, zeros); - let mut bbb_high = _mm_unpackhi_epi8(bbb, zeros); + for (dst, src) in rem + .chunks_exact_mut(32 * 4) + .zip(src_rem.chunks_exact(32 * 4)) + { + self.associate_chunk(dst, src); + } - let aaa_low = _mm_unpacklo_epi8(aaa, zeros); - let aaa_high = _mm_unpackhi_epi8(aaa, zeros); + rem = rem.chunks_exact_mut(32 * 4).into_remainder(); + src_rem = src_rem.chunks_exact(32 * 4).remainder(); - rrr_low = _mm_div_by_255_epi16(_mm_mullo_epi16(rrr_low, aaa_low)); - rrr_high = _mm_div_by_255_epi16(_mm_mullo_epi16(rrr_high, aaa_high)); - ggg_low = _mm_div_by_255_epi16(_mm_mullo_epi16(ggg_low, aaa_low)); - ggg_high = _mm_div_by_255_epi16(_mm_mullo_epi16(ggg_high, aaa_high)); - bbb_low = _mm_div_by_255_epi16(_mm_mullo_epi16(bbb_low, aaa_low)); - bbb_high = _mm_div_by_255_epi16(_mm_mullo_epi16(bbb_high, aaa_high)); + if !rem.is_empty() { + const PART_SIZE: usize = 32 * 4; + assert!(src_rem.len() < PART_SIZE); + assert!(rem.len() < PART_SIZE); + assert_eq!(src_rem.len(), rem.len()); - let rrr = _mm_packus_epi16(rrr_low, rrr_high); - let ggg = 
_mm_packus_epi16(ggg_low, ggg_high); - let bbb = _mm_packus_epi16(bbb_low, bbb_high); + let mut buffer: [u8; PART_SIZE] = [0u8; PART_SIZE]; + let mut dst_buffer: [u8; PART_SIZE] = [0u8; PART_SIZE]; + std::ptr::copy_nonoverlapping(src_rem.as_ptr(), buffer.as_mut_ptr(), src_rem.len()); - let (rgba0, rgba1, rgba2, rgba3) = sse_interleave_rgba(rrr, ggg, bbb, aaa); + self.associate_chunk(&mut dst_buffer, &buffer); - let dst_ptr = dst.as_mut_ptr(); - _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba1); - _mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, rgba2); - _mm_storeu_si128(dst_ptr.add(48) as *mut __m128i, rgba3); + std::ptr::copy_nonoverlapping(dst_buffer.as_ptr(), rem.as_mut_ptr(), rem.len()); + } } +} - rem = rem.chunks_exact_mut(16 * 4).into_remainder(); - src_rem = src_rem.chunks_exact(16 * 4).remainder(); - - premultiply_alpha_rgba_row_impl(rem, src_rem); +#[target_feature(enable = "avx2")] +unsafe fn avx_premultiply_alpha_rgba_impl_row( + dst: &mut [u8], + src: &[u8], + executor: impl AssociateAlpha, +) { + executor.associate(dst, src); } #[target_feature(enable = "avx2")] @@ -220,14 +156,14 @@ unsafe fn avx_premultiply_alpha_rgba_impl( dst.par_chunks_exact_mut(width * 4) .zip(src.par_chunks_exact(width * 4)) .for_each(|(dst, src)| unsafe { - avx_premultiply_alpha_rgba_impl_row(dst, src); + avx_premultiply_alpha_rgba_impl_row(dst, src, AssociateAlphaDefault::default()); }); }); } else { dst.chunks_exact_mut(width * 4) .zip(src.chunks_exact(width * 4)) .for_each(|(dst, src)| unsafe { - avx_premultiply_alpha_rgba_impl_row(dst, src); + avx_premultiply_alpha_rgba_impl_row(dst, src, AssociateAlphaDefault::default()); }); } } @@ -243,57 +179,117 @@ pub(crate) fn avx_unpremultiply_alpha_rgba( } } -#[target_feature(enable = "avx2")] -unsafe fn avx_unpremultiply_alpha_rgba_impl_row(in_place: &mut [u8]) { - let mut rem = in_place; +trait DisassociateAlpha { + unsafe fn disassociate(&self, in_place: &mut [u8]); +} - for dst in rem.chunks_exact_mut(32 * 4) { - let src_ptr = dst.as_ptr(); +#[derive(Default)] +struct Avx2DisassociateAlpha {} + +impl Avx2DisassociateAlpha { + #[inline(always)] + unsafe fn avx2_unpremultiply_row(&self, x: __m256i, a: __m256i) -> __m256i { + let zeros = _mm256_setzero_si256(); + let lo = _mm256_unpacklo_epi8(x, zeros); + let hi = _mm256_unpackhi_epi8(x, zeros); + + let is_zero_mask = _mm256_cmpeq_epi8(a, zeros); + + let scale_ps = _mm256_set1_ps(255f32); + + let lo_lo = _mm256_mul_ps( + _mm256_cvtepi32_ps(_mm256_unpacklo_epi16(lo, zeros)), + scale_ps, + ); + let lo_hi = _mm256_mul_ps( + _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(lo, zeros)), + scale_ps, + ); + let hi_lo = _mm256_mul_ps( + _mm256_cvtepi32_ps(_mm256_unpacklo_epi16(hi, zeros)), + scale_ps, + ); + let hi_hi = _mm256_mul_ps( + _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(hi, zeros)), + scale_ps, + ); + let a_lo = _mm256_unpacklo_epi8(a, zeros); + let a_hi = _mm256_unpackhi_epi8(a, zeros); + let a_lo_lo = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpacklo_epi16(a_lo, zeros))); + let a_lo_hi = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpackhi_epi16(a_lo, zeros))); + let a_hi_lo = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpacklo_epi16(a_hi, zeros))); + let a_hi_hi = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpackhi_epi16(a_hi, zeros))); + + let lo_lo = _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(_mm256_mul_ps(lo_lo, a_lo_lo))); + let lo_hi = _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(_mm256_mul_ps(lo_hi, a_lo_hi))); + let hi_lo = 
_mm256_cvtps_epi32(_mm256_round_ps::<0x00>(_mm256_mul_ps(hi_lo, a_hi_lo))); + let hi_hi = _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(_mm256_mul_ps(hi_hi, a_hi_hi))); + + _mm256_select_si256( + is_zero_mask, + zeros, + _mm256_packus_epi16( + _mm256_packus_epi32(lo_lo, lo_hi), + _mm256_packus_epi32(hi_lo, hi_hi), + ), + ) + } + + #[inline(always)] + unsafe fn disassociate_chunk(&self, in_place: &mut [u8]) { + let src_ptr = in_place.as_ptr(); let rgba0 = _mm256_loadu_si256(src_ptr as *const __m256i); let rgba1 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); let rgba2 = _mm256_loadu_si256(src_ptr.add(64) as *const __m256i); let rgba3 = _mm256_loadu_si256(src_ptr.add(96) as *const __m256i); let (rrr, ggg, bbb, aaa) = avx2_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); - let rrr = avx2_unpremultiply_row(rrr, aaa); - let ggg = avx2_unpremultiply_row(ggg, aaa); - let bbb = avx2_unpremultiply_row(bbb, aaa); + let rrr = self.avx2_unpremultiply_row(rrr, aaa); + let ggg = self.avx2_unpremultiply_row(ggg, aaa); + let bbb = self.avx2_unpremultiply_row(bbb, aaa); let (rgba0, rgba1, rgba2, rgba3) = avx2_interleave_rgba(rrr, ggg, bbb, aaa); - let dst_ptr = dst.as_mut_ptr(); + let dst_ptr = in_place.as_mut_ptr(); _mm256_storeu_si256(dst_ptr as *mut __m256i, rgba0); _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, rgba1); _mm256_storeu_si256(dst_ptr.add(64) as *mut __m256i, rgba2); _mm256_storeu_si256(dst_ptr.add(96) as *mut __m256i, rgba3); } +} - rem = rem.chunks_exact_mut(32 * 4).into_remainder(); +impl DisassociateAlpha for Avx2DisassociateAlpha { + #[target_feature(enable = "avx2")] + unsafe fn disassociate(&self, in_place: &mut [u8]) { + let mut rem = in_place; - for dst in rem.chunks_exact_mut(16 * 4) { - let src_ptr = dst.as_ptr(); - let rgba0 = _mm_loadu_si128(src_ptr as *const __m128i); - let rgba1 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); - let rgba2 = _mm_loadu_si128(src_ptr.add(32) as *const __m128i); - let rgba3 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i); - let (rrr, ggg, bbb, aaa) = sse_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); + for dst in rem.chunks_exact_mut(32 * 4) { + self.disassociate_chunk(dst); + } - let rrr = sse_unpremultiply_row(rrr, aaa); - let ggg = sse_unpremultiply_row(ggg, aaa); - let bbb = sse_unpremultiply_row(bbb, aaa); + rem = rem.chunks_exact_mut(32 * 4).into_remainder(); - let (rgba0, rgba1, rgba2, rgba3) = sse_interleave_rgba(rrr, ggg, bbb, aaa); + if !rem.is_empty() { + const PART_SIZE: usize = 32 * 4; + assert!(rem.len() < PART_SIZE); - let dst_ptr = dst.as_mut_ptr(); - _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba1); - _mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, rgba2); - _mm_storeu_si128(dst_ptr.add(48) as *mut __m128i, rgba3); - } + let mut buffer: [u8; PART_SIZE] = [0u8; PART_SIZE]; + + std::ptr::copy_nonoverlapping(rem.as_ptr(), buffer.as_mut_ptr(), rem.len()); - rem = rem.chunks_exact_mut(16 * 4).into_remainder(); + self.disassociate_chunk(&mut buffer); - unpremultiply_alpha_rgba_row_impl(rem); + std::ptr::copy_nonoverlapping(buffer.as_ptr(), rem.as_mut_ptr(), rem.len()); + } + } +} + +#[target_feature(enable = "avx2")] +unsafe fn avx_unpremultiply_alpha_rgba_impl_row( + in_place: &mut [u8], + executor: impl DisassociateAlpha, +) { + executor.disassociate(in_place); } #[target_feature(enable = "avx2")] @@ -308,12 +304,12 @@ unsafe fn avx_unpremultiply_alpha_rgba_impl( in_place .par_chunks_exact_mut(width * 4) .for_each(|row| unsafe { - 
avx_unpremultiply_alpha_rgba_impl_row(row); + avx_unpremultiply_alpha_rgba_impl_row(row, Avx2DisassociateAlpha::default()); }); }); } else { in_place.chunks_exact_mut(width * 4).for_each(|row| unsafe { - avx_unpremultiply_alpha_rgba_impl_row(row); + avx_unpremultiply_alpha_rgba_impl_row(row, Avx2DisassociateAlpha::default()); }); } } diff --git a/src/avx512/alpha_u8.rs b/src/avx512/alpha_u8.rs new file mode 100644 index 0000000..94d23cb --- /dev/null +++ b/src/avx512/alpha_u8.rs @@ -0,0 +1,373 @@ +/* + * Copyright (c) Radzivon Bartoshyk 01/2025. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +use crate::avx512::utils::{avx512_deinterleave_rgba, avx512_div_by255, avx512_interleave_rgba}; +use rayon::iter::{IndexedParallelIterator, ParallelIterator}; +use rayon::prelude::{ParallelSlice, ParallelSliceMut}; +use rayon::ThreadPool; +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +trait AssociateAlpha { + unsafe fn associate(&self, dst: &mut [u8], src: &[u8]); +} + +#[derive(Default)] +struct AssociateAlphaDefault {} + +impl AssociateAlphaDefault { + #[inline(always)] + unsafe fn associate_chunk(&self, dst: &mut [u8], src: &[u8]) { + let src_ptr = src.as_ptr(); + let rgba0 = _mm512_loadu_si512(src_ptr as *const _); + let rgba1 = _mm512_loadu_si512(src_ptr.add(64) as *const _); + let rgba2 = _mm512_loadu_si512(src_ptr.add(128) as *const _); + let rgba3 = _mm512_loadu_si512(src_ptr.add(128 + 64) as *const _); + let (rrr, ggg, bbb, aaa) = avx512_deinterleave_rgba::(rgba0, rgba1, rgba2, rgba3); + + let zeros = _mm512_setzero_si512(); + + let mut rrr_low = _mm512_unpacklo_epi8(rrr, zeros); + let mut rrr_high = _mm512_unpackhi_epi8(rrr, zeros); + + let mut ggg_low = _mm512_unpacklo_epi8(ggg, zeros); + let mut ggg_high = _mm512_unpackhi_epi8(ggg, zeros); + + let mut bbb_low = _mm512_unpacklo_epi8(bbb, zeros); + let mut bbb_high = _mm512_unpackhi_epi8(bbb, zeros); + + let aaa_low = _mm512_unpacklo_epi8(aaa, zeros); + let aaa_high = _mm512_unpackhi_epi8(aaa, zeros); + + rrr_low = avx512_div_by255(_mm512_mullo_epi16(rrr_low, aaa_low)); + rrr_high = avx512_div_by255(_mm512_mullo_epi16(rrr_high, aaa_high)); + ggg_low = avx512_div_by255(_mm512_mullo_epi16(ggg_low, aaa_low)); + ggg_high = avx512_div_by255(_mm512_mullo_epi16(ggg_high, aaa_high)); + bbb_low = avx512_div_by255(_mm512_mullo_epi16(bbb_low, aaa_low)); + bbb_high = avx512_div_by255(_mm512_mullo_epi16(bbb_high, aaa_high)); + + let rrr = _mm512_packus_epi16(rrr_low, rrr_high); + let ggg = _mm512_packus_epi16(ggg_low, ggg_high); + let bbb = _mm512_packus_epi16(bbb_low, bbb_high); + + let (rgba0, rgba1, rgba2, rgba3) = avx512_interleave_rgba::(rrr, ggg, bbb, aaa); + let dst_ptr = dst.as_mut_ptr(); + _mm512_storeu_si512(dst_ptr as *mut _, rgba0); + _mm512_storeu_si512(dst_ptr.add(64) as *mut _, rgba1); + _mm512_storeu_si512(dst_ptr.add(128) as *mut _, rgba2); + _mm512_storeu_si512(dst_ptr.add(128 + 64) as *mut _, rgba3); + } +} + +impl AssociateAlpha for AssociateAlphaDefault { + #[target_feature(enable = "avx512f", enable = "avx512bw")] + unsafe fn associate(&self, dst: &mut [u8], src: &[u8]) { + let mut rem = dst; + let mut src_rem = src; + + for (dst, src) in rem + .chunks_exact_mut(64 * 4) + .zip(src_rem.chunks_exact(64 * 4)) + { + self.associate_chunk(dst, src); + } + + rem = rem.chunks_exact_mut(64 * 4).into_remainder(); + src_rem = src_rem.chunks_exact(64 * 4).remainder(); + + if !rem.is_empty() { + const PART_SIZE: usize = 64 * 4; + assert!(src_rem.len() < PART_SIZE); + assert!(rem.len() < PART_SIZE); + assert_eq!(src_rem.len(), rem.len()); + + let mut buffer: [u8; PART_SIZE] = [0u8; PART_SIZE]; + let mut dst_buffer: [u8; PART_SIZE] = [0u8; PART_SIZE]; + std::ptr::copy_nonoverlapping(src_rem.as_ptr(), buffer.as_mut_ptr(), src_rem.len()); + + self.associate_chunk(&mut dst_buffer, &buffer); + + std::ptr::copy_nonoverlapping(dst_buffer.as_ptr(), rem.as_mut_ptr(), rem.len()); + } + } +} + +impl AssociateAlpha for AssociateAlphaDefault { + #[target_feature(enable = "avx512f", enable = "avx512bw", enable = "avx512vbmi")] + unsafe fn associate(&self, dst: &mut [u8], src: 
&[u8]) { + let mut rem = dst; + let mut src_rem = src; + + for (dst, src) in rem + .chunks_exact_mut(64 * 4) + .zip(src_rem.chunks_exact(64 * 4)) + { + self.associate_chunk(dst, src); + } + + rem = rem.chunks_exact_mut(64 * 4).into_remainder(); + src_rem = src_rem.chunks_exact(64 * 4).remainder(); + + if !rem.is_empty() { + const PART_SIZE: usize = 64 * 4; + assert!(src_rem.len() < PART_SIZE); + assert!(rem.len() < PART_SIZE); + assert_eq!(src_rem.len(), rem.len()); + + let mut buffer: [u8; PART_SIZE] = [0u8; PART_SIZE]; + let mut dst_buffer: [u8; PART_SIZE] = [0u8; PART_SIZE]; + std::ptr::copy_nonoverlapping(src_rem.as_ptr(), buffer.as_mut_ptr(), src_rem.len()); + + self.associate_chunk(&mut dst_buffer, &buffer); + + std::ptr::copy_nonoverlapping(dst_buffer.as_ptr(), rem.as_mut_ptr(), rem.len()); + } + } +} + +fn avx_premultiply_alpha_rgba_impl_row(dst: &mut [u8], src: &[u8], executor: impl AssociateAlpha) { + unsafe { + executor.associate(dst, src); + } +} + +pub(crate) fn avx512_premultiply_alpha_rgba( + dst: &mut [u8], + src: &[u8], + width: usize, + _: usize, + pool: &Option, +) { + let has_vbmi = std::arch::is_x86_feature_detected!("avx512vbmi"); + if let Some(pool) = pool { + pool.install(|| { + dst.par_chunks_exact_mut(width * 4) + .zip(src.par_chunks_exact(width * 4)) + .for_each(|(dst, src)| { + if has_vbmi { + avx_premultiply_alpha_rgba_impl_row( + dst, + src, + AssociateAlphaDefault::::default(), + ); + } else { + avx_premultiply_alpha_rgba_impl_row( + dst, + src, + AssociateAlphaDefault::::default(), + ); + } + }); + }); + } else { + dst.chunks_exact_mut(width * 4) + .zip(src.chunks_exact(width * 4)) + .for_each(|(dst, src)| { + if has_vbmi { + avx_premultiply_alpha_rgba_impl_row( + dst, + src, + AssociateAlphaDefault::::default(), + ); + } else { + avx_premultiply_alpha_rgba_impl_row( + dst, + src, + AssociateAlphaDefault::::default(), + ); + } + }); + } +} + +trait DisassociateAlpha { + unsafe fn disassociate(&self, in_place: &mut [u8]); +} + +#[derive(Default)] +struct Avx512DisassociateAlpha {} + +impl Avx512DisassociateAlpha { + #[inline(always)] + unsafe fn avx512_unpremultiply_row(&self, x: __m512i, a: __m512i) -> __m512i { + let zeros = _mm512_setzero_si512(); + let lo = _mm512_unpacklo_epi8(x, zeros); + let hi = _mm512_unpackhi_epi8(x, zeros); + + let is_zero_mask = _mm512_cmp_epi8_mask::<0>(a, zeros); + + let scale_ps = _mm512_set1_ps(255f32); + + let lo_lo = _mm512_mul_ps( + _mm512_cvtepi32_ps(_mm512_unpacklo_epi16(lo, zeros)), + scale_ps, + ); + let lo_hi = _mm512_mul_ps( + _mm512_cvtepi32_ps(_mm512_unpackhi_epi16(lo, zeros)), + scale_ps, + ); + let hi_lo = _mm512_mul_ps( + _mm512_cvtepi32_ps(_mm512_unpacklo_epi16(hi, zeros)), + scale_ps, + ); + let hi_hi = _mm512_mul_ps( + _mm512_cvtepi32_ps(_mm512_unpackhi_epi16(hi, zeros)), + scale_ps, + ); + let a_lo = _mm512_unpacklo_epi8(a, zeros); + let a_hi = _mm512_unpackhi_epi8(a, zeros); + + let a_lo_lo = _mm512_rcp14_ps(_mm512_cvtepi32_ps(_mm512_unpacklo_epi16(a_lo, zeros))); + let a_lo_hi = _mm512_rcp14_ps(_mm512_cvtepi32_ps(_mm512_unpackhi_epi16(a_lo, zeros))); + let a_hi_lo = _mm512_rcp14_ps(_mm512_cvtepi32_ps(_mm512_unpacklo_epi16(a_hi, zeros))); + let a_hi_hi = _mm512_rcp14_ps(_mm512_cvtepi32_ps(_mm512_unpackhi_epi16(a_hi, zeros))); + + const FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC; + let lo_lo = _mm512_cvtps_epi32(_mm512_mul_round_ps::(lo_lo, a_lo_lo)); + let lo_hi = _mm512_cvtps_epi32(_mm512_mul_round_ps::(lo_hi, a_lo_hi)); + let hi_lo = _mm512_cvtps_epi32(_mm512_mul_round_ps::(hi_lo, 
a_hi_lo)); + let hi_hi = _mm512_cvtps_epi32(_mm512_mul_round_ps::(hi_hi, a_hi_hi)); + + let packed = _mm512_packus_epi16( + _mm512_packus_epi32(lo_lo, lo_hi), + _mm512_packus_epi32(hi_lo, hi_hi), + ); + _mm512_mask_blend_epi8(is_zero_mask, packed, _mm512_setzero_si512()) + } + + #[inline(always)] + unsafe fn disassociate_chunk(&self, in_place: &mut [u8]) { + let src_ptr = in_place.as_ptr(); + let rgba0 = _mm512_loadu_si512(src_ptr as *const _); + let rgba1 = _mm512_loadu_si512(src_ptr.add(64) as *const _); + let rgba2 = _mm512_loadu_si512(src_ptr.add(128) as *const _); + let rgba3 = _mm512_loadu_si512(src_ptr.add(64 + 128) as *const _); + let (rrr, ggg, bbb, aaa) = avx512_deinterleave_rgba::(rgba0, rgba1, rgba2, rgba3); + + let rrr = self.avx512_unpremultiply_row(rrr, aaa); + let ggg = self.avx512_unpremultiply_row(ggg, aaa); + let bbb = self.avx512_unpremultiply_row(bbb, aaa); + + let (rgba0, rgba1, rgba2, rgba3) = avx512_interleave_rgba::(rrr, ggg, bbb, aaa); + + let dst_ptr = in_place.as_mut_ptr(); + _mm512_storeu_si512(dst_ptr as *mut _, rgba0); + _mm512_storeu_si512(dst_ptr.add(64) as *mut _, rgba1); + _mm512_storeu_si512(dst_ptr.add(128) as *mut _, rgba2); + _mm512_storeu_si512(dst_ptr.add(128 + 64) as *mut _, rgba3); + } +} + +impl DisassociateAlpha for Avx512DisassociateAlpha { + #[target_feature(enable = "avx512f", enable = "avx512bw")] + unsafe fn disassociate(&self, in_place: &mut [u8]) { + let mut rem = in_place; + + for dst in rem.chunks_exact_mut(64 * 4) { + self.disassociate_chunk(dst); + } + + rem = rem.chunks_exact_mut(64 * 4).into_remainder(); + + if !rem.is_empty() { + const PART_SIZE: usize = 64 * 4; + assert!(rem.len() < PART_SIZE); + + let mut buffer: [u8; PART_SIZE] = [0u8; PART_SIZE]; + + std::ptr::copy_nonoverlapping(rem.as_ptr(), buffer.as_mut_ptr(), rem.len()); + + self.disassociate_chunk(&mut buffer); + + std::ptr::copy_nonoverlapping(buffer.as_ptr(), rem.as_mut_ptr(), rem.len()); + } + } +} + +impl DisassociateAlpha for Avx512DisassociateAlpha { + #[target_feature(enable = "avx512f", enable = "avx512bw", enable = "avx512vbmi")] + unsafe fn disassociate(&self, in_place: &mut [u8]) { + let mut rem = in_place; + + for dst in rem.chunks_exact_mut(64 * 4) { + self.disassociate_chunk(dst); + } + + rem = rem.chunks_exact_mut(64 * 4).into_remainder(); + + if !rem.is_empty() { + const PART_SIZE: usize = 64 * 4; + assert!(rem.len() < PART_SIZE); + + let mut buffer: [u8; PART_SIZE] = [0u8; PART_SIZE]; + + std::ptr::copy_nonoverlapping(rem.as_ptr(), buffer.as_mut_ptr(), rem.len()); + + self.disassociate_chunk(&mut buffer); + + std::ptr::copy_nonoverlapping(buffer.as_ptr(), rem.as_mut_ptr(), rem.len()); + } + } +} + +#[target_feature(enable = "avx512f", enable = "avx512bw")] +unsafe fn avx512_unp_row(in_place: &mut [u8], executor: impl DisassociateAlpha) { + executor.disassociate(in_place); +} +pub(crate) fn avx512_unpremultiply_alpha_rgba( + in_place: &mut [u8], + width: usize, + _: usize, + pool: &Option, +) { + let has_vbmi = std::arch::is_x86_feature_detected!("avx512vbmi"); + + if let Some(pool) = pool { + pool.install(|| { + in_place + .par_chunks_exact_mut(width * 4) + .for_each(|row| unsafe { + if has_vbmi { + avx512_unp_row(row, Avx512DisassociateAlpha::::default()); + } else { + avx512_unp_row(row, Avx512DisassociateAlpha::::default()); + } + }); + }); + } else { + in_place.chunks_exact_mut(width * 4).for_each(|row| unsafe { + if has_vbmi { + avx512_unp_row(row, Avx512DisassociateAlpha::::default()); + } else { + avx512_unp_row(row, 
Avx512DisassociateAlpha::::default()); + } + }); + } +} diff --git a/src/avx512/avx512_setr.rs b/src/avx512/avx512_setr.rs new file mode 100644 index 0000000..92336bc --- /dev/null +++ b/src/avx512/avx512_setr.rs @@ -0,0 +1,295 @@ +/* + * Copyright (c) Radzivon Bartoshyk, 10/2024. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#![allow(dead_code)] + +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +#[inline(always)] +pub(crate) unsafe fn _v512_set_epu32( + a15: i64, + a14: i64, + a13: i64, + a12: i64, + a11: i64, + a10: i64, + a9: i64, + a8: i64, + a7: i64, + a6: i64, + a5: i64, + a4: i64, + a3: i64, + a2: i64, + a1: i64, + a0: i64, +) -> __m512i { + _mm512_set_epi64( + ((a15) << 32) | (a14), + ((a13) << 32) | (a12), + ((a11) << 32) | (a10), + ((a9) << 32) | (a8), + ((a7) << 32) | (a6), + ((a5) << 32) | (a4), + ((a3) << 32) | (a2), + ((a1) << 32) | (a0), + ) +} + +#[inline(always)] +pub(crate) unsafe fn _v512_set_epu16( + a31: i64, + a30: i64, + a29: i64, + a28: i64, + a27: i64, + a26: i64, + a25: i64, + a24: i64, + a23: i64, + a22: i64, + a21: i64, + a20: i64, + a19: i64, + a18: i64, + a17: i64, + a16: i64, + a15: i64, + a14: i64, + a13: i64, + a12: i64, + a11: i64, + a10: i64, + a9: i64, + a8: i64, + a7: i64, + a6: i64, + a5: i64, + a4: i64, + a3: i64, + a2: i64, + a1: i64, + a0: i64, +) -> __m512i { + _v512_set_epu32( + ((a31) << 16) | (a30), + ((a29) << 16) | (a28), + ((a27) << 16) | (a26), + ((a25) << 16) | (a24), + ((a23) << 16) | (a22), + ((a21) << 16) | (a20), + ((a19) << 16) | (a18), + ((a17) << 16) | (a16), + ((a15) << 16) | (a14), + ((a13) << 16) | (a12), + ((a11) << 16) | (a10), + ((a9) << 16) | (a8), + ((a7) << 16) | (a6), + ((a5) << 16) | (a4), + ((a3) << 16) | (a2), + ((a1) << 16) | (a0), + ) +} + +#[inline(always)] +pub(crate) unsafe fn _v512_set_epu8( + a63: i64, + a62: i64, + a61: i64, + a60: i64, + a59: i64, + a58: i64, + a57: i64, + a56: i64, + a55: i64, + a54: i64, + a53: i64, + a52: i64, + a51: i64, + a50: i64, + a49: i64, + a48: i64, + a47: i64, + a46: i64, + a45: i64, + a44: i64, + a43: i64, + a42: i64, + a41: i64, + a40: i64, + a39: i64, + a38: i64, + a37: i64, + a36: i64, + a35: i64, + a34: i64, + a33: i64, + a32: i64, + a31: i64, + a30: i64, + a29: i64, + a28: i64, + a27: i64, + a26: i64, + a25: i64, + a24: i64, + a23: i64, + a22: i64, + a21: i64, + a20: i64, + a19: i64, + a18: i64, + a17: i64, + a16: i64, + a15: i64, + a14: i64, + a13: i64, + a12: i64, + a11: i64, + a10: i64, + a9: i64, + a8: i64, + a7: i64, + a6: i64, + a5: i64, + a4: i64, + a3: i64, + a2: i64, + a1: i64, + a0: i64, +) -> __m512i { + _v512_set_epu32( + ((a63) << 24) | ((a62) << 16) | ((a61) << 8) | (a60), + ((a59) << 24) | ((a58) << 16) | ((a57) << 8) | (a56), + ((a55) << 24) | ((a54) << 16) | ((a53) << 8) | (a52), + ((a51) << 24) | ((a50) << 16) | ((a49) << 8) | (a48), + ((a47) << 24) | ((a46) << 16) | ((a45) << 8) | (a44), + ((a43) << 24) | ((a42) << 16) | ((a41) << 8) | (a40), + ((a39) << 24) | ((a38) << 16) | ((a37) << 8) | (a36), + ((a35) << 24) | ((a34) << 16) | ((a33) << 8) | (a32), + ((a31) << 24) | ((a30) << 16) | ((a29) << 8) | (a28), + ((a27) << 24) | ((a26) << 16) | ((a25) << 8) | (a24), + ((a23) << 24) | ((a22) << 16) | (((a21) << 8) | (a20)), + ((a19) << 24) | ((a18) << 16) | ((a17) << 8) | (a16), + ((a15) << 24) | ((a14) << 16) | ((a13) << 8) | (a12), + ((a11) << 24) | ((a10) << 16) | ((a9) << 8) | (a8), + ((a7) << 24) | ((a6) << 16) | ((a5) << 8) | (a4), + ((a3) << 24) | ((a2) << 16) | ((a1) << 8) | (a0), + ) +} + +#[inline(always)] +pub(crate) unsafe fn _v512_setr_epu8( + a0: i64, + a1: i64, + a2: i64, + a3: i64, + a4: i64, + a5: i64, + a6: i64, + a7: i64, + a8: i64, + a9: i64, + a10: i64, + a11: i64, + a12: i64, + a13: i64, + a14: i64, + a15: i64, + a16: i64, + a17: i64, + a18: i64, + a19: i64, + a20: i64, + a21: i64, + a22: 
i64, + a23: i64, + a24: i64, + a25: i64, + a26: i64, + a27: i64, + a28: i64, + a29: i64, + a30: i64, + a31: i64, + a32: i64, + a33: i64, + a34: i64, + a35: i64, + a36: i64, + a37: i64, + a38: i64, + a39: i64, + a40: i64, + a41: i64, + a42: i64, + a43: i64, + a44: i64, + a45: i64, + a46: i64, + a47: i64, + a48: i64, + a49: i64, + a50: i64, + a51: i64, + a52: i64, + a53: i64, + a54: i64, + a55: i64, + a56: i64, + a57: i64, + a58: i64, + a59: i64, + a60: i64, + a61: i64, + a62: i64, + a63: i64, +) -> __m512i { + _v512_set_epu32( + ((a63) << 24) | ((a62) << 16) | ((a61) << 8) | (a60), + ((a59) << 24) | ((a58) << 16) | ((a57) << 8) | (a56), + ((a55) << 24) | ((a54) << 16) | ((a53) << 8) | (a52), + ((a51) << 24) | ((a50) << 16) | ((a49) << 8) | (a48), + ((a47) << 24) | ((a46) << 16) | ((a45) << 8) | (a44), + ((a43) << 24) | ((a42) << 16) | ((a41) << 8) | (a40), + ((a39) << 24) | ((a38) << 16) | ((a37) << 8) | (a36), + ((a35) << 24) | ((a34) << 16) | ((a33) << 8) | (a32), + ((a31) << 24) | ((a30) << 16) | ((a29) << 8) | (a28), + ((a27) << 24) | ((a26) << 16) | ((a25) << 8) | (a24), + ((a23) << 24) | ((a22) << 16) | (((a21) << 8) | (a20)), + ((a19) << 24) | ((a18) << 16) | ((a17) << 8) | (a16), + ((a15) << 24) | ((a14) << 16) | ((a13) << 8) | (a12), + ((a11) << 24) | ((a10) << 16) | ((a9) << 8) | (a8), + ((a7) << 24) | ((a6) << 16) | ((a5) << 8) | (a4), + ((a3) << 24) | ((a2) << 16) | ((a1) << 8) | (a0), + ) +} diff --git a/src/avx512/mod.rs b/src/avx512/mod.rs new file mode 100644 index 0000000..191b3f8 --- /dev/null +++ b/src/avx512/mod.rs @@ -0,0 +1,35 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +mod alpha_u8; +mod avx512_setr; +mod utils; +mod vertical_u8_lp; + +pub(crate) use alpha_u8::{avx512_premultiply_alpha_rgba, avx512_unpremultiply_alpha_rgba}; +pub(crate) use vertical_u8_lp::convolve_vertical_avx512_row_lp; diff --git a/src/avx512/utils.rs b/src/avx512/utils.rs new file mode 100644 index 0000000..87acf14 --- /dev/null +++ b/src/avx512/utils.rs @@ -0,0 +1,141 @@ +/* + * Copyright (c) Radzivon Bartoshyk 01/2025. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +use crate::avx512::avx512_setr::{_v512_set_epu32, _v512_set_epu8}; +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +#[inline(always)] +pub(crate) unsafe fn avx512_deinterleave_rgba( + bgra0: __m512i, + bgra1: __m512i, + bgra2: __m512i, + bgra3: __m512i, +) -> (__m512i, __m512i, __m512i, __m512i) { + if HAS_VBMI { + let mask0 = _v512_set_epu8( + 126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96, 94, 92, + 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64, 62, 60, 58, 56, 54, 52, 50, 48, + 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, + 0, + ); + let mask1 = _v512_set_epu8( + 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97, 95, 93, + 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, 63, 61, 59, 57, 55, 53, 51, 49, + 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, + 1, + ); + + let br01 = _mm512_permutex2var_epi8(bgra0, mask0, bgra1); + let ga01 = _mm512_permutex2var_epi8(bgra0, mask1, bgra1); + let br23 = _mm512_permutex2var_epi8(bgra2, mask0, bgra3); + let ga23 = _mm512_permutex2var_epi8(bgra2, mask1, bgra3); + + let a = _mm512_permutex2var_epi8(br01, mask0, br23); + let c = _mm512_permutex2var_epi8(br01, mask1, br23); + let b = _mm512_permutex2var_epi8(ga01, mask0, ga23); + let d = _mm512_permutex2var_epi8(ga01, mask1, ga23); + (a, b, c, d) + } else { + let mask = _mm512_set4_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400); + let b0g0r0a0 = _mm512_shuffle_epi8(bgra0, mask); + let b1g1r1a1 = _mm512_shuffle_epi8(bgra1, mask); + let b2g2r2a2 = _mm512_shuffle_epi8(bgra2, mask); + let b3g3r3a3 = _mm512_shuffle_epi8(bgra3, mask); + + let mask0 = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + let mask1 = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); + + let br01 = _mm512_permutex2var_epi32(b0g0r0a0, mask0, b1g1r1a1); + let ga01 = _mm512_permutex2var_epi32(b0g0r0a0, mask1, b1g1r1a1); + let br23 = _mm512_permutex2var_epi32(b2g2r2a2, mask0, b3g3r3a3); + let ga23 = _mm512_permutex2var_epi32(b2g2r2a2, mask1, b3g3r3a3); + + let a = _mm512_permutex2var_epi32(br01, mask0, br23); + let c = _mm512_permutex2var_epi32(br01, mask1, br23); + let b = _mm512_permutex2var_epi32(ga01, mask0, ga23); + let d = _mm512_permutex2var_epi32(ga01, mask1, ga23); + (a, b, c, d) + } +} + +#[inline(always)] +pub(crate) unsafe fn avx512_zip_epi8( + a: __m512i, + b: __m512i, +) -> (__m512i, __m512i) { + if HAS_VBMI { + let mask0 = _v512_set_epu8( + 95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, 87, 23, 86, 22, 85, 21, + 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, 79, 15, 78, 14, 77, 13, 76, 12, 75, 11, 74, 10, + 73, 9, 72, 8, 71, 7, 70, 6, 69, 5, 68, 4, 67, 3, 66, 2, 65, 1, 64, 0, + ); + let ab0 = _mm512_permutex2var_epi8(a, mask0, b); + let mask1 = _v512_set_epu8( + 127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56, 119, 55, 118, + 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48, 111, 47, 110, 46, 109, 45, + 108, 44, 107, 43, 106, 42, 105, 41, 104, 40, 103, 39, 102, 38, 101, 37, 100, 36, 99, + 35, 98, 34, 97, 33, 96, 32, + ); + let ab1 = _mm512_permutex2var_epi8(a, mask1, b); + (ab0, ab1) + } else { + let low = _mm512_unpacklo_epi8(a, b); + let high = _mm512_unpackhi_epi8(a, b); + let ab0 = _mm512_permutex2var_epi64(low, _mm512_set_epi64(11, 10, 3, 2, 9, 8, 1, 0), high); + let ab1 = + 
_mm512_permutex2var_epi64(low, _mm512_set_epi64(15, 14, 7, 6, 13, 12, 5, 4), high); + (ab0, ab1) + } +} + +#[inline(always)] +pub(crate) unsafe fn avx512_interleave_rgba( + a: __m512i, + b: __m512i, + c: __m512i, + d: __m512i, +) -> (__m512i, __m512i, __m512i, __m512i) { + let (br01, br23) = avx512_zip_epi8::(a, c); + let (ga01, ga23) = avx512_zip_epi8::(b, d); + let (bgra0, bgra1) = avx512_zip_epi8::(br01, ga01); + let (bgra2, bgra3) = avx512_zip_epi8::(br23, ga23); + (bgra0, bgra1, bgra2, bgra3) +} + +/// Exact division by 255 with rounding to nearest +#[inline(always)] +pub(crate) unsafe fn avx512_div_by255(v: __m512i) -> __m512i { + let addition = _mm512_set1_epi16(127); + _mm512_srli_epi16::<8>(_mm512_add_epi16( + _mm512_add_epi16(v, addition), + _mm512_srli_epi16::<8>(v), + )) +} diff --git a/src/avx512/vertical_u8_lp.rs b/src/avx512/vertical_u8_lp.rs new file mode 100644 index 0000000..4fa8b74 --- /dev/null +++ b/src/avx512/vertical_u8_lp.rs @@ -0,0 +1,393 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +use crate::avx2::utils::avx2_pack_u16; +use crate::filter_weights::FilterBounds; +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +pub(crate) fn convolve_vertical_avx512_row_lp( + dst_width: usize, + bounds: &FilterBounds, + src: &[u8], + dst: &mut [u8], + src_stride: usize, + weights: &[i16], +) { + unsafe { + convolve_vertical_avx2_row_impl(dst_width, bounds, src, dst, src_stride, weights); + } +} + +#[inline(always)] +unsafe fn m256dot( + store0: __m256i, + store1: __m256i, + row: __m256i, + weight: __m256i, +) -> (__m256i, __m256i) { + let lo = _mm256_unpacklo_epi8(row, row); + let hi = _mm256_unpackhi_epi8(row, row); + + let store0 = _mm256_add_epi16( + store0, + _mm256_mulhrs_epi16(_mm256_srli_epi16::<2>(lo), weight), + ); + let store1 = _mm256_add_epi16( + store1, + _mm256_mulhrs_epi16(_mm256_srli_epi16::<2>(hi), weight), + ); + (store0, store1) +} + +#[inline(always)] +unsafe fn m512dot( + store0: __m512i, + store1: __m512i, + row: __m512i, + weight: __m512i, +) -> (__m512i, __m512i) { + let lo = _mm512_unpacklo_epi8(row, row); + let hi = _mm512_unpackhi_epi8(row, row); + + let store0 = _mm512_add_epi16( + store0, + _mm512_mulhrs_epi16(_mm512_srli_epi16::<2>(lo), weight), + ); + let store1 = _mm512_add_epi16( + store1, + _mm512_mulhrs_epi16(_mm512_srli_epi16::<2>(hi), weight), + ); + (store0, store1) +} + +#[target_feature(enable = "avx512f", enable = "avx512bw")] +/// This inlining is required to activate all features for runtime dispatch +unsafe fn convolve_vertical_avx2_row_impl( + _: usize, + bounds: &FilterBounds, + src: &[u8], + dst: &mut [u8], + src_stride: usize, + weight: &[i16], +) { + let bounds_size = bounds.size; + const SCALE: u32 = 6; + const R_SHR_SCALE: i32 = SCALE as i32; + const AR_SHR_SCALE: u32 = SCALE; + const ROUNDING: i16 = 1 << (R_SHR_SCALE - 1); + + let mut cx = 0usize; + + let mut rem = dst; + + let iter_64 = rem.chunks_exact_mut(64); + + for dst in iter_64 { + let mut store0 = _mm512_set1_epi16(ROUNDING); + let mut store1 = _mm512_set1_epi16(ROUNDING); + + let px = cx; + + if bounds_size == 2 { + let py = bounds.start; + let weights = weight.get_unchecked(0..2); + let v_weight0 = _mm512_set1_epi16(weights[0]); + let v_weight1 = _mm512_set1_epi16(weights[1]); + let v_offset0 = src_stride * py + px; + let src_ptr0 = src.get_unchecked(v_offset0..); + let v_offset1 = src_stride * (py + 1) + px; + let src_ptr1 = src.get_unchecked(v_offset1..); + + let item_row0 = _mm512_loadu_si512(src_ptr0.as_ptr() as *const _); + (store0, store1) = m512dot(store0, store1, item_row0, v_weight0); + + let item_row1 = _mm512_loadu_si512(src_ptr1.as_ptr() as *const _); + (store0, store1) = m512dot(store0, store1, item_row1, v_weight1); + } else if bounds_size == 3 { + let py = bounds.start; + let weights = weight.get_unchecked(0..3); + let v_weight0 = _mm512_set1_epi16(weights[0]); + let v_weight1 = _mm512_set1_epi16(weights[1]); + let v_weight2 = _mm512_set1_epi16(weights[2]); + let v_offset0 = src_stride * py + px; + let src_ptr0 = src.get_unchecked(v_offset0..); + let v_offset1 = src_stride * (py + 1) + px; + let src_ptr1 = src.get_unchecked(v_offset1..); + let v_offset2 = src_stride * (py + 2) + px; + let src_ptr2 = src.get_unchecked(v_offset2..); + + let item_row0 = _mm512_loadu_si512(src_ptr0.as_ptr() as *const _); + (store0, store1) = m512dot(store0, store1, item_row0, v_weight0); + + let item_row1 = _mm512_loadu_si512(src_ptr1.as_ptr() as *const _); + (store0, store1) = m512dot(store0, store1, 
item_row1, v_weight1); + + let item_row2 = _mm512_loadu_si512(src_ptr2.as_ptr() as *const _); + (store0, store1) = m512dot(store0, store1, item_row2, v_weight2); + } else if bounds_size == 4 { + let py = bounds.start; + let weights = weight.get_unchecked(0..4); + let v_weight0 = _mm512_set1_epi16(weights[0]); + let v_weight1 = _mm512_set1_epi16(weights[1]); + let v_weight2 = _mm512_set1_epi16(weights[2]); + let v_weight3 = _mm512_set1_epi16(weights[3]); + let v_offset0 = src_stride * py + px; + let src_ptr0 = src.get_unchecked(v_offset0..); + let v_offset1 = src_stride * (py + 1) + px; + let src_ptr1 = src.get_unchecked(v_offset1..); + let v_offset2 = src_stride * (py + 2) + px; + let src_ptr2 = src.get_unchecked(v_offset2..); + let v_offset3 = src_stride * (py + 3) + px; + let src_ptr3 = src.get_unchecked(v_offset3..); + + let item_row0 = _mm512_loadu_si512(src_ptr0.as_ptr() as *const _); + (store0, store1) = m512dot(store0, store1, item_row0, v_weight0); + + let item_row1 = _mm512_loadu_si512(src_ptr1.as_ptr() as *const _); + (store0, store1) = m512dot(store0, store1, item_row1, v_weight1); + + let item_row2 = _mm512_loadu_si512(src_ptr2.as_ptr() as *const _); + (store0, store1) = m512dot(store0, store1, item_row2, v_weight2); + + let item_row3 = _mm512_loadu_si512(src_ptr3.as_ptr() as *const _); + (store0, store1) = m512dot(store0, store1, item_row3, v_weight3); + } else { + for j in 0..bounds_size { + let py = bounds.start + j; + let weight = weight.get_unchecked(j..(j + 1)); + let v_weight = _mm512_set1_epi16(weight[0]); + let v_offset = src_stride * py + px; + let src_ptr = src.get_unchecked(v_offset..); + let item_row0 = _mm512_loadu_si512(src_ptr.as_ptr() as *const _); + + (store0, store1) = m512dot(store0, store1, item_row0, v_weight); + } + } + + let rebased0 = _mm512_srai_epi16::(store0); + let rebased1 = _mm512_srai_epi16::(store1); + + let shrank0 = _mm512_packus_epi16(rebased0, rebased1); + + _mm512_storeu_si512(dst.as_mut_ptr() as *mut _, shrank0); + + cx += 64; + } + + rem = rem.chunks_exact_mut(64).into_remainder(); + + let iter_32 = rem.chunks_exact_mut(32); + + for dst in iter_32 { + let mut store0 = _mm256_set1_epi16(ROUNDING); + let mut store1 = _mm256_set1_epi16(ROUNDING); + + let px = cx; + + if bounds_size == 2 { + let py = bounds.start; + let weights = weight.get_unchecked(0..2); + let v_weight0 = _mm256_set1_epi16(weights[0]); + let v_weight1 = _mm256_set1_epi16(weights[1]); + let v_offset0 = src_stride * py + px; + let src_ptr0 = src.get_unchecked(v_offset0..); + let v_offset1 = src_stride * (py + 1) + px; + let src_ptr1 = src.get_unchecked(v_offset1..); + + let item_row0 = _mm256_loadu_si256(src_ptr0.as_ptr() as *const __m256i); + (store0, store1) = m256dot(store0, store1, item_row0, v_weight0); + + let item_row1 = _mm256_loadu_si256(src_ptr1.as_ptr() as *const __m256i); + (store0, store1) = m256dot(store0, store1, item_row1, v_weight1); + } else if bounds_size == 3 { + let py = bounds.start; + let weights = weight.get_unchecked(0..3); + let v_weight0 = _mm256_set1_epi16(weights[0]); + let v_weight1 = _mm256_set1_epi16(weights[1]); + let v_weight2 = _mm256_set1_epi16(weights[2]); + let v_offset0 = src_stride * py + px; + let src_ptr0 = src.get_unchecked(v_offset0..); + let v_offset1 = src_stride * (py + 1) + px; + let src_ptr1 = src.get_unchecked(v_offset1..); + let v_offset2 = src_stride * (py + 2) + px; + let src_ptr2 = src.get_unchecked(v_offset2..); + + let item_row0 = _mm256_loadu_si256(src_ptr0.as_ptr() as *const __m256i); + (store0, store1) = 
m256dot(store0, store1, item_row0, v_weight0);
+
+            let item_row1 = _mm256_loadu_si256(src_ptr1.as_ptr() as *const __m256i);
+            (store0, store1) = m256dot(store0, store1, item_row1, v_weight1);
+
+            let item_row2 = _mm256_loadu_si256(src_ptr2.as_ptr() as *const __m256i);
+            (store0, store1) = m256dot(store0, store1, item_row2, v_weight2);
+        } else if bounds_size == 4 {
+            let py = bounds.start;
+            let weights = weight.get_unchecked(0..4);
+            let v_weight0 = _mm256_set1_epi16(weights[0]);
+            let v_weight1 = _mm256_set1_epi16(weights[1]);
+            let v_weight2 = _mm256_set1_epi16(weights[2]);
+            let v_weight3 = _mm256_set1_epi16(weights[3]);
+            let v_offset0 = src_stride * py + px;
+            let src_ptr0 = src.get_unchecked(v_offset0..);
+            let v_offset1 = src_stride * (py + 1) + px;
+            let src_ptr1 = src.get_unchecked(v_offset1..);
+            let v_offset2 = src_stride * (py + 2) + px;
+            let src_ptr2 = src.get_unchecked(v_offset2..);
+            let v_offset3 = src_stride * (py + 3) + px;
+            let src_ptr3 = src.get_unchecked(v_offset3..);
+
+            let item_row0 = _mm256_loadu_si256(src_ptr0.as_ptr() as *const __m256i);
+            (store0, store1) = m256dot(store0, store1, item_row0, v_weight0);
+
+            let item_row1 = _mm256_loadu_si256(src_ptr1.as_ptr() as *const __m256i);
+            (store0, store1) = m256dot(store0, store1, item_row1, v_weight1);
+
+            let item_row2 = _mm256_loadu_si256(src_ptr2.as_ptr() as *const __m256i);
+            (store0, store1) = m256dot(store0, store1, item_row2, v_weight2);
+
+            let item_row3 = _mm256_loadu_si256(src_ptr3.as_ptr() as *const __m256i);
+            (store0, store1) = m256dot(store0, store1, item_row3, v_weight3);
+        } else {
+            for j in 0..bounds_size {
+                let py = bounds.start + j;
+                let weight = weight.get_unchecked(j..(j + 1));
+                let v_weight = _mm256_set1_epi16(weight[0]);
+                let v_offset = src_stride * py + px;
+                let src_ptr = src.get_unchecked(v_offset..);
+                let item_row0 = _mm256_loadu_si256(src_ptr.as_ptr() as *const __m256i);
+
+                (store0, store1) = m256dot(store0, store1, item_row0, v_weight);
+            }
+        }
+
+        let rebased0 = _mm256_srai_epi16::(store0);
+        let rebased1 = _mm256_srai_epi16::(store1);
+
+        let shrank0 = _mm256_packus_epi16(rebased0, rebased1);
+        _mm256_storeu_si256(dst.as_mut_ptr() as *mut __m256i, shrank0);
+
+        cx += 32;
+    }
+
+    rem = rem.chunks_exact_mut(32).into_remainder();
+    let iter_16 = rem.chunks_exact_mut(16);
+
+    for dst in iter_16 {
+        let mut store0 = _mm256_set1_epi16(ROUNDING);
+
+        let px = cx;
+
+        for j in 0..bounds_size {
+            let py = bounds.start + j;
+            let weight = weight.get_unchecked(j..(j + 1));
+            let v_weight = _mm256_set1_epi16(weight[0]);
+            let v_offset = src_stride * py + px;
+            let src_ptr = src.get_unchecked(v_offset..);
+            let mut item_row = _mm256_permute4x64_epi64::<0x50>(_mm256_castsi128_si256(
+                _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i),
+            ));
+            item_row = _mm256_unpacklo_epi8(item_row, item_row);
+            store0 = _mm256_add_epi16(
+                store0,
+                _mm256_mulhrs_epi16(_mm256_srli_epi16::<2>(item_row), v_weight),
+            );
+        }
+
+        store0 = _mm256_srai_epi16::(store0);
+
+        let packed = avx2_pack_u16(store0, store0);
+
+        let rebased0 = _mm256_castsi256_si128(packed);
+        _mm_storeu_si128(dst.as_mut_ptr() as *mut __m128i, rebased0);
+
+        cx += 16;
+    }
+
+    rem = rem.chunks_exact_mut(16).into_remainder();
+    let iter_8 = rem.chunks_exact_mut(8);
+
+    for dst in iter_8 {
+        let mut store = _mm_set1_epi16(ROUNDING);
+
+        let px = cx;
+
+        for j in 0..bounds_size {
+            let py = bounds.start + j;
+            let weight = weight.get_unchecked(j..(j + 1));
+            let v_weight = _mm_set1_epi16(weight[0]);
+            let v_offset = src_stride * py + px;
+            let src_ptr = src.get_unchecked(v_offset..);
+            let mut item_row = _mm_loadu_si64(src_ptr.as_ptr());
+            item_row = _mm_unpacklo_epi8(item_row, item_row);
+
+            let low = _mm_srli_epi16::<2>(item_row);
+            store = _mm_add_epi16(store, _mm_mulhrs_epi16(low, v_weight));
+        }
+
+        let rebased = _mm_srai_epi16::(store);
+        let shrank = _mm_packus_epi16(rebased, rebased);
+        _mm_storeu_si64(dst.as_mut_ptr(), shrank);
+
+        cx += 8;
+    }
+
+    rem = rem.chunks_exact_mut(8).into_remainder();
+    let iter_1 = rem.iter_mut();
+
+    for dst in iter_1 {
+        let mut store = _mm_set1_epi16(ROUNDING);
+
+        let px = cx;
+
+        for j in 0..bounds_size {
+            let py = bounds.start + j;
+            let weight = weight.get_unchecked(j..(j + 1));
+            let v_weight = _mm_set1_epi16(weight[0]);
+            let v_offset = src_stride * py + px;
+            let src_ptr = src.get_unchecked(v_offset..(v_offset + 1));
+            let item_row = _mm_set1_epi8(src_ptr[0] as i8);
+
+            store = _mm_add_epi16(
+                store,
+                _mm_mulhrs_epi16(
+                    _mm_srli_epi16::<2>(_mm_unpacklo_epi8(item_row, item_row)),
+                    v_weight,
+                ),
+            );
+        }
+
+        let rebased = _mm_srai_epi16::(store);
+        let value = _mm_extract_epi8::<0>(_mm_packus_epi16(rebased, rebased));
+        *dst = value as u8;
+
+        cx += 1;
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 1824e9f..9e52411 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -29,6 +29,9 @@
 #![deny(deprecated)]
 // #![deny(unreachable_code, unused)]
 #![allow(clippy::too_many_arguments)]
+#![cfg_attr(feature = "nightly_avx512", feature(cfg_version))]
+#![cfg_attr(feature = "nightly_avx512", feature(avx512_target_feature))]
+#![cfg_attr(feature = "nightly_avx512", feature(stdarch_x86_avx512))]
 mod alpha_check;
 #[cfg(feature = "half")]
 mod alpha_handle_f16;
@@ -38,6 +41,11 @@ mod alpha_handle_u8;
 mod ar30;
 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 mod avx2;
+#[cfg(all(
+    any(target_arch = "x86_64", target_arch = "x86"),
+    feature = "nightly_avx512"
+))]
+mod avx512;
 mod color_group;
 #[cfg(feature = "colorspaces")]
 mod colors;
diff --git a/src/plane_u8.rs b/src/plane_u8.rs
index 099febe..dc57c41 100644
--- a/src/plane_u8.rs
+++ b/src/plane_u8.rs
@@ -120,6 +120,13 @@ impl VerticalConvolutionPass for ImageStore<'_, u8, 1> {
                     _dispatcher = convolve_vertical_avx_row;
                 }
             }
+            #[cfg(feature = "nightly_avx512")]
+            if std::arch::is_x86_feature_detected!("avx512bw") {
+                if _scale_factor < 8. {
+                    use crate::avx512::convolve_vertical_avx512_row_lp;
+                    _dispatcher = convolve_vertical_avx512_row_lp;
+                }
+            }
         }
         #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
         {
diff --git a/src/rgb_u8.rs b/src/rgb_u8.rs
index dae1038..29db8eb 100644
--- a/src/rgb_u8.rs
+++ b/src/rgb_u8.rs
@@ -119,6 +119,13 @@ impl VerticalConvolutionPass for ImageStore<'_, u8, 3> {
                     _dispatcher = convolve_vertical_avx_row;
                 }
             }
+            #[cfg(feature = "nightly_avx512")]
+            if std::arch::is_x86_feature_detected!("avx512bw") {
+                if _scale_factor < 8. {
+                    use crate::avx512::convolve_vertical_avx512_row_lp;
+                    _dispatcher = convolve_vertical_avx512_row_lp;
+                }
+            }
         }
         #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
         {
diff --git a/src/rgba_u8.rs b/src/rgba_u8.rs
index 14d3a23..0a3398e 100644
--- a/src/rgba_u8.rs
+++ b/src/rgba_u8.rs
@@ -141,6 +141,13 @@ impl VerticalConvolutionPass for ImageStore<'_, u8, 4> {
                     _dispatcher = convolve_vertical_avx_row;
                 }
             }
+            #[cfg(feature = "nightly_avx512")]
+            if std::arch::is_x86_feature_detected!("avx512bw") {
+                if _scale_factor < 8. {
+                    use crate::avx512::convolve_vertical_avx512_row_lp;
+                    _dispatcher = convolve_vertical_avx512_row_lp;
+                }
+            }
         }
         #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
         {
diff --git a/src/sse/alpha_u16.rs b/src/sse/alpha_u16.rs
index d5f5c33..efb1c01 100644
--- a/src/sse/alpha_u16.rs
+++ b/src/sse/alpha_u16.rs
@@ -121,8 +121,7 @@ trait DisassociateAlpha {
 struct DisassociateAlphaDefault {}
 
 impl DisassociateAlphaDefault {
-    #[inline]
-    #[target_feature(enable = "sse4.1")]
+    #[inline(always)]
     unsafe fn disassociate_chunk(&self, in_place: &mut [u16], v_max_colors: __m128) {
         let src_ptr = in_place.as_ptr();
         let row0 = _mm_loadu_si128(src_ptr as *const __m128i);
@@ -362,8 +361,7 @@ impl Sse41PremultiplyExecutor
 struct Sse41PremultiplyExecutorAny {}
 
 impl Sse41PremultiplyExecutorAny {
-    #[inline]
-    #[target_feature(enable = "sse4.1")]
+    #[inline(always)]
     unsafe fn premultiply_chunk(&self, dst: &mut [u16], src: &[u16], scale: __m128) {
         let src_ptr = src.as_ptr();
         let row0 = _mm_loadu_si128(src_ptr as *const __m128i);
diff --git a/src/sse/alpha_u8.rs b/src/sse/alpha_u8.rs
index 6e23821..8fba54c 100644
--- a/src/sse/alpha_u8.rs
+++ b/src/sse/alpha_u8.rs
@@ -64,10 +64,7 @@ pub(crate) unsafe fn sse_unpremultiply_row(x: __m128i, a: __m128i) -> __m128i {
     let lo = _mm_unpacklo_epi8(x, zeros);
     let hi = _mm_unpackhi_epi8(x, zeros);
 
-    let scale = _mm_set1_epi16(255);
-    let is_zero_mask = _mm_cmpeq_epi8(a, zeros);
-    let a = _mm_select_si128(is_zero_mask, scale, a);
 
     let scale_ps = _mm_set1_ps(255f32);
@@ -114,8 +111,7 @@ trait Sse41PremultiplyExecutorRgba8 {
 struct Sse41PremultiplyExecutor8Default {}
 
 impl Sse41PremultiplyExecutor8Default {
-    #[inline]
-    #[target_feature(enable = "sse4.1")]
+    #[inline(always)]
     unsafe fn premultiply_chunk(&self, dst: &mut [u8], src: &[u8]) {
         let zeros = _mm_setzero_si128();
         let src_ptr = src.as_ptr();
@@ -254,8 +250,7 @@ trait DisassociateAlpha {
 struct DisassociateAlphaDefault {}
 
 impl DisassociateAlphaDefault {
-    #[inline]
-    #[target_feature(enable = "sse4.1")]
+    #[inline(always)]
     unsafe fn disassociate_chunk(&self, in_place: &mut [u8]) {
         let src_ptr = in_place.as_ptr();
         let rgba0 = _mm_loadu_si128(src_ptr as *const __m128i);
diff --git a/src/sse/mod.rs b/src/sse/mod.rs
index 45417dd..30fd525 100644
--- a/src/sse/mod.rs
+++ b/src/sse/mod.rs
@@ -64,10 +64,7 @@ pub(crate) use alpha_f16::{sse_premultiply_alpha_rgba_f16, sse_unpremultiply_alp
 pub(crate) use alpha_f32::sse_premultiply_alpha_rgba_f32;
 pub(crate) use alpha_f32::sse_unpremultiply_alpha_rgba_f32;
 pub(crate) use alpha_u16::{premultiply_alpha_sse_rgba_u16, unpremultiply_alpha_sse_rgba_u16};
-pub(crate) use alpha_u8::{
-    _mm_div_by_255_epi16, sse_premultiply_alpha_rgba, sse_unpremultiply_alpha_rgba,
-    sse_unpremultiply_row,
-};
+pub(crate) use alpha_u8::{sse_premultiply_alpha_rgba, sse_unpremultiply_alpha_rgba};
 pub(crate) use check_alpha::{
     sse_has_non_constant_cap_alpha_rgba16, sse_has_non_constant_cap_alpha_rgba8,
 };
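// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch above): the runtime-dispatch
// pattern the new AVX-512 paths follow — compile-time gating on the
// `nightly_avx512` feature plus runtime CPU detection, layered on top of the
// existing scalar/AVX2 fallback chain. All names below (`RowHandler`,
// `scalar_row`, `avx2_row`, `avx512_row`, `pick_row_handler`) are hypothetical
// stand-ins, not pic-scale's real API; only the feature name, the detected
// CPU flags, and the `_scale_factor < 8.` guard come from the patch itself.
// ---------------------------------------------------------------------------

/// Signature shared by all row handlers in this sketch.
type RowHandler = fn(dst: &mut [u8], src: &[u8]);

fn scalar_row(dst: &mut [u8], src: &[u8]) {
    // Trivial placeholder body; a real handler would perform the convolution.
    dst.copy_from_slice(src);
}

fn avx2_row(dst: &mut [u8], src: &[u8]) {
    // Placeholder for the AVX2 path.
    dst.copy_from_slice(src);
}

#[cfg(feature = "nightly_avx512")]
fn avx512_row(dst: &mut [u8], src: &[u8]) {
    // Placeholder for the low-precision AVX-512 path.
    dst.copy_from_slice(src);
}

/// Picks the best available handler: scalar fallback first, AVX2 when the CPU
/// reports it, and AVX-512 only when the crate was built with
/// `nightly_avx512`, the CPU reports `avx512bw`, and the scale factor is
/// moderate (mirroring the `_scale_factor < 8.` guard in the patch).
fn pick_row_handler(_scale_factor: f32) -> RowHandler {
    let mut _dispatcher: RowHandler = scalar_row;
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    {
        if std::arch::is_x86_feature_detected!("avx2") {
            _dispatcher = avx2_row;
        }
        #[cfg(feature = "nightly_avx512")]
        if std::arch::is_x86_feature_detected!("avx512bw") && _scale_factor < 8. {
            _dispatcher = avx512_row;
        }
    }
    _dispatcher
}

fn main() {
    let src = vec![0u8; 64];
    let mut dst = vec![0u8; 64];
    // Detection happens once; the chosen function pointer is then reused per row.
    let handler = pick_row_handler(2.0);
    handler(&mut dst, &src);
}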