diff --git a/.github/workflows/build_push.yml b/.github/workflows/build_push.yml index a72a34b..2fdcdcc 100644 --- a/.github/workflows/build_push.yml +++ b/.github/workflows/build_push.yml @@ -44,6 +44,17 @@ jobs: - uses: dtolnay/rust-toolchain@stable - run: cargo clippy + tests: + name: Testing + strategy: + matrix: + os: [ ubuntu-latest, macos-latest ] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@nightly + - run: cargo test + fuzz_rgba_8bit: name: Fuzzing 8bit strategy: diff --git a/app/benches/resize_rgba/main.rs b/app/benches/resize_rgba/main.rs index 3ac926c..93b8f04 100644 --- a/app/benches/resize_rgba/main.rs +++ b/app/benches/resize_rgba/main.rs @@ -4,7 +4,8 @@ use fast_image_resize::FilterType::Lanczos3; use fast_image_resize::{CpuExtensions, PixelType, ResizeAlg, ResizeOptions, Resizer}; use image::{GenericImageView, ImageReader}; use pic_scale::{ - ImageStore, ImageStoreMut, ResamplingFunction, Scaler, Scaling, ScalingF32, ThreadingPolicy, + ImageStore, ImageStoreMut, ResamplingFunction, Scaler, Scaling, ScalingF32, ScalingU16, + ThreadingPolicy, }; pub fn criterion_benchmark(c: &mut Criterion) { @@ -14,114 +15,164 @@ pub fn criterion_benchmark(c: &mut Criterion) { .unwrap(); let dimensions = img.dimensions(); let src_bytes = img.as_bytes(); - c.bench_function("Pic scale RGBA with alpha: Lanczos 3", |b| { - let mut copied: Vec = Vec::from(src_bytes); - b.iter(|| { - let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); - scaler.set_threading_policy(ThreadingPolicy::Single); - let store = ImageStore::::from_slice( - &mut copied, - dimensions.0 as usize, - dimensions.1 as usize, - ) - .unwrap(); - let mut target = - ImageStoreMut::alloc(dimensions.0 as usize / 4, dimensions.1 as usize / 4); - _ = scaler.resize_rgba(&store, &mut target, true); - }) - }); - - let f32_image: Vec = src_bytes.iter().map(|&x| x as f32 / 255f32).collect(); + // c.bench_function("Pic scale RGBA with alpha: Lanczos 3", |b| { + // let mut copied: Vec = Vec::from(src_bytes); + // b.iter(|| { + // let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); + // scaler.set_threading_policy(ThreadingPolicy::Single); + // let store = ImageStore::::from_slice( + // &mut copied, + // dimensions.0 as usize, + // dimensions.1 as usize, + // ) + // .unwrap(); + // let mut target = + // ImageStoreMut::alloc(dimensions.0 as usize / 4, dimensions.1 as usize / 4); + // _ = scaler.resize_rgba(&store, &mut target, true); + // }) + // }); + // + // let f32_image: Vec = src_bytes.iter().map(|&x| x as f32 / 255f32).collect(); + // + // c.bench_function("Pic scale RGBA with alpha f32: Lanczos 3", |b| { + // let mut copied: Vec = Vec::from(f32_image.clone()); + // b.iter(|| { + // let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); + // scaler.set_threading_policy(ThreadingPolicy::Single); + // let store = ImageStore::::from_slice( + // &mut copied, + // dimensions.0 as usize, + // dimensions.1 as usize, + // ) + // .unwrap(); + // let mut target = + // ImageStoreMut::alloc(dimensions.0 as usize / 4, dimensions.1 as usize / 4); + // _ = scaler.resize_rgba_f32(&store, &mut target, false); + // }) + // }); + // + // c.bench_function("Fast image resize RGBA with alpha: Lanczos 3", |b| { + // let mut vc = Vec::from(img.as_bytes()); + // b.iter(|| { + // let pixel_type: PixelType = PixelType::U8x4; + // let src_image = + // Image::from_slice_u8(dimensions.0, dimensions.1, &mut vc, pixel_type).unwrap(); + // let mut dst_image = Image::new(dimensions.0 / 2, 
dimensions.1 / 2, pixel_type); + // + // let mut resizer = Resizer::new(); + // #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + // unsafe { + // resizer.set_cpu_extensions(CpuExtensions::Neon); + // } + // #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + // unsafe { + // resizer.set_cpu_extensions(CpuExtensions::Avx2); + // } + // resizer + // .resize( + // &src_image, + // &mut dst_image, + // &ResizeOptions::new() + // .resize_alg(ResizeAlg::Convolution(Lanczos3)) + // .use_alpha(true), + // ) + // .unwrap(); + // }) + // }); + // + // c.bench_function("Pic scale RGBA without alpha: Lanczos 3", |b| { + // let mut copied: Vec = Vec::from(src_bytes); + // b.iter(|| { + // let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); + // scaler.set_threading_policy(ThreadingPolicy::Single); + // let store = ImageStore::::from_slice( + // &mut copied, + // dimensions.0 as usize, + // dimensions.1 as usize, + // ) + // .unwrap(); + // let mut target = + // ImageStoreMut::alloc(dimensions.0 as usize / 4, dimensions.1 as usize / 4); + // _ = scaler.resize_rgba(&store, &mut target, false); + // }) + // }); + // + // c.bench_function("Fast image resize RGBA without alpha: Lanczos 3", |b| { + // let mut vc = Vec::from(img.as_bytes()); + // b.iter(|| { + // let pixel_type: PixelType = PixelType::U8x4; + // let src_image = + // Image::from_slice_u8(dimensions.0, dimensions.1, &mut vc, pixel_type).unwrap(); + // let mut dst_image = Image::new(dimensions.0 / 2, dimensions.1 / 2, pixel_type); + // + // let mut resizer = Resizer::new(); + // #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + // unsafe { + // resizer.set_cpu_extensions(CpuExtensions::Neon); + // } + // #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + // unsafe { + // resizer.set_cpu_extensions(CpuExtensions::Avx2); + // } + // resizer + // .resize( + // &src_image, + // &mut dst_image, + // &ResizeOptions::new() + // .resize_alg(ResizeAlg::Convolution(Lanczos3)) + // .use_alpha(false), + // ) + // .unwrap(); + // }) + // }); - c.bench_function("Pic scale RGBA with alpha f32: Lanczos 3", |b| { - let mut copied: Vec = Vec::from(f32_image.clone()); + c.bench_function("Pic scale RGBA10 with alpha: Lanczos 3", |b| { + let mut copied: Vec = Vec::from( + src_bytes + .iter() + .map(|&x| ((x as u16) << 2) | ((x as u16) >> 6)) + .collect::>(), + ); b.iter(|| { let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); scaler.set_threading_policy(ThreadingPolicy::Single); - let store = ImageStore::::from_slice( + let store = ImageStore::::from_slice( &mut copied, dimensions.0 as usize, dimensions.1 as usize, ) .unwrap(); - let mut target = - ImageStoreMut::alloc(dimensions.0 as usize / 4, dimensions.1 as usize / 4); - _ = scaler.resize_rgba_f32(&store, &mut target, false); + let mut target = ImageStoreMut::alloc_with_depth( + dimensions.0 as usize / 4, + dimensions.1 as usize / 4, + 10, + ); + _ = scaler.resize_rgba_u16(&store, &mut target, true); }) }); - c.bench_function("Fast image resize RGBA with alpha: Lanczos 3", |b| { - let mut vc = Vec::from(img.as_bytes()); - b.iter(|| { - let pixel_type: PixelType = PixelType::U8x4; - let src_image = - Image::from_slice_u8(dimensions.0, dimensions.1, &mut vc, pixel_type).unwrap(); - let mut dst_image = Image::new(dimensions.0 / 2, dimensions.1 / 2, pixel_type); - - let mut resizer = Resizer::new(); - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - unsafe { - resizer.set_cpu_extensions(CpuExtensions::Neon); - } - #[cfg(any(target_arch = 
"x86_64", target_arch = "x86"))] - unsafe { - resizer.set_cpu_extensions(CpuExtensions::Avx2); - } - resizer - .resize( - &src_image, - &mut dst_image, - &ResizeOptions::new() - .resize_alg(ResizeAlg::Convolution(Lanczos3)) - .use_alpha(true), - ) - .unwrap(); - }) - }); - - c.bench_function("Pic scale RGBA without alpha: Lanczos 3", |b| { - let mut copied: Vec = Vec::from(src_bytes); + c.bench_function("Pic scale RGBA10 without alpha: Lanczos 3", |b| { + let mut copied: Vec = Vec::from( + src_bytes + .iter() + .map(|&x| ((x as u16) << 2) | ((x as u16) >> 6)) + .collect::>(), + ); b.iter(|| { let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); scaler.set_threading_policy(ThreadingPolicy::Single); - let store = ImageStore::::from_slice( + let store = ImageStore::::from_slice( &mut copied, dimensions.0 as usize, dimensions.1 as usize, ) .unwrap(); - let mut target = - ImageStoreMut::alloc(dimensions.0 as usize / 4, dimensions.1 as usize / 4); - _ = scaler.resize_rgba(&store, &mut target, false); - }) - }); - - c.bench_function("Fast image resize RGBA without alpha: Lanczos 3", |b| { - let mut vc = Vec::from(img.as_bytes()); - b.iter(|| { - let pixel_type: PixelType = PixelType::U8x4; - let src_image = - Image::from_slice_u8(dimensions.0, dimensions.1, &mut vc, pixel_type).unwrap(); - let mut dst_image = Image::new(dimensions.0 / 2, dimensions.1 / 2, pixel_type); - - let mut resizer = Resizer::new(); - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - unsafe { - resizer.set_cpu_extensions(CpuExtensions::Neon); - } - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - unsafe { - resizer.set_cpu_extensions(CpuExtensions::Avx2); - } - resizer - .resize( - &src_image, - &mut dst_image, - &ResizeOptions::new() - .resize_alg(ResizeAlg::Convolution(Lanczos3)) - .use_alpha(false), - ) - .unwrap(); + let mut target = ImageStoreMut::alloc_with_depth( + dimensions.0 as usize / 4, + dimensions.1 as usize / 4, + 10, + ); + _ = scaler.resize_rgba_u16(&store, &mut target, false); }) }); } diff --git a/app/src/main.rs b/app/src/main.rs index 52a7faf..25367ac 100644 --- a/app/src/main.rs +++ b/app/src/main.rs @@ -3,17 +3,14 @@ mod split; use std::time::Instant; -use crate::merge::merge_channels_3; -use crate::split::split_channels_3; use fast_image_resize::images::Image; use fast_image_resize::{ CpuExtensions, FilterType, IntoImageView, PixelType, ResizeAlg, ResizeOptions, Resizer, }; use image::{EncodableLayout, GenericImageView, ImageReader}; use pic_scale::{ - Ar30ByteOrder, ImageSize, ImageStore, ImageStoreMut, JzazbzScaler, LChScaler, LabScaler, - LinearApproxScaler, LinearScaler, LuvScaler, OklabScaler, ResamplingFunction, Scaler, Scaling, - ScalingU16, SigmoidalScaler, ThreadingPolicy, TransferFunction, XYZScaler, + ImageSize, ImageStore, ImageStoreMut, ResamplingFunction, Scaler, Scaling, ScalingU16, + ThreadingPolicy, }; fn resize_plane( @@ -53,15 +50,16 @@ fn main() { let transient = img.to_rgba8(); let mut bytes = Vec::from(transient.as_bytes()); - let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); + let mut scaler = Scaler::new(ResamplingFunction::Bilinear); scaler.set_threading_policy(ThreadingPolicy::Single); // resize_plane(378, 257, 257, 257, ResamplingFunction::Bilinear); let mut choke: Vec = bytes.iter().map(|&x| (x as u16) << 2).collect(); + // let store = - ImageStore::::from_slice(&mut choke, dimensions.0 as usize, dimensions.1 as usize) + ImageStore::::from_slice(&choke, dimensions.0 as usize, dimensions.1 as usize) .unwrap(); let dst_size = 
ImageSize::new(dimensions.0 as usize / 4, dimensions.1 as usize / 4); @@ -78,13 +76,13 @@ fn main() { // .unwrap(); let mut dst_store = ImageStoreMut::::alloc_with_depth( - dimensions.0 as usize / 3, - dimensions.1 as usize / 3, + dimensions.0 as usize, + dimensions.1 as usize / 2, 10, ); scaler - .resize_rgba_u16(&store, &mut dst_store, false) + .resize_rgba_u16(&store, &mut dst_store, true) .unwrap(); let elapsed_time = start_time.elapsed(); @@ -167,7 +165,7 @@ fn main() { .iter() .map(|&x| (x >> 2) as u8) .collect(); - // + // let dst = dst_store.as_bytes(); // let dst = resized; // image::save_buffer( diff --git a/src/alpha_check.rs b/src/alpha_check.rs index 8f253b9..2accea8 100644 --- a/src/alpha_check.rs +++ b/src/alpha_check.rs @@ -30,10 +30,12 @@ use num_traits::AsPrimitive; use std::ops::{AddAssign, BitXor}; +#[allow(dead_code)] pub(crate) fn has_non_constant_cap_alpha_rgba8(store: &[u8], width: usize) -> bool { has_non_constant_cap_alpha::(store, width) } +#[allow(dead_code)] pub(crate) fn has_non_constant_cap_alpha_rgba16(store: &[u16], width: usize) -> bool { has_non_constant_cap_alpha::(store, width) } @@ -61,7 +63,7 @@ where if store.is_empty() { return false; } - let first = store[0]; + let first = store[ALPHA_CHANNEL_INDEX]; let mut row_sums: J = 0u32.as_(); for row in store.chunks_exact(width * CHANNELS) { for color in row.chunks_exact(CHANNELS) { @@ -87,7 +89,7 @@ fn has_non_constant_cap_alpha_f32_impl(pixel.3)); - let high_r = _mm256_mul_ps( - _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.0)), - high_alpha, - ); - let high_g = _mm256_mul_ps( - _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.1)), - high_alpha, - ); - let high_b = _mm256_mul_ps( - _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.2)), - high_alpha, - ); - let r_values = avx_combine_epi( - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_r), - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_r), - ); - let g_values = avx_combine_epi( - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_g), - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_g), - ); - let b_values = avx_combine_epi( - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_b), - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_b), - ); - let dst_ptr = dst.as_mut_ptr(); - let (d_lane0, d_lane1, d_lane2, d_lane3) = - avx_interleave_rgba_epi16(r_values, g_values, b_values, pixel.3); - _mm256_storeu_si256(dst_ptr as *mut __m256i, d_lane0); - _mm256_storeu_si256(dst_ptr.add(16) as *mut __m256i, d_lane1); - _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, d_lane2); - _mm256_storeu_si256(dst_ptr.add(48) as *mut __m256i, d_lane3); - } - - rem = rem.chunks_exact_mut(16 * 4).into_remainder(); - src_rem = src_rem.chunks_exact(16 * 4).remainder(); + let high_alpha = _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.3)); + let high_r = _mm256_mul_ps( + _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.0)), + high_alpha, + ); + let high_g = _mm256_mul_ps( + _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.1)), + high_alpha, + ); + let high_b = _mm256_mul_ps( + _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.2)), + high_alpha, + ); + let r_values = avx_combine_epi( + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_r), + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_r), + ); + let g_values = avx_combine_epi( + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_g), + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_g), + ); + let b_values = avx_combine_epi( + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_b), + 
_mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_b), + ); + let dst_ptr = dst.as_mut_ptr(); + let (d_lane0, d_lane1, d_lane2, d_lane3) = + avx_interleave_rgba_epi16(r_values, g_values, b_values, pixel.3); + _mm256_storeu_si256(dst_ptr as *mut __m256i, d_lane0); + _mm256_storeu_si256(dst_ptr.add(16) as *mut __m256i, d_lane1); + _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, d_lane2); + _mm256_storeu_si256(dst_ptr.add(48) as *mut __m256i, d_lane3); } + rem = rem.chunks_exact_mut(16 * 4).into_remainder(); + src_rem = src_rem.chunks_exact(16 * 4).remainder(); + premultiply_pixel_f16_row(rem, src_rem); } @@ -155,84 +153,82 @@ pub(crate) fn avx_unpremultiply_alpha_rgba_f16( unsafe fn avx_unpremultiply_alpha_rgba_f16_row_impl(in_place: &mut [half::f16]) { let mut rem = in_place; - unsafe { - for dst in rem.chunks_exact_mut(16 * 4) { - let src_ptr = dst.as_ptr(); - let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i); - let lane1 = _mm256_loadu_si256(src_ptr.add(16) as *const __m256i); - let lane2 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); - let lane3 = _mm256_loadu_si256(src_ptr.add(48) as *const __m256i); - let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); - - let low_alpha = _mm256_cvtph_ps(_mm256_castsi256_si128(pixel.3)); - let zeros = _mm256_setzero_ps(); - let low_alpha_zero_mask = _mm256_cmp_ps::<_CMP_EQ_OS>(low_alpha, zeros); - let low_r = _mm256_blendv_ps( - _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.0)), low_alpha), - zeros, - low_alpha_zero_mask, - ); - let low_g = _mm256_blendv_ps( - _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.1)), low_alpha), - zeros, - low_alpha_zero_mask, - ); - let low_b = _mm256_blendv_ps( - _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.2)), low_alpha), - zeros, - low_alpha_zero_mask, - ); + for dst in rem.chunks_exact_mut(16 * 4) { + let src_ptr = dst.as_ptr(); + let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i); + let lane1 = _mm256_loadu_si256(src_ptr.add(16) as *const __m256i); + let lane2 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); + let lane3 = _mm256_loadu_si256(src_ptr.add(48) as *const __m256i); + let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); - let high_alpha = _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.3)); - let high_alpha_zero_mask = _mm256_cmp_ps::<_CMP_EQ_OS>(high_alpha, zeros); - let high_r = _mm256_blendv_ps( - _mm256_mul_ps( - _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.0)), - high_alpha, - ), - zeros, - high_alpha_zero_mask, - ); - let high_g = _mm256_blendv_ps( - _mm256_mul_ps( - _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.1)), - high_alpha, - ), - zeros, - high_alpha_zero_mask, - ); - let high_b = _mm256_blendv_ps( - _mm256_mul_ps( - _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.2)), - high_alpha, - ), - zeros, - high_alpha_zero_mask, - ); - let r_values = avx_combine_epi( - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_r), - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_r), - ); - let g_values = avx_combine_epi( - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_g), - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_g), - ); - let b_values = avx_combine_epi( - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_b), - _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_b), - ); - let dst_ptr = dst.as_mut_ptr(); - let (d_lane0, d_lane1, d_lane2, d_lane3) = - avx_interleave_rgba_epi16(r_values, g_values, b_values, pixel.3); - _mm256_storeu_si256(dst_ptr as *mut __m256i, d_lane0); - 
_mm256_storeu_si256(dst_ptr.add(16) as *mut __m256i, d_lane1); - _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, d_lane2); - _mm256_storeu_si256(dst_ptr.add(48) as *mut __m256i, d_lane3); - } + let low_alpha = _mm256_cvtph_ps(_mm256_castsi256_si128(pixel.3)); + let zeros = _mm256_setzero_ps(); + let low_alpha_zero_mask = _mm256_cmp_ps::<_CMP_EQ_OS>(low_alpha, zeros); + let low_r = _mm256_blendv_ps( + _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.0)), low_alpha), + zeros, + low_alpha_zero_mask, + ); + let low_g = _mm256_blendv_ps( + _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.1)), low_alpha), + zeros, + low_alpha_zero_mask, + ); + let low_b = _mm256_blendv_ps( + _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.2)), low_alpha), + zeros, + low_alpha_zero_mask, + ); - rem = rem.chunks_exact_mut(16 * 4).into_remainder(); + let high_alpha = _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.3)); + let high_alpha_zero_mask = _mm256_cmp_ps::<_CMP_EQ_OS>(high_alpha, zeros); + let high_r = _mm256_blendv_ps( + _mm256_mul_ps( + _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.0)), + high_alpha, + ), + zeros, + high_alpha_zero_mask, + ); + let high_g = _mm256_blendv_ps( + _mm256_mul_ps( + _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.1)), + high_alpha, + ), + zeros, + high_alpha_zero_mask, + ); + let high_b = _mm256_blendv_ps( + _mm256_mul_ps( + _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.2)), + high_alpha, + ), + zeros, + high_alpha_zero_mask, + ); + let r_values = avx_combine_epi( + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_r), + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_r), + ); + let g_values = avx_combine_epi( + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_g), + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_g), + ); + let b_values = avx_combine_epi( + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_b), + _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_b), + ); + let dst_ptr = dst.as_mut_ptr(); + let (d_lane0, d_lane1, d_lane2, d_lane3) = + avx_interleave_rgba_epi16(r_values, g_values, b_values, pixel.3); + _mm256_storeu_si256(dst_ptr as *mut __m256i, d_lane0); + _mm256_storeu_si256(dst_ptr.add(16) as *mut __m256i, d_lane1); + _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, d_lane2); + _mm256_storeu_si256(dst_ptr.add(48) as *mut __m256i, d_lane3); } + rem = rem.chunks_exact_mut(16 * 4).into_remainder(); + unpremultiply_pixel_f16_row(rem); } diff --git a/src/avx2/alpha_f32.rs b/src/avx2/alpha_f32.rs index f136208..f5b4d40 100644 --- a/src/avx2/alpha_f32.rs +++ b/src/avx2/alpha_f32.rs @@ -129,34 +129,32 @@ unsafe fn avx_premultiply_alpha_rgba_f32_row_impl(dst: &mut [f32], src: &[f32]) let mut rem = dst; let mut src_rem = src; - unsafe { - for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { - let src_ptr = src.as_ptr(); - let rgba0 = _mm256_loadu_ps(src_ptr); - let rgba1 = _mm256_loadu_ps(src_ptr.add(8)); - let rgba2 = _mm256_loadu_ps(src_ptr.add(16)); - let rgba3 = _mm256_loadu_ps(src_ptr.add(24)); - let (rrr, ggg, bbb, aaa) = avx_deinterleave_rgba_ps(rgba0, rgba1, rgba2, rgba3); - - let rrr = _mm256_mul_ps(rrr, aaa); - let ggg = _mm256_mul_ps(ggg, aaa); - let bbb = _mm256_mul_ps(bbb, aaa); - - let (rgba0, rgba1, rgba2, rgba3) = avx_interleave_rgba_ps(rrr, ggg, bbb, aaa); - - let dst_ptr = dst.as_mut_ptr(); - _mm256_storeu_ps(dst_ptr, rgba0); - _mm256_storeu_ps(dst_ptr.add(8), rgba1); - _mm256_storeu_ps(dst_ptr.add(16), rgba2); - _mm256_storeu_ps(dst_ptr.add(24), rgba3); - - _cx += 
8; - } - - rem = rem.chunks_exact_mut(8 * 4).into_remainder(); - src_rem = src_rem.chunks_exact(8 * 4).remainder(); + for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { + let src_ptr = src.as_ptr(); + let rgba0 = _mm256_loadu_ps(src_ptr); + let rgba1 = _mm256_loadu_ps(src_ptr.add(8)); + let rgba2 = _mm256_loadu_ps(src_ptr.add(16)); + let rgba3 = _mm256_loadu_ps(src_ptr.add(24)); + let (rrr, ggg, bbb, aaa) = avx_deinterleave_rgba_ps(rgba0, rgba1, rgba2, rgba3); + + let rrr = _mm256_mul_ps(rrr, aaa); + let ggg = _mm256_mul_ps(ggg, aaa); + let bbb = _mm256_mul_ps(bbb, aaa); + + let (rgba0, rgba1, rgba2, rgba3) = avx_interleave_rgba_ps(rrr, ggg, bbb, aaa); + + let dst_ptr = dst.as_mut_ptr(); + _mm256_storeu_ps(dst_ptr, rgba0); + _mm256_storeu_ps(dst_ptr.add(8), rgba1); + _mm256_storeu_ps(dst_ptr.add(16), rgba2); + _mm256_storeu_ps(dst_ptr.add(24), rgba3); + + _cx += 8; } + rem = rem.chunks_exact_mut(8 * 4).into_remainder(); + src_rem = src_rem.chunks_exact(8 * 4).remainder(); + premultiply_pixel_f32_row(rem, src_rem); } diff --git a/src/avx2/alpha_u16.rs b/src/avx2/alpha_u16.rs index 83f6550..a77b32d 100644 --- a/src/avx2/alpha_u16.rs +++ b/src/avx2/alpha_u16.rs @@ -75,6 +75,17 @@ pub(crate) unsafe fn _mm256_div_by_65535_epi32(v: __m256i) -> __m256i { _mm256_srli_epi32::(_mm256_add_epi32(v, _mm256_srli_epi32::(v))) } +#[inline(always)] +unsafe fn _mm256_div_by_epi32(v: __m256i) -> __m256i { + if BIT_DEPTH == 10 { + _mm256_div_by_1023_epi32(v) + } else if BIT_DEPTH == 12 { + _mm256_div_by_4095_epi32(v) + } else { + _mm256_div_by_65535_epi32(v) + } +} + pub(crate) fn avx_premultiply_alpha_rgba_u16( dst: &mut [u16], src: &[u16], @@ -88,234 +99,187 @@ pub(crate) fn avx_premultiply_alpha_rgba_u16( } } -#[target_feature(enable = "avx2")] -/// This inlining is required to activate all features for runtime dispatch -unsafe fn avx_premultiply_alpha_rgba_u16_row(dst: &mut [u16], src: &[u16], bit_depth: usize) { - let max_colors = (1 << bit_depth) - 1; +trait Avx2PremultiplyExecutor { + unsafe fn premultiply(&self, dst: &mut [u16], src: &[u16], bit_depth: usize); +} - let mut rem = dst; - let mut src_rem = src; +#[derive(Default)] +struct Avx2PremultiplyExecutorDefault {} + +impl Avx2PremultiplyExecutor for Avx2PremultiplyExecutorDefault { + #[target_feature(enable = "avx2")] + unsafe fn premultiply(&self, dst: &mut [u16], src: &[u16], bit_depth: usize) { + let max_colors = (1 << bit_depth) - 1; + + let mut rem = dst; + let mut src_rem = src; + + for (dst, src) in rem + .chunks_exact_mut(16 * 4) + .zip(src_rem.chunks_exact(16 * 4)) + { + let src_ptr = src.as_ptr(); + let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i); + let lane1 = _mm256_loadu_si256(src_ptr.add(16) as *const __m256i); + let lane2 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); + let lane3 = _mm256_loadu_si256(src_ptr.add(48) as *const __m256i); + + let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); + + let zeros = _mm256_setzero_si256(); + let low_alpha = _mm256_unpacklo_epi16(pixel.3, zeros); + let high_alpha = _mm256_unpackhi_epi16(pixel.3, zeros); + + let new_rrr = _mm256_packus_epi32( + _mm256_div_by_epi32::(_mm256_madd_epi16( + _mm256_unpacklo_epi16(pixel.0, zeros), + low_alpha, + )), + _mm256_div_by_epi32::(_mm256_madd_epi16( + _mm256_unpackhi_epi16(pixel.0, zeros), + high_alpha, + )), + ); + let new_ggg = _mm256_packus_epi32( + _mm256_div_by_epi32::(_mm256_madd_epi16( + _mm256_unpacklo_epi16(pixel.1, zeros), + low_alpha, + )), + 
_mm256_div_by_epi32::(_mm256_madd_epi16( + _mm256_unpackhi_epi16(pixel.1, zeros), + high_alpha, + )), + ); + let new_bbb = _mm256_packus_epi32( + _mm256_div_by_epi32::(_mm256_madd_epi16( + _mm256_unpacklo_epi16(pixel.2, zeros), + low_alpha, + )), + _mm256_div_by_epi32::(_mm256_madd_epi16( + _mm256_unpackhi_epi16(pixel.2, zeros), + high_alpha, + )), + ); + + let dst_ptr = dst.as_mut_ptr(); + + let (d_lane0, d_lane1, d_lane2, d_lane3) = + avx_interleave_rgba_epi16(new_rrr, new_ggg, new_bbb, pixel.3); + + _mm256_storeu_si256(dst_ptr as *mut __m256i, d_lane0); + _mm256_storeu_si256(dst_ptr.add(16) as *mut __m256i, d_lane1); + _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, d_lane2); + _mm256_storeu_si256(dst_ptr.add(48) as *mut __m256i, d_lane3); + } - unsafe { - if bit_depth == 10 { - for (dst, src) in rem - .chunks_exact_mut(16 * 4) - .zip(src_rem.chunks_exact(16 * 4)) - { - let src_ptr = src.as_ptr(); - let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i); - let lane1 = _mm256_loadu_si256(src_ptr.add(16) as *const __m256i); - let lane2 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); - let lane3 = _mm256_loadu_si256(src_ptr.add(48) as *const __m256i); - - let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); - - let zeros = _mm256_setzero_si256(); - let low_alpha = _mm256_unpacklo_epi16(pixel.3, zeros); - let high_alpha = _mm256_unpackhi_epi16(pixel.3, zeros); - - let new_rrr = _mm256_packus_epi32( - _mm256_div_by_1023_epi32(_mm256_madd_epi16( - _mm256_unpacklo_epi16(pixel.0, zeros), - low_alpha, - )), - _mm256_div_by_1023_epi32(_mm256_madd_epi16( - _mm256_unpackhi_epi16(pixel.0, zeros), - high_alpha, - )), - ); - let new_ggg = _mm256_packus_epi32( - _mm256_div_by_1023_epi32(_mm256_madd_epi16( - _mm256_unpacklo_epi16(pixel.1, zeros), - low_alpha, - )), - _mm256_div_by_1023_epi32(_mm256_madd_epi16( - _mm256_unpackhi_epi16(pixel.1, zeros), - high_alpha, - )), - ); - let new_bbb = _mm256_packus_epi32( - _mm256_div_by_1023_epi32(_mm256_madd_epi16( - _mm256_unpacklo_epi16(pixel.2, zeros), - low_alpha, - )), - _mm256_div_by_1023_epi32(_mm256_madd_epi16( - _mm256_unpackhi_epi16(pixel.2, zeros), - high_alpha, - )), - ); - - let dst_ptr = dst.as_mut_ptr(); - - let (d_lane0, d_lane1, d_lane2, d_lane3) = - avx_interleave_rgba_epi16(new_rrr, new_ggg, new_bbb, pixel.3); - - _mm256_storeu_si256(dst_ptr as *mut __m256i, d_lane0); - _mm256_storeu_si256(dst_ptr.add(16) as *mut __m256i, d_lane1); - _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, d_lane2); - _mm256_storeu_si256(dst_ptr.add(48) as *mut __m256i, d_lane3); - } - } else if bit_depth == 12 { - for (dst, src) in rem - .chunks_exact_mut(16 * 4) - .zip(src_rem.chunks_exact(16 * 4)) - { - let src_ptr = src.as_ptr(); - let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i); - let lane1 = _mm256_loadu_si256(src_ptr.add(16) as *const __m256i); - let lane2 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); - let lane3 = _mm256_loadu_si256(src_ptr.add(48) as *const __m256i); - - let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); - - let zeros = _mm256_setzero_si256(); - let low_alpha = _mm256_unpacklo_epi16(pixel.3, zeros); - let high_alpha = _mm256_unpackhi_epi16(pixel.3, zeros); - - let new_rrr = _mm256_packus_epi32( - _mm256_div_by_4095_epi32(_mm256_madd_epi16( - _mm256_unpacklo_epi16(pixel.0, zeros), - low_alpha, - )), - _mm256_div_by_4095_epi32(_mm256_madd_epi16( - _mm256_unpackhi_epi16(pixel.0, zeros), - high_alpha, - )), - ); - let new_ggg = _mm256_packus_epi32( - 
_mm256_div_by_4095_epi32(_mm256_madd_epi16( - _mm256_unpacklo_epi16(pixel.1, zeros), - low_alpha, - )), - _mm256_div_by_4095_epi32(_mm256_madd_epi16( - _mm256_unpackhi_epi16(pixel.1, zeros), - high_alpha, - )), - ); - let new_bbb = _mm256_packus_epi32( - _mm256_div_by_4095_epi32(_mm256_madd_epi16( - _mm256_unpacklo_epi16(pixel.2, zeros), - low_alpha, - )), - _mm256_div_by_4095_epi32(_mm256_madd_epi16( - _mm256_unpackhi_epi16(pixel.2, zeros), - high_alpha, - )), - ); - - let dst_ptr = dst.as_mut_ptr(); - - let (d_lane0, d_lane1, d_lane2, d_lane3) = - avx_interleave_rgba_epi16(new_rrr, new_ggg, new_bbb, pixel.3); - - _mm256_storeu_si256(dst_ptr as *mut __m256i, d_lane0); - _mm256_storeu_si256(dst_ptr.add(16) as *mut __m256i, d_lane1); - _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, d_lane2); - _mm256_storeu_si256(dst_ptr.add(48) as *mut __m256i, d_lane3); - } - } else if bit_depth == 16 { - for (dst, src) in rem - .chunks_exact_mut(16 * 4) - .zip(src_rem.chunks_exact(16 * 4)) - { - let src_ptr = src.as_ptr(); - let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i); - let lane1 = _mm256_loadu_si256(src_ptr.add(16) as *const __m256i); - let lane2 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); - let lane3 = _mm256_loadu_si256(src_ptr.add(48) as *const __m256i); - - let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); - - let zeros = _mm256_setzero_si256(); - let low_alpha = _mm256_unpacklo_epi16(pixel.3, zeros); - let high_alpha = _mm256_unpackhi_epi16(pixel.3, zeros); - - let new_rrr = _mm256_packus_epi32( - _mm256_div_by_65535_epi32(_mm256_mullo_epi32( - _mm256_unpacklo_epi16(pixel.0, zeros), - low_alpha, - )), - _mm256_div_by_65535_epi32(_mm256_mullo_epi32( - _mm256_unpackhi_epi16(pixel.0, zeros), - high_alpha, - )), - ); - let new_ggg = _mm256_packus_epi32( - _mm256_div_by_65535_epi32(_mm256_mullo_epi32( - _mm256_unpacklo_epi16(pixel.1, zeros), - low_alpha, - )), - _mm256_div_by_65535_epi32(_mm256_mullo_epi32( - _mm256_unpackhi_epi16(pixel.1, zeros), - high_alpha, - )), - ); - let new_bbb = _mm256_packus_epi32( - _mm256_div_by_65535_epi32(_mm256_mullo_epi32( - _mm256_unpacklo_epi16(pixel.2, zeros), - low_alpha, - )), - _mm256_div_by_65535_epi32(_mm256_mullo_epi32( - _mm256_unpackhi_epi16(pixel.2, zeros), - high_alpha, - )), - ); - - let dst_ptr = dst.as_mut_ptr(); - - let (d_lane0, d_lane1, d_lane2, d_lane3) = - avx_interleave_rgba_epi16(new_rrr, new_ggg, new_bbb, pixel.3); - - _mm256_storeu_si256(dst_ptr as *mut __m256i, d_lane0); - _mm256_storeu_si256(dst_ptr.add(16) as *mut __m256i, d_lane1); - _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, d_lane2); - _mm256_storeu_si256(dst_ptr.add(48) as *mut __m256i, d_lane3); - } - } else { - let v_scale_colors = _mm256_set1_ps((1. 
/ max_colors as f64) as f32); - for (dst, src) in rem - .chunks_exact_mut(16 * 4) - .zip(src_rem.chunks_exact(16 * 4)) - { - let src_ptr = src.as_ptr(); - let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i); - let lane1 = _mm256_loadu_si256(src_ptr.add(16) as *const __m256i); - let lane2 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); - let lane3 = _mm256_loadu_si256(src_ptr.add(48) as *const __m256i); - - let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); - - let zeros = _mm256_setzero_si256(); - - let low_alpha = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_unpacklo_epi16(pixel.3, zeros)), - v_scale_colors, - ); - let high_alpha = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(pixel.3, zeros)), - v_scale_colors, - ); - - let new_rrr = _mm256_scale_by_alpha(pixel.0, low_alpha, high_alpha); - let new_ggg = _mm256_scale_by_alpha(pixel.1, low_alpha, high_alpha); - let new_bbb = _mm256_scale_by_alpha(pixel.2, low_alpha, high_alpha); - - let dst_ptr = dst.as_mut_ptr(); - - let (d_lane0, d_lane1, d_lane2, d_lane3) = - avx_interleave_rgba_epi16(new_rrr, new_ggg, new_bbb, pixel.3); - - _mm256_storeu_si256(dst_ptr as *mut __m256i, d_lane0); - _mm256_storeu_si256(dst_ptr.add(16) as *mut __m256i, d_lane1); - _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, d_lane2); - _mm256_storeu_si256(dst_ptr.add(48) as *mut __m256i, d_lane3); - } + rem = rem.chunks_exact_mut(16 * 4).into_remainder(); + src_rem = src_rem.chunks_exact(16 * 4).remainder(); + + premultiply_alpha_rgba_row(rem, src_rem, max_colors); + } +} + +#[derive(Default)] +struct Avx2PremultiplyExecutorAnyBit {} + +impl Avx2PremultiplyExecutor for Avx2PremultiplyExecutorAnyBit { + #[target_feature(enable = "avx2")] + unsafe fn premultiply(&self, dst: &mut [u16], src: &[u16], bit_depth: usize) { + let max_colors = (1 << bit_depth) - 1; + + let mut rem = dst; + let mut src_rem = src; + + let v_scale_colors = _mm256_set1_ps((1. 
/ max_colors as f64) as f32); + for (dst, src) in rem + .chunks_exact_mut(16 * 4) + .zip(src_rem.chunks_exact(16 * 4)) + { + let src_ptr = src.as_ptr(); + let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i); + let lane1 = _mm256_loadu_si256(src_ptr.add(16) as *const __m256i); + let lane2 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); + let lane3 = _mm256_loadu_si256(src_ptr.add(48) as *const __m256i); + + let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); + + let zeros = _mm256_setzero_si256(); + + let low_alpha = _mm256_mul_ps( + _mm256_cvtepi32_ps(_mm256_unpacklo_epi16(pixel.3, zeros)), + v_scale_colors, + ); + let high_alpha = _mm256_mul_ps( + _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(pixel.3, zeros)), + v_scale_colors, + ); + + let new_rrr = _mm256_scale_by_alpha(pixel.0, low_alpha, high_alpha); + let new_ggg = _mm256_scale_by_alpha(pixel.1, low_alpha, high_alpha); + let new_bbb = _mm256_scale_by_alpha(pixel.2, low_alpha, high_alpha); + + let dst_ptr = dst.as_mut_ptr(); + + let (d_lane0, d_lane1, d_lane2, d_lane3) = + avx_interleave_rgba_epi16(new_rrr, new_ggg, new_bbb, pixel.3); + + _mm256_storeu_si256(dst_ptr as *mut __m256i, d_lane0); + _mm256_storeu_si256(dst_ptr.add(16) as *mut __m256i, d_lane1); + _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, d_lane2); + _mm256_storeu_si256(dst_ptr.add(48) as *mut __m256i, d_lane3); } + rem = rem.chunks_exact_mut(16 * 4).into_remainder(); src_rem = src_rem.chunks_exact(16 * 4).remainder(); + + premultiply_alpha_rgba_row(rem, src_rem, max_colors); } +} + +#[target_feature(enable = "avx2")] +/// This inlining is required to activate all features for runtime dispatch +unsafe fn avx_premultiply_alpha_rgba_u16_row(dst: &mut [u16], src: &[u16], bit_depth: usize) { + if bit_depth == 10 { + avx_pa_dispatch( + dst, + src, + bit_depth, + Avx2PremultiplyExecutorDefault::<10>::default(), + ); + } else if bit_depth == 12 { + avx_pa_dispatch( + dst, + src, + bit_depth, + Avx2PremultiplyExecutorDefault::<12>::default(), + ); + } else if bit_depth == 16 { + avx_pa_dispatch( + dst, + src, + bit_depth, + Avx2PremultiplyExecutorDefault::<16>::default(), + ); + } else { + avx_pa_dispatch( + dst, + src, + bit_depth, + Avx2PremultiplyExecutorAnyBit::default(), + ); + }; +} - premultiply_alpha_rgba_row(rem, src_rem, max_colors); +#[target_feature(enable = "avx2")] +/// This inlining is required to activate all features for runtime dispatch +#[inline] +unsafe fn avx_pa_dispatch( + dst: &mut [u16], + src: &[u16], + bit_depth: usize, + dispatch: impl Avx2PremultiplyExecutor, +) { + dispatch.premultiply(dst, src, bit_depth); } #[target_feature(enable = "avx2")] diff --git a/src/avx2/alpha_u8.rs b/src/avx2/alpha_u8.rs index 35c8f94..4ecff54 100644 --- a/src/avx2/alpha_u8.rs +++ b/src/avx2/alpha_u8.rs @@ -110,102 +110,100 @@ unsafe fn avx_premultiply_alpha_rgba_impl_row(dst: &mut [u8], src: &[u8]) { let mut rem = dst; let mut src_rem = src; - unsafe { - for (dst, src) in rem - .chunks_exact_mut(32 * 4) - .zip(src_rem.chunks_exact(32 * 4)) - { - let src_ptr = src.as_ptr(); - let rgba0 = _mm256_loadu_si256(src_ptr as *const __m256i); - let rgba1 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); - let rgba2 = _mm256_loadu_si256(src_ptr.add(64) as *const __m256i); - let rgba3 = _mm256_loadu_si256(src_ptr.add(96) as *const __m256i); - let (rrr, ggg, bbb, aaa) = avx2_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); - - let zeros = _mm256_setzero_si256(); - - let mut rrr_low = _mm256_unpacklo_epi8(rrr, zeros); - let mut rrr_high = 
_mm256_unpackhi_epi8(rrr, zeros); - - let mut ggg_low = _mm256_unpacklo_epi8(ggg, zeros); - let mut ggg_high = _mm256_unpackhi_epi8(ggg, zeros); - - let mut bbb_low = _mm256_unpacklo_epi8(bbb, zeros); - let mut bbb_high = _mm256_unpackhi_epi8(bbb, zeros); - - let aaa_low = _mm256_unpacklo_epi8(aaa, zeros); - let aaa_high = _mm256_unpackhi_epi8(aaa, zeros); - - rrr_low = avx2_div_by255(_mm256_mullo_epi16(rrr_low, aaa_low)); - rrr_high = avx2_div_by255(_mm256_mullo_epi16(rrr_high, aaa_high)); - ggg_low = avx2_div_by255(_mm256_mullo_epi16(ggg_low, aaa_low)); - ggg_high = avx2_div_by255(_mm256_mullo_epi16(ggg_high, aaa_high)); - bbb_low = avx2_div_by255(_mm256_mullo_epi16(bbb_low, aaa_low)); - bbb_high = avx2_div_by255(_mm256_mullo_epi16(bbb_high, aaa_high)); - - let rrr = _mm256_packus_epi16(rrr_low, rrr_high); - let ggg = _mm256_packus_epi16(ggg_low, ggg_high); - let bbb = _mm256_packus_epi16(bbb_low, bbb_high); - - let (rgba0, rgba1, rgba2, rgba3) = avx2_interleave_rgba(rrr, ggg, bbb, aaa); - let dst_ptr = dst.as_mut_ptr(); - _mm256_storeu_si256(dst_ptr as *mut __m256i, rgba0); - _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, rgba1); - _mm256_storeu_si256(dst_ptr.add(64) as *mut __m256i, rgba2); - _mm256_storeu_si256(dst_ptr.add(96) as *mut __m256i, rgba3); - } - - rem = rem.chunks_exact_mut(32 * 4).into_remainder(); - src_rem = src_rem.chunks_exact(32 * 4).remainder(); - - let zeros = _mm_setzero_si128(); - for (dst, src) in rem - .chunks_exact_mut(16 * 4) - .zip(src_rem.chunks_exact(16 * 4)) - { - let src_ptr = src.as_ptr(); - let rgba0 = _mm_loadu_si128(src_ptr as *const __m128i); - let rgba1 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); - let rgba2 = _mm_loadu_si128(src_ptr.add(32) as *const __m128i); - let rgba3 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i); - let (rrr, ggg, bbb, aaa) = sse_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); - - let mut rrr_low = _mm_unpacklo_epi8(rrr, zeros); - let mut rrr_high = _mm_unpackhi_epi8(rrr, zeros); - - let mut ggg_low = _mm_unpacklo_epi8(ggg, zeros); - let mut ggg_high = _mm_unpackhi_epi8(ggg, zeros); - - let mut bbb_low = _mm_unpacklo_epi8(bbb, zeros); - let mut bbb_high = _mm_unpackhi_epi8(bbb, zeros); - - let aaa_low = _mm_unpacklo_epi8(aaa, zeros); - let aaa_high = _mm_unpackhi_epi8(aaa, zeros); - - rrr_low = _mm_div_by_255_epi16(_mm_mullo_epi16(rrr_low, aaa_low)); - rrr_high = _mm_div_by_255_epi16(_mm_mullo_epi16(rrr_high, aaa_high)); - ggg_low = _mm_div_by_255_epi16(_mm_mullo_epi16(ggg_low, aaa_low)); - ggg_high = _mm_div_by_255_epi16(_mm_mullo_epi16(ggg_high, aaa_high)); - bbb_low = _mm_div_by_255_epi16(_mm_mullo_epi16(bbb_low, aaa_low)); - bbb_high = _mm_div_by_255_epi16(_mm_mullo_epi16(bbb_high, aaa_high)); - - let rrr = _mm_packus_epi16(rrr_low, rrr_high); - let ggg = _mm_packus_epi16(ggg_low, ggg_high); - let bbb = _mm_packus_epi16(bbb_low, bbb_high); - - let (rgba0, rgba1, rgba2, rgba3) = sse_interleave_rgba(rrr, ggg, bbb, aaa); - - let dst_ptr = dst.as_mut_ptr(); - _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba1); - _mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, rgba2); - _mm_storeu_si128(dst_ptr.add(48) as *mut __m128i, rgba3); - } - - rem = rem.chunks_exact_mut(16 * 4).into_remainder(); - src_rem = src_rem.chunks_exact(16 * 4).remainder(); + for (dst, src) in rem + .chunks_exact_mut(32 * 4) + .zip(src_rem.chunks_exact(32 * 4)) + { + let src_ptr = src.as_ptr(); + let rgba0 = _mm256_loadu_si256(src_ptr as *const __m256i); + let rgba1 = 
_mm256_loadu_si256(src_ptr.add(32) as *const __m256i); + let rgba2 = _mm256_loadu_si256(src_ptr.add(64) as *const __m256i); + let rgba3 = _mm256_loadu_si256(src_ptr.add(96) as *const __m256i); + let (rrr, ggg, bbb, aaa) = avx2_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); + + let zeros = _mm256_setzero_si256(); + + let mut rrr_low = _mm256_unpacklo_epi8(rrr, zeros); + let mut rrr_high = _mm256_unpackhi_epi8(rrr, zeros); + + let mut ggg_low = _mm256_unpacklo_epi8(ggg, zeros); + let mut ggg_high = _mm256_unpackhi_epi8(ggg, zeros); + + let mut bbb_low = _mm256_unpacklo_epi8(bbb, zeros); + let mut bbb_high = _mm256_unpackhi_epi8(bbb, zeros); + + let aaa_low = _mm256_unpacklo_epi8(aaa, zeros); + let aaa_high = _mm256_unpackhi_epi8(aaa, zeros); + + rrr_low = avx2_div_by255(_mm256_mullo_epi16(rrr_low, aaa_low)); + rrr_high = avx2_div_by255(_mm256_mullo_epi16(rrr_high, aaa_high)); + ggg_low = avx2_div_by255(_mm256_mullo_epi16(ggg_low, aaa_low)); + ggg_high = avx2_div_by255(_mm256_mullo_epi16(ggg_high, aaa_high)); + bbb_low = avx2_div_by255(_mm256_mullo_epi16(bbb_low, aaa_low)); + bbb_high = avx2_div_by255(_mm256_mullo_epi16(bbb_high, aaa_high)); + + let rrr = _mm256_packus_epi16(rrr_low, rrr_high); + let ggg = _mm256_packus_epi16(ggg_low, ggg_high); + let bbb = _mm256_packus_epi16(bbb_low, bbb_high); + + let (rgba0, rgba1, rgba2, rgba3) = avx2_interleave_rgba(rrr, ggg, bbb, aaa); + let dst_ptr = dst.as_mut_ptr(); + _mm256_storeu_si256(dst_ptr as *mut __m256i, rgba0); + _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, rgba1); + _mm256_storeu_si256(dst_ptr.add(64) as *mut __m256i, rgba2); + _mm256_storeu_si256(dst_ptr.add(96) as *mut __m256i, rgba3); + } + + rem = rem.chunks_exact_mut(32 * 4).into_remainder(); + src_rem = src_rem.chunks_exact(32 * 4).remainder(); + + let zeros = _mm_setzero_si128(); + for (dst, src) in rem + .chunks_exact_mut(16 * 4) + .zip(src_rem.chunks_exact(16 * 4)) + { + let src_ptr = src.as_ptr(); + let rgba0 = _mm_loadu_si128(src_ptr as *const __m128i); + let rgba1 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); + let rgba2 = _mm_loadu_si128(src_ptr.add(32) as *const __m128i); + let rgba3 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i); + let (rrr, ggg, bbb, aaa) = sse_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); + + let mut rrr_low = _mm_unpacklo_epi8(rrr, zeros); + let mut rrr_high = _mm_unpackhi_epi8(rrr, zeros); + + let mut ggg_low = _mm_unpacklo_epi8(ggg, zeros); + let mut ggg_high = _mm_unpackhi_epi8(ggg, zeros); + + let mut bbb_low = _mm_unpacklo_epi8(bbb, zeros); + let mut bbb_high = _mm_unpackhi_epi8(bbb, zeros); + + let aaa_low = _mm_unpacklo_epi8(aaa, zeros); + let aaa_high = _mm_unpackhi_epi8(aaa, zeros); + + rrr_low = _mm_div_by_255_epi16(_mm_mullo_epi16(rrr_low, aaa_low)); + rrr_high = _mm_div_by_255_epi16(_mm_mullo_epi16(rrr_high, aaa_high)); + ggg_low = _mm_div_by_255_epi16(_mm_mullo_epi16(ggg_low, aaa_low)); + ggg_high = _mm_div_by_255_epi16(_mm_mullo_epi16(ggg_high, aaa_high)); + bbb_low = _mm_div_by_255_epi16(_mm_mullo_epi16(bbb_low, aaa_low)); + bbb_high = _mm_div_by_255_epi16(_mm_mullo_epi16(bbb_high, aaa_high)); + + let rrr = _mm_packus_epi16(rrr_low, rrr_high); + let ggg = _mm_packus_epi16(ggg_low, ggg_high); + let bbb = _mm_packus_epi16(bbb_low, bbb_high); + + let (rgba0, rgba1, rgba2, rgba3) = sse_interleave_rgba(rrr, ggg, bbb, aaa); + + let dst_ptr = dst.as_mut_ptr(); + _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); + _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba1); + _mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, 
rgba2); + _mm_storeu_si128(dst_ptr.add(48) as *mut __m128i, rgba3); } + rem = rem.chunks_exact_mut(16 * 4).into_remainder(); + src_rem = src_rem.chunks_exact(16 * 4).remainder(); + premultiply_alpha_rgba_row_impl(rem, src_rem); } @@ -249,54 +247,52 @@ pub(crate) fn avx_unpremultiply_alpha_rgba( unsafe fn avx_unpremultiply_alpha_rgba_impl_row(in_place: &mut [u8]) { let mut rem = in_place; - unsafe { - for dst in rem.chunks_exact_mut(32 * 4) { - let src_ptr = dst.as_ptr(); - let rgba0 = _mm256_loadu_si256(src_ptr as *const __m256i); - let rgba1 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); - let rgba2 = _mm256_loadu_si256(src_ptr.add(64) as *const __m256i); - let rgba3 = _mm256_loadu_si256(src_ptr.add(96) as *const __m256i); - let (rrr, ggg, bbb, aaa) = avx2_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); - - let rrr = avx2_unpremultiply_row(rrr, aaa); - let ggg = avx2_unpremultiply_row(ggg, aaa); - let bbb = avx2_unpremultiply_row(bbb, aaa); - - let (rgba0, rgba1, rgba2, rgba3) = avx2_interleave_rgba(rrr, ggg, bbb, aaa); - - let dst_ptr = dst.as_mut_ptr(); - _mm256_storeu_si256(dst_ptr as *mut __m256i, rgba0); - _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, rgba1); - _mm256_storeu_si256(dst_ptr.add(64) as *mut __m256i, rgba2); - _mm256_storeu_si256(dst_ptr.add(96) as *mut __m256i, rgba3); - } - - rem = rem.chunks_exact_mut(32 * 4).into_remainder(); - - for dst in rem.chunks_exact_mut(16 * 4) { - let src_ptr = dst.as_ptr(); - let rgba0 = _mm_loadu_si128(src_ptr as *const __m128i); - let rgba1 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); - let rgba2 = _mm_loadu_si128(src_ptr.add(32) as *const __m128i); - let rgba3 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i); - let (rrr, ggg, bbb, aaa) = sse_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); - - let rrr = sse_unpremultiply_row(rrr, aaa); - let ggg = sse_unpremultiply_row(ggg, aaa); - let bbb = sse_unpremultiply_row(bbb, aaa); - - let (rgba0, rgba1, rgba2, rgba3) = sse_interleave_rgba(rrr, ggg, bbb, aaa); - - let dst_ptr = dst.as_mut_ptr(); - _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba1); - _mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, rgba2); - _mm_storeu_si128(dst_ptr.add(48) as *mut __m128i, rgba3); - } - - rem = rem.chunks_exact_mut(16 * 4).into_remainder(); + for dst in rem.chunks_exact_mut(32 * 4) { + let src_ptr = dst.as_ptr(); + let rgba0 = _mm256_loadu_si256(src_ptr as *const __m256i); + let rgba1 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); + let rgba2 = _mm256_loadu_si256(src_ptr.add(64) as *const __m256i); + let rgba3 = _mm256_loadu_si256(src_ptr.add(96) as *const __m256i); + let (rrr, ggg, bbb, aaa) = avx2_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); + + let rrr = avx2_unpremultiply_row(rrr, aaa); + let ggg = avx2_unpremultiply_row(ggg, aaa); + let bbb = avx2_unpremultiply_row(bbb, aaa); + + let (rgba0, rgba1, rgba2, rgba3) = avx2_interleave_rgba(rrr, ggg, bbb, aaa); + + let dst_ptr = dst.as_mut_ptr(); + _mm256_storeu_si256(dst_ptr as *mut __m256i, rgba0); + _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, rgba1); + _mm256_storeu_si256(dst_ptr.add(64) as *mut __m256i, rgba2); + _mm256_storeu_si256(dst_ptr.add(96) as *mut __m256i, rgba3); + } + + rem = rem.chunks_exact_mut(32 * 4).into_remainder(); + + for dst in rem.chunks_exact_mut(16 * 4) { + let src_ptr = dst.as_ptr(); + let rgba0 = _mm_loadu_si128(src_ptr as *const __m128i); + let rgba1 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); + let rgba2 = 
_mm_loadu_si128(src_ptr.add(32) as *const __m128i); + let rgba3 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i); + let (rrr, ggg, bbb, aaa) = sse_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); + + let rrr = sse_unpremultiply_row(rrr, aaa); + let ggg = sse_unpremultiply_row(ggg, aaa); + let bbb = sse_unpremultiply_row(bbb, aaa); + + let (rgba0, rgba1, rgba2, rgba3) = sse_interleave_rgba(rrr, ggg, bbb, aaa); + + let dst_ptr = dst.as_mut_ptr(); + _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); + _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba1); + _mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, rgba2); + _mm_storeu_si128(dst_ptr.add(48) as *mut __m128i, rgba3); } + rem = rem.chunks_exact_mut(16 * 4).into_remainder(); + unpremultiply_alpha_rgba_row_impl(rem); } diff --git a/src/avx2/check_alpha.rs b/src/avx2/check_alpha.rs new file mode 100644 index 0000000..fe6ac32 --- /dev/null +++ b/src/avx2/check_alpha.rs @@ -0,0 +1,247 @@ +/* + * Copyright (c) Radzivon Bartoshyk 12/2024. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +use crate::sse::_mm_hsum_epi32; +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +/// Checks if image has constant alpha by xor rows +pub(crate) fn avx_has_non_constant_cap_alpha_rgba8( + store: &[u8], + width: usize, + stride: usize, +) -> bool { + unsafe { avx_has_non_constant_cap_alpha_rgba8_impl(store, width, stride) } +} + +#[target_feature(enable = "avx2")] +unsafe fn avx_has_non_constant_cap_alpha_rgba8_impl( + store: &[u8], + width: usize, + stride: usize, +) -> bool { + if store.is_empty() { + return true; + } + + let ash0 = _mm256_setr_epi8( + 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1, 3, -1, -1, -1, 7, -1, -1, -1, + 11, -1, -1, -1, 15, -1, -1, -1, + ); + + let sh0 = _mm_setr_epi8(3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1); + + let first_alpha = store[3]; + let def_alpha = _mm256_set1_epi32(first_alpha as i32); + + for row in store.chunks_exact(stride) { + let row = &row[0..width * 4]; + let mut sums = _mm256_set1_epi32(0); + + for chunk in row.chunks_exact(32 * 4) { + let mut r0 = _mm256_loadu_si256(chunk.as_ptr() as *const __m256i); + let mut r1 = _mm256_loadu_si256(chunk.get_unchecked(32..).as_ptr() as *const __m256i); + let mut r2 = _mm256_loadu_si256(chunk.get_unchecked(64..).as_ptr() as *const __m256i); + let mut r3 = _mm256_loadu_si256(chunk.get_unchecked(96..).as_ptr() as *const __m256i); + + r0 = _mm256_xor_si256(_mm256_shuffle_epi8(r0, ash0), def_alpha); + r1 = _mm256_xor_si256(_mm256_shuffle_epi8(r1, ash0), def_alpha); + r2 = _mm256_xor_si256(_mm256_shuffle_epi8(r2, ash0), def_alpha); + r3 = _mm256_xor_si256(_mm256_shuffle_epi8(r3, ash0), def_alpha); + + sums = _mm256_add_epi32(sums, r0); + sums = _mm256_add_epi32(sums, r1); + sums = _mm256_add_epi32(sums, r2); + sums = _mm256_add_epi32(sums, r3); + } + + let row = row.chunks_exact(32 * 4).remainder(); + + for chunk in row.chunks_exact(32) { + let mut r0 = _mm256_loadu_si256(chunk.as_ptr() as *const __m256i); + + r0 = _mm256_xor_si256(_mm256_shuffle_epi8(r0, ash0), def_alpha); + + sums = _mm256_add_epi32(sums, r0); + } + + let row = row.chunks_exact(32).remainder(); + + let mut sums = _mm_add_epi32( + _mm256_castsi256_si128(sums), + _mm256_extracti128_si256::<1>(sums), + ); + let def_alpha = _mm_set1_epi32(first_alpha as i32); + + for chunk in row.chunks_exact(16) { + let mut r0 = _mm_loadu_si128(chunk.as_ptr() as *const __m128i); + + r0 = _mm_shuffle_epi8(r0, sh0); + + let alphas = _mm_xor_si128(r0, def_alpha); + + sums = _mm_add_epi32(sums, alphas); + } + + let row = row.chunks_exact(16).remainder(); + + let mut h_sum = _mm_hsum_epi32(sums); + + for chunk in row.chunks_exact(4) { + h_sum += chunk[3] as i32 ^ first_alpha as i32; + } + + if h_sum != 0 { + return true; + } + } + false +} + +/// Checks if image has constant alpha by xor rows for image 16bits +pub(crate) fn avx_has_non_constant_cap_alpha_rgba16( + store: &[u16], + width: usize, + stride: usize, +) -> bool { + unsafe { avx_has_non_constant_cap_alpha_rgba16_impl(store, width, stride) } +} + +#[target_feature(enable = "avx2")] +unsafe fn avx_has_non_constant_cap_alpha_rgba16_impl( + store: &[u16], + width: usize, + stride: usize, +) -> bool { + if store.is_empty() { + return true; + } + + let ash0 = _mm256_setr_epi8( + 6, 7, -1, -1, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 6, 7, -1, -1, 14, 15, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + ); + + let first_alpha = store[3]; + let def_alpha = _mm256_set1_epi32(first_alpha as i32); + + for row in 
store.chunks_exact(stride) { + let row = &row[0..width * 4]; + let mut sums = _mm256_set1_epi32(0); + for chunk in row.chunks_exact(16 * 4) { + let mut r0 = _mm256_loadu_si256(chunk.as_ptr() as *const __m256i); + let mut r1 = _mm256_loadu_si256(chunk.get_unchecked(16..).as_ptr() as *const __m256i); + let mut r2 = _mm256_loadu_si256(chunk.get_unchecked(32..).as_ptr() as *const __m256i); + let mut r3 = _mm256_loadu_si256(chunk.get_unchecked(48..).as_ptr() as *const __m256i); + + r0 = _mm256_shuffle_epi8(r0, ash0); + r1 = _mm256_shuffle_epi8(r1, ash0); + r2 = _mm256_shuffle_epi8(r2, ash0); + r3 = _mm256_shuffle_epi8(r3, ash0); + + let r01 = _mm256_xor_si256(_mm256_unpacklo_epi32(r0, r1), def_alpha); + let r23 = _mm256_xor_si256(_mm256_unpacklo_epi32(r2, r3), def_alpha); + + sums = _mm256_add_epi32(sums, r01); + sums = _mm256_add_epi32(sums, r23); + } + + let row = row.chunks_exact(16 * 4).remainder(); + + for chunk in row.chunks_exact(16) { + let mut r0 = _mm256_loadu_si256(chunk.as_ptr() as *const __m256i); + + r0 = _mm256_shuffle_epi8(r0, ash0); + + let alphas = _mm256_xor_si256(_mm256_unpacklo_epi32(r0, r0), def_alpha); + + sums = _mm256_add_epi32(sums, alphas); + } + + let row = row.chunks_exact(16).remainder(); + + let mut h_sum = _mm_hsum_epi32(_mm_add_epi32( + _mm256_castsi256_si128(sums), + _mm256_extracti128_si256::<1>(sums), + )); + + for chunk in row.chunks_exact(4) { + h_sum += chunk[3] as i32 ^ first_alpha as i32; + } + + if h_sum != 0 { + return true; + } + } + false +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn check_alpha_exists_rgba8() { + let image_size = 256usize; + let mut image = vec![0u8; image_size * image_size * 4]; + image[3 + 150 * 4] = 75; + let has_alpha = avx_has_non_constant_cap_alpha_rgba8(&image, image_size, image_size * 4); + assert_eq!(true, has_alpha); + } + + #[test] + fn check_alpha_not_exists_rgba8() { + let image_size = 256usize; + let image = vec![255u8; image_size * image_size * 4]; + let has_alpha = avx_has_non_constant_cap_alpha_rgba8(&image, image_size, image_size * 4); + assert_eq!(false, has_alpha); + } + + #[test] + fn check_alpha_exists_rgba16() { + let image_size = 256usize; + let mut image = vec![0u16; image_size * image_size * 4]; + image[3] = 715; + image[7] = 715; + image[11] = 715; + image[15] = 715; + let has_alpha = avx_has_non_constant_cap_alpha_rgba16(&image, image_size, image_size * 4); + assert_eq!(true, has_alpha); + } + + #[test] + fn check_alpha_not_exists_rgba16() { + let image_size = 256usize; + let image = vec![255u16; image_size * image_size * 4]; + let has_alpha = avx_has_non_constant_cap_alpha_rgba16(&image, image_size, image_size * 4); + assert_eq!(false, has_alpha); + } +} diff --git a/src/avx2/mod.rs b/src/avx2/mod.rs index 39fb038..9cb38ad 100644 --- a/src/avx2/mod.rs +++ b/src/avx2/mod.rs @@ -32,6 +32,7 @@ mod alpha_f16; mod alpha_f32; mod alpha_u16; mod alpha_u8; +mod check_alpha; #[cfg(feature = "half")] mod rgba_f16; mod rgba_f32; @@ -51,6 +52,9 @@ pub(crate) use alpha_f32::avx_unpremultiply_alpha_rgba_f32; pub(crate) use alpha_u16::{avx_premultiply_alpha_rgba_u16, avx_unpremultiply_alpha_rgba_u16}; pub(crate) use alpha_u8::avx_premultiply_alpha_rgba; pub(crate) use alpha_u8::avx_unpremultiply_alpha_rgba; +pub(crate) use check_alpha::{ + avx_has_non_constant_cap_alpha_rgba16, avx_has_non_constant_cap_alpha_rgba8, +}; #[cfg(feature = "half")] pub(crate) use rgba_f16::{ convolve_horizontal_rgba_avx_row_one_f16, convolve_horizontal_rgba_avx_rows_4_f16, diff --git a/src/avx2/rgba_f16.rs 
b/src/avx2/rgba_f16.rs index 105b2f2..b2bc4d5 100644 --- a/src/avx2/rgba_f16.rs +++ b/src/avx2/rgba_f16.rs @@ -49,8 +49,7 @@ unsafe fn convolve_horizontal_parts_one_rgba_f16( let src_ptr = src.add(start_x * COMPONENTS); let rgb_pixel = _mm_loadu_si64(src_ptr as *const u8); let pixels = avx_combine_ps(_mm_cvtph_ps(rgb_pixel), _mm_setzero_ps()); - let acc = _mm256_fma_ps::(store_0, pixels, weight0); - acc + _mm256_fma_ps::(store_0, pixels, weight0) } #[inline(always)] @@ -70,8 +69,7 @@ unsafe fn convolve_horizontal_parts_4_rgba_f16( let rgb_pixel_1 = _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(rgb_pixels_row_0)); let acc = _mm256_fma_ps::(store_0, rgb_pixel_0, weight0); - let acc = _mm256_fma_ps::(acc, rgb_pixel_1, weight1); - acc + _mm256_fma_ps::(acc, rgb_pixel_1, weight1) } #[inline(always)] @@ -112,8 +110,7 @@ unsafe fn convolve_horizontal_parts_2_rgba_f16( const COMPONENTS: usize = 4; let src_ptr = src.add(start_x * COMPONENTS); let rgb_pixels = _mm_loadu_si128(src_ptr as *const __m128i); - let acc = _mm256_fma_ps::(store_0, _mm256_cvtph_ps(rgb_pixels), weight0); - acc + _mm256_fma_ps::(store_0, _mm256_cvtph_ps(rgb_pixels), weight0) } pub(crate) fn convolve_horizontal_rgba_avx_row_one_f16( diff --git a/src/avx2/rgba_u8_lb.rs b/src/avx2/rgba_u8_lb.rs index b38ad25..d9663d3 100644 --- a/src/avx2/rgba_u8_lb.rs +++ b/src/avx2/rgba_u8_lb.rs @@ -98,220 +98,214 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_impl( dst_stride: usize, filter_weights: &FilterWeights, ) { - unsafe { - const CHANNELS: usize = 4; + const CHANNELS: usize = 4; - const SCALE: i32 = 6; - const V_SHR: i32 = SCALE; - const ROUNDING: i16 = 1 << (V_SHR - 1); + const SCALE: i32 = 6; + const V_SHR: i32 = SCALE; + const ROUNDING: i16 = 1 << (V_SHR - 1); - let vld = _mm256_setr_epi16( - ROUNDING, ROUNDING, ROUNDING, ROUNDING, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ); + let vld = _mm256_setr_epi16( + ROUNDING, ROUNDING, ROUNDING, ROUNDING, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ); + + let shuffle_weights = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3); + + let (row0_ref, rest) = dst.split_at_mut(dst_stride); + let (row1_ref, rest) = rest.split_at_mut(dst_stride); + let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); + + let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS); + let iter_row1 = row1_ref.chunks_exact_mut(CHANNELS); + let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS); + let iter_row3 = row3_ref.chunks_exact_mut(CHANNELS); + + for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 + .zip(iter_row1) + .zip(iter_row2) + .zip(iter_row3) + .zip(filter_weights.bounds.iter()) + .zip( + filter_weights + .weights + .chunks_exact(filter_weights.aligned_size), + ) + { + let mut jx = 0usize; + let mut store_0 = vld; + let mut store_1 = vld; + let mut store_2 = vld; + let mut store_3 = vld; + + let src0 = src; + let src1 = src0.get_unchecked(src_stride..); + let src2 = src1.get_unchecked(src_stride..); + let src3 = src2.get_unchecked(src_stride..); + + while jx + 8 < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 8)); - let shuffle_weights = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3); - - let (row0_ref, rest) = dst.split_at_mut(dst_stride); - let (row1_ref, rest) = rest.split_at_mut(dst_stride); - let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); - - let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS); - let iter_row1 = row1_ref.chunks_exact_mut(CHANNELS); - let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS); - let iter_row3 = 
row3_ref.chunks_exact_mut(CHANNELS); - - for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 - .zip(iter_row1) - .zip(iter_row2) - .zip(iter_row3) - .zip(filter_weights.bounds.iter()) - .zip( - filter_weights - .weights - .chunks_exact(filter_weights.aligned_size), - ) - { - let mut jx = 0usize; - let mut store_0 = vld; - let mut store_1 = vld; - let mut store_2 = vld; - let mut store_3 = vld; - - let src0 = src; - let src1 = src0.get_unchecked(src_stride..); - let src2 = src1.get_unchecked(src_stride..); - let src3 = src2.get_unchecked(src_stride..); - - while jx + 8 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 8)); - - let w0 = w_ptr[0]; - let w1 = w_ptr[1]; - let w2 = w_ptr[2]; - let w3 = w_ptr[3]; - let w4 = w_ptr[4]; - let w5 = w_ptr[5]; - let w6 = w_ptr[6]; - let w7 = w_ptr[7]; - - let weights0 = _mm256_setr_epi16( - w0, w0, w0, w0, w1, w1, w1, w1, w4, w4, w4, w5, w5, w5, w5, w5, - ); - let weights1 = _mm256_setr_epi16( - w2, w2, w2, w2, w3, w3, w3, w3, w6, w6, w6, w6, w7, w7, w7, w7, - ); - - let start_bounds = bounds.start + jx; - - let rgb_pixel_0 = _mm256_loadu_si256( - src0.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m256i, - ); - let rgb_pixel_1 = _mm256_loadu_si256( - src1.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m256i, - ); - let rgb_pixel_2 = _mm256_loadu_si256( - src2.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m256i, - ); - let rgb_pixel_3 = _mm256_loadu_si256( - src3.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m256i, - ); - - store_0 = hdot4(store_0, rgb_pixel_0, weights0, weights1); - store_1 = hdot4(store_1, rgb_pixel_1, weights0, weights1); - store_2 = hdot4(store_2, rgb_pixel_2, weights0, weights1); - store_3 = hdot4(store_3, rgb_pixel_3, weights0, weights1); - - jx += 8; - } - - while jx + 4 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 4)); - - let w0 = w_ptr[0]; - let w1 = w_ptr[1]; - let w2 = w_ptr[2]; - let w3 = w_ptr[3]; - - let weights = _mm256_setr_epi16( - w0, w0, w0, w0, w1, w1, w1, w1, w2, w2, w2, w2, w3, w3, w3, w3, - ); - - let start_bounds = bounds.start + jx; - - let rgb_pixel_0 = - _mm256_permute4x64_epi64::<0x50>(_mm256_castsi128_si256(_mm_loadu_si128( - src0.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ))); - - let rgb_pixel_1 = - _mm256_permute4x64_epi64::<0x50>(_mm256_castsi128_si256(_mm_loadu_si128( - src1.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ))); - let rgb_pixel_2 = - _mm256_permute4x64_epi64::<0x50>(_mm256_castsi128_si256(_mm_loadu_si128( - src2.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ))); - let rgb_pixel_3 = - _mm256_permute4x64_epi64::<0x50>(_mm256_castsi128_si256(_mm_loadu_si128( - src3.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ))); - - store_0 = hdot2(store_0, rgb_pixel_0, weights); - store_1 = hdot2(store_1, rgb_pixel_1, weights); - store_2 = hdot2(store_2, rgb_pixel_2, weights); - store_3 = hdot2(store_3, rgb_pixel_3, weights); - - jx += 4; - } - - let mut store_0 = _mm_add_epi16( - _mm256_castsi256_si128(store_0), - _mm256_extracti128_si256::<1>(store_0), + let w0 = w_ptr[0]; + let w1 = w_ptr[1]; + let w2 = w_ptr[2]; + let w3 = w_ptr[3]; + let w4 = w_ptr[4]; + let w5 = w_ptr[5]; + let w6 = w_ptr[6]; + let w7 = w_ptr[7]; + + let weights0 = _mm256_setr_epi16( + w0, w0, w0, w0, w1, w1, w1, w1, w4, w4, w4, w5, w5, w5, w5, w5, ); - let mut store_1 = _mm_add_epi16( - 
_mm256_castsi256_si128(store_1), - _mm256_extracti128_si256::<1>(store_1), + let weights1 = _mm256_setr_epi16( + w2, w2, w2, w2, w3, w3, w3, w3, w6, w6, w6, w6, w7, w7, w7, w7, ); - let mut store_2 = _mm_add_epi16( - _mm256_castsi256_si128(store_2), - _mm256_extracti128_si256::<1>(store_2), + + let start_bounds = bounds.start + jx; + + let rgb_pixel_0 = _mm256_loadu_si256( + src0.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m256i, ); - let mut store_3 = _mm_add_epi16( - _mm256_castsi256_si128(store_3), - _mm256_extracti128_si256::<1>(store_3), + let rgb_pixel_1 = _mm256_loadu_si256( + src1.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m256i, ); - - while jx + 2 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 2)); - let bounds_start = bounds.start + jx; - - let weight01 = _mm_shuffle_epi8( - _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()), - shuffle_weights, - ); - - let rgb_pixel_0 = - _mm_loadu_si64(src0.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - let rgb_pixel_1 = - _mm_loadu_si64(src1.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - let rgb_pixel_2 = - _mm_loadu_si64(src2.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - let rgb_pixel_3 = - _mm_loadu_si64(src3.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - - store_0 = hdot(store_0, rgb_pixel_0, weight01); - store_1 = hdot(store_1, rgb_pixel_1, weight01); - store_2 = hdot(store_2, rgb_pixel_2, weight01); - store_3 = hdot(store_3, rgb_pixel_3, weight01); - - jx += 2; - } - - while jx < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 1)); - - let weight0 = _mm_set1_epi16(w_ptr[0]); - - let start_bounds = bounds.start + jx; - - store_0 = - convolve_horizontal_parts_one_rgba_sse(start_bounds, src0, weight0, store_0); - store_1 = - convolve_horizontal_parts_one_rgba_sse(start_bounds, src1, weight0, store_1); - store_2 = - convolve_horizontal_parts_one_rgba_sse(start_bounds, src2, weight0, store_2); - store_3 = - convolve_horizontal_parts_one_rgba_sse(start_bounds, src3, weight0, store_3); - jx += 1; - } - - store_0 = _mm_add_hi_lo_epi16(store_0); - store_1 = _mm_add_hi_lo_epi16(store_1); - store_2 = _mm_add_hi_lo_epi16(store_2); - store_3 = _mm_add_hi_lo_epi16(store_3); - - let store_16_8_0 = _mm_srai_epi16::(store_0); - let store_16_8_1 = _mm_srai_epi16::(store_1); - let store_16_8_2 = _mm_srai_epi16::(store_2); - let store_16_8_3 = _mm_srai_epi16::(store_3); - - _mm_storeu_si32( - chunk0.as_mut_ptr() as *mut _, - _mm_packus_epi16(store_16_8_0, store_16_8_0), + let rgb_pixel_2 = _mm256_loadu_si256( + src2.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m256i, ); - _mm_storeu_si32( - chunk1.as_mut_ptr() as *mut _, - _mm_packus_epi16(store_16_8_1, store_16_8_1), + let rgb_pixel_3 = _mm256_loadu_si256( + src3.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m256i, ); - _mm_storeu_si32( - chunk2.as_mut_ptr() as *mut _, - _mm_packus_epi16(store_16_8_2, store_16_8_2), + + store_0 = hdot4(store_0, rgb_pixel_0, weights0, weights1); + store_1 = hdot4(store_1, rgb_pixel_1, weights0, weights1); + store_2 = hdot4(store_2, rgb_pixel_2, weights0, weights1); + store_3 = hdot4(store_3, rgb_pixel_3, weights0, weights1); + + jx += 8; + } + + while jx + 4 < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 4)); + + let w0 = w_ptr[0]; + let w1 = w_ptr[1]; + let w2 = w_ptr[2]; + let w3 = w_ptr[3]; + + let weights = _mm256_setr_epi16( + w0, w0, w0, w0, w1, w1, w1, w1, w2, w2, w2, w2, w3, w3, w3, w3, ); - 
_mm_storeu_si32( - chunk3.as_mut_ptr() as *mut _, - _mm_packus_epi16(store_16_8_3, store_16_8_3), + + let start_bounds = bounds.start + jx; + + let rgb_pixel_0 = + _mm256_permute4x64_epi64::<0x50>(_mm256_castsi128_si256(_mm_loadu_si128( + src0.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, + ))); + + let rgb_pixel_1 = + _mm256_permute4x64_epi64::<0x50>(_mm256_castsi128_si256(_mm_loadu_si128( + src1.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, + ))); + let rgb_pixel_2 = + _mm256_permute4x64_epi64::<0x50>(_mm256_castsi128_si256(_mm_loadu_si128( + src2.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, + ))); + let rgb_pixel_3 = + _mm256_permute4x64_epi64::<0x50>(_mm256_castsi128_si256(_mm_loadu_si128( + src3.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, + ))); + + store_0 = hdot2(store_0, rgb_pixel_0, weights); + store_1 = hdot2(store_1, rgb_pixel_1, weights); + store_2 = hdot2(store_2, rgb_pixel_2, weights); + store_3 = hdot2(store_3, rgb_pixel_3, weights); + + jx += 4; + } + + let mut store_0 = _mm_add_epi16( + _mm256_castsi256_si128(store_0), + _mm256_extracti128_si256::<1>(store_0), + ); + let mut store_1 = _mm_add_epi16( + _mm256_castsi256_si128(store_1), + _mm256_extracti128_si256::<1>(store_1), + ); + let mut store_2 = _mm_add_epi16( + _mm256_castsi256_si128(store_2), + _mm256_extracti128_si256::<1>(store_2), + ); + let mut store_3 = _mm_add_epi16( + _mm256_castsi256_si128(store_3), + _mm256_extracti128_si256::<1>(store_3), + ); + + while jx + 2 < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 2)); + let bounds_start = bounds.start + jx; + + let weight01 = _mm_shuffle_epi8( + _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()), + shuffle_weights, ); + + let rgb_pixel_0 = + _mm_loadu_si64(src0.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); + let rgb_pixel_1 = + _mm_loadu_si64(src1.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); + let rgb_pixel_2 = + _mm_loadu_si64(src2.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); + let rgb_pixel_3 = + _mm_loadu_si64(src3.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); + + store_0 = hdot(store_0, rgb_pixel_0, weight01); + store_1 = hdot(store_1, rgb_pixel_1, weight01); + store_2 = hdot(store_2, rgb_pixel_2, weight01); + store_3 = hdot(store_3, rgb_pixel_3, weight01); + + jx += 2; } + + while jx < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 1)); + + let weight0 = _mm_set1_epi16(w_ptr[0]); + + let start_bounds = bounds.start + jx; + + store_0 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src0, weight0, store_0); + store_1 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src1, weight0, store_1); + store_2 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src2, weight0, store_2); + store_3 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src3, weight0, store_3); + jx += 1; + } + + store_0 = _mm_add_hi_lo_epi16(store_0); + store_1 = _mm_add_hi_lo_epi16(store_1); + store_2 = _mm_add_hi_lo_epi16(store_2); + store_3 = _mm_add_hi_lo_epi16(store_3); + + let store_16_8_0 = _mm_srai_epi16::(store_0); + let store_16_8_1 = _mm_srai_epi16::(store_1); + let store_16_8_2 = _mm_srai_epi16::(store_2); + let store_16_8_3 = _mm_srai_epi16::(store_3); + + _mm_storeu_si32( + chunk0.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_0, store_16_8_0), + ); + _mm_storeu_si32( + chunk1.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_1, store_16_8_1), + ); + _mm_storeu_si32( + 
chunk2.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_2, store_16_8_2), + ); + _mm_storeu_si32( + chunk3.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_3, store_16_8_3), + ); } } diff --git a/src/avx2/vertical_u16_lb.rs b/src/avx2/vertical_u16_lb.rs index c083c57..2622766 100644 --- a/src/avx2/vertical_u16_lb.rs +++ b/src/avx2/vertical_u16_lb.rs @@ -59,7 +59,7 @@ unsafe fn convolve_column_lb_avx_u16_impl( weight: &[i16], bit_depth: u32, ) { - assert!(bit_depth >= 1 && bit_depth <= 16); + assert!((1..=16).contains(&bit_depth)); let max_colors = (1 << bit_depth) - 1; let mut cx = 0usize; @@ -217,8 +217,8 @@ unsafe fn convolve_column_lb_avx_u16_impl( if bounds_size == 2 { let weights = weight.get_unchecked(0..2); - let v_weight0 = _mm_set1_epi32(weights[0] as i32); - let v_weight1 = _mm_set1_epi32(weights[1] as i32); + let v_weight0 = _mm_set1_epi16(weights[0]); + let v_weight1 = _mm_set1_epi16(weights[1]); let py = bounds.start; let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); @@ -238,9 +238,9 @@ unsafe fn convolve_column_lb_avx_u16_impl( } else if bounds_size == 3 { let weights = weight.get_unchecked(0..3); - let v_weight0 = _mm_set1_epi32(weights[0] as i32); - let v_weight1 = _mm_set1_epi32(weights[1] as i32); - let v_weight2 = _mm_set1_epi32(weights[2] as i32); + let v_weight0 = _mm_set1_epi16(weights[0]); + let v_weight1 = _mm_set1_epi16(weights[1]); + let v_weight2 = _mm_set1_epi16(weights[2]); let py = bounds.start; let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); @@ -267,10 +267,10 @@ unsafe fn convolve_column_lb_avx_u16_impl( } else if bounds_size == 4 { let weights = weight.get_unchecked(0..4); - let v_weight0 = _mm_set1_epi32(weights[0] as i32); - let v_weight1 = _mm_set1_epi32(weights[1] as i32); - let v_weight2 = _mm_set1_epi32(weights[2] as i32); - let v_weight3 = _mm_set1_epi32(weights[3] as i32); + let v_weight0 = _mm_set1_epi16(weights[0]); + let v_weight1 = _mm_set1_epi16(weights[1]); + let v_weight2 = _mm_set1_epi16(weights[2]); + let v_weight3 = _mm_set1_epi16(weights[3]); let py = bounds.start; let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); @@ -306,7 +306,7 @@ unsafe fn convolve_column_lb_avx_u16_impl( let py = bounds.start + j; let src_ptr = src.get_unchecked((src_stride * py + v_dx)..); - let v_weight = _mm_set1_epi32(k_weight as i32); + let v_weight = _mm_set1_epi16(k_weight); let item_row = _mm_loadu_si64(src_ptr.as_ptr() as *const u8); diff --git a/src/avx2/vertical_u8.rs b/src/avx2/vertical_u8.rs index 31fb2e6..cb1ddee 100644 --- a/src/avx2/vertical_u8.rs +++ b/src/avx2/vertical_u8.rs @@ -70,6 +70,7 @@ unsafe fn convolve_vertical_part_avx_64( ) { let zeros = _mm256_setzero_si256(); let vld = _mm256_set1_epi32(ROUNDING_CONST); + let mut store_0 = vld; let mut store_1 = vld; let mut store_2 = vld; @@ -129,7 +130,7 @@ unsafe fn convolve_vertical_part_avx_64( for j in jj..bounds_size { let py = start_y + j; let weight = *filter.get_unchecked(j); - let v_weight = _mm256_set1_epi32(weight as i32); + let v_weight = _mm256_set1_epi16(weight); let src_ptr = src.get_unchecked((src_stride * py + px)..); let item_row_0 = _mm256_loadu_si256(src_ptr.as_ptr() as *const __m256i); @@ -188,7 +189,7 @@ unsafe fn convolve_vertical_part_avx_32( for j in 0..bounds_size { let py = start_y + j; let weight = *filter.get_unchecked(j); - let v_weight = _mm256_set1_epi32(weight as i32); + let v_weight = _mm256_set1_epi16(weight); let src_ptr = src.get_unchecked((src_stride * py + px)..); let item_row = _mm256_loadu_si256(src_ptr.as_ptr() 
as *const __m256i); @@ -232,14 +233,14 @@ unsafe fn convolve_vertical_part_8_avx( for j in 0..bounds_size { let py = start_y + j; let weight = *filter.get_unchecked(j); - let v_weight = _mm256_set1_epi32(weight as i32); + let v_weight = _mm256_set1_epi16(weight); let src_ptr = src.get_unchecked((src_stride * py + px)..); let item_row = _mm256_cvtepu16_epi32(_mm_unpacklo_epi8( _mm_loadu_si64(src_ptr.as_ptr()), _mm_setzero_si128(), )); - store_0 = _mm256_add_epi32(store_0, _mm256_mullo_epi32(item_row, v_weight)); + store_0 = _mm256_add_epi32(store_0, _mm256_madd_epi16(item_row, v_weight)); } const MASK: i32 = shuffle(3, 1, 2, 0); @@ -278,8 +279,8 @@ unsafe fn convolve_vertical_part_avx( if bounds_size == 2 { let py = start_y; let weight = filter.get_unchecked(0..2); - let v_weight0 = _mm256_set1_epi32(weight[0] as i32); - let v_weight1 = _mm256_set1_epi32(weight[1] as i32); + let v_weight0 = _mm256_set1_epi16(weight[0]); + let v_weight1 = _mm256_set1_epi16(weight[1]); let src_ptr0 = src.get_unchecked(src_stride * py + px); let src_ptr1 = src.get_unchecked(src_stride * (py + 1) + px); let item_row0 = _mm256_insert_epi8::<0>(_mm256_setzero_si256(), *src_ptr0 as i8); @@ -290,9 +291,9 @@ unsafe fn convolve_vertical_part_avx( } else if bounds_size == 3 { let py = start_y; let weight = filter.get_unchecked(0..3); - let v_weight0 = _mm256_set1_epi32(weight[0] as i32); - let v_weight1 = _mm256_set1_epi32(weight[1] as i32); - let v_weight2 = _mm256_set1_epi32(weight[2] as i32); + let v_weight0 = _mm256_set1_epi16(weight[0]); + let v_weight1 = _mm256_set1_epi16(weight[1]); + let v_weight2 = _mm256_set1_epi16(weight[2]); let src_ptr0 = src.get_unchecked(src_stride * py + px); let src_ptr1 = src.get_unchecked(src_stride * (py + 1) + px); let src_ptr2 = src.get_unchecked(src_stride * (py + 2) + px); @@ -306,10 +307,10 @@ unsafe fn convolve_vertical_part_avx( } else if bounds_size == 4 { let py = start_y; let weight = filter.get_unchecked(0..4); - let v_weight0 = _mm256_set1_epi32(weight[0] as i32); - let v_weight1 = _mm256_set1_epi32(weight[1] as i32); - let v_weight2 = _mm256_set1_epi32(weight[2] as i32); - let v_weight3 = _mm256_set1_epi32(weight[3] as i32); + let v_weight0 = _mm256_set1_epi16(weight[0]); + let v_weight1 = _mm256_set1_epi16(weight[1]); + let v_weight2 = _mm256_set1_epi16(weight[2]); + let v_weight3 = _mm256_set1_epi16(weight[3]); let src_ptr0 = src.get_unchecked(src_stride * py + px); let src_ptr1 = src.get_unchecked(src_stride * (py + 1) + px); let src_ptr2 = src.get_unchecked(src_stride * (py + 2) + px); @@ -327,7 +328,7 @@ unsafe fn convolve_vertical_part_avx( for j in 0..bounds.size { let py = start_y + j; let weight = *filter.get_unchecked(j); - let v_weight = _mm256_set1_epi32(weight as i32); + let v_weight = _mm256_set1_epi16(weight); let src_ptr = src.get_unchecked(src_stride * py + px); let item_row = _mm256_setr_epi32(*src_ptr as i32, 0, 0, 0, 0, 0, 0, 0); diff --git a/src/avx2/vertical_u8_lp.rs b/src/avx2/vertical_u8_lp.rs index 7139282..f6dcda0 100644 --- a/src/avx2/vertical_u8_lp.rs +++ b/src/avx2/vertical_u8_lp.rs @@ -85,6 +85,7 @@ unsafe fn convolve_vertical_avx2_row_impl( let mut cx = 0usize; let mut rem = dst; + let iter_64 = rem.chunks_exact_mut(64); for dst in iter_64 { @@ -228,6 +229,7 @@ unsafe fn convolve_vertical_avx2_row_impl( } rem = rem.chunks_exact_mut(64).into_remainder(); + let iter_32 = rem.chunks_exact_mut(32); for dst in iter_32 { diff --git a/src/cpu_features.rs b/src/cpu_features.rs index 975fd4a..e110a04 100644 --- a/src/cpu_features.rs +++ 
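The switch from _mm256_set1_epi32/_mm256_mullo_epi32 to _mm256_set1_epi16/_mm256_madd_epi16 in the vertical kernels above keeps the same fixed-point arithmetic but folds the widening multiply and the adjacent-pair add into a single instruction (the upper half of each 32-bit lane is zero, so the pair add contributes nothing extra). A scalar sketch of the accumulation being vectorized, assuming i16 weights with PRECISION fractional bits and an illustrative tap layout:

fn convolve_column_fixed(taps: &[(u8, i16)], precision: u32) -> u8 {
    // Rounding offset seeded into the accumulator, like the ROUNDING_CONST splat.
    let mut acc: i32 = 1 << (precision - 1);
    for &(pixel, weight) in taps {
        // _mm256_madd_epi16 performs this multiply-accumulate per 32-bit lane.
        acc += pixel as i32 * weight as i32;
    }
    (acc >> precision).clamp(0, 255) as u8
}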
b/src/cpu_features.rs @@ -64,7 +64,7 @@ fn apple_has_cpu_feature(_feature_name: &str) -> bool { /// Test aarch64 cpu with *fp16* check, /// on *Apple* platform [libc](https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics) be used -#[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "half"))] +#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] pub(crate) fn is_aarch_f16_supported() -> bool { #[cfg(any(target_os = "macos", target_os = "ios"))] { diff --git a/src/dispatch_group_f16.rs b/src/dispatch_group_f16.rs index 03bd2d2..27ede3b 100644 --- a/src/dispatch_group_f16.rs +++ b/src/dispatch_group_f16.rs @@ -35,6 +35,7 @@ use rayon::iter::{IndexedParallelIterator, ParallelIterator}; use rayon::prelude::{ParallelSlice, ParallelSliceMut}; use rayon::ThreadPool; +#[allow(clippy::type_complexity)] pub(crate) fn convolve_vertical_dispatch_f16( image_store: &ImageStore, filter_weights: FilterWeights, @@ -78,6 +79,7 @@ pub(crate) fn convolve_vertical_dispatch_f16( } } +#[allow(clippy::type_complexity)] pub(crate) fn convolve_horizontal_dispatch_f16( image_store: &ImageStore, filter_weights: FilterWeights, diff --git a/src/dispatch_group_u16.rs b/src/dispatch_group_u16.rs index 0beee63..af33b6a 100644 --- a/src/dispatch_group_u16.rs +++ b/src/dispatch_group_u16.rs @@ -27,7 +27,11 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -use crate::filter_weights::FilterWeights; +#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +use crate::cpu_features::is_aarch_f16_supported; +use crate::filter_weights::{ + DefaultWeightsConverter, FilterBounds, FilterWeights, WeightsConverter, +}; use crate::handler_provider::{ ColumnHandlerFixedPoint, ColumnHandlerFloatingPoint, RowHandlerFixedPoint, RowHandlerFloatingPoint, @@ -189,25 +193,50 @@ pub(crate) fn convolve_vertical_dispatch_u16( ); }); } else { - let approx = filter_weights.numerical_approximation_i16::(0); - destination_image - .par_chunks_exact_mut(dst_stride) - .enumerate() - .for_each(|(y, row)| { - let bounds = filter_weights.bounds[y]; - let filter_offset = y * filter_weights.aligned_size; - let weights = &approx.weights[filter_offset..]; - let source_buffer = image_store.buffer.as_ref(); - u16::handle_fixed_column::( + #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + { + if is_aarch_f16_supported() { + use crate::filter_weights::WeightFloat16Converter; + execute_low_precision_row( + true, + image_store, + &filter_weights, + src_stride, + dst_stride, + bit_depth, dst_width, - &bounds, - source_buffer, - row, + destination_image, + HighBitDepthFloat16LowerHandler::default(), + WeightFloat16Converter::default(), + ); + } else { + execute_low_precision_row( + true, + image_store, + &filter_weights, src_stride, - weights, - bit_depth as u32, + dst_stride, + bit_depth, + dst_width, + destination_image, + DefaultHighBitDepthLowerHandler::default(), + DefaultWeightsConverter::default(), ); - }); + } + } + #[cfg(not(all(target_arch = "aarch64", target_feature = "neon")))] + execute_low_precision_row( + true, + image_store, + &filter_weights, + src_stride, + dst_stride, + bit_depth, + dst_width, + destination_image, + DefaultHighBitDepthLowerHandler::default(), + DefaultWeightsConverter::default(), + ); } }); } else if bit_depth > 12 { @@ -231,8 +260,145 @@ pub(crate) fn convolve_vertical_dispatch_u16( ); }); } else { - let destination_image = destination.buffer.borrow_mut(); - let approx = 
filter_weights.numerical_approximation_i16::(0); + #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + { + if is_aarch_f16_supported() { + use crate::filter_weights::WeightFloat16Converter; + execute_low_precision_row( + false, + image_store, + &filter_weights, + src_stride, + dst_stride, + bit_depth, + dst_width, + destination.buffer.borrow_mut(), + HighBitDepthFloat16LowerHandler::default(), + WeightFloat16Converter::default(), + ); + } else { + execute_low_precision_row( + false, + image_store, + &filter_weights, + src_stride, + dst_stride, + bit_depth, + dst_width, + destination.buffer.borrow_mut(), + DefaultHighBitDepthLowerHandler::default(), + DefaultWeightsConverter::default(), + ); + } + } + #[cfg(not(all(target_arch = "aarch64", target_feature = "neon")))] + execute_low_precision_row( + false, + image_store, + &filter_weights, + src_stride, + dst_stride, + bit_depth, + dst_width, + destination.buffer.borrow_mut(), + DefaultHighBitDepthLowerHandler::default(), + DefaultWeightsConverter::default(), + ); + } +} + +trait HandleHighBitDepthLower { + fn handle_fixed_column( + &self, + dst_width: usize, + bounds: &FilterBounds, + src: &[u16], + dst: &mut [u16], + src_stride: usize, + weight: &[i16], + bit_depth: u32, + ); +} + +#[derive(Default)] +struct DefaultHighBitDepthLowerHandler {} + +impl HandleHighBitDepthLower + for DefaultHighBitDepthLowerHandler +{ + fn handle_fixed_column( + &self, + dst_width: usize, + bounds: &FilterBounds, + src: &[u16], + dst: &mut [u16], + src_stride: usize, + weight: &[i16], + bit_depth: u32, + ) { + u16::handle_fixed_column::( + dst_width, bounds, src, dst, src_stride, weight, bit_depth, + ); + } +} + +#[derive(Default)] +#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +struct HighBitDepthFloat16LowerHandler {} + +#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +impl HandleHighBitDepthLower + for HighBitDepthFloat16LowerHandler +{ + fn handle_fixed_column( + &self, + dst_width: usize, + bounds: &FilterBounds, + src: &[u16], + dst: &mut [u16], + src_stride: usize, + weight: &[i16], + bit_depth: u32, + ) { + use crate::neon::convolve_column_lb_u16_f16; + convolve_column_lb_u16_f16(dst_width, bounds, src, dst, src_stride, weight, bit_depth); + } +} + +#[inline] +fn execute_low_precision_row( + is_parallel: bool, + image_store: &ImageStore, + filter_weights: &FilterWeights, + src_stride: usize, + dst_stride: usize, + bit_depth: usize, + dst_width: usize, + destination_image: &mut [u16], + handler: impl HandleHighBitDepthLower + Sync, + weights: impl WeightsConverter, +) { + let approx = weights.prepare_weights(filter_weights); + if is_parallel { + destination_image + .par_chunks_exact_mut(dst_stride) + .enumerate() + .for_each(|(y, row)| { + let bounds = filter_weights.bounds[y]; + let filter_offset = y * filter_weights.aligned_size; + let weights = &approx.weights[filter_offset..]; + let source_buffer = image_store.buffer.as_ref(); + handler.handle_fixed_column( + dst_width, + &bounds, + source_buffer, + row, + src_stride, + weights, + bit_depth as u32, + ); + }); + } else { destination_image .chunks_exact_mut(dst_stride) .enumerate() @@ -241,7 +407,7 @@ pub(crate) fn convolve_vertical_dispatch_u16( let filter_offset = y * filter_weights.aligned_size; let weights = &approx.weights[filter_offset..]; let source_buffer = image_store.buffer.as_ref(); - u16::handle_fixed_column::( + handler.handle_fixed_column( dst_width, &bounds, source_buffer, diff --git a/src/dispatch_group_u8.rs b/src/dispatch_group_u8.rs index 
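The refactor above collapses the duplicated parallel and serial column loops into one generic execute_low_precision_row, parameterized by a column handler and a weights converter, so the NEON f16 path and the default fixed-point path share a single driver. A self-contained miniature of that shape; the traits here are simplified stand-ins, and the rayon branch and per-row filter bounds are omitted:

trait Column {
    fn run(&self, src: &[u16], dst: &mut [u16], weights: &[i16]);
}

trait Weights {
    fn prepare(&self, weights: &[f32]) -> Vec<i16>;
}

fn execute_rows(
    rows: &mut [u16],
    row_len: usize,
    src: &[u16],
    taps: &[f32],
    handler: &impl Column,
    converter: &impl Weights,
) {
    // The weights are lowered once; every destination row then reuses them.
    let approx = converter.prepare(taps);
    for row in rows.chunks_exact_mut(row_len) {
        handler.run(src, row, &approx);
    }
}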
89d9eaa..bcdb3cf 100644 --- a/src/dispatch_group_u8.rs +++ b/src/dispatch_group_u8.rs @@ -27,7 +27,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -use crate::filter_weights::{FilterBounds, FilterWeights}; +use crate::filter_weights::{FilterBounds, FilterWeights, WeightsConverter}; use crate::image_store::ImageStoreMut; use crate::support::PRECISION; use crate::ImageStore; @@ -44,8 +44,9 @@ pub(crate) fn convolve_horizontal_dispatch_u8( pool: &Option, dispatcher_4_rows: Option)>, dispatcher_1_row: fn(&[u8], &mut [u8], &FilterWeights), + weights_converter: impl WeightsConverter, ) { - let approx_weights = filter_weights.numerical_approximation_i16::(0); + let approx_weights = weights_converter.prepare_weights(&filter_weights); let src = image_store.buffer.as_ref(); let dst = destination.buffer.borrow_mut(); diff --git a/src/f16.rs b/src/f16.rs index a445a9e..6573ace 100644 --- a/src/f16.rs +++ b/src/f16.rs @@ -95,16 +95,18 @@ fn convolve_horizontal_rgb_native_row_f16( convolve_row_handler_floating_point::(src, dst, filter_weights, 8) } -impl<'a> HorizontalConvolutionPass for ImageStore<'a, f16, 4> { +impl HorizontalConvolutionPass for ImageStore<'_, f16, 4> { fn convolve_horizontal( &self, filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, ) { + #[allow(clippy::type_complexity)] let mut _dispatcher_4_rows: Option< fn(usize, usize, &FilterWeights, &[f16], usize, &mut [f16], usize), > = Some(convolve_horizontal_rgba_4_row_f16::<4>); + #[allow(clippy::type_complexity)] let mut _dispatcher_row: fn(usize, usize, &FilterWeights, &[f16], &mut [f16]) = convolve_horizontal_rgb_native_row_f16::<4>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] @@ -167,13 +169,14 @@ fn convolve_vertical_rgb_native_row_f16( column_handler_floating_point::(bounds, src, dst, src_stride, weight, 8); } -impl<'a> VerticalConvolutionPass for ImageStore<'a, f16, 4> { +impl VerticalConvolutionPass for ImageStore<'_, f16, 4> { fn convolve_vertical( &self, filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, ) { + #[allow(clippy::type_complexity)] let mut _dispatcher: fn(usize, &FilterBounds, &[f16], &mut [f16], usize, &[f32]) = convolve_vertical_rgb_native_row_f16::<4>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] @@ -211,16 +214,18 @@ impl<'a> VerticalConvolutionPass for ImageStore<'a, f16, 4> { } } -impl<'a> HorizontalConvolutionPass for ImageStore<'a, f16, 3> { +impl HorizontalConvolutionPass for ImageStore<'_, f16, 3> { fn convolve_horizontal( &self, filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, ) { + #[allow(clippy::type_complexity)] let mut _dispatcher_4_rows: Option< fn(usize, usize, &FilterWeights, &[f16], usize, &mut [f16], usize), > = Some(convolve_horizontal_rgba_4_row_f16::<3>); + #[allow(clippy::type_complexity)] let mut _dispatcher_row: fn(usize, usize, &FilterWeights, &[f16], &mut [f16]) = convolve_horizontal_rgb_native_row_f16::<3>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] @@ -263,13 +268,14 @@ impl<'a> HorizontalConvolutionPass for ImageStore<'a, f16, 3> { } } -impl<'a> VerticalConvolutionPass for ImageStore<'a, f16, 3> { +impl VerticalConvolutionPass for ImageStore<'_, f16, 3> { fn convolve_vertical( &self, filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, ) { + #[allow(clippy::type_complexity)] let mut _dispatcher: fn(usize, &FilterBounds, &[f16], &mut [f16], usize, &[f32]) = 
convolve_vertical_rgb_native_row_f16::<3>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] @@ -307,13 +313,14 @@ impl<'a> VerticalConvolutionPass for ImageStore<'a, f16, 3> { } } -impl<'a> HorizontalConvolutionPass for ImageStore<'a, f16, 1> { +impl HorizontalConvolutionPass for ImageStore<'_, f16, 1> { fn convolve_horizontal( &self, filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, ) { + #[allow(clippy::type_complexity)] let _dispatcher_4_rows: Option< fn(usize, usize, &FilterWeights, &[f16], usize, &mut [f16], usize), > = Some(convolve_horizontal_rgba_4_row_f16::<1>); @@ -330,13 +337,14 @@ impl<'a> HorizontalConvolutionPass for ImageStore<'a, f16, 1> { } } -impl<'a> VerticalConvolutionPass for ImageStore<'a, f16, 1> { +impl VerticalConvolutionPass for ImageStore<'_, f16, 1> { fn convolve_vertical( &self, filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, ) { + #[allow(clippy::type_complexity)] let mut _dispatcher: fn(usize, &FilterBounds, &[f16], &mut [f16], usize, &[f32]) = convolve_vertical_rgb_native_row_f16::<1>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] diff --git a/src/filter_weights.rs b/src/filter_weights.rs index 6e2823d..b1694b7 100644 --- a/src/filter_weights.rs +++ b/src/filter_weights.rs @@ -109,3 +109,40 @@ impl FilterWeights { ) } } + +pub(crate) trait WeightsConverter { + fn prepare_weights(&self, weights: &FilterWeights) -> FilterWeights; +} + +#[derive(Default)] +pub(crate) struct DefaultWeightsConverter {} + +impl WeightsConverter for DefaultWeightsConverter { + fn prepare_weights(&self, weights: &FilterWeights) -> FilterWeights { + use crate::support::PRECISION; + weights.numerical_approximation_i16::(0) + } +} + +#[derive(Default)] +#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +pub(crate) struct WeightFloat16Converter {} + +#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +impl WeightsConverter for WeightFloat16Converter { + fn prepare_weights(&self, weights: &FilterWeights) -> FilterWeights { + use crate::neon::convert_weights_to_f16; + let converted_weights = convert_weights_to_f16(&weights.weights); + + let new_bounds = weights.bounds.to_vec(); + + FilterWeights::new( + converted_weights, + weights.kernel_size, + weights.kernel_size, + weights.distinct_elements, + weights.coeffs_size, + new_bounds, + ) + } +} diff --git a/src/image_store.rs b/src/image_store.rs index 2f7f490..beff353 100644 --- a/src/image_store.rs +++ b/src/image_store.rs @@ -26,10 +26,7 @@ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
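Of the two converters introduced in filter_weights.rs above, DefaultWeightsConverter keeps the usual fixed-point quantization, while WeightFloat16Converter presumably stores the half-precision bit patterns of the taps in the same i16 slots for the fp16 NEON column kernel (the generic parameters are not visible in this rendering of the patch). A crude stand-in for the fixed-point side; the crate's numerical_approximation_i16 is the authoritative version of this step:

fn quantize_weights_fixed(weights: &[f32], precision: u32) -> Vec<i16> {
    // Nearest-integer quantization to `precision` fractional bits.
    weights
        .iter()
        .map(|&w| (w * (1 << precision) as f32).round() as i16)
        .collect()
}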
*/ -use crate::alpha_check::{ - has_non_constant_cap_alpha_rgba16, has_non_constant_cap_alpha_rgba8, - has_non_constant_cap_alpha_rgba_f32, -}; +use crate::alpha_check::has_non_constant_cap_alpha_rgba_f32; #[cfg(feature = "half")] use crate::alpha_handle_f16::{premultiply_alpha_rgba_f16, unpremultiply_alpha_rgba_f16}; use crate::alpha_handle_f32::{premultiply_alpha_rgba_f32, unpremultiply_alpha_rgba_f32}; @@ -64,7 +61,7 @@ where /// Image height pub height: usize, /// This is private field, currently used only for u16, will be automatically passed from upper func - pub(crate) bit_depth: usize, + pub bit_depth: usize, } #[derive(Debug)] @@ -398,7 +395,39 @@ impl AssociateAlpha for ImageStore<'_, u8, 4> { premultiply_alpha_rgba(dst, src, self.width, self.height, pool); } + #[cfg(not(any( + any(target_arch = "x86_64", target_arch = "x86"), + all(target_arch = "aarch64", target_feature = "neon") + )))] + fn is_alpha_premultiplication_needed(&self) -> bool { + use crate::alpha_check::has_non_constant_cap_alpha_rgba8; + has_non_constant_cap_alpha_rgba8(self.buffer.as_ref(), self.width) + } + + #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] fn is_alpha_premultiplication_needed(&self) -> bool { + use crate::neon::neon_has_non_constant_cap_alpha_rgba8; + neon_has_non_constant_cap_alpha_rgba8(self.buffer.as_ref(), self.width, self.width * 4) + } + + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + fn is_alpha_premultiplication_needed(&self) -> bool { + use crate::alpha_check::has_non_constant_cap_alpha_rgba8; + use crate::avx2::avx_has_non_constant_cap_alpha_rgba8; + use crate::sse::sse_has_non_constant_cap_alpha_rgba8; + if std::arch::is_x86_feature_detected!("avx2") { + return avx_has_non_constant_cap_alpha_rgba8( + self.buffer.as_ref(), + self.width, + self.width * 4, + ); + } else if std::arch::is_x86_feature_detected!("sse4.1") { + return sse_has_non_constant_cap_alpha_rgba8( + self.buffer.as_ref(), + self.width, + self.width * 4, + ); + } has_non_constant_cap_alpha_rgba8(self.buffer.as_ref(), self.width) } } @@ -414,10 +443,42 @@ impl AssociateAlpha for ImageStore<'_, u16, 4> { fn premultiply_alpha(&self, into: &mut ImageStoreMut<'_, u16, 4>, pool: &Option) { let dst = into.buffer.borrow_mut(); let src = self.buffer.as_ref(); - premultiply_alpha_rgba_u16(dst, src, self.width, self.height, self.bit_depth, pool); + premultiply_alpha_rgba_u16(dst, src, self.width, self.height, into.bit_depth, pool); + } + + #[cfg(not(any( + any(target_arch = "x86_64", target_arch = "x86"), + all(target_arch = "aarch64", target_feature = "neon") + )))] + fn is_alpha_premultiplication_needed(&self) -> bool { + use crate::alpha_check::has_non_constant_cap_alpha_rgba16; + has_non_constant_cap_alpha_rgba16(self.buffer.as_ref(), self.width) + } + + #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + fn is_alpha_premultiplication_needed(&self) -> bool { + use crate::neon::neon_has_non_constant_cap_alpha_rgba16; + neon_has_non_constant_cap_alpha_rgba16(self.buffer.as_ref(), self.width, self.width * 4) } + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] fn is_alpha_premultiplication_needed(&self) -> bool { + use crate::alpha_check::has_non_constant_cap_alpha_rgba16; + use crate::avx2::avx_has_non_constant_cap_alpha_rgba16; + use crate::sse::sse_has_non_constant_cap_alpha_rgba16; + if std::arch::is_x86_feature_detected!("avx2") { + return avx_has_non_constant_cap_alpha_rgba16( + self.buffer.as_ref(), + self.width, + self.width * 4, + ); + } else if 
std::arch::is_x86_feature_detected!("sse4.1") { + return sse_has_non_constant_cap_alpha_rgba16( + self.buffer.as_ref(), + self.width, + self.width * 4, + ); + } has_non_constant_cap_alpha_rgba16(self.buffer.as_ref(), self.width) } } @@ -472,3 +533,27 @@ impl UnassociateAlpha for ImageStoreMut<'_, half::f16, 4> { unpremultiply_alpha_rgba_f16(dst, self.width, self.height, pool); } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn image_store_alpha_test_rgba8() { + let image_size = 256usize; + let mut image = vec![0u8; image_size * image_size * 4]; + image[3 + 150 * 4] = 75; + let store = ImageStore::::from_slice(&image, image_size, image_size).unwrap(); + let has_alpha = store.is_alpha_premultiplication_needed(); + assert_eq!(true, has_alpha); + } + + #[test] + fn check_alpha_not_exists_rgba8() { + let image_size = 256usize; + let image = vec![255u8; image_size * image_size * 4]; + let store = ImageStore::::from_slice(&image, image_size, image_size).unwrap(); + let has_alpha = store.is_alpha_premultiplication_needed(); + assert_eq!(false, has_alpha); + } +} diff --git a/src/neon/alpha_u16.rs b/src/neon/alpha_u16.rs index bc7048f..084f989 100644 --- a/src/neon/alpha_u16.rs +++ b/src/neon/alpha_u16.rs @@ -26,7 +26,9 @@ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -use crate::alpha_handle_u16::{premultiply_alpha_rgba_row, unpremultiply_alpha_rgba_row}; +use crate::cpu_features::is_aarch_f16_supported; +use crate::neon::f16_utils::{xvcvtaq_u16_f16, xvcvtq_f16_u16, xvdivq_f16, xvmulq_f16}; +use crate::neon::xreinterpretq_f16_u16; use rayon::iter::{IndexedParallelIterator, ParallelIterator}; use rayon::prelude::{ParallelSlice, ParallelSliceMut}; use rayon::ThreadPool; @@ -47,118 +49,199 @@ unsafe fn neon_div_by_65535_n(v: uint32x4_t) -> uint16x4_t { vqrshrn_n_u32::<16>(vrsraq_n_u32::<16>(v, v)) } -pub fn neon_premultiply_alpha_rgba_row_u16(dst: &mut [u16], src: &[u16], bit_depth: usize) { - assert_ne!(bit_depth, 0, "Something goes wrong!"); - let max_colors = (1 << bit_depth) - 1; +#[inline(always)] +unsafe fn neon_div_by(v: uint32x4_t) -> uint16x4_t { + match BIT_DEPTH { + 10 => neon_div_by_1023_n(v), + 12 => neon_div_by_4095_n(v), + 16 => neon_div_by_65535_n(v), + _ => neon_div_by_1023_n(v), + } +} - let v_max_colors_scale = unsafe { vdupq_n_f32((1. 
/ max_colors as f64) as f32) }; +trait NeonPremultiplyExecutor { + unsafe fn premultiply(&self, dst: &mut [u16], src: &[u16], bit_depth: usize); +} - let mut rem = dst; - let mut src_rem = src; +#[derive(Default)] +struct NeonPremultiplyExecutorDefault {} - unsafe { - if bit_depth == 10 { - for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { - let pixel = vld4q_u16(src.as_ptr()); +impl NeonPremultiplyExecutor for NeonPremultiplyExecutorDefault { + unsafe fn premultiply(&self, dst: &mut [u16], src: &[u16], bit_depth: usize) { + assert_ne!(bit_depth, 0, "Something goes wrong!"); + assert!((1..=16).contains(&bit_depth)); - let low_a = vget_low_u16(pixel.3); + let mut rem = dst; + let mut src_rem = src; + for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { + let pixel = vld4q_u16(src.as_ptr()); + + let low_a = vget_low_u16(pixel.3); + + let new_r = vcombine_u16( + neon_div_by::(vmull_u16(vget_low_u16(pixel.0), low_a)), + neon_div_by::(vmull_high_u16(pixel.0, pixel.3)), + ); + + let new_g = vcombine_u16( + neon_div_by::(vmull_u16(vget_low_u16(pixel.1), low_a)), + neon_div_by::(vmull_high_u16(pixel.1, pixel.3)), + ); - let new_r = vcombine_u16( - neon_div_by_1023_n(vmull_u16(vget_low_u16(pixel.0), low_a)), - neon_div_by_1023_n(vmull_high_u16(pixel.0, pixel.3)), - ); + let new_b = vcombine_u16( + neon_div_by::(vmull_u16(vget_low_u16(pixel.2), low_a)), + neon_div_by::(vmull_high_u16(pixel.2, pixel.3)), + ); - let new_g = vcombine_u16( - neon_div_by_1023_n(vmull_u16(vget_low_u16(pixel.1), low_a)), - neon_div_by_1023_n(vmull_high_u16(pixel.1, pixel.3)), - ); + let new_px = uint16x8x4_t(new_r, new_g, new_b, pixel.3); - let new_b = vcombine_u16( - neon_div_by_1023_n(vmull_u16(vget_low_u16(pixel.2), low_a)), - neon_div_by_1023_n(vmull_high_u16(pixel.2, pixel.3)), - ); + vst4q_u16(dst.as_mut_ptr(), new_px); + } - let new_px = uint16x8x4_t(new_r, new_g, new_b, pixel.3); + rem = rem.chunks_exact_mut(8 * 4).into_remainder(); + src_rem = src_rem.chunks_exact(8 * 4).remainder(); - vst4q_u16(dst.as_mut_ptr(), new_px); - } - } else if bit_depth == 12 { - for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { - let pixel = vld4q_u16(src.as_ptr()); + if !rem.is_empty() { + assert!(src_rem.len() < 8 * 4); + assert!(rem.len() < 8 * 4); + let mut buffer: [u16; 8 * 4] = [0u16; 8 * 4]; + std::ptr::copy_nonoverlapping(rem.as_ptr(), buffer.as_mut_ptr(), src_rem.len()); - let low_a = vget_low_u16(pixel.3); + let pixel = vld4q_u16(buffer.as_ptr()); - let new_r = vcombine_u16( - neon_div_by_4095_n(vmull_u16(vget_low_u16(pixel.0), low_a)), - neon_div_by_4095_n(vmull_high_u16(pixel.0, pixel.3)), - ); + let low_a = vget_low_u16(pixel.3); - let new_g = vcombine_u16( - neon_div_by_4095_n(vmull_u16(vget_low_u16(pixel.1), low_a)), - neon_div_by_4095_n(vmull_high_u16(pixel.1, pixel.3)), - ); + let new_r = vcombine_u16( + neon_div_by::(vmull_u16(vget_low_u16(pixel.0), low_a)), + neon_div_by::(vmull_high_u16(pixel.0, pixel.3)), + ); - let new_b = vcombine_u16( - neon_div_by_4095_n(vmull_u16(vget_low_u16(pixel.2), low_a)), - neon_div_by_4095_n(vmull_high_u16(pixel.2, pixel.3)), - ); + let new_g = vcombine_u16( + neon_div_by::(vmull_u16(vget_low_u16(pixel.1), low_a)), + neon_div_by::(vmull_high_u16(pixel.1, pixel.3)), + ); - let new_px = uint16x8x4_t(new_r, new_g, new_b, pixel.3); + let new_b = vcombine_u16( + neon_div_by::(vmull_u16(vget_low_u16(pixel.2), low_a)), + neon_div_by::(vmull_high_u16(pixel.2, pixel.3)), + ); - vst4q_u16(dst.as_mut_ptr(), 
new_px); - } - } else if bit_depth == 16 { - for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { - let pixel = vld4q_u16(src.as_ptr()); + let new_px = uint16x8x4_t(new_r, new_g, new_b, pixel.3); - let low_a = vget_low_u16(pixel.3); + vst4q_u16(buffer.as_mut_ptr(), new_px); - let new_r = vcombine_u16( - neon_div_by_65535_n(vmull_u16(vget_low_u16(pixel.0), low_a)), - neon_div_by_65535_n(vmull_high_u16(pixel.0, pixel.3)), - ); + std::ptr::copy_nonoverlapping(buffer.as_ptr(), rem.as_mut_ptr(), rem.len()); + } + } +} - let new_g = vcombine_u16( - neon_div_by_65535_n(vmull_u16(vget_low_u16(pixel.1), low_a)), - neon_div_by_65535_n(vmull_high_u16(pixel.1, pixel.3)), - ); +#[derive(Default)] +struct NeonPremultiplyExecutorAnyBitDepth {} - let new_b = vcombine_u16( - neon_div_by_65535_n(vmull_u16(vget_low_u16(pixel.2), low_a)), - neon_div_by_65535_n(vmull_high_u16(pixel.2, pixel.3)), - ); +impl NeonPremultiplyExecutor for NeonPremultiplyExecutorAnyBitDepth { + unsafe fn premultiply(&self, dst: &mut [u16], src: &[u16], bit_depth: usize) { + assert_ne!(bit_depth, 0, "Something goes wrong!"); + assert!((1..=16).contains(&bit_depth)); + let max_colors = (1 << bit_depth) - 1; + let mut rem = dst; + let mut src_rem = src; - let new_px = uint16x8x4_t(new_r, new_g, new_b, pixel.3); + let v_max_colors_scale = vdupq_n_f32((1. / max_colors as f64) as f32); - vst4q_u16(dst.as_mut_ptr(), new_px); - } - } else { - for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { - let pixel = vld4q_u16(src.as_ptr()); + for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { + let pixel = vld4q_u16(src.as_ptr()); - let low_a = vmovl_u16(vget_low_u16(pixel.3)); - let high_a = vmovl_high_u16(pixel.3); + let low_a = vmovl_u16(vget_low_u16(pixel.3)); + let high_a = vmovl_high_u16(pixel.3); - let low_a = vmulq_f32(vcvtq_f32_u32(low_a), v_max_colors_scale); - let hi_a = vmulq_f32(vcvtq_f32_u32(high_a), v_max_colors_scale); + let low_a = vmulq_f32(vcvtq_f32_u32(low_a), v_max_colors_scale); + let hi_a = vmulq_f32(vcvtq_f32_u32(high_a), v_max_colors_scale); - let new_r = v_scale_by_alpha(pixel.0, low_a, hi_a); + let new_r = v_scale_by_alpha(pixel.0, low_a, hi_a); - let new_g = v_scale_by_alpha(pixel.1, low_a, hi_a); + let new_g = v_scale_by_alpha(pixel.1, low_a, hi_a); - let new_b = v_scale_by_alpha(pixel.2, low_a, hi_a); + let new_b = v_scale_by_alpha(pixel.2, low_a, hi_a); - let new_px = uint16x8x4_t(new_r, new_g, new_b, pixel.3); + let new_px = uint16x8x4_t(new_r, new_g, new_b, pixel.3); - vst4q_u16(dst.as_mut_ptr(), new_px); - } + vst4q_u16(dst.as_mut_ptr(), new_px); } rem = rem.chunks_exact_mut(8 * 4).into_remainder(); src_rem = src_rem.chunks_exact(8 * 4).remainder(); + + if !rem.is_empty() { + assert!(src_rem.len() < 8 * 4); + assert!(rem.len() < 8 * 4); + let mut buffer: [u16; 8 * 4] = [0u16; 8 * 4]; + std::ptr::copy_nonoverlapping(rem.as_ptr(), buffer.as_mut_ptr(), src_rem.len()); + + let pixel = vld4q_u16(buffer.as_ptr()); + + let low_a = vmovl_u16(vget_low_u16(pixel.3)); + let high_a = vmovl_high_u16(pixel.3); + + let low_a = vmulq_f32(vcvtq_f32_u32(low_a), v_max_colors_scale); + let hi_a = vmulq_f32(vcvtq_f32_u32(high_a), v_max_colors_scale); + + let new_r = v_scale_by_alpha(pixel.0, low_a, hi_a); + + let new_g = v_scale_by_alpha(pixel.1, low_a, hi_a); + + let new_b = v_scale_by_alpha(pixel.2, low_a, hi_a); + + let new_px = uint16x8x4_t(new_r, new_g, new_b, pixel.3); + + vst4q_u16(buffer.as_mut_ptr(), new_px); + + 
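The premultiply executors above compute the same product; the const-generic NeonPremultiplyExecutorDefault only specializes the division by the bit-depth maximum so the shift-based neon_div_by_* kernels are chosen at compile time, while the any-bit-depth fallback scales through f32. The plain scalar form they approximate, assuming color and alpha share one bit depth (the helper name is illustrative):

fn premultiply_u16(color: u16, alpha: u16, bit_depth: u32) -> u16 {
    let max = (1u32 << bit_depth) - 1;
    // color * alpha / max, rounded to nearest; the NEON kernels reach this
    // result (up to rounding) with vrsraq/vqrshrn shifts instead of dividing.
    ((color as u32 * alpha as u32 + max / 2) / max) as u16
}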
std::ptr::copy_nonoverlapping(buffer.as_ptr(), rem.as_mut_ptr(), rem.len()); + } + } +} + +fn neon_premultiply_alpha_rgba_row_u16(dst: &mut [u16], src: &[u16], bit_depth: usize) { + assert_ne!(bit_depth, 0, "Something goes wrong!"); + + if bit_depth == 10 { + neon_pa_dispatch( + dst, + src, + bit_depth, + NeonPremultiplyExecutorDefault::<10>::default(), + ) + } else if bit_depth == 12 { + neon_pa_dispatch( + dst, + src, + bit_depth, + NeonPremultiplyExecutorDefault::<12>::default(), + ) + } else if bit_depth == 16 { + neon_pa_dispatch( + dst, + src, + bit_depth, + NeonPremultiplyExecutorDefault::<16>::default(), + ) + } else { + neon_pa_dispatch( + dst, + src, + bit_depth, + NeonPremultiplyExecutorAnyBitDepth::default(), + ) } +} - premultiply_alpha_rgba_row(rem, src_rem, max_colors); +#[inline] +fn neon_pa_dispatch( + dst: &mut [u16], + src: &[u16], + bit_depth: usize, + dispatch: impl NeonPremultiplyExecutor, +) { + unsafe { dispatch.premultiply(dst, src, bit_depth) } } pub(crate) fn neon_premultiply_alpha_rgba_u16( @@ -204,12 +287,19 @@ unsafe fn v_scale_by_alpha( vcombine_u16(vmovn_u32(new_ll), vmovn_u32(new_lh)) } -fn neon_unpremultiply_alpha_rgba_row_u16(in_place: &mut [u16], bit_depth: usize) { - let max_colors = (1 << bit_depth) - 1; +trait DisassociateAlpha { + unsafe fn disassociate(&self, in_place: &mut [u16], bit_depth: usize); +} - let mut rem = in_place; +#[derive(Default)] +struct NeonDisassociateAlpha {} + +impl DisassociateAlpha for NeonDisassociateAlpha { + unsafe fn disassociate(&self, in_place: &mut [u16], bit_depth: usize) { + let max_colors = (1 << bit_depth) - 1; + + let mut rem = in_place; - unsafe { let v_max_colors_f = vdupq_n_f32(max_colors as f32); let ones = vdupq_n_f32(1.); for dst in rem.chunks_exact_mut(8 * 4) { @@ -245,11 +335,165 @@ fn neon_unpremultiply_alpha_rgba_row_u16(in_place: &mut [u16], bit_depth: usize) vst4q_u16(dst.as_mut_ptr(), new_px); } + rem = rem.chunks_exact_mut(8 * 4).into_remainder(); + + if !rem.is_empty() { + assert!(rem.len() < 8 * 4); + let mut buffer: [u16; 8 * 4] = [0u16; 8 * 4]; + std::ptr::copy_nonoverlapping(rem.as_ptr(), buffer.as_mut_ptr(), rem.len()); + + let pixel = vld4q_u16(buffer.as_ptr()); + + let is_alpha_zero_mask = vceqzq_u16(pixel.3); + + let low_a = vmovl_u16(vget_low_u16(pixel.3)); + let high_a = vmovl_high_u16(pixel.3); + + let low_a = vmulq_f32(vdivq_f32(ones, vcvtq_f32_u32(low_a)), v_max_colors_f); + let hi_a = vmulq_f32(vdivq_f32(ones, vcvtq_f32_u32(high_a)), v_max_colors_f); + + let new_r = vbslq_u16( + is_alpha_zero_mask, + pixel.0, + v_scale_by_alpha(pixel.0, low_a, hi_a), + ); + + let new_g = vbslq_u16( + is_alpha_zero_mask, + pixel.1, + v_scale_by_alpha(pixel.1, low_a, hi_a), + ); + + let new_b = vbslq_u16( + is_alpha_zero_mask, + pixel.2, + v_scale_by_alpha(pixel.2, low_a, hi_a), + ); + + let new_px = uint16x8x4_t(new_r, new_g, new_b, pixel.3); + + vst4q_u16(buffer.as_mut_ptr(), new_px); + + std::ptr::copy_nonoverlapping(buffer.as_ptr(), rem.as_mut_ptr(), rem.len()); + } + } +} + +#[derive(Default)] +struct NeonDisassociateAlphaFloat16 {} + +impl DisassociateAlpha for NeonDisassociateAlphaFloat16 { + #[target_feature(enable = "fp16")] + unsafe fn disassociate(&self, in_place: &mut [u16], bit_depth: usize) { + let max_colors = (1 << bit_depth) - 1; + + let mut rem = in_place; + + let v_max_colors_f = xvcvtq_f16_u16(vdupq_n_u16(max_colors as u16)); + + let ones = xreinterpretq_f16_u16(vdupq_n_u16(15360)); // 15360 = 1f16 + + for dst in rem.chunks_exact_mut(8 * 4) { + let pixel = vld4q_u16(dst.as_ptr()); + + 
let is_alpha_zero_mask = vceqzq_u16(pixel.3); + + let a_vals = xvdivq_f16(ones, xvcvtq_f16_u16(pixel.3)); + + let new_r = vbslq_u16( + is_alpha_zero_mask, + pixel.0, + xvcvtaq_u16_f16(xvmulq_f16( + xvmulq_f16(xvcvtq_f16_u16(pixel.0), a_vals), + v_max_colors_f, + )), + ); + + let new_g = vbslq_u16( + is_alpha_zero_mask, + pixel.1, + xvcvtaq_u16_f16(xvmulq_f16( + xvmulq_f16(xvcvtq_f16_u16(pixel.1), a_vals), + v_max_colors_f, + )), + ); + + let new_b = vbslq_u16( + is_alpha_zero_mask, + pixel.2, + xvcvtaq_u16_f16(xvmulq_f16( + xvmulq_f16(xvcvtq_f16_u16(pixel.2), a_vals), + v_max_colors_f, + )), + ); + + let new_px = uint16x8x4_t(new_r, new_g, new_b, pixel.3); + + vst4q_u16(dst.as_mut_ptr(), new_px); + } rem = rem.chunks_exact_mut(8 * 4).into_remainder(); + + if !rem.is_empty() { + assert!(rem.len() < 8 * 4); + let mut buffer: [u16; 8 * 4] = [0u16; 8 * 4]; + std::ptr::copy_nonoverlapping(rem.as_ptr(), buffer.as_mut_ptr(), rem.len()); + + let pixel = vld4q_u16(buffer.as_ptr()); + + let is_alpha_zero_mask = vceqzq_u16(pixel.3); + + let a_vals = xvdivq_f16(ones, xvcvtq_f16_u16(pixel.3)); + + let new_r = vbslq_u16( + is_alpha_zero_mask, + pixel.0, + xvcvtaq_u16_f16(xvmulq_f16( + xvmulq_f16(xvcvtq_f16_u16(pixel.0), a_vals), + v_max_colors_f, + )), + ); + + let new_g = vbslq_u16( + is_alpha_zero_mask, + pixel.1, + xvcvtaq_u16_f16(xvmulq_f16( + xvmulq_f16(xvcvtq_f16_u16(pixel.1), a_vals), + v_max_colors_f, + )), + ); + + let new_b = vbslq_u16( + is_alpha_zero_mask, + pixel.2, + xvcvtaq_u16_f16(xvmulq_f16( + xvmulq_f16(xvcvtq_f16_u16(pixel.2), a_vals), + v_max_colors_f, + )), + ); + + let new_px = uint16x8x4_t(new_r, new_g, new_b, pixel.3); + + vst4q_u16(buffer.as_mut_ptr(), new_px); + + std::ptr::copy_nonoverlapping(buffer.as_ptr(), rem.as_mut_ptr(), rem.len()); + } } +} + +#[inline] +unsafe fn neon_un_row(in_place: &mut [u16], bit_depth: usize, handler: impl DisassociateAlpha) { + handler.disassociate(in_place, bit_depth); +} - unpremultiply_alpha_rgba_row(rem, max_colors); +fn neon_unpremultiply_alpha_rgba_row_u16(in_place: &mut [u16], bit_depth: usize) { + unsafe { + if (bit_depth == 10 || bit_depth == 12) && is_aarch_f16_supported() { + neon_un_row(in_place, bit_depth, NeonDisassociateAlphaFloat16::default()); + } else { + neon_un_row(in_place, bit_depth, NeonDisassociateAlpha::default()); + } + } } pub(crate) fn neon_unpremultiply_alpha_rgba_u16( diff --git a/src/neon/alpha_u8.rs b/src/neon/alpha_u8.rs index c296a63..553d992 100644 --- a/src/neon/alpha_u8.rs +++ b/src/neon/alpha_u8.rs @@ -27,7 +27,10 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -use crate::alpha_handle_u8::{premultiply_alpha_rgba_row_impl, unpremultiply_alpha_rgba_row_impl}; +use crate::alpha_handle_u8::premultiply_alpha_rgba_row_impl; +use crate::cpu_features::is_aarch_f16_supported; +use crate::neon::f16_utils::{xvcvtaq_u16_f16, xvcvtq_f16_u16, xvmulq_f16, xvrecpeq_f16}; +use crate::neon::xreinterpretq_f16_u16; use rayon::iter::{IndexedParallelIterator, ParallelIterator}; use rayon::prelude::{ParallelSlice, ParallelSliceMut}; use rayon::ThreadPool; @@ -48,33 +51,6 @@ macro_rules! premultiply_vec { }}; } -macro_rules! 
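The fp16 disassociate path above is only selected for 10- and 12-bit buffers when is_aarch_f16_supported() reports the extension; it swaps the 32-bit float divide for a half-precision reciprocal, which is faster at the cost of a little precision, and 16-bit data stays on the f32 path, not least because 65535 cannot be represented exactly in f16. Both variants approximate this scalar operation, where fully transparent pixels are passed through unchanged to match the vbslq_u16 masking (the helper name is illustrative):

fn unpremultiply_u16(color: u16, alpha: u16, bit_depth: u32) -> u16 {
    if alpha == 0 {
        return color;
    }
    let max = (1u32 << bit_depth) - 1;
    ((color as u32 * max + alpha as u32 / 2) / alpha as u32).min(max) as u16
}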
unpremultiply_vec { - ($v: expr, $a_values: expr) => {{ - let scale = vdupq_n_u8(255); - let hi = vmull_high_u8($v, scale); - let lo = vmull_u8(vget_low_u8($v), vget_low_u8(scale)); - let lo_lo = vcvtq_f32_u32(vmovl_u16(vget_low_u16(lo))); - let lo_hi = vcvtq_f32_u32(vmovl_high_u16(lo)); - let hi_lo = vcvtq_f32_u32(vmovl_u16(vget_low_u16(hi))); - let hi_hi = vcvtq_f32_u32(vmovl_high_u16(hi)); - let zero_mask = vmvnq_u8(vceqzq_u8($a_values)); - let a_hi = vmovl_high_u8($a_values); - let a_lo = vmovl_u8(vget_low_u8($a_values)); - let a_lo_lo = vrecpeq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_lo)))); - let a_lo_hi = vrecpeq_f32(vcvtq_f32_u32(vmovl_high_u16(a_lo))); - let a_hi_lo = vrecpeq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_hi)))); - let a_hi_ho = vrecpeq_f32(vcvtq_f32_u32(vmovl_high_u16(a_hi))); - - let lo_lo = vcvtaq_u32_f32(vmulq_f32(lo_lo, a_lo_lo)); - let lo_hi = vcvtaq_u32_f32(vmulq_f32(lo_hi, a_lo_hi)); - let hi_lo = vcvtaq_u32_f32(vmulq_f32(hi_lo, a_hi_lo)); - let hi_hi = vcvtaq_u32_f32(vmulq_f32(hi_hi, a_hi_ho)); - let lo = vcombine_u16(vmovn_u32(lo_lo), vmovn_u32(lo_hi)); - let hi = vcombine_u16(vmovn_u32(hi_lo), vmovn_u32(hi_hi)); - vandq_u8(vcombine_u8(vqmovn_u16(lo), vqmovn_u16(hi)), zero_mask) - }}; -} - unsafe fn neon_premultiply_alpha_rgba_impl_row(dst: &mut [u8], src: &[u8]) { let mut rem = dst; let mut src_rem = src; @@ -158,24 +134,192 @@ pub(crate) fn neon_premultiply_alpha_rgba( } } -unsafe fn neon_unpremultiply_alpha_rgba_impl_row(in_place: &mut [u8]) { - let mut rem = in_place; +trait DisassociateAlpha { + unsafe fn disassociate(&self, in_place: &mut [u8]); +} + +#[derive(Default)] +struct NeonDisassociateAlpha {} + +impl NeonDisassociateAlpha { + #[inline(always)] + unsafe fn unpremultiply_vec(v: uint8x16_t, a_values: uint8x16_t) -> uint8x16_t { + let scale = vdupq_n_u8(255); + let hi = vmull_high_u8(v, scale); + let lo = vmull_u8(vget_low_u8(v), vget_low_u8(scale)); + let lo_lo = vcvtq_f32_u32(vmovl_u16(vget_low_u16(lo))); + let lo_hi = vcvtq_f32_u32(vmovl_high_u16(lo)); + let hi_lo = vcvtq_f32_u32(vmovl_u16(vget_low_u16(hi))); + let hi_hi = vcvtq_f32_u32(vmovl_high_u16(hi)); + let zero_mask = vmvnq_u8(vceqzq_u8(a_values)); + let a_hi = vmovl_high_u8(a_values); + let a_lo = vmovl_u8(vget_low_u8(a_values)); + let a_lo_lo = vrecpeq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_lo)))); + let a_lo_hi = vrecpeq_f32(vcvtq_f32_u32(vmovl_high_u16(a_lo))); + let a_hi_lo = vrecpeq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_hi)))); + let a_hi_ho = vrecpeq_f32(vcvtq_f32_u32(vmovl_high_u16(a_hi))); + + let lo_lo = vcvtaq_u32_f32(vmulq_f32(lo_lo, a_lo_lo)); + let lo_hi = vcvtaq_u32_f32(vmulq_f32(lo_hi, a_lo_hi)); + let hi_lo = vcvtaq_u32_f32(vmulq_f32(hi_lo, a_hi_lo)); + let hi_hi = vcvtaq_u32_f32(vmulq_f32(hi_hi, a_hi_ho)); + let lo = vcombine_u16(vmovn_u32(lo_lo), vmovn_u32(lo_hi)); + let hi = vcombine_u16(vmovn_u32(hi_lo), vmovn_u32(hi_hi)); + vandq_u8(vcombine_u8(vqmovn_u16(lo), vqmovn_u16(hi)), zero_mask) + } + + #[inline(always)] + unsafe fn unpremultiply_vech(v: uint8x8_t, a_values: uint8x8_t) -> uint8x8_t { + let scale = vdupq_n_u8(255); + let lo = vmull_u8(v, vget_low_u8(scale)); + let lo_lo = vcvtq_f32_u32(vmovl_u16(vget_low_u16(lo))); + let lo_hi = vcvtq_f32_u32(vmovl_high_u16(lo)); + let zero_mask = vmvn_u8(vceqz_u8(a_values)); + let a_lo = vmovl_u8(a_values); + let a_lo_lo = vrecpeq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_lo)))); + let a_lo_hi = vrecpeq_f32(vcvtq_f32_u32(vmovl_high_u16(a_lo))); + + let lo_lo = vcvtaq_u32_f32(vmulq_f32(lo_lo, a_lo_lo)); + let 
lo_hi = vcvtaq_u32_f32(vmulq_f32(lo_hi, a_lo_hi)); + let lo = vcombine_u16(vmovn_u32(lo_lo), vmovn_u32(lo_hi)); + vand_u8(vqmovn_u16(lo), zero_mask) + } +} + +impl DisassociateAlpha for NeonDisassociateAlpha { + unsafe fn disassociate(&self, in_place: &mut [u8]) { + let mut rem = in_place; - unsafe { for dst in rem.chunks_exact_mut(16 * 4) { let src_ptr = dst.as_ptr(); let mut pixel = vld4q_u8(src_ptr); - pixel.0 = unpremultiply_vec!(pixel.0, pixel.3); - pixel.1 = unpremultiply_vec!(pixel.1, pixel.3); - pixel.2 = unpremultiply_vec!(pixel.2, pixel.3); + pixel.0 = Self::unpremultiply_vec(pixel.0, pixel.3); + pixel.1 = Self::unpremultiply_vec(pixel.1, pixel.3); + pixel.2 = Self::unpremultiply_vec(pixel.2, pixel.3); + let dst_ptr = dst.as_mut_ptr(); + vst4q_u8(dst_ptr, pixel); + } + + rem = rem.chunks_exact_mut(16 * 4).into_remainder(); + + for dst in rem.chunks_exact_mut(8 * 4) { + let src_ptr = dst.as_ptr(); + let mut pixel = vld4_u8(src_ptr); + pixel.0 = Self::unpremultiply_vech(pixel.0, pixel.3); + pixel.1 = Self::unpremultiply_vech(pixel.1, pixel.3); + pixel.2 = Self::unpremultiply_vech(pixel.2, pixel.3); + vst4_u8(dst.as_mut_ptr(), pixel); + } + + rem = rem.chunks_exact_mut(8 * 4).into_remainder(); + + if !rem.is_empty() { + assert!(rem.len() < 8 * 4); + let mut buffer: [u8; 8 * 4] = [0u8; 8 * 4]; + std::ptr::copy_nonoverlapping(rem.as_ptr(), buffer.as_mut_ptr(), rem.len()); + + let mut pixel = vld4_u8(buffer.as_ptr()); + pixel.0 = Self::unpremultiply_vech(pixel.0, pixel.3); + pixel.1 = Self::unpremultiply_vech(pixel.1, pixel.3); + pixel.2 = Self::unpremultiply_vech(pixel.2, pixel.3); + vst4_u8(buffer.as_mut_ptr(), pixel); + + std::ptr::copy_nonoverlapping(buffer.as_ptr(), rem.as_mut_ptr(), rem.len()); + } + } +} + +#[derive(Default)] +struct NeonDisassociateAlphaFloat16 {} + +impl NeonDisassociateAlphaFloat16 { + #[inline] + #[target_feature(enable = "fp16")] + /// Float16 feature un pre-multiplication, it's faster with a little slower precision + unsafe fn unpremultiply_vec_f16(v: uint8x16_t, a_values: uint8x16_t) -> uint8x16_t { + // 23544 = 255f16 + let v_scale = xreinterpretq_f16_u16(vdupq_n_u16(23544)); + + let lo = xvmulq_f16(xvcvtq_f16_u16(vmovl_u8(vget_low_u8(v))), v_scale); + let hi = xvmulq_f16(xvcvtq_f16_u16(vmovl_high_u8(v)), v_scale); + + let lo_a = xvrecpeq_f16(xvcvtq_f16_u16(vmovl_u8(vget_low_u8(a_values)))); + let hi_a = xvrecpeq_f16(xvcvtq_f16_u16(vmovl_high_u8(a_values))); + + let zero_mask = vmvnq_u8(vceqzq_u8(a_values)); + + let lo = xvcvtaq_u16_f16(xvmulq_f16(lo, lo_a)); + let hi = xvcvtaq_u16_f16(xvmulq_f16(hi, hi_a)); + vandq_u8(vcombine_u8(vqmovn_u16(lo), vqmovn_u16(hi)), zero_mask) + } + + #[inline] + #[target_feature(enable = "fp16")] + /// Float16 feature un pre-multiplication, it's faster with a little slower precision + unsafe fn unpremultiply_vec_f16h(v: uint8x8_t, a_values: uint8x8_t) -> uint8x8_t { + // 23544 = 255f16 + let v_scale = xreinterpretq_f16_u16(vdupq_n_u16(23544)); + let lo = xvmulq_f16(xvcvtq_f16_u16(vmovl_u8(v)), v_scale); + let lo_a = xvrecpeq_f16(xvcvtq_f16_u16(vmovl_u8(a_values))); + let zero_mask = vmvn_u8(vceqz_u8(a_values)); + let lo = xvcvtaq_u16_f16(xvmulq_f16(lo, lo_a)); + vand_u8(vqmovn_u16(lo), zero_mask) + } +} + +impl DisassociateAlpha for NeonDisassociateAlphaFloat16 { + #[target_feature(enable = "fp16")] + unsafe fn disassociate(&self, in_place: &mut [u8]) { + let mut rem = in_place; + + for dst in rem.chunks_exact_mut(16 * 4) { + let mut pixel = vld4q_u8(dst.as_ptr()); + pixel.0 = Self::unpremultiply_vec_f16(pixel.0, pixel.3); 
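// Illustrative note, not part of this patch: unpremultiply_vec_f16 above widens each u8 channel
// to f16 and multiplies by 255.0; the 23544 literal is simply the bit pattern of 255.0 in binary16
// (0x5BF8, e.g. half::f16::from_f32(255.0).to_bits() == 23544 with the optional `half` crate).
// It then multiplies by a reciprocal estimate of alpha (FRECPE) instead of performing a true
// divide, and finally masks zero-alpha lanes to zero. That is the speed versus precision
// trade-off its doc comment refers to.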
+ pixel.1 = Self::unpremultiply_vec_f16(pixel.1, pixel.3); + pixel.2 = Self::unpremultiply_vec_f16(pixel.2, pixel.3); let dst_ptr = dst.as_mut_ptr(); vst4q_u8(dst_ptr, pixel); } rem = rem.chunks_exact_mut(16 * 4).into_remainder(); + + for dst in rem.chunks_exact_mut(8 * 4) { + let mut pixel = vld4_u8(dst.as_ptr()); + pixel.0 = Self::unpremultiply_vec_f16h(pixel.0, pixel.3); + pixel.1 = Self::unpremultiply_vec_f16h(pixel.1, pixel.3); + pixel.2 = Self::unpremultiply_vec_f16h(pixel.2, pixel.3); + let dst_ptr = dst.as_mut_ptr(); + vst4_u8(dst_ptr, pixel); + } + + rem = rem.chunks_exact_mut(8 * 4).into_remainder(); + + if !rem.is_empty() { + assert!(rem.len() < 8 * 4); + let mut buffer: [u8; 8 * 4] = [0u8; 8 * 4]; + std::ptr::copy_nonoverlapping(rem.as_ptr(), buffer.as_mut_ptr(), rem.len()); + + let mut pixel = vld4_u8(buffer.as_ptr()); + pixel.0 = Self::unpremultiply_vec_f16h(pixel.0, pixel.3); + pixel.1 = Self::unpremultiply_vec_f16h(pixel.1, pixel.3); + pixel.2 = Self::unpremultiply_vec_f16h(pixel.2, pixel.3); + vst4_u8(buffer.as_mut_ptr(), pixel); + + std::ptr::copy_nonoverlapping(buffer.as_ptr(), rem.as_mut_ptr(), rem.len()); + } } +} + +unsafe fn neon_dis_dispatch(in_place: &mut [u8], handler: impl DisassociateAlpha) { + handler.disassociate(in_place); +} - unpremultiply_alpha_rgba_row_impl(rem); +unsafe fn neon_unpremultiply_alpha_rgba_impl_row(in_place: &mut [u8]) { + if is_aarch_f16_supported() { + neon_dis_dispatch(in_place, NeonDisassociateAlphaFloat16::default()); + } else { + neon_dis_dispatch(in_place, NeonDisassociateAlpha::default()); + } } pub(crate) fn neon_unpremultiply_alpha_rgba( diff --git a/src/neon/check_alpha.rs b/src/neon/check_alpha.rs new file mode 100644 index 0000000..a2944ff --- /dev/null +++ b/src/neon/check_alpha.rs @@ -0,0 +1,172 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +use std::arch::aarch64::*; + +/// Checks if image has constant alpha by xor rows +pub(crate) fn neon_has_non_constant_cap_alpha_rgba8( + store: &[u8], + width: usize, + stride: usize, +) -> bool { + unsafe { + if store.is_empty() { + return true; + } + + let first_alpha = store[3]; + + let v_first_alpha = vdupq_n_u8(first_alpha); + + for row in store.chunks_exact(stride) { + let row = &row[0..width * 4]; + let mut sums = vdupq_n_u32(0); + for chunk in row.chunks_exact(16 * 4) { + let loaded = vld4q_u8(chunk.as_ptr()); + let blend_result = veorq_u8(loaded.3, v_first_alpha); + let blend32 = vpaddlq_u16(vpaddlq_u8(blend_result)); + sums = vaddq_u32(sums, blend32); + } + + let row = row.chunks_exact(16 * 4).remainder(); + + for chunk in row.chunks_exact(8 * 4) { + let loaded = vld4_u8(chunk.as_ptr()); + let blend_result = veor_u8(loaded.3, vget_low_u8(v_first_alpha)); + let blend32 = vpaddl_u16(vpaddl_u8(blend_result)); + sums = vaddq_u32(sums, vcombine_u32(blend32, blend32)); + } + + let row = row.chunks_exact(8 * 4).remainder(); + + let mut h_sum = vaddvq_u32(sums); + + for chunk in row.chunks_exact(4) { + h_sum += chunk[3] as u32 ^ first_alpha as u32; + } + + if h_sum != 0 { + return true; + } + } + + false + } +} + +/// Checks if image has constant alpha by xor rows for image 16bits +pub(crate) fn neon_has_non_constant_cap_alpha_rgba16( + store: &[u16], + width: usize, + stride: usize, +) -> bool { + unsafe { + if store.is_empty() { + return true; + } + + let first_alpha = store[3]; + let def_alpha = vdupq_n_u16(first_alpha); + + for row in store.chunks_exact(stride) { + let row = &row[0..width * 4]; + let mut sums = vdupq_n_u32(0); + for chunk in row.chunks_exact(8 * 4) { + let r0 = vld4q_u16(chunk.as_ptr()); + + let pxor = veorq_u16(r0.3, def_alpha); + sums = vaddq_u32(sums, vpaddlq_u16(pxor)); + } + + let row = row.chunks_exact(8 * 4).remainder(); + + for chunk in row.chunks_exact(4 * 4) { + let r0 = vld4_u16(chunk.as_ptr()); + + let pxor = veor_u16(r0.3, vget_low_u16(def_alpha)); + let pw = vpaddl_u16(pxor); + sums = vaddq_u32(sums, vcombine_u32(pw, pw)); + } + + let row = row.chunks_exact(4 * 4).remainder(); + + let mut h_sum = vaddvq_u32(sums); + + for chunk in row.chunks_exact(4) { + h_sum += chunk[3] as u32 ^ first_alpha as u32; + } + + if h_sum != 0 { + return true; + } + } + false + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn check_alpha_exists_rgba8() { + let image_size = 256usize; + let mut image = vec![0u8; image_size * image_size * 4]; + image[3 + 150 * 4] = 75; + let has_alpha = neon_has_non_constant_cap_alpha_rgba8(&image, image_size, image_size * 4); + assert_eq!(true, has_alpha); + } + + #[test] + fn check_alpha_not_exists_rgba8() { + let image_size = 256usize; + let image = vec![255u8; image_size * image_size * 4]; + let has_alpha = neon_has_non_constant_cap_alpha_rgba8(&image, image_size, image_size * 4); + assert_eq!(false, has_alpha); + } + + #[test] + fn check_alpha_not_exists_rgba16() { + let image_size = 256usize; + let image = vec![255u16; image_size * image_size * 4]; + let has_alpha = neon_has_non_constant_cap_alpha_rgba16(&image, image_size, image_size * 4); + assert_eq!(false, has_alpha); + } + + #[test] + fn check_alpha_exists_rgba16() { + let image_size = 256usize; + let mut image = vec![0u16; image_size * image_size * 4]; + image[3] = 715; + image[7] = 715; + image[11] = 715; + image[15] = 715; + let has_alpha = neon_has_non_constant_cap_alpha_rgba16(&image, image_size, image_size * 4); + assert_eq!(true, has_alpha); + } +} diff 
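The helpers above fold an XOR of every alpha sample against the first one into a per-row horizontal sum and report true as soon as any row differs, presumably so alpha handling can be skipped when it would be a no-op. A scalar sketch of the same predicate for the 8-bit case (illustrative only; the function name is mine and it assumes stride >= width * 4):

    // Returns true when at least one alpha value differs from the first one.
    fn has_non_constant_alpha_rgba8(store: &[u8], width: usize, stride: usize) -> bool {
        if store.len() < 4 {
            return true; // mirrors the NEON helper, which reports "non-constant" for an empty store
        }
        let first_alpha = store[3];
        store
            .chunks_exact(stride)
            .any(|row| row[..width * 4].chunks_exact(4).any(|px| px[3] != first_alpha))
    }

The 16-bit variant is identical apart from the element type, which is what the tests at the end of the new file exercise.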
--git a/src/neon/f16_utils.rs b/src/neon/f16_utils.rs index 0a94d05..4b4f9ee 100644 --- a/src/neon/f16_utils.rs +++ b/src/neon/f16_utils.rs @@ -32,7 +32,6 @@ use std::arch::aarch64::*; use std::arch::asm; /// Provides basic support for f16 - #[allow(unused)] macro_rules! static_assert { ($e:expr) => { @@ -90,18 +89,21 @@ pub(crate) struct x_float16x8x4_t( ); #[inline] +#[cfg(feature = "half")] pub(crate) unsafe fn xvld_f16(ptr: *const half::f16) -> x_float16x4_t { - let store: uint16x4_t = vld1_u16(std::mem::transmute(ptr)); + let store: uint16x4_t = vld1_u16(ptr as *const _); std::mem::transmute(store) } #[inline] +#[cfg(feature = "half")] pub(crate) unsafe fn xvldq_f16(ptr: *const half::f16) -> x_float16x8_t { - let store: uint16x8_t = vld1q_u16(std::mem::transmute(ptr)); + let store: uint16x8_t = vld1q_u16(ptr as *const _); std::mem::transmute(store) } #[inline] +#[cfg(feature = "half")] pub(crate) unsafe fn xvldq_f16_x2(ptr: *const half::f16) -> x_float16x8x2_t { let ptr_u16 = ptr as *const u16; x_float16x8x2_t( @@ -111,6 +113,7 @@ pub(crate) unsafe fn xvldq_f16_x2(ptr: *const half::f16) -> x_float16x8x2_t { } #[inline] +#[cfg(feature = "half")] pub(crate) unsafe fn xvldq_f16_x4(ptr: *const half::f16) -> x_float16x8x4_t { let ptr_u16 = ptr as *const u16; x_float16x8x4_t( @@ -123,19 +126,25 @@ pub(crate) unsafe fn xvldq_f16_x4(ptr: *const half::f16) -> x_float16x8x4_t { #[inline] pub(crate) unsafe fn xvget_low_f16(x: x_float16x8_t) -> x_float16x4_t { - std::mem::transmute(vget_low_u16(std::mem::transmute(x))) + std::mem::transmute::(vget_low_u16(std::mem::transmute::< + x_float16x8_t, + uint16x8_t, + >(x))) } #[inline] pub(crate) unsafe fn xvget_high_f16(x: x_float16x8_t) -> x_float16x4_t { - std::mem::transmute(vget_high_u16(std::mem::transmute(x))) + std::mem::transmute::(vget_high_u16(std::mem::transmute::< + x_float16x8_t, + uint16x8_t, + >(x))) } #[inline] pub(crate) unsafe fn xcombine_f16(low: x_float16x4_t, high: x_float16x4_t) -> x_float16x8_t { - std::mem::transmute(vcombine_u16( - std::mem::transmute(low), - std::mem::transmute(high), + std::mem::transmute::(vcombine_u16( + std::mem::transmute::(low), + std::mem::transmute::(high), )) } @@ -159,12 +168,14 @@ pub(crate) unsafe fn xreinterpretq_f16_u16(x: uint16x8_t) -> x_float16x8_t { std::mem::transmute(x) } -#[inline] +/// Sets register to f16 zero +#[inline(always)] pub(super) unsafe fn xvzerosq_f16() -> x_float16x8_t { xreinterpretq_f16_u16(vdupq_n_u16(0)) } -#[inline] +/// Sets register to f16 zero +#[inline(always)] pub(super) unsafe fn xvzeros_f16() -> x_float16x4_t { xreinterpret_f16_u16(vdup_n_u16(0)) } @@ -192,6 +203,94 @@ pub(super) unsafe fn xvcvt_f16_f32(v: float32x4_t) -> x_float16x4_t { xreinterpret_f16_u16(result) } +/// This instruction converts each element in a vector from fixed-point to floating-point +/// using the rounding mode that is specified by the FPCR, and writes the result +/// to the SIMD&FP destination register. 
+/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtq_f16_u16) +#[inline] +#[target_feature(enable = "fp16")] +pub(super) unsafe fn xvcvtq_f16_u16(v: uint16x8_t) -> x_float16x8_t { + let result: uint16x8_t; + asm!( + "ucvtf {0:v}.8h, {1:v}.8h", + out(vreg) result, + in(vreg) v, + options(pure, nomem, nostack)); + xreinterpretq_f16_u16(result) +} + +/// This instruction converts each element in a vector from fixed-point to floating-point +/// using the rounding mode that is specified by the FPCR, and writes the result +/// to the SIMD&FP destination register. +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_f16_u16) +#[inline] +#[target_feature(enable = "fp16")] +pub(super) unsafe fn xvcvt_f16_u16(v: uint16x4_t) -> x_float16x4_t { + let result: uint16x4_t; + asm!( + "ucvtf {0:v}.4h, {1:v}.4h", + out(vreg) result, + in(vreg) v, + options(pure, nomem, nostack)); + xreinterpret_f16_u16(result) +} + +/// Floating-point Convert to Unsigned integer, rounding to nearest with ties to Away (vector). +/// This instruction converts each element in a vector from a floating-point value to an unsigned +/// integer value using the Round to Nearest with Ties to Away rounding mode and writes the result +/// to the SIMD&FP destination register. +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtaq_u16_f16) +#[inline] +#[target_feature(enable = "fp16")] +pub(super) unsafe fn xvcvtaq_u16_f16(v: x_float16x8_t) -> uint16x8_t { + let result: uint16x8_t; + asm!( + "fcvtau {0:v}.8h, {1:v}.8h", + out(vreg) result, + in(vreg) xreinterpretq_u16_f16(v), + options(pure, nomem, nostack)); + result +} + +/// Floating-point Convert to Unsigned integer, rounding to nearest with ties to Away (vector). +/// This instruction converts each element in a vector from a floating-point value to an unsigned +/// integer value using the Round to Nearest with Ties to Away rounding mode and writes the result +/// to the SIMD&FP destination register. +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvta_u16_f16) +#[inline] +#[target_feature(enable = "fp16")] +pub(super) unsafe fn xvcvta_u16_f16(v: x_float16x4_t) -> uint16x4_t { + let result: uint16x4_t; + asm!( + "fcvtau {0:v}.4h, {1:v}.4h", + out(vreg) result, + in(vreg) xreinterpret_u16_f16(v), + options(pure, nomem, nostack)); + result +} + +/// Floating-point Reciprocal Estimate. +/// This instruction finds an approximate reciprocal estimate for each vector element +/// in the source SIMD&FP register, places the result in a vector, +/// and writes the vector to the destination SIMD&FP register. +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrecpeq_f16) +#[inline] +#[target_feature(enable = "fp16")] +pub(super) unsafe fn xvrecpeq_f16(v: x_float16x8_t) -> x_float16x8_t { + let result: uint16x8_t; + asm!( + "frecpe {0:v}.8h, {1:v}.8h", + out(vreg) result, + in(vreg) xreinterpretq_u16_f16(v), + options(pure, nomem, nostack)); + xreinterpretq_f16_u16(result) +} + // #[inline] // pub(super) unsafe fn xvadd_f16(v1: x_float16x4_t, v2: x_float16x4_t) -> x_float16x4_t { // let result: uint16x4_t; @@ -239,6 +338,12 @@ pub(super) unsafe fn xvcombine_f16(v1: x_float16x4_t, v2: x_float16x4_t) -> x_fl // xreinterpret_f16_u16(result) // } +/// Floating-point fused Multiply-Add to accumulator (vector). 
+/// This instruction multiplies corresponding floating-point values in the vectors +/// in the two source SIMD&FP registers, adds the product to the corresponding vector element +/// of the destination SIMD&FP register, and writes the result to the destination SIMD&FP register. +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfma_f16) #[target_feature(enable = "fp16")] #[inline] pub(super) unsafe fn xvfmla_f16( @@ -257,6 +362,13 @@ pub(super) unsafe fn xvfmla_f16( xreinterpret_f16_u16(result) } +/// Floating-point fused Multiply-Add to accumulator (vector). +/// This instruction multiplies corresponding floating-point values in the vectors +/// in the two source SIMD&FP registers, adds the product to the corresponding +/// vector element of the destination SIMD&FP register, +/// and writes the result to the destination SIMD&FP register. +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfma_laneq_f16) #[target_feature(enable = "fp16")] #[inline] pub(super) unsafe fn xvfmla_laneq_f16( @@ -335,6 +447,13 @@ pub(super) unsafe fn xvfmla_laneq_f16( xreinterpret_f16_u16(result) } +/// Floating-point fused Multiply-Add to accumulator (vector). +/// This instruction multiplies corresponding floating-point values in the vectors +/// in the two source SIMD&FP registers, adds the product to the corresponding +/// vector element of the destination SIMD&FP register, +/// and writes the result to the destination SIMD&FP register. +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfma_lane_f16) #[target_feature(enable = "fp16")] #[inline] pub(super) unsafe fn xvfmla_lane_f16( @@ -381,6 +500,13 @@ pub(super) unsafe fn xvfmla_lane_f16( xreinterpret_f16_u16(result) } +/// Floating-point fused Multiply-Add to accumulator (vector). +/// This instruction multiplies corresponding floating-point values in the vectors +/// in the two source SIMD&FP registers, adds the product to the corresponding +/// vector element of the destination SIMD&FP register, +/// and writes the result to the destination SIMD&FP register. +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmaq_f16) #[target_feature(enable = "fp16")] #[inline] pub(super) unsafe fn xvfmlaq_f16( @@ -427,6 +553,12 @@ pub(super) unsafe fn xvfmlaq_f16( // xvadd_f16(a, xvmul_f16(b, c)) // } +/// Floating-point Multiply (vector). +/// This instruction multiplies corresponding floating-point values in the vectors in the two +/// source SIMD&FP registers, +/// places the result in a vector, and writes the vector to the destination SIMD&FP register. +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmulq_f16) #[target_feature(enable = "fp16")] #[inline] pub(super) unsafe fn xvmulq_f16(v1: x_float16x8_t, v2: x_float16x8_t) -> x_float16x8_t { @@ -441,6 +573,33 @@ pub(super) unsafe fn xvmulq_f16(v1: x_float16x8_t, v2: x_float16x8_t) -> x_float xreinterpretq_f16_u16(result) } +/// Floating-point Multiply (vector). +/// This instruction multiplies corresponding floating-point values in the vectors +/// in the two source SIMD&FP registers, places the result in a vector, +/// and writes the vector to the destination SIMD&FP register. 
+/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_f16) +#[target_feature(enable = "fp16")] +#[inline] +pub(super) unsafe fn xvmul_f16(v1: x_float16x4_t, v2: x_float16x4_t) -> x_float16x4_t { + let result: uint16x4_t; + asm!( + "fmul {0:v}.4h, {1:v}.4h, {2:v}.4h", + out(vreg) result, + in(vreg) xreinterpret_u16_f16(v1), + in(vreg) xreinterpret_u16_f16(v2), + options(pure, nomem, nostack) + ); + xreinterpret_f16_u16(result) +} + +/// Floating-point Divide (vector). +/// This instruction divides the floating-point values in the elements +/// in the first source SIMD&FP register, by the floating-point values +/// in the corresponding elements in the second source SIMD&FP register, +/// places the results in a vector, and writes the vector to the destination SIMD&FP register. +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdivq_f16) #[target_feature(enable = "fp16")] #[inline] pub(super) unsafe fn xvdivq_f16(v1: x_float16x8_t, v2: x_float16x8_t) -> x_float16x8_t { @@ -455,6 +614,12 @@ pub(super) unsafe fn xvdivq_f16(v1: x_float16x8_t, v2: x_float16x8_t) -> x_float xreinterpretq_f16_u16(result) } +/// Bitwise Select. +/// This instruction sets each bit in the destination SIMD&FP register +/// to the corresponding bit from the first source SIMD&FP register when the +/// original destination bit was 1, otherwise from the second source SIMD&FP register. +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vbslq_f16) #[target_feature(enable = "fp16")] #[inline] pub(super) unsafe fn xvbslq_f16( @@ -474,16 +639,19 @@ pub(super) unsafe fn xvbslq_f16( } #[inline] +#[cfg(feature = "half")] pub(crate) unsafe fn xvst_f16(ptr: *mut half::f16, x: x_float16x4_t) { - vst1_u16(std::mem::transmute(ptr), xreinterpret_u16_f16(x)) + vst1_u16(ptr as *mut u16, xreinterpret_u16_f16(x)) } #[inline] +#[cfg(feature = "half")] pub(crate) unsafe fn xvstq_f16(ptr: *mut half::f16, x: x_float16x8_t) { - vst1q_u16(std::mem::transmute(ptr), xreinterpretq_u16_f16(x)) + vst1q_u16(ptr as *mut u16, xreinterpretq_u16_f16(x)) } #[inline] +#[cfg(feature = "half")] pub(crate) unsafe fn xvstq_f16_x2(ptr: *mut half::f16, x: x_float16x8x2_t) { let ptr_u16 = ptr as *mut u16; vst1q_u16(ptr_u16, xreinterpretq_u16_f16(x.0)); @@ -491,6 +659,7 @@ pub(crate) unsafe fn xvstq_f16_x2(ptr: *mut half::f16, x: x_float16x8x2_t) { } #[inline] +#[cfg(feature = "half")] pub(crate) unsafe fn xvstq_f16_x4(ptr: *const half::f16, x: x_float16x8x4_t) { let ptr_u16 = ptr as *mut u16; vst1q_u16(ptr_u16, xreinterpretq_u16_f16(x.0)); @@ -510,6 +679,7 @@ pub(crate) unsafe fn xvdup_laneq_f16(a: x_float16x8_t) -> x_float1 } #[inline] +#[cfg(feature = "half")] pub(crate) unsafe fn xvld1q_lane_f16( ptr: *const half::f16, src: x_float16x8_t, @@ -521,6 +691,7 @@ pub(crate) unsafe fn xvld1q_lane_f16( } #[inline] +#[cfg(feature = "half")] pub(crate) unsafe fn xvsetq_lane_f16( v: half::f16, r: x_float16x8_t, @@ -531,6 +702,13 @@ pub(crate) unsafe fn xvsetq_lane_f16( )) } +/// Floating-point Compare Equal to zero (vector). +/// This instruction reads each floating-point value in the source SIMD&FP register +/// and if the value is equal to zero sets every bit of the corresponding vector element +/// in the destination SIMD&FP register to one, otherwise sets every bit of the +/// corresponding vector element in the destination SIMD&FP register to zero. 
+/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqzq_f16) #[target_feature(enable = "fp16")] #[inline] pub(crate) unsafe fn vceqzq_f16(a: x_float16x8_t) -> uint16x8_t { diff --git a/src/neon/mod.rs b/src/neon/mod.rs index 00c0c9d..c537f9e 100644 --- a/src/neon/mod.rs +++ b/src/neon/mod.rs @@ -34,9 +34,9 @@ mod alpha_f32; mod alpha_u16; mod alpha_u8; mod ar30; +mod check_alpha; #[cfg(feature = "half")] mod convolve_f16; -#[cfg(feature = "half")] mod f16_utils; mod horizontal_ar30; mod plane_f32; @@ -63,7 +63,9 @@ mod vertical_f16_full; mod vertical_f32; mod vertical_u16; mod vertical_u16_lb; +mod vertical_u16_lb_f16; mod vertical_u8; +mod weights; #[cfg(feature = "half")] pub(crate) use alpha_f16::{neon_premultiply_alpha_rgba_f16, neon_unpremultiply_alpha_rgba_f16}; @@ -76,7 +78,9 @@ pub(crate) use alpha_f32::neon_unpremultiply_alpha_rgba_f32; pub(crate) use alpha_u16::{neon_premultiply_alpha_rgba_u16, neon_unpremultiply_alpha_rgba_u16}; pub(crate) use alpha_u8::neon_premultiply_alpha_rgba; pub(crate) use alpha_u8::neon_unpremultiply_alpha_rgba; -#[cfg(feature = "half")] +pub(crate) use check_alpha::{ + neon_has_non_constant_cap_alpha_rgba16, neon_has_non_constant_cap_alpha_rgba8, +}; pub(crate) use f16_utils::*; pub(crate) use horizontal_ar30::neon_convolve_horizontal_rgba_rows_4_ar30; pub(crate) use plane_f32::convolve_horizontal_plane_neon_row_one; @@ -122,6 +126,8 @@ pub(crate) use vertical_f16_full::xconvolve_vertical_rgb_neon_row_f16; pub(crate) use vertical_f32::convolve_vertical_rgb_neon_row_f32; pub(crate) use vertical_u16::convolve_column_u16; pub(crate) use vertical_u16_lb::convolve_column_lb_u16; +pub(crate) use vertical_u16_lb_f16::convolve_column_lb_u16_f16; pub(crate) use vertical_u8::{ convolve_vertical_neon_i16_precision, convolve_vertical_neon_i32_precision, }; +pub(crate) use weights::convert_weights_to_f16; diff --git a/src/neon/rgba_u8.rs b/src/neon/rgba_u8.rs index aa8e265..183097c 100644 --- a/src/neon/rgba_u8.rs +++ b/src/neon/rgba_u8.rs @@ -195,6 +195,13 @@ unsafe fn conv_horiz_rgba_1_u8_i16( vqrdmlah_s16(store, lo, w0) } +/// Checking NEON `rdm` availability is required before a call. +/// +/// RDM feature has slightly lower precision and won't work really well on huge kernel which +/// edges fades out fast. Therefore, it would be reasonable to avoid using feature for huge downscaling. +/// +/// # Safety +/// - Check `rdm` availability before the call. 
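Because the doc comment above requires the caller to verify `rdm` before invoking this kernel, a hypothetical caller-side gate could look like the sketch below (illustrative only, not part of this patch; everything except the standard detection macro is a placeholder):

    // Hypothetical runtime gate: only select the `_i16` (RDM) kernels when the CPU supports
    // the extension, keeping the full-precision path as the fallback.
    #[cfg(target_arch = "aarch64")]
    fn rdm_available() -> bool {
        std::arch::is_aarch64_feature_detected!("rdm")
    }

Combined with the precision caveat above, a dispatcher would also keep the full-precision kernels for very large downscales, where the RDM rounding error is most visible.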
pub(crate) fn convolve_horizontal_rgba_neon_rows_4_u8_i16( src: &[u8], src_stride: usize, @@ -316,21 +323,22 @@ unsafe fn convolve_horizontal_rgba_neon_rows_4_u8_i16_impl( let store_16_8_2 = vqmovun_s16(vcombine_s16(store_16_2, store_16_2)); let store_16_8 = vqmovun_s16(vcombine_s16(store_16_3, store_16_3)); - let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8_0)); - let dest_ptr_32 = chunk0.as_mut_ptr() as *mut u32; - dest_ptr_32.write_unaligned(pixel); - - let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8_1)); - let dest_ptr_32 = chunk1.as_mut_ptr() as *mut u32; - dest_ptr_32.write_unaligned(pixel); - - let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8_2)); - let dest_ptr_32 = chunk2.as_mut_ptr() as *mut u32; - dest_ptr_32.write_unaligned(pixel); - - let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8)); - let dest_ptr_32 = chunk3.as_mut_ptr() as *mut u32; - dest_ptr_32.write_unaligned(pixel); + vst1_lane_u32::<0>( + chunk0.as_mut_ptr() as *mut u32, + vreinterpret_u32_u8(store_16_8_0), + ); + vst1_lane_u32::<0>( + chunk1.as_mut_ptr() as *mut u32, + vreinterpret_u32_u8(store_16_8_1), + ); + vst1_lane_u32::<0>( + chunk2.as_mut_ptr() as *mut u32, + vreinterpret_u32_u8(store_16_8_2), + ); + vst1_lane_u32::<0>( + chunk3.as_mut_ptr() as *mut u32, + vreinterpret_u32_u8(store_16_8), + ); } } @@ -510,14 +518,21 @@ pub(crate) fn convolve_horizontal_rgba_neon_row( let store_16 = vqshrun_n_s32::(store); let store_16_8 = vqmovn_u16(vcombine_u16(store_16, store_16)); - let value = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8)); - let dest_ptr_32 = dst.as_mut_ptr() as *mut u32; - dest_ptr_32.write_unaligned(value); + vst1_lane_u32::<0>( + dst.as_mut_ptr() as *mut u32, + vreinterpret_u32_u8(store_16_8), + ); } } } -/// Checking NEON `rdm` availability is required before a call +/// Checking NEON `rdm` availability is required before a call. +/// +/// RDM feature has slightly lower precision and won't work really well on huge kernel which +/// edges fades out fast. Therefore, it would be reasonable to avoid using feature for huge downscaling. +/// +/// # Safety +/// - Check `rdm` availability before the call. 
pub(crate) fn convolve_horizontal_rgba_neon_row_i16( src: &[u8], dst: &mut [u8], diff --git a/src/neon/utils.rs b/src/neon/utils.rs index 76c2381..a435d96 100644 --- a/src/neon/utils.rs +++ b/src/neon/utils.rs @@ -138,19 +138,13 @@ pub(crate) unsafe fn prefer_vfmaq_lane_f32( #[inline(always)] pub(crate) unsafe fn load_3b_as_u16x4(src_ptr: *const u8) -> uint16x4_t { - let v_new_value1 = u16::from_le_bytes([src_ptr.read_unaligned(), 0]); - let v_new_value2 = u16::from_le_bytes([src_ptr.add(1).read_unaligned(), 0]); - let v_new_value3 = u16::from_le_bytes([src_ptr.add(2).read_unaligned(), 0]); - let arr = [v_new_value1, v_new_value2, v_new_value3, 0]; - vld1_u16(arr.as_ptr()) + let mut v = vreinterpret_u8_u16(vld1_lane_u16::<0>(src_ptr as *const u16, vdup_n_u16(0))); + v = vld1_lane_u8::<2>(src_ptr.add(2), v); + vget_low_u16(vmovl_u8(v)) } #[inline(always)] pub(crate) unsafe fn load_4b_as_u16x4(src_ptr: *const u8) -> uint16x4_t { - let v_new_value1 = u16::from_le_bytes([src_ptr.read_unaligned(), 0]); - let v_new_value2 = u16::from_le_bytes([src_ptr.add(1).read_unaligned(), 0]); - let v_new_value3 = u16::from_le_bytes([src_ptr.add(2).read_unaligned(), 0]); - let v_new_value4 = u16::from_le_bytes([src_ptr.add(3).read_unaligned(), 0]); - let arr = [v_new_value1, v_new_value2, v_new_value3, v_new_value4]; - vld1_u16(arr.as_ptr()) + let j = vreinterpret_u8_u32(vld1_lane_u32::<0>(src_ptr as *const u32, vdup_n_u32(0))); + vget_low_u16(vmovl_u8(j)) } diff --git a/src/neon/vertical_f32.rs b/src/neon/vertical_f32.rs index 71226ef..d85d1d6 100644 --- a/src/neon/vertical_f32.rs +++ b/src/neon/vertical_f32.rs @@ -255,7 +255,7 @@ unsafe fn convolve_vertical_part_neon_1_f32( } let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); - dst_ptr.write_unaligned(vgetq_lane_f32::<0>(store_0)); + vst1q_lane_f32::<0>(dst_ptr, store_0); } pub(crate) fn convolve_vertical_rgb_neon_row_f32( diff --git a/src/neon/vertical_u16.rs b/src/neon/vertical_u16.rs index bf34f50..05aaba2 100644 --- a/src/neon/vertical_u16.rs +++ b/src/neon/vertical_u16.rs @@ -221,10 +221,10 @@ pub(crate) fn convolve_column_u16( store3 = prefer_vfmaq_f32(store3, hi1, v_weight); } } - let u_store0 = vminq_u32(vcvtaq_u32_f32(vmaxq_f32(store0, zeros)), v_max_colors); - let u_store1 = vminq_u32(vcvtaq_u32_f32(vmaxq_f32(store1, zeros)), v_max_colors); - let u_store2 = vminq_u32(vcvtaq_u32_f32(vmaxq_f32(store2, zeros)), v_max_colors); - let u_store3 = vminq_u32(vcvtaq_u32_f32(vmaxq_f32(store3, zeros)), v_max_colors); + let u_store0 = vminq_u32(vcvtaq_u32_f32(store0), v_max_colors); + let u_store1 = vminq_u32(vcvtaq_u32_f32(store1), v_max_colors); + let u_store2 = vminq_u32(vcvtaq_u32_f32(store2), v_max_colors); + let u_store3 = vminq_u32(vcvtaq_u32_f32(store3), v_max_colors); let item0 = vcombine_u16(vqmovn_u32(u_store0), vqmovn_u32(u_store1)); vst1q_u16(dst.as_mut_ptr(), item0); @@ -348,8 +348,8 @@ pub(crate) fn convolve_column_u16( } } - let u_store0 = vminq_u32(vcvtaq_u32_f32(vmaxq_f32(store0, zeros)), v_max_colors); - let u_store1 = vminq_u32(vcvtaq_u32_f32(vmaxq_f32(store1, zeros)), v_max_colors); + let u_store0 = vminq_u32(vcvtaq_u32_f32(store0), v_max_colors); + let u_store1 = vminq_u32(vcvtaq_u32_f32(store1), v_max_colors); let item = vcombine_u16(vqmovn_u32(u_store0), vqmovn_u32(u_store1)); vst1q_u16(dst.as_mut_ptr(), item); @@ -444,7 +444,7 @@ pub(crate) fn convolve_column_u16( } } - let u_store0 = vminq_u32(vcvtaq_u32_f32(vmaxq_f32(store0, zeros)), v_max_colors); + let u_store0 = vminq_u32(vcvtaq_u32_f32(store0), v_max_colors); 
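// Illustrative note, not part of this patch: removing the former vmaxq_f32(store, zeros) clamps
// before these conversions is harmless because FCVTAU (vcvtaq_u32_f32) saturates negative inputs
// to 0 when converting to an unsigned integer, so the explicit max-with-zero was redundant; only
// the upper clamp against v_max_colors is still needed.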
vst1_u16(dst.as_mut_ptr(), vqmovn_u32(u_store0)); diff --git a/src/neon/vertical_u16_lb_f16.rs b/src/neon/vertical_u16_lb_f16.rs new file mode 100644 index 0000000..4bc7f1c --- /dev/null +++ b/src/neon/vertical_u16_lb_f16.rs @@ -0,0 +1,188 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +use crate::filter_weights::FilterBounds; +use crate::neon::f16_utils::{ + xvcvt_f16_u16, xvcvta_u16_f16, xvcvtaq_u16_f16, xvcvtq_f16_u16, xvfmla_f16, xvfmlaq_f16, + xvzerosq_f16, +}; +use crate::neon::{xreinterpret_f16_u16, xreinterpretq_f16_u16, xvget_low_f16}; +use std::arch::aarch64::*; + +pub(crate) fn convolve_column_lb_u16_f16( + j0: usize, + bounds: &FilterBounds, + src: &[u16], + dst: &mut [u16], + src_stride: usize, + weight: &[i16], + bit_depth: u32, +) { + unsafe { + let transmuted_u16 = + std::slice::from_raw_parts(weight.as_ptr() as *const u16, weight.len()); + convolve_column_lb_u16_f16_impl( + j0, + bounds, + src, + dst, + src_stride, + transmuted_u16, + bit_depth, + ); + } +} + +#[target_feature(enable = "fp16")] +unsafe fn convolve_column_lb_u16_f16_impl( + _: usize, + bounds: &FilterBounds, + src: &[u16], + dst: &mut [u16], + src_stride: usize, + weight: &[u16], + bit_depth: u32, +) { + let max_colors = (1 << bit_depth) - 1; + let mut cx = 0usize; + + let bounds_size = bounds.size; + + let initial_store = xvzerosq_f16(); + + let v_max_colors = vdupq_n_u16(max_colors); + + let v_px = cx; + + let iter16 = dst.chunks_exact_mut(16); + + for (x, dst) in iter16.enumerate() { + let mut store0 = initial_store; + let mut store1 = initial_store; + + let v_dx = v_px + x * 16; + + for (j, &k_weight) in weight.iter().take(bounds_size).enumerate() { + let py = bounds.start + j; + let src_ptr = src.get_unchecked((src_stride * py + v_dx)..); + + let v_weight = xreinterpretq_f16_u16(vdupq_n_u16(k_weight)); + + let item_row0 = vld1q_u16(src_ptr.as_ptr()); + let item_row1 = vld1q_u16(src_ptr.as_ptr().add(8)); + + store0 = xvfmlaq_f16(store0, xvcvtq_f16_u16(item_row0), v_weight); + store1 = xvfmlaq_f16(store1, xvcvtq_f16_u16(item_row1), v_weight); + } + + let item0 = vminq_u16(xvcvtaq_u16_f16(store0), v_max_colors); + let item1 = vminq_u16(xvcvtaq_u16_f16(store1), v_max_colors); + + vst1q_u16(dst.as_mut_ptr(), item0); + vst1q_u16(dst.as_mut_ptr().add(8), item1); + + cx = v_dx; + } + + let tail16 = dst.chunks_exact_mut(16).into_remainder(); + let iter8 = tail16.chunks_exact_mut(8); + + let v_px = cx; + + for (x, dst) in iter8.enumerate() { + let mut store0 = initial_store; + + let v_dx = v_px + x * 8; + + for (j, &k_weight) in weight.iter().take(bounds_size).enumerate() { + let py = bounds.start + j; + let src_ptr = src.get_unchecked((src_stride * py + v_dx)..); + + let v_weight = xreinterpretq_f16_u16(vdupq_n_u16(k_weight)); + + let item_row = vld1q_u16(src_ptr.as_ptr()); + + store0 = xvfmlaq_f16(store0, xvcvtq_f16_u16(item_row), v_weight); + } + + let item = vminq_u16(xvcvtaq_u16_f16(store0), v_max_colors); + vst1q_u16(dst.as_mut_ptr(), item); + + cx = v_dx; + } + + let tail8 = tail16.chunks_exact_mut(8).into_remainder(); + let iter4 = tail8.chunks_exact_mut(4); + + let v_cx = cx; + + for (x, dst) in iter4.enumerate() { + let mut store0 = xvget_low_f16(initial_store); + + let v_dx = v_cx + x * 4; + + for (j, &k_weight) in weight.iter().take(bounds_size).enumerate() { + let py = bounds.start + j; + let src_ptr = src.get_unchecked((src_stride * py + v_dx)..); + + let v_weight = xreinterpret_f16_u16(vdup_n_u16(k_weight)); + + let item_row = vld1_u16(src_ptr.as_ptr()); + + store0 = xvfmla_f16(store0, xvcvt_f16_u16(item_row), v_weight); + } + + let u_store0 = vmin_u16(xvcvta_u16_f16(store0), vget_low_u16(v_max_colors)); + vst1_u16(dst.as_mut_ptr(), u_store0); + + cx = v_dx; + } + + let tail4 = tail8.chunks_exact_mut(4).into_remainder(); + + let a_px = cx; + + for (x, dst) in 
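// Illustrative note, not part of this patch: the weights arrive here as i16 values that are
// really binary16 bit patterns produced by convert_weights_to_f16 (src/neon/weights.rs), which
// is why the public wrapper reinterprets the slice as u16 and each lane weight is rebuilt with
// xreinterpretq_f16_u16(vdupq_n_u16(k_weight)). Each output column is then
// round(sum over j of weight_j * src[bounds.start + j][x]), accumulated in f16 and clamped to
// (1 << bit_depth) - 1. The loop below handles the final, fewer-than-four leftover columns one
// element at a time.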
tail4.iter_mut().enumerate() { + let mut store0 = xvget_low_f16(initial_store); + + let v_px = a_px + x; + + for (j, &k_weight) in weight.iter().take(bounds_size).enumerate() { + let py = bounds.start + j; + let offset = src_stride * py + v_px; + let src_ptr = src.get_unchecked(offset..(offset + 1)); + + let v_weight = xreinterpret_f16_u16(vdup_n_u16(k_weight)); + let item_row = xvcvt_f16_u16(vld1_lane_u16::<0>(src_ptr.as_ptr(), vdup_n_u16(0))); + store0 = xvfmla_f16(store0, item_row, v_weight); + } + + let u_store0 = vmin_u16(xvcvta_u16_f16(store0), vget_low_u16(v_max_colors)); + vst1_lane_u16::<0>(dst, u_store0); + } +} diff --git a/src/neon/vertical_u8.rs b/src/neon/vertical_u8.rs index 2a0f44a..62bb824 100644 --- a/src/neon/vertical_u8.rs +++ b/src/neon/vertical_u8.rs @@ -69,6 +69,13 @@ macro_rules! accumulate_4_into_lane { }}; } +/// Checking NEON `rdm` availability is required before a call. +/// +/// RDM feature has slightly lower precision and won't work really well on huge kernel which +/// edges fades out fast. Therefore, it would be reasonable to avoid using feature for huge downscaling. +/// +/// # Safety +/// - Check `rdm` availability before the call. pub(crate) fn convolve_vertical_neon_i16_precision( width: usize, bounds: &FilterBounds, @@ -133,7 +140,6 @@ unsafe fn convolve_vertical_neon_row_upper( let mut cx = 0usize; unsafe { - let zeros = vdupq_n_s16(0); let iter_64 = dst.chunks_exact_mut(64); let bounds_size = bounds.size; @@ -258,15 +264,6 @@ unsafe fn convolve_vertical_neon_row_upper( } } - store_0 = vmaxq_s16(store_0, zeros); - store_1 = vmaxq_s16(store_1, zeros); - store_2 = vmaxq_s16(store_2, zeros); - store_3 = vmaxq_s16(store_3, zeros); - store_4 = vmaxq_s16(store_4, zeros); - store_5 = vmaxq_s16(store_5, zeros); - store_6 = vmaxq_s16(store_6, zeros); - store_7 = vmaxq_s16(store_7, zeros); - let item00 = vqshrun_n_s16::(store_0); let item01 = vqshrun_n_s16::(store_1); let item10 = vqshrun_n_s16::(store_2); @@ -380,11 +377,6 @@ unsafe fn convolve_vertical_neon_row_upper( } } - store_0 = vmaxq_s16(store_0, zeros); - store_1 = vmaxq_s16(store_1, zeros); - store_2 = vmaxq_s16(store_2, zeros); - store_3 = vmaxq_s16(store_3, zeros); - let item00 = vqshrun_n_s16::(store_0); let item01 = vqshrun_n_s16::(store_1); let item10 = vqshrun_n_s16::(store_2); @@ -475,9 +467,6 @@ unsafe fn convolve_vertical_neon_row_upper( } } - store_0 = vmaxq_s16(store_0, zeros); - store_1 = vmaxq_s16(store_1, zeros); - let item0 = vqshrun_n_s16::(store_0); let item1 = vqshrun_n_s16::(store_1); @@ -568,8 +557,6 @@ unsafe fn convolve_vertical_neon_row_upper( } } - store_0 = vmaxq_s16(store_0, zeros); - let item = vqshrun_n_s16::(store_0); vst1_u8(dst.as_mut_ptr(), item); @@ -658,8 +645,6 @@ unsafe fn convolve_vertical_neon_row_upper( } } - store = vmaxq_s16(store, zeros); - let shrinked_store = vqshrun_n_s16::(store); let value = vget_lane_u8::<0>(shrinked_store); *dst = value; @@ -1207,10 +1192,6 @@ fn convolve_vertical_neon_row_full( } } - let zeros = vdupq_n_s32(0); - - store = vmaxq_s32(store, zeros); - let shrinked_store = vqshrun_n_s32::(store); let low_16 = vcombine_u16(shrinked_store, shrinked_store); diff --git a/src/neon/weights.rs b/src/neon/weights.rs new file mode 100644 index 0000000..2f7e390 --- /dev/null +++ b/src/neon/weights.rs @@ -0,0 +1,70 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +use crate::neon::f16_utils::{xvcombine_f16, xvcvt_f16_f32}; +use crate::neon::utils::xvld1q_f32_x2; +use crate::neon::{xreinterpret_u16_f16, xreinterpretq_u16_f16}; +use std::arch::aarch64::*; + +pub(crate) fn convert_weights_to_f16(weights: &[f32]) -> Vec { + unsafe { convert_weights_to_f16_impl(weights) } +} + +#[target_feature(enable = "fp16")] +unsafe fn convert_weights_to_f16_impl(weights: &[f32]) -> Vec { + let mut new_weights = vec![0i16; weights.len()]; + + for (dst, src) in new_weights.chunks_exact_mut(8).zip(weights.chunks_exact(8)) { + let j = xvld1q_f32_x2(src.as_ptr()); + let cvt0 = xvcvt_f16_f32(j.0); + let cvt1 = xvcvt_f16_f32(j.1); + vst1q_u16( + dst.as_mut_ptr() as *mut u16, + xreinterpretq_u16_f16(xvcombine_f16(cvt0, cvt1)), + ); + } + + let dst = new_weights.chunks_exact_mut(8).into_remainder(); + let src = weights.chunks_exact(8).remainder(); + + for (dst, src) in dst.chunks_exact_mut(4).zip(src.chunks_exact(4)) { + let j = vld1q_f32(src.as_ptr()); + let cvt = xvcvt_f16_f32(j); + vst1_u16(dst.as_mut_ptr() as *mut u16, xreinterpret_u16_f16(cvt)); + } + + let dst = dst.chunks_exact_mut(4).into_remainder(); + let src = src.chunks_exact(4).remainder(); + + for (dst, src) in dst.chunks_exact_mut(1).zip(src.iter()) { + let j = xvcvt_f16_f32(vld1q_lane_f32::<0>(src, vdupq_n_f32(0.))); + vst1_lane_u16::<0>(dst.as_mut_ptr() as *mut u16, xreinterpret_u16_f16(j)); + } + + new_weights +} diff --git a/src/plane_u8.rs b/src/plane_u8.rs index b2d43b8..099febe 100644 --- a/src/plane_u8.rs +++ b/src/plane_u8.rs @@ -30,7 +30,7 @@ use crate::avx2::{convolve_vertical_avx_row, convolve_vertical_avx_row_lp}; use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::dispatch_group_u8::{convolve_horizontal_dispatch_u8, convolve_vertical_dispatch_u8}; -use crate::filter_weights::{FilterBounds, FilterWeights}; +use crate::filter_weights::{DefaultWeightsConverter, FilterBounds, FilterWeights}; use crate::handler_provider::{handle_fixed_column_u8, handle_fixed_row_u8}; use crate::image_store::ImageStoreMut; #[cfg(all(target_arch = "aarch64", target_feature = "neon",))] @@ -79,6 +79,7 @@ impl HorizontalConvolutionPass for 
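convert_weights_to_f16 above stores each f32 weight as the raw bit pattern of its binary16 value inside an i16 buffer, presumably so the converted weights can flow through the existing i16 weight plumbing unchanged. A scalar equivalent, up to rounding mode, looks like this (illustrative only; it assumes the optional `half` crate that this library already uses behind the `half` feature):

    // Scalar reference: f32 weights -> raw binary16 bit patterns carried in an i16 buffer.
    fn convert_weights_to_f16_scalar(weights: &[f32]) -> Vec<i16> {
        weights
            .iter()
            .map(|&w| half::f16::from_f32(w).to_bits() as i16)
            .collect()
    }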
ImageStore<'_, u8, 1> { _pool, _dispatcher_4_rows, _dispatcher_1_row, + DefaultWeightsConverter::default(), ); } } diff --git a/src/rgb_u8.rs b/src/rgb_u8.rs index 8aff370..dae1038 100644 --- a/src/rgb_u8.rs +++ b/src/rgb_u8.rs @@ -30,7 +30,7 @@ use crate::avx2::{convolve_vertical_avx_row, convolve_vertical_avx_row_lp}; use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::dispatch_group_u8::{convolve_horizontal_dispatch_u8, convolve_vertical_dispatch_u8}; -use crate::filter_weights::{FilterBounds, FilterWeights}; +use crate::filter_weights::{DefaultWeightsConverter, FilterBounds, FilterWeights}; use crate::handler_provider::{ handle_fixed_column_u8, handle_fixed_row_u8, handle_fixed_rows_4_u8, }; @@ -78,6 +78,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 3> { pool, _dispatcher_4_rows, _dispatcher_1_row, + DefaultWeightsConverter::default(), ); } } diff --git a/src/rgba_u8.rs b/src/rgba_u8.rs index b99b586..14d3a23 100644 --- a/src/rgba_u8.rs +++ b/src/rgba_u8.rs @@ -35,7 +35,7 @@ use crate::avx2::{ }; use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::dispatch_group_u8::{convolve_horizontal_dispatch_u8, convolve_vertical_dispatch_u8}; -use crate::filter_weights::{FilterBounds, FilterWeights}; +use crate::filter_weights::{DefaultWeightsConverter, FilterBounds, FilterWeights}; use crate::handler_provider::{ handle_fixed_column_u8, handle_fixed_row_u8, handle_fixed_rows_4_u8, }; @@ -79,7 +79,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 4> { } #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] { - if std::is_x86_feature_detected!("sse4.1") { + if std::arch::is_x86_feature_detected!("sse4.1") { if _scale_factor < 8. { _dispatcher_4_rows = Some(convolve_horizontal_rgba_sse_rows_4_lb); _dispatcher_1_row = convolve_horizontal_rgba_sse_rows_one_lb; @@ -88,7 +88,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 4> { _dispatcher_1_row = convolve_horizontal_rgba_sse_rows_one; } } - if std::is_x86_feature_detected!("avx2") || _scale_factor < 8. { + if std::arch::is_x86_feature_detected!("avx2") || _scale_factor < 8. { _dispatcher_4_rows = Some(convolve_horizontal_rgba_avx_rows_4_lb); _dispatcher_1_row = convolve_horizontal_rgba_avx_rows_one_lb; } @@ -100,6 +100,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 4> { _pool, _dispatcher_4_rows, _dispatcher_1_row, + DefaultWeightsConverter::default(), ); } } @@ -126,14 +127,14 @@ impl VerticalConvolutionPass for ImageStore<'_, u8, 4> { } #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] { - if std::is_x86_feature_detected!("sse4.1") { + if std::arch::is_x86_feature_detected!("sse4.1") { if _scale_factor < 8. { _dispatcher = convolve_vertical_sse_row_lp; } else { _dispatcher = convolve_vertical_sse_row; } } - if std::is_x86_feature_detected!("avx2") { + if std::arch::is_x86_feature_detected!("avx2") { if _scale_factor < 8. 
{ _dispatcher = convolve_vertical_avx_row_lp; } else { diff --git a/src/scaler.rs b/src/scaler.rs index e726d39..283ccf4 100644 --- a/src/scaler.rs +++ b/src/scaler.rs @@ -1044,3 +1044,196 @@ impl Scaler { } } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn check_rgba8_resizing_vertical() { + let image_width = 255; + let image_height = 512; + const CN: usize = 4; + let mut image = vec![0u8; image_height * image_width * CN]; + image[image_width * CN * (image_height.div_ceil(2)) + (image_width - 1) * CN] = 174; + let mut scaler = Scaler::new(ResamplingFunction::Bilinear); + scaler.set_threading_policy(ThreadingPolicy::Single); + let src_store = ImageStore::from_slice(&image, image_width, image_height).unwrap(); + let mut target_store = ImageStoreMut::alloc(image_width, image_height / 2); + scaler + .resize_rgba(&src_store, &mut target_store, false) + .unwrap(); + let target_data = target_store.buffer.borrow(); + + let resized = target_data + [image_width * CN * ((image_height / 2).div_ceil(2)) + (image_width - 1) * CN]; + assert_ne!(resized, 0); + } + + #[test] + fn check_rgba8_resizing_both() { + let image_width = 255; + let image_height = 512; + const CN: usize = 4; + let mut image = vec![0u8; image_height * image_width * CN]; + image[0] = 174; + let mut scaler = Scaler::new(ResamplingFunction::Bilinear); + scaler.set_threading_policy(ThreadingPolicy::Single); + let src_store = ImageStore::from_slice(&image, image_width, image_height).unwrap(); + let mut target_store = ImageStoreMut::alloc(image_width / 2, image_height / 2); + scaler + .resize_rgba(&src_store, &mut target_store, false) + .unwrap(); + let target_data = target_store.buffer.borrow(); + + let resized = target_data[0]; + assert_ne!(resized, 0); + } + + #[test] + fn check_rgba8_resizing_alpha() { + let image_width = 255; + let image_height = 512; + const CN: usize = 4; + let mut image = vec![0u8; image_height * image_width * CN]; + image[0] = 174; + image[7] = 1; + let mut scaler = Scaler::new(ResamplingFunction::Bilinear); + scaler.set_threading_policy(ThreadingPolicy::Single); + let src_store = ImageStore::from_slice(&image, image_width, image_height).unwrap(); + let mut target_store = ImageStoreMut::alloc(image_width / 2, image_height / 2); + scaler + .resize_rgba(&src_store, &mut target_store, true) + .unwrap(); + let target_data = target_store.buffer.borrow(); + + let resized = target_data[0]; + assert_eq!(resized, 0); + } + + #[test] + fn check_rgb8_resizing_vertical() { + let image_width = 255; + let image_height = 512; + const CN: usize = 3; + let mut image = vec![0u8; image_height * image_width * CN]; + image[image_width * CN * (image_height.div_ceil(2)) + (image_width - 1) * CN] = 174; + let mut scaler = Scaler::new(ResamplingFunction::Bilinear); + scaler.set_threading_policy(ThreadingPolicy::Single); + let src_store = ImageStore::from_slice(&image, image_width, image_height).unwrap(); + let mut target_store = ImageStoreMut::alloc(image_width, image_height / 2); + scaler.resize_rgb(&src_store, &mut target_store).unwrap(); + let target_data = target_store.buffer.borrow(); + + let resized = target_data + [image_width * CN * ((image_height / 2).div_ceil(2)) + (image_width - 1) * CN]; + assert_ne!(resized, 0); + } + + #[test] + fn check_rgba10_resizing_vertical() { + let image_width = 8; + let image_height = 8; + const CN: usize = 4; + let mut image = vec![0u16; image_height * image_width * CN]; + image[image_width * CN * (image_height.div_ceil(2)) + (image_width - 1) * CN] = 174; + let mut scaler = 
Scaler::new(ResamplingFunction::Lanczos3); + scaler.set_threading_policy(ThreadingPolicy::Single); + let mut src_store = ImageStore::from_slice(&image, image_width, image_height).unwrap(); + src_store.bit_depth = 10; + let mut target_store = ImageStoreMut::alloc_with_depth(image_width, image_height / 2, 10); + scaler + .resize_rgba_u16(&src_store, &mut target_store, false) + .unwrap(); + let target_data = target_store.buffer.borrow(); + + let resized = target_data + [image_width * CN * ((image_height / 2).div_ceil(2)) + (image_width - 1) * CN]; + assert_ne!(resized, 0); + } + + #[test] + fn check_rgb10_resizing_vertical() { + let image_width = 8; + let image_height = 4; + const CN: usize = 3; + let mut image = vec![0; image_height * image_width * CN]; + image[0] = 174; + let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); + scaler.set_threading_policy(ThreadingPolicy::Single); + let mut src_store = ImageStore::from_slice(&image, image_width, image_height).unwrap(); + src_store.bit_depth = 10; + let mut target_store = ImageStoreMut::alloc_with_depth(image_width, image_height / 2, 10); + scaler + .resize_rgb_u16(&src_store, &mut target_store) + .unwrap(); + let target_data = target_store.buffer.borrow(); + + let resized = target_data[0]; + assert_ne!(resized, 0); + } + + #[test] + fn check_rgb16_resizing_vertical() { + let image_width = 8; + let image_height = 8; + const CN: usize = 3; + let mut image = vec![164; image_height * image_width * CN]; + image[0] = 174; + let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); + scaler.set_threading_policy(ThreadingPolicy::Single); + let mut src_store = ImageStore::from_slice(&image, image_width, image_height).unwrap(); + src_store.bit_depth = 10; + let mut target_store = ImageStoreMut::alloc_with_depth(image_width, image_height / 2, 16); + scaler + .resize_rgb_u16(&src_store, &mut target_store) + .unwrap(); + let target_data = target_store.buffer.borrow(); + + let resized = target_data[0]; + assert_ne!(resized, 0); + } + + #[test] + fn check_rgba16_resizing_vertical() { + let image_width = 8; + let image_height = 8; + const CN: usize = 4; + let mut image = vec![0u16; image_height * image_width * CN]; + image[image_width * CN * (image_height.div_ceil(2)) + (image_width - 1) * CN] = 174; + let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); + scaler.set_threading_policy(ThreadingPolicy::Single); + let mut src_store = ImageStore::from_slice(&image, image_width, image_height).unwrap(); + src_store.bit_depth = 10; + let mut target_store = ImageStoreMut::alloc_with_depth(image_width, image_height / 2, 16); + scaler + .resize_rgba_u16(&src_store, &mut target_store, false) + .unwrap(); + let target_data = target_store.buffer.borrow(); + + let resized = target_data + [image_width * CN * ((image_height / 2).div_ceil(2)) + (image_width - 1) * CN]; + assert_ne!(resized, 0); + } + + #[test] + fn check_rgba8_nearest_vertical() { + let image_width = 255; + let image_height = 512; + const CN: usize = 4; + let mut image = vec![0u8; image_height * image_width * CN]; + image[image_width * CN * (image_height.div_ceil(2)) + (image_width - 1) * CN] = 174; + let mut scaler = Scaler::new(ResamplingFunction::Nearest); + scaler.set_threading_policy(ThreadingPolicy::Single); + let src_store = ImageStore::from_slice(&image, image_width, image_height).unwrap(); + let mut target_store = ImageStoreMut::alloc(image_width, image_height / 2); + scaler + .resize_rgba(&src_store, &mut target_store, false) + .unwrap(); + let target_data = 
target_store.buffer.borrow(); + + let resized = target_data + [image_width * CN * ((image_height / 2).div_ceil(2)) + (image_width - 1) * CN]; + assert_eq!(resized, 174); + } +} diff --git a/src/sse/alpha_f16.rs b/src/sse/alpha_f16.rs index b5ed21d..43f8bd0 100644 --- a/src/sse/alpha_f16.rs +++ b/src/sse/alpha_f16.rs @@ -195,78 +195,76 @@ unsafe fn sse_unpremultiply_alpha_rgba_f16c( unsafe fn sse_unpremultiply_alpha_rgba_f16_row_impl(in_place: &mut [half::f16]) { let mut rem = in_place; - unsafe { - for dst in rem.chunks_exact_mut(8 * 4) { - let src_ptr = dst.as_ptr(); - let lane0 = _mm_loadu_si128(src_ptr as *const __m128i); - let lane1 = _mm_loadu_si128(src_ptr.add(8) as *const __m128i); - let lane2 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); - let lane3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); - let pixel = sse_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); - - let low_alpha = _mm_cvtph_psx::(pixel.3); - let zeros = _mm_setzero_ps(); - let low_alpha_zero_mask = _mm_cmpeq_ps(low_alpha, zeros); - let low_r = _mm_blendv_ps( - _mm_mul_ps(_mm_cvtph_psx::(pixel.0), low_alpha), - zeros, - low_alpha_zero_mask, - ); - let low_g = _mm_blendv_ps( - _mm_mul_ps(_mm_cvtph_psx::(pixel.1), low_alpha), - zeros, - low_alpha_zero_mask, - ); - let low_b = _mm_blendv_ps( - _mm_mul_ps(_mm_cvtph_psx::(pixel.2), low_alpha), - zeros, - low_alpha_zero_mask, - ); + for dst in rem.chunks_exact_mut(8 * 4) { + let src_ptr = dst.as_ptr(); + let lane0 = _mm_loadu_si128(src_ptr as *const __m128i); + let lane1 = _mm_loadu_si128(src_ptr.add(8) as *const __m128i); + let lane2 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); + let lane3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); + let pixel = sse_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); - let high_alpha = _mm_cvtph_psx::(_mm_srli_si128::<8>(pixel.3)); - let high_alpha_zero_mask = _mm_cmpeq_ps(high_alpha, zeros); - let high_r = _mm_blendv_ps( - _mm_mul_ps( - _mm_cvtph_psx::(_mm_srli_si128::<8>(pixel.0)), - high_alpha, - ), - zeros, - high_alpha_zero_mask, - ); - let high_g = _mm_blendv_ps( - _mm_mul_ps( - _mm_cvtph_psx::(_mm_srli_si128::<8>(pixel.1)), - high_alpha, - ), - zeros, - high_alpha_zero_mask, - ); - let high_b = _mm_blendv_ps( - _mm_mul_ps( - _mm_cvtph_psx::(_mm_srli_si128::<8>(pixel.2)), - high_alpha, - ), - zeros, - high_alpha_zero_mask, - ); - let r_values = - _mm_unpacklo_epi64(_mm_cvtps_phx::(low_r), _mm_cvtps_phx::(high_r)); - let g_values = - _mm_unpacklo_epi64(_mm_cvtps_phx::(low_g), _mm_cvtps_phx::(high_g)); - let b_values = - _mm_unpacklo_epi64(_mm_cvtps_phx::(low_b), _mm_cvtps_phx::(high_b)); - let dst_ptr = dst.as_mut_ptr(); - let (d_lane0, d_lane1, d_lane2, d_lane3) = - sse_interleave_rgba_epi16(r_values, g_values, b_values, pixel.3); - _mm_storeu_si128(dst_ptr as *mut __m128i, d_lane0); - _mm_storeu_si128(dst_ptr.add(8) as *mut __m128i, d_lane1); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, d_lane2); - _mm_storeu_si128(dst_ptr.add(24) as *mut __m128i, d_lane3); - } + let low_alpha = _mm_cvtph_psx::(pixel.3); + let zeros = _mm_setzero_ps(); + let low_alpha_zero_mask = _mm_cmpeq_ps(low_alpha, zeros); + let low_r = _mm_blendv_ps( + _mm_mul_ps(_mm_cvtph_psx::(pixel.0), low_alpha), + zeros, + low_alpha_zero_mask, + ); + let low_g = _mm_blendv_ps( + _mm_mul_ps(_mm_cvtph_psx::(pixel.1), low_alpha), + zeros, + low_alpha_zero_mask, + ); + let low_b = _mm_blendv_ps( + _mm_mul_ps(_mm_cvtph_psx::(pixel.2), low_alpha), + zeros, + low_alpha_zero_mask, + ); - rem = rem.chunks_exact_mut(8 * 
4).into_remainder(); + let high_alpha = _mm_cvtph_psx::(_mm_srli_si128::<8>(pixel.3)); + let high_alpha_zero_mask = _mm_cmpeq_ps(high_alpha, zeros); + let high_r = _mm_blendv_ps( + _mm_mul_ps( + _mm_cvtph_psx::(_mm_srli_si128::<8>(pixel.0)), + high_alpha, + ), + zeros, + high_alpha_zero_mask, + ); + let high_g = _mm_blendv_ps( + _mm_mul_ps( + _mm_cvtph_psx::(_mm_srli_si128::<8>(pixel.1)), + high_alpha, + ), + zeros, + high_alpha_zero_mask, + ); + let high_b = _mm_blendv_ps( + _mm_mul_ps( + _mm_cvtph_psx::(_mm_srli_si128::<8>(pixel.2)), + high_alpha, + ), + zeros, + high_alpha_zero_mask, + ); + let r_values = + _mm_unpacklo_epi64(_mm_cvtps_phx::(low_r), _mm_cvtps_phx::(high_r)); + let g_values = + _mm_unpacklo_epi64(_mm_cvtps_phx::(low_g), _mm_cvtps_phx::(high_g)); + let b_values = + _mm_unpacklo_epi64(_mm_cvtps_phx::(low_b), _mm_cvtps_phx::(high_b)); + let dst_ptr = dst.as_mut_ptr(); + let (d_lane0, d_lane1, d_lane2, d_lane3) = + sse_interleave_rgba_epi16(r_values, g_values, b_values, pixel.3); + _mm_storeu_si128(dst_ptr as *mut __m128i, d_lane0); + _mm_storeu_si128(dst_ptr.add(8) as *mut __m128i, d_lane1); + _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, d_lane2); + _mm_storeu_si128(dst_ptr.add(24) as *mut __m128i, d_lane3); } + rem = rem.chunks_exact_mut(8 * 4).into_remainder(); + unpremultiply_pixel_f16_row(rem); } diff --git a/src/sse/alpha_u16.rs b/src/sse/alpha_u16.rs index 38e79e4..5910361 100644 --- a/src/sse/alpha_u16.rs +++ b/src/sse/alpha_u16.rs @@ -88,6 +88,17 @@ pub(crate) unsafe fn _mm_div_by_65535_epi32(v: __m128i) -> __m128i { _mm_srli_epi32::(_mm_add_epi32(v, _mm_srli_epi32::(v))) } +#[inline(always)] +unsafe fn _mm_div_by(v: __m128i) -> __m128i { + if BIT_DEPTH == 10 { + _mm_div_by_1023_epi32(v) + } else if BIT_DEPTH == 12 { + _mm_div_by_4095_epi32(v) + } else { + _mm_div_by_65535_epi32(v) + } +} + pub(crate) fn unpremultiply_alpha_sse_rgba_u16( in_place: &mut [u16], width: usize, @@ -226,169 +237,144 @@ pub(crate) fn premultiply_alpha_sse_rgba_u16( } } -#[target_feature(enable = "sse4.1")] -unsafe fn premultiply_alpha_sse_rgba_u16_row_impl(dst: &mut [u16], src: &[u16], bit_depth: usize) { - let max_colors = (1 << bit_depth) - 1; +trait Sse41PremultiplyExecutor { + unsafe fn premultiply(&self, dst: &mut [u16], src: &[u16], bit_depth: usize); +} - let mut rem = dst; - let mut src_rem = src; +#[derive(Default)] +struct Sse41PremultiplyExecutorDefault {} - unsafe { - if bit_depth == 10 { - let zeros = _mm_setzero_si128(); - for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { - let src_ptr = src.as_ptr(); - let row0 = _mm_loadu_si128(src_ptr as *const __m128i); - let row1 = _mm_loadu_si128(src_ptr.add(8) as *const __m128i); - let row2 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); - let row3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); - let (rrrr, gggg, bbbb, aaaa) = sse_deinterleave_rgba_epi16(row0, row1, row2, row3); - - let a_lo_f = _mm_unpacklo_epi16(aaaa, zeros); - let a_hi_f = _mm_unpackhi_epi16(aaaa, zeros); - - let new_rrrr = _mm_packus_epi32( - _mm_div_by_1023_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(rrrr, zeros), a_lo_f)), - _mm_div_by_1023_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(rrrr, zeros), a_hi_f)), - ); - let new_gggg = _mm_packus_epi32( - _mm_div_by_1023_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(gggg, zeros), a_lo_f)), - _mm_div_by_1023_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(gggg, zeros), a_hi_f)), - ); - let new_bbbb = _mm_packus_epi32( - _mm_div_by_1023_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(bbbb, zeros), 
a_lo_f)), - _mm_div_by_1023_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(bbbb, zeros), a_hi_f)), - ); - - let (rgba0, rgba1, rgba2, rgba3) = - sse_interleave_rgba_epi16(new_rrrr, new_gggg, new_bbbb, aaaa); - - let dst_ptr = dst.as_mut_ptr(); - _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); - _mm_storeu_si128(dst_ptr.add(8) as *mut __m128i, rgba1); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba2); - _mm_storeu_si128(dst_ptr.add(24) as *mut __m128i, rgba3); - } - } else if bit_depth == 12 { - let zeros = _mm_setzero_si128(); - for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { - let src_ptr = src.as_ptr(); - let row0 = _mm_loadu_si128(src_ptr as *const __m128i); - let row1 = _mm_loadu_si128(src_ptr.add(8) as *const __m128i); - let row2 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); - let row3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); - let (rrrr, gggg, bbbb, aaaa) = sse_deinterleave_rgba_epi16(row0, row1, row2, row3); - - let a_lo_f = _mm_unpacklo_epi16(aaaa, zeros); - let a_hi_f = _mm_unpackhi_epi16(aaaa, zeros); - - let new_rrrr = _mm_packus_epi32( - _mm_div_by_4095_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(rrrr, zeros), a_lo_f)), - _mm_div_by_4095_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(rrrr, zeros), a_hi_f)), - ); - let new_gggg = _mm_packus_epi32( - _mm_div_by_4095_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(gggg, zeros), a_lo_f)), - _mm_div_by_4095_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(gggg, zeros), a_hi_f)), - ); - let new_bbbb = _mm_packus_epi32( - _mm_div_by_4095_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(bbbb, zeros), a_lo_f)), - _mm_div_by_4095_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(bbbb, zeros), a_hi_f)), - ); - - let (rgba0, rgba1, rgba2, rgba3) = - sse_interleave_rgba_epi16(new_rrrr, new_gggg, new_bbbb, aaaa); - - let dst_ptr = dst.as_mut_ptr(); - _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); - _mm_storeu_si128(dst_ptr.add(8) as *mut __m128i, rgba1); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba2); - _mm_storeu_si128(dst_ptr.add(24) as *mut __m128i, rgba3); - } - } else if bit_depth == 16 { - let zeros = _mm_setzero_si128(); - for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { - let src_ptr = src.as_ptr(); - let row0 = _mm_loadu_si128(src_ptr as *const __m128i); - let row1 = _mm_loadu_si128(src_ptr.add(8) as *const __m128i); - let row2 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); - let row3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); - let (rrrr, gggg, bbbb, aaaa) = sse_deinterleave_rgba_epi16(row0, row1, row2, row3); - - let a_lo_f = _mm_unpacklo_epi16(aaaa, zeros); - let a_hi_f = _mm_unpackhi_epi16(aaaa, zeros); - - let new_rrrr = _mm_packus_epi32( - _mm_div_by_65535_epi32(_mm_mullo_epi32( - _mm_unpacklo_epi16(rrrr, zeros), - a_lo_f, - )), - _mm_div_by_65535_epi32(_mm_mullo_epi32( - _mm_unpackhi_epi16(rrrr, zeros), - a_hi_f, - )), - ); - let new_gggg = _mm_packus_epi32( - _mm_div_by_65535_epi32(_mm_mullo_epi32( - _mm_unpacklo_epi16(gggg, zeros), - a_lo_f, - )), - _mm_div_by_65535_epi32(_mm_mullo_epi32( - _mm_unpackhi_epi16(gggg, zeros), - a_hi_f, - )), - ); - let new_bbbb = _mm_packus_epi32( - _mm_div_by_65535_epi32(_mm_mullo_epi32( - _mm_unpacklo_epi16(bbbb, zeros), - a_lo_f, - )), - _mm_div_by_65535_epi32(_mm_mullo_epi32( - _mm_unpackhi_epi16(bbbb, zeros), - a_hi_f, - )), - ); - - let (rgba0, rgba1, rgba2, rgba3) = - sse_interleave_rgba_epi16(new_rrrr, new_gggg, new_bbbb, aaaa); - - let dst_ptr = dst.as_mut_ptr(); - _mm_storeu_si128(dst_ptr as *mut __m128i, 
rgba0); - _mm_storeu_si128(dst_ptr.add(8) as *mut __m128i, rgba1); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba2); - _mm_storeu_si128(dst_ptr.add(24) as *mut __m128i, rgba3); - } - } else { - let v_max_colors_scale = - _mm_div_ps(_mm_set1_ps(1.), _mm_cvtepi32_ps(_mm_set1_epi32(max_colors))); - for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { - let src_ptr = src.as_ptr(); - let row0 = _mm_loadu_si128(src_ptr as *const __m128i); - let row1 = _mm_loadu_si128(src_ptr.add(8) as *const __m128i); - let row2 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); - let row3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); - let (rrrr, gggg, bbbb, aaaa) = sse_deinterleave_rgba_epi16(row0, row1, row2, row3); - - let a_lo_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(aaaa, _mm_setzero_si128())); - let a_hi_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(aaaa, _mm_setzero_si128())); - - let new_rrrr = sse_premultiply_row_u16(rrrr, a_lo_f, a_hi_f, v_max_colors_scale); - let new_gggg = sse_premultiply_row_u16(gggg, a_lo_f, a_hi_f, v_max_colors_scale); - let new_bbbb = sse_premultiply_row_u16(bbbb, a_lo_f, a_hi_f, v_max_colors_scale); - - let (rgba0, rgba1, rgba2, rgba3) = - sse_interleave_rgba_epi16(new_rrrr, new_gggg, new_bbbb, aaaa); - - let dst_ptr = dst.as_mut_ptr(); - _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); - _mm_storeu_si128(dst_ptr.add(8) as *mut __m128i, rgba1); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba2); - _mm_storeu_si128(dst_ptr.add(24) as *mut __m128i, rgba3); - } +impl Sse41PremultiplyExecutor + for Sse41PremultiplyExecutorDefault +{ + #[target_feature(enable = "sse4.1")] + unsafe fn premultiply(&self, dst: &mut [u16], src: &[u16], bit_depth: usize) { + let max_colors = (1 << bit_depth) - 1; + + let mut rem = dst; + let mut src_rem = src; + + let zeros = _mm_setzero_si128(); + for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { + let src_ptr = src.as_ptr(); + let row0 = _mm_loadu_si128(src_ptr as *const __m128i); + let row1 = _mm_loadu_si128(src_ptr.add(8) as *const __m128i); + let row2 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); + let row3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); + let (rrrr, gggg, bbbb, aaaa) = sse_deinterleave_rgba_epi16(row0, row1, row2, row3); + + let a_lo_f = _mm_unpacklo_epi16(aaaa, zeros); + let a_hi_f = _mm_unpackhi_epi16(aaaa, zeros); + + let new_rrrr = _mm_packus_epi32( + _mm_div_by::(_mm_madd_epi16(_mm_unpacklo_epi16(rrrr, zeros), a_lo_f)), + _mm_div_by::(_mm_madd_epi16(_mm_unpackhi_epi16(rrrr, zeros), a_hi_f)), + ); + let new_gggg = _mm_packus_epi32( + _mm_div_by::(_mm_madd_epi16(_mm_unpacklo_epi16(gggg, zeros), a_lo_f)), + _mm_div_by::(_mm_madd_epi16(_mm_unpackhi_epi16(gggg, zeros), a_hi_f)), + ); + let new_bbbb = _mm_packus_epi32( + _mm_div_by::(_mm_madd_epi16(_mm_unpacklo_epi16(bbbb, zeros), a_lo_f)), + _mm_div_by::(_mm_madd_epi16(_mm_unpackhi_epi16(bbbb, zeros), a_hi_f)), + ); + + let (rgba0, rgba1, rgba2, rgba3) = + sse_interleave_rgba_epi16(new_rrrr, new_gggg, new_bbbb, aaaa); + + let dst_ptr = dst.as_mut_ptr(); + _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); + _mm_storeu_si128(dst_ptr.add(8) as *mut __m128i, rgba1); + _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba2); + _mm_storeu_si128(dst_ptr.add(24) as *mut __m128i, rgba3); } rem = rem.chunks_exact_mut(8 * 4).into_remainder(); src_rem = src_rem.chunks_exact(8 * 4).remainder(); + + premultiply_alpha_rgba_row(rem, src_rem, max_colors as u32); } +} + +#[derive(Default)] +struct 
Sse41PremultiplyExecutorAny {} + +impl Sse41PremultiplyExecutor for Sse41PremultiplyExecutorAny { + #[target_feature(enable = "sse4.1")] + unsafe fn premultiply(&self, dst: &mut [u16], src: &[u16], bit_depth: usize) { + let max_colors = (1 << bit_depth) - 1; + + let mut rem = dst; + let mut src_rem = src; - premultiply_alpha_rgba_row(rem, src_rem, max_colors as u32); + let v_max_colors_scale = + _mm_div_ps(_mm_set1_ps(1.), _mm_cvtepi32_ps(_mm_set1_epi32(max_colors))); + for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) { + let src_ptr = src.as_ptr(); + let row0 = _mm_loadu_si128(src_ptr as *const __m128i); + let row1 = _mm_loadu_si128(src_ptr.add(8) as *const __m128i); + let row2 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); + let row3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); + let (rrrr, gggg, bbbb, aaaa) = sse_deinterleave_rgba_epi16(row0, row1, row2, row3); + + let a_lo_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(aaaa, _mm_setzero_si128())); + let a_hi_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(aaaa, _mm_setzero_si128())); + + let new_rrrr = sse_premultiply_row_u16(rrrr, a_lo_f, a_hi_f, v_max_colors_scale); + let new_gggg = sse_premultiply_row_u16(gggg, a_lo_f, a_hi_f, v_max_colors_scale); + let new_bbbb = sse_premultiply_row_u16(bbbb, a_lo_f, a_hi_f, v_max_colors_scale); + + let (rgba0, rgba1, rgba2, rgba3) = + sse_interleave_rgba_epi16(new_rrrr, new_gggg, new_bbbb, aaaa); + + let dst_ptr = dst.as_mut_ptr(); + _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); + _mm_storeu_si128(dst_ptr.add(8) as *mut __m128i, rgba1); + _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba2); + _mm_storeu_si128(dst_ptr.add(24) as *mut __m128i, rgba3); + } + + rem = rem.chunks_exact_mut(8 * 4).into_remainder(); + src_rem = src_rem.chunks_exact(8 * 4).remainder(); + + premultiply_alpha_rgba_row(rem, src_rem, max_colors as u32); + } +} + +#[target_feature(enable = "sse4.1")] +unsafe fn pma_sse41_rgba16_dispatch( + dst: &mut [u16], + src: &[u16], + bit_depth: usize, + executor: impl Sse41PremultiplyExecutor, +) { + executor.premultiply(dst, src, bit_depth); +} + +#[target_feature(enable = "sse4.1")] +unsafe fn premultiply_alpha_sse_rgba_u16_row_impl(dst: &mut [u16], src: &[u16], bit_depth: usize) { + if bit_depth == 10 { + pma_sse41_rgba16_dispatch( + dst, + src, + bit_depth, + Sse41PremultiplyExecutorDefault::<10>::default(), + ) + } else if bit_depth == 12 { + pma_sse41_rgba16_dispatch( + dst, + src, + bit_depth, + Sse41PremultiplyExecutorDefault::<12>::default(), + ) + } else if bit_depth == 16 { + pma_sse41_rgba16_dispatch( + dst, + src, + bit_depth, + Sse41PremultiplyExecutorDefault::<16>::default(), + ) + } else { + pma_sse41_rgba16_dispatch(dst, src, bit_depth, Sse41PremultiplyExecutorAny::default()) + } } diff --git a/src/sse/alpha_u8.rs b/src/sse/alpha_u8.rs index 32f01cd..53a75ab 100644 --- a/src/sse/alpha_u8.rs +++ b/src/sse/alpha_u8.rs @@ -111,55 +111,53 @@ unsafe fn sse_premultiply_alpha_rgba_impl_row(dst: &mut [u8], src: &[u8]) { let mut rem = dst; let mut src_rem = src; - unsafe { - let zeros = _mm_setzero_si128(); - for (dst, src) in rem - .chunks_exact_mut(16 * 4) - .zip(src_rem.chunks_exact(16 * 4)) - { - let src_ptr = src.as_ptr(); - let rgba0 = _mm_loadu_si128(src_ptr as *const __m128i); - let rgba1 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); - let rgba2 = _mm_loadu_si128(src_ptr.add(32) as *const __m128i); - let rgba3 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i); - let (rrr, ggg, bbb, aaa) = sse_deinterleave_rgba(rgba0, rgba1, 
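The refactor above moves the former 10/12/16-bit branches behind `Sse41PremultiplyExecutorDefault::<BIT_DEPTH>`, so the divisor is resolved at compile time instead of being re-checked in the hot loop. A scalar sketch of the same idea (illustrative only; the exact rounding of the SIMD helpers may differ):

// Premultiply one channel by alpha for a given bit depth; the divisor
// (2^BIT_DEPTH - 1) is picked at compile time, like `_mm_div_by::<BIT_DEPTH>` above.
fn premultiply_scalar<const BIT_DEPTH: usize>(v: u16, a: u16) -> u16 {
    let max = (1u32 << BIT_DEPTH) - 1; // 1023, 4095 or 65535
    ((v as u32 * a as u32 + max / 2) / max) as u16
}

// Dispatch once per row instead of branching per pixel.
fn premultiply_row(dst: &mut [u16], alpha: &[u16], bit_depth: usize) {
    match bit_depth {
        10 => dst.iter_mut().zip(alpha).for_each(|(v, &a)| *v = premultiply_scalar::<10>(*v, a)),
        12 => dst.iter_mut().zip(alpha).for_each(|(v, &a)| *v = premultiply_scalar::<12>(*v, a)),
        _ => dst.iter_mut().zip(alpha).for_each(|(v, &a)| *v = premultiply_scalar::<16>(*v, a)),
    }
}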
rgba2, rgba3); - - let mut rrr_low = _mm_unpacklo_epi8(rrr, zeros); - let mut rrr_high = _mm_unpackhi_epi8(rrr, zeros); - - let mut ggg_low = _mm_unpacklo_epi8(ggg, zeros); - let mut ggg_high = _mm_unpackhi_epi8(ggg, zeros); - - let mut bbb_low = _mm_unpacklo_epi8(bbb, zeros); - let mut bbb_high = _mm_unpackhi_epi8(bbb, zeros); - - let aaa_low = _mm_unpacklo_epi8(aaa, zeros); - let aaa_high = _mm_unpackhi_epi8(aaa, zeros); - - rrr_low = _mm_div_by_255_epi16(_mm_mullo_epi16(rrr_low, aaa_low)); - rrr_high = _mm_div_by_255_epi16(_mm_mullo_epi16(rrr_high, aaa_high)); - ggg_low = _mm_div_by_255_epi16(_mm_mullo_epi16(ggg_low, aaa_low)); - ggg_high = _mm_div_by_255_epi16(_mm_mullo_epi16(ggg_high, aaa_high)); - bbb_low = _mm_div_by_255_epi16(_mm_mullo_epi16(bbb_low, aaa_low)); - bbb_high = _mm_div_by_255_epi16(_mm_mullo_epi16(bbb_high, aaa_high)); - - let rrr = _mm_packus_epi16(rrr_low, rrr_high); - let ggg = _mm_packus_epi16(ggg_low, ggg_high); - let bbb = _mm_packus_epi16(bbb_low, bbb_high); - - let (rgba0, rgba1, rgba2, rgba3) = sse_interleave_rgba(rrr, ggg, bbb, aaa); - - let dst_ptr = dst.as_mut_ptr(); - _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba1); - _mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, rgba2); - _mm_storeu_si128(dst_ptr.add(48) as *mut __m128i, rgba3); - } - - rem = rem.chunks_exact_mut(16 * 4).into_remainder(); - src_rem = src_rem.chunks_exact(16 * 4).remainder(); + let zeros = _mm_setzero_si128(); + for (dst, src) in rem + .chunks_exact_mut(16 * 4) + .zip(src_rem.chunks_exact(16 * 4)) + { + let src_ptr = src.as_ptr(); + let rgba0 = _mm_loadu_si128(src_ptr as *const __m128i); + let rgba1 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); + let rgba2 = _mm_loadu_si128(src_ptr.add(32) as *const __m128i); + let rgba3 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i); + let (rrr, ggg, bbb, aaa) = sse_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); + + let mut rrr_low = _mm_unpacklo_epi8(rrr, zeros); + let mut rrr_high = _mm_unpackhi_epi8(rrr, zeros); + + let mut ggg_low = _mm_unpacklo_epi8(ggg, zeros); + let mut ggg_high = _mm_unpackhi_epi8(ggg, zeros); + + let mut bbb_low = _mm_unpacklo_epi8(bbb, zeros); + let mut bbb_high = _mm_unpackhi_epi8(bbb, zeros); + + let aaa_low = _mm_unpacklo_epi8(aaa, zeros); + let aaa_high = _mm_unpackhi_epi8(aaa, zeros); + + rrr_low = _mm_div_by_255_epi16(_mm_mullo_epi16(rrr_low, aaa_low)); + rrr_high = _mm_div_by_255_epi16(_mm_mullo_epi16(rrr_high, aaa_high)); + ggg_low = _mm_div_by_255_epi16(_mm_mullo_epi16(ggg_low, aaa_low)); + ggg_high = _mm_div_by_255_epi16(_mm_mullo_epi16(ggg_high, aaa_high)); + bbb_low = _mm_div_by_255_epi16(_mm_mullo_epi16(bbb_low, aaa_low)); + bbb_high = _mm_div_by_255_epi16(_mm_mullo_epi16(bbb_high, aaa_high)); + + let rrr = _mm_packus_epi16(rrr_low, rrr_high); + let ggg = _mm_packus_epi16(ggg_low, ggg_high); + let bbb = _mm_packus_epi16(bbb_low, bbb_high); + + let (rgba0, rgba1, rgba2, rgba3) = sse_interleave_rgba(rrr, ggg, bbb, aaa); + + let dst_ptr = dst.as_mut_ptr(); + _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); + _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba1); + _mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, rgba2); + _mm_storeu_si128(dst_ptr.add(48) as *mut __m128i, rgba3); } + rem = rem.chunks_exact_mut(16 * 4).into_remainder(); + src_rem = src_rem.chunks_exact(16 * 4).remainder(); + premultiply_alpha_rgba_row_impl(rem, src_rem); } @@ -203,31 +201,30 @@ pub(crate) fn sse_unpremultiply_alpha_rgba( #[target_feature(enable = 
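For reference, the `_mm_div_by_255_epi16(_mm_mullo_epi16(..))` step above corresponds to the usual integer approximation of `v * a / 255`; a scalar sketch (illustrative, and the SIMD helper's rounding may differ slightly):

// v * a / 255 with round-to-nearest, using the classic
// t = x + 128; (t + (t >> 8)) >> 8 trick instead of a real division.
fn premultiply_u8(v: u8, a: u8) -> u8 {
    let t = v as u32 * a as u32 + 128;
    ((t + (t >> 8)) >> 8) as u8
}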
"sse4.1")] unsafe fn sse_unpremultiply_alpha_rgba_impl_row(in_place: &mut [u8]) { let mut rem = in_place; - unsafe { - for dst in rem.chunks_exact_mut(16 * 4) { - let src_ptr = dst.as_ptr(); - let rgba0 = _mm_loadu_si128(src_ptr as *const __m128i); - let rgba1 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); - let rgba2 = _mm_loadu_si128(src_ptr.add(32) as *const __m128i); - let rgba3 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i); - let (rrr, ggg, bbb, aaa) = sse_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); - - let rrr = sse_unpremultiply_row(rrr, aaa); - let ggg = sse_unpremultiply_row(ggg, aaa); - let bbb = sse_unpremultiply_row(bbb, aaa); - - let (rgba0, rgba1, rgba2, rgba3) = sse_interleave_rgba(rrr, ggg, bbb, aaa); - - let dst_ptr = dst.as_mut_ptr(); - _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba1); - _mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, rgba2); - _mm_storeu_si128(dst_ptr.add(48) as *mut __m128i, rgba3); - } - rem = rem.chunks_exact_mut(8 * 4).into_remainder(); + for dst in rem.chunks_exact_mut(16 * 4) { + let src_ptr = dst.as_ptr(); + let rgba0 = _mm_loadu_si128(src_ptr as *const __m128i); + let rgba1 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i); + let rgba2 = _mm_loadu_si128(src_ptr.add(32) as *const __m128i); + let rgba3 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i); + let (rrr, ggg, bbb, aaa) = sse_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); + + let rrr = sse_unpremultiply_row(rrr, aaa); + let ggg = sse_unpremultiply_row(ggg, aaa); + let bbb = sse_unpremultiply_row(bbb, aaa); + + let (rgba0, rgba1, rgba2, rgba3) = sse_interleave_rgba(rrr, ggg, bbb, aaa); + + let dst_ptr = dst.as_mut_ptr(); + _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); + _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba1); + _mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, rgba2); + _mm_storeu_si128(dst_ptr.add(48) as *mut __m128i, rgba3); } + rem = rem.chunks_exact_mut(8 * 4).into_remainder(); + for dst in rem.chunks_exact_mut(4) { let a = dst[3]; if a != 0 { diff --git a/src/sse/check_alpha.rs b/src/sse/check_alpha.rs new file mode 100644 index 0000000..85133e3 --- /dev/null +++ b/src/sse/check_alpha.rs @@ -0,0 +1,219 @@ +/* + * Copyright (c) Radzivon Bartoshyk 12/2024. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +use crate::sse::_mm_hsum_epi32; +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +/// Checks if image has constant alpha by xor rows +pub(crate) fn sse_has_non_constant_cap_alpha_rgba8( + store: &[u8], + width: usize, + stride: usize, +) -> bool { + unsafe { sse_has_non_constant_cap_alpha_rgba8_impl(store, width, stride) } +} + +#[target_feature(enable = "sse4.1")] +unsafe fn sse_has_non_constant_cap_alpha_rgba8_impl( + store: &[u8], + width: usize, + stride: usize, +) -> bool { + if store.is_empty() { + return true; + } + + let sh0 = _mm_setr_epi8(3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1); + + let first_alpha = store[3]; + let def_alpha = _mm_set1_epi32(first_alpha as i32); + + for row in store.chunks_exact(stride) { + let row = &row[0..width * 4]; + let mut sums = _mm_set1_epi32(0); + for chunk in row.chunks_exact(16 * 4) { + let mut r0 = _mm_loadu_si128(chunk.as_ptr() as *const __m128i); + let mut r1 = _mm_loadu_si128(chunk.get_unchecked(16..).as_ptr() as *const __m128i); + let mut r2 = _mm_loadu_si128(chunk.get_unchecked(32..).as_ptr() as *const __m128i); + let mut r3 = _mm_loadu_si128(chunk.get_unchecked(48..).as_ptr() as *const __m128i); + + r0 = _mm_xor_si128(_mm_shuffle_epi8(r0, sh0), def_alpha); + r1 = _mm_xor_si128(_mm_shuffle_epi8(r1, sh0), def_alpha); + r2 = _mm_xor_si128(_mm_shuffle_epi8(r2, sh0), def_alpha); + r3 = _mm_xor_si128(_mm_shuffle_epi8(r3, sh0), def_alpha); + + sums = _mm_add_epi32(sums, r0); + sums = _mm_add_epi32(sums, r1); + sums = _mm_add_epi32(sums, r2); + sums = _mm_add_epi32(sums, r3); + } + + let row = row.chunks_exact(16 * 4).remainder(); + + for chunk in row.chunks_exact(16) { + let mut r0 = _mm_loadu_si128(chunk.as_ptr() as *const __m128i); + + r0 = _mm_shuffle_epi8(r0, sh0); + + let alphas = _mm_xor_si128(r0, def_alpha); + + sums = _mm_add_epi32(sums, alphas); + } + + let row = row.chunks_exact(16).remainder(); + + let mut h_sum = _mm_hsum_epi32(sums); + + for chunk in row.chunks_exact(4) { + h_sum += chunk[3] as i32 ^ first_alpha as i32; + } + + if h_sum != 0 { + return true; + } + } + false +} + +/// Checks if image has constant alpha by xor rows for image 16bits +pub(crate) fn sse_has_non_constant_cap_alpha_rgba16( + store: &[u16], + width: usize, + stride: usize, +) -> bool { + unsafe { sse_has_non_constant_cap_alpha_rgba16_impl(store, width, stride) } +} + +#[target_feature(enable = "sse4.1")] +unsafe fn sse_has_non_constant_cap_alpha_rgba16_impl( + store: &[u16], + width: usize, + stride: usize, +) -> bool { + if store.is_empty() { + return true; + } + + let sh0 = _mm_setr_epi8(6, 7, -1, -1, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + + let first_alpha = store[3]; + let def_alpha = _mm_set1_epi32(first_alpha as i32); + + for row in store.chunks_exact(stride) { + let row = &row[0..width * 4]; + let mut sums = _mm_set1_epi32(0); + for chunk in row.chunks_exact(8 * 4) { + let mut r0 = _mm_loadu_si128(chunk.as_ptr() as *const 
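A scalar reference for the constant-alpha check introduced in this file (illustrative; the SSE version accumulates the XORed alphas with vector adds and a horizontal sum, but the decision it makes is the same):

// Returns true when at least one alpha value differs from the first one,
// i.e. alpha is not constant across the image.
fn has_non_constant_alpha_rgba8(store: &[u8], width: usize, stride: usize) -> bool {
    if store.is_empty() {
        return true;
    }
    let first_alpha = store[3];
    for row in store.chunks_exact(stride) {
        let row = &row[..width * 4];
        if row.chunks_exact(4).any(|px| px[3] != first_alpha) {
            return true;
        }
    }
    false
}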
__m128i); + let mut r1 = _mm_loadu_si128(chunk.get_unchecked(8..).as_ptr() as *const __m128i); + let mut r2 = _mm_loadu_si128(chunk.get_unchecked(16..).as_ptr() as *const __m128i); + let mut r3 = _mm_loadu_si128(chunk.get_unchecked(24..).as_ptr() as *const __m128i); + + r0 = _mm_shuffle_epi8(r0, sh0); + r1 = _mm_shuffle_epi8(r1, sh0); + r2 = _mm_shuffle_epi8(r2, sh0); + r3 = _mm_shuffle_epi8(r3, sh0); + + let r01 = _mm_xor_si128(_mm_unpacklo_epi32(r0, r1), def_alpha); + let r23 = _mm_xor_si128(_mm_unpacklo_epi32(r2, r3), def_alpha); + + sums = _mm_add_epi32(sums, r01); + sums = _mm_add_epi32(sums, r23); + } + + let row = row.chunks_exact(8 * 4).remainder(); + + for chunk in row.chunks_exact(8) { + let mut r0 = _mm_loadu_si128(chunk.as_ptr() as *const __m128i); + + r0 = _mm_shuffle_epi8(r0, sh0); + + let alphas = _mm_xor_si128(_mm_unpacklo_epi32(r0, r0), def_alpha); + + sums = _mm_add_epi32(sums, alphas); + } + + let row = row.chunks_exact(8).remainder(); + + let mut h_sum = _mm_hsum_epi32(sums); + + for chunk in row.chunks_exact(4) { + h_sum += chunk[3] as i32 ^ first_alpha as i32; + } + + if h_sum != 0 { + return true; + } + } + false +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn check_alpha_exists_rgba8() { + let image_size = 256usize; + let mut image = vec![0u8; image_size * image_size * 4]; + image[3 + 150 * 4] = 75; + let has_alpha = sse_has_non_constant_cap_alpha_rgba8(&image, image_size, image_size * 4); + assert_eq!(true, has_alpha); + } + + #[test] + fn check_alpha_exists_rgba16() { + let image_size = 256usize; + let mut image = vec![0u16; image_size * image_size * 4]; + image[3] = 715; + image[7] = 715; + image[11] = 715; + image[15] = 715; + let has_alpha = sse_has_non_constant_cap_alpha_rgba16(&image, image_size, image_size * 4); + assert_eq!(true, has_alpha); + } + + #[test] + fn check_alpha_not_exists_rgba8() { + let image_size = 256usize; + let image = vec![255u8; image_size * image_size * 4]; + let has_alpha = sse_has_non_constant_cap_alpha_rgba8(&image, image_size, image_size * 4); + assert_eq!(false, has_alpha); + } + + #[test] + fn check_alpha_not_exists_rgba16() { + let image_size = 256usize; + let image = vec![255u16; image_size * image_size * 4]; + let has_alpha = sse_has_non_constant_cap_alpha_rgba16(&image, image_size, image_size * 4); + assert_eq!(false, has_alpha); + } +} diff --git a/src/sse/mod.rs b/src/sse/mod.rs index 6726e82..45417dd 100644 --- a/src/sse/mod.rs +++ b/src/sse/mod.rs @@ -32,6 +32,7 @@ mod alpha_f16; mod alpha_f32; mod alpha_u16; mod alpha_u8; +mod check_alpha; #[cfg(feature = "half")] mod f16_utils; mod plane_f32; @@ -67,6 +68,9 @@ pub(crate) use alpha_u8::{ _mm_div_by_255_epi16, sse_premultiply_alpha_rgba, sse_unpremultiply_alpha_rgba, sse_unpremultiply_row, }; +pub(crate) use check_alpha::{ + sse_has_non_constant_cap_alpha_rgba16, sse_has_non_constant_cap_alpha_rgba8, +}; pub(crate) use plane_f32::convolve_horizontal_plane_sse_row_one; pub(crate) use plane_f32::convolve_horizontal_plane_sse_rows_4; pub(crate) use plane_u8::{ diff --git a/src/sse/plane_f32.rs b/src/sse/plane_f32.rs index 99ed16a..7c83a9a 100644 --- a/src/sse/plane_f32.rs +++ b/src/sse/plane_f32.rs @@ -316,7 +316,7 @@ unsafe fn convolve_horizontal_plane_sse_rows_4_fma( ); } -#[inline] +#[inline(always)] unsafe fn convolve_horizontal_plane_sse_rows_4_impl( dst_width: usize, _: usize, @@ -326,139 +326,135 @@ unsafe fn convolve_horizontal_plane_sse_rows_4_impl( dst: &mut [f32], dst_stride: usize, ) { - unsafe { - let mut filter_offset = 0usize; - let zeros = 
_mm_setzero_ps(); - let weights_ptr = filter_weights.weights.as_ptr(); - - for x in 0..dst_width { - let bounds = filter_weights.bounds.get_unchecked(x); - let mut jx = 0usize; - let mut store_0 = zeros; - let mut store_1 = zeros; - let mut store_2 = zeros; - let mut store_3 = zeros; - - while jx + 16 < bounds.size { - let ptr = weights_ptr.add(jx + filter_offset); - let read_weights0 = _mm_loadu_ps(ptr); - let read_weights1 = _mm_loadu_ps(ptr.add(4)); - let read_weights2 = _mm_loadu_ps(ptr.add(8)); - let read_weights3 = _mm_loadu_ps(ptr.add(12)); - let weights = (read_weights0, read_weights1, read_weights2, read_weights3); - let bounds_start = bounds.start + jx; - store_0 = conv_horiz_plane_16_f32!(bounds_start, src, weights, store_0, FMA); - let s_ptr_1 = src.get_unchecked(src_stride..); - store_1 = conv_horiz_plane_16_f32!(bounds_start, s_ptr_1, weights, store_1, FMA); - let s_ptr2 = src.get_unchecked(src_stride * 2..); - store_2 = conv_horiz_plane_16_f32!(bounds_start, s_ptr2, weights, store_2, FMA); - let s_ptr3 = src.get_unchecked(src_stride * 3..); - store_3 = conv_horiz_plane_16_f32!(bounds_start, s_ptr3, weights, store_3, FMA); - jx += 16; - } - - while jx + 8 < bounds.size { - let ptr = weights_ptr.add(jx + filter_offset); - let read_weights0 = _mm_loadu_ps(ptr); - let read_weights1 = _mm_loadu_ps(ptr.add(4)); - let read_weights = (read_weights0, read_weights1); - let bounds_start = bounds.start + jx; - store_0 = conv_horiz_plane_8_f32!( - bounds_start, - src, - read_weights.0, - read_weights.1, - store_0, - FMA - ); - let s_ptr_1 = src.get_unchecked(src_stride..); - store_1 = conv_horiz_plane_8_f32!( - bounds_start, - s_ptr_1, - read_weights.0, - read_weights.1, - store_1, - FMA - ); - let s_ptr2 = src.get_unchecked(src_stride * 2..); - store_2 = conv_horiz_plane_8_f32!( - bounds_start, - s_ptr2, - read_weights.0, - read_weights.1, - store_2, - FMA - ); - let s_ptr3 = src.get_unchecked(src_stride * 3..); - store_3 = conv_horiz_plane_8_f32!( - bounds_start, - s_ptr3, - read_weights.0, - read_weights.1, - store_3, - FMA - ); - jx += 8; - } - - while jx + 4 < bounds.size { - let ptr = weights_ptr.add(jx + filter_offset); - let read_weights = _mm_loadu_ps(ptr); - let bounds_start = bounds.start + jx; - store_0 = conv_horiz_plane_4_f32!(bounds_start, src, read_weights, store_0, FMA); - let s_ptr_1 = src.get_unchecked(src_stride..); - store_1 = - conv_horiz_plane_4_f32!(bounds_start, s_ptr_1, read_weights, store_1, FMA); - let s_ptr2 = src.get_unchecked(src_stride * 2..); - store_2 = conv_horiz_plane_4_f32!(bounds_start, s_ptr2, read_weights, store_2, FMA); - let s_ptr3 = src.get_unchecked(src_stride * 3..); - store_3 = conv_horiz_plane_4_f32!(bounds_start, s_ptr3, read_weights, store_3, FMA); - jx += 4; - } - - while jx + 2 < bounds.size { - let ptr = weights_ptr.add(jx + filter_offset); - let weights = - _mm_setr_ps(ptr.read_unaligned(), ptr.add(1).read_unaligned(), 0., 0.); - let bounds_start = bounds.start + jx; - store_0 = conv_horiz_plane_2_f32!(bounds_start, src, weights, store_0, FMA); - let ptr_1 = src.get_unchecked(src_stride..); - store_1 = conv_horiz_plane_2_f32!(bounds_start, ptr_1, weights, store_1, FMA); - let ptr_2 = src.get_unchecked(src_stride * 2..); - store_2 = conv_horiz_plane_2_f32!(bounds_start, ptr_2, weights, store_2, FMA); - let ptr_3 = src.get_unchecked(src_stride * 3..); - store_3 = conv_horiz_plane_2_f32!(bounds_start, ptr_3, weights, store_3, FMA); - jx += 2; - } - - while jx < bounds.size { - let ptr = weights_ptr.add(jx + filter_offset); - let weight0 = 
_mm_set1_ps(ptr.read_unaligned()); - let bounds_start = bounds.start + jx; - store_0 = conv_horiz_plane_1_f32!(bounds_start, src, weight0, store_0, FMA); - let ptr_1 = src.get_unchecked(src_stride..); - store_1 = conv_horiz_plane_1_f32!(bounds_start, ptr_1, weight0, store_1, FMA); - let ptr_2 = src.get_unchecked(src_stride * 2..); - store_2 = conv_horiz_plane_1_f32!(bounds_start, ptr_2, weight0, store_2, FMA); - let ptr_3 = src.get_unchecked(src_stride * 3..); - store_3 = conv_horiz_plane_1_f32!(bounds_start, ptr_3, weight0, store_3, FMA); - jx += 1; - } - - let px = x; - let dest_ptr = dst.get_unchecked_mut(px); - *dest_ptr = _mm_hsum_ps(store_0); - - let dest_ptr = dst.get_unchecked_mut(px + dst_stride); - *dest_ptr = _mm_hsum_ps(store_1); - - let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 2); - *dest_ptr = _mm_hsum_ps(store_2); - - let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 3); - *dest_ptr = _mm_hsum_ps(store_3); - - filter_offset += filter_weights.aligned_size; + let mut filter_offset = 0usize; + let zeros = _mm_setzero_ps(); + let weights_ptr = filter_weights.weights.as_ptr(); + + for x in 0..dst_width { + let bounds = filter_weights.bounds.get_unchecked(x); + let mut jx = 0usize; + let mut store_0 = zeros; + let mut store_1 = zeros; + let mut store_2 = zeros; + let mut store_3 = zeros; + + while jx + 16 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let read_weights0 = _mm_loadu_ps(ptr); + let read_weights1 = _mm_loadu_ps(ptr.add(4)); + let read_weights2 = _mm_loadu_ps(ptr.add(8)); + let read_weights3 = _mm_loadu_ps(ptr.add(12)); + let weights = (read_weights0, read_weights1, read_weights2, read_weights3); + let bounds_start = bounds.start + jx; + store_0 = conv_horiz_plane_16_f32!(bounds_start, src, weights, store_0, FMA); + let s_ptr_1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_plane_16_f32!(bounds_start, s_ptr_1, weights, store_1, FMA); + let s_ptr2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_plane_16_f32!(bounds_start, s_ptr2, weights, store_2, FMA); + let s_ptr3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_plane_16_f32!(bounds_start, s_ptr3, weights, store_3, FMA); + jx += 16; } + + while jx + 8 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let read_weights0 = _mm_loadu_ps(ptr); + let read_weights1 = _mm_loadu_ps(ptr.add(4)); + let read_weights = (read_weights0, read_weights1); + let bounds_start = bounds.start + jx; + store_0 = conv_horiz_plane_8_f32!( + bounds_start, + src, + read_weights.0, + read_weights.1, + store_0, + FMA + ); + let s_ptr_1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_plane_8_f32!( + bounds_start, + s_ptr_1, + read_weights.0, + read_weights.1, + store_1, + FMA + ); + let s_ptr2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_plane_8_f32!( + bounds_start, + s_ptr2, + read_weights.0, + read_weights.1, + store_2, + FMA + ); + let s_ptr3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_plane_8_f32!( + bounds_start, + s_ptr3, + read_weights.0, + read_weights.1, + store_3, + FMA + ); + jx += 8; + } + + while jx + 4 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let read_weights = _mm_loadu_ps(ptr); + let bounds_start = bounds.start + jx; + store_0 = conv_horiz_plane_4_f32!(bounds_start, src, read_weights, store_0, FMA); + let s_ptr_1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_plane_4_f32!(bounds_start, s_ptr_1, read_weights, store_1, FMA); + let s_ptr2 = 
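What the unrolled 16/8/4/2/1-weight loops above compute, written as a plain scalar reference (illustrative; the SIMD code accumulates four rows at once and reduces each accumulator with `_mm_hsum_ps`):

// One output sample of the horizontal plane convolution: the dot product of
// the filter weights with the source window starting at `bounds_start`.
fn convolve_plane_sample(src: &[f32], weights: &[f32], bounds_start: usize) -> f32 {
    weights
        .iter()
        .enumerate()
        .map(|(j, &w)| src[bounds_start + j] * w)
        .sum()
}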
src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_plane_4_f32!(bounds_start, s_ptr2, read_weights, store_2, FMA); + let s_ptr3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_plane_4_f32!(bounds_start, s_ptr3, read_weights, store_3, FMA); + jx += 4; + } + + while jx + 2 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weights = _mm_setr_ps(ptr.read_unaligned(), ptr.add(1).read_unaligned(), 0., 0.); + let bounds_start = bounds.start + jx; + store_0 = conv_horiz_plane_2_f32!(bounds_start, src, weights, store_0, FMA); + let ptr_1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_plane_2_f32!(bounds_start, ptr_1, weights, store_1, FMA); + let ptr_2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_plane_2_f32!(bounds_start, ptr_2, weights, store_2, FMA); + let ptr_3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_plane_2_f32!(bounds_start, ptr_3, weights, store_3, FMA); + jx += 2; + } + + while jx < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = _mm_set1_ps(ptr.read_unaligned()); + let bounds_start = bounds.start + jx; + store_0 = conv_horiz_plane_1_f32!(bounds_start, src, weight0, store_0, FMA); + let ptr_1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_plane_1_f32!(bounds_start, ptr_1, weight0, store_1, FMA); + let ptr_2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_plane_1_f32!(bounds_start, ptr_2, weight0, store_2, FMA); + let ptr_3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_plane_1_f32!(bounds_start, ptr_3, weight0, store_3, FMA); + jx += 1; + } + + let px = x; + let dest_ptr = dst.get_unchecked_mut(px); + *dest_ptr = _mm_hsum_ps(store_0); + + let dest_ptr = dst.get_unchecked_mut(px + dst_stride); + *dest_ptr = _mm_hsum_ps(store_1); + + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 2); + *dest_ptr = _mm_hsum_ps(store_2); + + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 3); + *dest_ptr = _mm_hsum_ps(store_3); + + filter_offset += filter_weights.aligned_size; } } diff --git a/src/sse/rgb_f16.rs b/src/sse/rgb_f16.rs index ede3433..637b774 100644 --- a/src/sse/rgb_f16.rs +++ b/src/sse/rgb_f16.rs @@ -82,8 +82,7 @@ unsafe fn convolve_horizontal_parts_4_rgb_f16 let acc = _mm_prefer_fma_ps::(store_0, rgb_pixel_0, weight0); let acc = _mm_prefer_fma_ps::(acc, rgb_pixel_1, weight1); let acc = _mm_prefer_fma_ps::(acc, rgb_pixel_2, weight2); - let acc = _mm_prefer_fma_ps::(acc, rgb_pixel_3, weight3); - acc + _mm_prefer_fma_ps::(acc, rgb_pixel_3, weight3) } #[inline(always)] @@ -142,8 +141,7 @@ unsafe fn convolve_horizontal_parts_one_rgb_f16(store_0, rgb_pixel, weight0); - acc + _mm_prefer_fma_ps::(store_0, rgb_pixel, weight0) } pub(crate) fn convolve_horizontal_rgb_sse_row_one_f16( diff --git a/src/sse/rgb_u8.rs b/src/sse/rgb_u8.rs index ee6e357..cc1b041 100644 --- a/src/sse/rgb_u8.rs +++ b/src/sse/rgb_u8.rs @@ -69,7 +69,6 @@ unsafe fn load_rgb_x4(src: &[u8]) -> __m128i { rgb_pixel } -#[inline] #[target_feature(enable = "sse4.1")] unsafe fn convolve_horizontal_rgb_sse_rows_4_impl( src: &[u8], @@ -78,10 +77,9 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_impl( dst_stride: usize, filter_weights: &FilterWeights, ) { - unsafe { - const CHANNELS: usize = 3; + const CHANNELS: usize = 3; - #[rustfmt::skip] + #[rustfmt::skip] let shuffle_lo = _mm_setr_epi8(0, -1, 3, -1, 1, -1, @@ -91,7 +89,7 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_impl( -1, -1, -1, -1); - #[rustfmt::skip] + #[rustfmt::skip] let shuffle_hi = _mm_setr_epi8(6, 
-1, 9, -1, 7, -1, @@ -101,147 +99,142 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_impl( -1, -1, -1, -1); - let vld = _mm_set1_epi32(ROUNDING_CONST); - - let (row0_ref, rest) = dst.split_at_mut(dst_stride); - let (row1_ref, rest) = rest.split_at_mut(dst_stride); - let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); - - let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS); - let iter_row1 = row1_ref.chunks_exact_mut(CHANNELS); - let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS); - let iter_row3 = row3_ref.chunks_exact_mut(CHANNELS); - - for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 - .zip(iter_row1) - .zip(iter_row2) - .zip(iter_row3) - .zip(filter_weights.bounds.iter()) - .zip( - filter_weights - .weights - .chunks_exact(filter_weights.aligned_size), - ) - { - let mut jx = 0usize; - let mut store_0 = vld; - let mut store_1 = vld; - let mut store_2 = vld; - let mut store_3 = vld; - - let src0 = src; - let src1 = src0.get_unchecked(src_stride..); - let src2 = src1.get_unchecked(src_stride..); - let src3 = src2.get_unchecked(src_stride..); - - // Will make step in 4 items however since it is RGB it is necessary to make a safe offset - while jx + 4 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 4)); - let weights = _mm_loadu_si64(w_ptr.as_ptr() as *const u8); - const SHUFFLE_01: i32 = shuffle(0, 0, 0, 0); - let weight01 = _mm_shuffle_epi32::(weights); - const SHUFFLE_23: i32 = shuffle(1, 1, 1, 1); - let weight23 = _mm_shuffle_epi32::(weights); - let bounds_start = (bounds.start + jx) * CHANNELS; - - let rgb_pixel_0 = load_rgb_x4(src0.get_unchecked(bounds_start..)); - let rgb_pixel_1 = load_rgb_x4(src1.get_unchecked(bounds_start..)); - let rgb_pixel_2 = load_rgb_x4(src2.get_unchecked(bounds_start..)); - let rgb_pixel_4 = load_rgb_x4(src3.get_unchecked(bounds_start..)); - - let hi_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_hi); - let lo_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_lo); - let hi_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_hi); - let lo_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_lo); - let hi_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_hi); - let lo_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_lo); - let hi_3 = _mm_shuffle_epi8(rgb_pixel_4, shuffle_hi); - let lo_3 = _mm_shuffle_epi8(rgb_pixel_4, shuffle_lo); - - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(lo_0, weight01)); - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(hi_0, weight23)); - - store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo_1, weight01)); - store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(hi_1, weight23)); - - store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo_2, weight01)); - store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(hi_2, weight23)); - - store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo_3, weight01)); - store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(hi_3, weight23)); - jx += 4; - } - - while jx + 2 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 2)); - let bounds_start = (bounds.start + jx) * CHANNELS; - let weight01 = _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()); - - let rgb_pixel_0 = load_rgb_x2(src0.get_unchecked(bounds_start..)); - let rgb_pixel_1 = load_rgb_x2(src1.get_unchecked(bounds_start..)); - let rgb_pixel_2 = load_rgb_x2(src2.get_unchecked(bounds_start..)); - let rgb_pixel_4 = load_rgb_x2(src3.get_unchecked(bounds_start..)); - - let lo_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_lo); - let lo_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_lo); - let lo_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_lo); - let lo_3 = 
_mm_shuffle_epi8(rgb_pixel_4, shuffle_lo); - - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(lo_0, weight01)); - store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo_1, weight01)); - store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo_2, weight01)); - store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo_3, weight01)); - - jx += 2; - } - - while jx < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 1)); - let bounds_start = bounds.start + jx; - - let weight0 = _mm_set1_epi32(w_ptr.as_ptr().read_unaligned() as i32); - - store_0 = - convolve_horizontal_parts_one_sse_rgb(bounds_start, src0, weight0, store_0); - store_1 = - convolve_horizontal_parts_one_sse_rgb(bounds_start, src1, weight0, store_1); - store_2 = - convolve_horizontal_parts_one_sse_rgb(bounds_start, src2, weight0, store_2); - store_3 = - convolve_horizontal_parts_one_sse_rgb(bounds_start, src3, weight0, store_3); - jx += 1; - } - - let store_0_8 = compress_i32(store_0); - let store_1_8 = compress_i32(store_1); - let store_2_8 = compress_i32(store_2); - let store_3_8 = compress_i32(store_3); - - let element_0 = _mm_extract_epi32::<0>(store_0_8); - let element_1 = _mm_extract_epi32::<0>(store_1_8); - let element_2 = _mm_extract_epi32::<0>(store_2_8); - let element_3 = _mm_extract_epi32::<0>(store_3_8); - - let bytes = element_0.to_le_bytes(); - let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); - (chunk0.as_mut_ptr() as *mut u16).write_unaligned(first_byte); - *chunk0.get_unchecked_mut(2) = bytes[2]; - - let bytes = element_1.to_le_bytes(); - let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); - (chunk1.as_mut_ptr() as *mut u16).write_unaligned(first_byte); - *chunk1.get_unchecked_mut(2) = bytes[2]; - - let bytes = element_2.to_le_bytes(); - let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); - (chunk2.as_mut_ptr() as *mut u16).write_unaligned(first_byte); - *chunk2.get_unchecked_mut(2) = bytes[2]; - - let bytes = element_3.to_le_bytes(); - let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); - (chunk3.as_mut_ptr() as *mut u16).write_unaligned(first_byte); - *chunk3.get_unchecked_mut(2) = bytes[2]; + let vld = _mm_set1_epi32(ROUNDING_CONST); + + let (row0_ref, rest) = dst.split_at_mut(dst_stride); + let (row1_ref, rest) = rest.split_at_mut(dst_stride); + let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); + + let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS); + let iter_row1 = row1_ref.chunks_exact_mut(CHANNELS); + let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS); + let iter_row3 = row3_ref.chunks_exact_mut(CHANNELS); + + for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 + .zip(iter_row1) + .zip(iter_row2) + .zip(iter_row3) + .zip(filter_weights.bounds.iter()) + .zip( + filter_weights + .weights + .chunks_exact(filter_weights.aligned_size), + ) + { + let mut jx = 0usize; + let mut store_0 = vld; + let mut store_1 = vld; + let mut store_2 = vld; + let mut store_3 = vld; + + let src0 = src; + let src1 = src0.get_unchecked(src_stride..); + let src2 = src1.get_unchecked(src_stride..); + let src3 = src2.get_unchecked(src_stride..); + + // Will make step in 4 items however since it is RGB it is necessary to make a safe offset + while jx + 4 < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 4)); + let weights = _mm_loadu_si64(w_ptr.as_ptr() as *const u8); + const SHUFFLE_01: i32 = shuffle(0, 0, 0, 0); + let weight01 = _mm_shuffle_epi32::(weights); + const SHUFFLE_23: i32 = shuffle(1, 1, 1, 1); + let weight23 = _mm_shuffle_epi32::(weights); + let 
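The `shuffle_lo`/`shuffle_hi` masks above widen pairs of neighbouring RGB pixels to zero-extended 16-bit lanes arranged so that `_mm_madd_epi16` with the duplicated weight pair produces one 32-bit per-channel sum; in scalar terms (illustrative only):

// One lane of the madd step: two taps of the same colour channel,
// multiplied by their i16 weights and summed into an i32 accumulator.
fn madd_channel_pair(c0: u8, c1: u8, w0: i16, w1: i16) -> i32 {
    c0 as i32 * w0 as i32 + c1 as i32 * w1 as i32
}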
bounds_start = (bounds.start + jx) * CHANNELS; + + let rgb_pixel_0 = load_rgb_x4(src0.get_unchecked(bounds_start..)); + let rgb_pixel_1 = load_rgb_x4(src1.get_unchecked(bounds_start..)); + let rgb_pixel_2 = load_rgb_x4(src2.get_unchecked(bounds_start..)); + let rgb_pixel_4 = load_rgb_x4(src3.get_unchecked(bounds_start..)); + + let hi_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_hi); + let lo_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_lo); + let hi_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_hi); + let lo_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_lo); + let hi_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_hi); + let lo_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_lo); + let hi_3 = _mm_shuffle_epi8(rgb_pixel_4, shuffle_hi); + let lo_3 = _mm_shuffle_epi8(rgb_pixel_4, shuffle_lo); + + store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(lo_0, weight01)); + store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(hi_0, weight23)); + + store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo_1, weight01)); + store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(hi_1, weight23)); + + store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo_2, weight01)); + store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(hi_2, weight23)); + + store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo_3, weight01)); + store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(hi_3, weight23)); + jx += 4; + } + + while jx + 2 < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 2)); + let bounds_start = (bounds.start + jx) * CHANNELS; + let weight01 = _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()); + + let rgb_pixel_0 = load_rgb_x2(src0.get_unchecked(bounds_start..)); + let rgb_pixel_1 = load_rgb_x2(src1.get_unchecked(bounds_start..)); + let rgb_pixel_2 = load_rgb_x2(src2.get_unchecked(bounds_start..)); + let rgb_pixel_4 = load_rgb_x2(src3.get_unchecked(bounds_start..)); + + let lo_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_lo); + let lo_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_lo); + let lo_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_lo); + let lo_3 = _mm_shuffle_epi8(rgb_pixel_4, shuffle_lo); + + store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(lo_0, weight01)); + store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo_1, weight01)); + store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo_2, weight01)); + store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo_3, weight01)); + + jx += 2; + } + + while jx < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 1)); + let bounds_start = bounds.start + jx; + + let weight0 = _mm_set1_epi32(w_ptr.as_ptr().read_unaligned() as i32); + + store_0 = convolve_horizontal_parts_one_sse_rgb(bounds_start, src0, weight0, store_0); + store_1 = convolve_horizontal_parts_one_sse_rgb(bounds_start, src1, weight0, store_1); + store_2 = convolve_horizontal_parts_one_sse_rgb(bounds_start, src2, weight0, store_2); + store_3 = convolve_horizontal_parts_one_sse_rgb(bounds_start, src3, weight0, store_3); + jx += 1; } + + let store_0_8 = compress_i32(store_0); + let store_1_8 = compress_i32(store_1); + let store_2_8 = compress_i32(store_2); + let store_3_8 = compress_i32(store_3); + + let element_0 = _mm_extract_epi32::<0>(store_0_8); + let element_1 = _mm_extract_epi32::<0>(store_1_8); + let element_2 = _mm_extract_epi32::<0>(store_2_8); + let element_3 = _mm_extract_epi32::<0>(store_3_8); + + let bytes = element_0.to_le_bytes(); + let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); + (chunk0.as_mut_ptr() as *mut u16).write_unaligned(first_byte); + *chunk0.get_unchecked_mut(2) = bytes[2]; + + let bytes = element_1.to_le_bytes(); + 
let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); + (chunk1.as_mut_ptr() as *mut u16).write_unaligned(first_byte); + *chunk1.get_unchecked_mut(2) = bytes[2]; + + let bytes = element_2.to_le_bytes(); + let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); + (chunk2.as_mut_ptr() as *mut u16).write_unaligned(first_byte); + *chunk2.get_unchecked_mut(2) = bytes[2]; + + let bytes = element_3.to_le_bytes(); + let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); + (chunk3.as_mut_ptr() as *mut u16).write_unaligned(first_byte); + *chunk3.get_unchecked_mut(2) = bytes[2]; } } @@ -262,71 +255,68 @@ unsafe fn convolve_horizontal_rgb_sse_row_one_impl( dst: &mut [u8], filter_weights: &FilterWeights, ) { - unsafe { - const CHANNELS: usize = 3; - - let shuffle_lo = _mm_setr_epi8(0, -1, 3, -1, 1, -1, 4, -1, 2, -1, 5, -1, -1, -1, -1, -1); - - let shuffle_hi = _mm_setr_epi8(6, -1, 9, -1, 7, -1, 10, -1, 8, -1, 11, -1, -1, -1, -1, -1); - - for ((dst, bounds), weights) in dst - .chunks_exact_mut(CHANNELS) - .zip(filter_weights.bounds.iter()) - .zip( - filter_weights - .weights - .chunks_exact(filter_weights.aligned_size), - ) - { - let bounds_size = bounds.size; - let mut jx = 0usize; - let mut store = _mm_setzero_si128(); - - while jx + 4 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 4)); - let weights = _mm_loadu_si64(w_ptr.as_ptr() as *const u8); - const SHUFFLE_01: i32 = shuffle(0, 0, 0, 0); - let weight01 = _mm_shuffle_epi32::(weights); - const SHUFFLE_23: i32 = shuffle(1, 1, 1, 1); - let weight23 = _mm_shuffle_epi32::(weights); - let bounds_start = bounds.start + jx; - let src_ptr_0 = src.get_unchecked((bounds_start * CHANNELS)..); - - let rgb_pixel = load_rgb_x4(src_ptr_0); - - let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi); - let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); - - store = _mm_add_epi32(store, _mm_madd_epi16(lo, weight01)); - store = _mm_add_epi32(store, _mm_madd_epi16(hi, weight23)); - jx += 4; - } - - while jx + 2 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 2)); - let weight0 = _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()); - let src_ptr = src.get_unchecked(((bounds.start + jx) * 3)..); - let rgb_pixel = load_rgb_x2(src_ptr); - let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); - store = _mm_add_epi32(store, _mm_madd_epi16(lo, weight0)); - jx += 2; - } - - while jx < bounds_size { - let w_ptr = weights.get_unchecked(jx..(jx + 1)); - let weight0 = _mm_set1_epi32(w_ptr.as_ptr().read_unaligned() as i32); - store = - convolve_horizontal_parts_one_sse_rgb(bounds.start + jx, src, weight0, store); - jx += 1; - } - - let store_16_8 = compress_i32(store); - - let element = _mm_extract_epi32::<0>(store_16_8); - let bytes = element.to_le_bytes(); - let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); - (dst.as_mut_ptr() as *mut u16).write_unaligned(first_byte); - *dst.get_unchecked_mut(2) = bytes[2]; + const CHANNELS: usize = 3; + + let shuffle_lo = _mm_setr_epi8(0, -1, 3, -1, 1, -1, 4, -1, 2, -1, 5, -1, -1, -1, -1, -1); + + let shuffle_hi = _mm_setr_epi8(6, -1, 9, -1, 7, -1, 10, -1, 8, -1, 11, -1, -1, -1, -1, -1); + + for ((dst, bounds), weights) in dst + .chunks_exact_mut(CHANNELS) + .zip(filter_weights.bounds.iter()) + .zip( + filter_weights + .weights + .chunks_exact(filter_weights.aligned_size), + ) + { + let bounds_size = bounds.size; + let mut jx = 0usize; + let mut store = _mm_setzero_si128(); + + while jx + 4 < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 4)); + let weights = 
_mm_loadu_si64(w_ptr.as_ptr() as *const u8); + const SHUFFLE_01: i32 = shuffle(0, 0, 0, 0); + let weight01 = _mm_shuffle_epi32::(weights); + const SHUFFLE_23: i32 = shuffle(1, 1, 1, 1); + let weight23 = _mm_shuffle_epi32::(weights); + let bounds_start = bounds.start + jx; + let src_ptr_0 = src.get_unchecked((bounds_start * CHANNELS)..); + + let rgb_pixel = load_rgb_x4(src_ptr_0); + + let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi); + let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); + + store = _mm_add_epi32(store, _mm_madd_epi16(lo, weight01)); + store = _mm_add_epi32(store, _mm_madd_epi16(hi, weight23)); + jx += 4; + } + + while jx + 2 < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 2)); + let weight0 = _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()); + let src_ptr = src.get_unchecked(((bounds.start + jx) * 3)..); + let rgb_pixel = load_rgb_x2(src_ptr); + let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); + store = _mm_add_epi32(store, _mm_madd_epi16(lo, weight0)); + jx += 2; + } + + while jx < bounds_size { + let w_ptr = weights.get_unchecked(jx..(jx + 1)); + let weight0 = _mm_set1_epi32(w_ptr.as_ptr().read_unaligned() as i32); + store = convolve_horizontal_parts_one_sse_rgb(bounds.start + jx, src, weight0, store); + jx += 1; } + + let store_16_8 = compress_i32(store); + + let element = _mm_extract_epi32::<0>(store_16_8); + let bytes = element.to_le_bytes(); + let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); + (dst.as_mut_ptr() as *mut u16).write_unaligned(first_byte); + *dst.get_unchecked_mut(2) = bytes[2]; } } diff --git a/src/sse/rgba_f16.rs b/src/sse/rgba_f16.rs index d032a21..5e367d2 100644 --- a/src/sse/rgba_f16.rs +++ b/src/sse/rgba_f16.rs @@ -49,8 +49,7 @@ unsafe fn convolve_horizontal_parts_one_rgba_f16(rgb_pixel); - let acc = _mm_prefer_fma_ps::(store_0, pixels, weight0); - acc + _mm_prefer_fma_ps::(store_0, pixels, weight0) } #[inline(always)] @@ -77,8 +76,7 @@ unsafe fn convolve_horizontal_parts_4_rgba_f16(store_0, rgb_pixel_0, weight0); let acc = _mm_prefer_fma_ps::(acc, rgb_pixel_1, weight1); let acc = _mm_prefer_fma_ps::(acc, rgb_pixel_2, weight2); - let acc = _mm_prefer_fma_ps::(acc, rgb_pixel_3, weight3); - acc + _mm_prefer_fma_ps::(acc, rgb_pixel_3, weight3) } #[inline(always)] @@ -95,12 +93,11 @@ unsafe fn convolve_horizontal_parts_2_rgba_f16(store_0, _mm_cvtph_psx::(rgb_pixels), weight0); - let acc = _mm_prefer_fma_ps::( + _mm_prefer_fma_ps::( acc, _mm_cvtph_psx::(_mm_srli_si128::<8>(rgb_pixels)), weight1, - ); - acc + ) } pub(crate) fn convolve_horizontal_rgba_sse_row_one_f16( diff --git a/src/sse/rgba_f32.rs b/src/sse/rgba_f32.rs index 64378a3..b5dc978 100644 --- a/src/sse/rgba_f32.rs +++ b/src/sse/rgba_f32.rs @@ -119,71 +119,65 @@ unsafe fn convolve_horizontal_rgba_sse_row_one_f32_impl( src: &[f32], dst: &mut [f32], ) { - unsafe { - const CHANNELS: usize = 4; - let mut filter_offset = 0usize; - let weights_ptr = filter_weights.weights.as_ptr(); - - for x in 0..dst_width { - let bounds = filter_weights.bounds.get_unchecked(x); - let mut jx = 0usize; - let mut store = _mm_setzero_ps(); - - while jx + 4 < bounds.size { - let ptr = weights_ptr.add(jx + filter_offset); - let (weight0, weight1, weight2, weight3) = load_4_weights!(ptr); - let filter_start = jx + bounds.start; - store = convolve_horizontal_parts_4_rgba_f32::( - filter_start, - src, - weight0, - weight1, - weight2, - weight3, - store, - ); - jx += 4; - } - - while jx + 2 < bounds.size { - let ptr = weights_ptr.add(jx + filter_offset); - let weights = 
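`_mm_prefer_fma_ps::<FMA>` in the f16/f32 paths above accumulates `acc + pixel * weight`, choosing a fused multiply-add when the const flag allows it; a scalar sketch of that contract (an assumption for illustration, not the crate's actual helper):

// Accumulate pixel * weight into acc, fused when FMA is requested.
fn prefer_fma<const FMA: bool>(acc: f32, pixel: f32, weight: f32) -> f32 {
    if FMA {
        pixel.mul_add(weight, acc) // single rounding step
    } else {
        acc + pixel * weight
    }
}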
_mm_castsi128_ps(_mm_loadu_si64(ptr as *const u8)); - const SHUFFLE_0: i32 = shuffle(0, 0, 0, 0); - let weight0 = - _mm_castsi128_ps(_mm_shuffle_epi32::(_mm_castps_si128(weights))); - const SHUFFLE_1: i32 = shuffle(1, 1, 1, 1); - let weight1 = - _mm_castsi128_ps(_mm_shuffle_epi32::(_mm_castps_si128(weights))); - let filter_start = jx + bounds.start; - store = convolve_horizontal_parts_2_rgba_f32::( - filter_start, - src, - weight0, - weight1, - store, - ); - jx += 2 - } - - while jx < bounds.size { - let ptr = weights_ptr.add(jx + filter_offset); - let weight0 = _mm_load1_ps(ptr); - let filter_start = jx + bounds.start; - store = convolve_horizontal_parts_one_rgba_f32::( - filter_start, - src, - weight0, - store, - ); - jx += 1; - } - - let px = x * CHANNELS; - let dest_ptr = dst.get_unchecked_mut(px..); - _mm_storeu_ps(dest_ptr.as_mut_ptr(), store); - - filter_offset += filter_weights.aligned_size; + const CHANNELS: usize = 4; + let mut filter_offset = 0usize; + let weights_ptr = filter_weights.weights.as_ptr(); + + for x in 0..dst_width { + let bounds = filter_weights.bounds.get_unchecked(x); + let mut jx = 0usize; + let mut store = _mm_setzero_ps(); + + while jx + 4 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let (weight0, weight1, weight2, weight3) = load_4_weights!(ptr); + let filter_start = jx + bounds.start; + store = convolve_horizontal_parts_4_rgba_f32::( + filter_start, + src, + weight0, + weight1, + weight2, + weight3, + store, + ); + jx += 4; + } + + while jx + 2 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weights = _mm_castsi128_ps(_mm_loadu_si64(ptr as *const u8)); + const SHUFFLE_0: i32 = shuffle(0, 0, 0, 0); + let weight0 = + _mm_castsi128_ps(_mm_shuffle_epi32::(_mm_castps_si128(weights))); + const SHUFFLE_1: i32 = shuffle(1, 1, 1, 1); + let weight1 = + _mm_castsi128_ps(_mm_shuffle_epi32::(_mm_castps_si128(weights))); + let filter_start = jx + bounds.start; + store = convolve_horizontal_parts_2_rgba_f32::( + filter_start, + src, + weight0, + weight1, + store, + ); + jx += 2 } + + while jx < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = _mm_load1_ps(ptr); + let filter_start = jx + bounds.start; + store = + convolve_horizontal_parts_one_rgba_f32::(filter_start, src, weight0, store); + jx += 1; + } + + let px = x * CHANNELS; + let dest_ptr = dst.get_unchecked_mut(px..); + _mm_storeu_ps(dest_ptr.as_mut_ptr(), store); + + filter_offset += filter_weights.aligned_size; } } @@ -329,6 +323,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_f32_impl( let mut store_1 = zeros; let mut store_2 = zeros; let mut store_3 = zeros; + while jx + 4 < bounds.size { let ptr = weights_ptr.add(jx + filter_offset); let (weight0, weight1, weight2, weight3) = load_4_weights!(ptr); diff --git a/src/sse/rgba_u16_lb.rs b/src/sse/rgba_u16_lb.rs index 6baed89..107fc78 100644 --- a/src/sse/rgba_u16_lb.rs +++ b/src/sse/rgba_u16_lb.rs @@ -192,7 +192,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_lb_u8_impl( filter_weights: &FilterWeights, bit_depth: u32, ) { - assert!(bit_depth >= 1 && bit_depth <= 16); + assert!((1..=16).contains(&bit_depth)); const CHANNELS: usize = 4; let init = _mm_set1_epi32(ROUNDING_CONST); @@ -269,8 +269,8 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_lb_u8_impl( while jx + 2 < bounds_size { let w_ptr = weights.get_unchecked(jx..(jx + 2)); let bounds_start = bounds.start + jx; - let w0 = _mm_set1_epi16(w_ptr[0] as i16); - let w1 = _mm_set1_epi16(w_ptr[1] as i16); + let w0 = 
_mm_set1_epi16(w_ptr[0]); + let w1 = _mm_set1_epi16(w_ptr[1]); store_0 = conv_horiz_rgba_2_u16(bounds_start, src0, w0, w1, store_0); store_1 = conv_horiz_rgba_2_u16(bounds_start, src1, w0, w1, store_1); store_2 = conv_horiz_rgba_2_u16(bounds_start, src2, w0, w1, store_2); @@ -344,14 +344,14 @@ unsafe fn convolve_horizontal_rgba_sse_u16_lb_row_impl( while jx + 8 < bounds_size { let bounds_start = bounds.start + jx; let w_ptr = weights.get_unchecked(jx..(jx + 8)); - let w0 = _mm_set1_epi32(w_ptr[0] as i32); - let w1 = _mm_set1_epi32(w_ptr[1] as i32); - let w2 = _mm_set1_epi32(w_ptr[2] as i32); - let w3 = _mm_set1_epi32(w_ptr[3] as i32); - let w4 = _mm_set1_epi32(w_ptr[4] as i32); - let w5 = _mm_set1_epi32(w_ptr[5] as i32); - let w6 = _mm_set1_epi32(w_ptr[6] as i32); - let w7 = _mm_set1_epi32(w_ptr[7] as i32); + let w0 = _mm_set1_epi16(w_ptr[0]); + let w1 = _mm_set1_epi16(w_ptr[1]); + let w2 = _mm_set1_epi16(w_ptr[2]); + let w3 = _mm_set1_epi16(w_ptr[3]); + let w4 = _mm_set1_epi16(w_ptr[4]); + let w5 = _mm_set1_epi16(w_ptr[5]); + let w6 = _mm_set1_epi16(w_ptr[6]); + let w7 = _mm_set1_epi16(w_ptr[7]); let set1 = (w0, w1, w2, w3); let set2 = (w4, w5, w6, w7); store = conv_horiz_rgba_8_u16(bounds_start, src, set1, set2, store); @@ -360,10 +360,10 @@ unsafe fn convolve_horizontal_rgba_sse_u16_lb_row_impl( while jx + 4 < bounds_size { let w_ptr = weights.get_unchecked(jx..(jx + 4)); - let w0 = _mm_set1_epi32(w_ptr[0] as i32); - let w1 = _mm_set1_epi32(w_ptr[1] as i32); - let w2 = _mm_set1_epi32(w_ptr[2] as i32); - let w3 = _mm_set1_epi32(w_ptr[3] as i32); + let w0 = _mm_set1_epi16(w_ptr[0]); + let w1 = _mm_set1_epi16(w_ptr[1]); + let w2 = _mm_set1_epi16(w_ptr[2]); + let w3 = _mm_set1_epi16(w_ptr[3]); let bounds_start = bounds.start + jx; store = conv_horiz_rgba_4_u16(bounds_start, src, w0, w1, w2, w3, store); jx += 4; @@ -372,15 +372,15 @@ unsafe fn convolve_horizontal_rgba_sse_u16_lb_row_impl( while jx + 2 < bounds_size { let w_ptr = weights.get_unchecked(jx..(jx + 2)); let bounds_start = bounds.start + jx; - let w0 = _mm_set1_epi32(w_ptr[0] as i32); - let w1 = _mm_set1_epi32(w_ptr[1] as i32); + let w0 = _mm_set1_epi16(w_ptr[0]); + let w1 = _mm_set1_epi16(w_ptr[1]); store = conv_horiz_rgba_2_u16(bounds_start, src, w0, w1, store); jx += 2; } while jx < bounds_size { let w_ptr = weights.get_unchecked(jx..(jx + 1)); - let w0 = _mm_set1_epi32(w_ptr[0] as i32); + let w0 = _mm_set1_epi16(w_ptr[0]); let bounds_start = bounds.start + jx; store = conv_horiz_rgba_1_u16(bounds_start, src, w0, store); jx += 1; diff --git a/src/sse/rgba_u8.rs b/src/sse/rgba_u8.rs index c746c33..1f3dc99 100644 --- a/src/sse/rgba_u8.rs +++ b/src/sse/rgba_u8.rs @@ -72,10 +72,9 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_impl( dst_stride: usize, filter_weights: &FilterWeights, ) { - unsafe { - const CHANNELS: usize = 4; + const CHANNELS: usize = 4; - #[rustfmt::skip] + #[rustfmt::skip] let shuffle_lo = _mm_setr_epi8(0, -1, 4, -1, 1, -1, @@ -85,7 +84,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_impl( 3, -1, 7, -1); - #[rustfmt::skip] + #[rustfmt::skip] let shuffle_hi = _mm_setr_epi8(8, -1, 12, -1, 9, -1, @@ -95,152 +94,147 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_impl( 11, -1, 15, -1); - let vld = _mm_set1_epi32(ROUNDING_CONST); - - let (row0_ref, rest) = dst.split_at_mut(dst_stride); - let (row1_ref, rest) = rest.split_at_mut(dst_stride); - let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); - - let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS); - let iter_row1 = row1_ref.chunks_exact_mut(CHANNELS); - 
let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS); - let iter_row3 = row3_ref.chunks_exact_mut(CHANNELS); - - for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 - .zip(iter_row1) - .zip(iter_row2) - .zip(iter_row3) - .zip(filter_weights.bounds.iter()) - .zip( - filter_weights - .weights - .chunks_exact(filter_weights.aligned_size), - ) - { - let mut jx = 0usize; - let mut store_0 = vld; - let mut store_1 = vld; - let mut store_2 = vld; - let mut store_3 = vld; - - let src0 = src; - let src1 = src0.get_unchecked(src_stride..); - let src2 = src1.get_unchecked(src_stride..); - let src3 = src2.get_unchecked(src_stride..); - - while jx + 4 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 4)); - let weights = _mm_loadu_si64(w_ptr.as_ptr() as *const u8); - const SHUFFLE_01: i32 = shuffle(0, 0, 0, 0); - let weight01 = _mm_shuffle_epi32::(weights); - const SHUFFLE_23: i32 = shuffle(1, 1, 1, 1); - let weight23 = _mm_shuffle_epi32::(weights); - let start_bounds = bounds.start + jx; - - let rgb_pixel_0 = _mm_loadu_si128( - src0.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - let rgb_pixel_1 = _mm_loadu_si128( - src1.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - let rgb_pixel_2 = _mm_loadu_si128( - src2.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - let rgb_pixel_3 = _mm_loadu_si128( - src3.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - - let hi_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_hi); - let lo_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_lo); - let hi_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_hi); - let lo_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_lo); - let hi_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_hi); - let lo_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_lo); - let hi_3 = _mm_shuffle_epi8(rgb_pixel_3, shuffle_hi); - let lo_3 = _mm_shuffle_epi8(rgb_pixel_3, shuffle_lo); - - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(lo_0, weight01)); - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(hi_0, weight23)); - - store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo_1, weight01)); - store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(hi_1, weight23)); - - store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo_2, weight01)); - store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(hi_2, weight23)); - - store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo_3, weight01)); - store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(hi_3, weight23)); - jx += 4; - } - - while jx + 2 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 2)); - let bounds_start = bounds.start + jx; - - let weight01 = _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()); - - let rgb_pixel_0 = - _mm_loadu_si64(src0.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - let rgb_pixel_1 = - _mm_loadu_si64(src1.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - let rgb_pixel_2 = - _mm_loadu_si64(src2.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - let rgb_pixel_3 = - _mm_loadu_si64(src3.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - - let lo_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_lo); - let lo_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_lo); - let lo_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_lo); - let lo_3 = _mm_shuffle_epi8(rgb_pixel_3, shuffle_lo); - - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(lo_0, weight01)); - store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo_1, weight01)); - store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo_2, weight01)); - 
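// Editor's note (illustrative sketch, not part of the patch): the horizontal RGBA
// kernels in this hunk fold two filter taps per channel with a single
// `_mm_madd_epi16`. The bytes of two adjacent source pixels are interleaved per
// channel into zero-extended i16 lanes (the same `shuffle_lo` mask as above), the
// two i16 weights are repeated as (w0, w1) pairs, and madd then yields the r, g,
// b, a accumulators as four i32 lanes. The helper name below is hypothetical.
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
unsafe fn weighted_sum_2_taps_rgba(px0: [u8; 4], px1: [u8; 4], w0: i16, w1: i16) -> [i32; 4] {
    // R0,-,R1,-, G0,-,G1,-, B0,-,B1,-, A0,-,A1,-  (the -1 lanes shuffle in zeros).
    let interleave = _mm_setr_epi8(0, -1, 4, -1, 1, -1, 5, -1, 2, -1, 6, -1, 3, -1, 7, -1);
    let mut bytes = [0u8; 16];
    bytes[..4].copy_from_slice(&px0);
    bytes[4..8].copy_from_slice(&px1);
    let pixels = _mm_shuffle_epi8(_mm_loadu_si128(bytes.as_ptr() as *const __m128i), interleave);
    // (w0, w1) in every 32-bit lane: w0 in even i16 lanes, w1 in odd ones.
    let packed = (((w1 as u16 as u32) << 16) | (w0 as u16 as u32)) as i32;
    let weights = _mm_set1_epi32(packed);
    // Each i32 lane becomes px0[c] * w0 + px1[c] * w1.
    let acc = _mm_madd_epi16(pixels, weights);
    let mut out = [0i32; 4];
    _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, acc);
    out
}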
store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo_3, weight01)); - - jx += 2; - } - - while jx < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 1)); - - let weight0 = _mm_set1_epi32(w_ptr[0] as i32); - - let start_bounds = bounds.start + jx; - - store_0 = - convolve_horizontal_parts_one_rgba_sse(start_bounds, src0, weight0, store_0); - store_1 = - convolve_horizontal_parts_one_rgba_sse(start_bounds, src1, weight0, store_1); - store_2 = - convolve_horizontal_parts_one_rgba_sse(start_bounds, src2, weight0, store_2); - store_3 = - convolve_horizontal_parts_one_rgba_sse(start_bounds, src3, weight0, store_3); - jx += 1; - } - - let store_16_8_0 = compress_i32(store_0); - let store_16_8_1 = compress_i32(store_1); - let store_16_8_2 = compress_i32(store_2); - let store_16_8_3 = compress_i32(store_3); - - _mm_storeu_si32( - chunk0.as_mut_ptr() as *mut _, - _mm_packus_epi16(store_16_8_0, store_16_8_0), + let vld = _mm_set1_epi32(ROUNDING_CONST); + + let (row0_ref, rest) = dst.split_at_mut(dst_stride); + let (row1_ref, rest) = rest.split_at_mut(dst_stride); + let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); + + let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS); + let iter_row1 = row1_ref.chunks_exact_mut(CHANNELS); + let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS); + let iter_row3 = row3_ref.chunks_exact_mut(CHANNELS); + + for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 + .zip(iter_row1) + .zip(iter_row2) + .zip(iter_row3) + .zip(filter_weights.bounds.iter()) + .zip( + filter_weights + .weights + .chunks_exact(filter_weights.aligned_size), + ) + { + let mut jx = 0usize; + let mut store_0 = vld; + let mut store_1 = vld; + let mut store_2 = vld; + let mut store_3 = vld; + + let src0 = src; + let src1 = src0.get_unchecked(src_stride..); + let src2 = src1.get_unchecked(src_stride..); + let src3 = src2.get_unchecked(src_stride..); + + while jx + 4 < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 4)); + let weights = _mm_loadu_si64(w_ptr.as_ptr() as *const u8); + const SHUFFLE_01: i32 = shuffle(0, 0, 0, 0); + let weight01 = _mm_shuffle_epi32::(weights); + const SHUFFLE_23: i32 = shuffle(1, 1, 1, 1); + let weight23 = _mm_shuffle_epi32::(weights); + let start_bounds = bounds.start + jx; + + let rgb_pixel_0 = _mm_loadu_si128( + src0.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, ); - _mm_storeu_si32( - chunk1.as_mut_ptr() as *mut _, - _mm_packus_epi16(store_16_8_1, store_16_8_1), + let rgb_pixel_1 = _mm_loadu_si128( + src1.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, ); - _mm_storeu_si32( - chunk2.as_mut_ptr() as *mut _, - _mm_packus_epi16(store_16_8_2, store_16_8_2), + let rgb_pixel_2 = _mm_loadu_si128( + src2.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, ); - _mm_storeu_si32( - chunk3.as_mut_ptr() as *mut _, - _mm_packus_epi16(store_16_8_3, store_16_8_3), + let rgb_pixel_3 = _mm_loadu_si128( + src3.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, ); + + let hi_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_hi); + let lo_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_lo); + let hi_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_hi); + let lo_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_lo); + let hi_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_hi); + let lo_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_lo); + let hi_3 = _mm_shuffle_epi8(rgb_pixel_3, shuffle_hi); + let lo_3 = _mm_shuffle_epi8(rgb_pixel_3, shuffle_lo); + + store_0 = _mm_add_epi32(store_0, 
_mm_madd_epi16(lo_0, weight01)); + store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(hi_0, weight23)); + + store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo_1, weight01)); + store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(hi_1, weight23)); + + store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo_2, weight01)); + store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(hi_2, weight23)); + + store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo_3, weight01)); + store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(hi_3, weight23)); + jx += 4; + } + + while jx + 2 < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 2)); + let bounds_start = bounds.start + jx; + + let weight01 = _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()); + + let rgb_pixel_0 = + _mm_loadu_si64(src0.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); + let rgb_pixel_1 = + _mm_loadu_si64(src1.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); + let rgb_pixel_2 = + _mm_loadu_si64(src2.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); + let rgb_pixel_3 = + _mm_loadu_si64(src3.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); + + let lo_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_lo); + let lo_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_lo); + let lo_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_lo); + let lo_3 = _mm_shuffle_epi8(rgb_pixel_3, shuffle_lo); + + store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(lo_0, weight01)); + store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo_1, weight01)); + store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo_2, weight01)); + store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo_3, weight01)); + + jx += 2; + } + + while jx < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 1)); + + let weight0 = _mm_set1_epi32(w_ptr[0] as i32); + + let start_bounds = bounds.start + jx; + + store_0 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src0, weight0, store_0); + store_1 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src1, weight0, store_1); + store_2 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src2, weight0, store_2); + store_3 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src3, weight0, store_3); + jx += 1; } + + let store_16_8_0 = compress_i32(store_0); + let store_16_8_1 = compress_i32(store_1); + let store_16_8_2 = compress_i32(store_2); + let store_16_8_3 = compress_i32(store_3); + + _mm_storeu_si32( + chunk0.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_0, store_16_8_0), + ); + _mm_storeu_si32( + chunk1.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_1, store_16_8_1), + ); + _mm_storeu_si32( + chunk2.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_2, store_16_8_2), + ); + _mm_storeu_si32( + chunk3.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_3, store_16_8_3), + ); } } diff --git a/src/sse/rgba_u8_lb.rs b/src/sse/rgba_u8_lb.rs index e5b1934..8590618 100644 --- a/src/sse/rgba_u8_lb.rs +++ b/src/sse/rgba_u8_lb.rs @@ -112,254 +112,240 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_impl( dst_stride: usize, filter_weights: &FilterWeights, ) { - unsafe { - const CHANNELS: usize = 4; - - const SCALE: i32 = 6; - const V_SHR: i32 = SCALE; - const ROUNDING: i16 = 1 << (V_SHR - 1); - - let vld = _mm_setr_epi16(ROUNDING, ROUNDING, ROUNDING, ROUNDING, 0, 0, 0, 0); - - let shuffle_weights = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3); - - let (row0_ref, rest) = dst.split_at_mut(dst_stride); - let (row1_ref, rest) = rest.split_at_mut(dst_stride); - let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); - - 
let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS); - let iter_row1 = row1_ref.chunks_exact_mut(CHANNELS); - let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS); - let iter_row3 = row3_ref.chunks_exact_mut(CHANNELS); - - for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 - .zip(iter_row1) - .zip(iter_row2) - .zip(iter_row3) - .zip(filter_weights.bounds.iter()) - .zip( - filter_weights - .weights - .chunks_exact(filter_weights.aligned_size), - ) - { - let mut jx = 0usize; - let mut store_0 = vld; - let mut store_1 = vld; - let mut store_2 = vld; - let mut store_3 = vld; - - let src0 = src; - let src1 = src0.get_unchecked(src_stride..); - let src2 = src1.get_unchecked(src_stride..); - let src3 = src2.get_unchecked(src_stride..); - - while jx + 8 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 8)); - - let weight01 = _mm_shuffle_epi8( - _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()), - shuffle_weights, - ); - let weight23 = _mm_shuffle_epi8( - _mm_set1_epi32( - (w_ptr.get_unchecked(2..).as_ptr() as *const i32).read_unaligned(), - ), - shuffle_weights, - ); - - let weight45 = _mm_shuffle_epi8( - _mm_set1_epi32( - (w_ptr.get_unchecked(4..).as_ptr() as *const i32).read_unaligned(), - ), - shuffle_weights, - ); - - let weight67 = _mm_shuffle_epi8( - _mm_set1_epi32( - (w_ptr.get_unchecked(6..).as_ptr() as *const i32).read_unaligned(), - ), - shuffle_weights, - ); - - let start_bounds = bounds.start + jx; - - let rgb_pixel_0 = _mm_loadu_si128( - src0.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - let rgb_pixel_0_1 = _mm_loadu_si128( - src0.get_unchecked((start_bounds * CHANNELS + 16)..) - .as_ptr() as *const __m128i, - ); - let rgb_pixel_1 = _mm_loadu_si128( - src1.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - let rgb_pixel_1_0 = _mm_loadu_si128( - src1.get_unchecked((start_bounds * CHANNELS + 16)..) - .as_ptr() as *const __m128i, - ); - let rgb_pixel_2 = _mm_loadu_si128( - src2.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - let rgb_pixel_2_1 = _mm_loadu_si128( - src2.get_unchecked((start_bounds * CHANNELS + 16)..) - .as_ptr() as *const __m128i, - ); - let rgb_pixel_3 = _mm_loadu_si128( - src3.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - let rgb_pixel_3_1 = _mm_loadu_si128( - src3.get_unchecked((start_bounds * CHANNELS + 16)..) 
- .as_ptr() as *const __m128i, - ); - - store_0 = hdot4( - store_0, - rgb_pixel_0, - rgb_pixel_0_1, - weight01, - weight23, - weight45, - weight67, - ); - store_1 = hdot4( - store_1, - rgb_pixel_1, - rgb_pixel_1_0, - weight01, - weight23, - weight45, - weight67, - ); - store_2 = hdot4( - store_2, - rgb_pixel_2, - rgb_pixel_2_1, - weight01, - weight23, - weight45, - weight67, - ); - store_3 = hdot4( - store_3, - rgb_pixel_3, - rgb_pixel_3_1, - weight01, - weight23, - weight45, - weight67, - ); - - jx += 8; - } - - while jx + 4 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 4)); - - let weight01 = _mm_shuffle_epi8( - _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()), - shuffle_weights, - ); - let weight23 = _mm_shuffle_epi8( - _mm_set1_epi32( - (w_ptr.get_unchecked(2..).as_ptr() as *const i32).read_unaligned(), - ), - shuffle_weights, - ); - let start_bounds = bounds.start + jx; - - let rgb_pixel_0 = _mm_loadu_si128( - src0.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - let rgb_pixel_1 = _mm_loadu_si128( - src1.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - let rgb_pixel_2 = _mm_loadu_si128( - src2.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - let rgb_pixel_3 = _mm_loadu_si128( - src3.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, - ); - - store_0 = hdot2(store_0, rgb_pixel_0, weight01, weight23); - store_1 = hdot2(store_1, rgb_pixel_1, weight01, weight23); - store_2 = hdot2(store_2, rgb_pixel_2, weight01, weight23); - store_3 = hdot2(store_3, rgb_pixel_3, weight01, weight23); - - jx += 4; - } - - while jx + 2 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 2)); - let bounds_start = bounds.start + jx; - - let weight01 = _mm_shuffle_epi8( - _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()), - shuffle_weights, - ); - - let rgb_pixel_0 = - _mm_loadu_si64(src0.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - let rgb_pixel_1 = - _mm_loadu_si64(src1.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - let rgb_pixel_2 = - _mm_loadu_si64(src2.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - let rgb_pixel_3 = - _mm_loadu_si64(src3.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - - store_0 = hdot(store_0, rgb_pixel_0, weight01); - store_1 = hdot(store_1, rgb_pixel_1, weight01); - store_2 = hdot(store_2, rgb_pixel_2, weight01); - store_3 = hdot(store_3, rgb_pixel_3, weight01); - - jx += 2; - } - - while jx < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 1)); - - let weight0 = _mm_set1_epi16(w_ptr[0]); - - let start_bounds = bounds.start + jx; - - store_0 = - convolve_horizontal_parts_one_rgba_sse(start_bounds, src0, weight0, store_0); - store_1 = - convolve_horizontal_parts_one_rgba_sse(start_bounds, src1, weight0, store_1); - store_2 = - convolve_horizontal_parts_one_rgba_sse(start_bounds, src2, weight0, store_2); - store_3 = - convolve_horizontal_parts_one_rgba_sse(start_bounds, src3, weight0, store_3); - jx += 1; - } - - store_0 = _mm_add_hi_lo_epi16(store_0); - store_1 = _mm_add_hi_lo_epi16(store_1); - store_2 = _mm_add_hi_lo_epi16(store_2); - store_3 = _mm_add_hi_lo_epi16(store_3); - - let store_16_8_0 = _mm_srai_epi16::(store_0); - let store_16_8_1 = _mm_srai_epi16::(store_1); - let store_16_8_2 = _mm_srai_epi16::(store_2); - let store_16_8_3 = _mm_srai_epi16::(store_3); - - _mm_storeu_si32( - chunk0.as_mut_ptr() as *mut _, - _mm_packus_epi16(store_16_8_0, store_16_8_0), - ); - 
_mm_storeu_si32( - chunk1.as_mut_ptr() as *mut _, - _mm_packus_epi16(store_16_8_1, store_16_8_1), - ); - _mm_storeu_si32( - chunk2.as_mut_ptr() as *mut _, - _mm_packus_epi16(store_16_8_2, store_16_8_2), - ); - _mm_storeu_si32( - chunk3.as_mut_ptr() as *mut _, - _mm_packus_epi16(store_16_8_3, store_16_8_3), + const CHANNELS: usize = 4; + + const SCALE: i32 = 6; + const V_SHR: i32 = SCALE; + const ROUNDING: i16 = 1 << (V_SHR - 1); + + let vld = _mm_setr_epi16(ROUNDING, ROUNDING, ROUNDING, ROUNDING, 0, 0, 0, 0); + + let shuffle_weights = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3); + + let (row0_ref, rest) = dst.split_at_mut(dst_stride); + let (row1_ref, rest) = rest.split_at_mut(dst_stride); + let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); + + let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS); + let iter_row1 = row1_ref.chunks_exact_mut(CHANNELS); + let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS); + let iter_row3 = row3_ref.chunks_exact_mut(CHANNELS); + + for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 + .zip(iter_row1) + .zip(iter_row2) + .zip(iter_row3) + .zip(filter_weights.bounds.iter()) + .zip( + filter_weights + .weights + .chunks_exact(filter_weights.aligned_size), + ) + { + let mut jx = 0usize; + let mut store_0 = vld; + let mut store_1 = vld; + let mut store_2 = vld; + let mut store_3 = vld; + + let src0 = src; + let src1 = src0.get_unchecked(src_stride..); + let src2 = src1.get_unchecked(src_stride..); + let src3 = src2.get_unchecked(src_stride..); + + while jx + 8 < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 8)); + + let weight01 = _mm_shuffle_epi8( + _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()), + shuffle_weights, + ); + let weight23 = _mm_shuffle_epi8( + _mm_set1_epi32((w_ptr.get_unchecked(2..).as_ptr() as *const i32).read_unaligned()), + shuffle_weights, + ); + + let weight45 = _mm_shuffle_epi8( + _mm_set1_epi32((w_ptr.get_unchecked(4..).as_ptr() as *const i32).read_unaligned()), + shuffle_weights, + ); + + let weight67 = _mm_shuffle_epi8( + _mm_set1_epi32((w_ptr.get_unchecked(6..).as_ptr() as *const i32).read_unaligned()), + shuffle_weights, + ); + + let start_bounds = bounds.start + jx; + + let rgb_pixel_0 = _mm_loadu_si128( + src0.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, + ); + let rgb_pixel_0_1 = _mm_loadu_si128( + src0.get_unchecked((start_bounds * CHANNELS + 16)..) + .as_ptr() as *const __m128i, + ); + let rgb_pixel_1 = _mm_loadu_si128( + src1.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, + ); + let rgb_pixel_1_0 = _mm_loadu_si128( + src1.get_unchecked((start_bounds * CHANNELS + 16)..) + .as_ptr() as *const __m128i, + ); + let rgb_pixel_2 = _mm_loadu_si128( + src2.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, ); + let rgb_pixel_2_1 = _mm_loadu_si128( + src2.get_unchecked((start_bounds * CHANNELS + 16)..) + .as_ptr() as *const __m128i, + ); + let rgb_pixel_3 = _mm_loadu_si128( + src3.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, + ); + let rgb_pixel_3_1 = _mm_loadu_si128( + src3.get_unchecked((start_bounds * CHANNELS + 16)..) 
+ .as_ptr() as *const __m128i, + ); + + store_0 = hdot4( + store_0, + rgb_pixel_0, + rgb_pixel_0_1, + weight01, + weight23, + weight45, + weight67, + ); + store_1 = hdot4( + store_1, + rgb_pixel_1, + rgb_pixel_1_0, + weight01, + weight23, + weight45, + weight67, + ); + store_2 = hdot4( + store_2, + rgb_pixel_2, + rgb_pixel_2_1, + weight01, + weight23, + weight45, + weight67, + ); + store_3 = hdot4( + store_3, + rgb_pixel_3, + rgb_pixel_3_1, + weight01, + weight23, + weight45, + weight67, + ); + + jx += 8; } + + while jx + 4 < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 4)); + + let weight01 = _mm_shuffle_epi8( + _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()), + shuffle_weights, + ); + let weight23 = _mm_shuffle_epi8( + _mm_set1_epi32((w_ptr.get_unchecked(2..).as_ptr() as *const i32).read_unaligned()), + shuffle_weights, + ); + let start_bounds = bounds.start + jx; + + let rgb_pixel_0 = _mm_loadu_si128( + src0.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, + ); + let rgb_pixel_1 = _mm_loadu_si128( + src1.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, + ); + let rgb_pixel_2 = _mm_loadu_si128( + src2.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, + ); + let rgb_pixel_3 = _mm_loadu_si128( + src3.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, + ); + + store_0 = hdot2(store_0, rgb_pixel_0, weight01, weight23); + store_1 = hdot2(store_1, rgb_pixel_1, weight01, weight23); + store_2 = hdot2(store_2, rgb_pixel_2, weight01, weight23); + store_3 = hdot2(store_3, rgb_pixel_3, weight01, weight23); + + jx += 4; + } + + while jx + 2 < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 2)); + let bounds_start = bounds.start + jx; + + let weight01 = _mm_shuffle_epi8( + _mm_set1_epi32((w_ptr.as_ptr() as *const i32).read_unaligned()), + shuffle_weights, + ); + + let rgb_pixel_0 = + _mm_loadu_si64(src0.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); + let rgb_pixel_1 = + _mm_loadu_si64(src1.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); + let rgb_pixel_2 = + _mm_loadu_si64(src2.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); + let rgb_pixel_3 = + _mm_loadu_si64(src3.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); + + store_0 = hdot(store_0, rgb_pixel_0, weight01); + store_1 = hdot(store_1, rgb_pixel_1, weight01); + store_2 = hdot(store_2, rgb_pixel_2, weight01); + store_3 = hdot(store_3, rgb_pixel_3, weight01); + + jx += 2; + } + + while jx < bounds.size { + let w_ptr = weights.get_unchecked(jx..(jx + 1)); + + let weight0 = _mm_set1_epi16(w_ptr[0]); + + let start_bounds = bounds.start + jx; + + store_0 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src0, weight0, store_0); + store_1 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src1, weight0, store_1); + store_2 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src2, weight0, store_2); + store_3 = convolve_horizontal_parts_one_rgba_sse(start_bounds, src3, weight0, store_3); + jx += 1; + } + + store_0 = _mm_add_hi_lo_epi16(store_0); + store_1 = _mm_add_hi_lo_epi16(store_1); + store_2 = _mm_add_hi_lo_epi16(store_2); + store_3 = _mm_add_hi_lo_epi16(store_3); + + let store_16_8_0 = _mm_srai_epi16::(store_0); + let store_16_8_1 = _mm_srai_epi16::(store_1); + let store_16_8_2 = _mm_srai_epi16::(store_2); + let store_16_8_3 = _mm_srai_epi16::(store_3); + + _mm_storeu_si32( + chunk0.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_0, store_16_8_0), + ); + _mm_storeu_si32( + 
chunk1.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_1, store_16_8_1), + ); + _mm_storeu_si32( + chunk2.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_2, store_16_8_2), + ); + _mm_storeu_si32( + chunk3.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_3, store_16_8_3), + ); } } diff --git a/src/sse/u8_utils.rs b/src/sse/u8_utils.rs index 7b6e5ec..c79518a 100644 --- a/src/sse/u8_utils.rs +++ b/src/sse/u8_utils.rs @@ -36,9 +36,11 @@ use crate::support::PRECISION; #[inline(always)] pub(crate) fn compress_i32(x: __m128i) -> __m128i { - let store_32 = unsafe { _mm_srai_epi32::(x) }; - let store_16 = unsafe { _mm_packus_epi32(store_32, store_32) }; - unsafe { _mm_packus_epi16(store_16, store_16) } + unsafe { + let store_32 = _mm_srai_epi32::(x); + let store_16 = _mm_packus_epi32(store_32, store_32); + _mm_packus_epi16(store_16, store_16) + } } #[inline] diff --git a/src/sse/vertical_u16.rs b/src/sse/vertical_u16.rs index a1ca78b..eb2de77 100644 --- a/src/sse/vertical_u16.rs +++ b/src/sse/vertical_u16.rs @@ -144,19 +144,19 @@ unsafe fn convolve_column_lb_u16_impl( } let v_st0 = _mm_min_epi32( - _mm_cvtps_epi32(_mm_round_ps::(_mm_max_ps(store0, zeros_ps))), + _mm_cvtps_epi32(_mm_round_ps::(store0)), v_max_colors, ); let v_st1 = _mm_min_epi32( - _mm_cvtps_epi32(_mm_round_ps::(_mm_max_ps(store1, zeros_ps))), + _mm_cvtps_epi32(_mm_round_ps::(store1)), v_max_colors, ); let v_st2 = _mm_min_epi32( - _mm_cvtps_epi32(_mm_round_ps::(_mm_max_ps(store2, zeros_ps))), + _mm_cvtps_epi32(_mm_round_ps::(store2)), v_max_colors, ); let v_st3 = _mm_min_epi32( - _mm_cvtps_epi32(_mm_round_ps::(_mm_max_ps(store3, zeros_ps))), + _mm_cvtps_epi32(_mm_round_ps::(store3)), v_max_colors, ); @@ -201,11 +201,11 @@ unsafe fn convolve_column_lb_u16_impl( } let v_st0 = _mm_min_epi32( - _mm_cvtps_epi32(_mm_round_ps::(_mm_max_ps(store0, zeros_ps))), + _mm_cvtps_epi32(_mm_round_ps::(store0)), v_max_colors, ); let v_st1 = _mm_min_epi32( - _mm_cvtps_epi32(_mm_round_ps::(_mm_max_ps(store1, zeros_ps))), + _mm_cvtps_epi32(_mm_round_ps::(store1)), v_max_colors, ); @@ -351,7 +351,7 @@ unsafe fn convolve_column_lb_u16_impl( } let v_st = _mm_min_epi32( - _mm_cvtps_epi32(_mm_round_ps::(_mm_max_ps(store0, zeros_ps))), + _mm_cvtps_epi32(_mm_round_ps::(store0)), v_max_colors, ); diff --git a/src/sse/vertical_u16_lb.rs b/src/sse/vertical_u16_lb.rs index 3f74c1e..1ee9d9b 100644 --- a/src/sse/vertical_u16_lb.rs +++ b/src/sse/vertical_u16_lb.rs @@ -58,7 +58,7 @@ unsafe fn convolve_column_lb_u16_impl( weight: &[i16], bit_depth: u32, ) { - assert!(bit_depth >= 1 && bit_depth <= 16); + assert!((1..=16).contains(&bit_depth)); let max_colors = (1 << bit_depth) - 1; let mut cx = 0usize; @@ -173,8 +173,8 @@ unsafe fn convolve_column_lb_u16_impl( if bounds_size == 2 { let weights = weight.get_unchecked(0..2); - let v_weight0 = _mm_set1_epi32(weights[0] as i32); - let v_weight1 = _mm_set1_epi32(weights[1] as i32); + let v_weight0 = _mm_set1_epi16(weights[0]); + let v_weight1 = _mm_set1_epi16(weights[1]); let py = bounds.start; let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); @@ -194,9 +194,9 @@ unsafe fn convolve_column_lb_u16_impl( } else if bounds_size == 3 { let weights = weight.get_unchecked(0..3); - let v_weight0 = _mm_set1_epi32(weights[0] as i32); - let v_weight1 = _mm_set1_epi32(weights[1] as i32); - let v_weight2 = _mm_set1_epi32(weights[2] as i32); + let v_weight0 = _mm_set1_epi16(weights[0]); + let v_weight1 = _mm_set1_epi16(weights[1]); + let v_weight2 = _mm_set1_epi16(weights[2]); let py = bounds.start; let 
src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); @@ -223,10 +223,10 @@ unsafe fn convolve_column_lb_u16_impl( } else if bounds_size == 4 { let weights = weight.get_unchecked(0..4); - let v_weight0 = _mm_set1_epi32(weights[0] as i32); - let v_weight1 = _mm_set1_epi32(weights[1] as i32); - let v_weight2 = _mm_set1_epi32(weights[2] as i32); - let v_weight3 = _mm_set1_epi32(weights[3] as i32); + let v_weight0 = _mm_set1_epi16(weights[0]); + let v_weight1 = _mm_set1_epi16(weights[1]); + let v_weight2 = _mm_set1_epi16(weights[2]); + let v_weight3 = _mm_set1_epi16(weights[3]); let py = bounds.start; let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); @@ -262,7 +262,7 @@ unsafe fn convolve_column_lb_u16_impl( let py = bounds.start + j; let src_ptr = src.get_unchecked((src_stride * py + v_dx)..); - let v_weight = _mm_set1_epi32(k_weight as i32); + let v_weight = _mm_set1_epi16(k_weight); let item_row = _mm_loadu_si64(src_ptr.as_ptr() as *const u8); diff --git a/src/sse/vertical_u8.rs b/src/sse/vertical_u8.rs index ab567d7..060901e 100644 --- a/src/sse/vertical_u8.rs +++ b/src/sse/vertical_u8.rs @@ -224,22 +224,28 @@ pub(crate) unsafe fn convolve_vertical_part_sse_8( if bounds_size == 2 { let py = start_y; let weight = filter.get_unchecked(0..2); - let v_weight0 = _mm_set1_epi32(weight[0] as i32); - let v_weight1 = _mm_set1_epi32(weight[1] as i32); + let v_weight0 = _mm_set1_epi16(weight[0]); + let v_weight1 = _mm_set1_epi16(weight[1]); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let item_row0 = _mm_loadu_si64(src_ptr0.as_ptr()); let item_row1 = _mm_loadu_si64(src_ptr1.as_ptr()); let low0 = _mm_unpacklo_epi8(item_row0, zeros); - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(low0), v_weight0)); + store_0 = _mm_add_epi32( + store_0, + _mm_madd_epi16(_mm_unpacklo_epi16(low0, zeros), v_weight0), + ); store_1 = _mm_add_epi32( store_1, _mm_madd_epi16(_mm_unpackhi_epi16(low0, zeros), v_weight0), ); let low1 = _mm_unpacklo_epi8(item_row1, zeros); - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(low1), v_weight1)); + store_0 = _mm_add_epi32( + store_0, + _mm_madd_epi16(_mm_unpacklo_epi16(low1, zeros), v_weight1), + ); store_1 = _mm_add_epi32( store_1, _mm_madd_epi16(_mm_unpackhi_epi16(low1, zeros), v_weight1), @@ -247,9 +253,9 @@ pub(crate) unsafe fn convolve_vertical_part_sse_8( } else if bounds_size == 3 { let py = start_y; let weight = filter.get_unchecked(0..3); - let v_weight0 = _mm_set1_epi32(weight[0] as i32); - let v_weight1 = _mm_set1_epi32(weight[1] as i32); - let v_weight2 = _mm_set1_epi32(weight[2] as i32); + let v_weight0 = _mm_set1_epi16(weight[0]); + let v_weight1 = _mm_set1_epi16(weight[1]); + let v_weight2 = _mm_set1_epi16(weight[2]); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); @@ -258,21 +264,30 @@ pub(crate) unsafe fn convolve_vertical_part_sse_8( let item_row2 = _mm_loadu_si64(src_ptr2.as_ptr()); let low0 = _mm_unpacklo_epi8(item_row0, zeros); - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(low0), v_weight0)); + store_0 = _mm_add_epi32( + store_0, + _mm_madd_epi16(_mm_unpacklo_epi16(low0, zeros), v_weight0), + ); store_1 = _mm_add_epi32( store_1, _mm_madd_epi16(_mm_unpackhi_epi16(low0, zeros), v_weight0), ); let low1 = _mm_unpacklo_epi8(item_row1, zeros); - store_0 = 
_mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(low1), v_weight1)); + store_0 = _mm_add_epi32( + store_0, + _mm_madd_epi16(_mm_unpacklo_epi16(low1, zeros), v_weight1), + ); store_1 = _mm_add_epi32( store_1, _mm_madd_epi16(_mm_unpackhi_epi16(low1, zeros), v_weight1), ); let low2 = _mm_unpacklo_epi8(item_row2, zeros); - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(low2), v_weight2)); + store_0 = _mm_add_epi32( + store_0, + _mm_madd_epi16(_mm_unpacklo_epi16(low2, zeros), v_weight2), + ); store_1 = _mm_add_epi32( store_1, _mm_madd_epi16(_mm_unpackhi_epi16(low2, zeros), v_weight2), @@ -280,10 +295,10 @@ pub(crate) unsafe fn convolve_vertical_part_sse_8( } else if bounds_size == 4 { let py = start_y; let weight = filter.get_unchecked(0..4); - let v_weight0 = _mm_set1_epi32(weight[0] as i32); - let v_weight1 = _mm_set1_epi32(weight[1] as i32); - let v_weight2 = _mm_set1_epi32(weight[2] as i32); - let v_weight3 = _mm_set1_epi32(weight[3] as i32); + let v_weight0 = _mm_set1_epi16(weight[0]); + let v_weight1 = _mm_set1_epi16(weight[1]); + let v_weight2 = _mm_set1_epi16(weight[2]); + let v_weight3 = _mm_set1_epi16(weight[3]); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); @@ -294,28 +309,40 @@ pub(crate) unsafe fn convolve_vertical_part_sse_8( let item_row3 = _mm_loadu_si64(src_ptr3.as_ptr()); let low0 = _mm_unpacklo_epi8(item_row0, zeros); - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(low0), v_weight0)); + store_0 = _mm_add_epi32( + store_0, + _mm_madd_epi16(_mm_unpacklo_epi16(low0, zeros), v_weight0), + ); store_1 = _mm_add_epi32( store_1, _mm_madd_epi16(_mm_unpackhi_epi16(low0, zeros), v_weight0), ); let low1 = _mm_unpacklo_epi8(item_row1, zeros); - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(low1), v_weight1)); + store_0 = _mm_add_epi32( + store_0, + _mm_madd_epi16(_mm_unpacklo_epi16(low1, zeros), v_weight1), + ); store_1 = _mm_add_epi32( store_1, _mm_madd_epi16(_mm_unpackhi_epi16(low1, zeros), v_weight1), ); let low2 = _mm_unpacklo_epi8(item_row2, zeros); - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(low2), v_weight2)); + store_0 = _mm_add_epi32( + store_0, + _mm_madd_epi16(_mm_unpacklo_epi16(low2, zeros), v_weight2), + ); store_1 = _mm_add_epi32( store_1, _mm_madd_epi16(_mm_unpackhi_epi16(low2, zeros), v_weight2), ); let low3 = _mm_unpacklo_epi8(item_row3, zeros); - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(low3), v_weight3)); + store_0 = _mm_add_epi32( + store_0, + _mm_madd_epi16(_mm_unpacklo_epi16(low3, zeros), v_weight3), + ); store_1 = _mm_add_epi32( store_1, _mm_madd_epi16(_mm_unpackhi_epi16(low3, zeros), v_weight3), @@ -324,12 +351,15 @@ pub(crate) unsafe fn convolve_vertical_part_sse_8( for j in 0..bounds_size { let py = start_y + j; let weight = *filter.get_unchecked(j); - let v_weight = _mm_set1_epi32(weight as i32); + let v_weight = _mm_set1_epi16(weight); let src_ptr = src.get_unchecked((src_stride * py + px)..); let item_row = _mm_loadu_si64(src_ptr.as_ptr()); let low = _mm_unpacklo_epi8(item_row, zeros); - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(low), v_weight)); + store_0 = _mm_add_epi32( + store_0, + _mm_madd_epi16(_mm_unpacklo_epi16(low, zeros), v_weight), + ); store_1 = _mm_add_epi32( store_1, _mm_madd_epi16(_mm_unpackhi_epi16(low, zeros), v_weight), @@ -368,8 +398,8 @@ pub(crate) 
unsafe fn convolve_vertical_part_sse( if bounds_size == 2 { let py = start_y; let weight = filter.get_unchecked(0..2); - let v_weight0 = _mm_set1_epi32(weight[0] as i32); - let v_weight1 = _mm_set1_epi32(weight[1] as i32); + let v_weight0 = _mm_set1_epi16(weight[0]); + let v_weight1 = _mm_set1_epi16(weight[1]); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let item_row0 = @@ -382,9 +412,9 @@ pub(crate) unsafe fn convolve_vertical_part_sse( } else if bounds_size == 3 { let py = start_y; let weight = filter.get_unchecked(0..3); - let v_weight0 = _mm_set1_epi32(weight[0] as i32); - let v_weight1 = _mm_set1_epi32(weight[1] as i32); - let v_weight2 = _mm_set1_epi32(weight[2] as i32); + let v_weight0 = _mm_set1_epi16(weight[0]); + let v_weight1 = _mm_set1_epi16(weight[1]); + let v_weight2 = _mm_set1_epi16(weight[2]); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); @@ -401,10 +431,10 @@ pub(crate) unsafe fn convolve_vertical_part_sse( } else if bounds_size == 4 { let py = start_y; let weight = filter.get_unchecked(0..4); - let v_weight0 = _mm_set1_epi32(weight[0] as i32); - let v_weight1 = _mm_set1_epi32(weight[1] as i32); - let v_weight2 = _mm_set1_epi32(weight[2] as i32); - let v_weight3 = _mm_set1_epi32(weight[3] as i32); + let v_weight0 = _mm_set1_epi16(weight[0]); + let v_weight1 = _mm_set1_epi16(weight[1]); + let v_weight2 = _mm_set1_epi16(weight[2]); + let v_weight3 = _mm_set1_epi16(weight[3]); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); diff --git a/src/sse/vertical_u8_lp.rs b/src/sse/vertical_u8_lp.rs index 4236b58..f10fdc8 100644 --- a/src/sse/vertical_u8_lp.rs +++ b/src/sse/vertical_u8_lp.rs @@ -77,6 +77,7 @@ unsafe fn convolve_vertical_sse_row_impl( let mut cx = 0usize; let mut rem = dst; + let iter_64 = rem.chunks_exact_mut(64); for dst in iter_64 { @@ -140,6 +141,7 @@ unsafe fn convolve_vertical_sse_row_impl( } rem = rem.chunks_exact_mut(64).into_remainder(); + let iter_32 = rem.chunks_exact_mut(32); for dst in iter_32 {
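// Editor's note (illustrative sketch, not part of the patch): the vertical u8 path
// above now widens each loaded row with `_mm_unpacklo/_mm_unpackhi_epi16(.., zeros)`
// and splats the i16 weight with `_mm_set1_epi16`, so `_mm_madd_epi16` degenerates
// to a plain 16x16 -> 32-bit multiply per lane (`pixel * w + 0 * w`). A minimal
// accumulation step in that style, with a hypothetical helper name:
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn accumulate_row_8(acc: (__m128i, __m128i), row: &[u8; 8], w: i16) -> (__m128i, __m128i) {
    let zeros = _mm_setzero_si128();
    let v_weight = _mm_set1_epi16(w);
    // Load 8 source bytes and widen them to 8 x i16.
    let item_row = _mm_loadl_epi64(row.as_ptr() as *const __m128i);
    let low = _mm_unpacklo_epi8(item_row, zeros);
    // Zero-interleave before madd: each i32 lane is pixel * w.
    let lo = _mm_add_epi32(acc.0, _mm_madd_epi16(_mm_unpacklo_epi16(low, zeros), v_weight));
    let hi = _mm_add_epi32(acc.1, _mm_madd_epi16(_mm_unpackhi_epi16(low, zeros), v_weight));
    (lo, hi)
}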