diff --git a/app/src/main.rs b/app/src/main.rs
index b63e78e..42ad481 100644
--- a/app/src/main.rs
+++ b/app/src/main.rs
@@ -13,8 +13,9 @@ use fast_image_resize::{
 use image::{EncodableLayout, GenericImageView, ImageReader};
 use pic_scale::{
     Ar30ByteOrder, ImageSize, ImageStore, ImageStoreMut, ImageStoreScaling, ResamplingFunction,
-    RgbF16ImageStore, RgbF16ImageStoreMut, Rgba16ImageStoreMut, RgbaF16ImageStore,
-    RgbaF16ImageStoreMut, Scaler, Scaling, ScalingU16, ThreadingPolicy, WorkloadStrategy,
+    RgbF16ImageStore, RgbF16ImageStoreMut, Rgba16ImageStore, Rgba16ImageStoreMut,
+    RgbaF16ImageStore, RgbaF16ImageStoreMut, Scaler, Scaling, ScalingU16, ThreadingPolicy,
+    WorkloadStrategy,
 };
 
 fn resize_plane(
@@ -51,26 +52,26 @@ fn main() {
         .decode()
         .unwrap();
     let dimensions = img.dimensions();
-    let transient = img.to_rgb8();
+    let transient = img.to_rgba8();
     let mut bytes = Vec::from(transient.as_bytes());
 
     let mut scaler = Scaler::new(ResamplingFunction::Lanczos3);
     scaler.set_threading_policy(ThreadingPolicy::Single);
-    scaler.set_workload_strategy(WorkloadStrategy::PreferSpeed);
+    scaler.set_workload_strategy(WorkloadStrategy::PreferQuality);
 
     // resize_plane(378, 257, 257, 257, ResamplingFunction::Bilinear);
 
-    // let mut choke: Vec<u16> = bytes.iter().map(|&x| (x as u16) << 2).collect();
+    let mut choke: Vec<u16> = bytes.iter().map(|&x| (x as u16) << 2).collect();
 
-    let rgb_feature16 = transient
-        .iter()
-        .map(|&x| (x as f32 / 255f32) as f16)
-        .collect::<Vec<_>>();
+    // let rgb_feature16 = transient
+    //     .iter()
+    //     .map(|&x| (x as f32 / 255f32) as f16)
+    //     .collect::<Vec<_>>();
     //
-    let store =
-        RgbF16ImageStore::from_slice(&rgb_feature16, dimensions.0 as usize, dimensions.1 as usize)
-            .unwrap();
+    let mut store =
+        Rgba16ImageStore::from_slice(&choke, dimensions.0 as usize, dimensions.1 as usize).unwrap();
+    store.bit_depth = 10;
 
     // let dst_size = ImageSize::new(dimensions.0 as usize / 4, dimensions.1 as usize / 4);
     //
     // let mut resized_ar = vec![0u32; dst_size.width * dst_size.height];
@@ -84,15 +85,17 @@ fn main() {
     // //     )
     // //     .unwrap();
     //
-    let mut dst_store = RgbF16ImageStoreMut::alloc_with_depth(
-        dimensions.0 as usize / 4,
-        dimensions.1 as usize / 4,
+    let mut dst_store = Rgba16ImageStoreMut::alloc_with_depth(
+        dimensions.0 as usize / 2,
+        dimensions.1 as usize / 2,
         10,
     );
     //
     // // for i in 0..25 {
     //     let start_time = Instant::now();
-    scaler.resize_rgb_f16(&store, &mut dst_store).unwrap();
+    scaler
+        .resize_rgba_u16(&store, &mut dst_store, false)
+        .unwrap();
     //
     //     let elapsed_time = start_time.elapsed();
     //     // Print the elapsed time in milliseconds
@@ -134,17 +137,17 @@ fn main() {
     // //     .map(|&x| (x * 255f32) as u8)
     // //     .collect();
     //
-    // // let dst: Vec<u8> = dst_store
-    // //     .as_bytes()
-    // //     .iter()
-    // //     .map(|&x| (x >> 2) as u8)
-    // //     .collect();
-    //
-    let dst = dst_store
+    let dst: Vec<u8> = dst_store
         .as_bytes()
         .iter()
-        .map(|&x| (x as f32 * 255.).round() as u8)
-        .collect::<Vec<_>>();
+        .map(|&x| (x >> 2) as u8)
+        .collect();
+
+    // let dst = dst_store
+    //     .as_bytes()
+    //     .iter()
+    //     .map(|&x| (x as f32 * 255.).round() as u8)
+    //     .collect::<Vec<_>>();
 
     // let dst = resized;
     // image::save_buffer(
     //     "converted.png",
diff --git a/fuzz/resize_cbcr_f16/resize_cbcr_f16.rs b/fuzz/resize_cbcr_f16/resize_cbcr_f16.rs
index f0422a1..db04a3d 100644
--- a/fuzz/resize_cbcr_f16/resize_cbcr_f16.rs
+++ b/fuzz/resize_cbcr_f16/resize_cbcr_f16.rs
@@ -29,11 +29,9 @@
 #![feature(f16)]
 #![no_main]
 
-use libfuzzer_sys::fuzz_target;
-use pic_scale::{
-    ImageStore, ImageStoreMut, ResamplingFunction, Scaler, WorkloadStrategy,
-};
 use core::f16;
+use libfuzzer_sys::fuzz_target;
+use pic_scale::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler, WorkloadStrategy};
 
 fuzz_target!(|data: (u16, u16, u16, u16, bool)| {
     let strategy = if data.4 {
diff --git a/fuzz/resize_rgb_f16/resize_rgb_f16.rs b/fuzz/resize_rgb_f16/resize_rgb_f16.rs
index 5d04a65..0446e31 100644
--- a/fuzz/resize_rgb_f16/resize_rgb_f16.rs
+++ b/fuzz/resize_rgb_f16/resize_rgb_f16.rs
@@ -31,9 +31,7 @@
 
 use core::f16;
 use libfuzzer_sys::fuzz_target;
-use pic_scale::{
-    ImageStore, ImageStoreMut, ResamplingFunction, Scaler, WorkloadStrategy,
-};
+use pic_scale::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler, WorkloadStrategy};
 
 fuzz_target!(|data: (u16, u16, u16, u16, bool)| {
     let strategy = if data.4 {
diff --git a/src/avx2/mod.rs b/src/avx2/mod.rs
index 72176a1..deb4bd1 100644
--- a/src/avx2/mod.rs
+++ b/src/avx2/mod.rs
@@ -45,6 +45,7 @@ mod vertical_f32;
 mod vertical_u16_lb;
 mod vertical_u8;
 mod vertical_u8_lp;
+mod rgba_u16_lb;
 
 #[cfg(feature = "nightly_f16")]
 pub(crate) use alpha_f16::{avx_premultiply_alpha_rgba_f16, avx_unpremultiply_alpha_rgba_f16};
@@ -73,3 +74,4 @@ pub(crate) use vertical_f32::convolve_vertical_avx_row_f32;
 pub(crate) use vertical_u16_lb::convolve_column_lb_avx2_u16;
 pub(crate) use vertical_u8::convolve_vertical_avx_row;
 pub(crate) use vertical_u8_lp::convolve_vertical_avx_row_lp;
+pub(crate) use rgba_u16_lb::{convolve_horizontal_rgba_avx_rows_4_u16, convolve_horizontal_rgba_avx_u16lp_row};
\ No newline at end of file
diff --git a/src/avx2/rgb_u8.rs b/src/avx2/rgb_u8.rs
index 3249ff9..93c3107 100644
--- a/src/avx2/rgb_u8.rs
+++ b/src/avx2/rgb_u8.rs
@@ -27,9 +27,9 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-use crate::avx2::utils::_mm256_dot16_avx_epi32;
+use crate::avx2::utils::{_mm256_dot16_avx_epi32, _mm_dot16_avx_epi32};
 use crate::filter_weights::FilterWeights;
-use crate::sse::{_mm_dot16_avx_epi32, compress_i32};
+use crate::sse::compress_i32;
 use crate::support::ROUNDING_CONST;
 #[cfg(target_arch = "x86")]
 use std::arch::x86::*;
diff --git a/src/avx512/rgba_u16_lp.rs b/src/avx2/rgba_u16_lb.rs
similarity index 73%
rename from src/avx512/rgba_u16_lp.rs
rename to src/avx2/rgba_u16_lb.rs
index 901f57d..62fb13a 100644
--- a/src/avx512/rgba_u16_lp.rs
+++ b/src/avx2/rgba_u16_lb.rs
@@ -27,6 +27,7 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+use crate::avx2::utils::{_mm256_dot16_avx_epi32, _mm_dot16_avx_epi32};
 use crate::filter_weights::FilterWeights;
 use crate::support::{PRECISION, ROUNDING_CONST};
 #[cfg(target_arch = "x86")]
@@ -35,7 +36,7 @@ use std::arch::x86::*;
 use std::arch::x86_64::*;
 
 #[inline(always)]
-unsafe fn acc_1_dot(
+unsafe fn acc_1_dot<const HAS_DOT: bool>(
     start_x: usize,
     src: &[u16],
     w0: __m128i,
@@ -45,11 +46,11 @@
     const COMPONENTS: usize = 4;
     let src_ptr = src.get_unchecked((start_x * COMPONENTS)..);
     let rgba_pixel = _mm_loadu_si64(src_ptr.as_ptr() as *const u8);
-    _mm_dpwssd_avx_epi32(store, _mm_shuffle_epi8(rgba_pixel, shuffle), w0)
+    _mm_dot16_avx_epi32::<HAS_DOT>(store, _mm_shuffle_epi8(rgba_pixel, shuffle), w0)
 }
 
 #[inline(always)]
-unsafe fn acc_2_dot(
+unsafe fn acc_2_dot<const HAS_DOT: bool>(
     start_x: usize,
     src: &[u16],
     w0: __m128i,
@@ -59,11 +60,11 @@
     const COMPONENTS: usize = 4;
     let src_ptr = src.get_unchecked((start_x * COMPONENTS)..);
     let rgba_pixel = _mm_loadu_si128(src_ptr.as_ptr() as *const _);
-    _mm_dpwssd_avx_epi32(store, _mm_shuffle_epi8(rgba_pixel, shuffle), w0)
+    _mm_dot16_avx_epi32::<HAS_DOT>(store, _mm_shuffle_epi8(rgba_pixel, shuffle), w0)
 }
 
 #[inline(always)]
-unsafe fn acc_4_dot(
+unsafe fn acc_4_dot<const HAS_DOT: bool>(
     start_x: usize,
     src: &[u16],
     w0: __m256i,
@@ -73,11 +74,11 @@
     const COMPONENTS: usize = 4;
     let src_ptr = src.get_unchecked((start_x * COMPONENTS)..);
     let rgba_pixel = _mm256_loadu_si256(src_ptr.as_ptr() as *const _);
-    _mm256_dpwssd_avx_epi32(store, _mm256_shuffle_epi8(rgba_pixel, shuffle), w0)
+    _mm256_dot16_avx_epi32::<HAS_DOT>(store, _mm256_shuffle_epi8(rgba_pixel, shuffle), w0)
 }
 
 #[inline(always)]
-unsafe fn acc_8_dot(
+unsafe fn acc_8_dot<const HAS_DOT: bool>(
     start_x: usize,
     src: &[u16],
     w0: __m256i,
@@ -90,11 +91,11 @@
     const COMPONENTS: usize = 4;
     let src_ptr = src.get_unchecked((start_x * COMPONENTS)..);
     let rgba_pixel0 = _mm256_loadu_si256(src_ptr.as_ptr() as *const _);
     let rgba_pixel1 = _mm256_loadu_si256(src_ptr.get_unchecked(16..).as_ptr() as *const _);
-    let p0 = _mm256_dpwssd_avx_epi32(store, _mm256_shuffle_epi8(rgba_pixel0, shuffle), w0);
-    _mm256_dpwssd_avx_epi32(p0, _mm256_shuffle_epi8(rgba_pixel1, shuffle), w1)
+    let p0 = _mm256_dot16_avx_epi32::<HAS_DOT>(store, _mm256_shuffle_epi8(rgba_pixel0, shuffle), w0);
+    _mm256_dot16_avx_epi32::<HAS_DOT>(p0, _mm256_shuffle_epi8(rgba_pixel1, shuffle), w1)
 }
 
-pub(crate) fn convolve_horizontal_rgba_vnni_rows_4_u16(
+pub(crate) fn convolve_horizontal_rgba_avx_rows_4_u16(
     src: &[u16],
     src_stride: usize,
     dst: &mut [u16],
@@ -103,7 +104,18 @@
     bit_depth: u32,
 ) {
     unsafe {
-        convolve_horizontal_rgba_vnni_rows_4_lb_impl(
+        #[cfg(feature = "nightly_avx512")]
+        if std::arch::is_x86_feature_detected!("avxvnni") {
+            return convolve_horizontal_rgba_avx_rows_4_lb_vn(
+                src,
+                src_stride,
+                dst,
+                dst_stride,
+                filter_weights,
+                bit_depth,
+            );
+        }
+        convolve_horizontal_rgba_avx_rows_4_lb_a(
             src,
             src_stride,
             dst,
@@ -114,8 +126,47 @@
     }
 }
 
+#[cfg(feature = "nightly_avx512")]
 #[target_feature(enable = "avxvnni", enable = "avx2")]
-unsafe fn convolve_horizontal_rgba_vnni_rows_4_lb_impl(
+unsafe fn convolve_horizontal_rgba_avx_rows_4_lb_vn(
+    src: &[u16],
+    src_stride: usize,
+    dst: &mut [u16],
+    dst_stride: usize,
+    filter_weights: &FilterWeights<i16>,
+    bit_depth: u32,
+) {
+    convolve_horizontal_rgba_avx_rows_4_lb_impl::<true>(
+        src,
+        src_stride,
+        dst,
+        dst_stride,
+        filter_weights,
+        bit_depth,
+    );
+}
+
+#[target_feature(enable = "avx2")]
+unsafe fn convolve_horizontal_rgba_avx_rows_4_lb_a(
+    src: &[u16],
+    src_stride: usize,
+    dst: &mut [u16],
+    dst_stride: usize,
+    filter_weights: &FilterWeights<i16>,
+    bit_depth: u32,
+) {
+    convolve_horizontal_rgba_avx_rows_4_lb_impl::<false>(
+        src,
+        src_stride,
+        dst,
+        dst_stride,
+        filter_weights,
+        bit_depth,
+    );
+}
+
+#[inline(always)]
+unsafe fn convolve_horizontal_rgba_avx_rows_4_lb_impl<const HAS_DOT: bool>(
     src: &[u16],
     src_stride: usize,
     dst: &mut [u16],
@@ -228,10 +279,10 @@ unsafe fn convolve_horizontal_rgba_vnni_rows_4_lb_impl(
                 a_shuffle_weights_table,
             );
             let bounds_start = bounds.start + jx;
-            store_0 = acc_8_dot(bounds_start, src0, w0, w1, store_0, a_shuffle_2_table);
-            store_1 = acc_8_dot(bounds_start, src1, w0, w1, store_1, a_shuffle_2_table);
-            store_2 = acc_8_dot(bounds_start, src2, w0, w1, store_2, a_shuffle_2_table);
-            store_3 = acc_8_dot(bounds_start, src3, w0, w1, store_3, a_shuffle_2_table);
+            store_0 = acc_8_dot::<HAS_DOT>(bounds_start, src0, w0, w1, store_0, a_shuffle_2_table);
+            store_1 = acc_8_dot::<HAS_DOT>(bounds_start, src1, w0, w1, store_1, a_shuffle_2_table);
+            store_2 = acc_8_dot::<HAS_DOT>(bounds_start, src2, w0, w1, store_2, a_shuffle_2_table);
+            store_3 = acc_8_dot::<HAS_DOT>(bounds_start, src3, w0, w1, store_3, a_shuffle_2_table);
             jx += 8;
         }
 
@@ -245,10 +296,10 @@ unsafe fn convolve_horizontal_rgba_vnni_rows_4_lb_impl(
                 ),
                 a_shuffle_weights_table,
             );
-            store_0 = acc_4_dot(bounds_start, src0, w0, store_0, a_shuffle_2_table);
-            store_1 = acc_4_dot(bounds_start, src1, w0, store_1, a_shuffle_2_table);
-            store_2 = acc_4_dot(bounds_start, src2, w0, store_2, a_shuffle_2_table);
-            store_3 = acc_4_dot(bounds_start, src3, w0, store_3, a_shuffle_2_table);
+            store_0 = acc_4_dot::<HAS_DOT>(bounds_start, src0, w0, store_0, a_shuffle_2_table);
+            store_1 = acc_4_dot::<HAS_DOT>(bounds_start, src1, w0, store_1, a_shuffle_2_table);
+            store_2 = acc_4_dot::<HAS_DOT>(bounds_start, src2, w0, store_2, a_shuffle_2_table);
+            store_3 = acc_4_dot::<HAS_DOT>(bounds_start, src3, w0, store_3, a_shuffle_2_table);
             jx += 4;
         }
 
@@ -276,10 +327,10 @@ unsafe fn convolve_horizontal_rgba_vnni_rows_4_lb_impl(
                 _mm_loadu_si32(w_ptr.as_ptr() as *const _),
                 shuffle_weights_table,
             );
-            store_0 = acc_2_dot(bounds_start, src0, w0, store_0, shuffle_2_table);
-            store_1 = acc_2_dot(bounds_start, src1, w0, store_1, shuffle_2_table);
-            store_2 = acc_2_dot(bounds_start, src2, w0, store_2, shuffle_2_table);
-            store_3 = acc_2_dot(bounds_start, src3, w0, store_3, shuffle_2_table);
+            store_0 = acc_2_dot::<HAS_DOT>(bounds_start, src0, w0, store_0, shuffle_2_table);
+            store_1 = acc_2_dot::<HAS_DOT>(bounds_start, src1, w0, store_1, shuffle_2_table);
+            store_2 = acc_2_dot::<HAS_DOT>(bounds_start, src2, w0, store_2, shuffle_2_table);
+            store_3 = acc_2_dot::<HAS_DOT>(bounds_start, src3, w0, store_3, shuffle_2_table);
             jx += 2;
         }
 
@@ -287,10 +338,10 @@ unsafe fn convolve_horizontal_rgba_vnni_rows_4_lb_impl(
             let w_ptr = weights.get_unchecked(jx..(jx + 1));
             let bounds_start = bounds.start + jx;
            let w0 = _mm_shuffle_epi8(_mm_set1_epi16(w_ptr[0]), shuffle_weights_table);
-            store_0 = acc_1_dot(bounds_start, src0, w0, store_0, shuffle_1_table);
-            store_1 = acc_1_dot(bounds_start, src1, w0, store_1, shuffle_1_table);
-            store_2 = acc_1_dot(bounds_start, src2, w0, store_2, shuffle_1_table);
-            store_3 = acc_1_dot(bounds_start, src3, w0, store_3, shuffle_1_table);
+            store_0 = acc_1_dot::<HAS_DOT>(bounds_start, src0, w0, store_0, shuffle_1_table);
+            store_1 = acc_1_dot::<HAS_DOT>(bounds_start, src1, w0, store_1, shuffle_1_table);
+            store_2 = acc_1_dot::<HAS_DOT>(bounds_start, src2, w0, store_2, shuffle_1_table);
+            store_3 = acc_1_dot::<HAS_DOT>(bounds_start, src3, w0, store_3, shuffle_1_table);
             jx += 1;
         }
 
@@ -311,19 +362,44 @@ unsafe fn convolve_horizontal_rgba_vnni_rows_4_lb_impl(
     }
 }
 
-pub(crate) fn convolve_horizontal_rgba_vnni_u16lp_row(
+pub(crate) fn convolve_horizontal_rgba_avx_u16lp_row(
     src: &[u16],
     dst: &mut [u16],
     filter_weights: &FilterWeights<i16>,
     bit_depth: u32,
 ) {
     unsafe {
-        convolve_horizontal_rgba_vnni_u16_row_impl(src, dst, filter_weights, bit_depth);
+        #[cfg(feature = "nightly_avx512")]
+        if std::arch::is_x86_feature_detected!("avxvnni") {
+            return convolve_horizontal_rgba_avx_u16_row_vn(src, dst, filter_weights, bit_depth);
+        }
+        convolve_horizontal_rgba_avx_u16_row_avx(src, dst, filter_weights, bit_depth);
     }
 }
 
+#[cfg(feature = "nightly_avx512")]
 #[target_feature(enable = "avxvnni", enable = "avx2")]
-unsafe fn convolve_horizontal_rgba_vnni_u16_row_impl(
+unsafe fn convolve_horizontal_rgba_avx_u16_row_vn(
+    src: &[u16],
+    dst: &mut [u16],
+    filter_weights: &FilterWeights<i16>,
+    bit_depth: u32,
+) {
+    convolve_horizontal_rgba_vnni_u16_row_impl::<true>(src, dst, filter_weights, bit_depth);
+}
+
+#[target_feature(enable = "avx2")]
+unsafe fn convolve_horizontal_rgba_avx_u16_row_avx(
+    src: &[u16],
+    dst: &mut [u16],
+    filter_weights: &FilterWeights<i16>,
+    bit_depth: u32,
+) {
+    convolve_horizontal_rgba_vnni_u16_row_impl::<false>(src, dst, filter_weights, bit_depth);
+}
+
+#[inline(always)]
+unsafe fn convolve_horizontal_rgba_vnni_u16_row_impl<const HAS_DOT: bool>(
     src: &[u16],
     dst: &mut [u16],
     filter_weights: &FilterWeights<i16>,
@@ -387,7 +463,7 @@ unsafe fn convolve_horizontal_rgba_vnni_u16_row_impl(
             a_shuffle_weights_table,
         );
         let bounds_start = bounds.start + jx;
-        store = acc_8_dot(bounds_start, src, w0, w1, store, a_shuffle_2_table);
+        store = acc_8_dot::<HAS_DOT>(bounds_start, src, w0, w1, store, a_shuffle_2_table);
         jx += 8;
     }
 
@@ -401,7 +477,7 @@ unsafe fn convolve_horizontal_rgba_vnni_u16_row_impl(
             a_shuffle_weights_table,
         );
         let bounds_start = bounds.start + jx;
-        store = acc_4_dot(bounds_start, src, w0, store, a_shuffle_2_table);
+        store = acc_4_dot::<HAS_DOT>(bounds_start, src, w0, store, a_shuffle_2_table);
         jx += 4;
     }
 
@@ -417,7 +493,7 @@ unsafe fn convolve_horizontal_rgba_vnni_u16_row_impl(
             _mm_loadu_si32(w_ptr.as_ptr() as *const _),
             shuffle_weights_table,
         );
-        store = acc_2_dot(bounds_start, src, w0, store, shuffle_2_table);
+        store = acc_2_dot::<HAS_DOT>(bounds_start, src, w0, store, shuffle_2_table);
         jx += 2;
     }
 
@@ -425,7 +501,7 @@ unsafe fn convolve_horizontal_rgba_vnni_u16_row_impl(
         let w_ptr = weights.get_unchecked(jx..(jx + 1));
         let w0 = _mm_shuffle_epi8(_mm_set1_epi16(w_ptr[0]), shuffle_weights_table);
         let bounds_start = bounds.start + jx;
-        store = acc_1_dot(bounds_start, src, w0, store, shuffle_1_table);
+        store = acc_1_dot::<HAS_DOT>(bounds_start, src, w0, store, shuffle_1_table);
         jx += 1;
     }
 
diff --git a/src/avx2/utils.rs b/src/avx2/utils.rs
index e2b9ebc..761484f 100644
--- a/src/avx2/utils.rs
+++ b/src/avx2/utils.rs
@@ -374,6 +374,27 @@ pub(crate) unsafe fn _mm256_dot16_avx_epi32<const HAS_DOT: bool>(
     }
 }
 
+#[allow(dead_code)]
+#[inline(always)]
+pub(crate) unsafe fn _mm_dot16_avx_epi32<const HAS_DOT: bool>(
+    a: __m128i,
+    b: __m128i,
+    c: __m128i,
+) -> __m128i {
+    #[cfg(feature = "nightly_avx512")]
+    {
+        if HAS_DOT {
+            _mm_dpwssd_avx_epi32(a, b, c)
+        } else {
+            _mm_add_epi32(a, _mm_madd_epi16(b, c))
+        }
+    }
+    #[cfg(not(feature = "nightly_avx512"))]
+    {
+        _mm_add_epi32(a, _mm_madd_epi16(b, c))
+    }
+}
+
 #[allow(dead_code)]
 #[inline(always)]
 pub(crate) unsafe fn _mm_udot8_epi16(
diff --git a/src/avx2/vertical_u16_lb.rs b/src/avx2/vertical_u16_lb.rs
index 72ac2c3..6319eb0 100644
--- a/src/avx2/vertical_u16_lb.rs
+++ b/src/avx2/vertical_u16_lb.rs
@@ -26,9 +26,8 @@
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
-use crate::avx2::utils::{_mm256_dot16_avx_epi32, avx2_pack_u32};
+use crate::avx2::utils::{_mm256_dot16_avx_epi32, _mm_dot16_avx_epi32, avx2_pack_u32};
 use crate::filter_weights::FilterBounds;
-use crate::sse::_mm_dot16_avx_epi32;
 use crate::support::{PRECISION, ROUNDING_CONST};
 #[cfg(target_arch = "x86")]
 use std::arch::x86::*;
diff --git a/src/handler_provider.rs b/src/handler_provider.rs
index e790204..a960df0 100644
--- a/src/handler_provider.rs
+++ b/src/handler_provider.rs
@@ -423,10 +423,9 @@ impl RowHandlerFixedPoint for u16 {
         i16: AsPrimitive,
         u16: AsPrimitive,
     {
-        #[cfg(feature = "nightly_avx512")]
-        if COMPONENTS == 4 && std::arch::is_x86_feature_detected!("avxvnni") {
-            use crate::avx512::convolve_horizontal_rgba_vnni_rows_4_u16;
-            return convolve_horizontal_rgba_vnni_rows_4_u16(
+        if COMPONENTS == 4 && std::arch::is_x86_feature_detected!("avx2") {
+            use crate::avx2::convolve_horizontal_rgba_avx_rows_4_u16;
+            return convolve_horizontal_rgba_avx_rows_4_u16(
                 src,
                 src_stride,
                 dst,
@@ -531,10 +530,9 @@ impl RowHandlerFixedPoint for u16 {
         i16: AsPrimitive,
         u16: AsPrimitive,
     {
-        #[cfg(feature = "nightly_avx512")]
-        if COMPONENTS == 4 && std::arch::is_x86_feature_detected!("avxvnni") {
-            use crate::avx512::convolve_horizontal_rgba_vnni_u16lp_row;
-            return convolve_horizontal_rgba_vnni_u16lp_row(src, dst, filter_weights, bit_depth);
+        if COMPONENTS == 4 && std::arch::is_x86_feature_detected!("avx2") {
+            use crate::avx2::convolve_horizontal_rgba_avx_u16lp_row;
+            return convolve_horizontal_rgba_avx_u16lp_row(src, dst, filter_weights, bit_depth);
         }
         if COMPONENTS == 4 && std::arch::is_x86_feature_detected!("sse4.1") {
             convolve_horizontal_rgba_sse_u16_lb_row(src, dst, filter_weights, bit_depth);
diff --git a/src/sse/utils.rs b/src/sse/utils.rs
index 1a98903..d22df09 100644
--- a/src/sse/utils.rs
+++ b/src/sse/utils.rs
@@ -280,26 +280,6 @@ pub(crate) unsafe fn _mm_store3_u16(ptr: *mut u16, a: __m128i) {
         .write_unaligned(_mm_extract_epi16::<2>(a) as i16);
 }
 
-#[inline(always)]
-pub(crate) unsafe fn _mm_dot16_avx_epi32<const HAS_DOT: bool>(
-    a: __m128i,
-    b: __m128i,
-    c: __m128i,
-) -> __m128i {
-    #[cfg(feature = "nightly_avx512")]
-    {
-        if HAS_DOT {
-            _mm_dpwssd_avx_epi32(a, b, c)
-        } else {
-            _mm_add_epi32(a, _mm_madd_epi16(b, c))
-        }
-    }
-    #[cfg(not(feature = "nightly_avx512"))]
-    {
-        _mm_add_epi32(a, _mm_madd_epi16(b, c))
-    }
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;