diff --git a/Cargo.lock b/Cargo.lock index 2595aaff..4aa0e949 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1013,7 +1013,7 @@ dependencies = [ [[package]] name = "yuvutils-rs" -version = "0.7.0" +version = "0.8.0" dependencies = [ "fast_transpose", "num-traits", diff --git a/Cargo.toml b/Cargo.toml index 6d85a685..83411225 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ workspace = { members = ["app", "fuzz"] } [package] name = "yuvutils-rs" -version = "0.7.0" +version = "0.8.0" edition = "2021" description = "High performance utilities for YUV format handling and conversion." readme = "README.md" @@ -19,7 +19,7 @@ rust-version = "1.82.0" [dependencies] num-traits = "0.2.19" rayon = { version = "1.10.0", optional = true } -fast_transpose = "0.2.3" +fast_transpose = { version = "0.2.3", optional = true } [dev-dependencies] rand = "0.9.0" @@ -48,10 +48,11 @@ nightly_i8mm = [] fast_mode = [] # Enables `professional_mode` support on available paths professional_mode = [] -# Enables `rayon` support, use with care, in common, YUV encoding/decoding is more usually used in single thread mode +# Enables `rayon` support, use with care, in common, YUV encoding/decoding is more usually expected to be used in single thread mode rayon = ["dep:rayon"] # Support for Big-Endian YUV big_endian = [] +geometry = ["dep:fast_transpose"] rdp = [] [profile.dev.package] diff --git a/app/benches/yuv8/main.rs b/app/benches/yuv8/main.rs index fac0650b..a70fb4e0 100644 --- a/app/benches/yuv8/main.rs +++ b/app/benches/yuv8/main.rs @@ -749,6 +749,34 @@ pub fn criterion_benchmark(c: &mut Criterion) { }) }); + c.bench_function("yuvutils YCgCo 4:2:2 Full -> RGBA", |b| { + let mut rgb_bytes = vec![0u8; dimensions.0 as usize * 4 * dimensions.1 as usize]; + b.iter(|| { + yuv422_to_rgba( + &fixed_planar422, + &mut rgb_bytes, + dimensions.0 * 4u32, + YuvRange::Full, + YuvStandardMatrix::Bt601, + ) + .unwrap(); + }) + }); + + c.bench_function("yuvutils YCgCo 4:2:2 Limited -> RGBA", |b| { + let 
mut rgb_bytes = vec![0u8; dimensions.0 as usize * 4 * dimensions.1 as usize]; + b.iter(|| { + yuv422_to_rgba( + &fixed_planar422, + &mut rgb_bytes, + dimensions.0 * 4u32, + YuvRange::Limited, + YuvStandardMatrix::Bt601, + ) + .unwrap(); + }) + }); + c.bench_function("libyuv YUV 4:2:2 -> RGBA", |b| { let mut rgb_bytes = vec![0u8; dimensions.0 as usize * 4 * dimensions.1 as usize]; b.iter(|| unsafe { diff --git a/app/src/main.rs b/app/src/main.rs index f880dfee..7cb0bdb0 100644 --- a/app/src/main.rs +++ b/app/src/main.rs @@ -33,7 +33,17 @@ use image::{ColorType, DynamicImage, EncodableLayout, GenericImageView, ImageRea use std::fs::File; use std::io::Read; use std::time::Instant; -use yuvutils_rs::{rdp_rgba_to_yuv444, rdp_yuv444_to_rgba, YuvBiPlanarImageMut, YuvChromaSubsampling, YuvPlanarImageMut}; +use yuvutils_rs::{ + ar30_to_rgb8, i010_to_rgb10, i010_to_rgb_f16, i012_to_rgb12, i014_to_rgb14, i016_to_rgb16, + i210_to_rgb10, i210_to_rgb_f16, i210_to_rgba_f16, i214_to_rgb14, i214_to_rgb_f16, + i214_to_rgba14, i216_to_rgb16, i410_to_rgb10, i410_to_rgb_f16, i410_to_rgba10, i414_to_rgb14, + i414_to_rgb_f16, i416_to_rgb16, p210_to_ar30, p212_to_ar30, rgb10_to_i010, rgb10_to_i210, + rgb10_to_i410, rgb10_to_p210, rgb12_to_i012, rgb12_to_p212, rgb14_to_i014, rgb14_to_i214, + rgb14_to_i414, rgb16_to_i016, rgb16_to_i216, rgb16_to_i416, rgb_to_ycgco420, rgb_to_ycgco422, + rgb_to_ycgco444, rgba14_to_i214, rgba_to_ycgco420, ycgco420_to_rgb, ycgco420_to_rgba, + ycgco422_to_rgb, ycgco444_to_rgb, ycgco444_to_rgba, Rgb30ByteOrder, YuvBiPlanarImageMut, + YuvChromaSubsampling, YuvPlanarImageMut, YuvRange, YuvStandardMatrix, +}; fn read_file_bytes(file_path: &str) -> Result, String> { // Open the file @@ -93,7 +103,7 @@ fn main() { ); let mut planar_image = - YuvPlanarImageMut::::alloc(width as u32, height as u32, YuvChromaSubsampling::Yuv444); + YuvPlanarImageMut::::alloc(width as u32, height as u32, YuvChromaSubsampling::Yuv420); // let mut bytes_16: Vec = src_bytes .iter() @@ 
-101,12 +111,17 @@ fn main() { .collect(); let start_time = Instant::now(); - rdp_rgba_to_yuv444(&mut planar_image, src_bytes, rgba_stride as u32).unwrap(); + rgba_to_ycgco420( + &mut planar_image, + &src_bytes, + rgba_stride as u32, + YuvRange::Full, + ) + .unwrap(); println!("Forward time: {:?}", start_time.elapsed()); let fixed = planar_image.to_fixed(); rgba.fill(0); - rdp_yuv444_to_rgba(&fixed, &mut rgba, rgba_stride as u32).unwrap(); let fixed_biplanar = bi_planar_image.to_fixed(); let fixed_planar = planar_image.to_fixed(); @@ -114,6 +129,13 @@ fn main() { let mut j_rgba = vec![0u8; dimensions.0 as usize * dimensions.1 as usize * 4]; + ycgco420_to_rgba( + &fixed_planar, + &mut rgba, + dimensions.0 as u32 * 4, + YuvRange::Full, + ) + .unwrap(); // let mut rgba_f16: Vec = vec![0.; rgba.len()]; // // // i210_to_rgb_f16( diff --git a/fuzz/yuv_to_rgb/yuv_to_rgb.rs b/fuzz/yuv_to_rgb/yuv_to_rgb.rs index 9ea285b2..7ad3e126 100644 --- a/fuzz/yuv_to_rgb/yuv_to_rgb.rs +++ b/fuzz/yuv_to_rgb/yuv_to_rgb.rs @@ -31,9 +31,10 @@ use libfuzzer_sys::fuzz_target; use yuvutils_rs::{ - yuv420_alpha_to_rgba, yuv420_to_rgb, yuv420_to_rgba, yuv422_alpha_to_rgba, yuv422_to_rgb, - yuv422_to_rgba, yuv444_alpha_to_rgba, yuv444_to_rgb, yuv444_to_rgba, YuvPlanarImage, - YuvPlanarImageWithAlpha, YuvRange, YuvStandardMatrix, + ycgco420_to_rgb, ycgco420_to_rgba, ycgco422_to_rgb, ycgco422_to_rgba, yuv420_alpha_to_rgba, + yuv420_to_rgb, yuv420_to_rgba, yuv422_alpha_to_rgba, yuv422_to_rgb, yuv422_to_rgba, + yuv444_alpha_to_rgba, yuv444_to_rgb, yuv444_to_rgba, YuvPlanarImage, YuvPlanarImageWithAlpha, + YuvRange, YuvStandardMatrix, }; fuzz_target!(|data: (u8, u8, u8, u8, u8, u8)| { @@ -73,6 +74,14 @@ fn fuzz_yuv_420(i_width: u8, i_height: u8, y_value: u8, u_value: u8, v_value: u8 ) .unwrap(); + ycgco420_to_rgb( + &planar_image, + &mut target_rgb, + i_width as u32 * 3, + YuvRange::Full, + ) + .unwrap(); + let mut target_rgba = vec![0u8; i_width as usize * i_height as usize * 4]; yuv420_to_rgba( @@ 
-84,6 +93,14 @@ fn fuzz_yuv_420(i_width: u8, i_height: u8, y_value: u8, u_value: u8, v_value: u8 ) .unwrap(); + ycgco420_to_rgba( + &planar_image, + &mut target_rgba, + i_width as u32 * 4, + YuvRange::Full, + ) + .unwrap(); + let planar_image_with_alpha = YuvPlanarImageWithAlpha { y_plane: &y_plane, y_stride: i_width as u32, @@ -139,6 +156,14 @@ fn fuzz_yuv_422(i_width: u8, i_height: u8, y_value: u8, u_value: u8, v_value: u8 ) .unwrap(); + ycgco422_to_rgb( + &planar_image, + &mut target_rgb, + i_width as u32 * 3, + YuvRange::Full, + ) + .unwrap(); + let mut target_rgba = vec![0u8; i_width as usize * i_height as usize * 4]; yuv422_to_rgba( @@ -150,6 +175,14 @@ fn fuzz_yuv_422(i_width: u8, i_height: u8, y_value: u8, u_value: u8, v_value: u8 ) .unwrap(); + ycgco422_to_rgba( + &planar_image, + &mut target_rgba, + i_width as u32 * 4, + YuvRange::Full, + ) + .unwrap(); + let planar_image_with_alpha = YuvPlanarImageWithAlpha { y_plane: &y_plane, y_stride: i_width as u32, diff --git a/src/lib.rs b/src/lib.rs index 1dce1d2b..a169487d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -71,9 +71,11 @@ mod built_coefficients; mod f16_converter; mod from_identity; mod from_identity_alpha; +#[cfg(feature = "geometry")] mod geometry; mod images; mod internals; +#[cfg(feature = "geometry")] mod mirroring; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] mod neon; @@ -272,9 +274,9 @@ pub use ycgco_to_rgb::{ }; pub use ycgco_to_rgb::{ - icgc010_to_rgb, icgc010_to_rgba, icgc012_to_rgb, icgc012_to_rgba, icgc210_to_rgb, - icgc210_to_rgba, icgc212_to_rgb, icgc212_to_rgba, icgc410_to_rgb, icgc410_to_rgba, - icgc412_to_rgb, icgc412_to_rgba, + icgc010_to_rgb10, icgc010_to_rgba10, icgc012_to_rgb12, icgc012_to_rgba12, icgc210_to_rgb10, + icgc210_to_rgba10, icgc212_to_rgb12, icgc212_to_rgba12, icgc410_to_rgb10, icgc410_to_rgba10, + icgc412_to_rgb12, icgc412_to_rgba12, }; pub use yuv_nv_to_rgba::yuv_nv16_to_bgr; @@ -292,8 +294,8 @@ pub use ycgco_to_rgb_alpha::{ }; pub use 
ycgco_to_rgb_alpha::{ - icgc010_alpha_to_rgba, icgc012_alpha_to_rgba, icgc210_alpha_to_rgba, icgc212_alpha_to_rgba, - icgc410_alpha_to_rgba, icgc412_alpha_to_rgba, + icgc010_alpha_to_rgba10, icgc012_alpha_to_rgba12, icgc210_alpha_to_rgba10, + icgc212_alpha_to_rgba12, icgc410_alpha_to_rgba10, icgc412_alpha_to_rgba12, }; pub use yuv_to_yuy2::yuv420_to_uyvy422; @@ -475,11 +477,12 @@ pub use f16_converter::{ convert_rgba_f16_to_rgba16, convert_rgba_to_f16, }; +#[cfg(feature = "geometry")] pub use geometry::{ rotate_cbcr, rotate_cbcr16, rotate_plane, rotate_plane16, rotate_rgb, rotate_rgb16, rotate_rgba, rotate_rgba16, RotationMode, }; - +#[cfg(feature = "geometry")] pub use mirroring::{ mirror_cbcr, mirror_cbcr16, mirror_plane, mirror_plane16, mirror_rgb, mirror_rgb16, mirror_rgba, mirror_rgba16, MirrorMode, diff --git a/src/neon/mod.rs b/src/neon/mod.rs index 6b3cb56e..d833aa18 100644 --- a/src/neon/mod.rs +++ b/src/neon/mod.rs @@ -69,6 +69,7 @@ mod utils; mod y_p16_to_rgba16; mod y_to_rgb; mod y_to_rgb_alpha; +mod ycgco_to_rgb; mod yuv_nv_p10_to_ar30; mod yuv_nv_p10_to_rgba; #[cfg(feature = "professional_mode")] @@ -161,6 +162,7 @@ pub(crate) use y_to_rgb::neon_y_to_rgb_row_rdm; pub(crate) use y_to_rgb_alpha::neon_y_to_rgb_alpha_row; #[cfg(feature = "rdm")] pub(crate) use y_to_rgb_alpha::neon_y_to_rgb_row_alpha_rdm; +pub(crate) use ycgco_to_rgb::{neon_ycgco420_to_rgba_row, neon_ycgco_full_range_to_rgb}; pub(crate) use yuv_nv_p10_to_ar30::neon_yuv_nv12_p10_to_ar30_row; pub(crate) use yuv_nv_p10_to_rgba::neon_yuv_nv12_p10_to_rgba_row; #[cfg(feature = "professional_mode")] diff --git a/src/neon/ycgco_to_rgb.rs b/src/neon/ycgco_to_rgb.rs new file mode 100644 index 00000000..6a4cdcc4 --- /dev/null +++ b/src/neon/ycgco_to_rgb.rs @@ -0,0 +1,484 @@ +/* + * Copyright (c) Radzivon Bartoshyk, 2/2025. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+use crate::internals::ProcessedOffset;
+use crate::neon::utils::neon_store_rgb8;
+use crate::yuv_support::{YuvChromaRange, YuvSourceChannels};
+use crate::YuvChromaSubsampling;
+use std::arch::aarch64::*;
+
+pub(crate) unsafe fn neon_ycgco_full_range_to_rgb<
+    const DESTINATION_CHANNELS: u8,
+    const SAMPLING: u8,
+>(
+    y_plane: &[u8],
+    u_plane: &[u8],
+    v_plane: &[u8],
+    rgba: &mut [u8],
+    width: usize,
+    chroma_range: YuvChromaRange,
+) -> ProcessedOffset {
+    let chroma_subsampling: YuvChromaSubsampling = SAMPLING.into();
+    let destination_channels: YuvSourceChannels = DESTINATION_CHANNELS.into();
+    let channels = destination_channels.get_channels_count();
+
+    let mut cx = 0;
+    let mut uv_x = 0;
+
+    let y_ptr = y_plane.as_ptr();
+    let u_ptr = u_plane.as_ptr();
+    let v_ptr = v_plane.as_ptr();
+    let rgba_ptr = rgba.as_mut_ptr();
+
+    let bias_y = vdupq_n_u8(chroma_range.bias_y as u8);
+    let bias_uv = vdupq_n_u8(chroma_range.bias_uv as u8);
+
+    while cx + 16 < width { // strict `<`: the last 1..=16 pixels go through the tail below
+        let mut y_values = vld1q_u8(y_ptr.add(cx));
+
+        let u_high_u8: uint8x8_t;
+        let v_high_u8: uint8x8_t;
+        let u_low_u8: uint8x8_t;
+        let v_low_u8: uint8x8_t;
+
+        match chroma_subsampling {
+            YuvChromaSubsampling::Yuv420 | YuvChromaSubsampling::Yuv422 => {
+                let mut u_values = vld1_u8(u_ptr.add(uv_x));
+                let mut v_values = vld1_u8(v_ptr.add(uv_x));
+
+                u_values = vsub_u8(u_values, vget_low_u8(bias_uv)); // wrapping; read as i8 below
+                v_values = vsub_u8(v_values, vget_low_u8(bias_uv));
+
+                u_high_u8 = vzip2_u8(u_values, u_values); // each chroma sample covers 2 pixels
+                v_high_u8 = vzip2_u8(v_values, v_values);
+                u_low_u8 = vzip1_u8(u_values, u_values);
+                v_low_u8 = vzip1_u8(v_values, v_values);
+            }
+            YuvChromaSubsampling::Yuv444 => {
+                let mut u_values = vld1q_u8(u_ptr.add(uv_x));
+                let mut v_values = vld1q_u8(v_ptr.add(uv_x));
+
+                u_values = vsubq_u8(u_values, bias_uv);
+                v_values = vsubq_u8(v_values, bias_uv);
+
+                u_high_u8 = vget_high_u8(u_values);
+                v_high_u8 = vget_high_u8(v_values);
+                u_low_u8 = vget_low_u8(u_values);
+                v_low_u8 = vget_low_u8(v_values);
+            }
+        }
+
+        y_values = vqsubq_u8(y_values, bias_y); // saturating: clamps at 0
+
+        let y_high = vreinterpretq_s16_u16(vmovl_high_u8(y_values));
+        let y_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(y_values)));
+
+        let u_high = vmovl_s8(vreinterpret_s8_u8(u_high_u8));
+        let v_high = vmovl_s8(vreinterpret_s8_u8(v_high_u8));
+        let u_low = vmovl_s8(vreinterpret_s8_u8(u_low_u8));
+        let v_low = vmovl_s8(vreinterpret_s8_u8(v_low_u8));
+
+        let t_high = vsubq_s16(y_high, u_high); // t = Y - Cg
+        let t_low = vsubq_s16(y_low, u_low);
+
+        let r_h = vaddq_s16(t_high, v_high); // R = t + Co
+        let r_l = vaddq_s16(t_low, v_low);
+        let b_h = vsubq_s16(t_high, v_high); // B = t - Co
+        let b_l = vsubq_s16(t_low, v_low);
+        let g_h = vaddq_s16(y_high, u_high); // G = Y + Cg
+        let g_l = vaddq_s16(y_low, u_low);
+
+        let r_values = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
+        let g_values = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
+        let b_values = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
+
+        let dst_shift = cx * channels;
+
+        neon_store_rgb8::<DESTINATION_CHANNELS>(
+            rgba_ptr.add(dst_shift),
+            r_values,
+            g_values,
+            b_values,
+            vdupq_n_u8(255), // opaque alpha, same as the 4:2:0 two-row path
+        );
+
+        cx += 16;
+
+        match chroma_subsampling {
+            YuvChromaSubsampling::Yuv420 | YuvChromaSubsampling::Yuv422 => {
+                uv_x += 8;
+            }
+            YuvChromaSubsampling::Yuv444 => {
+                uv_x += 16;
+            }
+        }
+    }
+
+    if cx < width {
+        let diff = width - cx;
+
+        assert!(diff <= 16);
+
+        let mut dst_buffer: [u8; 16 * 4] = [0; 16 * 4];
+        let mut y_buffer: [u8; 16] = [0; 16];
+        let mut u_buffer: [u8; 16] = [0; 16];
+        let mut v_buffer: [u8; 16] = [0; 16];
+
+        std::ptr::copy_nonoverlapping(
+            y_plane.get_unchecked(cx..).as_ptr(),
+            y_buffer.as_mut_ptr(),
+            diff,
+        );
+
+        let ux_diff = match chroma_subsampling {
+            YuvChromaSubsampling::Yuv420 | YuvChromaSubsampling::Yuv422 => diff.div_ceil(2),
+            YuvChromaSubsampling::Yuv444 => diff,
+        };
+
+        std::ptr::copy_nonoverlapping(
+            u_plane.get_unchecked(uv_x..).as_ptr(),
+            u_buffer.as_mut_ptr(),
+            ux_diff,
+        );
+
+        std::ptr::copy_nonoverlapping(
+            v_plane.get_unchecked(uv_x..).as_ptr(),
+            v_buffer.as_mut_ptr(),
+            ux_diff,
+        );
+
+        let mut y_values = vld1q_u8(y_buffer.as_ptr());
+
+        let u_high_u8: uint8x8_t;
+        let v_high_u8: uint8x8_t;
+        let u_low_u8: uint8x8_t;
+        let v_low_u8: uint8x8_t;
+
+        match chroma_subsampling {
+            YuvChromaSubsampling::Yuv420 | YuvChromaSubsampling::Yuv422 => {
+                let mut u_values = vld1_u8(u_buffer.as_ptr());
+                let mut v_values = vld1_u8(v_buffer.as_ptr());
+
+                u_values = vsub_u8(u_values, vget_low_u8(bias_uv));
+                v_values = vsub_u8(v_values, vget_low_u8(bias_uv));
+
+                u_high_u8 = vzip2_u8(u_values, u_values);
+                v_high_u8 = vzip2_u8(v_values, v_values);
+                u_low_u8 = vzip1_u8(u_values, u_values);
+                v_low_u8 = vzip1_u8(v_values, v_values);
+            }
+            YuvChromaSubsampling::Yuv444 => {
+                let mut u_values = vld1q_u8(u_buffer.as_ptr());
+                let mut v_values = vld1q_u8(v_buffer.as_ptr());
+
+                u_values = vsubq_u8(u_values, bias_uv);
+                v_values = vsubq_u8(v_values, bias_uv);
+
+                u_high_u8 = vget_high_u8(u_values);
+                v_high_u8 = vget_high_u8(v_values);
+                u_low_u8 = vget_low_u8(u_values);
+                v_low_u8 = vget_low_u8(v_values);
+            }
+        }
+
+        y_values = vqsubq_u8(y_values, bias_y);
+
+        let y_high = vreinterpretq_s16_u16(vmovl_high_u8(y_values));
+        let y_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(y_values)));
+
+        let u_high = vmovl_s8(vreinterpret_s8_u8(u_high_u8));
+        let v_high = vmovl_s8(vreinterpret_s8_u8(v_high_u8));
+        let u_low = vmovl_s8(vreinterpret_s8_u8(u_low_u8));
+        let v_low = vmovl_s8(vreinterpret_s8_u8(v_low_u8));
+
+        let t_high = vsubq_s16(y_high, u_high);
+        let t_low = vsubq_s16(y_low, u_low);
+
+        let r_h = vaddq_s16(t_high, v_high);
+        let r_l = vaddq_s16(t_low, v_low);
+        let b_h = vsubq_s16(t_high, v_high);
+        let b_l = vsubq_s16(t_low, v_low);
+        let g_h = vaddq_s16(y_high, u_high);
+        let g_l = vaddq_s16(y_low, u_low);
+
+        let r_values = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
+        let g_values = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
+        let b_values = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
+
+        neon_store_rgb8::<DESTINATION_CHANNELS>(
+            dst_buffer.as_mut_ptr(),
+            r_values,
+            g_values,
+            b_values,
+            vdupq_n_u8(255), // opaque alpha, same as the main loop above
+        );
+
+        let dst_shift = cx * channels;
+        std::ptr::copy_nonoverlapping(
+            dst_buffer.as_mut_ptr(),
+            rgba.get_unchecked_mut(dst_shift..).as_mut_ptr(),
+            diff * channels,
+        );
+
+        cx += diff;
+        uv_x += ux_diff;
+    }
+
+    ProcessedOffset { cx, ux: uv_x }
+}
+
+/// NEON YCgCo 4:2:0 path: converts two luma rows that share one Cg/Co row; no range scaling.
+pub(crate) unsafe fn neon_ycgco420_to_rgba_row<const DESTINATION_CHANNELS: u8>(
+    y_plane0: &[u8],
+    y_plane1: &[u8],
+    u_plane: &[u8],
+    v_plane: &[u8],
+    rgba0: &mut [u8],
+    rgba1: &mut [u8],
+    width: u32,
+    chroma_range: YuvChromaRange,
+) -> ProcessedOffset {
+    let destination_channels: YuvSourceChannels = DESTINATION_CHANNELS.into();
+    let channels = destination_channels.get_channels_count();
+
+    let mut cx = 0usize;
+    let mut uv_x = 0usize;
+
+    let u_ptr = u_plane.as_ptr();
+    let v_ptr = v_plane.as_ptr();
+
+    let bias_y = vdupq_n_u8(chroma_range.bias_y as u8);
+    let bias_uv = vdupq_n_u8(chroma_range.bias_uv as u8);
+
+    while cx + 16 < width as usize {
+        let vl0 = vld1q_u8(y_plane0.get_unchecked(cx..).as_ptr());
+        let vl1 = vld1q_u8(y_plane1.get_unchecked(cx..).as_ptr());
+
+        let mut u_values = vld1_u8(u_ptr.add(uv_x));
+        let mut v_values = vld1_u8(v_ptr.add(uv_x));
+
+        u_values = vsub_u8(u_values, vget_low_u8(bias_uv));
+        v_values = vsub_u8(v_values, vget_low_u8(bias_uv));
+
+        let y_values0 = vqsubq_u8(vl0, bias_y);
+        let y_values1 = vqsubq_u8(vl1, bias_y);
+
+        let u_high_u8 = vzip2_u8(u_values, u_values);
+        let v_high_u8 = vzip2_u8(v_values, v_values);
+        let u_low_u8 = vzip1_u8(u_values, u_values);
+        let v_low_u8 = vzip1_u8(v_values, v_values);
+
+        let u_high = vmovl_s8(vreinterpret_s8_u8(u_high_u8));
+        let v_high = vmovl_s8(vreinterpret_s8_u8(v_high_u8));
+        let u_low = vmovl_s8(vreinterpret_s8_u8(u_low_u8));
+        let v_low = vmovl_s8(vreinterpret_s8_u8(v_low_u8));
+
+        let y_high0 = vreinterpretq_s16_u16(vmovl_high_u8(y_values0));
+        let y_low0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(y_values0)));
+
+        let y_high1 = vreinterpretq_s16_u16(vmovl_high_u8(y_values1));
+        let y_low1 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(y_values1)));
+
+        let t_high0 = vsubq_s16(y_high0, u_high);
+        let t_low0 = vsubq_s16(y_low0, u_low);
+
+        let t_high1 = vsubq_s16(y_high1, u_high);
+        let t_low1 = vsubq_s16(y_low1, u_low);
+
+        let r_h0 = vaddq_s16(t_high0, v_high);
+        let r_l0 = vaddq_s16(t_low0, v_low);
+        let b_h0 = vsubq_s16(t_high0, v_high);
+        let b_l0 = vsubq_s16(t_low0, v_low);
+        let g_h0 = vaddq_s16(y_high0, u_high);
+        let g_l0 = vaddq_s16(y_low0, u_low);
+
+        let r_h1 = vaddq_s16(t_high1, v_high);
+        let r_l1 = vaddq_s16(t_low1, v_low);
+        let b_h1 = vsubq_s16(t_high1, v_high);
+        let b_l1 = vsubq_s16(t_low1, v_low);
+        let g_h1 = vaddq_s16(y_high1, u_high);
+        let g_l1 = vaddq_s16(y_low1, u_low);
+
+        let r_values0 = vcombine_u8(vqmovun_s16(r_l0), vqmovun_s16(r_h0));
+        let g_values0 = vcombine_u8(vqmovun_s16(g_l0), vqmovun_s16(g_h0));
+        let b_values0 = vcombine_u8(vqmovun_s16(b_l0), vqmovun_s16(b_h0));
+
+        let r_values1 = vcombine_u8(vqmovun_s16(r_l1), vqmovun_s16(r_h1));
+        let g_values1 = vcombine_u8(vqmovun_s16(g_l1), vqmovun_s16(g_h1));
+        let b_values1 = vcombine_u8(vqmovun_s16(b_l1), vqmovun_s16(b_h1));
+
+        let dst_shift = cx * channels;
+
+        neon_store_rgb8::<DESTINATION_CHANNELS>(
+            rgba0.get_unchecked_mut(dst_shift..).as_mut_ptr(),
+            r_values0,
+            g_values0,
+            b_values0,
+            vdupq_n_u8(255),
+        );
+        neon_store_rgb8::<DESTINATION_CHANNELS>(
+            rgba1.get_unchecked_mut(dst_shift..).as_mut_ptr(),
+            r_values1,
+            g_values1,
+            b_values1,
+            vdupq_n_u8(255),
+        );
+
+        cx += 16;
+        uv_x += 8;
+    }
+
+    if cx < width as usize {
+        let diff = width as usize - cx;
+
+        assert!(diff <= 16);
+
+        let mut dst_buffer0: [u8; 16 * 4] = [0; 16 * 4];
+        let mut dst_buffer1: [u8; 16 * 4] = [0; 16 * 4];
+        let mut y_buffer0: [u8; 16] = [0; 16];
+        let mut y_buffer1: [u8; 16] = [0; 16];
+        let mut u_buffer: [u8; 16] = [0; 16];
+        let mut v_buffer: [u8; 16] = [0; 16];
+
+        std::ptr::copy_nonoverlapping(
+            y_plane0.get_unchecked(cx..).as_ptr(),
+            y_buffer0.as_mut_ptr(),
+            diff,
+        );
+
+        std::ptr::copy_nonoverlapping(
+            y_plane1.get_unchecked(cx..).as_ptr(),
+            y_buffer1.as_mut_ptr(),
+            diff,
+        );
+
+        let half_div = diff.div_ceil(2);
+
+        std::ptr::copy_nonoverlapping(
+            u_plane.get_unchecked(uv_x..).as_ptr(),
+            u_buffer.as_mut_ptr(),
+            half_div,
+        );
+
+        std::ptr::copy_nonoverlapping(
+            v_plane.get_unchecked(uv_x..).as_ptr(),
+            v_buffer.as_mut_ptr(),
+            half_div,
+        );
+
+        let vl0 = vld1q_u8(y_buffer0.as_ptr());
+        let vl1 = vld1q_u8(y_buffer1.as_ptr());
+
+        let mut u_values = vld1_u8(u_buffer.as_ptr());
+        let mut v_values = vld1_u8(v_buffer.as_ptr());
+
+        u_values = vsub_u8(u_values, vget_low_u8(bias_uv));
+        v_values = vsub_u8(v_values, vget_low_u8(bias_uv));
+
+        let y_values0 = vqsubq_u8(vl0, bias_y);
+        let y_values1 = vqsubq_u8(vl1, bias_y);
+
+        let u_high_u8 = vzip2_u8(u_values, u_values);
+        let v_high_u8 = vzip2_u8(v_values, v_values);
+        let u_low_u8 = vzip1_u8(u_values, u_values);
+        let v_low_u8 = vzip1_u8(v_values, v_values);
+
+        let u_high = vmovl_s8(vreinterpret_s8_u8(u_high_u8));
+        let v_high = vmovl_s8(vreinterpret_s8_u8(v_high_u8));
+        let u_low = vmovl_s8(vreinterpret_s8_u8(u_low_u8));
+        let v_low = vmovl_s8(vreinterpret_s8_u8(v_low_u8));
+
+        let y_high0 = vreinterpretq_s16_u16(vmovl_high_u8(y_values0));
+        let y_low0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(y_values0)));
+
+        let y_high1 = vreinterpretq_s16_u16(vmovl_high_u8(y_values1));
+        let y_low1 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(y_values1)));
+
+        let t_high0 = vsubq_s16(y_high0, u_high);
+        let t_low0 = vsubq_s16(y_low0, u_low);
+
+        let t_high1 = vsubq_s16(y_high1, u_high);
+        let t_low1 = vsubq_s16(y_low1, u_low);
+
+        let r_h0 = vaddq_s16(t_high0, v_high);
+        let r_l0 = vaddq_s16(t_low0, v_low);
+        let b_h0 = vsubq_s16(t_high0, v_high);
+        let b_l0 = vsubq_s16(t_low0, v_low);
+        let g_h0 = vaddq_s16(y_high0, u_high);
+        let g_l0 = vaddq_s16(y_low0, u_low);
+
+        let r_h1 = vaddq_s16(t_high1, v_high);
+        let r_l1 = vaddq_s16(t_low1, v_low);
+        let b_h1 = vsubq_s16(t_high1, v_high);
+        let b_l1 = vsubq_s16(t_low1, v_low);
+        let g_h1 = vaddq_s16(y_high1, u_high);
+        let g_l1 = vaddq_s16(y_low1, u_low);
+
+        let r_values0 = vcombine_u8(vqmovun_s16(r_l0), vqmovun_s16(r_h0));
+        let g_values0 = vcombine_u8(vqmovun_s16(g_l0), vqmovun_s16(g_h0));
+        let b_values0 = vcombine_u8(vqmovun_s16(b_l0), vqmovun_s16(b_h0));
+
+        let r_values1 = vcombine_u8(vqmovun_s16(r_l1), vqmovun_s16(r_h1));
+        let g_values1 = vcombine_u8(vqmovun_s16(g_l1), vqmovun_s16(g_h1));
+        let b_values1 = vcombine_u8(vqmovun_s16(b_l1), vqmovun_s16(b_h1));
+
+        neon_store_rgb8::<DESTINATION_CHANNELS>(
+            dst_buffer0.as_mut_ptr(),
+            r_values0,
+            g_values0,
+            b_values0,
+            vdupq_n_u8(255),
+        );
+        neon_store_rgb8::<DESTINATION_CHANNELS>(
+            dst_buffer1.as_mut_ptr(),
+            r_values1,
+            g_values1,
+            b_values1,
+            vdupq_n_u8(255),
+        );
+
+        let dst_shift = cx * channels;
+        std::ptr::copy_nonoverlapping(
+            dst_buffer0.as_mut_ptr(),
+            rgba0.get_unchecked_mut(dst_shift..).as_mut_ptr(),
+            diff * channels,
+        );
+
+        std::ptr::copy_nonoverlapping(
+            dst_buffer1.as_mut_ptr(),
+            rgba1.get_unchecked_mut(dst_shift..).as_mut_ptr(),
+            diff * channels,
+        );
+
+        cx += diff;
+        uv_x += half_div;
+    }
+
+    ProcessedOffset { cx, ux: uv_x }
+}
diff --git a/src/rgb_to_ycgco.rs b/src/rgb_to_ycgco.rs
index 9ef996a2..87984559 100644
--- a/src/rgb_to_ycgco.rs
+++ b/src/rgb_to_ycgco.rs
@@ -31,6 +31,10 @@ use crate::yuv_error::check_rgba_destination;
 #[allow(unused_imports)]
 use crate::yuv_support::*;
 use crate::{YuvError, YuvPlanarImageMut};
+#[cfg(feature = "rayon")]
+use rayon::iter::{IndexedParallelIterator, ParallelIterator};
+#[cfg(feature = "rayon")]
+use rayon::prelude::{ParallelSlice, ParallelSliceMut};
 
 fn rgbx_to_ycgco<const ORIGIN_CHANNELS: u8, const SAMPLING: u8>(
     image: &mut YuvPlanarImageMut<u8>,
@@ -39,141 +43,316 @@ fn rgbx_to_ycgco<const ORIGIN_CHANNELS: u8, const SAMPLING: u8>(
     range: YuvRange,
 ) -> Result<(), YuvError> {
     let chroma_subsampling: YuvChromaSubsampling = SAMPLING.into();
-    let source_channels: YuvSourceChannels =
ORIGIN_CHANNELS.into();
-    let channels = source_channels.get_channels_count();
+    let src_chans: YuvSourceChannels = ORIGIN_CHANNELS.into();
+    let channels = src_chans.get_channels_count();
+    const PRECISION: i32 = 13;
 
     let range = get_yuv_range(8, range);
-    let precision_scale = (1 << 8) as f32;
-    let bias_y = ((range.bias_y as f32 + 0.5f32) * precision_scale) as i32;
-    let bias_uv = ((range.bias_uv as f32 + 0.5f32) * precision_scale) as i32;
+    let precision_scale = (1 << PRECISION) as f32;
+    let rounding_const_bias: i32 = (1 << (PRECISION - 1)) - 1;
+    let bias_y = range.bias_y as i32 * (1 << PRECISION) + rounding_const_bias;
+    let bias_uv = range.bias_uv as i32 * (1 << PRECISION) + rounding_const_bias;
     let max_colors = (1 << 8) - 1i32;
 
     check_rgba_destination(rgba, rgba_stride, image.width, image.height, channels)?;
     image.check_constraints(chroma_subsampling)?;
 
-    let iterator_step = match chroma_subsampling {
-        YuvChromaSubsampling::Yuv420 => 2usize,
-        YuvChromaSubsampling::Yuv422 => 2usize,
-        YuvChromaSubsampling::Yuv444 => 1usize,
-    };
-
     let range_reduction_y =
         (range.range_y as f32 / max_colors as f32 * precision_scale).round() as i32;
     let range_reduction_uv =
         (range.range_uv as f32 / max_colors as f32 * precision_scale).round() as i32;
 
-    let mut y_offset = 0usize;
-    let mut cg_offset = 0usize;
-    let mut co_offset = 0usize;
-    let mut rgba_offset = 0usize;
+    let process_halved_chroma_row =
+        |y_plane: &mut [u8], u_plane: &mut [u8], v_plane: &mut [u8], rgba: &[u8]| {
+            for (((y_dst, u_dst), v_dst), rgba) in y_plane
+                .chunks_exact_mut(2)
+                .zip(u_plane.iter_mut())
+                .zip(v_plane.iter_mut())
+                .zip(rgba.chunks_exact(channels * 2))
+            {
+                let src0 = &rgba[0..channels];
 
-    let y_plane = image.y_plane.borrow_mut();
-    let cg_plane = image.u_plane.borrow_mut();
-    let co_plane = image.v_plane.borrow_mut();
-    let y_stride = image.y_stride;
-    let cg_stride = image.u_stride;
-    let co_stride = image.v_stride;
-
-    for y in 0..image.height as usize {
-        #[allow(unused_variables)]
-        #[allow(unused_mut)]
-        let mut cx = 0usize;
-        #[allow(unused_variables)]
-        #[allow(unused_mut)]
-        let mut ux = 0usize;
-
-        let compute_uv_row = chroma_subsampling == YuvChromaSubsampling::Yuv444
-            || chroma_subsampling == YuvChromaSubsampling::Yuv422
-            || y & 1 == 0;
-
-        #[allow(clippy::explicit_counter_loop)]
-        for x in (cx..image.width as usize).step_by(iterator_step) {
-            let px = x * channels;
-            let rgba_shift = rgba_offset + px;
-            let src0 = unsafe { rgba.get_unchecked(rgba_shift..) };
-            let r0 = unsafe { *src0.get_unchecked(source_channels.get_r_channel_offset()) } as i32;
-            let g0 = unsafe { *src0.get_unchecked(source_channels.get_g_channel_offset()) } as i32;
-            let b0 = unsafe { *src0.get_unchecked(source_channels.get_b_channel_offset()) } as i32;
-
-            let mut r1 = r0;
-            let mut g1 = g0;
-            let mut b1 = b0;
-
-            let hg = (g0 * range_reduction_y) >> 1;
-            let y_0 = (hg + ((r0 * range_reduction_y + b0 * range_reduction_y) >> 2) + bias_y) >> 8;
-            unsafe { *y_plane.get_unchecked_mut(y_offset + x) = y_0 as u8 };
-            match chroma_subsampling {
-                YuvChromaSubsampling::Yuv420 | YuvChromaSubsampling::Yuv422 => {
-                    if x + 1 < image.width as usize {
-                        let next_px = (x + 1) * channels;
-                        let rgba_shift = rgba_offset + next_px;
-                        let src1 = unsafe { rgba.get_unchecked(rgba_shift..) };
-                        r1 = unsafe { *src1.get_unchecked(source_channels.get_r_channel_offset()) }
-                            as i32;
-                        g1 = unsafe { *src1.get_unchecked(source_channels.get_g_channel_offset()) }
-                            as i32;
-                        b1 = unsafe { *src1.get_unchecked(source_channels.get_b_channel_offset()) }
-                            as i32;
-                        let hg_1 = (g1 * range_reduction_y) >> 1;
-                        let y_1 = (hg_1
-                            + ((r1 * range_reduction_y + b1 * range_reduction_y) >> 2)
-                            + bias_y)
-                            >> 8;
-                        unsafe { *y_plane.get_unchecked_mut(y_offset + x + 1) = y_1 as u8 };
-                    }
-                }
-                _ => {}
+                let r0 = src0[src_chans.get_r_channel_offset()] as i32;
+                let g0 = src0[src_chans.get_g_channel_offset()] as i32;
+                let b0 = src0[src_chans.get_b_channel_offset()] as i32;
+
+                let hg0 = (g0 * range_reduction_y) >> 1;
+                let y_0 = (hg0 + ((r0 * range_reduction_y + b0 * range_reduction_y) >> 2) + bias_y)
+                    >> PRECISION;
+
+                y_dst[0] = y_0 as u8;
+
+                let src1 = &rgba[channels..channels * 2];
+
+                let r1 = src1[src_chans.get_r_channel_offset()] as i32;
+                let g1 = src1[src_chans.get_g_channel_offset()] as i32;
+                let b1 = src1[src_chans.get_b_channel_offset()] as i32;
+                let hg1 = (g1 * range_reduction_y) >> 1;
+                let y_1 = (hg1 + ((r1 * range_reduction_y + b1 * range_reduction_y) >> 2) + bias_y)
+                    >> PRECISION;
+                y_dst[1] = y_1 as u8;
+
+                let r = ((r0 + r1 + 1) >> 1) * range_reduction_uv;
+                let g = ((g0 + g1 + 1) >> 1) * range_reduction_uv;
+                let b = ((b0 + b1 + 1) >> 1) * range_reduction_uv;
+
+                let cg = (((g >> 1) - ((r + b) >> 2)) + bias_uv) >> PRECISION;
+                let co = (((r - b) >> 1) + bias_uv) >> PRECISION;
+
+                *u_dst = cg as u8;
+                *v_dst = co as u8;
             }
 
-            if compute_uv_row {
-                let mut r = if chroma_subsampling == YuvChromaSubsampling::Yuv444 {
-                    r0
-                } else {
-                    (r0 + r1 + 1) >> 1
-                };
-                let mut g = if chroma_subsampling == YuvChromaSubsampling::Yuv444 {
-                    g0
-                } else {
-                    (g0 + g1 + 1) >> 1
-                };
-                let mut b = if chroma_subsampling == YuvChromaSubsampling::Yuv444 {
-                    b0
-                } else {
-                    (b0 + b1 + 1) >> 1
-                };
-                r *= range_reduction_uv;
-                g *= range_reduction_uv;
-                b *= range_reduction_uv;
-                let cg = (((g >> 1) - ((r + b) >> 2)) + bias_uv) >> 8;
-                let co = (((r - b) >> 1) + bias_uv) >> 8;
-                let u_pos = match chroma_subsampling {
-                    YuvChromaSubsampling::Yuv420 | YuvChromaSubsampling::Yuv422 => cg_offset + ux,
-                    YuvChromaSubsampling::Yuv444 => cg_offset + ux,
-                };
-                unsafe { *cg_plane.get_unchecked_mut(u_pos) = cg as u8 };
-                let v_pos = match chroma_subsampling {
-                    YuvChromaSubsampling::Yuv420 | YuvChromaSubsampling::Yuv422 => co_offset + ux,
-                    YuvChromaSubsampling::Yuv444 => co_offset + ux,
-                };
-                unsafe { *co_plane.get_unchecked_mut(v_pos) = co as u8 };
+            if image.width & 1 != 0 {
+                let rgb_last = rgba.chunks_exact(channels * 2).remainder();
+                let mut r0 = rgb_last[src_chans.get_r_channel_offset()] as i32;
+                let mut g0 = rgb_last[src_chans.get_g_channel_offset()] as i32;
+                let mut b0 = rgb_last[src_chans.get_b_channel_offset()] as i32;
+
+                let y_last = y_plane.last_mut().unwrap();
+                let u_last = u_plane.last_mut().unwrap();
+                let v_last = v_plane.last_mut().unwrap();
+
+                let hg0 = (g0 * range_reduction_y) >> 1;
+                let y_0 = (hg0 + ((r0 * range_reduction_y + b0 * range_reduction_y) >> 2) + bias_y)
+                    >> PRECISION;
+
+                *y_last = y_0 as u8;
+
+                r0 *= range_reduction_uv; // chroma scale, like g0/b0 and the paired path above
+                g0 *= range_reduction_uv;
+                b0 *= range_reduction_uv;
+
+                let cg = (((g0 >> 1) - ((r0 + b0) >> 2)) + bias_uv) >> PRECISION;
+                let co = (((r0 - b0) >> 1) + bias_uv) >> PRECISION;
+                *u_last = cg as u8;
+                *v_last = co as u8;
             }
+        };
+
+    let process_doubled_row = |y_plane0: &mut [u8],
+                               y_plane1: &mut [u8],
+                               u_plane: &mut [u8],
+                               v_plane: &mut [u8],
+                               rgba0: &[u8],
+                               rgba1: &[u8]| {
+        for (((((y_dst0, y_dst1), u_dst), v_dst), rgba0), rgba1) in y_plane0
+            .chunks_exact_mut(2)
+            .zip(y_plane1.chunks_exact_mut(2))
+            .zip(u_plane.iter_mut())
+            .zip(v_plane.iter_mut())
+            .zip(rgba0.chunks_exact(channels * 2))
+            .zip(rgba1.chunks_exact(channels * 2))
+        {
+            let src00 = &rgba0[0..channels];
+
+            let r00 = src00[src_chans.get_r_channel_offset()] as i32;
+            let g00 =
src00[src_chans.get_g_channel_offset()] as i32; + let b00 = src00[src_chans.get_b_channel_offset()] as i32; + let hg00 = (g00 * range_reduction_y) >> 1; + let y_00 = (hg00 + ((r00 * range_reduction_y + b00 * range_reduction_y) >> 2) + bias_y) + >> PRECISION; + y_dst0[0] = y_00 as u8; + + let src1 = &rgba0[channels..channels * 2]; + + let r01 = src1[src_chans.get_r_channel_offset()] as i32; + let g01 = src1[src_chans.get_g_channel_offset()] as i32; + let b01 = src1[src_chans.get_b_channel_offset()] as i32; + let hg01 = (g01 * range_reduction_y) >> 1; + let y_01 = (hg01 + ((r01 * range_reduction_y + b01 * range_reduction_y) >> 2) + bias_y) + >> PRECISION; + y_dst0[1] = y_01 as u8; + + let src10 = &rgba1[0..channels]; + + let r10 = src10[src_chans.get_r_channel_offset()] as i32; + let g10 = src10[src_chans.get_g_channel_offset()] as i32; + let b10 = src10[src_chans.get_b_channel_offset()] as i32; + let hg10 = (g10 * range_reduction_y) >> 1; + let y_10 = (hg10 + ((r10 * range_reduction_y + b10 * range_reduction_y) >> 2) + bias_y) + >> PRECISION; + y_dst1[0] = y_10 as u8; + + let src11 = &rgba1[channels..channels * 2]; + + let r11 = src11[src_chans.get_r_channel_offset()] as i32; + let g11 = src11[src_chans.get_g_channel_offset()] as i32; + let b11 = src11[src_chans.get_b_channel_offset()] as i32; + let hg11 = (g11 * range_reduction_y) >> 1; + let y_11 = (hg11 + ((r11 * range_reduction_y + b11 * range_reduction_y) >> 2) + bias_y) + >> PRECISION; + y_dst1[1] = y_11 as u8; - ux += 1; + let ruv = ((r00 + r01 + r10 + r11 + 2) >> 2) * range_reduction_uv; + let guv = ((g00 + g01 + g10 + g11 + 2) >> 2) * range_reduction_uv; + let buv = ((b00 + b01 + b10 + b11 + 2) >> 2) * range_reduction_uv; + + let cg = (((guv >> 1) - ((ruv + buv) >> 2)) + bias_uv) >> PRECISION; + let co = (((ruv - buv) >> 1) + bias_uv) >> PRECISION; + *u_dst = cg as u8; + *v_dst = co as u8; } - y_offset += y_stride as usize; - rgba_offset += rgba_stride as usize; - match chroma_subsampling { - 
YuvChromaSubsampling::Yuv420 => { - if y & 1 == 1 { - cg_offset += cg_stride as usize; - co_offset += co_stride as usize; - } - } - YuvChromaSubsampling::Yuv444 | YuvChromaSubsampling::Yuv422 => { - cg_offset += cg_stride as usize; - co_offset += co_stride as usize; + if image.width & 1 != 0 { + let rgb_last0 = rgba0.chunks_exact(channels * 2).remainder(); + let rgb_last1 = rgba1.chunks_exact(channels * 2).remainder(); + let r0 = rgb_last0[src_chans.get_r_channel_offset()] as i32; + let g0 = rgb_last0[src_chans.get_g_channel_offset()] as i32; + let b0 = rgb_last0[src_chans.get_b_channel_offset()] as i32; + + let r1 = rgb_last1[src_chans.get_r_channel_offset()] as i32; + let g1 = rgb_last1[src_chans.get_g_channel_offset()] as i32; + let b1 = rgb_last1[src_chans.get_b_channel_offset()] as i32; + + let y0_last = y_plane0.last_mut().unwrap(); + let y1_last = y_plane1.last_mut().unwrap(); + let u_last = u_plane.last_mut().unwrap(); + let v_last = v_plane.last_mut().unwrap(); + + let hg0 = (g0 * range_reduction_y) >> 1; + let y_0 = (hg0 + ((r0 * range_reduction_y + b0 * range_reduction_y) >> 2) + bias_y) + >> PRECISION; + *y0_last = y_0 as u8; + + let hg1 = (g1 * range_reduction_y) >> 1; + let y_1 = (hg1 + ((r1 * range_reduction_y + b1 * range_reduction_y) >> 2) + bias_y) + >> PRECISION; + *y1_last = y_1 as u8; + + let r0 = ((r0 + r1) >> 1) * range_reduction_uv; + let g0 = ((g0 + g1) >> 1) * range_reduction_uv; + let b0 = ((b0 + b1) >> 1) * range_reduction_uv; + + let cg = (((g0 >> 1) - ((r0 + b0) >> 2)) + bias_uv) >> PRECISION; + let co = (((r0 - b0) >> 1) + bias_uv) >> PRECISION; + *u_last = cg as u8; + *v_last = co as u8; + } + }; + + let y_plane = image.y_plane.borrow_mut(); + let u_plane = image.u_plane.borrow_mut(); + let v_plane = image.v_plane.borrow_mut(); + let y_stride = image.y_stride as usize; + let u_stride = image.u_stride as usize; + let v_stride = image.v_stride as usize; + + if chroma_subsampling == YuvChromaSubsampling::Yuv444 { + let iter; + 
#[cfg(feature = "rayon")] + { + iter = y_plane + .par_chunks_exact_mut(y_stride) + .zip(u_plane.par_chunks_exact_mut(u_stride)) + .zip(v_plane.par_chunks_exact_mut(v_stride)) + .zip(rgba.par_chunks_exact(rgba_stride as usize)); + } + #[cfg(not(feature = "rayon"))] + { + iter = y_plane + .chunks_exact_mut(y_stride) + .zip(u_plane.chunks_exact_mut(u_stride)) + .zip(v_plane.chunks_exact_mut(v_stride)) + .zip(rgba.chunks_exact(rgba_stride as usize)); + } + iter.for_each(|(((y_dst, u_plane), v_plane), rgba)| { + let y_dst = &mut y_dst[0..image.width as usize]; + for (((y_dst, u_dst), v_dst), rgba) in y_dst + .iter_mut() + .zip(u_plane.iter_mut()) + .zip(v_plane.iter_mut()) + .zip(rgba.chunks_exact(channels)) + { + let mut r0 = rgba[src_chans.get_r_channel_offset()] as i32; + let mut g0 = rgba[src_chans.get_g_channel_offset()] as i32; + let mut b0 = rgba[src_chans.get_b_channel_offset()] as i32; + let hg0 = (g0 * range_reduction_y) >> 1; + let y_0 = (hg0 + ((r0 * range_reduction_y + b0 * range_reduction_y) >> 2) + bias_y) + >> PRECISION; + *y_dst = y_0 as u8; + + r0 *= range_reduction_y; + g0 *= range_reduction_y; + b0 *= range_reduction_y; + + let cg = (((g0 >> 1) - ((r0 + b0) >> 2)) + bias_uv) >> PRECISION; + let co = (((r0 - b0) >> 1) + bias_uv) >> PRECISION; + *u_dst = cg as u8; + *v_dst = co as u8; } + }); + } else if chroma_subsampling == YuvChromaSubsampling::Yuv422 { + let iter; + #[cfg(feature = "rayon")] + { + iter = y_plane + .par_chunks_exact_mut(y_stride) + .zip(u_plane.par_chunks_exact_mut(u_stride)) + .zip(v_plane.par_chunks_exact_mut(v_stride)) + .zip(rgba.par_chunks_exact(rgba_stride as usize)); + } + #[cfg(not(feature = "rayon"))] + { + iter = y_plane + .chunks_exact_mut(y_stride) + .zip(u_plane.chunks_exact_mut(u_stride)) + .zip(v_plane.chunks_exact_mut(v_stride)) + .zip(rgba.chunks_exact(rgba_stride as usize)); + } + + iter.for_each(|(((y_plane, u_plane), v_plane), rgba)| { + process_halved_chroma_row( + &mut y_plane[0..image.width as usize], + &mut 
u_plane[0..(image.width as usize).div_ceil(2)], + &mut v_plane[0..(image.width as usize).div_ceil(2)], + &rgba[0..image.width as usize * channels], + ); + }); + } else if chroma_subsampling == YuvChromaSubsampling::Yuv420 { + let iter; + #[cfg(feature = "rayon")] + { + iter = y_plane + .par_chunks_exact_mut(y_stride * 2) + .zip(u_plane.par_chunks_exact_mut(u_stride)) + .zip(v_plane.par_chunks_exact_mut(v_stride)) + .zip(rgba.par_chunks_exact(rgba_stride as usize * 2)); + } + #[cfg(not(feature = "rayon"))] + { + iter = y_plane + .chunks_exact_mut(y_stride * 2) + .zip(u_plane.chunks_exact_mut(u_stride)) + .zip(v_plane.chunks_exact_mut(v_stride)) + .zip(rgba.chunks_exact(rgba_stride as usize * 2)); + } + iter.for_each(|(((y_plane, u_plane), v_plane), rgba)| { + let (rgba0, rgba1) = rgba.split_at(rgba_stride as usize); + let (y_plane0, y_plane1) = y_plane.split_at_mut(y_stride); + process_doubled_row( + &mut y_plane0[0..image.width as usize], + &mut y_plane1[0..image.width as usize], + &mut u_plane[0..(image.width as usize).div_ceil(2)], + &mut v_plane[0..(image.width as usize).div_ceil(2)], + &rgba0[0..image.width as usize * channels], + &rgba1[0..image.width as usize * channels], + ); + }); + + if image.height & 1 != 0 { + let remainder_y_plane = y_plane.chunks_exact_mut(y_stride * 2).into_remainder(); + let remainder_rgba = rgba.chunks_exact(rgba_stride as usize * 2).remainder(); + let u_plane = u_plane.chunks_exact_mut(u_stride).last().unwrap(); + let v_plane = v_plane.chunks_exact_mut(v_stride).last().unwrap(); + process_halved_chroma_row( + &mut remainder_y_plane[0..image.width as usize], + &mut u_plane[0..(image.width as usize).div_ceil(2)], + &mut v_plane[0..(image.width as usize).div_ceil(2)], + &remainder_rgba[0..image.width as usize * channels], + ); } + } else { + unreachable!(); } Ok(()) diff --git a/src/ycgco_to_rgb.rs b/src/ycgco_to_rgb.rs index d7c9ede5..425ca39e 100644 --- a/src/ycgco_to_rgb.rs +++ b/src/ycgco_to_rgb.rs @@ -26,6 +26,7 @@ * OR TORT 
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +use crate::internals::ProcessedOffset; use crate::numerics::qrshr; use crate::yuv_error::check_rgba_destination; use crate::yuv_support::*; @@ -37,8 +38,245 @@ use rayon::iter::{IndexedParallelIterator, ParallelIterator}; use rayon::prelude::{ParallelSlice, ParallelSliceMut}; use std::fmt::Debug; +trait CgCoWideRowInversionHandler { + fn handle_row( + &self, + y_plane: &[V], + u_plane: &[V], + v_plane: &[V], + rgba: &mut [V], + width: u32, + chroma_range: YuvChromaRange, + ) -> ProcessedOffset; +} + +trait CgCoWideRowInversionHandler420 { + fn handle_row420( + &self, + y_plane0: &[V], + y_plane1: &[V], + u_plane: &[V], + v_plane: &[V], + rgba0: &mut [V], + rgba1: &mut [V], + width: u32, + chroma_range: YuvChromaRange, + ) -> ProcessedOffset; +} + +type RgbHandler = unsafe fn( + y_plane: &[u8], + u_plane: &[u8], + v_plane: &[u8], + rgba: &mut [u8], + width: usize, + chroma_range: YuvChromaRange, +) -> ProcessedOffset; + +type RgbHandler420 = unsafe fn( + y_plane0: &[u8], + y_plane1: &[u8], + u_plane: &[u8], + v_plane: &[u8], + rgba0: &mut [u8], + rgba1: &mut [u8], + width: u32, + chroma_range: YuvChromaRange, +) -> ProcessedOffset; + +struct Rgb8Converter { + handler: Option, +} + +struct Rgb8Converter420 { + handler: Option, +} + +impl + Rgb8Converter +{ + fn new(range: YuvRange) -> Self { + if range == YuvRange::Full { + #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + { + use crate::neon::neon_ycgco_full_range_to_rgb; + return Rgb8Converter { + handler: Some(neon_ycgco_full_range_to_rgb::), + }; + } + } + Self { handler: None } + } +} + +impl + Rgb8Converter420 +{ + fn new(range: YuvRange) -> Self { + let sampling: YuvChromaSubsampling = SAMPLING.into(); + if sampling != YuvChromaSubsampling::Yuv420 { + return Self { handler: None }; + } + assert_eq!(sampling, YuvChromaSubsampling::Yuv420); + if range == 
YuvRange::Full { + #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + { + use crate::neon::neon_ycgco420_to_rgba_row; + return Rgb8Converter420 { + handler: Some(neon_ycgco420_to_rgba_row::), + }; + } + } + Self { handler: None } + } +} + +struct Rgb16Converter {} + +impl CgCoWideRowInversionHandler + for Rgb16Converter +{ + fn handle_row( + &self, + _: &[u16], + _: &[u16], + _: &[u16], + _: &mut [u16], + _: u32, + _: YuvChromaRange, + ) -> ProcessedOffset { + ProcessedOffset { cx: 0, ux: 0 } + } +} + +struct Rgb16Converter420 {} + +impl CgCoWideRowInversionHandler420 + for Rgb16Converter420 +{ + fn handle_row420( + &self, + _: &[u16], + _: &[u16], + _: &[u16], + _: &[u16], + _: &mut [u16], + _: &mut [u16], + _: u32, + _: YuvChromaRange, + ) -> ProcessedOffset { + ProcessedOffset { cx: 0, ux: 0 } + } +} + +impl CgCoWideRowInversionHandler + for Rgb8Converter +{ + fn handle_row( + &self, + y_plane: &[u8], + u_plane: &[u8], + v_plane: &[u8], + rgba: &mut [u8], + width: u32, + chroma_range: YuvChromaRange, + ) -> ProcessedOffset { + if let Some(handler) = self.handler { + unsafe { + return handler( + y_plane, + u_plane, + v_plane, + rgba, + width as usize, + chroma_range, + ); + } + } + ProcessedOffset { cx: 0, ux: 0 } + } +} + +impl CgCoWideRowInversionHandler420 + for Rgb8Converter420 +{ + fn handle_row420( + &self, + y_plane0: &[u8], + y_plane1: &[u8], + u_plane: &[u8], + v_plane: &[u8], + rgba0: &mut [u8], + rgba1: &mut [u8], + width: u32, + chroma_range: YuvChromaRange, + ) -> ProcessedOffset { + if let Some(handler) = self.handler { + unsafe { + return handler( + y_plane0, + y_plane1, + u_plane, + v_plane, + rgba0, + rgba1, + width, + chroma_range, + ); + } + } + ProcessedOffset { cx: 0, ux: 0 } + } +} + +trait YCgCoConverterFactory { + fn make_converter( + range: YuvRange, + bit_depth: usize, + ) -> Box + Sync + Send>; + + fn make_converter420( + range: YuvRange, + bit_depth: usize, + ) -> Box + Sync + Send>; +} + +impl YCgCoConverterFactory for u8 
{ + fn make_converter( + range: YuvRange, + _: usize, + ) -> Box + Sync + Send> { + Box::new(Rgb8Converter::::new(range)) + } + + fn make_converter420( + range: YuvRange, + _: usize, + ) -> Box + Sync + Send> { + Box::new(Rgb8Converter420::::new( + range, + )) + } +} + +impl YCgCoConverterFactory for u16 { + fn make_converter( + _: YuvRange, + _: usize, + ) -> Box + Sync + Send> { + Box::new(Rgb16Converter:: {}) + } + + fn make_converter420( + _: YuvRange, + _: usize, + ) -> Box + Sync + Send> { + Box::new(Rgb16Converter420:: {}) + } +} + fn ycgco_ro_rgbx< - V: AsPrimitive + 'static + Default + Debug + Sync + Send, + V: AsPrimitive + 'static + Default + Debug + Sync + Send + YCgCoConverterFactory, const DESTINATION_CHANNELS: u8, const SAMPLING: u8, const BIT_DEPTH: usize, @@ -58,9 +296,9 @@ where check_rgba_destination(rgba, rgba_stride, image.width, image.height, channels)?; image.check_constraints(chroma_subsampling)?; - let range = get_yuv_range(BIT_DEPTH as u32, range); - let bias_y = range.bias_y as i32; - let bias_uv = range.bias_uv as i32; + let chroma_range = get_yuv_range(BIT_DEPTH as u32, range); + let bias_y = chroma_range.bias_y as i32; + let bias_uv = chroma_range.bias_uv as i32; const PRECISION: i32 = 13; @@ -68,19 +306,112 @@ where let precision_scale = (1 << PRECISION) as f32; let range_reduction_y = - (max_colors as f32 / range.range_y as f32 * precision_scale).round() as i32; + (max_colors as f32 / chroma_range.range_y as f32 * precision_scale).round() as i32; let range_reduction_uv = - (max_colors as f32 / range.range_uv as f32 * precision_scale).round() as i32; + (max_colors as f32 / chroma_range.range_uv as f32 * precision_scale).round() as i32; + + let converter = V::make_converter::(range, BIT_DEPTH); + let converter420 = V::make_converter420::(range, BIT_DEPTH); let process_halved_chroma_row = |y_plane: &[V], u_plane: &[V], v_plane: &[V], rgba: &mut [V]| { - for (((rgba, y_src), &u_src), &v_src) in rgba + let processed_offset = + 
converter.handle_row(y_plane, u_plane, v_plane, rgba, image.width, chroma_range); + if processed_offset.cx != image.width as usize { + for (((rgba, y_src), &u_src), &v_src) in rgba + .chunks_exact_mut(channels * 2) + .zip(y_plane.chunks_exact(2)) + .zip(u_plane.iter()) + .zip(v_plane.iter()) + .skip(processed_offset.cx) + { + let y_value0 = (y_src[0].as_() - bias_y) * range_reduction_y; + let cb_value = (u_src.as_() - bias_uv) * range_reduction_uv; + let cr_value = (v_src.as_() - bias_uv) * range_reduction_uv; + + let t0 = y_value0 - cb_value; + + let r0 = qrshr::(t0 + cr_value); + let b0 = qrshr::(t0 - cr_value); + let g0 = qrshr::(y_value0 + cb_value); + + let rgba0 = &mut rgba[0..channels]; + + rgba0[dst_chans.get_r_channel_offset()] = r0.as_(); + rgba0[dst_chans.get_g_channel_offset()] = g0.as_(); + rgba0[dst_chans.get_b_channel_offset()] = b0.as_(); + if dst_chans.has_alpha() { + rgba0[dst_chans.get_a_channel_offset()] = max_colors.as_(); + } + + let y_value1 = (y_src[1].as_() - bias_y) * range_reduction_y; + + let t1 = y_value1 - cb_value; + + let r1 = qrshr::(t1 + cr_value); + let b1 = qrshr::(t1 - cr_value); + let g1 = qrshr::(y_value1 + cb_value); + + let rgba1 = &mut rgba[channels..channels * 2]; + + rgba1[dst_chans.get_r_channel_offset()] = r1.as_(); + rgba1[dst_chans.get_g_channel_offset()] = g1.as_(); + rgba1[dst_chans.get_b_channel_offset()] = b1.as_(); + if dst_chans.has_alpha() { + rgba1[dst_chans.get_a_channel_offset()] = max_colors.as_(); + } + } + + if image.width & 1 != 0 { + let y_value0 = (y_plane.last().unwrap().as_() - bias_y) * range_reduction_y; + let cb_value = (u_plane.last().unwrap().as_() - bias_uv) * range_reduction_uv; + let cr_value = (v_plane.last().unwrap().as_() - bias_uv) * range_reduction_uv; + let rgba = rgba.chunks_exact_mut(channels).last().unwrap(); + let rgba0 = &mut rgba[0..channels]; + + let t0 = y_value0 - cb_value; + + let r0 = qrshr::(t0 + cr_value); + let b0 = qrshr::(t0 - cr_value); + let g0 = qrshr::(y_value0 + 
cb_value); + rgba0[dst_chans.get_r_channel_offset()] = r0.as_(); + rgba0[dst_chans.get_g_channel_offset()] = g0.as_(); + rgba0[dst_chans.get_b_channel_offset()] = b0.as_(); + if dst_chans.has_alpha() { + rgba0[dst_chans.get_a_channel_offset()] = max_colors.as_(); + } + } + } + }; + + let process_doubled_chroma_row = |y_plane0: &[V], + y_plane1: &[V], + u_plane: &[V], + v_plane: &[V], + rgba0: &mut [V], + rgba1: &mut [V]| { + let processed_offset420 = converter420 + .handle_row420( + y_plane0, + y_plane1, + u_plane, + v_plane, + rgba0, + rgba1, + image.width, + chroma_range, + ) + .cx; + if processed_offset420 != image.width as usize { + for (((((rgba0, rgba1), y_src0), y_src1), &u_src), &v_src) in rgba0 .chunks_exact_mut(channels * 2) - .zip(y_plane.chunks_exact(2)) + .zip(rgba1.chunks_exact_mut(channels * 2)) + .zip(y_plane0.chunks_exact(2)) + .zip(y_plane1.chunks_exact(2)) .zip(u_plane.iter()) .zip(v_plane.iter()) { - let y_value0 = (y_src[0].as_() - bias_y) * range_reduction_y; + let y_value0 = (y_src0[0].as_() - bias_y) * range_reduction_y; let cb_value = (u_src.as_() - bias_uv) * range_reduction_uv; let cr_value = (v_src.as_() - bias_uv) * range_reduction_uv; @@ -90,16 +421,16 @@ where let b0 = qrshr::(t0 - cr_value); let g0 = qrshr::(y_value0 + cb_value); - let rgba0 = &mut rgba[0..channels]; + let rgba00 = &mut rgba0[0..channels]; - rgba0[dst_chans.get_r_channel_offset()] = r0.as_(); - rgba0[dst_chans.get_g_channel_offset()] = g0.as_(); - rgba0[dst_chans.get_b_channel_offset()] = b0.as_(); + rgba00[dst_chans.get_r_channel_offset()] = r0.as_(); + rgba00[dst_chans.get_g_channel_offset()] = g0.as_(); + rgba00[dst_chans.get_b_channel_offset()] = b0.as_(); if dst_chans.has_alpha() { - rgba0[dst_chans.get_a_channel_offset()] = max_colors.as_(); + rgba00[dst_chans.get_a_channel_offset()] = max_colors.as_(); } - let y_value1 = (y_src[1].as_() - bias_y) * range_reduction_y; + let y_value1 = (y_src0[1].as_() - bias_y) * range_reduction_y; let t1 = y_value1 - cb_value; 
@@ -107,21 +438,56 @@ where let b1 = qrshr::(t1 - cr_value); let g1 = qrshr::(y_value1 + cb_value); - let rgba1 = &mut rgba[channels..channels * 2]; + let rgba01 = &mut rgba0[channels..channels * 2]; - rgba1[dst_chans.get_r_channel_offset()] = r1.as_(); - rgba1[dst_chans.get_g_channel_offset()] = g1.as_(); - rgba1[dst_chans.get_b_channel_offset()] = b1.as_(); + rgba01[dst_chans.get_r_channel_offset()] = r1.as_(); + rgba01[dst_chans.get_g_channel_offset()] = g1.as_(); + rgba01[dst_chans.get_b_channel_offset()] = b1.as_(); if dst_chans.has_alpha() { - rgba1[dst_chans.get_a_channel_offset()] = max_colors.as_(); + rgba01[dst_chans.get_a_channel_offset()] = max_colors.as_(); + } + + let y_value10 = (y_src1[0].as_() - bias_y) * range_reduction_y; + + let t10 = y_value10 - cb_value; + + let r10 = qrshr::(t10 + cr_value); + let b10 = qrshr::(t10 - cr_value); + let g10 = qrshr::(y_value10 + cb_value); + + let rgba10 = &mut rgba1[0..channels]; + + rgba10[dst_chans.get_r_channel_offset()] = r10.as_(); + rgba10[dst_chans.get_g_channel_offset()] = g10.as_(); + rgba10[dst_chans.get_b_channel_offset()] = b10.as_(); + if dst_chans.has_alpha() { + rgba10[dst_chans.get_a_channel_offset()] = max_colors.as_(); + } + + let y_value11 = (y_src1[1].as_() - bias_y) * range_reduction_y; + + let t11 = y_value11 - cb_value; + + let r11 = qrshr::(t11 + cr_value); + let b11 = qrshr::(t11 - cr_value); + let g11 = qrshr::(y_value11 + cb_value); + + let rgba11 = &mut rgba1[channels..channels * 2]; + + rgba11[dst_chans.get_r_channel_offset()] = r11.as_(); + rgba11[dst_chans.get_g_channel_offset()] = g11.as_(); + rgba11[dst_chans.get_b_channel_offset()] = b11.as_(); + if dst_chans.has_alpha() { + rgba11[dst_chans.get_a_channel_offset()] = max_colors.as_(); } } if image.width & 1 != 0 { - let y_value0 = (y_plane.last().unwrap().as_() - bias_y) * range_reduction_y; + let y_value0 = (y_plane0.last().unwrap().as_() - bias_y) * range_reduction_y; + let y_value1 = (y_plane1.last().unwrap().as_() - bias_y) 
* range_reduction_y; let cb_value = (u_plane.last().unwrap().as_() - bias_uv) * range_reduction_uv; let cr_value = (v_plane.last().unwrap().as_() - bias_uv) * range_reduction_uv; - let rgba = rgba.chunks_exact_mut(channels).last().unwrap(); + let rgba = rgba0.chunks_exact_mut(channels).last().unwrap(); let rgba0 = &mut rgba[0..channels]; let t0 = y_value0 - cb_value; @@ -129,134 +495,28 @@ where let r0 = qrshr::(t0 + cr_value); let b0 = qrshr::(t0 - cr_value); let g0 = qrshr::(y_value0 + cb_value); + rgba0[dst_chans.get_r_channel_offset()] = r0.as_(); rgba0[dst_chans.get_g_channel_offset()] = g0.as_(); rgba0[dst_chans.get_b_channel_offset()] = b0.as_(); if dst_chans.has_alpha() { rgba0[dst_chans.get_a_channel_offset()] = max_colors.as_(); } - } - }; - - let process_doubled_chroma_row = |y_plane0: &[V], - y_plane1: &[V], - u_plane: &[V], - v_plane: &[V], - rgba0: &mut [V], - rgba1: &mut [V]| { - for (((((rgba0, rgba1), y_src0), y_src1), &u_src), &v_src) in rgba0 - .chunks_exact_mut(channels * 2) - .zip(rgba1.chunks_exact_mut(channels * 2)) - .zip(y_plane0.chunks_exact(2)) - .zip(y_plane1.chunks_exact(2)) - .zip(u_plane.iter()) - .zip(v_plane.iter()) - { - let y_value0 = (y_src0[0].as_() - bias_y) * range_reduction_y; - let cb_value = (u_src.as_() - bias_uv) * range_reduction_uv; - let cr_value = (v_src.as_() - bias_uv) * range_reduction_uv; - - let t0 = y_value0 - cb_value; - - let r0 = qrshr::(t0 + cr_value); - let b0 = qrshr::(t0 - cr_value); - let g0 = qrshr::(y_value0 + cb_value); - - let rgba00 = &mut rgba0[0..channels]; - - rgba00[dst_chans.get_r_channel_offset()] = r0.as_(); - rgba00[dst_chans.get_g_channel_offset()] = g0.as_(); - rgba00[dst_chans.get_b_channel_offset()] = b0.as_(); - if dst_chans.has_alpha() { - rgba00[dst_chans.get_a_channel_offset()] = max_colors.as_(); - } - - let y_value1 = (y_src0[1].as_() - bias_y) * range_reduction_y; - - let t1 = y_value1 - cb_value; - - let r1 = qrshr::(t1 + cr_value); - let b1 = qrshr::(t1 - cr_value); - let g1 = 
qrshr::(y_value1 + cb_value); - let rgba01 = &mut rgba0[channels..channels * 2]; - - rgba01[dst_chans.get_r_channel_offset()] = r1.as_(); - rgba01[dst_chans.get_g_channel_offset()] = g1.as_(); - rgba01[dst_chans.get_b_channel_offset()] = b1.as_(); - if dst_chans.has_alpha() { - rgba01[dst_chans.get_a_channel_offset()] = max_colors.as_(); - } - - let y_value10 = (y_src1[0].as_() - bias_y) * range_reduction_y; - - let t10 = y_value10 - cb_value; - - let r10 = qrshr::(t10 + cr_value); - let b10 = qrshr::(t10 - cr_value); - let g10 = qrshr::(y_value10 + cb_value); - - let rgba10 = &mut rgba1[0..channels]; - - rgba10[dst_chans.get_r_channel_offset()] = r10.as_(); - rgba10[dst_chans.get_g_channel_offset()] = g10.as_(); - rgba10[dst_chans.get_b_channel_offset()] = b10.as_(); - if dst_chans.has_alpha() { - rgba10[dst_chans.get_a_channel_offset()] = max_colors.as_(); - } - - let y_value11 = (y_src1[1].as_() - bias_y) * range_reduction_y; - - let t11 = y_value11 - cb_value; - - let r11 = qrshr::(t11 + cr_value); - let b11 = qrshr::(t11 - cr_value); - let g11 = qrshr::(y_value11 + cb_value); - - let rgba11 = &mut rgba1[channels..channels * 2]; - - rgba11[dst_chans.get_r_channel_offset()] = r11.as_(); - rgba11[dst_chans.get_g_channel_offset()] = g11.as_(); - rgba11[dst_chans.get_b_channel_offset()] = b11.as_(); - if dst_chans.has_alpha() { - rgba11[dst_chans.get_a_channel_offset()] = max_colors.as_(); - } - } - - if image.width & 1 != 0 { - let y_value0 = (y_plane0.last().unwrap().as_() - bias_y) * range_reduction_y; - let y_value1 = (y_plane1.last().unwrap().as_() - bias_y) * range_reduction_y; - let cb_value = (u_plane.last().unwrap().as_() - bias_uv) * range_reduction_uv; - let cr_value = (v_plane.last().unwrap().as_() - bias_uv) * range_reduction_uv; - let rgba = rgba0.chunks_exact_mut(channels).last().unwrap(); - let rgba0 = &mut rgba[0..channels]; - - let t0 = y_value0 - cb_value; - - let r0 = qrshr::(t0 + cr_value); - let b0 = qrshr::(t0 - cr_value); - let g0 = 
qrshr::(y_value0 + cb_value); - - rgba0[dst_chans.get_r_channel_offset()] = r0.as_(); - rgba0[dst_chans.get_g_channel_offset()] = g0.as_(); - rgba0[dst_chans.get_b_channel_offset()] = b0.as_(); - if dst_chans.has_alpha() { - rgba0[dst_chans.get_a_channel_offset()] = max_colors.as_(); - } - - let t1 = y_value1 - cb_value; + let t1 = y_value1 - cb_value; - let r1 = qrshr::(t1 + cr_value); - let b1 = qrshr::(t1 - cr_value); - let g1 = qrshr::(y_value1 + cb_value); + let r1 = qrshr::(t1 + cr_value); + let b1 = qrshr::(t1 - cr_value); + let g1 = qrshr::(y_value1 + cb_value); - let rgba = rgba1.chunks_exact_mut(channels).last().unwrap(); - let rgba1 = &mut rgba[0..channels]; - rgba1[dst_chans.get_r_channel_offset()] = r1.as_(); - rgba1[dst_chans.get_g_channel_offset()] = g1.as_(); - rgba1[dst_chans.get_b_channel_offset()] = b1.as_(); - if dst_chans.has_alpha() { - rgba1[dst_chans.get_a_channel_offset()] = max_colors.as_(); + let rgba = rgba1.chunks_exact_mut(channels).last().unwrap(); + let rgba1 = &mut rgba[0..channels]; + rgba1[dst_chans.get_r_channel_offset()] = r1.as_(); + rgba1[dst_chans.get_g_channel_offset()] = g1.as_(); + rgba1[dst_chans.get_b_channel_offset()] = b1.as_(); + if dst_chans.has_alpha() { + rgba1[dst_chans.get_a_channel_offset()] = max_colors.as_(); + } } } }; @@ -281,27 +541,31 @@ where } iter.for_each(|(((rgba, y_plane), u_plane), v_plane)| { let y_plane = &y_plane[0..image.width as usize]; - for (((rgba, &y_src), &u_src), &v_src) in rgba - .chunks_exact_mut(channels) - .zip(y_plane.iter()) - .zip(u_plane.iter()) - .zip(v_plane.iter()) - { - let y_value = (y_src.as_() - bias_y) * range_reduction_y; - let cb_value = (u_src.as_() - bias_uv) * range_reduction_uv; - let cr_value = (v_src.as_() - bias_uv) * range_reduction_uv; - - let t0 = y_value - cb_value; - - let r = qrshr::(t0 + cr_value); - let b = qrshr::(t0 - cr_value); - let g = qrshr::(y_value + cb_value); - - rgba[dst_chans.get_r_channel_offset()] = r.as_(); - 
rgba[dst_chans.get_g_channel_offset()] = g.as_(); - rgba[dst_chans.get_b_channel_offset()] = b.as_(); - if dst_chans.has_alpha() { - rgba[dst_chans.get_a_channel_offset()] = max_colors.as_(); + let processed_offset = + converter.handle_row(y_plane, u_plane, v_plane, rgba, image.width, chroma_range); + if processed_offset.cx != image.width as usize { + for (((rgba, &y_src), &u_src), &v_src) in rgba + .chunks_exact_mut(channels) + .zip(y_plane.iter()) + .zip(u_plane.iter()) + .zip(v_plane.iter()) + { + let y_value = (y_src.as_() - bias_y) * range_reduction_y; + let cb_value = (u_src.as_() - bias_uv) * range_reduction_uv; + let cr_value = (v_src.as_() - bias_uv) * range_reduction_uv; + + let t0 = y_value - cb_value; + + let r = qrshr::(t0 + cr_value); + let b = qrshr::(t0 - cr_value); + let g = qrshr::(y_value + cb_value); + + rgba[dst_chans.get_r_channel_offset()] = r.as_(); + rgba[dst_chans.get_g_channel_offset()] = g.as_(); + rgba[dst_chans.get_b_channel_offset()] = b.as_(); + if dst_chans.has_alpha() { + rgba[dst_chans.get_a_channel_offset()] = max_colors.as_(); + } } } }); @@ -539,7 +803,7 @@ d_cnv!( ); d_cnv!( - icgc010_to_rgb, + icgc010_to_rgb10, u16, 10, YuvSourceChannels::Rgb, @@ -548,7 +812,7 @@ d_cnv!( "YCgCo 4:2:0 10-bit" ); d_cnv!( - icgc010_to_rgba, + icgc010_to_rgba10, u16, 10, YuvSourceChannels::Rgba, @@ -557,7 +821,7 @@ d_cnv!( "YCgCo 4:2:0 10-bit" ); d_cnv!( - icgc210_to_rgb, + icgc210_to_rgb10, u16, 10, YuvSourceChannels::Rgb, @@ -566,7 +830,7 @@ d_cnv!( "YCgCo 4:2:2 10-bit" ); d_cnv!( - icgc210_to_rgba, + icgc210_to_rgba10, u16, 10, YuvSourceChannels::Rgba, @@ -575,7 +839,7 @@ d_cnv!( "YCgCo 4:2:2 10-bit" ); d_cnv!( - icgc410_to_rgb, + icgc410_to_rgb10, u16, 10, YuvSourceChannels::Rgb, @@ -584,7 +848,7 @@ d_cnv!( "YCgCo 4:4:4 10-bit" ); d_cnv!( - icgc410_to_rgba, + icgc410_to_rgba10, u16, 10, YuvSourceChannels::Rgba, @@ -596,7 +860,7 @@ d_cnv!( // 12-bit d_cnv!( - icgc012_to_rgb, + icgc012_to_rgb12, u16, 12, YuvSourceChannels::Rgb, @@ -605,7 +869,7 
@@ d_cnv!( "YCgCo 4:2:0 12-bit" ); d_cnv!( - icgc012_to_rgba, + icgc012_to_rgba12, u16, 12, YuvSourceChannels::Rgba, @@ -614,7 +878,7 @@ d_cnv!( "YCgCo 4:2:0 12-bit" ); d_cnv!( - icgc212_to_rgb, + icgc212_to_rgb12, u16, 12, YuvSourceChannels::Rgb, @@ -623,7 +887,7 @@ d_cnv!( "YCgCo 4:2:2 12-bit" ); d_cnv!( - icgc212_to_rgba, + icgc212_to_rgba12, u16, 12, YuvSourceChannels::Rgba, @@ -632,7 +896,7 @@ d_cnv!( "YCgCo 4:2:2 12-bit" ); d_cnv!( - icgc412_to_rgb, + icgc412_to_rgb12, u16, 12, YuvSourceChannels::Rgb, @@ -641,7 +905,7 @@ d_cnv!( "YCgCo 4:4:4 12-bit" ); d_cnv!( - icgc412_to_rgba, + icgc412_to_rgba12, u16, 12, YuvSourceChannels::Rgba, diff --git a/src/ycgco_to_rgb_alpha.rs b/src/ycgco_to_rgb_alpha.rs index 9291b339..dff79456 100644 --- a/src/ycgco_to_rgb_alpha.rs +++ b/src/ycgco_to_rgb_alpha.rs @@ -496,7 +496,7 @@ d_cnv!( ); d_cnv!( - icgc010_alpha_to_rgba, + icgc010_alpha_to_rgba10, u16, 10, YuvSourceChannels::Rgba, @@ -505,7 +505,7 @@ d_cnv!( "YCgCo 4:2:0 10-bit" ); d_cnv!( - icgc210_alpha_to_rgba, + icgc210_alpha_to_rgba10, u16, 10, YuvSourceChannels::Rgba, @@ -514,7 +514,7 @@ d_cnv!( "YCgCo 4:2:2 10-bit" ); d_cnv!( - icgc410_alpha_to_rgba, + icgc410_alpha_to_rgba10, u16, 10, YuvSourceChannels::Rgba, @@ -525,7 +525,7 @@ d_cnv!( // 12-bit d_cnv!( - icgc012_alpha_to_rgba, + icgc012_alpha_to_rgba12, u16, 12, YuvSourceChannels::Rgba, @@ -534,7 +534,7 @@ d_cnv!( "YCgCo 4:2:0 12-bit" ); d_cnv!( - icgc212_alpha_to_rgba, + icgc212_alpha_to_rgba12, u16, 12, YuvSourceChannels::Rgba, @@ -543,7 +543,7 @@ d_cnv!( "YCgCo 4:2:2 12-bit" ); d_cnv!( - icgc412_alpha_to_rgba, + icgc412_alpha_to_rgba12, u16, 12, YuvSourceChannels::Rgba, diff --git a/src/yuv_error.rs b/src/yuv_error.rs index 44e6c8ef..84e8d201 100644 --- a/src/yuv_error.rs +++ b/src/yuv_error.rs @@ -39,7 +39,7 @@ pub struct MismatchedSize { #[derive(Debug)] pub enum YuvError { DestinationSizeMismatch(MismatchedSize), - MinimumDestinationSizeMismatch(MismatchedSize), + MinimumStrideSizeMismatch(MismatchedSize), 
PointerOverflow, ZeroBaseSize, LumaPlaneSizeMismatch(MismatchedSize), @@ -84,8 +84,8 @@ impl Display for YuvError { "Destination size mismatch: expected={}, received={}", size.expected, size.received )), - YuvError::MinimumDestinationSizeMismatch(size) => f.write_fmt(format_args!( - "Destination must have size at least {} but it is {}", + YuvError::MinimumStrideSizeMismatch(size) => f.write_fmt(format_args!( + "Minimum stride must have size at least {} but it is {}", size.expected, size.received )), } @@ -135,9 +135,9 @@ pub(crate) fn check_rgba_destination( })); } if (rgba_stride as usize) < (width as usize * channels) { - return Err(YuvError::MinimumDestinationSizeMismatch(MismatchedSize { - expected: width as usize * height as usize * channels, - received: rgba_stride as usize * height as usize, + return Err(YuvError::MinimumStrideSizeMismatch(MismatchedSize { + expected: width as usize * channels, + received: rgba_stride as usize, })); } Ok(()) diff --git a/src/yuva_p16_rgba_f16.rs b/src/yuva_p16_rgba_f16.rs index 9b3e09ca..3515445c 100644 --- a/src/yuva_p16_rgba_f16.rs +++ b/src/yuva_p16_rgba_f16.rs @@ -877,21 +877,21 @@ mod tests { let diff_b = b as i32 - ob as i32; assert!( - diff_r <= 264, + diff_r <= 340, "Actual diff {}, Original RGB {:?}, Round-tripped RGB {:?}", diff_r, [or, og, ob], [r, g, b] ); assert!( - diff_g <= 264, + diff_g <= 340, "Actual diff {}, Original RGB {:?}, Round-tripped RGB {:?}", diff_g, [or, og, ob], [r, g, b] ); assert!( - diff_b <= 264, + diff_b <= 340, "Actual diff {}, Original RGB {:?}, Round-tripped RGB {:?}", diff_b, [or, og, ob],