AVX, SSE refactor and improvements
awxkee committed Jan 1, 2025
1 parent 38f214e commit ab50ceb
Showing 18 changed files with 1,567 additions and 1,814 deletions.
2 changes: 1 addition & 1 deletion app/benches/resize_rgba/main.rs
@@ -166,7 +166,7 @@ pub fn criterion_benchmark(c: &mut Criterion) {
        dimensions.0 as usize,
        dimensions.1 as usize,
    )
-    .unwrap();
+    .unwrap();
    let mut target = ImageStoreMut::alloc_with_depth(
        dimensions.0 as usize / 4,
        dimensions.1 as usize / 4,
2 changes: 1 addition & 1 deletion app/src/main.rs
@@ -42,7 +42,7 @@ fn resize_plane(

fn main() {
    // test_fast_image();
-    let img = ImageReader::open("./assets/abstract_alpha.png")
+    let img = ImageReader::open("./assets/asset_4.png")
        .unwrap()
        .decode()
        .unwrap();
246 changes: 121 additions & 125 deletions src/avx2/alpha_f16.rs
@@ -55,61 +55,59 @@ unsafe fn avx_premultiply_alpha_rgba_f16_row_impl(dst: &mut [half::f16], src: &[
let mut rem = dst;
let mut src_rem = src;

-    unsafe {
-        for (dst, src) in rem
-            .chunks_exact_mut(16 * 4)
-            .zip(src_rem.chunks_exact(16 * 4))
-        {
-            let src_ptr = src.as_ptr();
-            let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i);
-            let lane1 = _mm256_loadu_si256(src_ptr.add(16) as *const __m256i);
-            let lane2 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i);
-            let lane3 = _mm256_loadu_si256(src_ptr.add(48) as *const __m256i);
-            let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3);
-
-            let low_alpha = _mm256_cvtph_ps(_mm256_castsi256_si128(pixel.3));
-            let low_r = _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.0)), low_alpha);
-            let low_g = _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.1)), low_alpha);
-            let low_b = _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.2)), low_alpha);
-
-            let high_alpha = _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.3));
-            let high_r = _mm256_mul_ps(
-                _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.0)),
-                high_alpha,
-            );
-            let high_g = _mm256_mul_ps(
-                _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.1)),
-                high_alpha,
-            );
-            let high_b = _mm256_mul_ps(
-                _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.2)),
-                high_alpha,
-            );
-            let r_values = avx_combine_epi(
-                _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_r),
-                _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_r),
-            );
-            let g_values = avx_combine_epi(
-                _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_g),
-                _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_g),
-            );
-            let b_values = avx_combine_epi(
-                _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_b),
-                _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_b),
-            );
-            let dst_ptr = dst.as_mut_ptr();
-            let (d_lane0, d_lane1, d_lane2, d_lane3) =
-                avx_interleave_rgba_epi16(r_values, g_values, b_values, pixel.3);
-            _mm256_storeu_si256(dst_ptr as *mut __m256i, d_lane0);
-            _mm256_storeu_si256(dst_ptr.add(16) as *mut __m256i, d_lane1);
-            _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, d_lane2);
-            _mm256_storeu_si256(dst_ptr.add(48) as *mut __m256i, d_lane3);
-        }
-
-        rem = rem.chunks_exact_mut(16 * 4).into_remainder();
-        src_rem = src_rem.chunks_exact(16 * 4).remainder();
-    }
+    for (dst, src) in rem
+        .chunks_exact_mut(16 * 4)
+        .zip(src_rem.chunks_exact(16 * 4))
+    {
+        let src_ptr = src.as_ptr();
+        let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i);
+        let lane1 = _mm256_loadu_si256(src_ptr.add(16) as *const __m256i);
+        let lane2 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i);
+        let lane3 = _mm256_loadu_si256(src_ptr.add(48) as *const __m256i);
+        let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3);
+
+        let low_alpha = _mm256_cvtph_ps(_mm256_castsi256_si128(pixel.3));
+        let low_r = _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.0)), low_alpha);
+        let low_g = _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.1)), low_alpha);
+        let low_b = _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.2)), low_alpha);
+
+        let high_alpha = _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.3));
+        let high_r = _mm256_mul_ps(
+            _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.0)),
+            high_alpha,
+        );
+        let high_g = _mm256_mul_ps(
+            _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.1)),
+            high_alpha,
+        );
+        let high_b = _mm256_mul_ps(
+            _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.2)),
+            high_alpha,
+        );
+        let r_values = avx_combine_epi(
+            _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_r),
+            _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_r),
+        );
+        let g_values = avx_combine_epi(
+            _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_g),
+            _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_g),
+        );
+        let b_values = avx_combine_epi(
+            _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_b),
+            _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_b),
+        );
+        let dst_ptr = dst.as_mut_ptr();
+        let (d_lane0, d_lane1, d_lane2, d_lane3) =
+            avx_interleave_rgba_epi16(r_values, g_values, b_values, pixel.3);
+        _mm256_storeu_si256(dst_ptr as *mut __m256i, d_lane0);
+        _mm256_storeu_si256(dst_ptr.add(16) as *mut __m256i, d_lane1);
+        _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, d_lane2);
+        _mm256_storeu_si256(dst_ptr.add(48) as *mut __m256i, d_lane3);
+    }
+
+    rem = rem.chunks_exact_mut(16 * 4).into_remainder();
+    src_rem = src_rem.chunks_exact(16 * 4).remainder();

premultiply_pixel_f16_row(rem, src_rem);
}
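
The row kernel above premultiplies 16 RGBA f16 pixels per iteration: deinterleave into planar registers, widen each half to f32 with _mm256_cvtph_ps, scale R/G/B by alpha, then narrow with _mm256_cvtps_ph and re-interleave. A minimal scalar sketch of the same per-pixel operation, assuming the `half` crate's f16 conversions; this only illustrates the role `premultiply_pixel_f16_row` plays for the remainder and is not the crate's actual fallback:

fn premultiply_rgba_f16_row_scalar(dst: &mut [half::f16], src: &[half::f16]) {
    for (d, s) in dst.chunks_exact_mut(4).zip(src.chunks_exact(4)) {
        // Widen to f32, scale the color channels by alpha, narrow back to f16.
        let a = s[3].to_f32();
        d[0] = half::f16::from_f32(s[0].to_f32() * a); // R
        d[1] = half::f16::from_f32(s[1].to_f32() * a); // G
        d[2] = half::f16::from_f32(s[2].to_f32() * a); // B
        d[3] = s[3]; // alpha is carried through unchanged
    }
}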

@@ -155,84 +153,82 @@ pub(crate) fn avx_unpremultiply_alpha_rgba_f16(
unsafe fn avx_unpremultiply_alpha_rgba_f16_row_impl(in_place: &mut [half::f16]) {
let mut rem = in_place;

-    unsafe {
-        for dst in rem.chunks_exact_mut(16 * 4) {
-            let src_ptr = dst.as_ptr();
-            let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i);
-            let lane1 = _mm256_loadu_si256(src_ptr.add(16) as *const __m256i);
-            let lane2 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i);
-            let lane3 = _mm256_loadu_si256(src_ptr.add(48) as *const __m256i);
-            let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3);
-
-            let low_alpha = _mm256_cvtph_ps(_mm256_castsi256_si128(pixel.3));
-            let zeros = _mm256_setzero_ps();
-            let low_alpha_zero_mask = _mm256_cmp_ps::<_CMP_EQ_OS>(low_alpha, zeros);
-            let low_r = _mm256_blendv_ps(
-                _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.0)), low_alpha),
-                zeros,
-                low_alpha_zero_mask,
-            );
-            let low_g = _mm256_blendv_ps(
-                _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.1)), low_alpha),
-                zeros,
-                low_alpha_zero_mask,
-            );
-            let low_b = _mm256_blendv_ps(
-                _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.2)), low_alpha),
-                zeros,
-                low_alpha_zero_mask,
-            );
-
-            let high_alpha = _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.3));
-            let high_alpha_zero_mask = _mm256_cmp_ps::<_CMP_EQ_OS>(high_alpha, zeros);
-            let high_r = _mm256_blendv_ps(
-                _mm256_mul_ps(
-                    _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.0)),
-                    high_alpha,
-                ),
-                zeros,
-                high_alpha_zero_mask,
-            );
-            let high_g = _mm256_blendv_ps(
-                _mm256_mul_ps(
-                    _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.1)),
-                    high_alpha,
-                ),
-                zeros,
-                high_alpha_zero_mask,
-            );
-            let high_b = _mm256_blendv_ps(
-                _mm256_mul_ps(
-                    _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.2)),
-                    high_alpha,
-                ),
-                zeros,
-                high_alpha_zero_mask,
-            );
-            let r_values = avx_combine_epi(
-                _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_r),
-                _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_r),
-            );
-            let g_values = avx_combine_epi(
-                _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_g),
-                _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_g),
-            );
-            let b_values = avx_combine_epi(
-                _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_b),
-                _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_b),
-            );
-            let dst_ptr = dst.as_mut_ptr();
-            let (d_lane0, d_lane1, d_lane2, d_lane3) =
-                avx_interleave_rgba_epi16(r_values, g_values, b_values, pixel.3);
-            _mm256_storeu_si256(dst_ptr as *mut __m256i, d_lane0);
-            _mm256_storeu_si256(dst_ptr.add(16) as *mut __m256i, d_lane1);
-            _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, d_lane2);
-            _mm256_storeu_si256(dst_ptr.add(48) as *mut __m256i, d_lane3);
-        }
-
-        rem = rem.chunks_exact_mut(16 * 4).into_remainder();
-    }
+    for dst in rem.chunks_exact_mut(16 * 4) {
+        let src_ptr = dst.as_ptr();
+        let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i);
+        let lane1 = _mm256_loadu_si256(src_ptr.add(16) as *const __m256i);
+        let lane2 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i);
+        let lane3 = _mm256_loadu_si256(src_ptr.add(48) as *const __m256i);
+        let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3);
+
+        let low_alpha = _mm256_cvtph_ps(_mm256_castsi256_si128(pixel.3));
+        let zeros = _mm256_setzero_ps();
+        let low_alpha_zero_mask = _mm256_cmp_ps::<_CMP_EQ_OS>(low_alpha, zeros);
+        let low_r = _mm256_blendv_ps(
+            _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.0)), low_alpha),
+            zeros,
+            low_alpha_zero_mask,
+        );
+        let low_g = _mm256_blendv_ps(
+            _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.1)), low_alpha),
+            zeros,
+            low_alpha_zero_mask,
+        );
+        let low_b = _mm256_blendv_ps(
+            _mm256_mul_ps(_mm256_cvtph_ps(_mm256_castsi256_si128(pixel.2)), low_alpha),
+            zeros,
+            low_alpha_zero_mask,
+        );
+
+        let high_alpha = _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.3));
+        let high_alpha_zero_mask = _mm256_cmp_ps::<_CMP_EQ_OS>(high_alpha, zeros);
+        let high_r = _mm256_blendv_ps(
+            _mm256_mul_ps(
+                _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.0)),
+                high_alpha,
+            ),
+            zeros,
+            high_alpha_zero_mask,
+        );
+        let high_g = _mm256_blendv_ps(
+            _mm256_mul_ps(
+                _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.1)),
+                high_alpha,
+            ),
+            zeros,
+            high_alpha_zero_mask,
+        );
+        let high_b = _mm256_blendv_ps(
+            _mm256_mul_ps(
+                _mm256_cvtph_ps(_mm256_extracti128_si256::<1>(pixel.2)),
+                high_alpha,
+            ),
+            zeros,
+            high_alpha_zero_mask,
+        );
+        let r_values = avx_combine_epi(
+            _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_r),
+            _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_r),
+        );
+        let g_values = avx_combine_epi(
+            _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_g),
+            _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_g),
+        );
+        let b_values = avx_combine_epi(
+            _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(low_b),
+            _mm256_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(high_b),
+        );
+        let dst_ptr = dst.as_mut_ptr();
+        let (d_lane0, d_lane1, d_lane2, d_lane3) =
+            avx_interleave_rgba_epi16(r_values, g_values, b_values, pixel.3);
+        _mm256_storeu_si256(dst_ptr as *mut __m256i, d_lane0);
+        _mm256_storeu_si256(dst_ptr.add(16) as *mut __m256i, d_lane1);
+        _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, d_lane2);
+        _mm256_storeu_si256(dst_ptr.add(48) as *mut __m256i, d_lane3);
+    }
+
+    rem = rem.chunks_exact_mut(16 * 4).into_remainder();

unpremultiply_pixel_f16_row(rem);
}
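
The _mm256_cmp_ps::<_CMP_EQ_OS> / _mm256_blendv_ps pair in this hunk guards the zero-alpha case: lanes whose alpha compares equal to zero are forced to zero output instead of whatever the arithmetic produced. A scalar model of one lane, assuming the usual per-lane semantics of those intrinsics (compare yields an all-ones mask, blendv selects its second operand where the mask is set); the function name is illustrative only:

fn guarded_lane(value: f32, alpha: f32) -> f32 {
    // _CMP_EQ_OS: ordered equality against zero, one mask bit pattern per lane.
    let alpha_is_zero = alpha == 0.0;
    // The per-lane arithmetic feeding _mm256_blendv_ps, as written above.
    let scaled = value * alpha;
    // _mm256_blendv_ps(scaled, zeros, mask): take zero where the mask is set.
    if alpha_is_zero { 0.0 } else { scaled }
}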

50 changes: 24 additions & 26 deletions src/avx2/alpha_f32.rs
@@ -129,34 +129,32 @@ unsafe fn avx_premultiply_alpha_rgba_f32_row_impl(dst: &mut [f32], src: &[f32])
let mut rem = dst;
let mut src_rem = src;

-    unsafe {
-        for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) {
-            let src_ptr = src.as_ptr();
-            let rgba0 = _mm256_loadu_ps(src_ptr);
-            let rgba1 = _mm256_loadu_ps(src_ptr.add(8));
-            let rgba2 = _mm256_loadu_ps(src_ptr.add(16));
-            let rgba3 = _mm256_loadu_ps(src_ptr.add(24));
-            let (rrr, ggg, bbb, aaa) = avx_deinterleave_rgba_ps(rgba0, rgba1, rgba2, rgba3);
-
-            let rrr = _mm256_mul_ps(rrr, aaa);
-            let ggg = _mm256_mul_ps(ggg, aaa);
-            let bbb = _mm256_mul_ps(bbb, aaa);
-
-            let (rgba0, rgba1, rgba2, rgba3) = avx_interleave_rgba_ps(rrr, ggg, bbb, aaa);
-
-            let dst_ptr = dst.as_mut_ptr();
-            _mm256_storeu_ps(dst_ptr, rgba0);
-            _mm256_storeu_ps(dst_ptr.add(8), rgba1);
-            _mm256_storeu_ps(dst_ptr.add(16), rgba2);
-            _mm256_storeu_ps(dst_ptr.add(24), rgba3);
-
-            _cx += 8;
-        }
-
-        rem = rem.chunks_exact_mut(8 * 4).into_remainder();
-        src_rem = src_rem.chunks_exact(8 * 4).remainder();
-    }
+    for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) {
+        let src_ptr = src.as_ptr();
+        let rgba0 = _mm256_loadu_ps(src_ptr);
+        let rgba1 = _mm256_loadu_ps(src_ptr.add(8));
+        let rgba2 = _mm256_loadu_ps(src_ptr.add(16));
+        let rgba3 = _mm256_loadu_ps(src_ptr.add(24));
+        let (rrr, ggg, bbb, aaa) = avx_deinterleave_rgba_ps(rgba0, rgba1, rgba2, rgba3);
+
+        let rrr = _mm256_mul_ps(rrr, aaa);
+        let ggg = _mm256_mul_ps(ggg, aaa);
+        let bbb = _mm256_mul_ps(bbb, aaa);
+
+        let (rgba0, rgba1, rgba2, rgba3) = avx_interleave_rgba_ps(rrr, ggg, bbb, aaa);
+
+        let dst_ptr = dst.as_mut_ptr();
+        _mm256_storeu_ps(dst_ptr, rgba0);
+        _mm256_storeu_ps(dst_ptr.add(8), rgba1);
+        _mm256_storeu_ps(dst_ptr.add(16), rgba2);
+        _mm256_storeu_ps(dst_ptr.add(24), rgba3);
+
+        _cx += 8;
+    }
+
+    rem = rem.chunks_exact_mut(8 * 4).into_remainder();
+    src_rem = src_rem.chunks_exact(8 * 4).remainder();

premultiply_pixel_f32_row(rem, src_rem);
}
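
All of these kernels share the same block-plus-remainder shape: the SIMD loop consumes full blocks via chunks_exact / chunks_exact_mut, then the scalar row function finishes whatever is left. A compilable all-scalar sketch of that shape for the f32 case (function name hypothetical; in the file above the wide body is the intrinsics block and the tail is premultiply_pixel_f32_row):

fn premultiply_row_f32_sketch(dst: &mut [f32], src: &[f32]) {
    let mut rem = dst;
    let mut src_rem = src;
    // Wide path: 8 * 4 floats = 8 RGBA pixels per AVX2 iteration.
    for (d, s) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) {
        for (dp, sp) in d.chunks_exact_mut(4).zip(s.chunks_exact(4)) {
            let a = sp[3];
            dp[0] = sp[0] * a;
            dp[1] = sp[1] * a;
            dp[2] = sp[2] * a;
            dp[3] = a;
        }
    }
    // Rebind the slices to the leftover elements the wide loop did not cover.
    rem = rem.chunks_exact_mut(8 * 4).into_remainder();
    src_rem = src_rem.chunks_exact(8 * 4).remainder();
    // Scalar tail, standing in for premultiply_pixel_f32_row.
    for (dp, sp) in rem.chunks_exact_mut(4).zip(src_rem.chunks_exact(4)) {
        let a = sp[3];
        dp[0] = sp[0] * a;
        dp[1] = sp[1] * a;
        dp[2] = sp[2] * a;
        dp[3] = a;
    }
}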
