Skip to content

Commit

Permalink
AVX-512 base added
Browse files Browse the repository at this point in the history
  • Loading branch information
awxkee committed Jan 1, 2025
1 parent 20ebeb9 commit 02ab7b4
Show file tree
Hide file tree
Showing 19 changed files with 1,460 additions and 193 deletions.
3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,5 @@ libc = "0.2.158"

[features]
default = ["colorspaces"]
colorspaces = ["dep:colorutils-rs"]
colorspaces = ["dep:colorutils-rs"]
nightly_avx512 = []
36 changes: 20 additions & 16 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,25 +44,27 @@ Despite all implementation are fast, not all the paths are implemented using SIM

`~` - Partially implemented

| | NEON | SSE | AVX2 | WASM |
|----------------|------|-----|------|------|
| RGBA (8 bit) | x | x | x | ~ |
| RGB (8 bit) | x | x | ~ | ~ |
| Plane (8 bit) | x | x | ~ | ~ |
| RGBA (8+ bit) | x | x | ~ | - |
| RGB (8+ bit) | x | x | ~ | - |
| Plane (8+ bit) | ~ | ~ | ~ | - |
| RGBA (f32) | x | x | x | - |
| RGB (f32) | x | x | ~ | - |
| Plane (f32) | x | x | ~ | - |
| RGBA (f16) | x | x | x | - |
| RGB (f16) | x | ~ | ~ | - |
| Plane (f16) | ~ | ~ | ~ | - |
| AR30/RA30 | x | - | - | - |
| | NEON | SSE | AVX2 | AVX-512 | WASM |
|----------------|------|-----|------|---------|------|
| RGBA (8 bit) | x | x | x | ~ | ~ |
| RGB (8 bit) | x | x | ~ | ~ | ~ |
| Plane (8 bit) | x | x | ~ | ~ | ~ |
| RGBA (8+ bit) | x | x | ~ | - | - |
| RGB (8+ bit) | x | x | ~ | - | - |
| Plane (8+ bit) | ~ | ~ | ~ | - | - |
| RGBA (f32) | x | x | x | - | - |
| RGB (f32) | x | x | ~ | - | - |
| Plane (f32) | x | x | ~ | - | - |
| RGBA (f16) | x | x | x | - | - |
| RGB (f16) | x | ~ | ~ | - | - |
| Plane (f16) | ~ | ~ | ~ | - | - |
| AR30/RA30 | x | - | - | - | - |

#### Features

To enable support of `f16` the feature `half` should be activated.
Features:
- To enable `f16` support, activate the `half` feature.
- `nightly_avx512` activates the AVX-512 feature set and requires the `nightly` compiler channel.

#### Target features with runtime dispatch

Expand All @@ -72,6 +74,8 @@ For x86 and aarch64 NEON runtime dispatch is used.

`avx2`, `fma`, `sse4.1`, and `f16c` are detected automatically when available; no additional action is needed, and the best available path is used.

`avx512` requires the `nightly_avx512` feature and the `nightly` compiler channel; when the feature is enabled, AVX-512 support is detected at runtime and used if available.

`fullfp16` NEON target feature detection is performed at runtime; when it is available, the best paths for *f16* images are used on ARM.

For WASM, the `simd128` target feature must be enabled in the build flags.
Expand Down
2 changes: 1 addition & 1 deletion app/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ edition = "2021"
[dependencies]
image = { version = "0.25.5", features = ["default"] }
#image = { path= "../../../RustroverProjects/image", features = ["default", "avif", "avif-native"] }
pic-scale = { path = "..", features = ["half"], default-features = true }
pic-scale = { path = "..", features = ["half", "nightly_avx512"], default-features = true }
fast_image_resize = { version = "5.0.0", features = [] }
half = { version = "2.4.1", default-features = true }

Expand Down
22 changes: 10 additions & 12 deletions app/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ fn resize_plane(

fn main() {
// test_fast_image();
let img = ImageReader::open("./assets/asset_4.png")
let img = ImageReader::open("./assets/nasa-4928x3279-rgba.png")
.unwrap()
.decode()
.unwrap();
Expand All @@ -59,7 +59,7 @@ fn main() {

//
let store =
ImageStore::<u16, 4>::from_slice(&choke, dimensions.0 as usize, dimensions.1 as usize)
ImageStore::<u8, 4>::from_slice(&bytes, dimensions.0 as usize, dimensions.1 as usize)
.unwrap();

let dst_size = ImageSize::new(dimensions.0 as usize / 4, dimensions.1 as usize / 4);
Expand All @@ -75,15 +75,13 @@ fn main() {
// )
// .unwrap();

let mut dst_store = ImageStoreMut::<u16, 4>::alloc_with_depth(
let mut dst_store = ImageStoreMut::<u8, 4>::alloc_with_depth(
dimensions.0 as usize,
dimensions.1 as usize / 2,
10,
);

scaler
.resize_rgba_u16(&store, &mut dst_store, true)
.unwrap();
scaler.resize_rgba(&store, &mut dst_store, true).unwrap();

let elapsed_time = start_time.elapsed();
// Print the elapsed time in milliseconds
Expand Down Expand Up @@ -160,13 +158,13 @@ fn main() {
// .map(|&x| (x * 255f32) as u8)
// .collect();

let dst: Vec<u8> = dst_store
.as_bytes()
.iter()
.map(|&x| (x >> 2) as u8)
.collect();
// let dst: Vec<u8> = dst_store
// .as_bytes()
// .iter()
// .map(|&x| (x >> 2) as u8)
// .collect();

// let dst = dst_store.as_bytes();
let dst = dst_store.as_bytes();
// let dst = resized;
// image::save_buffer(
// "converted.png",
Expand Down
18 changes: 16 additions & 2 deletions src/alpha_handle_u8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,13 @@ pub(crate) fn premultiply_alpha_rgba(
if is_x86_feature_detected!("avx2") {
_dispatcher = avx_premultiply_alpha_rgba;
}
#[cfg(feature = "nightly_avx512")]
if std::arch::is_x86_feature_detected!("avx512f")
&& std::arch::is_x86_feature_detected!("avx512bw")
{
use crate::avx512::avx512_premultiply_alpha_rgba;
_dispatcher = avx512_premultiply_alpha_rgba;
}
}
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
{
Expand All @@ -159,15 +166,22 @@ pub(crate) fn unpremultiply_alpha_rgba(
}
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
{
if is_x86_feature_detected!("sse4.1") {
if std::arch::is_x86_feature_detected!("sse4.1") {
_dispatcher = sse_unpremultiply_alpha_rgba;
}
}
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
{
if is_x86_feature_detected!("avx2") {
if std::arch::is_x86_feature_detected!("avx2") {
_dispatcher = avx_unpremultiply_alpha_rgba;
}
#[cfg(feature = "nightly_avx512")]
if std::arch::is_x86_feature_detected!("avx512f")
&& std::arch::is_x86_feature_detected!("avx512bw")
{
use crate::avx512::avx512_unpremultiply_alpha_rgba;
_dispatcher = avx512_unpremultiply_alpha_rgba;
}
}
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
{
Expand Down
10 changes: 4 additions & 6 deletions src/avx2/alpha_u16.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@ unsafe fn _mm256_scale_by_alpha(px: __m256i, low_low_a: __m256, low_high_a: __m2
let low_px = _mm256_cvtepi32_ps(_mm256_unpacklo_epi16(px, zeros));
let high_px = _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(px, zeros));

let new_ll = _mm256_cvtps_epi32(_mm256_round_ps::<0x02>(_mm256_mul_ps(low_px, low_low_a)));
let new_lh = _mm256_cvtps_epi32(_mm256_round_ps::<0x02>(_mm256_mul_ps(high_px, low_high_a)));
let new_ll = _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(_mm256_mul_ps(low_px, low_low_a)));
let new_lh = _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(_mm256_mul_ps(high_px, low_high_a)));

_mm256_packus_epi32(new_ll, new_lh)
}
Expand Down Expand Up @@ -110,8 +110,7 @@ trait Avx2PremultiplyExecutor {
struct Avx2PremultiplyExecutorDefault<const BIT_DEPTH: usize> {}

impl<const BIT_DEPTH: usize> Avx2PremultiplyExecutorDefault<BIT_DEPTH> {
#[inline]
#[target_feature(enable = "avx2")]
#[inline(always)]
unsafe fn premultiply_chunk(&self, dst: &mut [u16], src: &[u16]) {
let src_ptr = src.as_ptr();
let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i);
Expand Down Expand Up @@ -203,8 +202,7 @@ impl<const BIT_DEPTH: usize> Avx2PremultiplyExecutor for Avx2PremultiplyExecutor
struct Avx2PremultiplyExecutorAnyBit {}

impl Avx2PremultiplyExecutorAnyBit {
#[inline]
#[target_feature(enable = "avx2")]
#[inline(always)]
unsafe fn premultiply_chunk(&self, dst: &mut [u16], src: &[u16], scale: __m256) {
let src_ptr = src.as_ptr();
let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i);
Expand Down
Loading

0 comments on commit 02ab7b4

Please sign in to comment.