Skip to content

Commit 02ab7b4

Browse files
committed
AVX-512 base added
1 parent 20ebeb9 commit 02ab7b4

19 files changed

+1460
-193
lines changed

Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,4 +27,5 @@ libc = "0.2.158"
2727

2828
[features]
2929
default = ["colorspaces"]
30-
colorspaces = ["dep:colorutils-rs"]
30+
colorspaces = ["dep:colorutils-rs"]
31+
nightly_avx512 = []

README.md

Lines changed: 20 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -44,25 +44,27 @@ Despite all implementation are fast, not all the paths are implemented using SIM
4444

4545
`~` - Partially implemented
4646

47-
| | NEON | SSE | AVX2 | WASM |
48-
|----------------|------|-----|------|------|
49-
| RGBA (8 bit) | x | x | x | ~ |
50-
| RGB (8 bit) | x | x | ~ | ~ |
51-
| Plane (8 bit) | x | x | ~ | ~ |
52-
| RGBA (8+ bit) | x | x | ~ | - |
53-
| RGB (8+ bit) | x | x | ~ | - |
54-
| Plane (8+ bit) | ~ | ~ | ~ | - |
55-
| RGBA (f32) | x | x | x | - |
56-
| RGB (f32) | x | x | ~ | - |
57-
| Plane (f32) | x | x | ~ | - |
58-
| RGBA (f16) | x | x | x | - |
59-
| RGB (f16) | x | ~ | ~ | - |
60-
| Plane (f16) | ~ | ~ | ~ | - |
61-
| AR30/RA30 | x | - | - | - |
47+
| | NEON | SSE | AVX2 | AVX-512 | WASM |
48+
|----------------|------|-----|------|---------|------|
49+
| RGBA (8 bit) | x | x | x | ~ | ~ |
50+
| RGB (8 bit) | x | x | ~ | ~ | ~ |
51+
| Plane (8 bit) | x | x | ~ | ~ | ~ |
52+
| RGBA (8+ bit) | x | x | ~ | - | - |
53+
| RGB (8+ bit) | x | x | ~ | - | - |
54+
| Plane (8+ bit) | ~ | ~ | ~ | - | - |
55+
| RGBA (f32) | x | x | x | - | - |
56+
| RGB (f32) | x | x | ~ | - | - |
57+
| Plane (f32) | x | x | ~ | - | - |
58+
| RGBA (f16) | x | x | x | - | - |
59+
| RGB (f16) | x | ~ | ~ | - | - |
60+
| Plane (f16) | ~ | ~ | ~ | - | - |
61+
| AR30/RA30 | x | - | - | - | - |
6262

6363
#### Features
6464

65-
To enable support of `f16` the feature `half` should be activated.
65+
Features:
66+
- To enable support of `f16` the feature `half` should be activated.
67+
- `nightly_avx512` activates the AVX-512 feature set and requires the `nightly` compiler channel.
6668

6769
#### Target features with runtime dispatch
6870

@@ -72,6 +74,8 @@ For x86 and aarch64 NEON runtime dispatch is used.
7274

7375
`avx2`, `fma`, `sse4.1` and `f16c` are detected automatically at runtime if available; no additional action is needed, and the best available path is called.
7476

77+
`avx512` requires the `nightly_avx512` feature and the `nightly` compiler channel; when the feature is enabled, AVX-512 is detected at runtime and used if available.
78+
7579
The `fullfp16` NEON target feature is detected at runtime; when it is available, the best paths for *f16* images are used on ARM.
7680

7781
For WASM, enabling the `simd128` target feature in the build flags is mandatory.

app/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ edition = "2021"
66
[dependencies]
77
image = { version = "0.25.5", features = ["default"] }
88
#image = { path= "../../../RustroverProjects/image", features = ["default", "avif", "avif-native"] }
9-
pic-scale = { path = "..", features = ["half"], default-features = true }
9+
pic-scale = { path = "..", features = ["half", "nightly_avx512"], default-features = true }
1010
fast_image_resize = { version = "5.0.0", features = [] }
1111
half = { version = "2.4.1", default-features = true }
1212

app/src/main.rs

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ fn resize_plane(
4242

4343
fn main() {
4444
// test_fast_image();
45-
let img = ImageReader::open("./assets/asset_4.png")
45+
let img = ImageReader::open("./assets/nasa-4928x3279-rgba.png")
4646
.unwrap()
4747
.decode()
4848
.unwrap();
@@ -59,7 +59,7 @@ fn main() {
5959

6060
//
6161
let store =
62-
ImageStore::<u16, 4>::from_slice(&choke, dimensions.0 as usize, dimensions.1 as usize)
62+
ImageStore::<u8, 4>::from_slice(&bytes, dimensions.0 as usize, dimensions.1 as usize)
6363
.unwrap();
6464

6565
let dst_size = ImageSize::new(dimensions.0 as usize / 4, dimensions.1 as usize / 4);
@@ -75,15 +75,13 @@ fn main() {
7575
// )
7676
// .unwrap();
7777

78-
let mut dst_store = ImageStoreMut::<u16, 4>::alloc_with_depth(
78+
let mut dst_store = ImageStoreMut::<u8, 4>::alloc_with_depth(
7979
dimensions.0 as usize,
8080
dimensions.1 as usize / 2,
8181
10,
8282
);
8383

84-
scaler
85-
.resize_rgba_u16(&store, &mut dst_store, true)
86-
.unwrap();
84+
scaler.resize_rgba(&store, &mut dst_store, true).unwrap();
8785

8886
let elapsed_time = start_time.elapsed();
8987
// Print the elapsed time in milliseconds
@@ -160,13 +158,13 @@ fn main() {
160158
// .map(|&x| (x * 255f32) as u8)
161159
// .collect();
162160

163-
let dst: Vec<u8> = dst_store
164-
.as_bytes()
165-
.iter()
166-
.map(|&x| (x >> 2) as u8)
167-
.collect();
161+
// let dst: Vec<u8> = dst_store
162+
// .as_bytes()
163+
// .iter()
164+
// .map(|&x| (x >> 2) as u8)
165+
// .collect();
168166

169-
// let dst = dst_store.as_bytes();
167+
let dst = dst_store.as_bytes();
170168
// let dst = resized;
171169
// image::save_buffer(
172170
// "converted.png",

src/alpha_handle_u8.rs

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,13 @@ pub(crate) fn premultiply_alpha_rgba(
137137
if is_x86_feature_detected!("avx2") {
138138
_dispatcher = avx_premultiply_alpha_rgba;
139139
}
140+
#[cfg(feature = "nightly_avx512")]
141+
if std::arch::is_x86_feature_detected!("avx512f")
142+
&& std::arch::is_x86_feature_detected!("avx512bw")
143+
{
144+
use crate::avx512::avx512_premultiply_alpha_rgba;
145+
_dispatcher = avx512_premultiply_alpha_rgba;
146+
}
140147
}
141148
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
142149
{
@@ -159,15 +166,22 @@ pub(crate) fn unpremultiply_alpha_rgba(
159166
}
160167
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
161168
{
162-
if is_x86_feature_detected!("sse4.1") {
169+
if std::arch::is_x86_feature_detected!("sse4.1") {
163170
_dispatcher = sse_unpremultiply_alpha_rgba;
164171
}
165172
}
166173
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
167174
{
168-
if is_x86_feature_detected!("avx2") {
175+
if std::arch::is_x86_feature_detected!("avx2") {
169176
_dispatcher = avx_unpremultiply_alpha_rgba;
170177
}
178+
#[cfg(feature = "nightly_avx512")]
179+
if std::arch::is_x86_feature_detected!("avx512f")
180+
&& std::arch::is_x86_feature_detected!("avx512bw")
181+
{
182+
use crate::avx512::avx512_unpremultiply_alpha_rgba;
183+
_dispatcher = avx512_unpremultiply_alpha_rgba;
184+
}
171185
}
172186
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
173187
{

src/avx2/alpha_u16.rs

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,8 @@ unsafe fn _mm256_scale_by_alpha(px: __m256i, low_low_a: __m256, low_high_a: __m2
4545
let low_px = _mm256_cvtepi32_ps(_mm256_unpacklo_epi16(px, zeros));
4646
let high_px = _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(px, zeros));
4747

48-
let new_ll = _mm256_cvtps_epi32(_mm256_round_ps::<0x02>(_mm256_mul_ps(low_px, low_low_a)));
49-
let new_lh = _mm256_cvtps_epi32(_mm256_round_ps::<0x02>(_mm256_mul_ps(high_px, low_high_a)));
48+
let new_ll = _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(_mm256_mul_ps(low_px, low_low_a)));
49+
let new_lh = _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(_mm256_mul_ps(high_px, low_high_a)));
5050

5151
_mm256_packus_epi32(new_ll, new_lh)
5252
}
@@ -110,8 +110,7 @@ trait Avx2PremultiplyExecutor {
110110
struct Avx2PremultiplyExecutorDefault<const BIT_DEPTH: usize> {}
111111

112112
impl<const BIT_DEPTH: usize> Avx2PremultiplyExecutorDefault<BIT_DEPTH> {
113-
#[inline]
114-
#[target_feature(enable = "avx2")]
113+
#[inline(always)]
115114
unsafe fn premultiply_chunk(&self, dst: &mut [u16], src: &[u16]) {
116115
let src_ptr = src.as_ptr();
117116
let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i);
@@ -203,8 +202,7 @@ impl<const BIT_DEPTH: usize> Avx2PremultiplyExecutor for Avx2PremultiplyExecutor
203202
struct Avx2PremultiplyExecutorAnyBit {}
204203

205204
impl Avx2PremultiplyExecutorAnyBit {
206-
#[inline]
207-
#[target_feature(enable = "avx2")]
205+
#[inline(always)]
208206
unsafe fn premultiply_chunk(&self, dst: &mut [u16], src: &[u16], scale: __m256) {
209207
let src_ptr = src.as_ptr();
210208
let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i);

0 commit comments

Comments
 (0)