Skip to content

Commit 1bc5f95

Browse files
sayantnAmanieu
authored andcommitted
Fixed _mm512_kunpackb, reduce-max and reduce-min
`_mm512_kunpackb` was implemented wrong, and `simd_reduce_max` uses `maxnum` for comparison, which adheres to IEEE754, but Intel specifically says that they do NOT adhere to IEEE754 for NaNs, which can give wrong results
1 parent 41d19d4 commit 1bc5f95

File tree

1 file changed

+32
-28
lines changed

1 file changed

+32
-28
lines changed

crates/core_arch/src/x86/avx512f.rs

Lines changed: 32 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -27663,9 +27663,7 @@ pub unsafe fn _mm512_mask2int(k1: __mmask16) -> i32 {
2766327663
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
2766427664
#[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kunpckbw
2766527665
pub unsafe fn _mm512_kunpackb(a: __mmask16, b: __mmask16) -> __mmask16 {
27666-
let a = a & 0b00000000_11111111;
27667-
let b = b & 0b11111111_00000000;
27668-
a | b
27666+
((a & 0xff) << 8) | (b & 0xff)
2766927667
}
2767027668

2767127669
/// Performs bitwise OR between k1 and k2, storing the result in dst. CF flag is set if dst consists of all 1's.
@@ -31554,7 +31552,13 @@ pub unsafe fn _mm512_mask_reduce_max_epu64(k: __mmask8, a: __m512i) -> u64 {
3155431552
#[target_feature(enable = "avx512f")]
3155531553
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
3155631554
pub unsafe fn _mm512_reduce_max_ps(a: __m512) -> f32 {
31557-
simd_reduce_max(a.as_f32x16())
31555+
let a = _mm256_max_ps(
31556+
simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]),
31557+
simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]),
31558+
);
31559+
let a = _mm_max_ps(_mm256_extractf128_ps::<0>(a), _mm256_extractf128_ps::<1>(a));
31560+
let a = _mm_max_ps(a, simd_shuffle!(a, a, [2, 3, 0, 1]));
31561+
_mm_cvtss_f32(_mm_max_ss(a, _mm_movehdup_ps(a)))
3155831562
}
3155931563

3156031564
/// Reduce the packed single-precision (32-bit) floating-point elements in a by maximum using mask k. Returns the maximum of all active elements in a.
@@ -31564,11 +31568,7 @@ pub unsafe fn _mm512_reduce_max_ps(a: __m512) -> f32 {
3156431568
#[target_feature(enable = "avx512f")]
3156531569
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
3156631570
pub unsafe fn _mm512_mask_reduce_max_ps(k: __mmask16, a: __m512) -> f32 {
31567-
simd_reduce_max(simd_select_bitmask(
31568-
k,
31569-
a.as_f32x16(),
31570-
_mm512_undefined_ps().as_f32x16(),
31571-
))
31571+
_mm512_reduce_max_ps(_mm512_mask_mov_ps(_mm512_set1_ps(f32::MIN), k, a))
3157231572
}
3157331573

3157431574
/// Reduce the packed double-precision (64-bit) floating-point elements in a by maximum. Returns the maximum of all elements in a.
@@ -31578,7 +31578,12 @@ pub unsafe fn _mm512_mask_reduce_max_ps(k: __mmask16, a: __m512) -> f32 {
3157831578
#[target_feature(enable = "avx512f")]
3157931579
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
3158031580
pub unsafe fn _mm512_reduce_max_pd(a: __m512d) -> f64 {
31581-
simd_reduce_max(a.as_f64x8())
31581+
let a = _mm256_max_pd(
31582+
_mm512_extractf64x4_pd::<0>(a),
31583+
_mm512_extractf64x4_pd::<1>(a),
31584+
);
31585+
let a = _mm_max_pd(_mm256_extractf128_pd::<0>(a), _mm256_extractf128_pd::<1>(a));
31586+
_mm_cvtsd_f64(_mm_max_sd(a, simd_shuffle!(a, a, [1, 0])))
3158231587
}
3158331588

3158431589
/// Reduce the packed double-precision (64-bit) floating-point elements in a by maximum using mask k. Returns the maximum of all active elements in a.
@@ -31588,11 +31593,7 @@ pub unsafe fn _mm512_reduce_max_pd(a: __m512d) -> f64 {
3158831593
#[target_feature(enable = "avx512f")]
3158931594
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
3159031595
pub unsafe fn _mm512_mask_reduce_max_pd(k: __mmask8, a: __m512d) -> f64 {
31591-
simd_reduce_max(simd_select_bitmask(
31592-
k,
31593-
a.as_f64x8(),
31594-
_mm512_undefined_pd().as_f64x8(),
31595-
))
31596+
_mm512_reduce_max_pd(_mm512_mask_mov_pd(_mm512_set1_pd(f64::MIN), k, a))
3159631597
}
3159731598

3159831599
/// Reduce the packed signed 32-bit integers in a by minimum. Returns the minimum of all elements in a.
@@ -31698,7 +31699,13 @@ pub unsafe fn _mm512_mask_reduce_min_epu64(k: __mmask8, a: __m512i) -> u64 {
3169831699
#[target_feature(enable = "avx512f")]
3169931700
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
3170031701
pub unsafe fn _mm512_reduce_min_ps(a: __m512) -> f32 {
31701-
simd_reduce_min(a.as_f32x16())
31702+
let a = _mm256_min_ps(
31703+
simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]),
31704+
simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]),
31705+
);
31706+
let a = _mm_min_ps(_mm256_extractf128_ps::<0>(a), _mm256_extractf128_ps::<1>(a));
31707+
let a = _mm_min_ps(a, simd_shuffle!(a, a, [2, 3, 0, 1]));
31708+
_mm_cvtss_f32(_mm_min_ss(a, _mm_movehdup_ps(a)))
3170231709
}
3170331710

3170431711
/// Reduce the packed single-precision (32-bit) floating-point elements in a by maximum using mask k. Returns the minimum of all active elements in a.
@@ -31708,11 +31715,7 @@ pub unsafe fn _mm512_reduce_min_ps(a: __m512) -> f32 {
3170831715
#[target_feature(enable = "avx512f")]
3170931716
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
3171031717
pub unsafe fn _mm512_mask_reduce_min_ps(k: __mmask16, a: __m512) -> f32 {
31711-
simd_reduce_min(simd_select_bitmask(
31712-
k,
31713-
a.as_f32x16(),
31714-
_mm512_undefined_ps().as_f32x16(),
31715-
))
31718+
_mm512_reduce_min_ps(_mm512_mask_mov_ps(_mm512_set1_ps(f32::MAX), k, a))
3171631719
}
3171731720

3171831721
/// Reduce the packed double-precision (64-bit) floating-point elements in a by minimum. Returns the minimum of all elements in a.
@@ -31722,7 +31725,12 @@ pub unsafe fn _mm512_mask_reduce_min_ps(k: __mmask16, a: __m512) -> f32 {
3172231725
#[target_feature(enable = "avx512f")]
3172331726
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
3172431727
pub unsafe fn _mm512_reduce_min_pd(a: __m512d) -> f64 {
31725-
simd_reduce_min(a.as_f64x8())
31728+
let a = _mm256_min_pd(
31729+
_mm512_extractf64x4_pd::<0>(a),
31730+
_mm512_extractf64x4_pd::<1>(a),
31731+
);
31732+
let a = _mm_min_pd(_mm256_extractf128_pd::<0>(a), _mm256_extractf128_pd::<1>(a));
31733+
_mm_cvtsd_f64(_mm_min_sd(a, simd_shuffle!(a, a, [1, 0])))
3172631734
}
3172731735

3172831736
/// Reduce the packed double-precision (64-bit) floating-point elements in a by maximum using mask k. Returns the minimum of all active elements in a.
@@ -31732,11 +31740,7 @@ pub unsafe fn _mm512_reduce_min_pd(a: __m512d) -> f64 {
3173231740
#[target_feature(enable = "avx512f")]
3173331741
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
3173431742
pub unsafe fn _mm512_mask_reduce_min_pd(k: __mmask8, a: __m512d) -> f64 {
31735-
simd_reduce_min(simd_select_bitmask(
31736-
k,
31737-
a.as_f64x8(),
31738-
_mm512_undefined_pd().as_f64x8(),
31739-
))
31743+
_mm512_reduce_min_pd(_mm512_mask_mov_pd(_mm512_set1_pd(f64::MAX), k, a))
3174031744
}
3174131745

3174231746
/// Reduce the packed 32-bit integers in a by bitwise AND. Returns the bitwise AND of all elements in a.
@@ -54323,7 +54327,7 @@ mod tests {
5432354327
let a: u16 = 0b11001100_00110011;
5432454328
let b: u16 = 0b00101110_00001011;
5432554329
let r = _mm512_kunpackb(a, b);
54326-
let e: u16 = 0b00101110_00110011;
54330+
let e: u16 = 0b00110011_00001011;
5432754331
assert_eq!(r, e);
5432854332
}
5432954333

0 commit comments

Comments
 (0)