@@ -27663,9 +27663,7 @@ pub unsafe fn _mm512_mask2int(k1: __mmask16) -> i32 {
27663
27663
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
27664
27664
#[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kunpckbw
27665
27665
pub unsafe fn _mm512_kunpackb(a: __mmask16, b: __mmask16) -> __mmask16 {
    // Only the low byte of each operand takes part: the low byte of `a`
    // becomes bits 15:8 of the result and the low byte of `b` becomes bits 7:0.
    let upper = (a & 0xff) << 8;
    let lower = b & 0xff;
    upper | lower
}
27670
27668
27671
27669
/// Performs bitwise OR between k1 and k2, storing the result in dst. CF flag is set if dst consists of all 1's.
@@ -31554,7 +31552,13 @@ pub unsafe fn _mm512_mask_reduce_max_epu64(k: __mmask8, a: __m512i) -> u64 {
31554
31552
#[target_feature(enable = "avx512f")]
31555
31553
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
31556
31554
pub unsafe fn _mm512_reduce_max_ps(a: __m512) -> f32 {
31557
- simd_reduce_max(a.as_f32x16())
31555
+ let a = _mm256_max_ps(
31556
+ simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]),
31557
+ simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]),
31558
+ );
31559
+ let a = _mm_max_ps(_mm256_extractf128_ps::<0>(a), _mm256_extractf128_ps::<1>(a));
31560
+ let a = _mm_max_ps(a, simd_shuffle!(a, a, [2, 3, 0, 1]));
31561
+ _mm_cvtss_f32(_mm_max_ss(a, _mm_movehdup_ps(a)))
31558
31562
}
31559
31563
31560
31564
/// Reduce the packed single-precision (32-bit) floating-point elements in a by maximum using mask k. Returns the maximum of all active elements in a.
@@ -31564,11 +31568,7 @@ pub unsafe fn _mm512_reduce_max_ps(a: __m512) -> f32 {
31564
31568
#[target_feature(enable = "avx512f")]
31565
31569
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
31566
31570
pub unsafe fn _mm512_mask_reduce_max_ps(k: __mmask16, a: __m512) -> f32 {
31567
- simd_reduce_max(simd_select_bitmask(
31568
- k,
31569
- a.as_f32x16(),
31570
- _mm512_undefined_ps().as_f32x16(),
31571
- ))
31571
+ _mm512_reduce_max_ps(_mm512_mask_mov_ps(_mm512_set1_ps(f32::MIN), k, a))
31572
31572
}
31573
31573
31574
31574
/// Reduce the packed double-precision (64-bit) floating-point elements in a by maximum. Returns the maximum of all elements in a.
@@ -31578,7 +31578,12 @@ pub unsafe fn _mm512_mask_reduce_max_ps(k: __mmask16, a: __m512) -> f32 {
31578
31578
#[target_feature(enable = "avx512f")]
31579
31579
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
31580
31580
pub unsafe fn _mm512_reduce_max_pd(a: __m512d) -> f64 {
31581
- simd_reduce_max(a.as_f64x8())
31581
+ let a = _mm256_max_pd(
31582
+ _mm512_extractf64x4_pd::<0>(a),
31583
+ _mm512_extractf64x4_pd::<1>(a),
31584
+ );
31585
+ let a = _mm_max_pd(_mm256_extractf128_pd::<0>(a), _mm256_extractf128_pd::<1>(a));
31586
+ _mm_cvtsd_f64(_mm_max_sd(a, simd_shuffle!(a, a, [1, 0])))
31582
31587
}
31583
31588
31584
31589
/// Reduce the packed double-precision (64-bit) floating-point elements in a by maximum using mask k. Returns the maximum of all active elements in a.
@@ -31588,11 +31593,7 @@ pub unsafe fn _mm512_reduce_max_pd(a: __m512d) -> f64 {
31588
31593
#[target_feature(enable = "avx512f")]
31589
31594
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
31590
31595
pub unsafe fn _mm512_mask_reduce_max_pd(k: __mmask8, a: __m512d) -> f64 {
31591
- simd_reduce_max(simd_select_bitmask(
31592
- k,
31593
- a.as_f64x8(),
31594
- _mm512_undefined_pd().as_f64x8(),
31595
- ))
31596
+ _mm512_reduce_max_pd(_mm512_mask_mov_pd(_mm512_set1_pd(f64::MIN), k, a))
31596
31597
}
31597
31598
31598
31599
/// Reduce the packed signed 32-bit integers in a by minimum. Returns the minimum of all elements in a.
@@ -31698,7 +31699,13 @@ pub unsafe fn _mm512_mask_reduce_min_epu64(k: __mmask8, a: __m512i) -> u64 {
31698
31699
#[target_feature(enable = "avx512f")]
31699
31700
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
31700
31701
pub unsafe fn _mm512_reduce_min_ps(a: __m512) -> f32 {
31701
- simd_reduce_min(a.as_f32x16())
31702
+ let a = _mm256_min_ps(
31703
+ simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]),
31704
+ simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]),
31705
+ );
31706
+ let a = _mm_min_ps(_mm256_extractf128_ps::<0>(a), _mm256_extractf128_ps::<1>(a));
31707
+ let a = _mm_min_ps(a, simd_shuffle!(a, a, [2, 3, 0, 1]));
31708
+ _mm_cvtss_f32(_mm_min_ss(a, _mm_movehdup_ps(a)))
31702
31709
}
31703
31710
31704
31711
/// Reduce the packed single-precision (32-bit) floating-point elements in a by minimum using mask k. Returns the minimum of all active elements in a.
@@ -31708,11 +31715,7 @@ pub unsafe fn _mm512_reduce_min_ps(a: __m512) -> f32 {
31708
31715
#[target_feature(enable = "avx512f")]
31709
31716
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
31710
31717
pub unsafe fn _mm512_mask_reduce_min_ps(k: __mmask16, a: __m512) -> f32 {
31711
- simd_reduce_min(simd_select_bitmask(
31712
- k,
31713
- a.as_f32x16(),
31714
- _mm512_undefined_ps().as_f32x16(),
31715
- ))
31718
+ _mm512_reduce_min_ps(_mm512_mask_mov_ps(_mm512_set1_ps(f32::MAX), k, a))
31716
31719
}
31717
31720
31718
31721
/// Reduce the packed double-precision (64-bit) floating-point elements in a by minimum. Returns the minimum of all elements in a.
@@ -31722,7 +31725,12 @@ pub unsafe fn _mm512_mask_reduce_min_ps(k: __mmask16, a: __m512) -> f32 {
31722
31725
#[target_feature(enable = "avx512f")]
31723
31726
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
31724
31727
pub unsafe fn _mm512_reduce_min_pd(a: __m512d) -> f64 {
31725
- simd_reduce_min(a.as_f64x8())
31728
+ let a = _mm256_min_pd(
31729
+ _mm512_extractf64x4_pd::<0>(a),
31730
+ _mm512_extractf64x4_pd::<1>(a),
31731
+ );
31732
+ let a = _mm_min_pd(_mm256_extractf128_pd::<0>(a), _mm256_extractf128_pd::<1>(a));
31733
+ _mm_cvtsd_f64(_mm_min_sd(a, simd_shuffle!(a, a, [1, 0])))
31726
31734
}
31727
31735
31728
31736
/// Reduce the packed double-precision (64-bit) floating-point elements in a by minimum using mask k. Returns the minimum of all active elements in a.
@@ -31732,11 +31740,7 @@ pub unsafe fn _mm512_reduce_min_pd(a: __m512d) -> f64 {
31732
31740
#[target_feature(enable = "avx512f")]
31733
31741
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
31734
31742
pub unsafe fn _mm512_mask_reduce_min_pd(k: __mmask8, a: __m512d) -> f64 {
31735
- simd_reduce_min(simd_select_bitmask(
31736
- k,
31737
- a.as_f64x8(),
31738
- _mm512_undefined_pd().as_f64x8(),
31739
- ))
31743
+ _mm512_reduce_min_pd(_mm512_mask_mov_pd(_mm512_set1_pd(f64::MAX), k, a))
31740
31744
}
31741
31745
31742
31746
/// Reduce the packed 32-bit integers in a by bitwise AND. Returns the bitwise AND of all elements in a.
@@ -54323,7 +54327,7 @@ mod tests {
54323
54327
let a: u16 = 0b11001100_00110011;
54324
54328
let b: u16 = 0b00101110_00001011;
54325
54329
let r = _mm512_kunpackb(a, b);
54326
- let e: u16 = 0b00101110_00110011 ;
54330
+ let e: u16 = 0b00110011_00001011 ;
54327
54331
assert_eq!(r, e);
54328
54332
}
54329
54333
0 commit comments