
Various Fixes and enhancements in x86 intrinsics #1594

Merged · 9 commits · Jun 29, 2024
Changes from 1 commit
29 changes: 0 additions & 29 deletions crates/core_arch/missing-x86.md
@@ -51,13 +51,6 @@
</p></details>


<details><summary>["AVX2"]</summary><p>

* [ ] [`_mm256_stream_load_si256`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_stream_load_si256)
* [ ] [`_mm_broadcastsi128_si256`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastsi128_si256)
</p></details>


<details><summary>["AVX512BW"]</summary><p>

* [ ] [`_cvtmask32_u32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_cvtmask32_u32)
@@ -1334,12 +1327,6 @@
</p></details>


<details><summary>["BMI1"]</summary><p>

* [ ] [`_tzcnt_u16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_tzcnt_u16)
</p></details>


<details><summary>["CET_SS"]</summary><p>

* [ ] [`_clrssbsy`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_clrssbsy)
@@ -1546,22 +1533,6 @@
</p></details>


<details><summary>["SSE2"]</summary><p>

* [ ] [`_mm_loadu_si16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16)
* [ ] [`_mm_loadu_si32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32)
* [ ] [`_mm_storeu_si16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si16)
* [ ] [`_mm_storeu_si32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si32)
* [ ] [`_mm_storeu_si64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si64)
</p></details>


<details><summary>["SSE4.1"]</summary><p>

* [ ] [`_mm_stream_load_si128`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128)
</p></details>


<details><summary>["TSXLDTRK"]</summary><p>

* [ ] [`_xresldtrk`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xresldtrk)
52 changes: 52 additions & 0 deletions crates/core_arch/src/x86/avx2.rs
@@ -587,6 +587,19 @@ pub unsafe fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d {
simd_shuffle!(a, _mm_setzero_pd(), [0_u32; 4])
}

/// Broadcasts 128 bits of integer data from `a` to all 128-bit lanes in
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsi128_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[unstable(feature = "simd_x86_updates", issue = "126936")]
pub unsafe fn _mm_broadcastsi128_si256(a: __m128i) -> __m256i {
let zero = _mm_setzero_si128();
let ret = simd_shuffle!(a.as_i64x2(), zero.as_i64x2(), [0, 1, 0, 1]);
transmute::<i64x4, _>(ret)
}
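
The `simd_shuffle!` above selects lanes `[0, 1, 0, 1]` from the source. A minimal scalar sketch of that lane selection (a hypothetical helper, not part of the PR):

```rust
// Scalar model of the `[0, 1, 0, 1]` shuffle: the two 64-bit lanes of the
// 128-bit source are duplicated into both halves of the 256-bit result.
fn broadcast_si128_model(a: [i64; 2]) -> [i64; 4] {
    [a[0], a[1], a[0], a[1]]
}
```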

// N.B., `broadcastsi128_si256` is often compiled to `vinsertf128` or
// `vbroadcastf128`.
/// Broadcasts 128 bits of integer data from a to all 128-bit lanes in
@@ -3124,6 +3137,35 @@ pub unsafe fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i {
transmute(psrlvq256(a.as_i64x4(), count.as_i64x4()))
}

/// Loads 256-bits of integer data from memory into the returned vector using a non-temporal
/// memory hint. `mem_addr` must be aligned on a 32-byte boundary or a general-protection
/// exception may be generated. To minimize caching, the data is flagged as non-temporal
/// (unlikely to be used again soon).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_load_si256)
Member: This (and all other AVX2 non-temporal operations) should get the same safety comment that the older non-temporal stores have. See e.g. here.

Member: I checked, and only non-temporal stores have special memory orderings on x86; x86 non-temporal loads work just like normal loads.

Contributor (author): @Amanieu said that this doesn't apply to streaming loads, only streaming stores.

Member: Oh, I didn't realize non-temporal loads are even a thing. More nightmare waiting to happen, I guess...
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "avx,avx2")]
#[cfg_attr(test, assert_instr(vmovntdqa))]
#[unstable(feature = "simd_x86_updates", issue = "126936")]
pub unsafe fn _mm256_stream_load_si256(mem_addr: *const __m256i) -> __m256i {
let dst: __m256i;
crate::arch::asm!(
"vmovntdqa {a}, [{mem_addr}]",
a = out(ymm_reg) dst,
mem_addr = in(reg) mem_addr,
options(pure, readonly, nostack, preserves_flags),
);
dst
}
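
As the review thread notes, a non-temporal load returns the same value an ordinary aligned load would; only the cache behavior of `vmovntdqa` differs. A scalar sketch of the value semantics (hypothetical helper, not part of the PR — the caching hint cannot be expressed here):

```rust
// Scalar model: reads four i64 lanes through an aligned pointer, exactly
// like a normal load. Caller must uphold the usual raw-pointer contract.
unsafe fn stream_load_model(mem_addr: *const [i64; 4]) -> [i64; 4] {
    *mem_addr
}
```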

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi16)
@@ -5153,6 +5195,16 @@ mod tests {
assert_eq_m256i(r, e);
}

#[simd_test(enable = "avx2")]
// Miri cannot support this until it is clear how it fits in the Rust memory model
// (non-temporal load)
#[cfg_attr(miri, ignore)]
unsafe fn test_mm256_stream_load_si256() {
let a = _mm256_set_epi64x(5, 6, 7, 8);
let r = _mm256_stream_load_si256(core::ptr::addr_of!(a) as *const _);
assert_eq_m256i(a, r);
}

#[simd_test(enable = "avx2")]
unsafe fn test_mm256_sub_epi16() {
let a = _mm256_set1_epi16(4);
20 changes: 20 additions & 0 deletions crates/core_arch/src/x86/bmi1.rs
@@ -85,6 +85,19 @@ pub unsafe fn _blsr_u32(x: u32) -> u32 {
x & (x.wrapping_sub(1))
}

/// Counts the number of trailing least significant zero bits.
///
/// When the source operand is `0`, it returns its size in bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tzcnt_u16)
#[inline]
#[target_feature(enable = "bmi1")]
#[cfg_attr(test, assert_instr(tzcnt))]
#[unstable(feature = "simd_x86_updates", issue = "126936")]
pub unsafe fn _tzcnt_u16(x: u16) -> u16 {
x.trailing_zeros() as u16
}
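
A plain-Rust sketch (hypothetical helper, not part of the PR) of the property that motivates `tzcnt` over the older `bsf`: like `tzcnt`, `trailing_zeros` is fully defined for a zero input and yields the operand width in bits.

```rust
// Mirrors the intrinsic's documented behavior: counts trailing zero bits,
// returning 16 (the operand width) when the input is zero.
fn tzcnt_u16_model(x: u16) -> u16 {
    x.trailing_zeros() as u16
}
```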

/// Counts the number of trailing least significant zero bits.
///
/// When the source operand is `0`, it returns its size in bits.
@@ -169,6 +182,13 @@ mod tests {
assert_eq!(r, 0b0010_0000u32);
}

#[simd_test(enable = "bmi1")]
unsafe fn test_tzcnt_u16() {
assert_eq!(_tzcnt_u16(0b0000_0001u16), 0u16);
assert_eq!(_tzcnt_u16(0b0000_0000u16), 16u16);
assert_eq!(_tzcnt_u16(0b1001_0000u16), 4u16);
}

#[simd_test(enable = "bmi1")]
unsafe fn test_tzcnt_u32() {
assert_eq!(_tzcnt_u32(0b0000_0001u32), 0u32);
119 changes: 117 additions & 2 deletions crates/core_arch/src/x86/sse2.rs
@@ -2588,6 +2588,42 @@ pub unsafe fn _mm_storeu_pd(mem_addr: *mut f64, a: __m128d) {
mem_addr.cast::<__m128d>().write_unaligned(a);
}

/// Stores a 16-bit integer from the first element of `a` into memory.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16)
#[inline]
#[target_feature(enable = "sse2")]
#[unstable(feature = "simd_x86_updates", issue = "126936")]
pub unsafe fn _mm_storeu_si16(mem_addr: *mut u8, a: __m128i) {
ptr::write_unaligned(mem_addr as *mut i16, simd_extract(a.as_i16x8(), 0))
}

/// Stores a 32-bit integer from the first element of `a` into memory.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32)
#[inline]
#[target_feature(enable = "sse2")]
#[unstable(feature = "simd_x86_updates", issue = "126936")]
pub unsafe fn _mm_storeu_si32(mem_addr: *mut u8, a: __m128i) {
ptr::write_unaligned(mem_addr as *mut i32, simd_extract(a.as_i32x4(), 0))
}

/// Stores a 64-bit integer from the first element of `a` into memory.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64)
#[inline]
#[target_feature(enable = "sse2")]
#[unstable(feature = "simd_x86_updates", issue = "126936")]
pub unsafe fn _mm_storeu_si64(mem_addr: *mut u8, a: __m128i) {
ptr::write_unaligned(mem_addr as *mut i64, simd_extract(a.as_i64x2(), 0))
}
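
A scalar sketch of these partial stores (hypothetical helper, not part of the PR): only the low element is written through the pointer, and every other destination byte is left untouched.

```rust
// Scalar model of _mm_storeu_si32: writes v[0] into dst[0] and leaves the
// remaining destination lanes unchanged.
fn storeu_si32_model(dst: &mut [i32; 4], v: [i32; 4]) {
    dst[0] = v[0];
}
```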

/// Stores the lower double-precision (64-bit) floating-point element from `a`
/// into 2 contiguous elements in memory. `mem_addr` must be aligned on a
/// 16-byte boundary or a general-protection exception may be generated.
@@ -2713,11 +2749,49 @@ pub unsafe fn _mm_loadu_pd(mem_addr: *const f64) -> __m128d {
dst
}

/// Loads unaligned 64-bits of integer data from memory into new vector.
/// Loads unaligned 16-bits of integer data from memory into new vector.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64)
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16)
#[inline]
#[target_feature(enable = "sse2")]
#[unstable(feature = "simd_x86_updates", issue = "126936")]
pub unsafe fn _mm_loadu_si16(mem_addr: *const u8) -> __m128i {
transmute(i16x8::new(
ptr::read_unaligned(mem_addr as *const i16),
0,
0,
0,
0,
0,
0,
0,
))
}

/// Loads unaligned 32-bits of integer data from memory into new vector.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32)
#[inline]
#[target_feature(enable = "sse2")]
#[unstable(feature = "simd_x86_updates", issue = "126936")]
pub unsafe fn _mm_loadu_si32(mem_addr: *const u8) -> __m128i {
transmute(i32x4::new(
ptr::read_unaligned(mem_addr as *const i32),
0,
0,
0,
))
}
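
A scalar sketch of these partial loads (hypothetical helper, not part of the PR): read N bits unaligned and zero-fill the remaining lanes, matching the `i32x4::new(lo, 0, 0, 0)` construction above.

```rust
// Scalar model of _mm_loadu_si32: unaligned 32-bit read, upper lanes
// zeroed. The byte order assumes a little-endian target (as on x86).
fn loadu_si32_model(mem_addr: *const u8) -> [i32; 4] {
    let lo = unsafe { core::ptr::read_unaligned(mem_addr as *const i32) };
    [lo, 0, 0, 0]
}
```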

/// Loads unaligned 64-bits of integer data from memory into new vector.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86_mm_loadu_si64", since = "1.46.0")]
@@ -4699,6 +4773,33 @@ mod tests {
assert_eq!(vals[ofs + 1], 2.0);
}

#[simd_test(enable = "sse2")]
unsafe fn test_mm_storeu_si16() {
let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
let mut r = _mm_setr_epi16(9, 10, 11, 12, 13, 14, 15, 16);
_mm_storeu_si16(ptr::addr_of_mut!(r).cast(), a);
let e = _mm_setr_epi16(1, 10, 11, 12, 13, 14, 15, 16);
assert_eq_m128i(r, e);
}

#[simd_test(enable = "sse2")]
unsafe fn test_mm_storeu_si32() {
let a = _mm_setr_epi32(1, 2, 3, 4);
let mut r = _mm_setr_epi32(5, 6, 7, 8);
_mm_storeu_si32(ptr::addr_of_mut!(r).cast(), a);
let e = _mm_setr_epi32(1, 6, 7, 8);
assert_eq_m128i(r, e);
}

#[simd_test(enable = "sse2")]
unsafe fn test_mm_storeu_si64() {
let a = _mm_setr_epi64x(1, 2);
let mut r = _mm_setr_epi64x(3, 4);
_mm_storeu_si64(ptr::addr_of_mut!(r).cast(), a);
let e = _mm_setr_epi64x(1, 4);
assert_eq_m128i(r, e);
}

#[simd_test(enable = "sse2")]
unsafe fn test_mm_store1_pd() {
let mut mem = Memory { data: [0.0f64; 4] };
@@ -4783,6 +4884,20 @@
assert_eq_m128d(r, e);
}

#[simd_test(enable = "sse2")]
unsafe fn test_mm_loadu_si16() {
let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
let r = _mm_loadu_si16(ptr::addr_of!(a) as *const _);
assert_eq_m128i(r, _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0));
}

#[simd_test(enable = "sse2")]
unsafe fn test_mm_loadu_si32() {
let a = _mm_setr_epi32(1, 2, 3, 4);
let r = _mm_loadu_si32(ptr::addr_of!(a) as *const _);
assert_eq_m128i(r, _mm_setr_epi32(1, 0, 0, 0));
}

#[simd_test(enable = "sse2")]
unsafe fn test_mm_loadu_si64() {
let a = _mm_setr_epi64x(5, 6);
39 changes: 39 additions & 0 deletions crates/core_arch/src/x86/sse41.rs
@@ -1142,6 +1142,35 @@ pub unsafe fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 {
_mm_testnzc_si128(a, mask)
}

/// Loads 128-bits of integer data from memory into the returned vector. `mem_addr` must be
/// aligned on a 16-byte boundary or a general-protection exception may be generated. To
/// minimize caching, the data is flagged as non-temporal (unlikely to be used again soon).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "sse,sse4.1")]
#[cfg_attr(test, assert_instr(movntdqa))]
#[unstable(feature = "simd_x86_updates", issue = "126936")]
pub unsafe fn _mm_stream_load_si128(mem_addr: *const __m128i) -> __m128i {
let dst: __m128i;
crate::arch::asm!(
"movntdqa {a}, [{mem_addr}]",
a = out(xmm_reg) dst,
mem_addr = in(reg) mem_addr,
options(pure, readonly, nostack, preserves_flags),
);
dst
}

#[allow(improper_ctypes)]
extern "C" {
#[link_name = "llvm.x86.sse41.insertps"]
@@ -1936,4 +1965,14 @@ mod tests {
let r = _mm_test_mix_ones_zeros(a, mask);
assert_eq!(r, 0);
}

#[simd_test(enable = "sse4.1")]
// Miri cannot support this until it is clear how it fits in the Rust memory model
// (non-temporal load)
#[cfg_attr(miri, ignore)]
unsafe fn test_mm_stream_load_si128() {
let a = _mm_set_epi64x(5, 6);
let r = _mm_stream_load_si128(core::ptr::addr_of!(a) as *const _);
assert_eq_m128i(a, r);
}
}
2 changes: 1 addition & 1 deletion crates/stdarch-verify/tests/mips.rs
@@ -1,5 +1,5 @@
//! Verification of MIPS MSA intrinsics
#![allow(bad_style, unused)]
#![allow(unused, non_upper_case_globals, clippy::single_match)]

// This file is obtained from
// https://gcc.gnu.org/onlinedocs//gcc/MIPS-SIMD-Architecture-Built-in-Functions.html
16 changes: 3 additions & 13 deletions crates/stdarch-verify/tests/x86-intel.rs
@@ -1,15 +1,4 @@
#![allow(bad_style)]
#![allow(unused)]
#![allow(
clippy::shadow_reuse,
clippy::cast_lossless,
clippy::match_same_arms,
clippy::nonminimal_bool,
clippy::print_stdout,
clippy::use_debug,
clippy::eq_op,
clippy::useless_format
)]
#![allow(unused, non_camel_case_types)]

use std::collections::{BTreeMap, HashMap, HashSet};
use std::fs::File;
@@ -284,6 +273,7 @@ fn verify_all_signatures() {
"_mm_cvtsi64x_sd",
"_bextr2_u64",
"_mm_tzcnt_64",
"_mm_broadcastsi128_si256",
];
if !skip.contains(&rust.name) {
println!(
@@ -560,7 +550,7 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> {
// Make sure we've got the right return type.
if let Some(t) = rust.ret {
equate(t, &intel.return_.type_, "", rust.name, false)?;
} else if intel.return_.type_ != "" && intel.return_.type_ != "void" {
} else if !intel.return_.type_.is_empty() && intel.return_.type_ != "void" {
bail!(
"{} returns `{}` with intel, void in rust",
rust.name,