Skip to content

Commit b36e66d

Browse files
committed
Add the missing BMI1, SSE2, SSE4.1 and AVX2 intrinsics
1 parent 249f5c5 commit b36e66d

File tree

7 files changed

+232
-45
lines changed

7 files changed

+232
-45
lines changed

crates/core_arch/missing-x86.md

-29
Original file line numberDiff line numberDiff line change
@@ -51,13 +51,6 @@
5151
</p></details>
5252

5353

54-
<details><summary>["AVX2"]</summary><p>
55-
56-
* [ ] [`_mm256_stream_load_si256`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_stream_load_si256)
57-
* [ ] [`_mm_broadcastsi128_si256`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastsi128_si256)
58-
</p></details>
59-
60-
6154
<details><summary>["AVX512BW"]</summary><p>
6255

6356
* [ ] [`_cvtmask32_u32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_cvtmask32_u32)
@@ -1334,12 +1327,6 @@
13341327
</p></details>
13351328

13361329

1337-
<details><summary>["BMI1"]</summary><p>
1338-
1339-
* [ ] [`_tzcnt_u16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_tzcnt_u16)
1340-
</p></details>
1341-
1342-
13431330
<details><summary>["CET_SS"]</summary><p>
13441331

13451332
* [ ] [`_clrssbsy`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_clrssbsy)
@@ -1546,22 +1533,6 @@
15461533
</p></details>
15471534

15481535

1549-
<details><summary>["SSE2"]</summary><p>
1550-
1551-
* [ ] [`_mm_loadu_si16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16)
1552-
* [ ] [`_mm_loadu_si32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32)
1553-
* [ ] [`_mm_storeu_si16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si16)
1554-
* [ ] [`_mm_storeu_si32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si32)
1555-
* [ ] [`_mm_storeu_si64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si64)
1556-
</p></details>
1557-
1558-
1559-
<details><summary>["SSE4.1"]</summary><p>
1560-
1561-
* [ ] [`_mm_stream_load_si128`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128)
1562-
</p></details>
1563-
1564-
15651536
<details><summary>["TSXLDTRK"]</summary><p>
15661537

15671538
* [ ] [`_xresldtrk`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xresldtrk)

crates/core_arch/src/x86/avx2.rs

+52
Original file line numberDiff line numberDiff line change
@@ -587,6 +587,19 @@ pub unsafe fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d {
587587
simd_shuffle!(a, _mm_setzero_pd(), [0_u32; 4])
588588
}
589589

590+
/// Broadcasts 128 bits of integer data from a to all 128-bit lanes in
591+
/// the 256-bit returned value.
592+
///
593+
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsi128_si256)
594+
#[inline]
595+
#[target_feature(enable = "avx2")]
596+
#[unstable(feature = "simd_x86_updates", issue = "126936")]
597+
pub unsafe fn _mm_broadcastsi128_si256(a: __m128i) -> __m256i {
598+
let zero = _mm_setzero_si128();
599+
let ret = simd_shuffle!(a.as_i64x2(), zero.as_i64x2(), [0, 1, 0, 1]);
600+
transmute::<i64x4, _>(ret)
601+
}
602+
590603
// N.B., `broadcastsi128_si256` is often compiled to `vinsertf128` or
591604
// `vbroadcastf128`.
592605
/// Broadcasts 128 bits of integer data from a to all 128-bit lanes in
@@ -3124,6 +3137,35 @@ pub unsafe fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i {
31243137
transmute(psrlvq256(a.as_i64x4(), count.as_i64x4()))
31253138
}
31263139

3140+
/// Load 256-bits of integer data from memory into dst using a non-temporal memory hint. mem_addr
3141+
/// must be aligned on a 32-byte boundary or a general-protection exception may be generated. To
3142+
/// minimize caching, the data is flagged as non-temporal (unlikely to be used again soon)
3143+
///
3144+
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_load_si256)
3145+
///
3146+
/// # Safety of non-temporal stores
3147+
///
3148+
/// After using this intrinsic, but before any other access to the memory that this intrinsic
3149+
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
3150+
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
3151+
/// return.
3152+
///
3153+
/// See [`_mm_sfence`] for details.
3154+
#[inline]
3155+
#[target_feature(enable = "avx,avx2")]
3156+
#[cfg_attr(test, assert_instr(vmovntdqa))]
3157+
#[unstable(feature = "simd_x86_updates", issue = "126936")]
3158+
pub unsafe fn _mm256_stream_load_si256(mem_addr: *const __m256i) -> __m256i {
3159+
let dst: __m256i;
3160+
crate::arch::asm!(
3161+
"vmovntdqa {a}, [{mem_addr}]",
3162+
a = out(ymm_reg) dst,
3163+
mem_addr = in(reg) mem_addr,
3164+
options(pure, readonly, nostack, preserves_flags),
3165+
);
3166+
dst
3167+
}
3168+
31273169
/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`
31283170
///
31293171
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi16)
@@ -5153,6 +5195,16 @@ mod tests {
51535195
assert_eq_m256i(r, e);
51545196
}
51555197

5198+
#[simd_test(enable = "avx2")]
5199+
// Miri cannot support this until it is clear how it fits in the Rust memory model
5200+
// (non-temporal store)
5201+
#[cfg_attr(miri, ignore)]
5202+
unsafe fn test_mm256_stream_load_si256() {
5203+
let a = _mm256_set_epi64x(5, 6, 7, 8);
5204+
let r = _mm256_stream_load_si256(core::ptr::addr_of!(a) as *const _);
5205+
assert_eq_m256i(a, r);
5206+
}
5207+
51565208
#[simd_test(enable = "avx2")]
51575209
unsafe fn test_mm256_sub_epi16() {
51585210
let a = _mm256_set1_epi16(4);

crates/core_arch/src/x86/bmi1.rs

+20
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,19 @@ pub unsafe fn _blsr_u32(x: u32) -> u32 {
8585
x & (x.wrapping_sub(1))
8686
}
8787

88+
/// Counts the number of trailing least significant zero bits.
89+
///
90+
/// When the source operand is `0`, it returns its size in bits.
91+
///
92+
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_tzcnt_u16)
93+
#[inline]
94+
#[target_feature(enable = "bmi1")]
95+
#[cfg_attr(test, assert_instr(tzcnt))]
96+
#[unstable(feature = "simd_x86_updates", issue = "126936")]
97+
pub unsafe fn _tzcnt_u16(x: u16) -> u16 {
98+
x.trailing_zeros() as u16
99+
}
100+
88101
/// Counts the number of trailing least significant zero bits.
89102
///
90103
/// When the source operand is `0`, it returns its size in bits.
@@ -169,6 +182,13 @@ mod tests {
169182
assert_eq!(r, 0b0010_0000u32);
170183
}
171184

185+
#[simd_test(enable = "bmi1")]
186+
unsafe fn test_tzcnt_u16() {
187+
assert_eq!(_tzcnt_u16(0b0000_0001u16), 0u16);
188+
assert_eq!(_tzcnt_u16(0b0000_0000u16), 16u16);
189+
assert_eq!(_tzcnt_u16(0b1001_0000u16), 4u16);
190+
}
191+
172192
#[simd_test(enable = "bmi1")]
173193
unsafe fn test_tzcnt_u32() {
174194
assert_eq!(_tzcnt_u32(0b0000_0001u32), 0u32);

crates/core_arch/src/x86/sse2.rs

+117-2
Original file line numberDiff line numberDiff line change
@@ -2588,6 +2588,42 @@ pub unsafe fn _mm_storeu_pd(mem_addr: *mut f64, a: __m128d) {
25882588
mem_addr.cast::<__m128d>().write_unaligned(a);
25892589
}
25902590

2591+
/// Store 16-bit integer from the first element of a into memory.
2592+
///
2593+
/// `mem_addr` does not need to be aligned on any particular boundary.
2594+
///
2595+
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16)
2596+
#[inline]
2597+
#[target_feature(enable = "sse2")]
2598+
#[unstable(feature = "simd_x86_updates", issue = "126936")]
2599+
pub unsafe fn _mm_storeu_si16(mem_addr: *mut u8, a: __m128i) {
2600+
ptr::write_unaligned(mem_addr as *mut i16, simd_extract(a.as_i16x8(), 0))
2601+
}
2602+
2603+
/// Store 32-bit integer from the first element of a into memory.
2604+
///
2605+
/// `mem_addr` does not need to be aligned on any particular boundary.
2606+
///
2607+
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32)
2608+
#[inline]
2609+
#[target_feature(enable = "sse2")]
2610+
#[unstable(feature = "simd_x86_updates", issue = "126936")]
2611+
pub unsafe fn _mm_storeu_si32(mem_addr: *mut u8, a: __m128i) {
2612+
ptr::write_unaligned(mem_addr as *mut i32, simd_extract(a.as_i32x4(), 0))
2613+
}
2614+
2615+
/// Store 64-bit integer from the first element of a into memory.
2616+
///
2617+
/// `mem_addr` does not need to be aligned on any particular boundary.
2618+
///
2619+
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64)
2620+
#[inline]
2621+
#[target_feature(enable = "sse2")]
2622+
#[unstable(feature = "simd_x86_updates", issue = "126936")]
2623+
pub unsafe fn _mm_storeu_si64(mem_addr: *mut u8, a: __m128i) {
2624+
ptr::write_unaligned(mem_addr as *mut i64, simd_extract(a.as_i64x2(), 0))
2625+
}
2626+
25912627
/// Stores the lower double-precision (64-bit) floating-point element from `a`
25922628
/// into 2 contiguous elements in memory. `mem_addr` must be aligned on a
25932629
/// 16-byte boundary or a general-protection exception may be generated.
@@ -2713,11 +2749,49 @@ pub unsafe fn _mm_loadu_pd(mem_addr: *const f64) -> __m128d {
27132749
dst
27142750
}
27152751

2716-
/// Loads unaligned 64-bits of integer data from memory into new vector.
2752+
/// Loads unaligned 16-bits of integer data from memory into new vector.
27172753
///
27182754
/// `mem_addr` does not need to be aligned on any particular boundary.
27192755
///
2720-
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64)
2756+
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16)
2757+
#[inline]
2758+
#[target_feature(enable = "sse2")]
2759+
#[unstable(feature = "simd_x86_updates", issue = "126936")]
2760+
pub unsafe fn _mm_loadu_si16(mem_addr: *const u8) -> __m128i {
2761+
transmute(i16x8::new(
2762+
ptr::read_unaligned(mem_addr as *const i16),
2763+
0,
2764+
0,
2765+
0,
2766+
0,
2767+
0,
2768+
0,
2769+
0,
2770+
))
2771+
}
2772+
2773+
/// Loads unaligned 32-bits of integer data from memory into new vector.
2774+
///
2775+
/// `mem_addr` does not need to be aligned on any particular boundary.
2776+
///
2777+
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32)
2778+
#[inline]
2779+
#[target_feature(enable = "sse2")]
2780+
#[unstable(feature = "simd_x86_updates", issue = "126936")]
2781+
pub unsafe fn _mm_loadu_si32(mem_addr: *const u8) -> __m128i {
2782+
transmute(i32x4::new(
2783+
ptr::read_unaligned(mem_addr as *const i32),
2784+
0,
2785+
0,
2786+
0,
2787+
))
2788+
}
2789+
2790+
/// Loads unaligned 16-bits of integer data from memory into new vector.
2791+
///
2792+
/// `mem_addr` does not need to be aligned on any particular boundary.
2793+
///
2794+
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16)
27212795
#[inline]
27222796
#[target_feature(enable = "sse2")]
27232797
#[stable(feature = "simd_x86_mm_loadu_si64", since = "1.46.0")]
@@ -4699,6 +4773,33 @@ mod tests {
46994773
assert_eq!(vals[ofs + 1], 2.0);
47004774
}
47014775

4776+
#[simd_test(enable = "sse2")]
4777+
unsafe fn test_mm_storeu_si16() {
4778+
let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
4779+
let mut r = _mm_setr_epi16(9, 10, 11, 12, 13, 14, 15, 16);
4780+
_mm_storeu_si16(ptr::addr_of_mut!(r).cast(), a);
4781+
let e = _mm_setr_epi16(1, 10, 11, 12, 13, 14, 15, 16);
4782+
assert_eq_m128i(r, e);
4783+
}
4784+
4785+
#[simd_test(enable = "sse2")]
4786+
unsafe fn test_mm_storeu_si32() {
4787+
let a = _mm_setr_epi32(1, 2, 3, 4);
4788+
let mut r = _mm_setr_epi32(5, 6, 7, 8);
4789+
_mm_storeu_si32(ptr::addr_of_mut!(r).cast(), a);
4790+
let e = _mm_setr_epi32(1, 6, 7, 8);
4791+
assert_eq_m128i(r, e);
4792+
}
4793+
4794+
#[simd_test(enable = "sse2")]
4795+
unsafe fn test_mm_storeu_si64() {
4796+
let a = _mm_setr_epi64x(1, 2);
4797+
let mut r = _mm_setr_epi64x(3, 4);
4798+
_mm_storeu_si64(ptr::addr_of_mut!(r).cast(), a);
4799+
let e = _mm_setr_epi64x(1, 4);
4800+
assert_eq_m128i(r, e);
4801+
}
4802+
47024803
#[simd_test(enable = "sse2")]
47034804
unsafe fn test_mm_store1_pd() {
47044805
let mut mem = Memory { data: [0.0f64; 4] };
@@ -4783,6 +4884,20 @@ mod tests {
47834884
assert_eq_m128d(r, e);
47844885
}
47854886

4887+
#[simd_test(enable = "sse2")]
4888+
unsafe fn test_mm_loadu_si16() {
4889+
let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
4890+
let r = _mm_loadu_si16(ptr::addr_of!(a) as *const _);
4891+
assert_eq_m128i(r, _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0));
4892+
}
4893+
4894+
#[simd_test(enable = "sse2")]
4895+
unsafe fn test_mm_loadu_si32() {
4896+
let a = _mm_setr_epi32(1, 2, 3, 4);
4897+
let r = _mm_loadu_si32(ptr::addr_of!(a) as *const _);
4898+
assert_eq_m128i(r, _mm_setr_epi32(1, 0, 0, 0));
4899+
}
4900+
47864901
#[simd_test(enable = "sse2")]
47874902
unsafe fn test_mm_loadu_si64() {
47884903
let a = _mm_setr_epi64x(5, 6);

crates/core_arch/src/x86/sse41.rs

+39
Original file line numberDiff line numberDiff line change
@@ -1142,6 +1142,35 @@ pub unsafe fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 {
11421142
_mm_testnzc_si128(a, mask)
11431143
}
11441144

1145+
/// Load 128-bits of integer data from memory into dstt. mem_addr must be aligned on a 16-byte
1146+
/// boundary or a general-protection exception may be generated. To minimize caching, the data
1147+
/// is flagged as non-temporal (unlikely to be used again soon)
1148+
///
1149+
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128)
1150+
///
1151+
/// # Safety of non-temporal stores
1152+
///
1153+
/// After using this intrinsic, but before any other access to the memory that this intrinsic
1154+
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
1155+
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
1156+
/// return.
1157+
///
1158+
/// See [`_mm_sfence`] for details.
1159+
#[inline]
1160+
#[target_feature(enable = "sse,sse4.1")]
1161+
#[cfg_attr(test, assert_instr(movntdqa))]
1162+
#[unstable(feature = "simd_x86_updates", issue = "126936")]
1163+
pub unsafe fn _mm_stream_load_si128(mem_addr: *const __m128i) -> __m128i {
1164+
let dst: __m128i;
1165+
crate::arch::asm!(
1166+
"movntdqa {a}, [{mem_addr}]",
1167+
a = out(xmm_reg) dst,
1168+
mem_addr = in(reg) mem_addr,
1169+
options(pure, readonly, nostack, preserves_flags),
1170+
);
1171+
dst
1172+
}
1173+
11451174
#[allow(improper_ctypes)]
11461175
extern "C" {
11471176
#[link_name = "llvm.x86.sse41.insertps"]
@@ -1936,4 +1965,14 @@ mod tests {
19361965
let r = _mm_test_mix_ones_zeros(a, mask);
19371966
assert_eq!(r, 0);
19381967
}
1968+
1969+
#[simd_test(enable = "sse4.1")]
1970+
// Miri cannot support this until it is clear how it fits in the Rust memory model
1971+
// (non-temporal store)
1972+
#[cfg_attr(miri, ignore)]
1973+
unsafe fn test_mm_stream_load_si128() {
1974+
let a = _mm_set_epi64x(5, 6);
1975+
let r = _mm_stream_load_si128(core::ptr::addr_of!(a) as *const _);
1976+
assert_eq_m128i(a, r);
1977+
}
19391978
}

crates/stdarch-verify/tests/mips.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
//! Verification of MIPS MSA intrinsics
2-
#![allow(bad_style, unused)]
2+
#![allow(unused, non_upper_case_globals, clippy::single_match)]
33

44
// This file is obtained from
55
// https://gcc.gnu.org/onlinedocs//gcc/MIPS-SIMD-Architecture-Built-in-Functions.html

crates/stdarch-verify/tests/x86-intel.rs

+3-13
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,4 @@
1-
#![allow(bad_style)]
2-
#![allow(unused)]
3-
#![allow(
4-
clippy::shadow_reuse,
5-
clippy::cast_lossless,
6-
clippy::match_same_arms,
7-
clippy::nonminimal_bool,
8-
clippy::print_stdout,
9-
clippy::use_debug,
10-
clippy::eq_op,
11-
clippy::useless_format
12-
)]
1+
#![allow(unused, non_camel_case_types)]
132

143
use std::collections::{BTreeMap, HashMap, HashSet};
154
use std::fs::File;
@@ -284,6 +273,7 @@ fn verify_all_signatures() {
284273
"_mm_cvtsi64x_sd",
285274
"_bextr2_u64",
286275
"_mm_tzcnt_64",
276+
"_mm_broadcastsi128_si256",
287277
];
288278
if !skip.contains(&rust.name) {
289279
println!(
@@ -560,7 +550,7 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> {
560550
// Make sure we've got the right return type.
561551
if let Some(t) = rust.ret {
562552
equate(t, &intel.return_.type_, "", rust.name, false)?;
563-
} else if intel.return_.type_ != "" && intel.return_.type_ != "void" {
553+
} else if !intel.return_.type_.is_empty() && intel.return_.type_ != "void" {
564554
bail!(
565555
"{} returns `{}` with intel, void in rust",
566556
rust.name,

0 commit comments

Comments
 (0)