7 changes: 4 additions & 3 deletions .github/workflows/main.yml
@@ -94,7 +94,7 @@ jobs:
- thumbv7em-none-eabihf

# macOS targets
#- x86_64-apple-darwin
- x86_64-apple-darwin
- aarch64-apple-darwin
# FIXME: gh-actions build environment doesn't have linker support
# - i686-apple-darwin
@@ -103,8 +103,7 @@ jobs:
- x86_64-pc-windows-msvc
- i686-pc-windows-msvc
- aarch64-pc-windows-msvc
# FIXME: Disassembly not implemented for the # following targets:
# - x86_64-pc-windows-gnu:
- x86_64-pc-windows-gnu
# - i686-pc-windows-gnu:

include:
@@ -155,6 +154,8 @@ jobs:
- target: aarch64-pc-windows-msvc
os: windows-latest
norun: true
- target: x86_64-pc-windows-gnu
os: windows-latest
- target: i586-unknown-linux-gnu
os: ubuntu-latest
- target: nvptx64-nvidia-cuda
1,571 changes: 1,571 additions & 0 deletions crates/core_arch/missing-x86.md

Large diffs are not rendered by default.

112 changes: 83 additions & 29 deletions crates/core_arch/src/x86/avx.rs
@@ -52,9 +52,8 @@ pub unsafe fn _mm256_add_ps(a: __m256, b: __m256) -> __m256 {
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_pd)
#[inline]
#[target_feature(enable = "avx")]
// FIXME: Should be 'vandpd' instruction.
// See https://github.com/rust-lang/stdarch/issues/71
#[cfg_attr(test, assert_instr(vandps))]
#[cfg_attr(test, assert_instr(vandp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_and_pd(a: __m256d, b: __m256d) -> __m256d {
let a: u64x4 = transmute(a);
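A hedged aside on this change (inferred from the diff, not stated in it): `assert_instr` appears to match the disassembled mnemonic by prefix, so `vandp` accepts whichever of `vandps`/`vandpd` the backend emits. The intrinsic itself routes the AND through the integer domain; a simplified scalar model of that lowering, with a hypothetical helper name:

```rust
// Simplified model (assumption, not the real implementation): reinterpret
// each f64 lane as its u64 bit pattern, AND bitwise, reinterpret back.
fn and_pd_model(a: [f64; 4], b: [f64; 4]) -> [f64; 4] {
    core::array::from_fn(|i| f64::from_bits(a[i].to_bits() & b[i].to_bits()))
}
```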
@@ -82,9 +81,8 @@ pub unsafe fn _mm256_and_ps(a: __m256, b: __m256) -> __m256 {
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_pd)
#[inline]
#[target_feature(enable = "avx")]
// FIXME: should be `vorpd` instruction.
// See <https://github.com/rust-lang/stdarch/issues/71>.
#[cfg_attr(test, assert_instr(vorps))]
#[cfg_attr(test, assert_instr(vorp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_or_pd(a: __m256d, b: __m256d) -> __m256d {
let a: u64x4 = transmute(a);
@@ -162,8 +160,7 @@ pub unsafe fn _mm256_shuffle_ps<const MASK: i32>(a: __m256, b: __m256) -> __m256
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_pd)
#[inline]
#[target_feature(enable = "avx")]
// FIXME: should be `vandnpd` instruction.
#[cfg_attr(test, assert_instr(vandnps))]
#[cfg_attr(test, assert_instr(vandnp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_andnot_pd(a: __m256d, b: __m256d) -> __m256d {
let a: u64x4 = transmute(a);
@@ -615,8 +612,7 @@ pub unsafe fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 {
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_pd)
#[inline]
#[target_feature(enable = "avx")]
// FIXME Should be 'vxorpd' instruction.
#[cfg_attr(test, assert_instr(vxorps))]
#[cfg_attr(test, assert_instr(vxorp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_xor_pd(a: __m256d, b: __m256d) -> __m256d {
let a: u64x4 = transmute(a);
@@ -943,7 +939,7 @@ pub unsafe fn _mm256_cvttps_epi32(a: __m256) -> __m256i {
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
all(test, not(target_os = "windows")),
all(test, not(target_env = "msvc")),
assert_instr(vextractf128, IMM1 = 1)
)]
#[rustc_legacy_const_generics(1)]
@@ -964,7 +960,7 @@ pub unsafe fn _mm256_extractf128_ps<const IMM1: i32>(a: __m256) -> __m128 {
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
all(test, not(target_os = "windows")),
all(test, not(target_env = "msvc")),
assert_instr(vextractf128, IMM1 = 1)
)]
#[rustc_legacy_const_generics(1)]
@@ -980,7 +976,7 @@ pub unsafe fn _mm256_extractf128_pd<const IMM1: i32>(a: __m256d) -> __m128d {
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
all(test, not(target_os = "windows")),
all(test, not(target_env = "msvc")),
assert_instr(vextractf128, IMM1 = 1)
)]
#[rustc_legacy_const_generics(1)]
@@ -995,6 +991,29 @@ pub unsafe fn _mm256_extractf128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
transmute(dst)
}

/// Extracts a 32-bit integer from `a`, selected with `INDEX`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi32)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_extract_epi32<const INDEX: i32>(a: __m256i) -> i32 {
static_assert_uimm_bits!(INDEX, 3);
simd_extract!(a.as_i32x8(), INDEX as u32)
}

/// Returns the first element of the input vector of `[8 x i32]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsi256_si32)
#[inline]
#[target_feature(enable = "avx")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvtsi256_si32(a: __m256i) -> i32 {
simd_extract!(a.as_i32x8(), 0)
}
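A minimal usage sketch for the two new intrinsics (assumes an AVX-capable x86_64 host with the usual runtime feature check; not part of the diff):

```rust
#[cfg(target_arch = "x86_64")]
fn demo() {
    if is_x86_feature_detected!("avx") {
        use std::arch::x86_64::*;
        unsafe {
            let v = _mm256_setr_epi32(10, 11, 12, 13, 14, 15, 16, 17);
            // INDEX is a const generic limited to 0..=7 by static_assert_uimm_bits!.
            assert_eq!(_mm256_extract_epi32::<5>(v), 15);
            // Equivalent to extracting lane 0.
            assert_eq!(_mm256_cvtsi256_si32(v), 10);
        }
    }
}
```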

/// Zeroes the contents of all XMM or YMM registers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroall)
@@ -1270,7 +1289,7 @@ pub unsafe fn _mm256_broadcast_pd(a: &__m128d) -> __m256d {
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
all(test, not(target_os = "windows")),
all(test, not(target_env = "msvc")),
assert_instr(vinsertf128, IMM1 = 1)
)]
#[rustc_legacy_const_generics(2)]
@@ -1292,7 +1311,7 @@ pub unsafe fn _mm256_insertf128_ps<const IMM1: i32>(a: __m256, b: __m128) -> __m
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
all(test, not(target_os = "windows")),
all(test, not(target_env = "msvc")),
assert_instr(vinsertf128, IMM1 = 1)
)]
#[rustc_legacy_const_generics(2)]
@@ -1313,7 +1332,7 @@ pub unsafe fn _mm256_insertf128_pd<const IMM1: i32>(a: __m256d, b: __m128d) -> _
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
all(test, not(target_os = "windows")),
all(test, not(target_env = "msvc")),
assert_instr(vinsertf128, IMM1 = 1)
)]
#[rustc_legacy_const_generics(2)]
@@ -1378,7 +1397,7 @@ pub unsafe fn _mm256_insert_epi32<const INDEX: i32>(a: __m256i, i: i32) -> __m25
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovapd expected
#[cfg_attr(test, assert_instr(vmovap))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm256_load_pd(mem_addr: *const f64) -> __m256d {
@@ -1393,7 +1412,7 @@ pub unsafe fn _mm256_load_pd(mem_addr: *const f64) -> __m256d {
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovapd expected
#[cfg_attr(test, assert_instr(vmovap))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm256_store_pd(mem_addr: *mut f64, a: __m256d) {
@@ -1437,7 +1456,7 @@ pub unsafe fn _mm256_store_ps(mem_addr: *mut f32, a: __m256) {
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovupd expected
#[cfg_attr(test, assert_instr(vmovup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_loadu_pd(mem_addr: *const f64) -> __m256d {
let mut dst = _mm256_undefined_pd();
@@ -1456,7 +1475,7 @@ pub unsafe fn _mm256_loadu_pd(mem_addr: *const f64) -> __m256d {
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovupd expected
#[cfg_attr(test, assert_instr(vmovup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_storeu_pd(mem_addr: *mut f64, a: __m256d) {
mem_addr.cast::<__m256d>().write_unaligned(a);
@@ -1715,11 +1734,11 @@ pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i {
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovntps))] // FIXME vmovntdq
#[cfg_attr(test, assert_instr(vmovntdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) {
crate::arch::asm!(
"vmovntps [{mem_addr}], {a}",
"vmovntdq [{mem_addr}], {a}",
mem_addr = in(reg) mem_addr,
a = in(ymm_reg) a,
options(nostack, preserves_flags),
@@ -1742,12 +1761,12 @@ pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) {
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovntps))] // FIXME vmovntpd
#[cfg_attr(test, assert_instr(vmovntpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) {
crate::arch::asm!(
"vmovntps [{mem_addr}], {a}",
"vmovntpd [{mem_addr}], {a}",
mem_addr = in(reg) mem_addr,
a = in(ymm_reg) a,
options(nostack, preserves_flags),
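With both streaming stores now asserting their proper non-temporal mnemonics, it is worth restating the usage contract from the `_mm_sfence` docs they reference: non-temporal stores are weakly ordered, and the destination must be 32-byte aligned. A sketch under those assumptions (hypothetical helper; caller guarantees alignment and exclusive access):

```rust
use std::arch::x86_64::*;

// Copy `src` to 32-byte-aligned `dst`, bypassing the cache hierarchy.
#[target_feature(enable = "avx")]
unsafe fn stream_copy(dst: *mut __m256i, src: &[__m256i]) {
    for (i, v) in src.iter().enumerate() {
        _mm256_stream_si256(dst.add(i), *v);
    }
    // Non-temporal stores are weakly ordered: fence before other threads
    // (or subsequent code that assumes ordering) observe the data.
    _mm_sfence();
}
```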
@@ -2145,7 +2164,7 @@ pub unsafe fn _mm256_movemask_ps(a: __m256) -> i32 {
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vxorps))] // FIXME vxorpd expected
#[cfg_attr(test, assert_instr(vxorp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_setzero_pd() -> __m256d {
_mm256_set1_pd(0.0)
@@ -2676,8 +2695,7 @@ pub unsafe fn _mm256_castsi256_si128(a: __m256i) -> __m128i {
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_castps128_ps256(a: __m128) -> __m256 {
// FIXME simd_shuffle!(a, a, [0, 1, 2, 3, -1, -1, -1, -1])
simd_shuffle!(a, a, [0, 1, 2, 3, 0, 0, 0, 0])
simd_shuffle!(a, _mm_undefined_ps(), [0, 1, 2, 3, 4, 4, 4, 4])
}

/// Casts vector of type __m128d to type __m256d;
@@ -2690,8 +2708,7 @@ pub unsafe fn _mm256_castps128_ps256(a: __m128) -> __m256 {
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_castpd128_pd256(a: __m128d) -> __m256d {
// FIXME simd_shuffle!(a, a, [0, 1, -1, -1])
simd_shuffle!(a, a, [0, 1, 0, 0])
simd_shuffle!(a, _mm_undefined_pd(), [0, 1, 2, 2])
}

/// Casts vector of type __m128i to type __m256i;
@@ -2705,8 +2722,8 @@ pub unsafe fn _mm256_castpd128_pd256(a: __m128d) -> __m256d {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_castsi128_si256(a: __m128i) -> __m256i {
let a = a.as_i64x2();
// FIXME simd_shuffle!(a, a, [0, 1, -1, -1])
let dst: i64x4 = simd_shuffle!(a, a, [0, 1, 0, 0]);
let undefined = _mm_undefined_si128().as_i64x2();
let dst: i64x4 = simd_shuffle!(a, undefined, [0, 1, 2, 2]);
transmute(dst)
}

@@ -3719,6 +3736,22 @@ mod tests {
assert_eq_m128i(r, e);
}

#[simd_test(enable = "avx")]
unsafe fn test_mm256_extract_epi32() {
let a = _mm256_setr_epi32(-1, 1, 2, 3, 4, 5, 6, 7);
let r1 = _mm256_extract_epi32::<0>(a);
let r2 = _mm256_extract_epi32::<3>(a);
assert_eq!(r1, -1);
assert_eq!(r2, 3);
}

#[simd_test(enable = "avx")]
unsafe fn test_mm256_cvtsi256_si32() {
let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
let r = _mm256_cvtsi256_si32(a);
assert_eq!(r, 1);
}

#[simd_test(enable = "avx")]
#[cfg_attr(miri, ignore)] // Register-level operation not supported by Miri
unsafe fn test_mm256_zeroall() {
@@ -4698,6 +4731,27 @@ mod tests {
assert_eq_m128i(r, _mm_setr_epi64x(1, 2));
}

#[simd_test(enable = "avx")]
unsafe fn test_mm256_castps128_ps256() {
let a = _mm_setr_ps(1., 2., 3., 4.);
let r = _mm256_castps128_ps256(a);
assert_eq_m128(_mm256_castps256_ps128(r), a);
}

#[simd_test(enable = "avx")]
unsafe fn test_mm256_castpd128_pd256() {
let a = _mm_setr_pd(1., 2.);
let r = _mm256_castpd128_pd256(a);
assert_eq_m128d(_mm256_castpd256_pd128(r), a);
}

#[simd_test(enable = "avx")]
unsafe fn test_mm256_castsi128_si256() {
let a = _mm_setr_epi32(1, 2, 3, 4);
let r = _mm256_castsi128_si256(a);
assert_eq_m128i(_mm256_castsi256_si128(r), a);
}

#[simd_test(enable = "avx")]
unsafe fn test_mm256_zextps128_ps256() {
let a = _mm_setr_ps(1., 2., 3., 4.);