diff --git a/crates/core_arch/src/simd.rs b/crates/core_arch/src/simd.rs
index 5c8425623d..f6b2babf09 100644
--- a/crates/core_arch/src/simd.rs
+++ b/crates/core_arch/src/simd.rs
@@ -191,3 +191,7 @@ simd_ty!(i32x16[i32]:
     i32, i32, i32, i32, i32, i32, i32, i32
     | x0, x1, x2, x3, x4, x5, x6, x7,
     x8, x9, x10, x11, x12, x13, x14, x15);
+
+simd_ty!(i64x8[i64]:
+    i64, i64, i64, i64, i64, i64, i64, i64
+    | x0, x1, x2, x3, x4, x5, x6, x7);
diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
index 94efadac74..8994f57724 100644
--- a/crates/core_arch/src/x86/avx512f.rs
+++ b/crates/core_arch/src/x86/avx512f.rs
@@ -87,6 +87,13 @@ pub unsafe fn _mm512_setr_epi32(
     mem::transmute(r)
 }
 
+/// Broadcast 64-bit integer `a` to all elements of `dst`.
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_set1_epi64(a: i64) -> __m512i {
+    mem::transmute(i64x8::splat(a))
+}
+
 #[cfg(test)]
 mod tests {
     use std;
diff --git a/crates/core_arch/src/x86/avx512ifma.rs b/crates/core_arch/src/x86/avx512ifma.rs
new file mode 100644
index 0000000000..9aacd9e02e
--- /dev/null
+++ b/crates/core_arch/src/x86/avx512ifma.rs
@@ -0,0 +1,196 @@
+use core_arch::x86::*;
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of
+/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
+/// unsigned integer from the intermediate result with the
+/// corresponding unsigned 64-bit integer in `a`, and store the
+/// results in `dst`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#avx512techs=AVX512IFMA52&expand=3488)
+#[inline]
+#[target_feature(enable = "avx512ifma")]
+#[cfg_attr(test, assert_instr(vpmadd52huq))]
+pub unsafe fn _mm512_madd52hi_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
+    vpmadd52huq_512(a, b, c)
+}
+
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of
+/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
+/// unsigned integer from the intermediate result with the
+/// corresponding unsigned 64-bit integer in `a`, and store the
+/// results in `dst`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=3497&avx512techs=AVX512IFMA52)
+#[inline]
+#[target_feature(enable = "avx512ifma")]
+#[cfg_attr(test, assert_instr(vpmadd52luq))]
+pub unsafe fn _mm512_madd52lo_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
+    vpmadd52luq_512(a, b, c)
+}
+
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of
+/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
+/// unsigned integer from the intermediate result with the
+/// corresponding unsigned 64-bit integer in `a`, and store the
+/// results in `dst`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=vpmadd52&avx512techs=AVX512IFMA52,AVX512VL&expand=3485)
+#[inline]
+#[target_feature(enable = "avx512ifma,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmadd52huq))]
+pub unsafe fn _mm256_madd52hi_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
+    vpmadd52huq_256(a, b, c)
+}
+
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of
+/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
+/// unsigned integer from the intermediate result with the
+/// corresponding unsigned 64-bit integer in `a`, and store the
+/// results in `dst`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=vpmadd52&avx512techs=AVX512IFMA52,AVX512VL&expand=3494)
+#[inline]
+#[target_feature(enable = "avx512ifma,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmadd52luq))]
+pub unsafe fn _mm256_madd52lo_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
+    vpmadd52luq_256(a, b, c)
+}
+
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of
+/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
+/// unsigned integer from the intermediate result with the
+/// corresponding unsigned 64-bit integer in `a`, and store the
+/// results in `dst`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=3488,3482&text=vpmadd52&avx512techs=AVX512IFMA52,AVX512VL)
+#[inline]
+#[target_feature(enable = "avx512ifma,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmadd52huq))]
+pub unsafe fn _mm_madd52hi_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+    vpmadd52huq_128(a, b, c)
+}
+
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of
+/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
+/// unsigned integer from the intermediate result with the
+/// corresponding unsigned 64-bit integer in `a`, and store the
+/// results in `dst`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=3488,3491&text=vpmadd52&avx512techs=AVX512IFMA52,AVX512VL)
+#[inline]
+#[target_feature(enable = "avx512ifma,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmadd52luq))]
+pub unsafe fn _mm_madd52lo_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+    vpmadd52luq_128(a, b, c)
+}
+
+#[allow(improper_ctypes)]
+extern "C" {
+    #[link_name = "llvm.x86.avx512.vpmadd52l.uq.128"]
+    fn vpmadd52luq_128(z: __m128i, x: __m128i, y: __m128i) -> __m128i;
+    #[link_name = "llvm.x86.avx512.vpmadd52h.uq.128"]
+    fn vpmadd52huq_128(z: __m128i, x: __m128i, y: __m128i) -> __m128i;
+    #[link_name = "llvm.x86.avx512.vpmadd52l.uq.256"]
+    fn vpmadd52luq_256(z: __m256i, x: __m256i, y: __m256i) -> __m256i;
+    #[link_name = "llvm.x86.avx512.vpmadd52h.uq.256"]
+    fn vpmadd52huq_256(z: __m256i, x: __m256i, y: __m256i) -> __m256i;
+    #[link_name = "llvm.x86.avx512.vpmadd52l.uq.512"]
+    fn vpmadd52luq_512(z: __m512i, x: __m512i, y: __m512i) -> __m512i;
+    #[link_name = "llvm.x86.avx512.vpmadd52h.uq.512"]
+    fn vpmadd52huq_512(z: __m512i, x: __m512i, y: __m512i) -> __m512i;
+}
+
+#[cfg(test)]
+mod tests {
+    use std;
+    use stdsimd_test::simd_test;
+
+    use core_arch::x86::*;
+
+    #[simd_test(enable = "avx512ifma")]
+    unsafe fn test_mm512_madd52hi_epu64() {
+        let mut a = _mm512_set1_epi64(10 << 40);
+        let b = _mm512_set1_epi64((11 << 40) + 4);
+        let c = _mm512_set1_epi64((12 << 40) + 3);
+
+        a = _mm512_madd52hi_epu64(a, b, c);
+
+        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
+        let expected = _mm512_set1_epi64(11030549757952);
+
+        assert_eq_m512i(a, expected);
+    }
+
+    #[simd_test(enable = "avx512ifma")]
+    unsafe fn test_mm512_madd52lo_epu64() {
+        let mut a = _mm512_set1_epi64(10 << 40);
+        let b = _mm512_set1_epi64((11 << 40) + 4);
+        let c = _mm512_set1_epi64((12 << 40) + 3);
+
+        a = _mm512_madd52lo_epu64(a, b, c);
+
+        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
+        let expected = _mm512_set1_epi64(100055558127628);
+
+        assert_eq_m512i(a, expected);
+    }
+
+    #[simd_test(enable = "avx512ifma,avx512vl")]
+    unsafe fn test_mm256_madd52hi_epu64() {
+        let mut a = _mm256_set1_epi64x(10 << 40);
+        let b = _mm256_set1_epi64x((11 << 40) + 4);
+        let c = _mm256_set1_epi64x((12 << 40) + 3);
+
+        a = _mm256_madd52hi_epu64(a, b, c);
+
+        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
+        let expected = _mm256_set1_epi64x(11030549757952);
+
+        assert_eq_m256i(a, expected);
+    }
+
+    #[simd_test(enable = "avx512ifma,avx512vl")]
+    unsafe fn test_mm256_madd52lo_epu64() {
+        let mut a = _mm256_set1_epi64x(10 << 40);
+        let b = _mm256_set1_epi64x((11 << 40) + 4);
+        let c = _mm256_set1_epi64x((12 << 40) + 3);
+
+        a = _mm256_madd52lo_epu64(a, b, c);
+
+        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
+        let expected = _mm256_set1_epi64x(100055558127628);
+
+        assert_eq_m256i(a, expected);
+    }
+
+    #[simd_test(enable = "avx512ifma,avx512vl")]
+    unsafe fn test_mm_madd52hi_epu64() {
+        let mut a = _mm_set1_epi64x(10 << 40);
+        let b = _mm_set1_epi64x((11 << 40) + 4);
+        let c = _mm_set1_epi64x((12 << 40) + 3);
+
+        a = _mm_madd52hi_epu64(a, b, c);
+
+        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
+        let expected = _mm_set1_epi64x(11030549757952);
+
+        assert_eq_m128i(a, expected);
+    }
+
+    #[simd_test(enable = "avx512ifma,avx512vl")]
+    unsafe fn test_mm_madd52lo_epu64() {
+        let mut a = _mm_set1_epi64x(10 << 40);
+        let b = _mm_set1_epi64x((11 << 40) + 4);
+        let c = _mm_set1_epi64x((12 << 40) + 3);
+
+        a = _mm_madd52lo_epu64(a, b, c);
+
+        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
+        let expected = _mm_set1_epi64x(100055558127628);
+
+        assert_eq_m128i(a, expected);
+    }
+}
diff --git a/crates/core_arch/src/x86/mod.rs b/crates/core_arch/src/x86/mod.rs
index 5870c2cc18..694b11cfa2 100644
--- a/crates/core_arch/src/x86/mod.rs
+++ b/crates/core_arch/src/x86/mod.rs
@@ -560,3 +560,6 @@ pub unsafe fn ud2() -> ! {
 
 mod avx512f;
 pub use self::avx512f::*;
+
+mod avx512ifma;
+pub use self::avx512ifma::*;
diff --git a/crates/stdsimd-verify/tests/x86-intel.rs b/crates/stdsimd-verify/tests/x86-intel.rs
index 4a81b8f16d..29995454ea 100644
--- a/crates/stdsimd-verify/tests/x86-intel.rs
+++ b/crates/stdsimd-verify/tests/x86-intel.rs
@@ -273,15 +273,25 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> {
             .flat_map(|c| c.to_lowercase())
             .collect::<String>();
 
+        // The XML file names IFMA as "avx512ifma52", while Rust calls
+        // it "avx512ifma". Fix this mismatch by replacing the Intel
+        // name with the Rust name.
+        let fixup_cpuid = |cpuid: String| match cpuid.as_ref() {
+            "avx512ifma52" => String::from("avx512ifma"),
+            _ => cpuid,
+        };
+        let fixed_cpuid = fixup_cpuid(cpuid);
+
         let rust_feature = rust
             .target_feature
             .expect(&format!("no target feature listed for {}", rust.name));
-        if rust_feature.contains(&cpuid) {
+
+        if rust_feature.contains(&fixed_cpuid) {
             continue;
         }
         bail!(
             "intel cpuid `{}` not in `{}` for {}",
-            cpuid,
+            fixed_cpuid,
             rust_feature,
             rust.name
         )
@@ -359,7 +369,7 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> {
         // Apparently all of clang/msvc/gcc accept these intrinsics on
         // 32-bit, so let's do the same
         "_mm_set_epi64x" | "_mm_set1_epi64x" | "_mm256_set_epi64x" | "_mm256_setr_epi64x"
-        | "_mm256_set1_epi64x" => true,
+        | "_mm256_set1_epi64x" | "_mm512_set1_epi64" => true,
 
         // These return a 64-bit argument but they're assembled from other
         // 32-bit registers, so these work on 32-bit just fine. See #308 for
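
Note for reviewers: the expected values hard-coded in the tests above can be reproduced with a rough scalar model of the operation described in the doc comments (multiply the low 52 bits of each 64-bit lane of `b` and `c`, then add either the low or the high 52 bits of the 104-bit product to the full 64-bit lane of `a`). The sketch below is illustrative only; the `madd52lo`/`madd52hi` helpers and `MASK52` constant are not part of this patch or of `core_arch`.

// Scalar sketch of the per-lane vpmadd52luq / vpmadd52huq behaviour,
// used here only to derive the test constants.
fn madd52lo(a: u64, b: u64, c: u64) -> u64 {
    const MASK52: u128 = (1 << 52) - 1;
    // 52-bit x 52-bit multiply gives a 104-bit intermediate product.
    let prod = (b as u128 & MASK52) * (c as u128 & MASK52);
    // Add the low 52 bits of the product to the 64-bit accumulator.
    a.wrapping_add((prod & MASK52) as u64)
}

fn madd52hi(a: u64, b: u64, c: u64) -> u64 {
    const MASK52: u128 = (1 << 52) - 1;
    let prod = (b as u128 & MASK52) * (c as u128 & MASK52);
    // Add the high 52 bits of the product to the 64-bit accumulator.
    a.wrapping_add((prod >> 52) as u64)
}

fn main() {
    let (a, b, c) = (10u64 << 40, (11u64 << 40) + 4, (12u64 << 40) + 3);
    assert_eq!(madd52lo(a, b, c), 100055558127628); // constant in the *_madd52lo tests
    assert_eq!(madd52hi(a, b, c), 11030549757952);  // constant in the *_madd52hi tests
}

Running this standalone reproduces 100055558127628 and 11030549757952, the values asserted by the lo and hi test cases respectively.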