rust-lang · Amanieu · May 3, 2025 · Apr 22, 2025 · Apr 22, 2025 · Apr 25, 2025
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -56,6 +56,8 @@ jobs:
           os: ubuntu-latest
         - tuple: aarch64-unknown-linux-gnu
           os: ubuntu-latest
+        - tuple: aarch64_be-unknown-linux-gnu
+          os: ubuntu-latest
         - tuple: riscv64gc-unknown-linux-gnu
           os: ubuntu-latest
         - tuple: powerpc-unknown-linux-gnu
@@ -125,6 +127,11 @@ jobs:
             tuple: aarch64-unknown-linux-gnu
             os: ubuntu-latest
           test_everything: true
+        - target:
+            tuple: aarch64_be-unknown-linux-gnu
+            os: ubuntu-latest
+          test_everything: true
+          build_std: true
         - target:
             tuple: armv7-unknown-linux-gnueabihf
             os: ubuntu-latest
@@ -192,13 +199,16 @@ jobs:
 
     steps:
     - uses: actions/checkout@v4
-      with:
-        submodules: recursive
     - name: Install Rust
       run: |
         rustup update nightly --no-self-update
         rustup default nightly
     - run: rustup target add ${{ matrix.target.tuple }}
+      if: matrix.build_std == ''
+    - run: |
+        rustup component add rust-src
+        echo "CARGO_UNSTABLE_BUILD_STD=std" >> $GITHUB_ENV
+      if: matrix.build_std != ''
     - run: cargo generate-lockfile
 
     # Configure some env vars based on matrix configuration

diff --git a/ci/docker/aarch64_be-unknown-linux-gnu/Dockerfile b/ci/docker/aarch64_be-unknown-linux-gnu/Dockerfile
@@ -26,5 +26,5 @@ ENV AARCH64_BE_LIBC="${AARCH64_BE_TOOLCHAIN}/aarch64_be-none-linux-gnu/libc"
 
 ENV CARGO_TARGET_AARCH64_BE_UNKNOWN_LINUX_GNU_LINKER="${AARCH64_BE_TOOLCHAIN}/bin/aarch64_be-none-linux-gnu-gcc"
 ENV CARGO_TARGET_AARCH64_BE_UNKNOWN_LINUX_GNU_RUNNER="qemu-aarch64_be -cpu max -L ${AARCH64_BE_LIBC}"
-ENV OBJDUMP="${AARCH64_BE_TOOLCHAIN}/bin/bin/aarch64-none-linux-gnu-objdump"
+ENV OBJDUMP="${AARCH64_BE_TOOLCHAIN}/bin/aarch64_be-none-linux-gnu-objdump"
 ENV STDARCH_TEST_SKIP_FEATURE=tme
diff --git a/ci/run-docker.sh b/ci/run-docker.sh
@@ -36,6 +36,7 @@ run() {
       --env NOSTD \
       --env NORUN \
       --env RUSTFLAGS \
+      --env CARGO_UNSTABLE_BUILD_STD \
       --volume "${HOME}/.cargo":/cargo \
       --volume "$(rustc --print sysroot)":/rust:ro \
       --volume "$(pwd)":/checkout:ro \

diff --git a/ci/run.sh b/ci/run.sh
@@ -187,7 +187,7 @@ case "${TARGET}" in
             --cppcompiler "${TEST_CXX_COMPILER}" \
             --skip "${TEST_SKIP_INTRINSICS}" \
             --target "${TARGET}" \
-            --linker "${CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_LINKER}" \
+            --linker "${CARGO_TARGET_AARCH64_BE_UNKNOWN_LINUX_GNU_LINKER}" \
             --cxx-toolchain-dir "${AARCH64_BE_TOOLCHAIN}"
         ;;
      *)

diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs
@@ -24470,6 +24470,7 @@ pub fn vrsrad_n_u64<const N: i32>(a: u64, b: u64) -> u64 {
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_high_s16)"]
 #[inline]
 #[target_feature(enable = "neon")]
+#[cfg(target_endian = "little")]
 #[cfg_attr(test, assert_instr(rsubhn2))]
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub fn vrsubhn_high_s16(a: int8x8_t, b: int16x8_t, c: int16x8_t) -> int8x16_t {
@@ -24480,6 +24481,7 @@ pub fn vrsubhn_high_s16(a: int8x8_t, b: int16x8_t, c: int16x8_t) -> int8x16_t {
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_high_s32)"]
 #[inline]
 #[target_feature(enable = "neon")]
+#[cfg(target_endian = "little")]
 #[cfg_attr(test, assert_instr(rsubhn2))]
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub fn vrsubhn_high_s32(a: int16x4_t, b: int32x4_t, c: int32x4_t) -> int16x8_t {
@@ -24490,6 +24492,7 @@ pub fn vrsubhn_high_s32(a: int16x4_t, b: int32x4_t, c: int32x4_t) -> int16x8_t {
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_high_s64)"]
 #[inline]
 #[target_feature(enable = "neon")]
+#[cfg(target_endian = "little")]
 #[cfg_attr(test, assert_instr(rsubhn2))]
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub fn vrsubhn_high_s64(a: int32x2_t, b: int64x2_t, c: int64x2_t) -> int32x4_t {
@@ -24500,6 +24503,7 @@ pub fn vrsubhn_high_s64(a: int32x2_t, b: int64x2_t, c: int64x2_t) -> int32x4_t {
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_high_u16)"]
 #[inline]
 #[target_feature(enable = "neon")]
+#[cfg(target_endian = "little")]
 #[cfg_attr(test, assert_instr(rsubhn2))]
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub fn vrsubhn_high_u16(a: uint8x8_t, b: uint16x8_t, c: uint16x8_t) -> uint8x16_t {
@@ -24510,6 +24514,7 @@ pub fn vrsubhn_high_u16(a: uint8x8_t, b: uint16x8_t, c: uint16x8_t) -> uint8x16_
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_high_u32)"]
 #[inline]
 #[target_feature(enable = "neon")]
+#[cfg(target_endian = "little")]
 #[cfg_attr(test, assert_instr(rsubhn2))]
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub fn vrsubhn_high_u32(a: uint16x4_t, b: uint32x4_t, c: uint32x4_t) -> uint16x8_t {
@@ -24520,12 +24525,79 @@ pub fn vrsubhn_high_u32(a: uint16x4_t, b: uint32x4_t, c: uint32x4_t) -> uint16x8
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_high_u64)"]
 #[inline]
 #[target_feature(enable = "neon")]
+#[cfg(target_endian = "little")]
 #[cfg_attr(test, assert_instr(rsubhn2))]
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub fn vrsubhn_high_u64(a: uint32x2_t, b: uint64x2_t, c: uint64x2_t) -> uint32x4_t {
     let x: uint32x2_t = vrsubhn_u64(b, c);
     unsafe { simd_shuffle!(a, x, [0, 1, 2, 3]) }
 }
+#[doc = "Rounding subtract returning high narrow"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_high_s16)"]
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg(target_endian = "big")]
+#[cfg_attr(test, assert_instr(rsubhn))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub fn vrsubhn_high_s16(a: int8x8_t, b: int16x8_t, c: int16x8_t) -> int8x16_t {
+    let x: int8x8_t = vrsubhn_s16(b, c);
+    unsafe { simd_shuffle!(a, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) }
+}
+#[doc = "Rounding subtract returning high narrow"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_high_s32)"]
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg(target_endian = "big")]
+#[cfg_attr(test, assert_instr(rsubhn))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub fn vrsubhn_high_s32(a: int16x4_t, b: int32x4_t, c: int32x4_t) -> int16x8_t {
+    let x: int16x4_t = vrsubhn_s32(b, c);
+    unsafe { simd_shuffle!(a, x, [0, 1, 2, 3, 4, 5, 6, 7]) }
+}
+#[doc = "Rounding subtract returning high narrow"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_high_s64)"]
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg(target_endian = "big")]
+#[cfg_attr(test, assert_instr(rsubhn))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub fn vrsubhn_high_s64(a: int32x2_t, b: int64x2_t, c: int64x2_t) -> int32x4_t {
+    let x: int32x2_t = vrsubhn_s64(b, c);
+    unsafe { simd_shuffle!(a, x, [0, 1, 2, 3]) }
+}
+#[doc = "Rounding subtract returning high narrow"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_high_u16)"]
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg(target_endian = "big")]
+#[cfg_attr(test, assert_instr(rsubhn))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub fn vrsubhn_high_u16(a: uint8x8_t, b: uint16x8_t, c: uint16x8_t) -> uint8x16_t {
+    let x: uint8x8_t = vrsubhn_u16(b, c);
+    unsafe { simd_shuffle!(a, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) }
+}
+#[doc = "Rounding subtract returning high narrow"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_high_u32)"]
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg(target_endian = "big")]
+#[cfg_attr(test, assert_instr(rsubhn))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub fn vrsubhn_high_u32(a: uint16x4_t, b: uint32x4_t, c: uint32x4_t) -> uint16x8_t {
+    let x: uint16x4_t = vrsubhn_u32(b, c);
+    unsafe { simd_shuffle!(a, x, [0, 1, 2, 3, 4, 5, 6, 7]) }
+}
+#[doc = "Rounding subtract returning high narrow"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsubhn_high_u64)"]
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg(target_endian = "big")]
+#[cfg_attr(test, assert_instr(rsubhn))]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+pub fn vrsubhn_high_u64(a: uint32x2_t, b: uint64x2_t, c: uint64x2_t) -> uint32x4_t {
+    let x: uint32x2_t = vrsubhn_u64(b, c);
+    unsafe { simd_shuffle!(a, x, [0, 1, 2, 3]) }
+}
 #[doc = "Insert vector element from another vector element"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vset_lane_f64)"]
 #[inline]

diff --git a/crates/core_arch/src/arm_shared/neon/table_lookup_tests.rs b/crates/core_arch/src/arm_shared/neon/table_lookup_tests.rs
@@ -19,6 +19,7 @@ macro_rules! test_vtbl {
      - table[$table_t:ident]: [$($table_v:expr),*] |
      $(- ctrl[$ctrl_t:ident]: [$($ctrl_v:expr),*] => [$($exp_v:expr),*])|*
     ) => {
+        #[cfg(target_endian = "little")]
         #[simd_test(enable = "neon")]
         unsafe fn $test_name() {
             // create table as array, and transmute it to
@@ -168,6 +169,7 @@ macro_rules! test_vtbx {
      - ext[$ext_t:ident]: [$($ext_v:expr),*] |
      $(- ctrl[$ctrl_t:ident]: [$($ctrl_v:expr),*] => [$($exp_v:expr),*])|*
     ) => {
+        #[cfg(target_endian = "little")]
         #[simd_test(enable = "neon")]
         unsafe fn $test_name() {
             // create table as array, and transmute it to

diff --git a/crates/intrinsic-test/src/main.rs b/crates/intrinsic-test/src/main.rs
@@ -594,44 +594,39 @@ fn compare_outputs(
                 ))
                 .output();
 
-            let rust = if target != "aarch64_be-unknown-linux-gnu" {
-                Command::new("sh")
-                    .current_dir("rust_programs")
-                    .arg("-c")
-                    .arg(format!(
+            let rust = Command::new("sh")
+                .current_dir("rust_programs")
+                .arg("-c")
+                .arg(format!(
                         "cargo {toolchain} run --target {target} --bin {intrinsic} --release",
                         intrinsic = intrinsic.name,
                         toolchain = toolchain,
                         target = target
-                    ))
-                    .env("RUSTFLAGS", "-Cdebuginfo=0")
-                    .output()
-            } else {
-                Command::new("sh")
-                    .arg("-c")
-                    .arg(format!(
-                        "{runner} ./rust_programs/target/{target}/release/{intrinsic}",
-                        runner = runner,
-                        target = target,
-                        intrinsic = intrinsic.name,
-                    ))
-                    .output()
-            };
+                ))
+                .env("RUSTFLAGS", "-Cdebuginfo=0")
+                .output();
 
             let (c, rust) = match (c, rust) {
                 (Ok(c), Ok(rust)) => (c, rust),
                 a => panic!("{a:#?}"),
             };
 
             if !c.status.success() {
-                error!("Failed to run C program for intrinsic {}", intrinsic.name);
+                error!(
+                    "Failed to run C program for intrinsic {intrinsic}\nstdout: {stdout}\nstderr: {stderr}",
+                    intrinsic = intrinsic.name,
+                    stdout = std::str::from_utf8(&c.stdout).unwrap_or(""),
+                    stderr = std::str::from_utf8(&c.stderr).unwrap_or(""),
+                );
                 return Some(FailureReason::RunC(intrinsic.name.clone()));
             }
 
             if !rust.status.success() {
                 error!(
-                    "Failed to run rust program for intrinsic {}",
-                    intrinsic.name
+                    "Failed to run Rust program for intrinsic {intrinsic}\nstdout: {stdout}\nstderr: {stderr}",
+                    intrinsic = intrinsic.name,
+                    stdout = std::str::from_utf8(&rust.stdout).unwrap_or(""),
+                    stderr = std::str::from_utf8(&rust.stderr).unwrap_or(""),
                 );
                 return Some(FailureReason::RunRust(intrinsic.name.clone()));
             }

diff --git a/crates/simd-test-macro/src/lib.rs b/crates/simd-test-macro/src/lib.rs
@@ -58,7 +58,7 @@ pub fn simd_test(
     {
         "i686" | "x86_64" | "i586" => "is_x86_feature_detected",
         "arm" | "armv7" => "is_arm_feature_detected",
-        "aarch64" | "arm64ec" => "is_aarch64_feature_detected",
+        "aarch64" | "arm64ec" | "aarch64_be" => "is_aarch64_feature_detected",
         maybe_riscv if maybe_riscv.starts_with("riscv") => "is_riscv_feature_detected",
         "powerpc" | "powerpcle" => "is_powerpc_feature_detected",
         "powerpc64" | "powerpc64le" => "is_powerpc64_feature_detected",

diff --git a/crates/std_detect/src/detect/os/linux/aarch64.rs b/crates/std_detect/src/detect/os/linux/aarch64.rs
@@ -399,6 +399,7 @@ impl AtHwcap {
     }
 }
 
+#[cfg(target_endian = "little")]
 #[cfg(test)]
 mod tests {
     use super::*;

diff --git a/crates/std_detect/src/detect/os/linux/auxvec.rs b/crates/std_detect/src/detect/os/linux/auxvec.rs
@@ -290,6 +290,7 @@ mod tests {
                 assert_eq!(v.hwcap2, 0);
             }
         } else if #[cfg(target_arch = "aarch64")] {
+            #[cfg(target_endian = "little")]
             #[test]
             fn linux_artificial_aarch64() {
                 let file = concat!(env!("CARGO_MANIFEST_DIR"), "/src/detect/test_data/linux-artificial-aarch64.auxv");
@@ -298,6 +299,7 @@ mod tests {
                 assert_eq!(v.hwcap, 0x0123456789abcdef);
                 assert_eq!(v.hwcap2, 0x02468ace13579bdf);
             }
+            #[cfg(target_endian = "little")]
             #[test]
             fn linux_no_hwcap2_aarch64() {
                 let file = concat!(env!("CARGO_MANIFEST_DIR"), "/src/detect/test_data/linux-no-hwcap2-aarch64.auxv");

diff --git a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml
@@ -59,6 +59,14 @@ neon-unstable-f16: &neon-unstable-f16
 neon-unstable-feat-lut: &neon-unstable-feat-lut
   FnCall: [unstable, ['feature = "stdarch_neon_feat_lut"', 'issue = "138050"']]
 
+# #[cfg(target_endian = "little")]
+little-endian: &little-endian
+  FnCall: [cfg, ['target_endian = "little"']]
+
+# #[cfg(target_endian = "big")]
+big-endian: &big-endian
+  FnCall: [cfg, ['target_endian = "big"']]
+
 intrinsics:
   - name: "vaddd_{type}"
     doc: Add
@@ -8906,6 +8914,7 @@ intrinsics:
     arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[1]}"]
     return_type: "{neon_type[3]}"
     attr:
+      - *little-endian
       - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [rsubhn2]]}]]
       - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']]
     safety: safe
@@ -8923,6 +8932,29 @@ intrinsics:
           - FnCall: ["vrsubhn_{neon_type[1]}", [b, c]]
       - FnCall: [simd_shuffle!, [a, x, "{type[4]}"]]
 
+  - name: "vrsubhn_high_{neon_type[1]}"
+    doc: "Rounding subtract returning high narrow"
+    arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}", "c: {neon_type[1]}"]
+    return_type: "{neon_type[3]}"
+    attr:
+      - *big-endian
+      - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [rsubhn]]}]]
+      - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']]
+    safety: safe
+    types:
+      - [int8x8_t, int16x8_t, int16x8_t, int8x16_t, '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]']
+      - [int16x4_t, int32x4_t, int32x4_t, int16x8_t, '[0, 1, 2, 3, 4, 5, 6, 7]']
+      - [int32x2_t, int64x2_t, int64x2_t, int32x4_t, '[0, 1, 2, 3]']
+      - [uint8x8_t, uint16x8_t, uint16x8_t, uint8x16_t, '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]']
+      - [uint16x4_t, uint32x4_t, uint32x4_t, uint16x8_t, '[0, 1, 2, 3, 4, 5, 6, 7]']
+      - [uint32x2_t, uint64x2_t, uint64x2_t, uint32x4_t, '[0, 1, 2, 3]']
+    compose:
+      - Let:
+          - x
+          - "{neon_type[0]}"
+          - FnCall: ["vrsubhn_{neon_type[1]}", [b, c]]
+      - FnCall: [simd_shuffle!, [a, x, "{type[4]}"]]
+
   - name: "vcopy{neon_type[0].lane_nox}"
     doc: "Insert vector element from another vector element"
     arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"]

diff --git a/crates/stdarch-test/src/lib.rs b/crates/stdarch-test/src/lib.rs
@@ -164,7 +164,15 @@ pub fn assert(shim_addr: usize, fnname: &str, expected: &str) {
                 // Original limit was 20 instructions, but ARM DSP Intrinsics
                 // are exactly 20 instructions long. So, bump the limit to 22
                 // instead of adding here a long list of exceptions.
-                _ => 22,
+                _ => {
+                    // aarch64_be may add reverse instructions, which increases
+                    // the number of instructions generated.
+                    if cfg!(all(target_endian = "big", target_arch = "aarch64")) {
+                        32
+                    } else {
+                        22
+                    }
+                }
             },
             |v| v.parse().unwrap(),
         );