Skip to content

Commit 2e59822

Browse files
authored
Fix aarch64 performance issues
- The fixed loop of four 128-bit chunks was not automatically unrolled. It is hand-unrolled now. This does not change the assembly output on x64. - The [vld1q_u8](https://doc.rust-lang.org/stable/core/arch/aarch64/fn.vld1q_u8.html) intrinsic is broken. The compiler thinks it can "optimize" loads by loading bytes individually if a SIMD shuffle instruction follows. According to [the ARM docs](https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics?search=vld1q_u8) it should be coded as one instruction. This had an effect on the code when the loop was manually unrolled. Workaround: see code. Issue filed: rust-lang/stdarch#1148
1 parent a65aac0 commit 2e59822

File tree

2 files changed

+26
-3
lines changed

2 files changed

+26
-3
lines changed

src/implementation/aarch64/neon.rs

+13-1
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,19 @@ impl SimdU8Value {
8585
#[inline]
8686
#[allow(clippy::cast_ptr_alignment)]
8787
unsafe fn load_from(ptr: *const u8) -> Self {
88-
Self::from(vld1q_u8(ptr))
88+
// WORKAROUND:
89+
// The vld1q_u8 intrinsic is currently broken, it treats it as individual
90+
// byte loads so the compiler sometimes decides it is a better to load
91+
// individual bytes to "optimize" a subsequent SIMD shuffle
92+
//
93+
// This code forces a full 128-bit load.
94+
let mut dst = core::mem::MaybeUninit::<uint8x16_t>::uninit();
95+
core::ptr::copy_nonoverlapping(
96+
ptr as *const u8,
97+
dst.as_mut_ptr() as *mut u8,
98+
core::mem::size_of::<uint8x16_t>(),
99+
);
100+
Self::from(dst.assume_init())
89101
}
90102

91103
#[inline]

src/implementation/algorithm.rs

+13-2
Original file line numberDiff line numberDiff line change
@@ -192,9 +192,20 @@ macro_rules! algorithm_simd {
192192

193193
#[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))]
194194
#[inline]
195+
#[allow(unconditional_panic)] // does not panic because len is checked
196+
#[allow(const_err)] // the same, but for Rust 1.38.0
195197
unsafe fn check_block(&mut self, input: SimdInput) {
196-
for i in 0..input.vals.len() {
197-
self.check_bytes(input.vals[i]);
198+
// necessary because a for loop is not unrolled on ARM64
199+
if input.vals.len() == 2 {
200+
self.check_bytes(input.vals[0]);
201+
self.check_bytes(input.vals[1]);
202+
} else if input.vals.len() == 4 {
203+
self.check_bytes(input.vals[0]);
204+
self.check_bytes(input.vals[1]);
205+
self.check_bytes(input.vals[2]);
206+
self.check_bytes(input.vals[3]);
207+
} else {
208+
panic!("Unsupported number of chunks");
198209
}
199210
}
200211
}

0 commit comments

Comments
 (0)