Skip to content

Aarch64 performance fixes #33

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
May 2, 2021
14 changes: 13 additions & 1 deletion src/implementation/aarch64/neon.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,19 @@ impl SimdU8Value {
/// Reads `size_of::<uint8x16_t>()` bytes starting at `ptr` into a `uint8x16_t`
/// and wraps the result with `Self::from`.
///
/// # Safety
///
/// `ptr` is passed directly to `core::ptr::copy_nonoverlapping` as the source,
/// so it must be valid for reads of `size_of::<uint8x16_t>()` bytes.
#[inline]
#[allow(clippy::cast_ptr_alignment)]
unsafe fn load_from(ptr: *const u8) -> Self {
// NOTE(review): this span is a rendered diff hunk; the next line appears to be
// the pre-change body that the workaround below replaces — confirm against the
// merged file.
Self::from(vld1q_u8(ptr))
// WORKAROUND:
// The vld1q_u8 intrinsic is currently broken: it is treated as individual
// byte loads, so the compiler sometimes decides it is better to load
// individual bytes to "optimize" a subsequent SIMD shuffle.
//
// This code forces a full 128-bit load.
let mut dst = core::mem::MaybeUninit::<uint8x16_t>::uninit();
// Copy the raw bytes into the uninitialized vector through `u8` pointers.
core::ptr::copy_nonoverlapping(
ptr as *const u8,
dst.as_mut_ptr() as *mut u8,
core::mem::size_of::<uint8x16_t>(),
);
// SAFETY: the copy above wrote all `size_of::<uint8x16_t>()` bytes of `dst`,
// so the value is fully initialized.
Self::from(dst.assume_init())
}

#[inline]
Expand Down
15 changes: 13 additions & 2 deletions src/implementation/algorithm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -192,9 +192,20 @@ macro_rules! algorithm_simd {

/// Passes the chunks in `input.vals` to `self.check_bytes` in index order,
/// using a hand-unrolled sequence for the supported lengths 2 and 4.
///
/// # Panics
///
/// Panics with "Unsupported number of chunks" when `input.vals.len()` is
/// neither 2 nor 4.
#[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))]
#[inline]
#[allow(unconditional_panic)] // does not panic because len is checked
#[allow(const_err)] // the same, but for Rust 1.38.0
unsafe fn check_block(&mut self, input: SimdInput) {
// NOTE(review): this span is a rendered diff hunk; the next two lines appear to
// be the removed loop that the unrolled branches below replace — confirm
// against the merged file.
for i in 0..input.vals.len() {
self.check_bytes(input.vals[i]);
// necessary because a for loop is not unrolled on ARM64
if input.vals.len() == 2 {
self.check_bytes(input.vals[0]);
self.check_bytes(input.vals[1]);
} else if input.vals.len() == 4 {
self.check_bytes(input.vals[0]);
self.check_bytes(input.vals[1]);
self.check_bytes(input.vals[2]);
self.check_bytes(input.vals[3]);
} else {
// Any other chunk count is rejected outright.
panic!("Unsupported number of chunks");
}
}
}
Expand Down