
Commit 86a3fe3

copy_misaligned_words: use inline asm on ARM, simplify fallback implementation

1 parent: 4df7a8d
2 files changed: +54 -52 lines

compiler-builtins/src/lib.rs (+1)
@@ -8,6 +8,7 @@
 #![feature(linkage)]
 #![feature(naked_functions)]
 #![feature(repr_simd)]
+#![feature(cfg_match)]
 #![cfg_attr(f16_enabled, feature(f16))]
 #![cfg_attr(f128_enabled, feature(f128))]
 #![no_builtins]
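
The new `#![feature(cfg_match)]` gate enables the unstable `core::cfg_match!` macro used in the impls.rs changes below. As a rough illustration only (not part of the commit, and assuming a nightly toolchain that accepts the bare-predicate arm syntax seen in this diff), it behaves like a compile-time `match` over cfg predicates: exactly one arm is compiled in. The strings below are placeholders.

#![feature(cfg_match)]

fn main() {
    // Exactly one arm survives cfg evaluation; `_` is the catch-all arm.
    let strategy: &str;
    core::cfg_match! {
        any(target_arch = "arm", target_arch = "aarch64", target_arch = "arm64ec") => {
            strategy = "word load via inline asm";
        }
        _ => {
            strategy = "bytewise fallback";
        }
    }
    println!("{strategy}");
}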

compiler-builtins/src/mem/impls.rs (+53 -52)
@@ -41,44 +41,44 @@ unsafe fn read_usize_unaligned(x: *const usize) -> usize {
     core::mem::transmute(x_read)
 }
 
-/// Loads a `T`-sized chunk from `src` into `dst` at offset `offset`, if that does not exceed
-/// `load_sz`. The offset pointers must both be `T`-aligned. Returns the new offset, advanced by the
-/// chunk size if a load happened.
-#[cfg(not(feature = "mem-unaligned"))]
 #[inline(always)]
-unsafe fn load_chunk_aligned<T: Copy>(
-    src: *const usize,
-    dst: *mut usize,
-    load_sz: usize,
-    offset: usize,
-) -> usize {
-    let chunk_sz = core::mem::size_of::<T>();
-    if (load_sz & chunk_sz) != 0 {
-        *dst.wrapping_byte_add(offset).cast::<T>() = *src.wrapping_byte_add(offset).cast::<T>();
-        offset | chunk_sz
-    } else {
-        offset
+unsafe fn copy_forward_bytes(mut dest: *mut u8, mut src: *const u8, n: usize) {
+    let dest_end = dest.wrapping_add(n);
+    while dest < dest_end {
+        *dest = *src;
+        dest = dest.wrapping_add(1);
+        src = src.wrapping_add(1);
     }
 }
 
-/// Load `load_sz` many bytes from `src`, which must be usize-aligned. Acts as if we did a `usize`
-/// read with the out-of-bounds part filled with 0s.
-/// `load_sz` be strictly less than `WORD_SIZE`.
+/// Load `load_sz` many bytes from `src`, which must be usize-aligned.
+/// `load_sz` be strictly less than `WORD_SIZE`. The remaining bytes are filled non-deterministically.
 #[cfg(not(feature = "mem-unaligned"))]
 #[inline(always)]
 unsafe fn load_aligned_partial(src: *const usize, load_sz: usize) -> usize {
     debug_assert!(load_sz < WORD_SIZE);
-    // We can read up to 7 bytes here, which is enough for WORD_SIZE of 8
-    // (since `load_sz < WORD_SIZE`).
-    const { assert!(WORD_SIZE <= 8) };
+    debug_assert!(src.addr() % WORD_SIZE == 0);
+
+    let mut out: usize;
+    core::cfg_match! {
+        // We don't need an x86 path here as `feature = "mem-unaligned"` is always set there.
+        all(not(miri), any(target_arch = "arm", target_arch = "aarch64", target_arch = "arm64ec")) => {
+            unsafe {
+                core::arch::asm!(
+                    "ldr {out}, [{src}]",
+                    src = in(reg) src,
+                    out = lateout(reg) out,
+                    options(nostack, readonly, preserves_flags),
+                );
+            }
+        }
+        _ => {
+            out = 0;
+            copy_forward_bytes(&raw mut out as *mut u8, src as *mut u8, load_sz);
+        }
+
+    }
 
-    let mut i = 0;
-    let mut out = 0usize;
-    // We load in decreasing order, so the pointers remain sufficiently aligned for the next step.
-    i = load_chunk_aligned::<u32>(src, &raw mut out, load_sz, i);
-    i = load_chunk_aligned::<u16>(src, &raw mut out, load_sz, i);
-    i = load_chunk_aligned::<u8>(src, &raw mut out, load_sz, i);
-    debug_assert!(i == load_sz);
     out
 }
 
@@ -90,35 +90,36 @@ unsafe fn load_aligned_partial(src: *const usize, load_sz: usize) -> usize {
 #[inline(always)]
 unsafe fn load_aligned_end_partial(src: *const usize, load_sz: usize) -> usize {
     debug_assert!(load_sz < WORD_SIZE);
-    // We can read up to 7 bytes here, which is enough for WORD_SIZE of 8
-    // (since `load_sz < WORD_SIZE`).
-    const { assert!(WORD_SIZE <= 8) };
+    debug_assert!(src.addr() % WORD_SIZE == 0);
+
+    let mut out: usize;
+    core::cfg_match! {
+        // We don't need an x86 path here as `feature = "mem-unaligned"` is always set there.
+        all(not(miri), any(target_arch = "arm", target_arch = "aarch64", target_arch = "arm64ec")) => {
+            unsafe {
+                core::arch::asm!(
+                    "ldr {out}, [{src}]",
+                    src = in(reg) src,
+                    out = lateout(reg) out,
+                    options(nostack, readonly, preserves_flags),
+                );
+            }
+        }
+        _ => {
+            out = 0;
+            // Obtain pointers pointing to the beginning of the range we want to load.
+            let src_shifted = src.wrapping_byte_add(WORD_SIZE - load_sz);
+            let out_shifted = (&raw mut out).wrapping_byte_add(WORD_SIZE - load_sz);
+            copy_forward_bytes(out_shifted as *mut u8, src_shifted as *mut u8, load_sz);
+        }
+
+    }
 
-    let mut i = 0;
-    let mut out = 0usize;
-    // Obtain pointers pointing to the beginning of the range we want to load.
-    let src_shifted = src.wrapping_byte_add(WORD_SIZE - load_sz);
-    let out_shifted = (&raw mut out).wrapping_byte_add(WORD_SIZE - load_sz);
-    // We load in increasing order, so by the time we reach `u16` things are 2-aligned etc.
-    i = load_chunk_aligned::<u8>(src_shifted, out_shifted, load_sz, i);
-    i = load_chunk_aligned::<u16>(src_shifted, out_shifted, load_sz, i);
-    i = load_chunk_aligned::<u32>(src_shifted, out_shifted, load_sz, i);
-    debug_assert!(i == load_sz);
     out
 }
 
 #[inline(always)]
 pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, mut n: usize) {
-    #[inline(always)]
-    unsafe fn copy_forward_bytes(mut dest: *mut u8, mut src: *const u8, n: usize) {
-        let dest_end = dest.wrapping_add(n);
-        while dest < dest_end {
-            *dest = *src;
-            dest = dest.wrapping_add(1);
-            src = src.wrapping_add(1);
-        }
-    }
-
     #[inline(always)]
     unsafe fn copy_forward_aligned_words(dest: *mut u8, src: *const u8, n: usize) {
         let mut dest_usize = dest as *mut usize;
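
For context on the commit title: the two partial loads exist so that a word-sized copy from a misaligned `src` can stay within bounds by reading the two aligned words the misaligned word straddles, then splicing them together with shifts. The sketch below only illustrates that splicing step; it is not the crate's `copy_misaligned_words` code, `splice_le` is a hypothetical helper, and a little-endian target is assumed.

const WORD_SIZE: usize = core::mem::size_of::<usize>();

// Combine the two aligned words that a misaligned word straddles.
// `offset` is the misalignment in bytes, with 0 < offset < WORD_SIZE.
fn splice_le(prev_word: usize, next_word: usize, offset: usize) -> usize {
    let shift = offset * 8;
    (prev_word >> shift) | (next_word << (usize::BITS as usize - shift))
}

fn main() {
    // Two consecutive aligned words holding the byte pattern 0, 1, 2, ...
    let mut bytes = [0u8; 2 * WORD_SIZE];
    for (i, b) in bytes.iter_mut().enumerate() {
        *b = i as u8;
    }
    let prev = usize::from_le_bytes(bytes[..WORD_SIZE].try_into().unwrap());
    let next = usize::from_le_bytes(bytes[WORD_SIZE..].try_into().unwrap());
    // The misaligned word starting at byte offset 3 reads back as bytes 3..3+WORD_SIZE.
    let expected = usize::from_le_bytes(bytes[3..3 + WORD_SIZE].try_into().unwrap());
    assert_eq!(splice_le(prev, next, 3), expected);
    println!("spliced: {:#x}", splice_le(prev, next, 3));
}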
