Skip to content

Commit 1c50c7a

Browse files
authored
Merge pull request #532 from jyn514/lse.rs
2 parents 049f73e + cac12e8 commit 1c50c7a

File tree

5 files changed

+436
-4
lines changed

5 files changed

+436
-4
lines changed

build.rs

+60-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use std::env;
1+
use std::{collections::HashMap, env, sync::atomic::Ordering};
22

33
fn main() {
44
println!("cargo:rerun-if-changed=build.rs");
@@ -90,6 +90,65 @@ fn main() {
9090
{
9191
println!("cargo:rustc-cfg=kernel_user_helpers")
9292
}
93+
94+
if llvm_target[0] == "aarch64" {
95+
generate_aarch64_outlined_atomics();
96+
}
97+
}
98+
99+
/// Map an atomic `Ordering` to the suffix used in outlined-atomics symbol names
/// (e.g. `__aarch64_cas4_acq_rel`).
fn aarch64_symbol(ordering: Ordering) -> &'static str {
    use std::sync::atomic::Ordering::*;
    match ordering {
        Relaxed => "relax",
        Acquire => "acq",
        Release => "rel",
        AcqRel => "acq_rel",
        other => panic!("unknown symbol for {:?}", other),
    }
}
108+
109+
/// The `concat_idents` macro is extremely annoying and doesn't allow us to define new items.
110+
/// Define them from the build script instead.
111+
/// Note that the majority of the code is still defined in `aarch64.rs` through inline macros.
112+
fn generate_aarch64_outlined_atomics() {
113+
use std::fmt::Write;
114+
// #[macro_export] so that we can use this in tests
115+
let gen_macro =
116+
|name| format!("#[macro_export] macro_rules! foreach_{name} {{ ($macro:path) => {{\n");
117+
118+
// Generate different macros for add/clr/eor/set so that we can test them separately.
119+
let sym_names = ["cas", "ldadd", "ldclr", "ldeor", "ldset", "swp"];
120+
let mut macros = HashMap::new();
121+
for sym in sym_names {
122+
macros.insert(sym, gen_macro(sym));
123+
}
124+
125+
// Only CAS supports 16 bytes, and it has a different implementation that uses a different macro.
126+
let mut cas16 = gen_macro("cas16");
127+
128+
for ordering in [
129+
Ordering::Relaxed,
130+
Ordering::Acquire,
131+
Ordering::Release,
132+
Ordering::AcqRel,
133+
] {
134+
let sym_ordering = aarch64_symbol(ordering);
135+
for size in [1, 2, 4, 8] {
136+
for (sym, macro_) in &mut macros {
137+
let name = format!("__aarch64_{sym}{size}_{sym_ordering}");
138+
writeln!(macro_, "$macro!( {ordering:?}, {size}, {name} );").unwrap();
139+
}
140+
}
141+
let name = format!("__aarch64_cas16_{sym_ordering}");
142+
writeln!(cas16, "$macro!( {ordering:?}, {name} );").unwrap();
143+
}
144+
145+
let mut buf = String::new();
146+
for macro_def in macros.values().chain(std::iter::once(&cas16)) {
147+
buf += macro_def;
148+
buf += "}; }";
149+
}
150+
let dst = std::env::var("OUT_DIR").unwrap() + "/outlined_atomics.rs";
151+
std::fs::write(dst, buf).unwrap();
93152
}
94153

95154
#[cfg(feature = "c")]

src/aarch64.rs

+277
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,277 @@
1+
//! Aarch64 targets have two possible implementations for atomics:
2+
//! 1. Load-Linked/Store-Conditional (LL/SC), older and slower.
3+
//! 2. Large System Extensions (LSE), newer and faster.
4+
//! To avoid breaking backwards compat, C toolchains introduced a concept of "outlined atomics",
5+
//! where atomic operations call into the compiler runtime to dispatch between the two depending on
6+
//! which is supported on the current CPU.
7+
//! See https://community.arm.com/arm-community-blogs/b/tools-software-ides-blog/posts/making-the-most-of-the-arm-architecture-in-gcc-10#:~:text=out%20of%20line%20atomics for more discussion.
8+
//!
9+
//! Currently we only support LL/SC, because LSE requires `getauxval` from libc in order to do runtime detection.
10+
//! Use the `compiler-rt` intrinsics if you want LSE support.
11+
//!
12+
//! Ported from `aarch64/lse.S` in LLVM's compiler-rt.
13+
//!
14+
//! Generate functions for each of the following symbols:
15+
//! __aarch64_casM_ORDER
16+
//! __aarch64_swpN_ORDER
17+
//! __aarch64_ldaddN_ORDER
18+
//! __aarch64_ldclrN_ORDER
19+
//! __aarch64_ldeorN_ORDER
20+
//! __aarch64_ldsetN_ORDER
21+
//! for N = {1, 2, 4, 8}, M = {1, 2, 4, 8, 16}, ORDER = { relax, acq, rel, acq_rel }
22+
//!
23+
//! The original `lse.S` has some truly horrifying code that expects to be compiled multiple times with different constants.
24+
//! We do something similar, but with macro arguments.
25+
#![cfg_attr(feature = "c", allow(unused_macros))] // avoid putting the macros into a submodule
26+
27+
// We don't do runtime dispatch so we don't have to worry about the `__aarch64_have_lse_atomics` global ctor.
28+
29+
/// Translate a byte size to a Rust type.
///
/// Used to spell the argument and return types of the generated intrinsics.
#[rustfmt::skip]
macro_rules! int_ty {
    (1) => { i8 };
    (2) => { i16 };
    (4) => { i32 };
    (8) => { i64 };
    (16) => { i128 };
}
38+
39+
/// Given a byte size and a register number, return a register of the appropriate size.
///
/// Accesses of 4 bytes and below use the 32-bit `w` view of the register;
/// 8-byte accesses use the full 64-bit `x` view.
///
/// See <https://developer.arm.com/documentation/102374/0101/Registers-in-AArch64---general-purpose-registers>.
#[rustfmt::skip]
macro_rules! reg {
    (1, $num:literal) => { concat!("w", $num) };
    (2, $num:literal) => { concat!("w", $num) };
    (4, $num:literal) => { concat!("w", $num) };
    (8, $num:literal) => { concat!("x", $num) };
}
49+
50+
/// Given an atomic ordering, translate it to the acquire suffix for the ldxr aarch64 ASM instruction.
///
/// Acquire and AcqRel orderings need an acquiring load (`lda…`); Relaxed and
/// Release do not constrain the load side.
#[rustfmt::skip]
macro_rules! acquire {
    (Relaxed) => { "" };
    (Acquire) => { "a" };
    (Release) => { "" };
    (AcqRel) => { "a" };
}
58+
59+
/// Given an atomic ordering, translate it to the release suffix for the stxr aarch64 ASM instruction.
///
/// Release and AcqRel orderings need a releasing store (`stl…`); Relaxed and
/// Acquire do not constrain the store side.
#[rustfmt::skip]
macro_rules! release {
    (Relaxed) => { "" };
    (Acquire) => { "" };
    (Release) => { "l" };
    (AcqRel) => { "l" };
}
67+
68+
/// Given a size in bytes, translate it to the byte suffix for an aarch64 ASM instruction.
///
/// 4- and 8-byte accesses carry no suffix: they are distinguished by the
/// register view (`w` vs `x`, see `reg!`) instead.
#[rustfmt::skip]
macro_rules! size {
    (1) => { "b" };
    (2) => { "h" };
    (4) => { "" };
    (8) => { "" };
    (16) => { "" };
}
77+
78+
/// Given a byte size, translate it to an Unsigned eXTend instruction
/// with the correct semantics.
///
/// Only 1- and 2-byte values need an explicit zero-extend; the catch-all arm
/// falls back to a plain `mov` for the wider, register-sized operands.
///
/// See <https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/UXTB--Unsigned-Extend-Byte--an-alias-of-UBFM->
#[rustfmt::skip]
macro_rules! uxt {
    (1) => { "uxtb" };
    (2) => { "uxth" };
    ($_:tt) => { "mov" };
}
88+
89+
/// Given an atomic ordering and byte size, translate it to a LoaD eXclusive Register instruction
/// with the correct semantics.
///
/// See <https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/LDXR--Load-Exclusive-Register->.
macro_rules! ldxr {
    ($ordering:ident, $bytes:tt) => {
        // e.g. (Acquire, 1) expands to "ldaxrb": acquiring, byte-sized exclusive load.
        concat!("ld", acquire!($ordering), "xr", size!($bytes))
    };
}
98+
99+
/// Given an atomic ordering and byte size, translate it to a STore eXclusive Register instruction
/// with the correct semantics.
///
/// See <https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/STXR--Store-Exclusive-Register->.
macro_rules! stxr {
    ($ordering:ident, $bytes:tt) => {
        // e.g. (Release, 2) expands to "stlxrh": releasing, halfword-sized exclusive store.
        concat!("st", release!($ordering), "xr", size!($bytes))
    };
}
108+
109+
/// Given an atomic ordering, translate it to a LoaD eXclusive Pair of registers instruction
/// with the correct semantics.
///
/// Only used by the 16-byte CAS, so there is no size suffix to pick.
///
/// See <https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/LDXP--Load-Exclusive-Pair-of-Registers->
macro_rules! ldxp {
    ($ordering:ident) => {
        concat!("ld", acquire!($ordering), "xp")
    };
}
118+
119+
/// Given an atomic ordering, translate it to a STore eXclusive Pair of registers instruction
/// with the correct semantics.
///
/// Only used by the 16-byte CAS, so there is no size suffix to pick.
///
/// See <https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/STXP--Store-Exclusive-Pair-of-registers->.
macro_rules! stxp {
    ($ordering:ident) => {
        concat!("st", release!($ordering), "xp")
    };
}
128+
129+
/// See <https://doc.rust-lang.org/stable/std/sync/atomic/struct.AtomicI8.html#method.compare_and_swap>.
///
/// LL/SC loop: zero-extend `expected`, load the current value exclusively,
/// bail out if it differs, otherwise try to store `desired` and retry while
/// the exclusive store fails. The observed old value is left in register 0
/// (the return register) on both paths.
///
/// Scratch registers are x16/x17 (IP0/IP1).
/// NOTE(review): assumes the outlined-atomics ABI permits clobbering only
/// these scratch registers — confirm against LLVM's `aarch64/lse.S`.
macro_rules! compare_and_swap {
    ($ordering:ident, $bytes:tt, $name:ident) => {
        intrinsics! {
            #[maybe_use_optimized_c_shim]
            #[naked]
            pub unsafe extern "C" fn $name (
                expected: int_ty!($bytes), desired: int_ty!($bytes), ptr: *mut int_ty!($bytes)
            ) -> int_ty!($bytes) {
                // We can't use `AtomicI8::compare_and_swap`; we *are* compare_and_swap.
                unsafe { core::arch::asm! {
                    // UXT s(tmp0), s(0)  -- stash (zero-extended) `expected` in the scratch reg.
                    concat!(uxt!($bytes), " ", reg!($bytes, 16), ", ", reg!($bytes, 0)),
                    "0:",
                    // LDXR s(0), [x2]   -- exclusive-load the current value; also the return value.
                    concat!(ldxr!($ordering, $bytes), " ", reg!($bytes, 0), ", [x2]"),
                    // cmp s(0), s(tmp0)
                    concat!("cmp ", reg!($bytes, 0), ", ", reg!($bytes, 16)),
                    "bne 1f",
                    // STXR w(tmp1), s(1), [x2]  -- try to store `desired`.
                    concat!(stxr!($ordering, $bytes), " w17, ", reg!($bytes, 1), ", [x2]"),
                    // Nonzero status means the exclusive store failed; retry.
                    "cbnz w17, 0b",
                    "1:",
                    "ret",
                    options(noreturn)
                } }
            }
        }
    };
}
159+
160+
// i128 uses a completely different impl, so it has its own macro.
//
// 16-byte CAS via an exclusive load/store *pair*. The asm is consistent with
// AAPCS64 i128 passing: `expected` in x0:x1, `desired` in x2:x3, `ptr` in x4
// (NOTE(review): confirm the register assignment against lse.S).
macro_rules! compare_and_swap_i128 {
    ($ordering:ident, $name:ident) => {
        intrinsics! {
            #[maybe_use_optimized_c_shim]
            #[naked]
            pub unsafe extern "C" fn $name (
                expected: i128, desired: i128, ptr: *mut i128
            ) -> i128 {
                unsafe { core::arch::asm! {
                    // Stash `expected` so x0:x1 can hold the loaded (return) value.
                    "mov x16, x0",
                    "mov x17, x1",
                    "0:",
                    // LDXP x0, x1, [x4]
                    concat!(ldxp!($ordering), " x0, x1, [x4]"),
                    // Compare both halves against the stashed `expected`.
                    "cmp x0, x16",
                    "ccmp x1, x17, #0, eq",
                    "bne 1f",
                    // STXP w(tmp2), x2, x3, [x4]  -- try to store `desired`.
                    concat!(stxp!($ordering), " w15, x2, x3, [x4]"),
                    // Nonzero status means the exclusive store failed; retry.
                    "cbnz w15, 0b",
                    "1:",
                    "ret",
                    options(noreturn)
                } }
            }
        }
    };
}
189+
190+
/// See <https://doc.rust-lang.org/stable/std/sync/atomic/struct.AtomicI8.html#method.swap>.
///
/// LL/SC loop: stash the new value, exclusive-load the old one into the return
/// register, then retry the exclusive store of the new value until it succeeds.
macro_rules! swap {
    ($ordering:ident, $bytes:tt, $name:ident) => {
        intrinsics! {
            #[maybe_use_optimized_c_shim]
            #[naked]
            pub unsafe extern "C" fn $name (
                left: int_ty!($bytes), right_ptr: *mut int_ty!($bytes)
            ) -> int_ty!($bytes) {
                unsafe { core::arch::asm! {
                    // mov s(tmp0), s(0)  -- stash `left`; reg 0 will hold the return value.
                    concat!("mov ", reg!($bytes, 16), ", ", reg!($bytes, 0)),
                    "0:",
                    // LDXR s(0), [x1]
                    concat!(ldxr!($ordering, $bytes), " ", reg!($bytes, 0), ", [x1]"),
                    // STXR w(tmp1), s(tmp0), [x1]
                    concat!(stxr!($ordering, $bytes), " w17, ", reg!($bytes, 16), ", [x1]"),
                    // Nonzero status means the exclusive store failed; retry.
                    "cbnz w17, 0b",
                    "ret",
                    options(noreturn)
                } }
            }
        }
    };
}
215+
216+
/// See (e.g.) <https://doc.rust-lang.org/stable/std/sync/atomic/struct.AtomicI8.html#method.fetch_add>.
///
/// Shared LL/SC skeleton for all read-modify-write operations: `$op` is the
/// ALU instruction ("add"/"bic"/"eor"/"orr") applied between the loaded value
/// and the stashed operand. The pre-op value is returned in register 0.
macro_rules! fetch_op {
    ($ordering:ident, $bytes:tt, $name:ident, $op:literal) => {
        intrinsics! {
            #[maybe_use_optimized_c_shim]
            #[naked]
            pub unsafe extern "C" fn $name (
                val: int_ty!($bytes), ptr: *mut int_ty!($bytes)
            ) -> int_ty!($bytes) {
                unsafe { core::arch::asm! {
                    // mov s(tmp0), s(0)  -- stash `val`; reg 0 will hold the old (return) value.
                    concat!("mov ", reg!($bytes, 16), ", ", reg!($bytes, 0)),
                    "0:",
                    // LDXR s(0), [x1]
                    concat!(ldxr!($ordering, $bytes), " ", reg!($bytes, 0), ", [x1]"),
                    // OP s(tmp1), s(0), s(tmp0)  -- compute the new value.
                    concat!($op, " ", reg!($bytes, 17), ", ", reg!($bytes, 0), ", ", reg!($bytes, 16)),
                    // STXR w(tmp2), s(tmp1), [x1]
                    concat!(stxr!($ordering, $bytes), " w15, ", reg!($bytes, 17), ", [x1]"),
                    // Nonzero status means the exclusive store failed; retry.
                    "cbnz w15, 0b",
                    "ret",
                    options(noreturn)
                } }
            }
        }
    }
}
243+
244+
// We need a single macro to pass to `foreach_ldadd`.
macro_rules! add {
    ($ordering:ident, $bytes:tt, $name:ident) => {
        fetch_op! { $ordering, $bytes, $name, "add" }
    };
}
250+
251+
// Passed to `foreach_ldclr`: `ldclr` clears the bits set in the operand, so the
// underlying ALU op is `bic` (AND NOT) rather than a plain `and`.
macro_rules! and {
    ($ordering:ident, $bytes:tt, $name:ident) => {
        fetch_op! { $ordering, $bytes, $name, "bic" }
    };
}
256+
257+
// Passed to `foreach_ldeor`: exclusive-or.
macro_rules! xor {
    ($ordering:ident, $bytes:tt, $name:ident) => {
        fetch_op! { $ordering, $bytes, $name, "eor" }
    };
}
262+
263+
// Passed to `foreach_ldset`: inclusive-or.
macro_rules! or {
    ($ordering:ident, $bytes:tt, $name:ident) => {
        fetch_op! { $ordering, $bytes, $name, "orr" }
    };
}
268+
269+
// See `generate_aarch64_outlined_atomics` in build.rs.
//
// The generated file defines the `foreach_*` macros used below; each invokes
// the given macro once per (ordering, size, symbol-name) combination, which
// instantiates every `__aarch64_*` outlined-atomic entry point.
include!(concat!(env!("OUT_DIR"), "/outlined_atomics.rs"));
foreach_cas!(compare_and_swap);
foreach_cas16!(compare_and_swap_i128);
foreach_swp!(swap);
foreach_ldadd!(add);
foreach_ldclr!(and);
foreach_ldeor!(xor);
foreach_ldset!(or);

src/lib.rs

+3
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,9 @@ pub mod mem;
5757
#[cfg(target_arch = "arm")]
5858
pub mod arm;
5959

60+
#[cfg(all(target_arch = "aarch64", not(feature = "no-asm"),))]
61+
pub mod aarch64;
62+
6063
#[cfg(all(
6164
kernel_user_helpers,
6265
any(target_os = "linux", target_os = "android"),

src/macros.rs

+3-3
Original file line numberDiff line numberDiff line change
@@ -204,15 +204,15 @@ macro_rules! intrinsics {
204204
(
205205
#[maybe_use_optimized_c_shim]
206206
$(#[$($attr:tt)*])*
207-
pub extern $abi:tt fn $name:ident( $($argname:ident: $ty:ty),* ) $(-> $ret:ty)? {
207+
pub $(unsafe $(@ $empty:tt)? )? extern $abi:tt fn $name:ident( $($argname:ident: $ty:ty),* ) $(-> $ret:ty)? {
208208
$($body:tt)*
209209
}
210210

211211
$($rest:tt)*
212212
) => (
213213
#[cfg($name = "optimized-c")]
214214
#[cfg_attr(feature = "weak-intrinsics", linkage = "weak")]
215-
pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? {
215+
pub $(unsafe $($empty)? )? extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? {
216216
extern $abi {
217217
fn $name($($argname: $ty),*) $(-> $ret)?;
218218
}
@@ -224,7 +224,7 @@ macro_rules! intrinsics {
224224
#[cfg(not($name = "optimized-c"))]
225225
intrinsics! {
226226
$(#[$($attr)*])*
227-
pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? {
227+
pub $(unsafe $($empty)? )? extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? {
228228
$($body)*
229229
}
230230
}

0 commit comments

Comments
 (0)