Skip to content

Commit 370fd8c

Browse files
authored
Rollup merge of #68787 - amosonn:patch-1, r=nagisa
Optimize core::ptr::align_offset (part 1) — r? @nagisa — see #68616 for the main discussion.
2 parents 51c6c25 + 22b263a commit 370fd8c

File tree

1 file changed

+22
-16
lines changed

1 file changed

+22
-16
lines changed

src/libcore/ptr/mod.rs

+22-16
Original file line number | Diff line number | Diff line change
@@ -1081,9 +1081,8 @@ pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
10811081
// uses e.g., subtraction `mod n`. It is entirely fine to do them `mod
10821082
// usize::max_value()` instead, because we take the result `mod n` at the end
10831083
// anyway.
1084-
inverse = inverse.wrapping_mul(2usize.wrapping_sub(x.wrapping_mul(inverse)))
1085-
& (going_mod - 1);
1086-
if going_mod > m {
1084+
inverse = inverse.wrapping_mul(2usize.wrapping_sub(x.wrapping_mul(inverse)));
1085+
if going_mod >= m {
10871086
return inverse & (m - 1);
10881087
}
10891088
going_mod = going_mod.wrapping_mul(going_mod);
@@ -1115,26 +1114,33 @@ pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
11151114
let gcdpow = intrinsics::cttz_nonzero(stride).min(intrinsics::cttz_nonzero(a));
11161115
let gcd = 1usize << gcdpow;
11171116

1118-
if p as usize & (gcd - 1) == 0 {
1117+
if p as usize & (gcd.wrapping_sub(1)) == 0 {
11191118
// This branch solves for the following linear congruence equation:
11201119
//
1121-
// $$ p + so ≡ 0 mod a $$
1120+
// ` p + so = 0 mod a `
11221121
//
1123-
// $p$ here is the pointer value, $s$ – stride of `T`, $o$ offset in `T`s, and $a$ – the
1122+
// `p` here is the pointer value, `s` - stride of `T`, `o` offset in `T`s, and `a` - the
11241123
// requested alignment.
11251124
//
1126-
// g = gcd(a, s)
1127-
// o = (a - (p mod a))/g * ((s/g)⁻¹ mod a)
1125+
// With `g = gcd(a, s)`, and the above asserting that `p` is also divisible by `g`, we can
1126+
// denote `a' = a/g`, `s' = s/g`, `p' = p/g`, then this becomes equivalent to:
11281127
//
1129-
// The first term is “the relative alignment of p to a”, the second term is “how does
1130-
// incrementing p by s bytes change the relative alignment of p”. Division by `g` is
1131-
// necessary to make this equation well formed if $a$ and $s$ are not co-prime.
1128+
// ` p' + s'o = 0 mod a' `
1129+
// ` o = (a' - (p' mod a')) * (s'^-1 mod a') `
11321130
//
1133-
// Furthermore, the result produced by this solution is not “minimal”, so it is necessary
1134-
// to take the result $o mod lcm(s, a)$. We can replace $lcm(s, a)$ with just a $a / g$.
1135-
let j = a.wrapping_sub(pmoda) >> gcdpow;
1136-
let k = smoda >> gcdpow;
1137-
return intrinsics::unchecked_rem(j.wrapping_mul(mod_inv(k, a)), a >> gcdpow);
1131+
// The first term is "the relative alignment of `p` to `a`" (divided by the `g`), the second
1132+
// term is "how does incrementing `p` by `s` bytes change the relative alignment of `p`" (again
1133+
// divided by `g`).
1134+
// Division by `g` is necessary to make the inverse well formed if `a` and `s` are not
1135+
// co-prime.
1136+
//
1137+
// Furthermore, the result produced by this solution is not "minimal", so it is necessary
1138+
// to take the result `o mod lcm(s, a)`. We can replace `lcm(s, a)` with just a `a'`.
1139+
let a2 = a >> gcdpow;
1140+
let a2minus1 = a2.wrapping_sub(1);
1141+
let s2 = smoda >> gcdpow;
1142+
let minusp2 = a2.wrapping_sub(pmoda >> gcdpow);
1143+
return (minusp2.wrapping_mul(mod_inv(s2, a2))) & a2minus1;
11381144
}
11391145

11401146
// Cannot be aligned at all.

0 commit comments

Comments
 (0)