Skip to content

Commit 65f3f8b

Browse files
committed
Auto merge of #89611 - eduardosm:next_code_point, r=Mark-Simulacrum
libcore: assume the input of `next_code_point` and `next_code_point_reverse` is UTF-8-like

The functions are now `unsafe` and they use `Option::unwrap_unchecked` instead of `unwrap_or_0`.

`unwrap_or_0` was added in 42357d7. I guess `unwrap_unchecked` was not available back then.

Given this example:

```rust
pub fn first_char(s: &str) -> Option<char> {
    s.chars().next()
}
```

Previously, the following assembly was produced:

```asm
_ZN7example10first_char17ha056ddea6bafad1cE:
    .cfi_startproc
    test rsi, rsi
    je .LBB0_1
    movzx edx, byte ptr [rdi]
    test dl, dl
    js .LBB0_3
    mov eax, edx
    ret
.LBB0_1:
    mov eax, 1114112
    ret
.LBB0_3:
    lea r8, [rdi + rsi]
    xor eax, eax
    mov r9, r8
    cmp rsi, 1
    je .LBB0_5
    movzx eax, byte ptr [rdi + 1]
    add rdi, 2
    and eax, 63
    mov r9, rdi
.LBB0_5:
    mov ecx, edx
    and ecx, 31
    cmp dl, -33
    jbe .LBB0_6
    cmp r9, r8
    je .LBB0_9
    movzx esi, byte ptr [r9]
    add r9, 1
    and esi, 63
    shl eax, 6
    or eax, esi
    cmp dl, -16
    jb .LBB0_12
.LBB0_13:
    cmp r9, r8
    je .LBB0_14
    movzx edx, byte ptr [r9]
    and edx, 63
    jmp .LBB0_16
.LBB0_6:
    shl ecx, 6
    or eax, ecx
    ret
.LBB0_9:
    xor esi, esi
    mov r9, r8
    shl eax, 6
    or eax, esi
    cmp dl, -16
    jae .LBB0_13
.LBB0_12:
    shl ecx, 12
    or eax, ecx
    ret
.LBB0_14:
    xor edx, edx
.LBB0_16:
    and ecx, 7
    shl ecx, 18
    shl eax, 6
    or eax, ecx
    or eax, edx
    ret
```

After this change, the assembly is reduced to:

```asm
_ZN7example10first_char17h4318683472f884ccE:
    .cfi_startproc
    test rsi, rsi
    je .LBB0_1
    movzx ecx, byte ptr [rdi]
    test cl, cl
    js .LBB0_3
    mov eax, ecx
    ret
.LBB0_1:
    mov eax, 1114112
    ret
.LBB0_3:
    mov eax, ecx
    and eax, 31
    movzx esi, byte ptr [rdi + 1]
    and esi, 63
    cmp cl, -33
    jbe .LBB0_4
    movzx edx, byte ptr [rdi + 2]
    shl esi, 6
    and edx, 63
    or edx, esi
    cmp cl, -16
    jb .LBB0_7
    movzx ecx, byte ptr [rdi + 3]
    and eax, 7
    shl eax, 18
    shl edx, 6
    and ecx, 63
    or ecx, edx
    or eax, ecx
    ret
.LBB0_4:
    shl eax, 6
    or eax, esi
    ret
.LBB0_7:
    shl eax, 12
    or eax, edx
    ret
```
2 parents 49d4232 + 23637e2 commit 65f3f8b

File tree

3 files changed

+36
-25
lines changed

3 files changed

+36
-25
lines changed

library/core/src/str/iter.rs

+6-8
Original file line numberDiff line numberDiff line change
@@ -39,10 +39,9 @@ impl<'a> Iterator for Chars<'a> {
3939

4040
#[inline]
4141
fn next(&mut self) -> Option<char> {
42-
next_code_point(&mut self.iter).map(|ch| {
43-
// SAFETY: `str` invariant says `ch` is a valid Unicode Scalar Value.
44-
unsafe { char::from_u32_unchecked(ch) }
45-
})
42+
// SAFETY: `str` invariant says `self.iter` is a valid UTF-8 string and
43+
// the resulting `ch` is a valid Unicode Scalar Value.
44+
unsafe { next_code_point(&mut self.iter).map(|ch| char::from_u32_unchecked(ch)) }
4645
}
4746

4847
#[inline]
@@ -81,10 +80,9 @@ impl fmt::Debug for Chars<'_> {
8180
impl<'a> DoubleEndedIterator for Chars<'a> {
8281
#[inline]
8382
fn next_back(&mut self) -> Option<char> {
84-
next_code_point_reverse(&mut self.iter).map(|ch| {
85-
// SAFETY: `str` invariant says `ch` is a valid Unicode Scalar Value.
86-
unsafe { char::from_u32_unchecked(ch) }
87-
})
83+
// SAFETY: `str` invariant says `self.iter` is a valid UTF-8 string and
84+
// the resulting `ch` is a valid Unicode Scalar Value.
85+
unsafe { next_code_point_reverse(&mut self.iter).map(|ch| char::from_u32_unchecked(ch)) }
8886
}
8987
}
9088

library/core/src/str/validations.rs

+28-16
Original file line numberDiff line numberDiff line change
@@ -25,19 +25,15 @@ pub(super) const fn utf8_is_cont_byte(byte: u8) -> bool {
2525
(byte as i8) < -64
2626
}
2727

28-
#[inline]
29-
const fn unwrap_or_0(opt: Option<&u8>) -> u8 {
30-
match opt {
31-
Some(&byte) => byte,
32-
None => 0,
33-
}
34-
}
35-
3628
/// Reads the next code point out of a byte iterator (assuming a
3729
/// UTF-8-like encoding).
30+
///
31+
/// # Safety
32+
///
33+
/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string.
3834
#[unstable(feature = "str_internals", issue = "none")]
3935
#[inline]
40-
pub fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> {
36+
pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> {
4137
// Decode UTF-8
4238
let x = *bytes.next()?;
4339
if x < 128 {
@@ -48,18 +44,24 @@ pub fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<
4844
// Decode from a byte combination out of: [[[x y] z] w]
4945
// NOTE: Performance is sensitive to the exact formulation here
5046
let init = utf8_first_byte(x, 2);
51-
let y = unwrap_or_0(bytes.next());
47+
// SAFETY: `bytes` produces a UTF-8-like string,
48+
// so the iterator must produce a value here.
49+
let y = unsafe { *bytes.next().unwrap_unchecked() };
5250
let mut ch = utf8_acc_cont_byte(init, y);
5351
if x >= 0xE0 {
5452
// [[x y z] w] case
5553
// 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
56-
let z = unwrap_or_0(bytes.next());
54+
// SAFETY: `bytes` produces a UTF-8-like string,
55+
// so the iterator must produce a value here.
56+
let z = unsafe { *bytes.next().unwrap_unchecked() };
5757
let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
5858
ch = init << 12 | y_z;
5959
if x >= 0xF0 {
6060
// [x y z w] case
6161
// use only the lower 3 bits of `init`
62-
let w = unwrap_or_0(bytes.next());
62+
// SAFETY: `bytes` produces a UTF-8-like string,
63+
// so the iterator must produce a value here.
64+
let w = unsafe { *bytes.next().unwrap_unchecked() };
6365
ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
6466
}
6567
}
@@ -69,8 +71,12 @@ pub fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<
6971

7072
/// Reads the last code point out of a byte iterator (assuming a
7173
/// UTF-8-like encoding).
74+
///
75+
/// # Safety
76+
///
77+
/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string.
7278
#[inline]
73-
pub(super) fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32>
79+
pub(super) unsafe fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32>
7480
where
7581
I: DoubleEndedIterator<Item = &'a u8>,
7682
{
@@ -83,13 +89,19 @@ where
8389
// Multibyte case follows
8490
// Decode from a byte combination out of: [x [y [z w]]]
8591
let mut ch;
86-
let z = unwrap_or_0(bytes.next_back());
92+
// SAFETY: `bytes` produces a UTF-8-like string,
93+
// so the iterator must produce a value here.
94+
let z = unsafe { *bytes.next_back().unwrap_unchecked() };
8795
ch = utf8_first_byte(z, 2);
8896
if utf8_is_cont_byte(z) {
89-
let y = unwrap_or_0(bytes.next_back());
97+
// SAFETY: `bytes` produces a UTF-8-like string,
98+
// so the iterator must produce a value here.
99+
let y = unsafe { *bytes.next_back().unwrap_unchecked() };
90100
ch = utf8_first_byte(y, 3);
91101
if utf8_is_cont_byte(y) {
92-
let x = unwrap_or_0(bytes.next_back());
102+
// SAFETY: `bytes` produces a UTF-8-like string,
103+
// so the iterator must produce a value here.
104+
let x = unsafe { *bytes.next_back().unwrap_unchecked() };
93105
ch = utf8_first_byte(x, 4);
94106
ch = utf8_acc_cont_byte(ch, y);
95107
}

library/std/src/sys_common/wtf8.rs

+2-1
Original file line numberDiff line numberDiff line change
@@ -809,7 +809,8 @@ impl<'a> Iterator for Wtf8CodePoints<'a> {
809809

810810
#[inline]
811811
fn next(&mut self) -> Option<CodePoint> {
812-
next_code_point(&mut self.bytes).map(|c| CodePoint { value: c })
812+
// SAFETY: `self.bytes` has been created from a WTF-8 string
813+
unsafe { next_code_point(&mut self.bytes).map(|c| CodePoint { value: c }) }
813814
}
814815

815816
#[inline]

0 commit comments

Comments
 (0)