Skip to content

Commit cb5e4ab

Browse files
Rollup merge of rust-lang#67569 - Mark-Simulacrum:opt-char-encode, r=oli-obk
Clean up unsafety in char::encode_utf8. This originally started as an attempt to allow LLVM to optimize through encode_utf8 to detect the try_encode_utf8 case (rust-lang#52579, rust-lang#52580), but due to a typo, my conclusion that my optimizations were successful was incorrect. Furthermore, as far as I can tell, this optimization is probably just not possible with LLVM today. This [code](https://rust.godbolt.org/z/JggRj4) compiles down to a long series of compares; notably, two identical series of compares. That essentially means that LLVM is today unable to see that these two `if`s are identical and as such can be merged, and then to realize that no value of the `if` condition can result in a call to `please_delete`. As such, for now, we do not attempt to specifically optimize for that case.
2 parents b8b9a0f + df4d490 commit cb5e4ab

File tree

2 files changed

+30
-30
lines changed

2 files changed

+30
-30
lines changed

src/libcore/char/methods.rs

+29-30
Original file line numberDiff line numberDiff line change
@@ -434,36 +434,35 @@ impl char {
434434
#[inline]
435435
pub fn encode_utf8(self, dst: &mut [u8]) -> &mut str {
436436
let code = self as u32;
437-
// SAFETY: each arm checks the size of the slice and only uses `get_unchecked` unsafe ops
438-
unsafe {
439-
let len = if code < MAX_ONE_B && !dst.is_empty() {
440-
*dst.get_unchecked_mut(0) = code as u8;
441-
1
442-
} else if code < MAX_TWO_B && dst.len() >= 2 {
443-
*dst.get_unchecked_mut(0) = (code >> 6 & 0x1F) as u8 | TAG_TWO_B;
444-
*dst.get_unchecked_mut(1) = (code & 0x3F) as u8 | TAG_CONT;
445-
2
446-
} else if code < MAX_THREE_B && dst.len() >= 3 {
447-
*dst.get_unchecked_mut(0) = (code >> 12 & 0x0F) as u8 | TAG_THREE_B;
448-
*dst.get_unchecked_mut(1) = (code >> 6 & 0x3F) as u8 | TAG_CONT;
449-
*dst.get_unchecked_mut(2) = (code & 0x3F) as u8 | TAG_CONT;
450-
3
451-
} else if dst.len() >= 4 {
452-
*dst.get_unchecked_mut(0) = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
453-
*dst.get_unchecked_mut(1) = (code >> 12 & 0x3F) as u8 | TAG_CONT;
454-
*dst.get_unchecked_mut(2) = (code >> 6 & 0x3F) as u8 | TAG_CONT;
455-
*dst.get_unchecked_mut(3) = (code & 0x3F) as u8 | TAG_CONT;
456-
4
457-
} else {
458-
panic!(
459-
"encode_utf8: need {} bytes to encode U+{:X}, but the buffer has {}",
460-
from_u32_unchecked(code).len_utf8(),
461-
code,
462-
dst.len(),
463-
)
464-
};
465-
from_utf8_unchecked_mut(dst.get_unchecked_mut(..len))
466-
}
437+
let len = self.len_utf8();
438+
match (len, &mut dst[..]) {
439+
(1, [a, ..]) => {
440+
*a = code as u8;
441+
}
442+
(2, [a, b, ..]) => {
443+
*a = (code >> 6 & 0x1F) as u8 | TAG_TWO_B;
444+
*b = (code & 0x3F) as u8 | TAG_CONT;
445+
}
446+
(3, [a, b, c, ..]) => {
447+
*a = (code >> 12 & 0x0F) as u8 | TAG_THREE_B;
448+
*b = (code >> 6 & 0x3F) as u8 | TAG_CONT;
449+
*c = (code & 0x3F) as u8 | TAG_CONT;
450+
}
451+
(4, [a, b, c, d, ..]) => {
452+
*a = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
453+
*b = (code >> 12 & 0x3F) as u8 | TAG_CONT;
454+
*c = (code >> 6 & 0x3F) as u8 | TAG_CONT;
455+
*d = (code & 0x3F) as u8 | TAG_CONT;
456+
}
457+
_ => panic!(
458+
"encode_utf8: need {} bytes to encode U+{:X}, but the buffer has {}",
459+
len,
460+
code,
461+
dst.len(),
462+
),
463+
};
464+
// SAFETY: We just wrote UTF-8 content in, so converting to str is fine.
465+
unsafe { from_utf8_unchecked_mut(&mut dst[..len]) }
467466
}
468467

469468
/// Encodes this character as UTF-16 into the provided `u16` buffer,

src/libcore/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,7 @@
129129
#![feature(associated_type_bounds)]
130130
#![feature(const_type_id)]
131131
#![feature(const_caller_location)]
132+
#![feature(slice_patterns)]
132133

133134
#[prelude_import]
134135
#[allow(unused)]

0 commit comments

Comments
 (0)