Skip to content

Commit caf8bcc

Browse files
committed
Optimize Wtf8Buf::into_string for the case where it contains UTF-8.
Add an `is_known_utf8` flag to `Wtf8Buf`, which tracks whether the string is known to contain UTF-8. This is efficiently computed in many common situations, such as when a `Wtf8Buf` is constructed from a `String` or `&str`, or with `Wtf8Buf::from_wide` which is already doing UTF-16 decoding and already checking for surrogates. This makes `OsString::into_string` O(1) rather than O(N) on Windows in common cases. It also eliminates the need to scan through the string for surrogates in `Args::next` and `Vars::next`, because the strings are already being translated with `Wtf8Buf::from_wide`. Many things on Windows construct `OsString`s with `Wtf8Buf::from_wide`, such as `DirEntry::file_name` and `fs::read_link`, so with this patch, users of those functions can subsequently call `.into_string()` without paying for an extra scan through the string for surrogates.
1 parent 10f4ce3 commit caf8bcc

File tree

3 files changed

+366
-39
lines changed

3 files changed

+366
-39
lines changed

library/std/src/sys/windows/os_str.rs

+1-3
Original file line numberDiff line numberDiff line change
@@ -164,9 +164,7 @@ impl Slice {
164164
}
165165

166166
pub fn to_owned(&self) -> Buf {
167-
let mut buf = Wtf8Buf::with_capacity(self.inner.len());
168-
buf.push_wtf8(&self.inner);
169-
Buf { inner: buf }
167+
Buf { inner: self.inner.to_owned() }
170168
}
171169

172170
pub fn clone_into(&self, buf: &mut Buf) {

library/std/src/sys_common/wtf8.rs

+81-17
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,24 @@ impl CodePoint {
8989
self.value
9090
}
9191

92+
/// Returns the numeric value of the code point if it is a leading surrogate.
93+
#[inline]
94+
pub fn to_lead_surrogate(&self) -> Option<u16> {
95+
match self.value {
96+
lead @ 0xD800..=0xDBFF => Some(lead as u16),
97+
_ => None,
98+
}
99+
}
100+
101+
/// Returns the numeric value of the code point if it is a trailing surrogate.
102+
#[inline]
103+
pub fn to_trail_surrogate(&self) -> Option<u16> {
104+
match self.value {
105+
trail @ 0xDC00..=0xDFFF => Some(trail as u16),
106+
_ => None,
107+
}
108+
}
109+
92110
/// Optionally returns a Unicode scalar value for the code point.
93111
///
94112
/// Returns `None` if the code point is a surrogate (from U+D800 to U+DFFF).
@@ -117,6 +135,14 @@ impl CodePoint {
117135
#[derive(Eq, PartialEq, Ord, PartialOrd, Clone)]
118136
pub struct Wtf8Buf {
119137
bytes: Vec<u8>,
138+
139+
/// Do we know that `bytes` holds a valid UTF-8 encoding? We can easily
140+
/// know this if we're constructed from a `String` or `&str`.
141+
///
142+
/// It is possible for `bytes` to have valid UTF-8 without this being
143+
/// set, such as when we're concatenating `&Wtf8`'s and surrogates become
144+
/// paired, as we don't bother to rescan the entire string.
145+
is_known_utf8: bool,
120146
}
121147

122148
impl ops::Deref for Wtf8Buf {
@@ -147,13 +173,13 @@ impl Wtf8Buf {
147173
/// Creates a new, empty WTF-8 string.
148174
#[inline]
149175
pub fn new() -> Wtf8Buf {
150-
Wtf8Buf { bytes: Vec::new() }
176+
Wtf8Buf { bytes: Vec::new(), is_known_utf8: true }
151177
}
152178

153179
/// Creates a new, empty WTF-8 string with pre-allocated capacity for `capacity` bytes.
154180
#[inline]
155181
pub fn with_capacity(capacity: usize) -> Wtf8Buf {
156-
Wtf8Buf { bytes: Vec::with_capacity(capacity) }
182+
Wtf8Buf { bytes: Vec::with_capacity(capacity), is_known_utf8: true }
157183
}
158184

159185
/// Creates a WTF-8 string from a UTF-8 `String`.
@@ -163,7 +189,7 @@ impl Wtf8Buf {
163189
/// Since WTF-8 is a superset of UTF-8, this always succeeds.
164190
#[inline]
165191
pub fn from_string(string: String) -> Wtf8Buf {
166-
Wtf8Buf { bytes: string.into_bytes() }
192+
Wtf8Buf { bytes: string.into_bytes(), is_known_utf8: true }
167193
}
168194

169195
/// Creates a WTF-8 string from a UTF-8 `&str` slice.
@@ -173,11 +199,12 @@ impl Wtf8Buf {
173199
/// Since WTF-8 is a superset of UTF-8, this always succeeds.
174200
#[inline]
175201
pub fn from_str(str: &str) -> Wtf8Buf {
176-
Wtf8Buf { bytes: <[_]>::to_vec(str.as_bytes()) }
202+
Wtf8Buf { bytes: <[_]>::to_vec(str.as_bytes()), is_known_utf8: true }
177203
}
178204

179205
pub fn clear(&mut self) {
180-
self.bytes.clear()
206+
self.bytes.clear();
207+
self.is_known_utf8 = true;
181208
}
182209

183210
/// Creates a WTF-8 string from a potentially ill-formed UTF-16 slice of 16-bit code units.
@@ -195,15 +222,17 @@ impl Wtf8Buf {
195222
let code_point = unsafe { CodePoint::from_u32_unchecked(surrogate as u32) };
196223
// Skip the WTF-8 concatenation check,
197224
// surrogate pairs are already decoded by decode_utf16
198-
string.push_code_point_unchecked(code_point)
225+
string.push_code_point_unchecked(code_point);
226+
// The string now contains an unpaired surrogate.
227+
string.is_known_utf8 = false;
199228
}
200229
}
201230
}
202231
string
203232
}
204233

205234
/// Copied from String::push
206-
/// This does **not** include the WTF-8 concatenation check.
235+
/// This does **not** include the WTF-8 concatenation check or `is_known_utf8` check.
207236
fn push_code_point_unchecked(&mut self, code_point: CodePoint) {
208237
let mut bytes = [0; 4];
209238
let bytes = char::encode_utf8_raw(code_point.value, &mut bytes);
@@ -217,6 +246,9 @@ impl Wtf8Buf {
217246

218247
#[inline]
219248
pub fn as_mut_slice(&mut self) -> &mut Wtf8 {
249+
// Safety: `Wtf8` doesn't expose any way to mutate the bytes that would
250+
// cause them to change from well-formed UTF-8 to ill-formed UTF-8,
251+
// which would break the assumptions of the `is_known_utf8` field.
220252
unsafe { Wtf8::from_mut_bytes_unchecked(&mut self.bytes) }
221253
}
222254

@@ -313,7 +345,15 @@ impl Wtf8Buf {
313345
self.push_char(decode_surrogate_pair(lead, trail));
314346
self.bytes.extend_from_slice(other_without_trail_surrogate);
315347
}
316-
_ => self.bytes.extend_from_slice(&other.bytes),
348+
_ => {
349+
self.bytes.extend_from_slice(&other.bytes);
350+
351+
// If we're pushing a string containing a surrogate, we may no
352+
// longer have UTF-8.
353+
if other.next_surrogate(0).is_some() {
354+
self.is_known_utf8 = false;
355+
}
356+
}
317357
}
318358
}
319359

@@ -330,13 +370,19 @@ impl Wtf8Buf {
330370
/// like concatenating ill-formed UTF-16 strings effectively would.
331371
#[inline]
332372
pub fn push(&mut self, code_point: CodePoint) {
333-
if let trail @ 0xDC00..=0xDFFF = code_point.to_u32() {
373+
if let Some(trail) = code_point.to_trail_surrogate() {
334374
if let Some(lead) = (&*self).final_lead_surrogate() {
335375
let len_without_lead_surrogate = self.len() - 3;
336376
self.bytes.truncate(len_without_lead_surrogate);
337-
self.push_char(decode_surrogate_pair(lead, trail as u16));
377+
self.push_char(decode_surrogate_pair(lead, trail));
338378
return;
339379
}
380+
381+
// We're pushing a trailing surrogate.
382+
self.is_known_utf8 = false;
383+
} else if code_point.to_lead_surrogate().is_some() {
384+
// We're pushing a leading surrogate.
385+
self.is_known_utf8 = false;
340386
}
341387

342388
// No newly paired surrogates at the boundary.
@@ -363,9 +409,10 @@ impl Wtf8Buf {
363409
/// (that is, if the string contains surrogates),
364410
/// the original WTF-8 string is returned instead.
365411
pub fn into_string(self) -> Result<String, Wtf8Buf> {
366-
match self.next_surrogate(0) {
367-
None => Ok(unsafe { String::from_utf8_unchecked(self.bytes) }),
368-
Some(_) => Err(self),
412+
if self.is_known_utf8 || self.next_surrogate(0).is_none() {
413+
Ok(unsafe { String::from_utf8_unchecked(self.bytes) })
414+
} else {
415+
Err(self)
369416
}
370417
}
371418

@@ -375,6 +422,11 @@ impl Wtf8Buf {
375422
///
376423
/// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”)
377424
pub fn into_string_lossy(mut self) -> String {
425+
// Fast path: If we already have UTF-8, we can return it immediately.
426+
if self.is_known_utf8 {
427+
return unsafe { String::from_utf8_unchecked(self.bytes) };
428+
}
429+
378430
let mut pos = 0;
379431
loop {
380432
match self.next_surrogate(pos) {
@@ -397,7 +449,7 @@ impl Wtf8Buf {
397449
/// Converts a `Box<Wtf8>` into a `Wtf8Buf`.
398450
pub fn from_box(boxed: Box<Wtf8>) -> Wtf8Buf {
399451
let bytes: Box<[u8]> = unsafe { mem::transmute(boxed) };
400-
Wtf8Buf { bytes: bytes.into_vec() }
452+
Wtf8Buf { bytes: bytes.into_vec(), is_known_utf8: false }
401453
}
402454
}
403455

@@ -575,6 +627,11 @@ impl Wtf8 {
575627
}
576628
}
577629

630+
/// Creates an owned `Wtf8Buf` from a borrowed `Wtf8`.
631+
pub fn to_owned(&self) -> Wtf8Buf {
632+
Wtf8Buf { bytes: self.bytes.to_vec(), is_known_utf8: false }
633+
}
634+
578635
/// Lossily converts the string to UTF-8.
579636
/// Returns a UTF-8 `&str` slice if the contents are well-formed in UTF-8.
580637
///
@@ -664,7 +721,8 @@ impl Wtf8 {
664721
}
665722

666723
pub fn clone_into(&self, buf: &mut Wtf8Buf) {
667-
self.bytes.clone_into(&mut buf.bytes)
724+
self.bytes.clone_into(&mut buf.bytes);
725+
buf.is_known_utf8 = false;
668726
}
669727

670728
/// Boxes this `Wtf8`.
@@ -704,12 +762,18 @@ impl Wtf8 {
704762

705763
#[inline]
706764
pub fn to_ascii_lowercase(&self) -> Wtf8Buf {
707-
Wtf8Buf { bytes: self.bytes.to_ascii_lowercase() }
765+
Wtf8Buf {
766+
bytes: self.bytes.to_ascii_lowercase(),
767+
is_known_utf8: self.next_surrogate(0).is_none(),
768+
}
708769
}
709770

710771
#[inline]
711772
pub fn to_ascii_uppercase(&self) -> Wtf8Buf {
712-
Wtf8Buf { bytes: self.bytes.to_ascii_uppercase() }
773+
Wtf8Buf {
774+
bytes: self.bytes.to_ascii_uppercase(),
775+
is_known_utf8: self.next_surrogate(0).is_none(),
776+
}
713777
}
714778

715779
#[inline]

0 commit comments

Comments
 (0)