Skip to content

Commit 94bec90

Browse files
committed
Auto merge of #91244 - dtolnay:lossy, r=Mark-Simulacrum
Eliminate a bunch of copies of the error codepath from Utf8LossyChunksIter
Using a macro to stamp out 7 identical copies of the nontrivial slicing logic to exit this loop didn't seem like a necessary use of a macro. The early return case can be handled by `break` with practically no changes to the logic inside the loop. All this code is from early 2014 (#12062—nearly 8 years ago; pre-1.0) so it's possible there were compiler limitations that forced the macro way at the time. Confirmed that `x.py bench library/alloc --stage 0 --test-args from_utf8_lossy` is unaffected on my machine.
2 parents 6db0a0e + c6810a5 commit 94bec90

File tree

1 file changed

+37
-38
lines changed

1 file changed

+37
-38
lines changed

library/core/src/str/lossy.rs

+37-38
Original file line numberDiff line numberDiff line change
@@ -61,36 +61,26 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> {
6161
}
6262

6363
let mut i = 0;
64+
let mut valid_up_to = 0;
6465
while i < self.source.len() {
65-
let i_ = i;
66-
67-
// SAFETY: `i` starts at `0`, is less than `self.source.len()`, and
68-
// only increases, so `0 <= i < self.source.len()`.
66+
// SAFETY: `i < self.source.len()` per previous line.
67+
// For some reason the following are both significantly slower:
68+
// while let Some(&byte) = self.source.get(i) {
69+
// while let Some(byte) = self.source.get(i).copied() {
6970
let byte = unsafe { *self.source.get_unchecked(i) };
7071
i += 1;
7172

7273
if byte < 128 {
74+
// This could be a `1 => ...` case in the match below, but for
75+
// the common case of all-ASCII inputs, we bypass loading the
76+
// sizeable UTF8_CHAR_WIDTH table into cache.
7377
} else {
7478
let w = utf8_char_width(byte);
7579

76-
macro_rules! error {
77-
() => {{
78-
// SAFETY: We have checked up to `i` that source is valid UTF-8.
79-
unsafe {
80-
let r = Utf8LossyChunk {
81-
valid: from_utf8_unchecked(&self.source[0..i_]),
82-
broken: &self.source[i_..i],
83-
};
84-
self.source = &self.source[i..];
85-
return Some(r);
86-
}
87-
}};
88-
}
89-
9080
match w {
9181
2 => {
9282
if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
93-
error!();
83+
break;
9484
}
9585
i += 1;
9686
}
@@ -100,13 +90,11 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> {
10090
(0xE1..=0xEC, 0x80..=0xBF) => (),
10191
(0xED, 0x80..=0x9F) => (),
10292
(0xEE..=0xEF, 0x80..=0xBF) => (),
103-
_ => {
104-
error!();
105-
}
93+
_ => break,
10694
}
10795
i += 1;
10896
if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
109-
error!();
97+
break;
11098
}
11199
i += 1;
112100
}
@@ -115,34 +103,45 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> {
115103
(0xF0, 0x90..=0xBF) => (),
116104
(0xF1..=0xF3, 0x80..=0xBF) => (),
117105
(0xF4, 0x80..=0x8F) => (),
118-
_ => {
119-
error!();
120-
}
106+
_ => break,
121107
}
122108
i += 1;
123109
if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
124-
error!();
110+
break;
125111
}
126112
i += 1;
127113
if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
128-
error!();
114+
break;
129115
}
130116
i += 1;
131117
}
132-
_ => {
133-
error!();
134-
}
118+
_ => break,
135119
}
136120
}
121+
122+
valid_up_to = i;
137123
}
138124

139-
let r = Utf8LossyChunk {
140-
// SAFETY: We have checked that the entire source is valid UTF-8.
141-
valid: unsafe { from_utf8_unchecked(self.source) },
142-
broken: &[],
143-
};
144-
self.source = &[];
145-
Some(r)
125+
// SAFETY: `i <= self.source.len()` because it is only ever incremented
126+
// via `i += 1` and in between every single one of those increments, `i`
127+
// is compared against `self.source.len()`. That happens either
128+
// literally by `i < self.source.len()` in the while-loop's condition,
129+
// or indirectly by `safe_get(self.source, i) & 192 != TAG_CONT_U8`. The
130+
// loop is terminated as soon as the latest `i += 1` has made `i` no
131+
// longer less than `self.source.len()`, which means it'll be at most
132+
// equal to `self.source.len()`.
133+
let (inspected, remaining) = unsafe { self.source.split_at_unchecked(i) };
134+
self.source = remaining;
135+
136+
// SAFETY: `valid_up_to <= i` because it is only ever assigned via
137+
// `valid_up_to = i` and `i` only increases.
138+
let (valid, broken) = unsafe { inspected.split_at_unchecked(valid_up_to) };
139+
140+
Some(Utf8LossyChunk {
141+
// SAFETY: All bytes up to `valid_up_to` are valid UTF-8.
142+
valid: unsafe { from_utf8_unchecked(valid) },
143+
broken,
144+
})
146145
}
147146
}
148147

0 commit comments

Comments
 (0)