Auto merge of #91244 - dtolnay:lossy, r=Mark-Simulacrum

bors · bors · commit 94bec9070223 · 2021-11-30T01:08:56.000Z
Eliminate bunch of copies of error codepath from Utf8LossyChunksIter Using a macro to stamp out 7 identical copies of the nontrivial slicing logic to exit this loop didn't seem like a necessary use of a macro. The early return case can be handled by `break` without practically any changes to the logic inside the loop. All this code is from early 2014 (#12062&mdash;nearly 8 years ago; pre-1.0) so it's possible there were compiler limitations that forced the macro way at the time. Confirmed that `x.py bench library/alloc --stage 0 --test-args from_utf8_lossy` is unaffected on my machine.
diff --git a/library/core/src/str/lossy.rs b/library/core/src/str/lossy.rs
@@ -61,36 +61,26 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> {
         }
 
         let mut i = 0;
+        let mut valid_up_to = 0;
         while i < self.source.len() {
-            let i_ = i;
-
-            // SAFETY: `i` starts at `0`, is less than `self.source.len()`, and
-            // only increases, so `0 <= i < self.source.len()`.
+            // SAFETY: `i < self.source.len()` per previous line.
+            // For some reason the following are both significantly slower:
+            // while let Some(&byte) = self.source.get(i) {
+            // while let Some(byte) = self.source.get(i).copied() {
             let byte = unsafe { *self.source.get_unchecked(i) };
             i += 1;
 
             if byte < 128 {
+                // This could be a `1 => ...` case in the match below, but for
+                // the common case of all-ASCII inputs, we bypass loading the
+                // sizeable UTF8_CHAR_WIDTH table into cache.
             } else {
                 let w = utf8_char_width(byte);
 
-                macro_rules! error {
-                    () => {{
-                        // SAFETY: We have checked up to `i` that source is valid UTF-8.
-                        unsafe {
-                            let r = Utf8LossyChunk {
-                                valid: from_utf8_unchecked(&self.source[0..i_]),
-                                broken: &self.source[i_..i],
-                            };
-                            self.source = &self.source[i..];
-                            return Some(r);
-                        }
-                    }};
-                }
-
                 match w {
                     2 => {
                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
-                            error!();
+                            break;
                         }
                         i += 1;
                     }
@@ -100,13 +90,11 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> {
                             (0xE1..=0xEC, 0x80..=0xBF) => (),
                             (0xED, 0x80..=0x9F) => (),
                             (0xEE..=0xEF, 0x80..=0xBF) => (),
-                            _ => {
-                                error!();
-                            }
+                            _ => break,
                         }
                         i += 1;
                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
-                            error!();
+                            break;
                         }
                         i += 1;
                     }
@@ -115,34 +103,45 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> {
                             (0xF0, 0x90..=0xBF) => (),
                             (0xF1..=0xF3, 0x80..=0xBF) => (),
                             (0xF4, 0x80..=0x8F) => (),
-                            _ => {
-                                error!();
-                            }
+                            _ => break,
                         }
                         i += 1;
                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
-                            error!();
+                            break;
                         }
                         i += 1;
                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
-                            error!();
+                            break;
                         }
                         i += 1;
                     }
-                    _ => {
-                        error!();
-                    }
+                    _ => break,
                 }
             }
+
+            valid_up_to = i;
         }
 
-        let r = Utf8LossyChunk {
-            // SAFETY: We have checked that the entire source is valid UTF-8.
-            valid: unsafe { from_utf8_unchecked(self.source) },
-            broken: &[],
-        };
-        self.source = &[];
-        Some(r)
+        // SAFETY: `i <= self.source.len()` because it is only ever incremented
+        // via `i += 1` and in between every single one of those increments, `i`
+        // is compared against `self.source.len()`. That happens either
+        // literally by `i < self.source.len()` in the while-loop's condition,
+        // or indirectly by `safe_get(self.source, i) & 192 != TAG_CONT_U8`. The
+        // loop is terminated as soon as the latest `i += 1` has made `i` no
+        // longer less than `self.source.len()`, which means it'll be at most
+        // equal to `self.source.len()`.
+        let (inspected, remaining) = unsafe { self.source.split_at_unchecked(i) };
+        self.source = remaining;
+
+        // SAFETY: `valid_up_to <= i` because it is only ever assigned via
+        // `valid_up_to = i` and `i` only increases.
+        let (valid, broken) = unsafe { inspected.split_at_unchecked(valid_up_to) };
+
+        Some(Utf8LossyChunk {
+            // SAFETY: All bytes up to `valid_up_to` are valid UTF-8.
+            valid: unsafe { from_utf8_unchecked(valid) },
+            broken,
+        })
     }
 }