@@ -61,36 +61,26 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> {
61
61
}
62
62
63
63
let mut i = 0 ;
64
+ let mut valid_up_to = 0 ;
64
65
while i < self . source . len ( ) {
65
- let i_ = i ;
66
-
67
- // SAFETY: `i` starts at `0`, is less than ` self.source.len()`, and
68
- // only increases, so `0 <= i < self.source.len()`.
66
+ // SAFETY: `i < self.source.len()` per previous line.
67
+ // For some reason the following are both significantly slower:
68
+ // while let Some(&byte) = self.source.get(i) {
69
+ // while let Some(byte) = self.source.get(i).copied() {
69
70
let byte = unsafe { * self . source . get_unchecked ( i) } ;
70
71
i += 1 ;
71
72
72
73
if byte < 128 {
74
+ // This could be a `1 => ...` case in the match below, but for
75
+ // the common case of all-ASCII inputs, we bypass loading the
76
+ // sizeable UTF8_CHAR_WIDTH table into cache.
73
77
} else {
74
78
let w = utf8_char_width ( byte) ;
75
79
76
- macro_rules! error {
77
- ( ) => { {
78
- // SAFETY: We have checked up to `i` that source is valid UTF-8.
79
- unsafe {
80
- let r = Utf8LossyChunk {
81
- valid: from_utf8_unchecked( & self . source[ 0 ..i_] ) ,
82
- broken: & self . source[ i_..i] ,
83
- } ;
84
- self . source = & self . source[ i..] ;
85
- return Some ( r) ;
86
- }
87
- } } ;
88
- }
89
-
90
80
match w {
91
81
2 => {
92
82
if safe_get ( self . source , i) & 192 != TAG_CONT_U8 {
93
- error ! ( ) ;
83
+ break ;
94
84
}
95
85
i += 1 ;
96
86
}
@@ -100,13 +90,11 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> {
100
90
( 0xE1 ..=0xEC , 0x80 ..=0xBF ) => ( ) ,
101
91
( 0xED , 0x80 ..=0x9F ) => ( ) ,
102
92
( 0xEE ..=0xEF , 0x80 ..=0xBF ) => ( ) ,
103
- _ => {
104
- error ! ( ) ;
105
- }
93
+ _ => break ,
106
94
}
107
95
i += 1 ;
108
96
if safe_get ( self . source , i) & 192 != TAG_CONT_U8 {
109
- error ! ( ) ;
97
+ break ;
110
98
}
111
99
i += 1 ;
112
100
}
@@ -115,34 +103,45 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> {
115
103
( 0xF0 , 0x90 ..=0xBF ) => ( ) ,
116
104
( 0xF1 ..=0xF3 , 0x80 ..=0xBF ) => ( ) ,
117
105
( 0xF4 , 0x80 ..=0x8F ) => ( ) ,
118
- _ => {
119
- error ! ( ) ;
120
- }
106
+ _ => break ,
121
107
}
122
108
i += 1 ;
123
109
if safe_get ( self . source , i) & 192 != TAG_CONT_U8 {
124
- error ! ( ) ;
110
+ break ;
125
111
}
126
112
i += 1 ;
127
113
if safe_get ( self . source , i) & 192 != TAG_CONT_U8 {
128
- error ! ( ) ;
114
+ break ;
129
115
}
130
116
i += 1 ;
131
117
}
132
- _ => {
133
- error ! ( ) ;
134
- }
118
+ _ => break ,
135
119
}
136
120
}
121
+
122
+ valid_up_to = i;
137
123
}
138
124
139
- let r = Utf8LossyChunk {
140
- // SAFETY: We have checked that the entire source is valid UTF-8.
141
- valid : unsafe { from_utf8_unchecked ( self . source ) } ,
142
- broken : & [ ] ,
143
- } ;
144
- self . source = & [ ] ;
145
- Some ( r)
125
+ // SAFETY: `i <= self.source.len()` because it is only ever incremented
126
+ // via `i += 1` and in between every single one of those increments, `i`
127
+ // is compared against `self.source.len()`. That happens either
128
+ // literally by `i < self.source.len()` in the while-loop's condition,
129
+ // or indirectly by `safe_get(self.source, i) & 192 != TAG_CONT_U8`. The
130
+ // loop is terminated as soon as the latest `i += 1` has made `i` no
131
+ // longer less than `self.source.len()`, which means it'll be at most
132
+ // equal to `self.source.len()`.
133
+ let ( inspected, remaining) = unsafe { self . source . split_at_unchecked ( i) } ;
134
+ self . source = remaining;
135
+
136
+ // SAFETY: `valid_up_to <= i` because it is only ever assigned via
137
+ // `valid_up_to = i` and `i` only increases.
138
+ let ( valid, broken) = unsafe { inspected. split_at_unchecked ( valid_up_to) } ;
139
+
140
+ Some ( Utf8LossyChunk {
141
+ // SAFETY: All bytes up to `valid_up_to` are valid UTF-8.
142
+ valid : unsafe { from_utf8_unchecked ( valid) } ,
143
+ broken,
144
+ } )
146
145
}
147
146
}
148
147
0 commit comments