@@ -89,6 +89,24 @@ impl CodePoint {
89
89
self . value
90
90
}
91
91
92
+ /// Returns the numeric value of the code point if it is a leading surrogate.
93
+ #[ inline]
94
+ pub fn to_lead_surrogate ( & self ) -> Option < u16 > {
95
+ match self . value {
96
+ lead @ 0xD800 ..=0xDBFF => Some ( lead as u16 ) ,
97
+ _ => None ,
98
+ }
99
+ }
100
+
101
+ /// Returns the numeric value of the code point if it is a trailing surrogate.
102
+ #[ inline]
103
+ pub fn to_trail_surrogate ( & self ) -> Option < u16 > {
104
+ match self . value {
105
+ trail @ 0xDC00 ..=0xDFFF => Some ( trail as u16 ) ,
106
+ _ => None ,
107
+ }
108
+ }
109
+
92
110
/// Optionally returns a Unicode scalar value for the code point.
93
111
///
94
112
/// Returns `None` if the code point is a surrogate (from U+D800 to U+DFFF).
@@ -117,6 +135,14 @@ impl CodePoint {
117
135
/// An owned WTF-8 string backed by a byte vector.
///
/// Equality and ordering are defined by the encoded bytes alone. The
/// `is_known_utf8` flag is only a cached hint and is deliberately excluded:
/// two buffers holding the same bytes must compare equal even when only one
/// of them has the flag set.
#[derive(Clone)]
pub struct Wtf8Buf {
    bytes: Vec<u8>,

    /// Do we know that `bytes` holds a valid UTF-8 encoding? We can easily
    /// know this if we're constructed from a `String` or `&str`.
    ///
    /// It is possible for `bytes` to have valid UTF-8 without this being
    /// set, such as when we're concatenating `&Wtf8`'s and surrogates become
    /// paired, as we don't bother to rescan the entire string.
    is_known_utf8: bool,
}

impl PartialEq for Wtf8Buf {
    /// Compares the encoded bytes only; the UTF-8 hint is ignored.
    #[inline]
    fn eq(&self, other: &Wtf8Buf) -> bool {
        self.bytes == other.bytes
    }
}

impl Eq for Wtf8Buf {}

impl PartialOrd for Wtf8Buf {
    /// Orders by the encoded bytes only, consistently with `Ord`.
    #[inline]
    fn partial_cmp(&self, other: &Wtf8Buf) -> Option<core::cmp::Ordering> {
        Some(self.bytes.cmp(&other.bytes))
    }
}

impl Ord for Wtf8Buf {
    /// Orders by the encoded bytes only; the UTF-8 hint is ignored.
    #[inline]
    fn cmp(&self, other: &Wtf8Buf) -> core::cmp::Ordering {
        self.bytes.cmp(&other.bytes)
    }
}
121
147
122
148
impl ops:: Deref for Wtf8Buf {
@@ -147,13 +173,13 @@ impl Wtf8Buf {
147
173
/// Creates a new, empty WTF-8 string.
148
174
#[ inline]
149
175
pub fn new ( ) -> Wtf8Buf {
150
- Wtf8Buf { bytes : Vec :: new ( ) }
176
+ Wtf8Buf { bytes : Vec :: new ( ) , is_known_utf8 : true }
151
177
}
152
178
153
179
/// Creates a new, empty WTF-8 string with pre-allocated capacity for `capacity` bytes.
154
180
#[ inline]
155
181
pub fn with_capacity ( capacity : usize ) -> Wtf8Buf {
156
- Wtf8Buf { bytes : Vec :: with_capacity ( capacity) }
182
+ Wtf8Buf { bytes : Vec :: with_capacity ( capacity) , is_known_utf8 : true }
157
183
}
158
184
159
185
/// Creates a WTF-8 string from a UTF-8 `String`.
@@ -163,7 +189,7 @@ impl Wtf8Buf {
163
189
/// Since WTF-8 is a superset of UTF-8, this always succeeds.
164
190
#[ inline]
165
191
pub fn from_string ( string : String ) -> Wtf8Buf {
166
- Wtf8Buf { bytes : string. into_bytes ( ) }
192
+ Wtf8Buf { bytes : string. into_bytes ( ) , is_known_utf8 : true }
167
193
}
168
194
169
195
/// Creates a WTF-8 string from a UTF-8 `&str` slice.
@@ -173,11 +199,12 @@ impl Wtf8Buf {
173
199
/// Since WTF-8 is a superset of UTF-8, this always succeeds.
174
200
#[ inline]
175
201
pub fn from_str ( str : & str ) -> Wtf8Buf {
176
- Wtf8Buf { bytes : <[ _ ] >:: to_vec ( str. as_bytes ( ) ) }
202
+ Wtf8Buf { bytes : <[ _ ] >:: to_vec ( str. as_bytes ( ) ) , is_known_utf8 : true }
177
203
}
178
204
179
205
pub fn clear ( & mut self ) {
180
- self . bytes . clear ( )
206
+ self . bytes . clear ( ) ;
207
+ self . is_known_utf8 = true ;
181
208
}
182
209
183
210
/// Creates a WTF-8 string from a potentially ill-formed UTF-16 slice of 16-bit code units.
@@ -195,15 +222,17 @@ impl Wtf8Buf {
195
222
let code_point = unsafe { CodePoint :: from_u32_unchecked ( surrogate as u32 ) } ;
196
223
// Skip the WTF-8 concatenation check,
197
224
// surrogate pairs are already decoded by decode_utf16
198
- string. push_code_point_unchecked ( code_point)
225
+ string. push_code_point_unchecked ( code_point) ;
226
+ // The string now contains an unpaired surrogate.
227
+ string. is_known_utf8 = false ;
199
228
}
200
229
}
201
230
}
202
231
string
203
232
}
204
233
205
234
/// Copied from String::push
206
- /// This does **not** include the WTF-8 concatenation check.
235
+ /// This does **not** include the WTF-8 concatenation check or the `is_known_utf8` check.
207
236
fn push_code_point_unchecked ( & mut self , code_point : CodePoint ) {
208
237
let mut bytes = [ 0 ; 4 ] ;
209
238
let bytes = char:: encode_utf8_raw ( code_point. value , & mut bytes) ;
@@ -217,6 +246,9 @@ impl Wtf8Buf {
217
246
218
247
#[ inline]
219
248
pub fn as_mut_slice ( & mut self ) -> & mut Wtf8 {
249
+ // Safety: `Wtf8` doesn't expose any way to mutate the bytes that would
250
+ // cause them to change from well-formed UTF-8 to ill-formed UTF-8,
251
+ // which would break the assumptions of the `is_known_utf8` field.
220
252
unsafe { Wtf8 :: from_mut_bytes_unchecked ( & mut self . bytes ) }
221
253
}
222
254
@@ -313,7 +345,15 @@ impl Wtf8Buf {
313
345
self . push_char ( decode_surrogate_pair ( lead, trail) ) ;
314
346
self . bytes . extend_from_slice ( other_without_trail_surrogate) ;
315
347
}
316
- _ => self . bytes . extend_from_slice ( & other. bytes ) ,
348
+ _ => {
349
+ self . bytes . extend_from_slice ( & other. bytes ) ;
350
+
351
+ // If we're pushing a string containing a surrogate, we may no
352
+ // longer have UTF-8.
353
+ if other. next_surrogate ( 0 ) . is_some ( ) {
354
+ self . is_known_utf8 = false ;
355
+ }
356
+ }
317
357
}
318
358
}
319
359
@@ -330,13 +370,19 @@ impl Wtf8Buf {
330
370
/// like concatenating ill-formed UTF-16 strings effectively would.
331
371
#[ inline]
332
372
pub fn push ( & mut self , code_point : CodePoint ) {
333
- if let trail @ 0xDC00 ..= 0xDFFF = code_point. to_u32 ( ) {
373
+ if let Some ( trail) = code_point. to_trail_surrogate ( ) {
334
374
if let Some ( lead) = ( & * self ) . final_lead_surrogate ( ) {
335
375
let len_without_lead_surrogate = self . len ( ) - 3 ;
336
376
self . bytes . truncate ( len_without_lead_surrogate) ;
337
- self . push_char ( decode_surrogate_pair ( lead, trail as u16 ) ) ;
377
+ self . push_char ( decode_surrogate_pair ( lead, trail) ) ;
338
378
return ;
339
379
}
380
+
381
+ // We're pushing a trailing surrogate.
382
+ self . is_known_utf8 = false ;
383
+ } else if code_point. to_lead_surrogate ( ) . is_some ( ) {
384
+ // We're pushing a leading surrogate.
385
+ self . is_known_utf8 = false ;
340
386
}
341
387
342
388
// No newly paired surrogates at the boundary.
@@ -363,9 +409,10 @@ impl Wtf8Buf {
363
409
/// (that is, if the string contains surrogates),
364
410
/// the original WTF-8 string is returned instead.
365
411
pub fn into_string ( self ) -> Result < String , Wtf8Buf > {
366
- match self . next_surrogate ( 0 ) {
367
- None => Ok ( unsafe { String :: from_utf8_unchecked ( self . bytes ) } ) ,
368
- Some ( _) => Err ( self ) ,
412
+ if self . is_known_utf8 || self . next_surrogate ( 0 ) . is_none ( ) {
413
+ Ok ( unsafe { String :: from_utf8_unchecked ( self . bytes ) } )
414
+ } else {
415
+ Err ( self )
369
416
}
370
417
}
371
418
@@ -375,6 +422,11 @@ impl Wtf8Buf {
375
422
///
376
423
/// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”)
377
424
pub fn into_string_lossy ( mut self ) -> String {
425
+ // Fast path: If we already have UTF-8, we can return it immediately.
426
+ if self . is_known_utf8 {
427
+ return unsafe { String :: from_utf8_unchecked ( self . bytes ) } ;
428
+ }
429
+
378
430
let mut pos = 0 ;
379
431
loop {
380
432
match self . next_surrogate ( pos) {
@@ -397,7 +449,7 @@ impl Wtf8Buf {
397
449
/// Converts a `Box<Wtf8>` into a `Wtf8Buf`.
398
450
pub fn from_box ( boxed : Box < Wtf8 > ) -> Wtf8Buf {
399
451
let bytes: Box < [ u8 ] > = unsafe { mem:: transmute ( boxed) } ;
400
- Wtf8Buf { bytes : bytes. into_vec ( ) }
452
+ Wtf8Buf { bytes : bytes. into_vec ( ) , is_known_utf8 : false }
401
453
}
402
454
}
403
455
@@ -575,6 +627,11 @@ impl Wtf8 {
575
627
}
576
628
}
577
629
630
+ /// Creates an owned `Wtf8Buf` from a borrowed `Wtf8`.
631
+ pub fn to_owned ( & self ) -> Wtf8Buf {
632
+ Wtf8Buf { bytes : self . bytes . to_vec ( ) , is_known_utf8 : false }
633
+ }
634
+
578
635
/// Lossily converts the string to UTF-8.
579
636
/// Returns a UTF-8 `&str` slice if the contents are well-formed in UTF-8.
580
637
///
@@ -664,7 +721,8 @@ impl Wtf8 {
664
721
}
665
722
666
723
pub fn clone_into ( & self , buf : & mut Wtf8Buf ) {
667
- self . bytes . clone_into ( & mut buf. bytes )
724
+ self . bytes . clone_into ( & mut buf. bytes ) ;
725
+ buf. is_known_utf8 = false ;
668
726
}
669
727
670
728
/// Boxes this `Wtf8`.
@@ -704,12 +762,18 @@ impl Wtf8 {
704
762
705
763
#[ inline]
706
764
pub fn to_ascii_lowercase ( & self ) -> Wtf8Buf {
707
- Wtf8Buf { bytes : self . bytes . to_ascii_lowercase ( ) }
765
+ Wtf8Buf {
766
+ bytes : self . bytes . to_ascii_lowercase ( ) ,
767
+ is_known_utf8 : self . next_surrogate ( 0 ) . is_none ( ) ,
768
+ }
708
769
}
709
770
710
771
#[ inline]
711
772
pub fn to_ascii_uppercase ( & self ) -> Wtf8Buf {
712
- Wtf8Buf { bytes : self . bytes . to_ascii_uppercase ( ) }
773
+ Wtf8Buf {
774
+ bytes : self . bytes . to_ascii_uppercase ( ) ,
775
+ is_known_utf8 : self . next_surrogate ( 0 ) . is_none ( ) ,
776
+ }
713
777
}
714
778
715
779
#[ inline]
0 commit comments