@@ -102,6 +102,62 @@ pub fn tokenize(mut input: &str) -> impl Iterator<Item = Token> + '_ {
102
102
} )
103
103
}
104
104
105
+ // See [UAX #31](http://unicode.org/reports/tr31) for definitions of these
106
+ // classes.
107
+
108
/// Returns `true` if `c` counts as whitespace under the Rust language definition.
///
/// The accepted characters are the Unicode `Pattern_White_Space` set. That set
/// is stable (it does not change between Unicode versions), which is why the
/// code points are hard-coded here.
pub fn is_whitespace(c: char) -> bool {
    match c {
        // ASCII controls: \t, \n, vertical tab, form feed, \r
        '\u{0009}'..='\u{000D}'
        // ASCII space
        | '\u{0020}'
        // NEXT LINE, from Latin-1
        | '\u{0085}'
        // Bidi markers: LEFT-TO-RIGHT MARK and RIGHT-TO-LEFT MARK
        | '\u{200E}'..='\u{200F}'
        // Dedicated Unicode separators: LINE SEPARATOR and PARAGRAPH SEPARATOR
        | '\u{2028}'..='\u{2029}' => true,
        _ => false,
    }
}
138
+
139
+ /// True if `c` is valid as a first character of an identifier.
140
+ pub fn is_id_start ( c : char ) -> bool {
141
+ // This is XID_Start OR '_' (which formally is not a XID_Start).
142
+ // We also add fast-path for ascii idents
143
+ ( 'a' <= c && c <= 'z' )
144
+ || ( 'A' <= c && c <= 'Z' )
145
+ || c == '_'
146
+ || ( c > '\x7f' && unicode_xid:: UnicodeXID :: is_xid_start ( c) )
147
+ }
148
+
149
+ /// True if `c` is valid as a non-first character of an identifier.
150
+ pub fn is_id_continue ( c : char ) -> bool {
151
+ // This is exactly XID_Continue.
152
+ // We also add fast-path for ascii idents
153
+ ( 'a' <= c && c <= 'z' )
154
+ || ( 'A' <= c && c <= 'Z' )
155
+ || ( '0' <= c && c <= '9' )
156
+ || c == '_'
157
+ || ( c > '\x7f' && unicode_xid:: UnicodeXID :: is_xid_continue ( c) )
158
+ }
159
+
160
+
105
161
impl Cursor < ' _ > {
106
162
fn advance_token ( & mut self ) -> Token {
107
163
let first_char = self . bump ( ) . unwrap ( ) ;
@@ -111,9 +167,9 @@ impl Cursor<'_> {
111
167
'*' => self . block_comment ( ) ,
112
168
_ => Slash ,
113
169
} ,
114
- c if character_properties :: is_whitespace ( c) => self . whitespace ( ) ,
170
+ c if is_whitespace ( c) => self . whitespace ( ) ,
115
171
'r' => match ( self . nth_char ( 0 ) , self . nth_char ( 1 ) ) {
116
- ( '#' , c1) if character_properties :: is_id_start ( c1) => self . raw_ident ( ) ,
172
+ ( '#' , c1) if is_id_start ( c1) => self . raw_ident ( ) ,
117
173
( '#' , _) | ( '"' , _) => {
118
174
let ( n_hashes, started, terminated) = self . raw_double_quoted_string ( ) ;
119
175
let suffix_start = self . len_consumed ( ) ;
@@ -158,7 +214,7 @@ impl Cursor<'_> {
158
214
}
159
215
_ => self . ident ( ) ,
160
216
} ,
161
- c if character_properties :: is_id_start ( c) => self . ident ( ) ,
217
+ c if is_id_start ( c) => self . ident ( ) ,
162
218
c @ '0' ..='9' => {
163
219
let literal_kind = self . number ( c) ;
164
220
let suffix_start = self . len_consumed ( ) ;
@@ -246,8 +302,8 @@ impl Cursor<'_> {
246
302
}
247
303
248
304
fn whitespace ( & mut self ) -> TokenKind {
249
- debug_assert ! ( character_properties :: is_whitespace( self . prev( ) ) ) ;
250
- while character_properties :: is_whitespace ( self . nth_char ( 0 ) ) {
305
+ debug_assert ! ( is_whitespace( self . prev( ) ) ) ;
306
+ while is_whitespace ( self . nth_char ( 0 ) ) {
251
307
self . bump ( ) ;
252
308
}
253
309
Whitespace
@@ -257,19 +313,19 @@ impl Cursor<'_> {
257
313
debug_assert ! (
258
314
self . prev( ) == 'r'
259
315
&& self . nth_char( 0 ) == '#'
260
- && character_properties :: is_id_start( self . nth_char( 1 ) )
316
+ && is_id_start( self . nth_char( 1 ) )
261
317
) ;
262
318
self . bump ( ) ;
263
319
self . bump ( ) ;
264
- while character_properties :: is_id_continue ( self . nth_char ( 0 ) ) {
320
+ while is_id_continue ( self . nth_char ( 0 ) ) {
265
321
self . bump ( ) ;
266
322
}
267
323
RawIdent
268
324
}
269
325
270
326
fn ident ( & mut self ) -> TokenKind {
271
- debug_assert ! ( character_properties :: is_id_start( self . prev( ) ) ) ;
272
- while character_properties :: is_id_continue ( self . nth_char ( 0 ) ) {
327
+ debug_assert ! ( is_id_start( self . prev( ) ) ) ;
328
+ while is_id_continue ( self . nth_char ( 0 ) ) {
273
329
self . bump ( ) ;
274
330
}
275
331
Ident
@@ -314,7 +370,7 @@ impl Cursor<'_> {
314
370
// integer literal followed by field/method access or a range pattern
315
371
// (`0..2` and `12.foo()`)
316
372
'.' if self . nth_char ( 1 ) != '.'
317
- && !character_properties :: is_id_start ( self . nth_char ( 1 ) ) =>
373
+ && !is_id_start ( self . nth_char ( 1 ) ) =>
318
374
{
319
375
// might have stuff after the ., and if it does, it needs to start
320
376
// with a number
@@ -344,15 +400,15 @@ impl Cursor<'_> {
344
400
fn lifetime_or_char ( & mut self ) -> TokenKind {
345
401
debug_assert ! ( self . prev( ) == '\'' ) ;
346
402
let mut starts_with_number = false ;
347
- if ( character_properties :: is_id_start ( self . nth_char ( 0 ) )
403
+ if ( is_id_start ( self . nth_char ( 0 ) )
348
404
|| self . nth_char ( 0 ) . is_digit ( 10 ) && {
349
405
starts_with_number = true ;
350
406
true
351
407
} )
352
408
&& self . nth_char ( 1 ) != '\''
353
409
{
354
410
self . bump ( ) ;
355
- while character_properties :: is_id_continue ( self . nth_char ( 0 ) ) {
411
+ while is_id_continue ( self . nth_char ( 0 ) ) {
356
412
self . bump ( ) ;
357
413
}
358
414
@@ -494,64 +550,13 @@ impl Cursor<'_> {
494
550
}
495
551
496
552
fn eat_literal_suffix ( & mut self ) {
497
- if !character_properties :: is_id_start ( self . nth_char ( 0 ) ) {
553
+ if !is_id_start ( self . nth_char ( 0 ) ) {
498
554
return ;
499
555
}
500
556
self . bump ( ) ;
501
557
502
- while character_properties :: is_id_continue ( self . nth_char ( 0 ) ) {
558
+ while is_id_continue ( self . nth_char ( 0 ) ) {
503
559
self . bump ( ) ;
504
560
}
505
561
}
506
562
}
507
-
508
- pub mod character_properties {
509
- // See [UAX #31](http://unicode.org/reports/tr31) for definitions of these
510
- // classes.
511
-
512
- // This is Pattern_White_Space.
513
- //
514
- // Note that this set is stable (ie, it doesn't change with different
515
- // Unicode versions), so it's ok to just hard-code the values.
516
- pub fn is_whitespace ( c : char ) -> bool {
517
- match c {
518
- // Usual ASCII suspects
519
- | '\u{0009}' // \t
520
- | '\u{000A}' // \n
521
- | '\u{000B}' // vertical tab
522
- | '\u{000C}' // form feed
523
- | '\u{000D}' // \r
524
- | '\u{0020}' // space
525
-
526
- // NEXT LINE from latin1
527
- | '\u{0085}'
528
-
529
- // Bidi markers
530
- | '\u{200E}' // LEFT-TO-RIGHT MARK
531
- | '\u{200F}' // RIGHT-TO-LEFT MARK
532
-
533
- // Dedicated whitespace characters from Unicode
534
- | '\u{2028}' // LINE SEPARATOR
535
- | '\u{2029}' // PARAGRAPH SEPARATOR
536
- => true ,
537
- _ => false ,
538
- }
539
- }
540
-
541
- // This is XID_Start OR '_' (which formally is not a XID_Start).
542
- pub fn is_id_start ( c : char ) -> bool {
543
- ( 'a' <= c && c <= 'z' )
544
- || ( 'A' <= c && c <= 'Z' )
545
- || c == '_'
546
- || ( c > '\x7f' && unicode_xid:: UnicodeXID :: is_xid_start ( c) )
547
- }
548
-
549
- // This is XID_Continue.
550
- pub fn is_id_continue ( c : char ) -> bool {
551
- ( 'a' <= c && c <= 'z' )
552
- || ( 'A' <= c && c <= 'Z' )
553
- || ( '0' <= c && c <= '9' )
554
- || c == '_'
555
- || ( c > '\x7f' && unicode_xid:: UnicodeXID :: is_xid_continue ( c) )
556
- }
557
- }
0 commit comments