Skip to content

Commit 1f147a2

Browse files
author
Julian Wollersberger
committed
Replace nth_char(0) with next() in cursor.first()
and optimize the iterator returned by `tokenize()`. This improves lexer performance by 35%
1 parent 72d6606 commit 1f147a2

File tree

2 files changed

+28
-25
lines changed

2 files changed

+28
-25
lines changed

compiler/rustc_lexer/src/cursor.rs

+21-12
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,11 @@ use std::str::Chars;
22

33
/// Peekable iterator over a char sequence.
44
///
5-
/// Next characters can be peeked via `nth_char` method,
5+
/// Next characters can be peeked via `first` method,
66
/// and position can be shifted forward via `bump` method.
77
pub(crate) struct Cursor<'a> {
88
initial_len: usize,
9+
/// Iterator over chars. Slightly faster than a &str.
910
chars: Chars<'a>,
1011
#[cfg(debug_assertions)]
1112
prev: char,
@@ -37,22 +38,21 @@ impl<'a> Cursor<'a> {
3738
}
3839
}
3940

40-
/// Returns nth character relative to the current cursor position.
41+
/// Peeks the next symbol from the input stream without consuming it.
4142
/// If requested position doesn't exist, `EOF_CHAR` is returned.
4243
/// However, getting `EOF_CHAR` doesn't always mean actual end of file,
4344
/// it should be checked with `is_eof` method.
44-
fn nth_char(&self, n: usize) -> char {
45-
self.chars().nth(n).unwrap_or(EOF_CHAR)
46-
}
47-
48-
/// Peeks the next symbol from the input stream without consuming it.
4945
pub(crate) fn first(&self) -> char {
50-
self.nth_char(0)
46+
// `.next()` optimizes better than `.nth(0)`
47+
self.chars.clone().next().unwrap_or(EOF_CHAR)
5148
}
5249

5350
/// Peeks the second symbol from the input stream without consuming it.
5451
pub(crate) fn second(&self) -> char {
55-
self.nth_char(1)
52+
// `.next()` optimizes better than `.nth(1)`
53+
let mut iter = self.chars.clone();
54+
iter.next();
55+
iter.next().unwrap_or(EOF_CHAR)
5656
}
5757

5858
/// Checks if there is nothing more to consume.
@@ -65,9 +65,9 @@ impl<'a> Cursor<'a> {
6565
self.initial_len - self.chars.as_str().len()
6666
}
6767

68-
/// Returns a `Chars` iterator over the remaining characters.
69-
fn chars(&self) -> Chars<'a> {
70-
self.chars.clone()
68+
/// Resets the number of bytes consumed to 0.
69+
pub(crate) fn reset_len_consumed(&mut self) {
70+
self.initial_len = self.chars.as_str().len();
7171
}
7272

7373
/// Moves to the next character.
@@ -81,4 +81,13 @@ impl<'a> Cursor<'a> {
8181

8282
Some(c)
8383
}
84+
85+
/// Eats symbols while predicate returns true or until the end of file is reached.
86+
pub(crate) fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
87+
// It was tried making optimized version of this for eg. line comments, but
88+
// LLVM can inline all of this and compile it down to fast iteration over bytes.
89+
while predicate(self.first()) && !self.is_eof() {
90+
self.bump();
91+
}
92+
}
8493
}

compiler/rustc_lexer/src/lib.rs

+7-13
Original file line numberDiff line numberDiff line change
@@ -225,14 +225,15 @@ pub fn first_token(input: &str) -> Token {
225225
}
226226

227227
/// Creates an iterator that produces tokens from the input string.
228-
pub fn tokenize(mut input: &str) -> impl Iterator<Item = Token> + '_ {
228+
pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ {
229+
let mut cursor = Cursor::new(input);
229230
std::iter::from_fn(move || {
230-
if input.is_empty() {
231-
return None;
231+
if cursor.is_eof() {
232+
None
233+
} else {
234+
cursor.reset_len_consumed();
235+
Some(cursor.advance_token())
232236
}
233-
let token = first_token(input);
234-
input = &input[token.len..];
235-
Some(token)
236237
})
237238
}
238239

@@ -808,11 +809,4 @@ impl Cursor<'_> {
808809

809810
self.eat_while(is_id_continue);
810811
}
811-
812-
/// Eats symbols while predicate returns true or until the end of file is reached.
813-
fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
814-
while predicate(self.first()) && !self.is_eof() {
815-
self.bump();
816-
}
817-
}
818812
}

0 commit comments

Comments
 (0)