Skip to content

Commit ec44d48

Browse files
authored
Rollup merge of #94316 - nnethercote:improve-string-literal-unescaping, r=petrochenkov
Improve string literal unescaping. Some easy wins that affect a few popular crates. r? ```@matklad```
2 parents 9e7131a + 44308dc commit ec44d48

File tree

3 files changed

+45
-40
lines changed

3 files changed

+45
-40
lines changed

compiler/rustc_ast/src/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#![feature(min_specialization)]
1717
#![recursion_limit = "256"]
1818
#![feature(slice_internals)]
19+
#![feature(stmt_expr_attributes)]
1920

2021
#[macro_use]
2122
extern crate rustc_macros;

compiler/rustc_ast/src/util/literal.rs

+22-17
Original file line numberDiff line numberDiff line change
@@ -56,25 +56,30 @@ impl LitKind {
5656
// new symbol because the string in the LitKind is different to the
5757
// string in the token.
5858
let s = symbol.as_str();
59-
let symbol =
60-
if s.contains(&['\\', '\r']) {
61-
let mut buf = String::with_capacity(s.len());
62-
let mut error = Ok(());
63-
unescape_literal(&s, Mode::Str, &mut |_, unescaped_char| {
64-
match unescaped_char {
65-
Ok(c) => buf.push(c),
66-
Err(err) => {
67-
if err.is_fatal() {
68-
error = Err(LitError::LexerError);
69-
}
59+
let symbol = if s.contains(&['\\', '\r']) {
60+
let mut buf = String::with_capacity(s.len());
61+
let mut error = Ok(());
62+
// Force-inlining here is aggressive but the closure is
63+
// called on every char in the string, so it can be
64+
// hot in programs with many long strings.
65+
unescape_literal(
66+
&s,
67+
Mode::Str,
68+
&mut #[inline(always)]
69+
|_, unescaped_char| match unescaped_char {
70+
Ok(c) => buf.push(c),
71+
Err(err) => {
72+
if err.is_fatal() {
73+
error = Err(LitError::LexerError);
7074
}
7175
}
72-
});
73-
error?;
74-
Symbol::intern(&buf)
75-
} else {
76-
symbol
77-
};
76+
},
77+
);
78+
error?;
79+
Symbol::intern(&buf)
80+
} else {
81+
symbol
82+
};
7883
LitKind::Str(symbol, ast::StrStyle::Cooked)
7984
}
8085
token::StrRaw(n) => {

compiler/rustc_lexer/src/unescape.rs

+22-23
Original file line numberDiff line numberDiff line change
@@ -159,26 +159,8 @@ impl Mode {
159159
}
160160
}
161161

162-
fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
163-
if first_char != '\\' {
164-
// Previous character was not a slash, and we don't expect it to be
165-
// an escape-only character.
166-
return match first_char {
167-
'\t' | '\n' => Err(EscapeError::EscapeOnlyChar),
168-
'\r' => Err(EscapeError::BareCarriageReturn),
169-
'\'' if mode.in_single_quotes() => Err(EscapeError::EscapeOnlyChar),
170-
'"' if mode.in_double_quotes() => Err(EscapeError::EscapeOnlyChar),
171-
_ => {
172-
if mode.is_bytes() && !first_char.is_ascii() {
173-
// Byte literal can't be a non-ascii character.
174-
return Err(EscapeError::NonAsciiCharInByte);
175-
}
176-
Ok(first_char)
177-
}
178-
};
179-
}
180-
181-
// Previous character is '\\', try to unescape it.
162+
fn scan_escape(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
163+
// Previous character was '\\', unescape what follows.
182164

183165
let second_char = chars.next().ok_or(EscapeError::LoneSlash)?;
184166

@@ -270,9 +252,24 @@ fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result<ch
270252
Ok(res)
271253
}
272254

255+
#[inline]
256+
fn ascii_check(first_char: char, mode: Mode) -> Result<char, EscapeError> {
257+
if mode.is_bytes() && !first_char.is_ascii() {
258+
// Byte literal can't be a non-ascii character.
259+
Err(EscapeError::NonAsciiCharInByte)
260+
} else {
261+
Ok(first_char)
262+
}
263+
}
264+
273265
fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
274266
let first_char = chars.next().ok_or(EscapeError::ZeroChars)?;
275-
let res = scan_escape(first_char, chars, mode)?;
267+
let res = match first_char {
268+
'\\' => scan_escape(chars, mode),
269+
'\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar),
270+
'\r' => Err(EscapeError::BareCarriageReturn),
271+
_ => ascii_check(first_char, mode),
272+
}?;
276273
if chars.next().is_some() {
277274
return Err(EscapeError::MoreThanOneChar);
278275
}
@@ -303,12 +300,14 @@ where
303300
skip_ascii_whitespace(&mut chars, start, callback);
304301
continue;
305302
}
306-
_ => scan_escape(first_char, &mut chars, mode),
303+
_ => scan_escape(&mut chars, mode),
307304
}
308305
}
309306
'\n' => Ok('\n'),
310307
'\t' => Ok('\t'),
311-
_ => scan_escape(first_char, &mut chars, mode),
308+
'"' => Err(EscapeError::EscapeOnlyChar),
309+
'\r' => Err(EscapeError::BareCarriageReturn),
310+
_ => ascii_check(first_char, mode),
312311
};
313312
let end = initial_len - chars.as_str().len();
314313
callback(start..end, unescaped_char);

0 commit comments

Comments
 (0)