Skip to content

Commit 5e2c110

Browse files
committed
Auto merge of #60793 - Xanewok:raw-string-cleanup, r=petrochenkov
lexer: Disallow bare CR in raw byte strings. Handles bare CR ~but doesn't translate `\r\n` to `\n` in raw strings yet~ and translates CRLF to LF in raw strings. As a side note, I think it would be good to change the `unescape_` functions to return plain iterators to reduce some boilerplate (e.g. `has_error` could benefit from collecting `Result<T>` and aborting early on errors), but I will do that separately, unless I missed something here that prevents it. @matklad @petrochenkov thoughts?
2 parents 02564de + 630d5f3 commit 5e2c110

8 files changed

+180
-135
lines changed

src/libsyntax/parse/lexer/mod.rs

+58-100
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ impl<'a> StringReader<'a> {
130130
self.ch.is_none()
131131
}
132132

133-
fn fail_unterminated_raw_string(&self, pos: BytePos, hash_count: u16) {
133+
fn fail_unterminated_raw_string(&self, pos: BytePos, hash_count: u16) -> ! {
134134
let mut err = self.struct_span_fatal(pos, pos, "unterminated raw string");
135135
err.span_label(self.mk_sp(pos, pos), "unterminated raw string");
136136

@@ -292,15 +292,6 @@ impl<'a> StringReader<'a> {
292292
self.sess.span_diagnostic.struct_span_fatal(self.mk_sp(from_pos, to_pos), &m[..])
293293
}
294294

295-
/// Report a lexical error spanning [`from_pos`, `to_pos`), appending an
296-
/// escaped character to the error message
297-
fn err_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) {
298-
let mut m = m.to_string();
299-
m.push_str(": ");
300-
push_escaped_char(&mut m, c);
301-
self.err_span_(from_pos, to_pos, &m[..]);
302-
}
303-
304295
/// Advance peek_token to refer to the next token, and
305296
/// possibly update the interner.
306297
fn advance_token(&mut self) -> Result<(), ()> {
@@ -1070,7 +1061,13 @@ impl<'a> StringReader<'a> {
10701061
self.validate_byte_str_escape(start_with_quote);
10711062
(token::ByteStr, symbol)
10721063
},
1073-
Some('r') => self.scan_raw_byte_string(),
1064+
Some('r') => {
1065+
let (start, end, hash_count) = self.scan_raw_string();
1066+
let symbol = self.name_from_to(start, end);
1067+
self.validate_raw_byte_str_escape(start, end);
1068+
1069+
(token::ByteStrRaw(hash_count), symbol)
1070+
}
10741071
_ => unreachable!(), // Should have been a token::Ident above.
10751072
};
10761073
let suffix = self.scan_optional_raw_name();
@@ -1086,79 +1083,9 @@ impl<'a> StringReader<'a> {
10861083
Ok(TokenKind::lit(token::Str, symbol, suffix))
10871084
}
10881085
'r' => {
1089-
let start_bpos = self.pos;
1090-
self.bump();
1091-
let mut hash_count: u16 = 0;
1092-
while self.ch_is('#') {
1093-
if hash_count == 65535 {
1094-
let bpos = self.next_pos;
1095-
self.fatal_span_(start_bpos,
1096-
bpos,
1097-
"too many `#` symbols: raw strings may be \
1098-
delimited by up to 65535 `#` symbols").raise();
1099-
}
1100-
self.bump();
1101-
hash_count += 1;
1102-
}
1103-
1104-
if self.is_eof() {
1105-
self.fail_unterminated_raw_string(start_bpos, hash_count);
1106-
} else if !self.ch_is('"') {
1107-
let last_bpos = self.pos;
1108-
let curr_char = self.ch.unwrap();
1109-
self.fatal_span_char(start_bpos,
1110-
last_bpos,
1111-
"found invalid character; only `#` is allowed \
1112-
in raw string delimitation",
1113-
curr_char).raise();
1114-
}
1115-
self.bump();
1116-
let content_start_bpos = self.pos;
1117-
let mut content_end_bpos;
1118-
let mut valid = true;
1119-
'outer: loop {
1120-
if self.is_eof() {
1121-
self.fail_unterminated_raw_string(start_bpos, hash_count);
1122-
}
1123-
// if self.ch_is('"') {
1124-
// content_end_bpos = self.pos;
1125-
// for _ in 0..hash_count {
1126-
// self.bump();
1127-
// if !self.ch_is('#') {
1128-
// continue 'outer;
1129-
let c = self.ch.unwrap();
1130-
match c {
1131-
'"' => {
1132-
content_end_bpos = self.pos;
1133-
for _ in 0..hash_count {
1134-
self.bump();
1135-
if !self.ch_is('#') {
1136-
continue 'outer;
1137-
}
1138-
}
1139-
break;
1140-
}
1141-
'\r' => {
1142-
if !self.nextch_is('\n') {
1143-
let last_bpos = self.pos;
1144-
self.err_span_(start_bpos,
1145-
last_bpos,
1146-
"bare CR not allowed in raw string, use \\r \
1147-
instead");
1148-
valid = false;
1149-
}
1150-
}
1151-
_ => (),
1152-
}
1153-
self.bump();
1154-
}
1155-
1156-
self.bump();
1157-
let symbol = if valid {
1158-
self.name_from_to(content_start_bpos, content_end_bpos)
1159-
} else {
1160-
Symbol::intern("??")
1161-
};
1086+
let (start, end, hash_count) = self.scan_raw_string();
1087+
let symbol = self.name_from_to(start, end);
1088+
self.validate_raw_str_escape(start, end);
11621089
let suffix = self.scan_optional_raw_name();
11631090

11641091
Ok(TokenKind::lit(token::StrRaw(hash_count), symbol, suffix))
@@ -1315,16 +1242,18 @@ impl<'a> StringReader<'a> {
13151242
id
13161243
}
13171244

1318-
fn scan_raw_byte_string(&mut self) -> (token::LitKind, Symbol) {
1245+
/// Scans a raw (byte) string, returning the byte position range of `<literal>`
1246+
/// (excluding the quotes) along with the `#` character count in `(b)r##..."<literal>"##...`.
1247+
fn scan_raw_string(&mut self) -> (BytePos, BytePos, u16) {
13191248
let start_bpos = self.pos;
13201249
self.bump();
1321-
let mut hash_count = 0;
1250+
let mut hash_count: u16 = 0;
13221251
while self.ch_is('#') {
13231252
if hash_count == 65535 {
13241253
let bpos = self.next_pos;
13251254
self.fatal_span_(start_bpos,
13261255
bpos,
1327-
"too many `#` symbols: raw byte strings may be \
1256+
"too many `#` symbols: raw strings may be \
13281257
delimited by up to 65535 `#` symbols").raise();
13291258
}
13301259
self.bump();
@@ -1334,13 +1263,13 @@ impl<'a> StringReader<'a> {
13341263
if self.is_eof() {
13351264
self.fail_unterminated_raw_string(start_bpos, hash_count);
13361265
} else if !self.ch_is('"') {
1337-
let pos = self.pos;
1338-
let ch = self.ch.unwrap();
1266+
let last_bpos = self.pos;
1267+
let curr_char = self.ch.unwrap();
13391268
self.fatal_span_char(start_bpos,
1340-
pos,
1341-
"found invalid character; only `#` is allowed in raw \
1342-
string delimitation",
1343-
ch).raise();
1269+
last_bpos,
1270+
"found invalid character; only `#` is allowed \
1271+
in raw string delimitation",
1272+
curr_char).raise();
13441273
}
13451274
self.bump();
13461275
let content_start_bpos = self.pos;
@@ -1360,19 +1289,14 @@ impl<'a> StringReader<'a> {
13601289
}
13611290
break;
13621291
}
1363-
Some(c) => {
1364-
if c > '\x7F' {
1365-
let pos = self.pos;
1366-
self.err_span_char(pos, pos, "raw byte string must be ASCII", c);
1367-
}
1368-
}
1292+
_ => (),
13691293
}
13701294
self.bump();
13711295
}
13721296

13731297
self.bump();
13741298

1375-
(token::ByteStrRaw(hash_count), self.name_from_to(content_start_bpos, content_end_bpos))
1299+
(content_start_bpos, content_end_bpos, hash_count)
13761300
}
13771301

13781302
fn validate_char_escape(&self, start_with_quote: BytePos) {
@@ -1422,6 +1346,40 @@ impl<'a> StringReader<'a> {
14221346
});
14231347
}
14241348

1349+
fn validate_raw_str_escape(&self, content_start: BytePos, content_end: BytePos) {
1350+
self.with_str_from_to(content_start, content_end, |lit: &str| {
1351+
unescape::unescape_raw_str(lit, &mut |range, c| {
1352+
if let Err(err) = c {
1353+
emit_unescape_error(
1354+
&self.sess.span_diagnostic,
1355+
lit,
1356+
self.mk_sp(content_start - BytePos(1), content_end + BytePos(1)),
1357+
unescape::Mode::Str,
1358+
range,
1359+
err,
1360+
)
1361+
}
1362+
})
1363+
});
1364+
}
1365+
1366+
fn validate_raw_byte_str_escape(&self, content_start: BytePos, content_end: BytePos) {
1367+
self.with_str_from_to(content_start, content_end, |lit: &str| {
1368+
unescape::unescape_raw_byte_str(lit, &mut |range, c| {
1369+
if let Err(err) = c {
1370+
emit_unescape_error(
1371+
&self.sess.span_diagnostic,
1372+
lit,
1373+
self.mk_sp(content_start - BytePos(1), content_end + BytePos(1)),
1374+
unescape::Mode::ByteStr,
1375+
range,
1376+
err,
1377+
)
1378+
}
1379+
})
1380+
});
1381+
}
1382+
14251383
fn validate_byte_str_escape(&self, start_with_quote: BytePos) {
14261384
self.with_str_from_to(start_with_quote + BytePos(1), self.pos - BytePos(1), |lit| {
14271385
unescape::unescape_byte_str(lit, &mut |range, c| {

src/libsyntax/parse/literal.rs

+34-26
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,9 @@ use crate::ast::{self, Lit, LitKind};
44
use crate::parse::parser::Parser;
55
use crate::parse::PResult;
66
use crate::parse::token::{self, Token, TokenKind};
7-
use crate::parse::unescape::{unescape_str, unescape_char, unescape_byte_str, unescape_byte};
7+
use crate::parse::unescape::{unescape_char, unescape_byte};
8+
use crate::parse::unescape::{unescape_str, unescape_byte_str};
9+
use crate::parse::unescape::{unescape_raw_str, unescape_raw_byte_str};
810
use crate::print::pprust;
911
use crate::symbol::{kw, sym, Symbol};
1012
use crate::tokenstream::{TokenStream, TokenTree};
@@ -141,7 +143,17 @@ impl LitKind {
141143
// Ditto.
142144
let s = symbol.as_str();
143145
let symbol = if s.contains('\r') {
144-
Symbol::intern(&raw_str_lit(&s))
146+
let mut buf = String::with_capacity(s.len());
147+
let mut error = Ok(());
148+
unescape_raw_str(&s, &mut |_, unescaped_char| {
149+
match unescaped_char {
150+
Ok(c) => buf.push(c),
151+
Err(_) => error = Err(LitError::LexerError),
152+
}
153+
});
154+
error?;
155+
buf.shrink_to_fit();
156+
Symbol::intern(&buf)
145157
} else {
146158
symbol
147159
};
@@ -161,7 +173,26 @@ impl LitKind {
161173
buf.shrink_to_fit();
162174
LitKind::ByteStr(Lrc::new(buf))
163175
}
164-
token::ByteStrRaw(_) => LitKind::ByteStr(Lrc::new(symbol.to_string().into_bytes())),
176+
token::ByteStrRaw(_) => {
177+
let s = symbol.as_str();
178+
let bytes = if s.contains('\r') {
179+
let mut buf = Vec::with_capacity(s.len());
180+
let mut error = Ok(());
181+
unescape_raw_byte_str(&s, &mut |_, unescaped_byte| {
182+
match unescaped_byte {
183+
Ok(c) => buf.push(c),
184+
Err(_) => error = Err(LitError::LexerError),
185+
}
186+
});
187+
error?;
188+
buf.shrink_to_fit();
189+
buf
190+
} else {
191+
symbol.to_string().into_bytes()
192+
};
193+
194+
LitKind::ByteStr(Lrc::new(bytes))
195+
},
165196
token::Err => LitKind::Err(symbol),
166197
})
167198
}
@@ -353,29 +384,6 @@ crate fn expect_no_suffix(diag: &Handler, sp: Span, kind: &str, suffix: Option<S
353384
}
354385
}
355386

356-
/// Parses a string representing a raw string literal into its final form. The
357-
/// only operation this does is convert embedded CRLF into a single LF.
358-
fn raw_str_lit(lit: &str) -> String {
359-
debug!("raw_str_lit: {:?}", lit);
360-
let mut res = String::with_capacity(lit.len());
361-
362-
let mut chars = lit.chars().peekable();
363-
while let Some(c) = chars.next() {
364-
if c == '\r' {
365-
if *chars.peek().unwrap() != '\n' {
366-
panic!("lexer accepted bare CR");
367-
}
368-
chars.next();
369-
res.push('\n');
370-
} else {
371-
res.push(c);
372-
}
373-
}
374-
375-
res.shrink_to_fit();
376-
res
377-
}
378-
379387
// Checks if `s` looks like i32 or u1234 etc.
380388
fn looks_like_width_suffix(first_chars: &[char], s: &str) -> bool {
381389
s.len() > 1 && s.starts_with(first_chars) && s[1..].chars().all(|c| c.is_ascii_digit())

0 commit comments

Comments
 (0)