From 1cbecc38ac7b03dfdc3a75e784473f08bda5e994 Mon Sep 17 00:00:00 2001 From: Marijn Schouten Date: Fri, 7 Mar 2025 11:17:39 +0000 Subject: [PATCH 1/2] Separate the unescape functions for string, byte string and C string, but avoid duplicating code via macro_rules. Also plays with NonZero, since C strings cannot contain null bytes, which can be captured in the type system. --- compiler/rustc_ast/src/util/literal.rs | 17 +- compiler/rustc_lexer/src/unescape.rs | 515 +++++++++--------- compiler/rustc_lexer/src/unescape/tests.rs | 2 +- compiler/rustc_parse/src/lexer/mod.rs | 6 +- library/core/src/num/niche_types.rs | 2 + library/core/src/num/nonzero.rs | 1 + .../crates/parser/src/lexed_str.rs | 139 +++-- .../crates/syntax/src/ast/token_ext.rs | 26 +- .../crates/syntax/src/validation.rs | 26 +- 9 files changed, 354 insertions(+), 380 deletions(-) diff --git a/compiler/rustc_ast/src/util/literal.rs b/compiler/rustc_ast/src/util/literal.rs index 6896ac723fa58..dc66a42dc1c6a 100644 --- a/compiler/rustc_ast/src/util/literal.rs +++ b/compiler/rustc_ast/src/util/literal.rs @@ -3,7 +3,7 @@ use std::{ascii, fmt, str}; use rustc_lexer::unescape::{ - MixedUnit, Mode, byte_from_char, unescape_byte, unescape_char, unescape_mixed, unescape_unicode, + MixedUnit, unescape_byte, unescape_byte_str, unescape_char, unescape_cstr, unescape_str, }; use rustc_span::{Span, Symbol, kw, sym}; use tracing::debug; @@ -87,11 +87,10 @@ impl LitKind { // Force-inlining here is aggressive but the closure is // called on every char in the string, so it can be hot in // programs with many long strings containing escapes. - unescape_unicode( + unescape_str( s, - Mode::Str, &mut #[inline(always)] - |_, c| match c { + |_, res| match res { Ok(c) => buf.push(c), Err(err) => { assert!(!err.is_fatal(), "failed to unescape string literal") @@ -111,8 +110,8 @@ impl LitKind { token::ByteStr => { let s = symbol.as_str(); let mut buf = Vec::with_capacity(s.len()); - unescape_unicode(s, Mode::ByteStr, &mut |_, c| match c { - Ok(c) => buf.push(byte_from_char(c)), + unescape_byte_str(s, &mut |_, res| match res { + Ok(b) => buf.push(b), Err(err) => { assert!(!err.is_fatal(), "failed to unescape string literal") } @@ -128,11 +127,11 @@ impl LitKind { token::CStr => { let s = symbol.as_str(); let mut buf = Vec::with_capacity(s.len()); - unescape_mixed(s, Mode::CStr, &mut |_span, c| match c { + unescape_cstr(s, &mut |_span, c| match c { Ok(MixedUnit::Char(c)) => { - buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes()) + buf.extend_from_slice(c.get().encode_utf8(&mut [0; 4]).as_bytes()) } - Ok(MixedUnit::HighByte(b)) => buf.push(b), + Ok(MixedUnit::HighByte(b)) => buf.push(b.get()), Err(err) => { assert!(!err.is_fatal(), "failed to unescape C string literal") } diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs index d6ea4249247f3..5c7d1106f568f 100644 --- a/compiler/rustc_lexer/src/unescape.rs +++ b/compiler/rustc_lexer/src/unescape.rs @@ -1,6 +1,7 @@ //! Utilities for validating string and char literals and turning them into //! values they represent. +use std::num::NonZero; use std::ops::Range; use std::str::Chars; @@ -80,203 +81,246 @@ impl EscapeError { } } -/// Takes the contents of a unicode-only (non-mixed-utf8) literal (without -/// quotes) and produces a sequence of escaped characters or errors. -/// -/// Values are returned by invoking `callback`. For `Char` and `Byte` modes, -/// the callback will be called exactly once. -pub fn unescape_unicode(src: &str, mode: Mode, callback: &mut F) -where - F: FnMut(Range, Result), -{ - match mode { - Char | Byte => { - let mut chars = src.chars(); - let res = unescape_char_or_byte(&mut chars, mode); - callback(0..(src.len() - chars.as_str().len()), res); - } - Str | ByteStr => unescape_non_raw_common(src, mode, callback), - RawStr | RawByteStr => check_raw_common(src, mode, callback), - RawCStr => check_raw_common(src, mode, &mut |r, mut result| { - if let Ok('\0') = result { - result = Err(EscapeError::NulInCStr); - } - callback(r, result) - }), - CStr => unreachable!(), - } -} - /// Used for mixed utf8 string literals, i.e. those that allow both unicode /// chars and high bytes. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum MixedUnit { - /// Used for ASCII chars (written directly or via `\x00`..`\x7f` escapes) + /// Used for ASCII chars (written directly or via `\x01`..`\x7f` escapes) /// and Unicode chars (written directly or via `\u` escapes). /// /// For example, if '¥' appears in a string it is represented here as /// `MixedUnit::Char('¥')`, and it will be appended to the relevant byte /// string as the two-byte UTF-8 sequence `[0xc2, 0xa5]` - Char(char), + Char(NonZero), /// Used for high bytes (`\x80`..`\xff`). /// /// For example, if `\xa5` appears in a string it is represented here as /// `MixedUnit::HighByte(0xa5)`, and it will be appended to the relevant /// byte string as the single byte `0xa5`. - HighByte(u8), + HighByte(NonZero), } -impl From for MixedUnit { - fn from(c: char) -> Self { +impl From> for MixedUnit { + fn from(c: NonZero) -> Self { MixedUnit::Char(c) } } -impl From for MixedUnit { - fn from(n: u8) -> Self { - if n.is_ascii() { MixedUnit::Char(n as char) } else { MixedUnit::HighByte(n) } +impl From> for MixedUnit { + fn from(byte: NonZero) -> Self { + if byte.get().is_ascii() { + MixedUnit::Char(NonZero::new(byte.get() as char).unwrap()) + } else { + MixedUnit::HighByte(byte) + } } } +impl TryFrom for MixedUnit { + type Error = EscapeError; -/// Takes the contents of a mixed-utf8 literal (without quotes) and produces -/// a sequence of escaped characters or errors. -/// -/// Values are returned by invoking `callback`. -pub fn unescape_mixed(src: &str, mode: Mode, callback: &mut F) -where - F: FnMut(Range, Result), -{ - match mode { - CStr => unescape_non_raw_common(src, mode, &mut |r, mut result| { - if let Ok(MixedUnit::Char('\0')) = result { - result = Err(EscapeError::NulInCStr); - } - callback(r, result) - }), - Char | Byte | Str | RawStr | ByteStr | RawByteStr | RawCStr => unreachable!(), + fn try_from(c: char) -> Result { + NonZero::new(c).map(MixedUnit::Char).ok_or(EscapeError::NulInCStr) } } -/// Takes a contents of a char literal (without quotes), and returns an -/// unescaped char or an error. -pub fn unescape_char(src: &str) -> Result { - unescape_char_or_byte(&mut src.chars(), Char) +impl TryFrom for MixedUnit { + type Error = EscapeError; + + fn try_from(byte: u8) -> Result { + NonZero::::new(byte).map(From::from).ok_or(EscapeError::NulInCStr) + } } -/// Takes a contents of a byte literal (without quotes), and returns an -/// unescaped byte or an error. -pub fn unescape_byte(src: &str) -> Result { - unescape_char_or_byte(&mut src.chars(), Byte).map(byte_from_char) +macro_rules! check { + ($string_ty:literal + ($check:ident: $char2unit:expr => $unit:ty)) => { + #[doc = concat!("Take the contents of a raw ", stringify!($string_ty), + " literal (without quotes) and produce a sequence of results of ", + stringify!($unit_ty), " or error (returned via `callback`).", + "\nNB: Raw strings don't do any unescaping, but do produce errors on bare CR.")] + pub fn $check(src: &str, callback: &mut impl FnMut(Range, Result<$unit, EscapeError>)) + { + src.char_indices().for_each(|(pos, c)| { + callback( + pos..pos + c.len_utf8(), + if c == '\r' { Err(EscapeError::BareCarriageReturnInRawString) } else { $char2unit(c) }, + ); + }); + } + }; } -/// What kind of literal do we parse. -#[derive(Debug, Clone, Copy, PartialEq)] -pub enum Mode { - Char, +check!("string" (check_raw_str: Ok => char)); +check!("byte string" (check_raw_byte_str: ascii_char_to_byte => u8)); +check!("C string" (check_raw_cstr: |c| NonZero::::new(c).ok_or(EscapeError::NulInCStr) => NonZero)); + +macro_rules! unescape { + ($string_ty:literal + ($unescape:ident: $char2unit:expr => $unit:ty) + $scan_escape:ident) => { + #[doc = concat!("Take the contents of a ", stringify!($string_ty), + " literal (without quotes) and produce a sequence of results of escaped ", + stringify!($unit_ty), " or error (returned via `callback`).")] + pub fn $unescape(src: &str, callback: &mut impl FnMut(Range, Result<$unit, EscapeError>)) + { + let mut chars = src.chars(); + while let Some(c) = chars.next() { + let start = src.len() - chars.as_str().len() - c.len_utf8(); + let res = match c { + '\\' => { + if let Some(b'\n') = chars.as_str().as_bytes().first() { + let _ = chars.next(); + // skip whitespace for backslash newline, see [Rust language reference] + // (https://doc.rust-lang.org/reference/tokens.html#string-literals). + let mut callback_err = |range, err| callback(range, Err(err)); + skip_ascii_whitespace(&mut chars, start, &mut callback_err); + continue; + } else { + $scan_escape(&mut chars) + } + } + '"' => Err(EscapeError::EscapeOnlyChar), + '\r' => Err(EscapeError::BareCarriageReturn), + c => $char2unit(c), + }; + let end = src.len() - chars.as_str().len(); + callback(start..end, res); + } + } + }; +} - Byte, +unescape!("string" (unescape_str: Ok => char) scan_escape_str); +unescape!("byte string" (unescape_byte_str: ascii_char_to_byte => u8) scan_escape_byte_str); +unescape!("C string" (unescape_cstr: TryFrom::try_from => MixedUnit) scan_escape_c_str); - Str, - RawStr, +/// Skip ASCII whitespace, except for the formfeed character +/// (see [this issue](https://github.com/rust-lang/rust/issues/136600)). +/// Warns on unescaped newline and following non-ASCII whitespace. +fn skip_ascii_whitespace(chars: &mut Chars<'_>, start: usize, callback: &mut F) +where + F: FnMut(Range, EscapeError), +{ + let rest = chars.as_str(); + let first_non_space = rest + .bytes() + .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r') + .unwrap_or(rest.len()); + let (space, rest) = rest.split_at(first_non_space); + // backslash newline adds 2 bytes + let end = start + 2 + first_non_space; + if space.contains('\n') { + callback(start..end, EscapeError::MultipleSkippedLinesWarning); + } + *chars = rest.chars(); + if let Some(c) = chars.clone().next() { + if c.is_whitespace() { + // for error reporting, include the character that was not skipped in the span + callback(start..end + c.len_utf8(), EscapeError::UnskippedWhitespaceWarning); + } + } +} - ByteStr, - RawByteStr, +/// Takes the contents of a char literal (without quotes), +/// and returns an unescaped char or an error. +pub fn unescape_char(src: &str) -> Result { + unescape_char_iter(&mut src.chars()) +} - CStr, - RawCStr, +/// Takes the contents of a byte literal (without quotes), +/// and returns an unescaped byte or an error. +pub fn unescape_byte(src: &str) -> Result { + unescape_byte_iter(&mut src.chars()) } -impl Mode { - pub fn in_double_quotes(self) -> bool { - match self { - Str | RawStr | ByteStr | RawByteStr | CStr | RawCStr => true, - Char | Byte => false, +macro_rules! unescape_iter { + (($unescape:ident: $char2unit:expr => $unit:ty) $scan_escape:ident) => { + fn $unescape(chars: &mut Chars<'_>) -> Result<$unit, EscapeError> { + let res = match chars.next().ok_or(EscapeError::ZeroChars)? { + '\\' => $scan_escape(chars), + '\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar), + '\r' => Err(EscapeError::BareCarriageReturn), + c => $char2unit(c), + }?; + if chars.next().is_some() { + return Err(EscapeError::MoreThanOneChar); + } + Ok(res) } - } + }; +} - /// Are `\x80`..`\xff` allowed? - fn allow_high_bytes(self) -> bool { - match self { - Char | Str => false, - Byte | ByteStr | CStr => true, - RawStr | RawByteStr | RawCStr => unreachable!(), - } - } +unescape_iter!((unescape_char_iter: Ok => char) scan_escape_str); +unescape_iter!((unescape_byte_iter: ascii_char_to_byte => u8) scan_escape_byte_str); - /// Are unicode (non-ASCII) chars allowed? - #[inline] - fn allow_unicode_chars(self) -> bool { - match self { - Byte | ByteStr | RawByteStr => false, - Char | Str | RawStr | CStr | RawCStr => true, +macro_rules! scan_escape { + ($scan:ident: $zero_result:expr, $from_hex:expr, $from_unicode:expr => $unit:ty) => { + fn $scan(chars: &mut Chars<'_>) -> Result<$unit, EscapeError> { + // Previous character was '\\', unescape what follows. + let c = chars.next().ok_or(EscapeError::LoneSlash)?; + if c == '0' { + $zero_result + } else { + simple_escape(c).map(|b| b.get().try_into().unwrap()).or_else(|c| match c { + 'x' => $from_hex(hex_escape(chars)?), + 'u' => $from_unicode({ + let value = unicode_escape(chars)?; + if value > char::MAX as u32 { + Err(EscapeError::OutOfRangeUnicodeEscape) + } else { + char::from_u32(value).ok_or(EscapeError::LoneSurrogateUnicodeEscape) + } + }), + _ => Err(EscapeError::InvalidEscape), + }) + } } - } + }; +} - /// Are unicode escapes (`\u`) allowed? - fn allow_unicode_escapes(self) -> bool { - match self { - Byte | ByteStr => false, - Char | Str | CStr => true, - RawByteStr | RawStr | RawCStr => unreachable!(), - } - } +scan_escape!(scan_escape_str: Ok('\0'), char_from_byte, |id| id => char); +scan_escape!(scan_escape_byte_str: Ok(b'\0'), Ok, |_| Err(EscapeError::UnicodeEscapeInByte) => u8); +scan_escape!(scan_escape_c_str: Err(EscapeError::NulInCStr), TryInto::try_into, |r: Result| r?.try_into() => MixedUnit); - pub fn prefix_noraw(self) -> &'static str { - match self { - Char | Str | RawStr => "", - Byte | ByteStr | RawByteStr => "b", - CStr | RawCStr => "c", - } - } +fn char_from_byte(b: u8) -> Result { + if b.is_ascii() { Ok(b as char) } else { Err(EscapeError::OutOfRangeHexEscape) } } -fn scan_escape + From>( - chars: &mut Chars<'_>, - mode: Mode, -) -> Result { +/// Parse the character of an ASCII escape (except nul) without the leading backslash. +fn simple_escape(c: char) -> Result, char> { // Previous character was '\\', unescape what follows. - let res: char = match chars.next().ok_or(EscapeError::LoneSlash)? { - '"' => '"', - 'n' => '\n', - 'r' => '\r', - 't' => '\t', - '\\' => '\\', - '\'' => '\'', - '0' => '\0', - 'x' => { - // Parse hexadecimal character code. - - let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?; - let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; - - let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?; - let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; - - let value = (hi * 16 + lo) as u8; - - return if !mode.allow_high_bytes() && !value.is_ascii() { - Err(EscapeError::OutOfRangeHexEscape) - } else { - // This may be a high byte, but that will only happen if `T` is - // `MixedUnit`, because of the `allow_high_bytes` check above. - Ok(T::from(value)) - }; - } - 'u' => return scan_unicode(chars, mode.allow_unicode_escapes()).map(T::from), - _ => return Err(EscapeError::InvalidEscape), - }; - Ok(T::from(res)) + Ok(NonZero::new(match c { + '"' => b'"', + 'n' => b'\n', + 'r' => b'\r', + 't' => b'\t', + '\\' => b'\\', + '\'' => b'\'', + _ => Err(c)?, + }) + .unwrap()) } -fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result { - // We've parsed '\u', now we have to parse '{..}'. +/// Parse the two hexadecimal characters of a hexadecimal escape without the leading r"\x". +fn hex_escape(chars: &mut impl Iterator) -> Result { + let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?; + let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; + + let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?; + let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; + + Ok((hi * 16 + lo) as u8) +} +/// Parse the braces with hexadecimal characters (and underscores) part of a unicode escape. +/// This r"{...}" normally comes after r"\u" and cannot start with an underscore. +fn unicode_escape(chars: &mut impl Iterator) -> Result { if chars.next() != Some('{') { return Err(EscapeError::NoBraceInUnicodeEscape); } // First character must be a hexadecimal digit. - let mut n_digits = 1; let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? { '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape), '}' => return Err(EscapeError::EmptyUnicodeEscape), @@ -285,28 +329,19 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result return Err(EscapeError::UnclosedUnicodeEscape), Some('_') => continue, Some('}') => { - if n_digits > 6 { - return Err(EscapeError::OverlongUnicodeEscape); - } - // Incorrect syntax has higher priority for error reporting // than unallowed value for a literal. - if !allow_unicode_escapes { - return Err(EscapeError::UnicodeEscapeInByte); - } - - break std::char::from_u32(value).ok_or({ - if value > 0x10FFFF { - EscapeError::OutOfRangeUnicodeEscape - } else { - EscapeError::LoneSurrogateUnicodeEscape - } - }); + return if n_digits > 6 { + Err(EscapeError::OverlongUnicodeEscape) + } else { + Ok(value) + }; } Some(c) => { let digit: u32 = c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?; @@ -321,118 +356,74 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result Result { - if allow_unicode_chars || c.is_ascii() { Ok(c) } else { Err(EscapeError::NonAsciiCharInByte) } -} - -fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result { - let c = chars.next().ok_or(EscapeError::ZeroChars)?; - let res = match c { - '\\' => scan_escape(chars, mode), - '\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar), - '\r' => Err(EscapeError::BareCarriageReturn), - _ => ascii_check(c, mode.allow_unicode_chars()), - }?; - if chars.next().is_some() { - return Err(EscapeError::MoreThanOneChar); - } - Ok(res) -} - -/// Takes a contents of a string literal (without quotes) and produces a -/// sequence of escaped characters or errors. -fn unescape_non_raw_common + From>(src: &str, mode: Mode, callback: &mut F) +/// Takes the contents of a unicode-only (non-mixed-utf8) literal (without quotes) +/// and produces a sequence of unescaped characters or errors, +/// which are returned by invoking `callback`. +/// +/// For `Char` and `Byte` modes, the callback will be called exactly once. +pub fn unescape_unicode(src: &str, mode: Mode, callback: &mut F) where - F: FnMut(Range, Result), + F: FnMut(Range, Result), { - let mut chars = src.chars(); - let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop - - // The `start` and `end` computation here is complicated because - // `skip_ascii_whitespace` makes us to skip over chars without counting - // them in the range computation. - while let Some(c) = chars.next() { - let start = src.len() - chars.as_str().len() - c.len_utf8(); - let res = match c { - '\\' => { - match chars.clone().next() { - Some('\n') => { - // Rust language specification requires us to skip whitespaces - // if unescaped '\' character is followed by '\n'. - // For details see [Rust language reference] - // (https://doc.rust-lang.org/reference/tokens.html#string-literals). - skip_ascii_whitespace(&mut chars, start, &mut |range, err| { - callback(range, Err(err)) - }); - continue; - } - _ => scan_escape::(&mut chars, mode), - } - } - '"' => Err(EscapeError::EscapeOnlyChar), - '\r' => Err(EscapeError::BareCarriageReturn), - _ => ascii_check(c, allow_unicode_chars).map(T::from), - }; - let end = src.len() - chars.as_str().len(); - callback(start..end, res); + let mut byte_callback = + |range, res: Result| callback(range, res.map(char::from)); + match mode { + Char => { + let mut chars = src.chars(); + let res = unescape_char_iter(&mut chars); + callback(0..(src.len() - chars.as_str().len()), res); + } + Byte => { + let mut chars = src.chars(); + let res = unescape_byte_iter(&mut chars).map(char::from); + callback(0..(src.len() - chars.as_str().len()), res); + } + Str => unescape_str(src, callback), + ByteStr => unescape_byte_str(src, &mut byte_callback), + RawStr => check_raw_str(src, callback), + RawByteStr => check_raw_byte_str(src, &mut byte_callback), + RawCStr => check_raw_cstr(src, &mut |r, res: Result, EscapeError>| { + callback(r, res.map(|c| c.get())) + }), + CStr => unreachable!(), } } -fn skip_ascii_whitespace(chars: &mut Chars<'_>, start: usize, callback: &mut F) -where - F: FnMut(Range, EscapeError), -{ - let tail = chars.as_str(); - let first_non_space = tail - .bytes() - .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r') - .unwrap_or(tail.len()); - if tail[1..first_non_space].contains('\n') { - // The +1 accounts for the escaping slash. - let end = start + first_non_space + 1; - callback(start..end, EscapeError::MultipleSkippedLinesWarning); - } - let tail = &tail[first_non_space..]; - if let Some(c) = tail.chars().next() { - if c.is_whitespace() { - // For error reporting, we would like the span to contain the character that was not - // skipped. The +1 is necessary to account for the leading \ that started the escape. - let end = start + first_non_space + c.len_utf8() + 1; - callback(start..end, EscapeError::UnskippedWhitespaceWarning); +/// What kind of literal do we parse. +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum Mode { + Char, + + Byte, + + Str, + RawStr, + + ByteStr, + RawByteStr, + + CStr, + RawCStr, +} + +impl Mode { + pub fn in_double_quotes(self) -> bool { + match self { + Str | RawStr | ByteStr | RawByteStr | CStr | RawCStr => true, + Char | Byte => false, } } - *chars = tail.chars(); -} -/// Takes a contents of a string literal (without quotes) and produces a -/// sequence of characters or errors. -/// NOTE: Raw strings do not perform any explicit character escaping, here we -/// only produce errors on bare CR. -fn check_raw_common(src: &str, mode: Mode, callback: &mut F) -where - F: FnMut(Range, Result), -{ - let mut chars = src.chars(); - let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop - - // The `start` and `end` computation here matches the one in - // `unescape_non_raw_common` for consistency, even though this function - // doesn't have to worry about skipping any chars. - while let Some(c) = chars.next() { - let start = src.len() - chars.as_str().len() - c.len_utf8(); - let res = match c { - '\r' => Err(EscapeError::BareCarriageReturnInRawString), - _ => ascii_check(c, allow_unicode_chars), - }; - let end = src.len() - chars.as_str().len(); - callback(start..end, res); + pub fn prefix_noraw(self) -> &'static str { + match self { + Char | Str | RawStr => "", + Byte | ByteStr | RawByteStr => "b", + CStr | RawCStr => "c", + } } } -#[inline] -pub fn byte_from_char(c: char) -> u8 { - let res = c as u32; - debug_assert!(res <= u8::MAX as u32, "guaranteed because of ByteStr"); - res as u8 +fn ascii_char_to_byte(c: char) -> Result { + // do NOT do: c.try_into().ok_or(EscapeError::NonAsciiCharInByte) + if c.is_ascii() { Ok(c as u8) } else { Err(EscapeError::NonAsciiCharInByte) } } diff --git a/compiler/rustc_lexer/src/unescape/tests.rs b/compiler/rustc_lexer/src/unescape/tests.rs index 5b99495f47581..831bb83f84192 100644 --- a/compiler/rustc_lexer/src/unescape/tests.rs +++ b/compiler/rustc_lexer/src/unescape/tests.rs @@ -244,7 +244,7 @@ fn test_unescape_byte_str_good() { unescape_unicode(literal_text, Mode::ByteStr, &mut |range, c| { if let Ok(b) = &mut buf { match c { - Ok(c) => b.push(byte_from_char(c)), + Ok(c) => b.push(c as u8), Err(e) => buf = Err((range, e)), } } diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index 1d17290e1c706..eb4ece2cd8e00 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -985,10 +985,8 @@ impl<'psess, 'src> Lexer<'psess, 'src> { prefix_len: u32, postfix_len: u32, ) -> (token::LitKind, Symbol) { - self.cook_common(kind, mode, start, end, prefix_len, postfix_len, |src, mode, callback| { - unescape::unescape_mixed(src, mode, &mut |span, result| { - callback(span, result.map(drop)) - }) + self.cook_common(kind, mode, start, end, prefix_len, postfix_len, |src, _mode, callback| { + unescape::unescape_cstr(src, &mut |span, result| callback(span, result.map(drop))) }) } } diff --git a/library/core/src/num/niche_types.rs b/library/core/src/num/niche_types.rs index 47ff4254e533b..b92561c9e356d 100644 --- a/library/core/src/num/niche_types.rs +++ b/library/core/src/num/niche_types.rs @@ -131,6 +131,8 @@ define_valid_range_type! { pub struct NonZeroI32Inner(i32 as u32 in 1..=0xffff_ffff); pub struct NonZeroI64Inner(i64 as u64 in 1..=0xffffffff_ffffffff); pub struct NonZeroI128Inner(i128 as u128 in 1..=0xffffffffffffffff_ffffffffffffffff); + + pub struct NonZeroCharInner(char as u32 in 1..=0x10ffff); } #[cfg(target_pointer_width = "16")] diff --git a/library/core/src/num/nonzero.rs b/library/core/src/num/nonzero.rs index 7585ec140e31e..2145812885d91 100644 --- a/library/core/src/num/nonzero.rs +++ b/library/core/src/num/nonzero.rs @@ -79,6 +79,7 @@ impl_zeroable_primitive!( NonZeroI64Inner(i64), NonZeroI128Inner(i128), NonZeroIsizeInner(isize), + NonZeroCharInner(char), ); /// A value that is known not to equal zero. diff --git a/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs b/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs index c97596d5097ec..09ac5b00b43bd 100644 --- a/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs +++ b/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs @@ -10,14 +10,13 @@ use std::ops; -use rustc_lexer::unescape::{EscapeError, Mode}; - -use crate::{ - Edition, - SyntaxKind::{self, *}, - T, +use rustc_lexer::unescape::{ + unescape_byte, unescape_byte_str, unescape_char, unescape_cstr, unescape_str, EscapeError, Mode, }; +use crate::SyntaxKind::{self, *}; +use crate::{Edition, T}; + pub struct LexedStr<'a> { text: &'a str, kind: Vec, @@ -149,14 +148,14 @@ impl<'a> Converter<'a> { self.res } - fn push(&mut self, kind: SyntaxKind, len: usize, err: Option<&str>) { + fn push(&mut self, kind: SyntaxKind, len: usize, errors: Vec) { self.res.push(kind, self.offset); self.offset += len; - if let Some(err) = err { - let token = self.res.len() as u32; - let msg = err.to_owned(); - self.res.error.push(LexError { msg, token }); + for msg in errors { + if !msg.is_empty() { + self.res.error.push(LexError { msg, token: self.res.len() as u32 }); + } } } @@ -165,14 +164,16 @@ impl<'a> Converter<'a> { // We drop some useful information here (see patterns with double dots `..`) // Storing that info in `SyntaxKind` is not possible due to its layout requirements of // being `u16` that come from `rowan::SyntaxKind`. - let mut err = ""; + let mut errors: Vec = vec![]; let syntax_kind = { match kind { rustc_lexer::TokenKind::LineComment { doc_style: _ } => COMMENT, rustc_lexer::TokenKind::BlockComment { doc_style: _, terminated } => { if !terminated { - err = "Missing trailing `*/` symbols to terminate the block comment"; + errors.push( + "Missing trailing `*/` symbols to terminate the block comment".into(), + ); } COMMENT } @@ -184,7 +185,7 @@ impl<'a> Converter<'a> { SyntaxKind::from_keyword(token_text, self.edition).unwrap_or(IDENT) } rustc_lexer::TokenKind::InvalidIdent => { - err = "Ident contains invalid characters"; + errors.push("Ident contains invalid characters".into()); IDENT } @@ -192,7 +193,7 @@ impl<'a> Converter<'a> { rustc_lexer::TokenKind::GuardedStrPrefix if self.edition.at_least_2024() => { // FIXME: rustc does something better for recovery. - err = "Invalid string literal (reserved syntax)"; + errors.push("Invalid string literal (reserved syntax)".into()); ERROR } rustc_lexer::TokenKind::GuardedStrPrefix => { @@ -208,12 +209,12 @@ impl<'a> Converter<'a> { rustc_lexer::TokenKind::Lifetime { starts_with_number } => { if *starts_with_number { - err = "Lifetime name cannot start with a number"; + errors.push("Lifetime name cannot start with a number".into()); } LIFETIME_IDENT } rustc_lexer::TokenKind::UnknownPrefixLifetime => { - err = "Unknown lifetime prefix"; + errors.push("Unknown lifetime prefix".into()); LIFETIME_IDENT } rustc_lexer::TokenKind::RawLifetime => LIFETIME_IDENT, @@ -248,119 +249,128 @@ impl<'a> Converter<'a> { rustc_lexer::TokenKind::Unknown => ERROR, rustc_lexer::TokenKind::UnknownPrefix if token_text == "builtin" => IDENT, rustc_lexer::TokenKind::UnknownPrefix => { - err = "unknown literal prefix"; + errors.push("unknown literal prefix".into()); IDENT } rustc_lexer::TokenKind::Eof => EOF, } }; - let err = if err.is_empty() { None } else { Some(err) }; - self.push(syntax_kind, token_text.len(), err); + self.push(syntax_kind, token_text.len(), errors); } fn extend_literal(&mut self, len: usize, kind: &rustc_lexer::LiteralKind) { - let mut err = ""; + let invalid_raw_msg = String::from("Invalid raw string literal"); + + let mut errors = vec![]; + let mut no_end_quote = |c: char, kind: &str| { + errors.push(format!("Missing trailing `{c}` symbol to terminate the {kind} literal")); + }; let syntax_kind = match *kind { rustc_lexer::LiteralKind::Int { empty_int, base: _ } => { if empty_int { - err = "Missing digits after the integer base prefix"; + errors.push("Missing digits after the integer base prefix".into()); } INT_NUMBER } rustc_lexer::LiteralKind::Float { empty_exponent, base: _ } => { if empty_exponent { - err = "Missing digits after the exponent symbol"; + errors.push("Missing digits after the exponent symbol".into()); } FLOAT_NUMBER } rustc_lexer::LiteralKind::Char { terminated } => { if !terminated { - err = "Missing trailing `'` symbol to terminate the character literal"; + no_end_quote('\'', "character"); } else { let text = &self.res.text[self.offset + 1..][..len - 1]; - let i = text.rfind('\'').unwrap(); - let text = &text[..i]; - if let Err(e) = rustc_lexer::unescape::unescape_char(text) { - err = error_to_diagnostic_message(e, Mode::Char); + let text = &text[..text.rfind('\'').unwrap()]; + if let Err(e) = unescape_char(text) { + errors.push(err_to_msg(e, Mode::Char)); } } CHAR } rustc_lexer::LiteralKind::Byte { terminated } => { if !terminated { - err = "Missing trailing `'` symbol to terminate the byte literal"; + no_end_quote('\'', "byte"); } else { let text = &self.res.text[self.offset + 2..][..len - 2]; - let i = text.rfind('\'').unwrap(); - let text = &text[..i]; - if let Err(e) = rustc_lexer::unescape::unescape_byte(text) { - err = error_to_diagnostic_message(e, Mode::Byte); + let text = &text[..text.rfind('\'').unwrap()]; + if let Err(e) = unescape_byte(text) { + errors.push(err_to_msg(e, Mode::Byte)); } } - BYTE } rustc_lexer::LiteralKind::Str { terminated } => { if !terminated { - err = "Missing trailing `\"` symbol to terminate the string literal"; + no_end_quote('"', "string"); } else { let text = &self.res.text[self.offset + 1..][..len - 1]; - let i = text.rfind('"').unwrap(); - let text = &text[..i]; - err = unescape_string_error_message(text, Mode::Str); + let text = &text[..text.rfind('"').unwrap()]; + unescape_str(text, &mut |_, res| { + if let Err(e) = res { + errors.push(err_to_msg(e, Mode::Str)); + } + }); } STRING } rustc_lexer::LiteralKind::ByteStr { terminated } => { if !terminated { - err = "Missing trailing `\"` symbol to terminate the byte string literal"; + no_end_quote('"', "byte string"); } else { let text = &self.res.text[self.offset + 2..][..len - 2]; - let i = text.rfind('"').unwrap(); - let text = &text[..i]; - err = unescape_string_error_message(text, Mode::ByteStr); + let text = &text[..text.rfind('"').unwrap()]; + unescape_byte_str(text, &mut |_, res| { + if let Err(e) = res { + errors.push(err_to_msg(e, Mode::ByteStr)); + } + }); } BYTE_STRING } rustc_lexer::LiteralKind::CStr { terminated } => { if !terminated { - err = "Missing trailing `\"` symbol to terminate the string literal"; + no_end_quote('"', "C string") } else { let text = &self.res.text[self.offset + 2..][..len - 2]; - let i = text.rfind('"').unwrap(); - let text = &text[..i]; - err = unescape_string_error_message(text, Mode::CStr); + let text = &text[..text.rfind('"').unwrap()]; + unescape_cstr(text, &mut |_, res| { + if let Err(e) = res { + errors.push(err_to_msg(e, Mode::CStr)); + } + }); } C_STRING } rustc_lexer::LiteralKind::RawStr { n_hashes } => { if n_hashes.is_none() { - err = "Invalid raw string literal"; + errors.push(invalid_raw_msg); } STRING } rustc_lexer::LiteralKind::RawByteStr { n_hashes } => { if n_hashes.is_none() { - err = "Invalid raw string literal"; + errors.push(invalid_raw_msg); } BYTE_STRING } rustc_lexer::LiteralKind::RawCStr { n_hashes } => { if n_hashes.is_none() { - err = "Invalid raw string literal"; + errors.push(invalid_raw_msg); } C_STRING } }; - let err = if err.is_empty() { None } else { Some(err) }; - self.push(syntax_kind, len, err); + self.push(syntax_kind, len, errors); } } -fn error_to_diagnostic_message(error: EscapeError, mode: Mode) -> &'static str { +fn err_to_msg(error: EscapeError, mode: Mode) -> String { match error { EscapeError::ZeroChars => "empty character literal", EscapeError::MoreThanOneChar => "character literal may only contain one codepoint", @@ -396,28 +406,5 @@ fn error_to_diagnostic_message(error: EscapeError, mode: Mode) -> &'static str { EscapeError::UnskippedWhitespaceWarning => "", EscapeError::MultipleSkippedLinesWarning => "", } -} - -fn unescape_string_error_message(text: &str, mode: Mode) -> &'static str { - let mut error_message = ""; - match mode { - Mode::CStr => { - rustc_lexer::unescape::unescape_mixed(text, mode, &mut |_, res| { - if let Err(e) = res { - error_message = error_to_diagnostic_message(e, mode); - } - }); - } - Mode::ByteStr | Mode::Str => { - rustc_lexer::unescape::unescape_unicode(text, mode, &mut |_, res| { - if let Err(e) = res { - error_message = error_to_diagnostic_message(e, mode); - } - }); - } - _ => { - // Other Modes are not supported yet or do not apply - } - } - error_message + .into() } diff --git a/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs b/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs index df851ab5b2525..f940438c6176b 100644 --- a/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs +++ b/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs @@ -3,7 +3,8 @@ use std::{borrow::Cow, num::ParseIntError}; use rustc_lexer::unescape::{ - unescape_byte, unescape_char, unescape_mixed, unescape_unicode, EscapeError, MixedUnit, Mode, + unescape_byte, unescape_byte_str, unescape_char, unescape_cstr, unescape_str, unescape_unicode, + EscapeError, MixedUnit, Mode, }; use stdx::always; @@ -218,7 +219,7 @@ impl ast::String { let mut buf = String::new(); let mut prev_end = 0; let mut has_error = None; - unescape_unicode(text, Self::MODE, &mut |char_range, unescaped_char| match ( + unescape_str(text, &mut |char_range, unescaped_char| match ( unescaped_char, buf.capacity() == 0, ) { @@ -259,18 +260,18 @@ impl ast::ByteString { let mut buf: Vec = Vec::new(); let mut prev_end = 0; let mut has_error = None; - unescape_unicode(text, Self::MODE, &mut |char_range, unescaped_char| match ( - unescaped_char, + unescape_byte_str(text, &mut |char_range, unescaped_byte| match ( + unescaped_byte, buf.capacity() == 0, ) { - (Ok(c), false) => buf.push(c as u8), + (Ok(b), false) => buf.push(b), (Ok(_), true) if char_range.len() == 1 && char_range.start == prev_end => { prev_end = char_range.end } - (Ok(c), true) => { + (Ok(b), true) => { buf.reserve_exact(text.len()); buf.extend_from_slice(&text.as_bytes()[..prev_end]); - buf.push(c as u8); + buf.push(b); } (Err(e), _) => has_error = Some(e), }); @@ -297,7 +298,7 @@ impl IsString for ast::CString { let text = &self.text()[text_range_no_quotes - start]; let offset = text_range_no_quotes.start() - start; - unescape_mixed(text, Self::MODE, &mut |range, unescaped_char| { + unescape_cstr(text, &mut |range, unescaped_char| { let text_range = TextRange::new(range.start.try_into().unwrap(), range.end.try_into().unwrap()); // XXX: This method should only be used for highlighting ranges. The unescaped @@ -320,13 +321,10 @@ impl ast::CString { let mut prev_end = 0; let mut has_error = None; let extend_unit = |buf: &mut Vec, unit: MixedUnit| match unit { - MixedUnit::Char(c) => buf.extend(c.encode_utf8(&mut [0; 4]).as_bytes()), - MixedUnit::HighByte(b) => buf.push(b), + MixedUnit::Char(c) => buf.extend(c.get().encode_utf8(&mut [0; 4]).as_bytes()), + MixedUnit::HighByte(b) => buf.push(b.get()), }; - unescape_mixed(text, Self::MODE, &mut |char_range, unescaped| match ( - unescaped, - buf.capacity() == 0, - ) { + unescape_cstr(text, &mut |char_range, unescaped| match (unescaped, buf.capacity() == 0) { (Ok(u), false) => extend_unit(&mut buf, u), (Ok(_), true) if char_range.len() == 1 && char_range.start == prev_end => { prev_end = char_range.end diff --git a/src/tools/rust-analyzer/crates/syntax/src/validation.rs b/src/tools/rust-analyzer/crates/syntax/src/validation.rs index 85eefac734b20..8cdfeb9d84557 100644 --- a/src/tools/rust-analyzer/crates/syntax/src/validation.rs +++ b/src/tools/rust-analyzer/crates/syntax/src/validation.rs @@ -5,7 +5,9 @@ mod block; use rowan::Direction; -use rustc_lexer::unescape::{self, unescape_mixed, unescape_unicode, Mode}; +use rustc_lexer::unescape::{ + self, unescape_byte, unescape_byte_str, unescape_char, unescape_cstr, unescape_str, +}; use crate::{ algo, @@ -140,7 +142,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec) { ast::LiteralKind::String(s) => { if !s.is_raw() { if let Some(without_quotes) = unquote(text, 1, '"') { - unescape_unicode(without_quotes, Mode::Str, &mut |range, char| { + unescape_str(without_quotes, &mut |range, char| { if let Err(err) = char { push_err(1, range.start, err); } @@ -151,7 +153,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec) { ast::LiteralKind::ByteString(s) => { if !s.is_raw() { if let Some(without_quotes) = unquote(text, 2, '"') { - unescape_unicode(without_quotes, Mode::ByteStr, &mut |range, char| { + unescape_byte_str(without_quotes, &mut |range, char| { if let Err(err) = char { push_err(1, range.start, err); } @@ -162,7 +164,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec) { ast::LiteralKind::CString(s) => { if !s.is_raw() { if let Some(without_quotes) = unquote(text, 2, '"') { - unescape_mixed(without_quotes, Mode::CStr, &mut |range, char| { + unescape_cstr(without_quotes, &mut |range, char| { if let Err(err) = char { push_err(1, range.start, err); } @@ -172,20 +174,16 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec) { } ast::LiteralKind::Char(_) => { if let Some(without_quotes) = unquote(text, 1, '\'') { - unescape_unicode(without_quotes, Mode::Char, &mut |range, char| { - if let Err(err) = char { - push_err(1, range.start, err); - } - }); + if let Err(err) = unescape_char(without_quotes) { + push_err(1, 0, err); + } } } ast::LiteralKind::Byte(_) => { if let Some(without_quotes) = unquote(text, 2, '\'') { - unescape_unicode(without_quotes, Mode::Byte, &mut |range, char| { - if let Err(err) = char { - push_err(2, range.start, err); - } - }); + if let Err(err) = unescape_byte(without_quotes) { + push_err(2, 0, err); + } } } ast::LiteralKind::IntNumber(_) From 30822ec0ec25723f36f9e73c42d91a83dc121388 Mon Sep 17 00:00:00 2001 From: Marijn Schouten Date: Fri, 14 Mar 2025 09:42:49 +0000 Subject: [PATCH 2/2] Replace all uses of unescape_unicode: no more unreachable! --- compiler/rustc_ast/src/util/literal.rs | 6 +- compiler/rustc_lexer/src/unescape.rs | 67 +++++++++----- compiler/rustc_lexer/src/unescape/tests.rs | 14 +-- compiler/rustc_parse/src/lexer/mod.rs | 90 ++++++------------ compiler/rustc_parse_format/src/lib.rs | 8 +- .../clippy/clippy_dev/src/update_lints.rs | 2 +- .../crates/hir-expand/src/attrs.rs | 2 +- .../crates/hir-expand/src/builtin/fn_macro.rs | 14 +-- .../crates/syntax/src/ast/token_ext.rs | 92 ++++++++----------- .../crates/syntax/src/validation.rs | 6 +- 10 files changed, 136 insertions(+), 165 deletions(-) diff --git a/compiler/rustc_ast/src/util/literal.rs b/compiler/rustc_ast/src/util/literal.rs index dc66a42dc1c6a..0d44f583f3907 100644 --- a/compiler/rustc_ast/src/util/literal.rs +++ b/compiler/rustc_ast/src/util/literal.rs @@ -89,7 +89,7 @@ impl LitKind { // programs with many long strings containing escapes. unescape_str( s, - &mut #[inline(always)] + #[inline(always)] |_, res| match res { Ok(c) => buf.push(c), Err(err) => { @@ -110,7 +110,7 @@ impl LitKind { token::ByteStr => { let s = symbol.as_str(); let mut buf = Vec::with_capacity(s.len()); - unescape_byte_str(s, &mut |_, res| match res { + unescape_byte_str(s, |_, res| match res { Ok(b) => buf.push(b), Err(err) => { assert!(!err.is_fatal(), "failed to unescape string literal") @@ -127,7 +127,7 @@ impl LitKind { token::CStr => { let s = symbol.as_str(); let mut buf = Vec::with_capacity(s.len()); - unescape_cstr(s, &mut |_span, c| match c { + unescape_cstr(s, |_span, c| match c { Ok(MixedUnit::Char(c)) => { buf.extend_from_slice(c.get().encode_utf8(&mut [0; 4]).as_bytes()) } diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs index 5c7d1106f568f..d991748b5b0b7 100644 --- a/compiler/rustc_lexer/src/unescape.rs +++ b/compiler/rustc_lexer/src/unescape.rs @@ -139,7 +139,7 @@ macro_rules! check { " literal (without quotes) and produce a sequence of results of ", stringify!($unit_ty), " or error (returned via `callback`).", "\nNB: Raw strings don't do any unescaping, but do produce errors on bare CR.")] - pub fn $check(src: &str, callback: &mut impl FnMut(Range, Result<$unit, EscapeError>)) + pub fn $check(src: &str, mut callback: impl FnMut(Range, Result<$unit, EscapeError>)) { src.char_indices().for_each(|(pos, c)| { callback( @@ -162,7 +162,7 @@ macro_rules! unescape { #[doc = concat!("Take the contents of a ", stringify!($string_ty), " literal (without quotes) and produce a sequence of results of escaped ", stringify!($unit_ty), " or error (returned via `callback`).")] - pub fn $unescape(src: &str, callback: &mut impl FnMut(Range, Result<$unit, EscapeError>)) + pub fn $unescape(src: &str, mut callback: impl FnMut(Range, Result<$unit, EscapeError>)) { let mut chars = src.chars(); while let Some(c) = chars.next() { @@ -356,36 +356,57 @@ fn unicode_escape(chars: &mut impl Iterator) -> Result(src: &str, mode: Mode, callback: &mut F) -where - F: FnMut(Range, Result), -{ - let mut byte_callback = - |range, res: Result| callback(range, res.map(char::from)); +pub fn unescape_for_errors( + src: &str, + mode: Mode, + mut error_callback: impl FnMut(Range, EscapeError), +) { match mode { Char => { let mut chars = src.chars(); - let res = unescape_char_iter(&mut chars); - callback(0..(src.len() - chars.as_str().len()), res); + if let Err(e) = unescape_char_iter(&mut chars) { + error_callback(0..(src.len() - chars.as_str().len()), e); + } } Byte => { let mut chars = src.chars(); - let res = unescape_byte_iter(&mut chars).map(char::from); - callback(0..(src.len() - chars.as_str().len()), res); + if let Err(e) = unescape_byte_iter(&mut chars) { + error_callback(0..(src.len() - chars.as_str().len()), e); + } } - Str => unescape_str(src, callback), - ByteStr => unescape_byte_str(src, &mut byte_callback), - RawStr => check_raw_str(src, callback), - RawByteStr => check_raw_byte_str(src, &mut byte_callback), - RawCStr => check_raw_cstr(src, &mut |r, res: Result, EscapeError>| { - callback(r, res.map(|c| c.get())) + Str => unescape_str(src, |range, res| { + if let Err(e) = res { + error_callback(range, e); + } + }), + ByteStr => unescape_byte_str(src, |range, res| { + if let Err(e) = res { + error_callback(range, e); + } + }), + CStr => unescape_cstr(src, |range, res| { + if let Err(e) = res { + error_callback(range, e); + } + }), + RawStr => check_raw_str(src, |range, res| { + if let Err(e) = res { + error_callback(range, e); + } + }), + RawByteStr => check_raw_byte_str(src, |range, res| { + if let Err(e) = res { + error_callback(range, e); + } + }), + RawCStr => check_raw_cstr(src, |range, res| { + if let Err(e) = res { + error_callback(range, e); + } }), - CStr => unreachable!(), } } diff --git a/compiler/rustc_lexer/src/unescape/tests.rs b/compiler/rustc_lexer/src/unescape/tests.rs index 831bb83f84192..c094e8d9da354 100644 --- a/compiler/rustc_lexer/src/unescape/tests.rs +++ b/compiler/rustc_lexer/src/unescape/tests.rs @@ -100,7 +100,7 @@ fn test_unescape_char_good() { fn test_unescape_str_warn() { fn check(literal: &str, expected: &[(Range, Result)]) { let mut unescaped = Vec::with_capacity(literal.len()); - unescape_unicode(literal, Mode::Str, &mut |range, res| unescaped.push((range, res))); + unescape_str(literal, |range, res| unescaped.push((range, res))); assert_eq!(unescaped, expected); } @@ -124,7 +124,7 @@ fn test_unescape_str_warn() { fn test_unescape_str_good() { fn check(literal_text: &str, expected: &str) { let mut buf = Ok(String::with_capacity(literal_text.len())); - unescape_unicode(literal_text, Mode::Str, &mut |range, c| { + unescape_str(literal_text, |range, c| { if let Ok(b) = &mut buf { match c { Ok(c) => b.push(c), @@ -241,7 +241,7 @@ fn test_unescape_byte_good() { fn test_unescape_byte_str_good() { fn check(literal_text: &str, expected: &[u8]) { let mut buf = Ok(Vec::with_capacity(literal_text.len())); - unescape_unicode(literal_text, Mode::ByteStr, &mut |range, c| { + unescape_byte_str(literal_text, |range, c| { if let Ok(b) = &mut buf { match c { Ok(c) => b.push(c as u8), @@ -264,7 +264,7 @@ fn test_unescape_byte_str_good() { fn test_unescape_raw_str() { fn check(literal: &str, expected: &[(Range, Result)]) { let mut unescaped = Vec::with_capacity(literal.len()); - unescape_unicode(literal, Mode::RawStr, &mut |range, res| unescaped.push((range, res))); + check_raw_str(literal, |range, res| unescaped.push((range, res))); assert_eq!(unescaped, expected); } @@ -274,13 +274,13 @@ fn test_unescape_raw_str() { #[test] fn test_unescape_raw_byte_str() { - fn check(literal: &str, expected: &[(Range, Result)]) { + fn check(literal: &str, expected: &[(Range, Result)]) { let mut unescaped = Vec::with_capacity(literal.len()); - unescape_unicode(literal, Mode::RawByteStr, &mut |range, res| unescaped.push((range, res))); + check_raw_byte_str(literal, |range, res| unescaped.push((range, res))); assert_eq!(unescaped, expected); } check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]); check("🦀", &[(0..4, Err(EscapeError::NonAsciiCharInByte))]); - check("🦀a", &[(0..4, Err(EscapeError::NonAsciiCharInByte)), (4..5, Ok('a'))]); + check("🦀a", &[(0..4, Err(EscapeError::NonAsciiCharInByte)), (4..5, Ok(b'a'))]); } diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index eb4ece2cd8e00..1f990b46475b7 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -1,5 +1,3 @@ -use std::ops::Range; - use rustc_ast::ast::{self, AttrStyle}; use rustc_ast::token::{self, CommentKind, Delimiter, IdentIsRaw, Token, TokenKind}; use rustc_ast::tokenstream::TokenStream; @@ -525,7 +523,7 @@ impl<'psess, 'src> Lexer<'psess, 'src> { } err.emit() } - self.cook_unicode(token::Char, Mode::Char, start, end, 1, 1) // ' ' + self.cook_quoted(token::Char, Mode::Char, start, end, 1, 1) // ' ' } rustc_lexer::LiteralKind::Byte { terminated } => { if !terminated { @@ -537,7 +535,7 @@ impl<'psess, 'src> Lexer<'psess, 'src> { .with_code(E0763) .emit() } - self.cook_unicode(token::Byte, Mode::Byte, start, end, 2, 1) // b' ' + self.cook_quoted(token::Byte, Mode::Byte, start, end, 2, 1) // b' ' } rustc_lexer::LiteralKind::Str { terminated } => { if !terminated { @@ -549,7 +547,7 @@ impl<'psess, 'src> Lexer<'psess, 'src> { .with_code(E0765) .emit() } - self.cook_unicode(token::Str, Mode::Str, start, end, 1, 1) // " " + self.cook_quoted(token::Str, Mode::Str, start, end, 1, 1) // " " } rustc_lexer::LiteralKind::ByteStr { terminated } => { if !terminated { @@ -561,7 +559,7 @@ impl<'psess, 'src> Lexer<'psess, 'src> { .with_code(E0766) .emit() } - self.cook_unicode(token::ByteStr, Mode::ByteStr, start, end, 2, 1) // b" " + self.cook_quoted(token::ByteStr, Mode::ByteStr, start, end, 2, 1) // b" " } rustc_lexer::LiteralKind::CStr { terminated } => { if !terminated { @@ -573,13 +571,13 @@ impl<'psess, 'src> Lexer<'psess, 'src> { .with_code(E0767) .emit() } - self.cook_mixed(token::CStr, Mode::CStr, start, end, 2, 1) // c" " + self.cook_quoted(token::CStr, Mode::CStr, start, end, 2, 1) // c" " } rustc_lexer::LiteralKind::RawStr { n_hashes } => { if let Some(n_hashes) = n_hashes { let n = u32::from(n_hashes); let kind = token::StrRaw(n_hashes); - self.cook_unicode(kind, Mode::RawStr, start, end, 2 + n, 1 + n) // r##" "## + self.cook_quoted(kind, Mode::RawStr, start, end, 2 + n, 1 + n) // r##" "## } else { self.report_raw_str_error(start, 1); } @@ -588,7 +586,7 @@ impl<'psess, 'src> Lexer<'psess, 'src> { if let Some(n_hashes) = n_hashes { let n = u32::from(n_hashes); let kind = token::ByteStrRaw(n_hashes); - self.cook_unicode(kind, Mode::RawByteStr, start, end, 3 + n, 1 + n) // br##" "## + self.cook_quoted(kind, Mode::RawByteStr, start, end, 3 + n, 1 + n) // br##" "## } else { self.report_raw_str_error(start, 2); } @@ -597,7 +595,7 @@ impl<'psess, 'src> Lexer<'psess, 'src> { if let Some(n_hashes) = n_hashes { let n = u32::from(n_hashes); let kind = token::CStrRaw(n_hashes); - self.cook_unicode(kind, Mode::RawCStr, start, end, 3 + n, 1 + n) // cr##" "## + self.cook_quoted(kind, Mode::RawCStr, start, end, 3 + n, 1 + n) // cr##" "## } else { self.report_raw_str_error(start, 2); } @@ -913,7 +911,7 @@ impl<'psess, 'src> Lexer<'psess, 'src> { self.dcx().emit_fatal(errors::TooManyHashes { span: self.mk_sp(start, self.pos), num }); } - fn cook_common( + fn cook_quoted( &self, mut kind: token::LitKind, mode: Mode, @@ -921,32 +919,28 @@ impl<'psess, 'src> Lexer<'psess, 'src> { end: BytePos, prefix_len: u32, postfix_len: u32, - unescape: fn(&str, Mode, &mut dyn FnMut(Range, Result<(), EscapeError>)), ) -> (token::LitKind, Symbol) { let content_start = start + BytePos(prefix_len); let content_end = end - BytePos(postfix_len); let lit_content = self.str_from_to(content_start, content_end); - unescape(lit_content, mode, &mut |range, result| { - // Here we only check for errors. The actual unescaping is done later. - if let Err(err) = result { - let span_with_quotes = self.mk_sp(start, end); - let (start, end) = (range.start as u32, range.end as u32); - let lo = content_start + BytePos(start); - let hi = lo + BytePos(end - start); - let span = self.mk_sp(lo, hi); - let is_fatal = err.is_fatal(); - if let Some(guar) = emit_unescape_error( - self.dcx(), - lit_content, - span_with_quotes, - span, - mode, - range, - err, - ) { - assert!(is_fatal); - kind = token::Err(guar); - } + unescape::unescape_for_errors(lit_content, mode, |range, err| { + let span_with_quotes = self.mk_sp(start, end); + let (start, end) = (range.start as u32, range.end as u32); + let lo = content_start + BytePos(start); + let hi = lo + BytePos(end - start); + let span = self.mk_sp(lo, hi); + let is_fatal = err.is_fatal(); + if let Some(guar) = emit_unescape_error( + self.dcx(), + lit_content, + span_with_quotes, + span, + mode, + range, + err, + ) { + assert!(is_fatal); + kind = token::Err(guar); } }); @@ -959,36 +953,6 @@ impl<'psess, 'src> Lexer<'psess, 'src> { }; (kind, sym) } - - fn cook_unicode( - &self, - kind: token::LitKind, - mode: Mode, - start: BytePos, - end: BytePos, - prefix_len: u32, - postfix_len: u32, - ) -> (token::LitKind, Symbol) { - self.cook_common(kind, mode, start, end, prefix_len, postfix_len, |src, mode, callback| { - unescape::unescape_unicode(src, mode, &mut |span, result| { - callback(span, result.map(drop)) - }) - }) - } - - fn cook_mixed( - &self, - kind: token::LitKind, - mode: Mode, - start: BytePos, - end: BytePos, - prefix_len: u32, - postfix_len: u32, - ) -> (token::LitKind, Symbol) { - self.cook_common(kind, mode, start, end, prefix_len, postfix_len, |src, _mode, callback| { - unescape::unescape_cstr(src, &mut |span, result| callback(span, result.map(drop))) - }) - } } pub fn nfc_normalize(string: &str) -> Symbol { diff --git a/compiler/rustc_parse_format/src/lib.rs b/compiler/rustc_parse_format/src/lib.rs index 5b8a2fe52d3f5..73bb8ab17734e 100644 --- a/compiler/rustc_parse_format/src/lib.rs +++ b/compiler/rustc_parse_format/src/lib.rs @@ -1094,11 +1094,9 @@ fn find_width_map_from_snippet( fn unescape_string(string: &str) -> Option { let mut buf = String::new(); let mut ok = true; - unescape::unescape_unicode(string, unescape::Mode::Str, &mut |_, unescaped_char| { - match unescaped_char { - Ok(c) => buf.push(c), - Err(_) => ok = false, - } + unescape::unescape_str(string, &mut |_, res| match res { + Ok(c) => buf.push(c), + Err(_) => ok = false, }); ok.then_some(buf) diff --git a/src/tools/clippy/clippy_dev/src/update_lints.rs b/src/tools/clippy/clippy_dev/src/update_lints.rs index b80ee5aac7e76..3f785135030ec 100644 --- a/src/tools/clippy/clippy_dev/src/update_lints.rs +++ b/src/tools/clippy/clippy_dev/src/update_lints.rs @@ -830,7 +830,7 @@ fn remove_line_splices(s: &str) -> String { .and_then(|s| s.strip_suffix('"')) .unwrap_or_else(|| panic!("expected quoted string, found `{s}`")); let mut res = String::with_capacity(s.len()); - unescape::unescape_unicode(s, unescape::Mode::Str, &mut |range, ch| { + unescape::unescape_str(s, |range, ch| { if ch.is_ok() { res.push_str(&s[range]); } diff --git a/src/tools/rust-analyzer/crates/hir-expand/src/attrs.rs b/src/tools/rust-analyzer/crates/hir-expand/src/attrs.rs index c9c793d54f26c..3c7d0495833f6 100644 --- a/src/tools/rust-analyzer/crates/hir-expand/src/attrs.rs +++ b/src/tools/rust-analyzer/crates/hir-expand/src/attrs.rs @@ -415,7 +415,7 @@ fn unescape(s: &str) -> Option> { let mut buf = String::new(); let mut prev_end = 0; let mut has_error = false; - unescape::unescape_unicode(s, unescape::Mode::Str, &mut |char_range, unescaped_char| match ( + unescape::unescape_str(s, |char_range, unescaped_char| match ( unescaped_char, buf.capacity() == 0, ) { diff --git a/src/tools/rust-analyzer/crates/hir-expand/src/builtin/fn_macro.rs b/src/tools/rust-analyzer/crates/hir-expand/src/builtin/fn_macro.rs index 55242ab3e57d1..02bf0c72f9221 100644 --- a/src/tools/rust-analyzer/crates/hir-expand/src/builtin/fn_macro.rs +++ b/src/tools/rust-analyzer/crates/hir-expand/src/builtin/fn_macro.rs @@ -9,7 +9,7 @@ use span::{Edition, EditionedFileId, Span}; use stdx::format_to; use syntax::{ format_smolstr, - unescape::{unescape_byte, unescape_char, unescape_unicode, Mode}, + unescape::{unescape_byte, unescape_char, unescape_str}, }; use syntax_bridge::syntax_node_to_token_tree; @@ -429,7 +429,7 @@ fn compile_error_expand( span: _, kind: tt::LitKind::Str | tt::LitKind::StrRaw(_), suffix: _, - }))] => ExpandError::other(span, Box::from(unescape_str(text).as_str())), + }))] => ExpandError::other(span, Box::from(unescape_symbol(text).as_str())), _ => ExpandError::other(span, "`compile_error!` argument must be a string"), }; @@ -477,7 +477,7 @@ fn concat_expand( format_to!(text, "{}", it.symbol.as_str()) } tt::LitKind::Str => { - text.push_str(unescape_str(&it.symbol).as_str()); + text.push_str(unescape_symbol(&it.symbol).as_str()); record_span(it.span); } tt::LitKind::StrRaw(_) => { @@ -681,7 +681,7 @@ fn parse_string(tt: &tt::TopSubtree) -> Result<(Symbol, Span), ExpandError> { span, kind: tt::LitKind::Str, suffix: _, - })) => Ok((unescape_str(text), *span)), + })) => Ok((unescape_symbol(text), *span)), TtElement::Leaf(tt::Leaf::Literal(tt::Literal { symbol: text, span, @@ -702,7 +702,7 @@ fn parse_string(tt: &tt::TopSubtree) -> Result<(Symbol, Span), ExpandError> { span, kind: tt::LitKind::Str, suffix: _, - })) => Some((unescape_str(text), *span)), + })) => Some((unescape_symbol(text), *span)), TtElement::Leaf(tt::Leaf::Literal(tt::Literal { symbol: text, span, @@ -887,11 +887,11 @@ fn quote_expand( ) } -fn unescape_str(s: &Symbol) -> Symbol { +fn unescape_symbol(s: &Symbol) -> Symbol { if s.as_str().contains('\\') { let s = s.as_str(); let mut buf = String::with_capacity(s.len()); - unescape_unicode(s, Mode::Str, &mut |_, c| { + unescape_str(s, |_, c| { if let Ok(c) = c { buf.push(c) } diff --git a/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs b/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs index f940438c6176b..d4017e487aba1 100644 --- a/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs +++ b/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs @@ -1,10 +1,11 @@ //! There are many AstNodes, but only a few tokens, so we hand-write them here. +use std::ops::Range; use std::{borrow::Cow, num::ParseIntError}; use rustc_lexer::unescape::{ - unescape_byte, unescape_byte_str, unescape_char, unescape_cstr, unescape_str, unescape_unicode, - EscapeError, MixedUnit, Mode, + unescape_byte, unescape_byte_str, unescape_char, unescape_cstr, unescape_str, EscapeError, + MixedUnit, }; use stdx::always; @@ -151,7 +152,7 @@ impl QuoteOffsets { pub trait IsString: AstToken { const RAW_PREFIX: &'static str; - const MODE: Mode; + fn unescape(s: &str, callback: impl FnMut(Range, Result)); fn is_raw(&self) -> bool { self.text().starts_with(Self::RAW_PREFIX) } @@ -186,7 +187,7 @@ pub trait IsString: AstToken { let text = &self.text()[text_range_no_quotes - start]; let offset = text_range_no_quotes.start() - start; - unescape_unicode(text, Self::MODE, &mut |range, unescaped_char| { + Self::unescape(text, &mut |range: Range, unescaped_char| { if let Some((s, e)) = range.start.try_into().ok().zip(range.end.try_into().ok()) { cb(TextRange::new(s, e) + offset, unescaped_char); } @@ -204,7 +205,9 @@ pub trait IsString: AstToken { impl IsString for ast::String { const RAW_PREFIX: &'static str = "r"; - const MODE: Mode = Mode::Str; + fn unescape(s: &str, cb: impl FnMut(Range, Result)) { + unescape_str(s, cb) + } } impl ast::String { @@ -219,20 +222,19 @@ impl ast::String { let mut buf = String::new(); let mut prev_end = 0; let mut has_error = None; - unescape_str(text, &mut |char_range, unescaped_char| match ( - unescaped_char, - buf.capacity() == 0, - ) { - (Ok(c), false) => buf.push(c), - (Ok(_), true) if char_range.len() == 1 && char_range.start == prev_end => { - prev_end = char_range.end - } - (Ok(c), true) => { - buf.reserve_exact(text.len()); - buf.push_str(&text[..prev_end]); - buf.push(c); + unescape_str(text, |char_range, unescaped_char| { + match (unescaped_char, buf.capacity() == 0) { + (Ok(c), false) => buf.push(c), + (Ok(_), true) if char_range.len() == 1 && char_range.start == prev_end => { + prev_end = char_range.end + } + (Ok(c), true) => { + buf.reserve_exact(text.len()); + buf.push_str(&text[..prev_end]); + buf.push(c); + } + (Err(e), _) => has_error = Some(e), } - (Err(e), _) => has_error = Some(e), }); match (has_error, buf.capacity() == 0) { @@ -245,7 +247,9 @@ impl ast::String { impl IsString for ast::ByteString { const RAW_PREFIX: &'static str = "br"; - const MODE: Mode = Mode::ByteStr; + fn unescape(s: &str, mut callback: impl FnMut(Range, Result)) { + unescape_byte_str(s, |range, res| callback(range, res.map(char::from))) + } } impl ast::ByteString { @@ -260,20 +264,19 @@ impl ast::ByteString { let mut buf: Vec = Vec::new(); let mut prev_end = 0; let mut has_error = None; - unescape_byte_str(text, &mut |char_range, unescaped_byte| match ( - unescaped_byte, - buf.capacity() == 0, - ) { - (Ok(b), false) => buf.push(b), - (Ok(_), true) if char_range.len() == 1 && char_range.start == prev_end => { - prev_end = char_range.end - } - (Ok(b), true) => { - buf.reserve_exact(text.len()); - buf.extend_from_slice(&text.as_bytes()[..prev_end]); - buf.push(b); + unescape_byte_str(text, |char_range, unescaped_byte| { + match (unescaped_byte, buf.capacity() == 0) { + (Ok(b), false) => buf.push(b), + (Ok(_), true) if char_range.len() == 1 && char_range.start == prev_end => { + prev_end = char_range.end + } + (Ok(b), true) => { + buf.reserve_exact(text.len()); + buf.extend_from_slice(&text.as_bytes()[..prev_end]); + buf.push(b); + } + (Err(e), _) => has_error = Some(e), } - (Err(e), _) => has_error = Some(e), }); match (has_error, buf.capacity() == 0) { @@ -286,25 +289,10 @@ impl ast::ByteString { impl IsString for ast::CString { const RAW_PREFIX: &'static str = "cr"; - const MODE: Mode = Mode::CStr; - - fn escaped_char_ranges(&self, cb: &mut dyn FnMut(TextRange, Result)) { - let text_range_no_quotes = match self.text_range_between_quotes() { - Some(it) => it, - None => return, - }; - - let start = self.syntax().text_range().start(); - let text = &self.text()[text_range_no_quotes - start]; - let offset = text_range_no_quotes.start() - start; - - unescape_cstr(text, &mut |range, unescaped_char| { - let text_range = - TextRange::new(range.start.try_into().unwrap(), range.end.try_into().unwrap()); - // XXX: This method should only be used for highlighting ranges. The unescaped - // char/byte is not used. For simplicity, we return an arbitrary placeholder char. - cb(text_range + offset, unescaped_char.map(|_| ' ')); - }); + // XXX: This method should only be used for highlighting ranges. The unescaped + // char/byte is not used. For simplicity, we return an arbitrary placeholder char. + fn unescape(s: &str, mut callback: impl FnMut(Range, Result)) { + unescape_cstr(s, |range, _res| callback(range, Ok('_'))) } } @@ -324,7 +312,7 @@ impl ast::CString { MixedUnit::Char(c) => buf.extend(c.get().encode_utf8(&mut [0; 4]).as_bytes()), MixedUnit::HighByte(b) => buf.push(b.get()), }; - unescape_cstr(text, &mut |char_range, unescaped| match (unescaped, buf.capacity() == 0) { + unescape_cstr(text, |char_range, unescaped| match (unescaped, buf.capacity() == 0) { (Ok(u), false) => extend_unit(&mut buf, u), (Ok(_), true) if char_range.len() == 1 && char_range.start == prev_end => { prev_end = char_range.end diff --git a/src/tools/rust-analyzer/crates/syntax/src/validation.rs b/src/tools/rust-analyzer/crates/syntax/src/validation.rs index 8cdfeb9d84557..275cc8a90e215 100644 --- a/src/tools/rust-analyzer/crates/syntax/src/validation.rs +++ b/src/tools/rust-analyzer/crates/syntax/src/validation.rs @@ -142,7 +142,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec) { ast::LiteralKind::String(s) => { if !s.is_raw() { if let Some(without_quotes) = unquote(text, 1, '"') { - unescape_str(without_quotes, &mut |range, char| { + unescape_str(without_quotes,|range, char| { if let Err(err) = char { push_err(1, range.start, err); } @@ -153,7 +153,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec) { ast::LiteralKind::ByteString(s) => { if !s.is_raw() { if let Some(without_quotes) = unquote(text, 2, '"') { - unescape_byte_str(without_quotes, &mut |range, char| { + unescape_byte_str(without_quotes, |range, char| { if let Err(err) = char { push_err(1, range.start, err); } @@ -164,7 +164,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec) { ast::LiteralKind::CString(s) => { if !s.is_raw() { if let Some(without_quotes) = unquote(text, 2, '"') { - unescape_cstr(without_quotes, &mut |range, char| { + unescape_cstr(without_quotes, |range, char| { if let Err(err) = char { push_err(1, range.start, err); }