Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ParserState method to get current utf16 position #374

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ use std::ops::Range;
pub struct ParserState {
/// Offset of the tokenizer from the start of the input, counted in bytes
/// (it advances by `len_utf8` per consumed character).
pub(crate) position: usize,
/// Value of `position` recorded when the current line began.
pub(crate) current_line_start_position: usize,
/// Value of `position_difference` recorded when the current line began.
pub(crate) current_line_start_difference: u16,
/// How far `position` (bytes) runs ahead of the same offset counted in
/// UTF-16 code units. The tokenizer updates this with wrapping `u16`
/// arithmetic, so it is only meaningful modulo 2^16.
pub(crate) position_difference: u16,
/// Line counter, incremented by the tokenizer each time it consumes a newline.
pub(crate) current_line_number: u32,
/// NOTE(review): presumably the kind of block the parser is positioned at
/// the start of, if any; the tokenizer always stores `None` here. Confirm
/// against the parser code, which is not visible in this diff.
pub(crate) at_start_of: Option<BlockType>,
}
Expand All @@ -34,9 +36,19 @@ impl ParserState {
pub fn source_location(&self) -> SourceLocation {
    // Both positions are byte offsets, so this is the number of UTF-8 bytes
    // consumed on the current line.
    let bytes_on_line = self.position - self.current_line_start_position;
    // The two difference counters are `u16` values that the tokenizer
    // maintains with wrapping arithmetic, so the current value can be
    // numerically smaller than the line-start snapshot once the counter has
    // wrapped. Subtract with wrapping semantics too: a plain `-` would panic
    // in builds with overflow checks, whereas the wrapped result is still the
    // surplus accumulated on this line (as long as that surplus fits in u16).
    let surplus_on_line = self
        .position_difference
        .wrapping_sub(self.current_line_start_difference);
    SourceLocation {
        line: self.current_line_number,
        // Columns are 1-based and counted in UTF-16 code units.
        column: (bytes_on_line - usize::from(surplus_on_line) + 1) as u32,
    }
}

/// Returns the current offset from the beginning of the input, measured in
/// UTF-16 code units rather than bytes.
#[inline]
pub fn utf16_position(&self) -> u32 {
    // `position` is a byte offset; `position_difference` records how many
    // units it runs ahead of the UTF-16 count, so removing that surplus
    // yields the UTF-16 offset.
    let surplus = usize::from(self.position_difference);
    (self.position - surplus) as u32
}
}

/// When parsing until a given token, sometimes the caller knows that parsing is going to restart
Expand Down
6 changes: 3 additions & 3 deletions src/size_of_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,11 @@ size_of_test!(token, Token, 32);
size_of_test!(std_cow_str, std::borrow::Cow<'static, str>, 24, 32);
size_of_test!(cow_rc_str, CowRcStr, 16);

size_of_test!(tokenizer, crate::tokenizer::Tokenizer, 72);
size_of_test!(parser_input, crate::parser::ParserInput, 136);
size_of_test!(tokenizer, crate::tokenizer::Tokenizer, 80);
size_of_test!(parser_input, crate::parser::ParserInput, 152);
size_of_test!(parser, crate::parser::Parser, 16);
size_of_test!(source_position, crate::SourcePosition, 8);
size_of_test!(parser_state, crate::ParserState, 24);
size_of_test!(parser_state, crate::ParserState, 32);

size_of_test!(basic_parse_error, crate::BasicParseError, 40, 48);
size_of_test!(parse_error_lower_bound, crate::ParseError<()>, 40, 48);
37 changes: 20 additions & 17 deletions src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1267,7 +1267,7 @@ fn roundtrip_percentage_token() {
}

#[test]
fn utf16_columns() {
fn utf16_columns_and_positions() {
// This particular test serves two purposes. First, it checks
// that the column number computations are correct. Second, it
// checks that tokenizer code paths correctly differentiate
Expand All @@ -1278,24 +1278,26 @@ fn utf16_columns() {
// the column is in units of UTF-16, the 4-byte sequence results
// in two columns.
let tests = vec![
("", 1),
("ascii", 6),
("/*QΡ✈🆒*/", 10),
("'QΡ✈🆒*'", 9),
("\"\\\"'QΡ✈🆒*'", 12),
("\\Q\\Ρ\\✈\\🆒", 10),
("QΡ✈🆒", 6),
("QΡ✈🆒\\Q\\Ρ\\✈\\🆒", 15),
("newline\r\nQΡ✈🆒", 6),
("url(QΡ✈🆒\\Q\\Ρ\\✈\\🆒)", 20),
("url(QΡ✈🆒)", 11),
("url(\r\nQΡ✈🆒\\Q\\Ρ\\✈\\🆒)", 16),
("url(\r\nQΡ✈🆒\\Q\\Ρ\\✈\\🆒", 15),
("url(\r\nQΡ✈🆒\\Q\\Ρ\\✈\\🆒 x", 17),
("QΡ✈🆒()", 8),
("", 1, 0),
("ascii", 6, 5),
("/*QΡ✈🆒*/", 10, 9),
("/*QΡ✈\r\n🆒*/", 5, 11),
("'QΡ✈🆒*'", 9, 8),
("\"\\\"'QΡ✈🆒*'", 12, 11),
("\\Q\\Ρ\\✈\\🆒", 10, 9),
("QΡ✈🆒", 6, 5),
("QΡ✈🆒\\Q\\Ρ\\✈\\🆒", 15, 14),
("newline\r\nQΡ✈🆒", 6, 14),
("url(QΡ✈🆒\\Q\\Ρ\\✈\\🆒)", 20, 19),
("url(QΡ✈🆒)", 11, 10),
("url(\r\nQΡ✈🆒\\Q\\Ρ\\✈\\🆒)", 16, 21),
("url(\r\nQΡ✈🆒\\Q\\Ρ\\✈\\🆒", 15, 20),
("url(\r\nQΡ✈🆒\\Q\\Ρ\\✈\\🆒 x", 17, 22),
("url( \tQ)", 10, 9),
("QΡ✈🆒()", 8, 7),
// Test that under/over-flow of position_difference is
// handled properly; see the special case in consume_4byte_intro.
("🆒", 3),
("🆒", 3, 2),
];

for test in tests {
Expand All @@ -1321,6 +1323,7 @@ fn utf16_columns() {

// Check the resulting column.
assert_eq!(parser.current_source_location().column, test.1);
assert_eq!(parser.state().utf16_position(), test.2, "test: {}", test.0);
}
}

Expand Down
31 changes: 23 additions & 8 deletions src/tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,8 @@ pub struct Tokenizer<'a> {
/// ensure that computing the column will give the result in units
/// of UTF-16 characters.
current_line_start_position: usize,
position_difference: u16,
current_line_start_difference: u16,
current_line_number: u32,
var_or_env_functions: SeenStatus,
source_map_url: Option<&'a str>,
Expand All @@ -234,7 +236,9 @@ impl<'a> Tokenizer<'a> {
input,
position: 0,
current_line_start_position: 0,
current_line_start_difference: 0,
current_line_number: 0,
position_difference: 0,
var_or_env_functions: SeenStatus::DontCare,
source_map_url: None,
source_url: None,
Expand Down Expand Up @@ -277,7 +281,12 @@ impl<'a> Tokenizer<'a> {
pub fn current_source_location(&self) -> SourceLocation {
    // Both positions are byte offsets, so this is the number of UTF-8 bytes
    // consumed on the current line.
    let bytes_on_line = self.position - self.current_line_start_position;
    // `position_difference` is updated with wrapping `u16` arithmetic (see
    // the 4-byte sequence intro handling), so it may be numerically smaller
    // than the snapshot taken at the start of the line. Use a wrapping
    // subtraction: a plain `-` would panic in builds with overflow checks,
    // whereas the wrapped result is still the surplus accumulated on this
    // line (as long as that surplus fits in u16).
    let surplus_on_line = self
        .position_difference
        .wrapping_sub(self.current_line_start_difference);
    SourceLocation {
        line: self.current_line_number,
        // Columns are 1-based and counted in UTF-16 code units.
        column: (bytes_on_line - usize::from(surplus_on_line) + 1) as u32,
    }
}

Expand All @@ -296,6 +305,8 @@ impl<'a> Tokenizer<'a> {
ParserState {
position: self.position,
current_line_start_position: self.current_line_start_position,
current_line_start_difference: self.current_line_start_difference,
position_difference: self.position_difference,
current_line_number: self.current_line_number,
at_start_of: None,
}
Expand All @@ -305,6 +316,8 @@ impl<'a> Tokenizer<'a> {
pub fn reset(&mut self, state: &ParserState) {
self.position = state.position;
self.current_line_start_position = state.current_line_start_position;
self.current_line_start_difference = state.current_line_start_difference;
self.position_difference = state.position_difference;
self.current_line_number = state.current_line_number;
}

Expand Down Expand Up @@ -391,7 +404,7 @@ impl<'a> Tokenizer<'a> {
debug_assert!(self.next_byte_unchecked() & 0xF0 == 0xF0);
// This takes two UTF-16 characters to represent, so we
// actually have an undercount.
self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
self.position_difference = self.position_difference.wrapping_sub(1);
self.position += 1;
}

Expand All @@ -403,7 +416,7 @@ impl<'a> Tokenizer<'a> {
// Continuation bytes contribute to column overcount. Note
// that due to the special case for the 4-byte sequence intro,
// we must use wrapping add here.
self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
self.position_difference = self.position_difference.wrapping_add(1);
self.position += 1;
}

Expand All @@ -416,11 +429,11 @@ impl<'a> Tokenizer<'a> {
if byte & 0xF0 == 0xF0 {
// This takes two UTF-16 characters to represent, so we
// actually have an undercount.
self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
self.position_difference = self.position_difference.wrapping_sub(1);
} else if byte & 0xC0 == 0x80 {
// Note that due to the special case for the 4-byte
// sequence intro, we must use wrapping add here.
self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
self.position_difference = self.position_difference.wrapping_add(1);
}
}

Expand All @@ -443,6 +456,7 @@ impl<'a> Tokenizer<'a> {
self.position += 1;
}
self.current_line_start_position = self.position;
self.current_line_start_difference = self.position_difference;
self.current_line_number += 1;
}

Expand All @@ -456,12 +470,13 @@ impl<'a> Tokenizer<'a> {
fn consume_char(&mut self) -> char {
    let c = self.next_char();
    let len_utf8 = c.len_utf8();
    let len_utf16 = c.len_utf16();
    // `position` is a byte offset, so it advances by the UTF-8 length.
    self.position += len_utf8;
    // Record how many more UTF-8 bytes than UTF-16 code units this character
    // occupies (0, 1 or 2, so the cast to u16 cannot truncate). Only
    // `position_difference` is adjusted: the column computation subtracts it,
    // so also shifting `current_line_start_position` would count the surplus
    // twice. Note that due to the special case for the 4-byte sequence
    // intro, we must use wrapping add here.
    self.position_difference = self
        .position_difference
        .wrapping_add((len_utf8 - len_utf16) as u16);
    c
}

Expand Down
Loading