From 17aa57c881c83b83fd3eb3ec9f93851cdf84312a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1ll=20Haraldsson?= Date: Mon, 9 Oct 2023 19:23:55 +0000 Subject: [PATCH 1/6] EOF_CHAR isn't the common case, moving those first --- src/tokenize.jl | 99 +++++++++++++++++++++++++------------------------ 1 file changed, 51 insertions(+), 48 deletions(-) diff --git a/src/tokenize.jl b/src/tokenize.jl index 739a24c6..dc2a18c9 100644 --- a/src/tokenize.jl +++ b/src/tokenize.jl @@ -458,42 +458,34 @@ function next_token(l::Lexer, start = true) end function _next_token(l::Lexer, c) - if c == EOF_CHAR - return emit(l, K"EndMarker") - elseif iswhitespace(c) - return lex_whitespace(l, c) - elseif c == '[' + if c == '[' return emit(l, K"[") elseif c == ']' return emit(l, K"]") - elseif c == '{' - return emit(l, K"{") - elseif c == ';' - return emit(l, K";") - elseif c == '}' - return emit(l, K"}") elseif c == '(' return emit(l, K"(") elseif c == ')' return emit(l, K")") + elseif c == ';' + return emit(l, K";") elseif c == ',' return emit(l, K",") - elseif c == '*' - return lex_star(l); - elseif c == '^' - return lex_circumflex(l); - elseif c == '@' - return emit(l, K"@") - elseif c == '?' - return emit(l, K"?") - elseif c == '$' - return lex_dollar(l); - elseif c == '⊻' - return lex_xor(l); - elseif c == '~' - return emit(l, K"~") - elseif c == '#' - return lex_comment(l) + elseif c == '+' + return lex_plus(l); + elseif c == '-' + return lex_minus(l) + elseif c == '−' # \minus '−' treated as hyphen '-' + return emit(l, accept(l, '=') ? K"-=" : K"-") + elseif c == '{' + return emit(l, K"{") + elseif c == '}' + return emit(l, K"}") + elseif c == '"' + return lex_quote(l); + elseif c == '|' + return lex_bar(l) + elseif c == '&' + return lex_amper(l) elseif c == '=' return lex_equal(l) elseif c == '!' @@ -502,38 +494,49 @@ function _next_token(l::Lexer, c) return lex_greater(l) elseif c == '<' return lex_less(l) + elseif c == '"' + return lex_quote(l) + elseif c == '#' + return lex_comment(l) elseif c == ':' return lex_colon(l) - elseif c == '|' - return lex_bar(l) - elseif c == '&' - return lex_amper(l) - elseif c == '\'' - return lex_prime(l) - elseif c == '÷' - return lex_division(l) - elseif c == '"' - return lex_quote(l); elseif c == '%' - return lex_percent(l); + return lex_percent(l) elseif c == '/' - return lex_forwardslash(l); + return lex_forwardslash(l) elseif c == '\\' - return lex_backslash(l); + return lex_backslash(l) elseif c == '.' - return lex_dot(l); - elseif c == '+' - return lex_plus(l); - elseif c == '-' - return lex_minus(l); - elseif c == '−' # \minus '−' treated as hyphen '-' - return emit(l, accept(l, '=') ? K"-=" : K"-") + return lex_dot(l) elseif c == '`' - return lex_backtick(l); + return lex_backtick(l) + elseif c == '\'' + return lex_prime(l) + elseif c == '÷' + return lex_division(l) + + elseif iswhitespace(c) + return lex_whitespace(l, c) elseif is_identifier_start_char(c) return lex_identifier(l, c) elseif isdigit(c) return lex_digit(l, K"Integer") + elseif c == '*' + return lex_star(l) + elseif c == '^' + return lex_circumflex(l) + elseif c == '@' + return emit(l, K"@") + elseif c == '?' + return emit(l, K"?") + elseif c == '$' + return lex_dollar(l) + elseif c == '⊻' + return lex_xor(l) + elseif c == '~' + return emit(l, K"~") + elseif c == EOF_CHAR + return emit(l, K"EndMarker") elseif (k = get(_unicode_ops, c, K"error")) != K"error" return emit(l, k) else From 411487ca9b0b4ff7fd31a8d9d6b48bf8799399d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1ll=20Haraldsson?= Date: Mon, 9 Oct 2023 19:55:03 +0000 Subject: [PATCH 2/6] Fix accidental extra copy-paste --- src/tokenize.jl | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/tokenize.jl b/src/tokenize.jl index dc2a18c9..32e9d13a 100644 --- a/src/tokenize.jl +++ b/src/tokenize.jl @@ -225,7 +225,7 @@ end @inline ishex(c::Char) = isdigit(c) || ('a' <= c <= 'f') || ('A' <= c <= 'F') @inline isbinary(c::Char) = c == '0' || c == '1' -@inline isoctal(c::Char) = '0' ≤ c ≤ '7' +@inline isoctal(c::Char) = '0' ≤ c ≤ '7' @inline iswhitespace(c::Char) = (isvalid(c) && Base.isspace(c)) || c === '\ufeff' struct StringState @@ -480,8 +480,6 @@ function _next_token(l::Lexer, c) return emit(l, K"{") elseif c == '}' return emit(l, K"}") - elseif c == '"' - return lex_quote(l); elseif c == '|' return lex_bar(l) elseif c == '&' @@ -494,8 +492,6 @@ function _next_token(l::Lexer, c) return lex_greater(l) elseif c == '<' return lex_less(l) - elseif c == '"' - return lex_quote(l) elseif c == '#' return lex_comment(l) elseif c == ':' @@ -514,7 +510,6 @@ function _next_token(l::Lexer, c) return lex_prime(l) elseif c == '÷' return lex_division(l) - elseif iswhitespace(c) return lex_whitespace(l, c) elseif is_identifier_start_char(c) From fb267bb96331494fc917068004607217d30df722 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1ll=20Haraldsson?= Date: Mon, 9 Oct 2023 20:44:59 +0000 Subject: [PATCH 3/6] Move more --- src/tokenize.jl | 64 +++++++++++++++++++++++++------------------------ 1 file changed, 33 insertions(+), 31 deletions(-) diff --git a/src/tokenize.jl b/src/tokenize.jl index 32e9d13a..846a6a87 100644 --- a/src/tokenize.jl +++ b/src/tokenize.jl @@ -466,64 +466,66 @@ function _next_token(l::Lexer, c) return emit(l, K"(") elseif c == ')' return emit(l, K")") - elseif c == ';' - return emit(l, K";") elseif c == ',' return emit(l, K",") + elseif c == ';' + return emit(l, K";") + elseif c == '?' + return emit(l, K"?") + elseif c == '@' + return emit(l, K"@") + elseif c == '{' + return emit(l, K"{") + elseif c == '}' + return emit(l, K"}") + elseif c == '~' + return emit(l, K"~") + elseif iswhitespace(c) + return lex_whitespace(l, c) elseif c == '+' - return lex_plus(l); + return lex_plus(l) elseif c == '-' return lex_minus(l) elseif c == '−' # \minus '−' treated as hyphen '-' return emit(l, accept(l, '=') ? K"-=" : K"-") - elseif c == '{' - return emit(l, K"{") - elseif c == '}' - return emit(l, K"}") + elseif is_identifier_start_char(c) + return lex_identifier(l, c) + elseif isdigit(c) + return lex_digit(l, K"Integer") + elseif c == '=' + return lex_equal(l) + elseif c == '#' + return lex_comment(l) + elseif c == '*' + return lex_star(l) elseif c == '|' return lex_bar(l) elseif c == '&' return lex_amper(l) - elseif c == '=' - return lex_equal(l) elseif c == '!' return lex_exclaim(l) elseif c == '>' return lex_greater(l) elseif c == '<' return lex_less(l) - elseif c == '#' - return lex_comment(l) + elseif c == '.' + return lex_dot(l) elseif c == ':' return lex_colon(l) + elseif c == '\'' + return lex_prime(l) + elseif c == '\\' + return lex_backslash(l) elseif c == '%' return lex_percent(l) elseif c == '/' return lex_forwardslash(l) - elseif c == '\\' - return lex_backslash(l) - elseif c == '.' - return lex_dot(l) - elseif c == '`' - return lex_backtick(l) - elseif c == '\'' - return lex_prime(l) elseif c == '÷' return lex_division(l) - elseif iswhitespace(c) - return lex_whitespace(l, c) - elseif is_identifier_start_char(c) - return lex_identifier(l, c) - elseif isdigit(c) - return lex_digit(l, K"Integer") - elseif c == '*' - return lex_star(l) + elseif c == '`' + return lex_backtick(l) elseif c == '^' return lex_circumflex(l) - elseif c == '@' - return emit(l, K"@") - elseif c == '?' - return emit(l, K"?") elseif c == '$' return lex_dollar(l) elseif c == '⊻' From ecd716b9220212c3c524114671c47f0e87c4d4c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1ll=20Haraldsson?= Date: Mon, 9 Oct 2023 21:03:16 +0000 Subject: [PATCH 4/6] Update tokenize.jl --- src/tokenize.jl | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/tokenize.jl b/src/tokenize.jl index 846a6a87..6144f1a8 100644 --- a/src/tokenize.jl +++ b/src/tokenize.jl @@ -482,16 +482,16 @@ function _next_token(l::Lexer, c) return emit(l, K"~") elseif iswhitespace(c) return lex_whitespace(l, c) + elseif is_identifier_start_char(c) + return lex_identifier(l, c) + elseif isdigit(c) + return lex_digit(l, K"Integer") elseif c == '+' return lex_plus(l) elseif c == '-' return lex_minus(l) elseif c == '−' # \minus '−' treated as hyphen '-' return emit(l, accept(l, '=') ? K"-=" : K"-") - elseif is_identifier_start_char(c) - return lex_identifier(l, c) - elseif isdigit(c) - return lex_digit(l, K"Integer") elseif c == '=' return lex_equal(l) elseif c == '#' @@ -530,8 +530,6 @@ function _next_token(l::Lexer, c) return lex_dollar(l) elseif c == '⊻' return lex_xor(l) - elseif c == '~' - return emit(l, K"~") elseif c == EOF_CHAR return emit(l, K"EndMarker") elseif (k = get(_unicode_ops, c, K"error")) != K"error" From bbc5346a00e3aa04022704555ac6b5a6c7c6cdbb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1ll=20Haraldsson?= Date: Mon, 9 Oct 2023 21:18:10 +0000 Subject: [PATCH 5/6] Move quote/fix - Update tokenize.jl --- src/tokenize.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/tokenize.jl b/src/tokenize.jl index 6144f1a8..c06f4ab6 100644 --- a/src/tokenize.jl +++ b/src/tokenize.jl @@ -479,11 +479,13 @@ function _next_token(l::Lexer, c) elseif c == '}' return emit(l, K"}") elseif c == '~' - return emit(l, K"~") + return emit(l, K"~") elseif iswhitespace(c) return lex_whitespace(l, c) elseif is_identifier_start_char(c) return lex_identifier(l, c) + elseif c == '"' + return lex_quote(l); elseif isdigit(c) return lex_digit(l, K"Integer") elseif c == '+' From 6fe2f582ebf36d159d033d917edfa094b7498b45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1ll=20Haraldsson?= Date: Mon, 9 Oct 2023 21:31:15 +0000 Subject: [PATCH 6/6] Fix strange bug - Update tokenize.jl --- src/tokenize.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/tokenize.jl b/src/tokenize.jl index c06f4ab6..cbe9e271 100644 --- a/src/tokenize.jl +++ b/src/tokenize.jl @@ -484,10 +484,10 @@ function _next_token(l::Lexer, c) return lex_whitespace(l, c) elseif is_identifier_start_char(c) return lex_identifier(l, c) - elseif c == '"' - return lex_quote(l); elseif isdigit(c) - return lex_digit(l, K"Integer") + return lex_digit(l, K"Integer") + elseif c == '"' + return lex_quote(l) elseif c == '+' return lex_plus(l) elseif c == '-'