diff --git a/src/check/parse/tokenize/build.zig b/src/check/parse/tokenize/build.zig
index 6b96d5b409..2e60797952 100644
--- a/src/check/parse/tokenize/build.zig
+++ b/src/check/parse/tokenize/build.zig
@@ -14,21 +14,9 @@ pub fn build(b: *std.Build) void {
     // between Debug, ReleaseSafe, ReleaseFast, and ReleaseSmall. Here we do not
     // set a preferred release mode, allowing the user to decide how to optimize.
     const optimize = b.standardOptimizeOption(.{});
-    const zg = b.dependency("zg", .{});
-
-    const lib = b.addStaticLibrary(.{
-        .name = "tokenize",
-        // In this case the main source file is merely a path, however, in more
-        // complicated build scripts, this could be a generated file.
-        .root_source_file = b.path("src/root.zig"),
-        .target = target,
-        .optimize = optimize,
-    });
-    // This declares intent for the library to be installed into the standard
-    // location when the user invokes the "install" step (the default step when
-    // running `zig build`).
-    b.installArtifact(lib);
+    // Zig unicode library - https://codeberg.org/atman/zg
+    const zg = b.dependency("zg", .{});
 
     const exe = b.addExecutable(.{
         .name = "tokenize",
@@ -37,6 +25,8 @@ pub fn build(b: *std.Build) void {
         .optimize = optimize,
     });
 
+    // Unicode data is required for the tokenizer to work correctly.
+    // This is "general category" data - e.g. is this a lowercase letter, a digit, etc.
     exe.root_module.addImport("GenCatData", zg.module("GenCatData"));
 
     // This declares intent for the executable to be installed into the
@@ -67,16 +57,6 @@ pub fn build(b: *std.Build) void {
     const run_step = b.step("run", "Run the app");
     run_step.dependOn(&run_cmd.step);
 
-    // Creates a step for unit testing. This only builds the test executable
-    // but does not run it.
-    const lib_unit_tests = b.addTest(.{
-        .root_source_file = b.path("src/root.zig"),
-        .target = target,
-        .optimize = optimize,
-    });
-
-    const run_lib_unit_tests = b.addRunArtifact(lib_unit_tests);
-
     const exe_unit_tests = b.addTest(.{
         .root_source_file = b.path("src/main.zig"),
         .target = target,
@@ -89,6 +69,5 @@ pub fn build(b: *std.Build) void {
     // the `zig build --help` menu, providing a way for the user to request
     // running the unit tests.
     const test_step = b.step("test", "Run unit tests");
-    test_step.dependOn(&run_lib_unit_tests.step);
     test_step.dependOn(&run_exe_unit_tests.step);
 }
diff --git a/src/check/parse/tokenize/src/main.zig b/src/check/parse/tokenize/src/main.zig
index 46da698538..c8122c5373 100644
--- a/src/check/parse/tokenize/src/main.zig
+++ b/src/check/parse/tokenize/src/main.zig
@@ -1,179 +1,192 @@
 const std = @import("std");
-const GenCatData = @import("GenCatData");
-
-/// The token kinds
-pub const T = enum(u8) {
-    EndOfFile,
-
-    // primitives
-    Float,
-    String,
-    SingleQuote,
-
-    // a part of a string interpolation; generally you'll see something like:
-    // StringBegin, OpenCurly, <expr>, CloseCurly, StringPart, OpenCurly, <expr>, CloseCurly
-    StringBegin,
-    StringPart,
-
-    // These are not technically valid, but we can have the formatter fix them up.
-    SingleQuoteBegin,
-    SingleQuotePart,
-
-    UpperIdent,
-    LowerIdent,
-    Underscore,
-    DotLowerIdent,
-    DotNumber,
-
-    OpenRound,
-    CloseRound,
-    OpenSquare,
-    CloseSquare,
-    OpenCurly,
-    CloseCurly,
-
-    OpPlus,
-    OpStar,
-    OpPizza,
-    OpAssign,
-    OpBinaryMinus, // trailing whitespace
-    OpUnaryMinus, // no trailing whitespace
-    OpNotEquals,
-    OpBang,
-    OpAnd,
-    OpAmpersand,
-    OpQuestion,
-    OpOr,
-    OpBar,
-    OpDoubleSlash,
-    OpSlash,
-    OpPercent,
-    OpCaret,
-    OpGreaterThanOrEq,
-    OpGreaterThan,
-    OpLessThanOrEq,
-    OpBackArrow,
-    OpLessThan,
-    OpEquals,
-    OpColonEqual,
-
-    Comma,
-    Dot,
-    DoubleDot,
-    TripleDot,
-    OpColon,
-    OpArrow,
-    OpBackslash,
-
-    // Keywords
-    KwIf,
-    KwThen,
-    KwElse,
-    KwWhen,
-    KwIs,
-    KwAs,
-    KwDbg,
-    KwCrash,
-    KwHas,
-    KwWhere,
-    KwImplements,
-    KwExposes,
-    KwImports,
-    KwImport,
-    KwWith,
-    KwGenerates,
-    KwPackage,
-    KwPackages,
-    KwRequires,
-    KwProvides,
-    KwTo,
-    KwInterface,
-    KwApp,
-    KwPlatform,
-    KwHosted,
-    KwDebug,
-    KwExpect,
-
-    NamedUnderscore,
-    OpaqueName,
-    Int,
-    DotUpperIdent,
-    NoSpaceDotNumber,
-    NoSpaceDotLowerIdent,
-    NoSpaceDotUpperIdent,
-};
-/// A helper function equivalent to the Rust impl T { is_keyword }.
-pub fn isKeyword(tok: T) bool {
-    return switch (tok) {
-        .KwIf,
-        .KwThen,
-        .KwElse,
-        .KwWhen,
-        .KwIs,
-        .KwAs,
-        .KwDbg,
-        .KwExpect,
-        .KwCrash,
-        .KwHas,
-        .KwWhere,
-        .KwImplements,
-        .KwExposes,
-        .KwImport,
-        .KwImports,
-        .KwWith,
-        .KwGenerates,
-        .KwPackage,
-        .KwPackages,
-        .KwRequires,
-        .KwProvides,
-        .KwTo,
-        .KwInterface,
-        .KwApp,
-        .KwPlatform,
-        .KwHosted,
-        .KwExpectFx,
-        => true,
-        else => false,
-    };
-}
+// Unicode data tables - allows us to identify upper/lowercase letters for non-ASCII characters.
+const GenCatData = @import("GenCatData");
 
 pub const Token = struct {
-    kind: T,
+    tag: Tag,
     offset: usize,
     length: usize,
 
-    pub const keywords = std.StaticStringMap(T).initComptime(.{
-        .{ "if", .KwIf },
-        .{ "then", .KwThen },
-        .{ "else", .KwElse },
-        .{ "when", .KwWhen },
-        .{ "is", .KwIs },
+    pub const Tag = enum(u8) {
+        EndOfFile,
+
+        // primitives
+        Float,
+        String,
+        SingleQuote,
+
+        // a part of a string interpolation; generally you'll see something like:
+        // StringBegin, OpenCurly, <expr>, CloseCurly, StringPart, OpenCurly, <expr>, CloseCurly
+        StringBegin,
+        StringPart,
+
+        // These are not technically valid, but we can have the formatter fix them up.
+        SingleQuoteBegin,
+        SingleQuotePart,
+
+        UpperIdent,
+        LowerIdent,
+        Underscore,
+        DotLowerIdent,
+        DotInt,
+        DotUpperIdent,
+        NoSpaceDotInt,
+        NoSpaceDotLowerIdent,
+        NoSpaceDotUpperIdent,
+
+        NamedUnderscore,
+        OpaqueName,
+        Int,
+
+        OpenRound,
+        CloseRound,
+        OpenSquare,
+        CloseSquare,
+        OpenCurly,
+        CloseCurly,
+
+        OpPlus,
+        OpStar,
+        OpPizza,
+        OpAssign,
+        OpBinaryMinus, // trailing whitespace
+        OpUnaryMinus, // no trailing whitespace
+        OpNotEquals,
+        OpBang,
+        OpAnd,
+        OpAmpersand,
+        OpQuestion,
+        OpOr,
+        OpBar,
+        OpDoubleSlash,
+        OpSlash,
+        OpPercent,
+        OpCaret,
+        OpGreaterThanOrEq,
+        OpGreaterThan,
+        OpLessThanOrEq,
+        OpBackArrow,
+        OpLessThan,
+        OpEquals,
+        OpColonEqual,
+
+        Comma,
+        Dot,
+        DoubleDot,
+        TripleDot,
+        OpColon,
+        OpArrow,
+        OpBackslash,
+
+        // Keywords
+        KwApp,
+        KwAs,
+        KwCrash,
+        KwDbg,
+        KwDebug,
+        KwElse,
+        KwExpect,
+        KwExposes,
+        KwGenerates,
+        KwHas,
+        KwHosted,
+        KwIf,
+        KwImplements,
+        KwImport,
+        KwImports,
+        KwInterface,
+        KwIs,
+        KwModule,
+        KwPackage,
+        KwPackages,
+        KwPlatform,
+        KwProvides,
+        KwRequires,
+        KwThen,
+        KwTo,
+        KwWhen,
+        KwWhere,
+        KwWith,
+    };
+
+    pub const keywords = std.StaticStringMap(Tag).initComptime(.{
+        .{ "app", .KwApp },
         .{ "as", .KwAs },
+        .{ "crash", .KwCrash },
         .{ "dbg", .KwDbg },
+        .{ "else", .KwElse },
         .{ "expect", .KwExpect },
-        .{ "crash", .KwCrash },
+        .{ "exposes", .KwExposes },
+        .{ "generates", .KwGenerates },
         .{ "has", .KwHas },
-        .{ "where", .KwWhere },
+        .{ "hosted", .KwHosted },
+        .{ "if", .KwIf },
         .{ "implements", .KwImplements },
-        .{ "exposes", .KwExposes },
         .{ "import", .KwImport },
         .{ "imports", .KwImports },
-        .{ "with", .KwWith },
-        .{ "generates", .KwGenerates },
+        .{ "interface", .KwInterface },
+        .{ "is", .KwIs },
+        .{ "module", .KwModule },
         .{ "package", .KwPackage },
         .{ "packages", .KwPackages },
-        .{ "requires", .KwRequires },
+        .{ "platform", .KwPlatform },
         .{ "provides", .KwProvides },
+        .{ "requires", .KwRequires },
+        .{ "then", .KwThen },
         .{ "to", .KwTo },
-        .{ "interface", .KwInterface },
-        .{ "app", .KwApp },
-        .{ "platform", .KwPlatform },
-        .{ "hosted", .KwHosted },
+        .{ "when", .KwWhen },
+        .{ "where", .KwWhere },
+        .{ "with", .KwWith },
     });
 
-    pub fn getKeyword(bytes: []const u8) ?T {
-        return keywords.get(bytes);
+    pub const valid_number_suffixes = std.StaticStringMap(void).initComptime(.{
+        .{ "dec", .{} },
+        .{ "i128", .{} },
+        .{ "i16", .{} },
+        .{ "i32", .{} },
+        .{ "i64", .{} },
+        .{ "i8", .{} },
+        .{ "nat", .{} },
+        .{ "u128", .{} },
+        .{ "u16", .{} },
+        .{ "u32", .{} },
+        .{ "u64", .{} },
+        .{ "u8", .{} },
+    });
+
+    pub fn isKeyword(tok: Tag) bool {
+        return switch (tok) {
+            .KwApp,
+            .KwAs,
+            .KwCrash,
+            .KwDbg,
+            .KwDebug,
+            .KwElse,
+            .KwExpect,
+            .KwExposes,
+            .KwGenerates,
+            .KwHas,
+            .KwHosted,
+            .KwIf,
+            .KwImplements,
+            .KwImport,
+            .KwImports,
+            .KwInterface,
+            .KwIs,
+            .KwModule,
+            .KwPackage,
+            .KwPackages,
+            .KwPlatform,
+            .KwProvides,
+            .KwRequires,
+            .KwThen,
+            .KwTo,
+            .KwWhen,
+            .KwWhere,
+            .KwWith,
+            => true,
+            else => false,
+        };
     }
 };
@@ -226,12 +239,17 @@ pub const TokenizedBuffer = struct {
         };
     }
 
-    /// Pushes a token with the given kind, token offset, and length.
-    pub fn pushToken(self: *TokenizedBuffer, kind: T, tok_offset: usize, tok_length: usize) !void {
+    pub fn deinit(self: *TokenizedBuffer) void {
+        self.tokens.deinit(self.allocator);
+        self.lines.deinit();
+    }
+
+    /// Pushes a token with the given tag, token offset, and length.
+    pub fn pushToken(self: *TokenizedBuffer, tag: Token.Tag, tok_offset: u32, tok_length: u32) !void {
         try self.tokens.append(self.allocator, .{
-            .kind = kind,
-            .offset = @intCast(tok_offset),
-            .length = @intCast(tok_length),
+            .tag = tag,
+            .offset = tok_offset,
+            .length = tok_length,
         });
     }
 
@@ -241,47 +259,44 @@ pub const TokenizedBuffer = struct {
     }
 };
 
-/// A comment is represented by its begin and end offsets.
 pub const Comment = struct {
     begin: usize,
     end: usize,
 };
 
-/// MessageKind enumerates different diagnostic messages.
-pub const MessageKind = enum {
-    MisplacedCarriageReturn,
-    AsciiControl,
-    LeadingZero,
-    UnknownToken,
-    OpaqueNameWithoutName,
-    UppercaseBase,
-    InvalidUnicodeEscapeSequence,
-    InvalidEscapeSequence,
-    UnclosedString,
-    UnclosedSingleQuote,
-    BadNumberSuffix,
-    OverClosedBrace,
-    MismatchedBrace,
-};
-
-const UnicodeKind = enum {
-    LetterUpper,
-    LetterNotUpper,
-    Digit,
-    Other,
-    Invalid,
-};
+const Unicode = struct {
+    tag: Tag,
+    length: u32,
 
-const UnicodeInfo = struct {
-    kind: UnicodeKind,
-    length: usize,
+    const Tag = enum {
+        LetterUpper,
+        LetterNotUpper,
+        Digit,
+        Other,
+        Invalid,
+    };
 };
 
-/// A message with its kind and offset.
-pub const Message = struct {
-    kind: MessageKind,
+pub const Diagnostic = struct {
+    tag: Tag,
     begin: u32,
     end: u32,
+
+    pub const Tag = enum {
+        MisplacedCarriageReturn,
+        AsciiControl,
+        LeadingZero,
+        UnknownToken,
+        OpaqueNameWithoutName,
+        UppercaseBase,
+        InvalidUnicodeEscapeSequence,
+        InvalidEscapeSequence,
+        UnclosedString,
+        UnclosedSingleQuote,
+        BadNumberSuffix,
+        OverClosedBrace,
+        MismatchedBrace,
+    };
 };
 
 /// The cursor is our current position in the input text, and it collects messages.
@@ -292,13 +307,13 @@ pub const Message = struct {
 /// allocate a larger slice and tokenize again.
 pub const Cursor = struct {
     buf: []const u8,
-    pos: usize,
-    messages: []Message,
+    pos: u32,
+    messages: []Diagnostic,
     message_count: usize,
     gc: *GenCatData,
 
     /// Initialize a Cursor with the given input buffer and a pre-allocated messages slice.
-    pub fn init(buf: []const u8, messages: []Message, gc: *GenCatData) Cursor {
+    pub fn init(buf: []const u8, messages: []Diagnostic, gc: *GenCatData) Cursor {
         return Cursor{
             .buf = buf,
             .pos = 0,
@@ -308,26 +323,19 @@ pub const Cursor = struct {
         };
     }
 
-    fn pushMessageHere(self: *Cursor, kind: MessageKind) void {
-        if (self.message_count < self.messages.len) {
-            self.messages[self.message_count] = Message{
-                .kind = kind,
-                .begin = @intCast(self.pos),
-                .end = @intCast(self.pos),
-            };
-            self.message_count += 1;
-        }
+    fn pushMessageHere(self: *Cursor, tag: Diagnostic.Tag) void {
+        self.pushMessage(tag, self.pos, self.pos);
     }
 
-    fn pushMessage(self: *Cursor, kind: MessageKind, begin: u32, end: u32) void {
+    fn pushMessage(self: *Cursor, tag: Diagnostic.Tag, begin: u32, end: u32) void {
         if (self.message_count < self.messages.len) {
-            self.messages[self.message_count] = Message{
-                .kind = kind,
+            self.messages[self.message_count] = .{
+                .tag = tag,
                 .begin = begin,
                 .end = end,
             };
-            self.message_count += 1;
         }
+        self.message_count += 1;
     }
 
     /// Returns the current byte, or null if at the end.
@@ -355,15 +363,17 @@ pub const Cursor = struct {
     }
 
     /// Requires that the next byte is `ch`, otherwise pushes a message.
-    pub fn require(self: *Cursor, ch: u8, kind: MessageKind) void {
+    pub fn require(self: *Cursor, ch: u8, tag: Diagnostic.Tag) void {
         if (self.peek() == ch) {
             self.pos += 1;
         } else {
-            self.pushMessageHere(kind);
+            self.pushMessageHere(tag);
         }
     }
 
     /// Chomps “trivia” (whitespace, comments, etc.) and returns an optional indent.
+    /// If the chomped trivia includes a newline, returns the indent of the next (real) line.
+    /// Otherwise, returns null.
     pub fn chompTrivia(self: *Cursor) ?Indent {
         var sawNewline = false;
         var indent = Indent.init();
@@ -411,28 +421,32 @@ pub const Cursor = struct {
         }
     }
 
-    pub fn peekUnicode(self: *Cursor) UnicodeInfo {
-        const len3 = std.unicode.utf8ByteSequenceLength(self.buf[self.pos]) catch {
-            return .{ .kind = .Invalid, .length = 1 };
+    /// Decodes a Unicode character starting at `self.pos` and returns its category.
+    /// Note this assumes the caller has already peek'd the first byte.
+    pub fn decodeUnicode(self: *Cursor, first_byte: u8) Unicode {
+        std.debug.assert(first_byte == self.buf[self.pos]);
+        const len3 = std.unicode.utf8ByteSequenceLength(first_byte) catch {
+            return .{ .tag = .Invalid, .length = 1 };
         };
-        const len: usize = @intCast(len3);
+        const len: u32 = @intCast(len3);
         const utf8_char = std.unicode.utf8Decode(self.buf[self.pos .. self.pos + len]) catch {
-            return .{ .kind = .Invalid, .length = 1 };
+            return .{ .tag = .Invalid, .length = len };
         };
         switch (self.gc.gc(utf8_char)) {
-            .Lu, .Lt => return .{ .kind = .LetterUpper, .length = len },
-            .Ll, .Lm, .Lo => return .{ .kind = .LetterNotUpper, .length = len },
-            .Nd, .Nl, .No => return .{ .kind = .Digit, .length = len },
-            else => return .{ .kind = .Other, .length = len },
+            .Lu, .Lt => return .{ .tag = .LetterUpper, .length = len },
+            .Ll, .Lm, .Lo => return .{ .tag = .LetterNotUpper, .length = len },
+            .Nd, .Nl, .No => return .{ .tag = .Digit, .length = len },
+            else => return .{ .tag = .Other, .length = len },
         }
     }
 
-    pub fn chompNumber(self: *Cursor, b: u8) T {
+    pub fn chompNumber(self: *Cursor, initialDigit: u8) Token.Tag {
         // Consume the initial digit.
+        std.debug.assert(initialDigit == self.buf[self.pos]);
         self.pos += 1;
-        var tok: T = undefined;
-        if (b == '0') {
+        var tok: Token.Tag = undefined;
+        if (initialDigit == '0') {
             while (true) {
                 const c = self.peek() orelse 0;
                 switch (c) {
@@ -441,7 +455,7 @@ pub const Cursor = struct {
                         self.pos += 1;
                         self.chompIntegerBase16();
                         self.chompNumberSuffix();
-                        tok = T.Int;
+                        tok = .Int;
                         break;
                     },
                     'o', 'O' => {
@@ -449,7 +463,7 @@ pub const Cursor = struct {
                         self.pos += 1;
                         self.chompIntegerBase8();
                         self.chompNumberSuffix();
-                        tok = T.Int;
+                        tok = .Int;
                         break;
                     },
                     'b', 'B' => {
@@ -457,14 +471,14 @@ pub const Cursor = struct {
                         self.pos += 1;
                         self.chompIntegerBase2();
                         self.chompNumberSuffix();
-                        tok = T.Int;
+                        tok = .Int;
                         break;
                     },
                     '0'...'9' => {
                         self.pushMessageHere(.LeadingZero);
                         _ = self.chompNumberBase10();
                         self.chompNumberSuffix();
-                        tok = T.Int;
+                        tok = .Int;
                         break;
                     },
                     '_' => {
@@ -474,12 +488,12 @@ pub const Cursor = struct {
                     '.' => {
                         self.pos += 1;
                         _ = self.chompIntegerBase10();
-                        tok = T.Float;
+                        tok = .Float;
                         _ = self.chompExponent();
                         break;
                     },
                     else => {
-                        tok = T.Int;
+                        tok = .Int;
                         break;
                     },
                 }
@@ -487,7 +501,7 @@ pub const Cursor = struct {
         } else {
             _ = self.chompNumberBase10();
             self.chompNumberSuffix();
-            tok = T.Int;
+            tok = .Int;
         }
         return tok;
     }
@@ -510,7 +524,7 @@ pub const Cursor = struct {
             return;
        }
         const start = self.pos;
-        var pos: usize = self.pos + 1;
+        var pos = self.pos + 1;
         while (pos < self.buf.len) : (pos += 1) {
             const c = self.buf[pos];
             if (std.ascii.isAlphabetic(c) or std.ascii.isDigit(c)) {
@@ -520,34 +534,22 @@ pub const Cursor = struct {
             }
         }
         const suffix = self.buf[start..pos];
-        if (!(std.mem.eql(u8, suffix, "dec") or
-            std.mem.eql(u8, suffix, "i128") or
-            std.mem.eql(u8, suffix, "i16") or
-            std.mem.eql(u8, suffix, "i32") or
-            std.mem.eql(u8, suffix, "i64") or
-            std.mem.eql(u8, suffix, "i8") or
-            std.mem.eql(u8, suffix, "nat") or
-            std.mem.eql(u8, suffix, "u128") or
-            std.mem.eql(u8, suffix, "u16") or
-            std.mem.eql(u8, suffix, "u32") or
-            std.mem.eql(u8, suffix, "u64") or
-            std.mem.eql(u8, suffix, "u8")))
-        {
+        if (Token.valid_number_suffixes.get(suffix) == null) {
             self.pushMessageHere(.BadNumberSuffix);
         }
         self.pos = pos;
     }
 
-    pub fn chompNumberBase10(self: *Cursor) T {
+    pub fn chompNumberBase10(self: *Cursor) Token.Tag {
         self.chompIntegerBase10();
-        var token_type: T = T.Int;
+        var token_type: Token.Tag = .Int;
         if (self.peek() orelse 0 == '.') {
             self.pos += 1;
             self.chompIntegerBase10();
-            token_type = T.Float;
+            token_type = .Float;
         }
         if (self.chompExponent()) {
-            token_type = T.Float;
+            token_type = .Float;
         }
         return token_type;
     }
@@ -600,38 +602,55 @@ pub const Cursor = struct {
         }
     }
 
-    pub fn chompIdentLower(self: *Cursor) T {
+    /// Chomps an identifier starting with a lowercase letter.
+    /// Also checks if the resulting identifier is a keyword.
+    /// Returns the token type - LowerIdent or Kw*
+    pub fn chompIdentLower(self: *Cursor) Token.Tag {
         const start = self.pos;
         var kwCheck: bool = true;
         while (self.peek()) |c| {
             if (c >= 'a' and c <= 'z') {
                 self.pos += 1;
-            } else if ((c >= 'A' and c <= 'Z') or (c >= '0' and c <= '9')) {
+            } else if ((c >= 'A' and c <= 'Z') or (c >= '0' and c <= '9') or c == '_') {
                 self.pos += 1;
                 kwCheck = false;
             } else {
-                break;
+                const info = self.decodeUnicode(c);
+                if (info.tag != .Other and info.tag != .Invalid) {
+                    self.pos += info.length;
+                    kwCheck = false;
+                } else {
+                    break;
+                }
             }
         }
-        if (kwCheck and (self.pos - start) <= 10) {
+        if (kwCheck) {
             const ident = self.buf[start..self.pos];
-            const kw = Token.getKeyword(ident);
-            return kw orelse T.LowerIdent;
+            const kw = Token.keywords.get(ident);
+            return kw orelse .LowerIdent;
         } else {
-            return T.LowerIdent;
+            return .LowerIdent;
        }
     }
 
-    pub fn chompIdentUpper(self: *Cursor) T {
+    /// Chomps a general identifier - either upper or lower case.
+    /// Doesn't check if the identifier is a keyword, since we assume the caller already
+    /// determined that was impossible (e.g. because the first character was uppercase),
+    /// or otherwise not relevant.
+    pub fn chompIdentGeneral(self: *Cursor) void {
         while (self.pos < self.buf.len) {
             const c = self.buf[self.pos];
-            if ((c >= 'a' and c <= 'z') or (c >= 'A' and c <= 'Z') or (c >= '0' and c <= '9')) {
+            if ((c >= 'a' and c <= 'z') or (c >= 'A' and c <= 'Z') or (c >= '0' and c <= '9') or c == '_') {
                 self.pos += 1;
             } else {
-                break;
+                const info = self.decodeUnicode(c);
+                if (info.tag != .Other and info.tag != .Invalid) {
+                    self.pos += info.length;
+                } else {
+                    break;
+                }
             }
         }
-        return T.UpperIdent;
     }
 
     pub fn chompInteger(self: *Cursor) void {
@@ -644,29 +663,13 @@ pub const Cursor = struct {
             }
         }
     }
-
-    pub fn chompIdentGeneral(self: *Cursor) void {
-        while (self.pos < self.buf.len) {
-            const c = self.buf[self.pos];
-            if ((c >= 'a' and c <= 'z') or (c >= 'A' and c <= 'Z') or (c >= '0' and c <= '9') or c == '_') {
-                self.pos += 1;
-            } else {
-                const info = self.peekUnicode();
-                if (info.kind != UnicodeKind.Other and info.kind != UnicodeKind.Invalid) {
-                    self.pos += info.length;
-                } else {
-                    break;
-                }
-            }
-        }
-    }
 };
 
 /// The output of the tokenizer.
 pub const TokenOutput = struct {
     tokens: TokenizedBuffer,
     /// The messages slice is the same one provided by the caller.
-    messages: []Message,
+    messages: []Diagnostic,
     message_count: usize,
 };
 
@@ -688,7 +691,7 @@ pub const Tokenizer = struct {
 
     /// Creates a new Tokenizer.
     /// Note that the caller must also provide a pre-allocated messages buffer.
-    pub fn init(text: []const u8, messages: []Message, gc: *GenCatData, allocator: std.mem.Allocator) !Tokenizer {
+    pub fn init(text: []const u8, messages: []Diagnostic, gc: *GenCatData, allocator: std.mem.Allocator) !Tokenizer {
         const cursor = Cursor.init(text, messages, gc);
         var output = try TokenizedBuffer.init(allocator);
         // Push an initial line with indent 0.
@@ -703,19 +706,23 @@ pub const Tokenizer = struct {
         };
     }
 
-    pub fn destroy(self: *Tokenizer) void {
-        // self.output.kinds.deinit();
-        // self.output.offsets.deinit();
-        // self.output.lengths.deinit();
-        // self.output.lines.deinit();
+    pub fn deinit(self: *Tokenizer) void {
+        self.output.deinit();
         self.stack.deinit();
-        self.gc.deinit();
     }
 
-    // A simplified equivalent to the Rust macros:
-    fn pushToken(self: *Tokenizer, kind: T, start: usize) !void {
+    pub fn finish_and_deinit(self: Tokenizer) TokenOutput {
+        self.stack.deinit();
+        return .{
+            .tokens = self.output,
+            .messages = self.cursor.messages,
+            .message_count = self.cursor.message_count,
+        };
+    }
+
+    fn pushToken(self: *Tokenizer, tag: Token.Tag, start: u32) !void {
         const len = self.cursor.pos - start;
-        try self.output.pushToken(kind, start, len);
+        try self.output.pushToken(tag, start, len);
     }
 
     fn consumeBraceCloseAndContinueStringInterp(self: *Tokenizer, brace: BraceKind) !void {
@@ -760,7 +767,7 @@ pub const Tokenizer = struct {
         }
     }
 
-    /// The main tokenize loop.
+    /// The main tokenize loop. This loops over the whole input buffer, tokenizing as it goes.
     pub fn tokenize(self: *Tokenizer) !void {
         var sawWhitespace: bool = true;
         while (self.cursor.pos < self.cursor.buf.len) {
@@ -787,40 +794,40 @@ pub const Tokenizer = struct {
                     if (n == '.') {
                         if (self.cursor.peekAt(2) == '.') {
                             self.cursor.pos += 3;
-                            try self.output.pushToken(T.TripleDot, start, 3);
+                            try self.output.pushToken(.TripleDot, start, 3);
                         } else {
                             self.cursor.pos += 2;
-                            try self.output.pushToken(T.DoubleDot, start, 2);
+                            try self.output.pushToken(.DoubleDot, start, 2);
                         }
                     } else if (n >= '0' and n <= '9') {
                         self.cursor.pos += 1;
                         self.cursor.chompInteger();
                         const len = self.cursor.pos - start;
-                        try self.output.pushToken(if (sp) T.DotNumber else T.NoSpaceDotNumber, start, len);
+                        try self.output.pushToken(if (sp) .DotInt else .NoSpaceDotInt, start, len);
                     } else if (n >= 'a' and n <= 'z') {
                         self.cursor.pos += 1;
                         self.cursor.chompIdentGeneral();
                         const len = self.cursor.pos - start;
-                        try self.output.pushToken(if (sp) T.DotLowerIdent else T.NoSpaceDotLowerIdent, start, len);
+                        try self.output.pushToken(if (sp) .DotLowerIdent else .NoSpaceDotLowerIdent, start, len);
                     } else if (n >= 'A' and n <= 'Z') {
                         self.cursor.pos += 1;
                         self.cursor.chompIdentGeneral();
                         const len = self.cursor.pos - start;
-                        try self.output.pushToken(if (sp) T.DotUpperIdent else T.NoSpaceDotUpperIdent, start, len);
+                        try self.output.pushToken(if (sp) .DotUpperIdent else .NoSpaceDotUpperIdent, start, len);
                     } else if (n >= 0b11000000 and n <= 0xff) {
-                        const info = self.cursor.peekUnicode();
-                        switch (info.kind) {
+                        const info = self.cursor.decodeUnicode(n);
+                        switch (info.tag) {
                             .LetterUpper => {
                                 self.cursor.pos += info.length;
                                 self.cursor.chompIdentGeneral();
                                 const len = self.cursor.pos - start;
-                                try self.output.pushToken(if (sp) T.DotUpperIdent else T.NoSpaceDotUpperIdent, start, len);
+                                try self.output.pushToken(if (sp) .DotUpperIdent else .NoSpaceDotUpperIdent, start, len);
                             },
                             .LetterNotUpper => {
                                 self.cursor.pos += info.length;
                                 self.cursor.chompIdentGeneral();
                                 const len = self.cursor.pos - start;
-                                try self.output.pushToken(if (sp) T.DotLowerIdent else T.NoSpaceDotLowerIdent, start, len);
+                                try self.output.pushToken(if (sp) .DotLowerIdent else .NoSpaceDotLowerIdent, start, len);
                             },
                             else => {
                                 self.cursor.pos += info.length;
@@ -829,13 +836,13 @@ pub const Tokenizer = struct {
                        }
                    } else if (n == '{') {
                        self.cursor.pos += 1;
-                        try self.output.pushToken(T.Dot, start, 2);
+                        try self.output.pushToken(.Dot, start, 2);
                    } else {
                        return error.UnhandledToken;
                    }
                } else {
                    self.cursor.pos += 1;
-                    try self.output.pushToken(T.Dot, start, 1);
+                    try self.output.pushToken(.Dot, start, 1);
                }
            },
 
@@ -845,25 +852,25 @@ pub const Tokenizer = struct {
                if (next) |n| {
                    if (n == '>') {
                        self.cursor.pos += 2;
-                        try self.output.pushToken(T.OpArrow, start, 2);
+                        try self.output.pushToken(.OpArrow, start, 2);
                    } else if (n == ' ' or n == '\t' or n == '\n' or n == '\r' or n == '#') {
                        self.cursor.pos += 1;
-                        try self.output.pushToken(T.OpBinaryMinus, start, 1);
+                        try self.output.pushToken(.OpBinaryMinus, start, 1);
                    } else if (n >= '0' and n <= '9' and sp) {
                        self.cursor.pos += 1;
                        while (self.cursor.pos < self.cursor.buf.len and std.ascii.isDigit(self.cursor.buf[self.cursor.pos])) {
                            self.cursor.pos += 1;
                        }
                        const len = self.cursor.pos - start;
-                        try self.output.pushToken(T.Int, start, len);
+                        try self.output.pushToken(.Int, start, len);
                    } else {
                        self.cursor.pos += 1;
-                        const tokenType = if (sp) T.OpUnaryMinus else T.OpBinaryMinus;
+                        const tokenType: Token.Tag = if (sp) .OpUnaryMinus else .OpBinaryMinus;
                        try self.output.pushToken(tokenType, start, 1);
                    }
                } else {
                    self.cursor.pos += 1;
-                    try self.output.pushToken(if (sp) T.OpUnaryMinus else T.OpBinaryMinus, start, 1);
+                    try self.output.pushToken(if (sp) .OpUnaryMinus else .OpBinaryMinus, start, 1);
                }
            },
 
@@ -871,10 +878,10 @@ pub const Tokenizer = struct {
            '!' => {
                if (self.cursor.peekAt(1) == '=') {
                    self.cursor.pos += 2;
-                    try self.output.pushToken(T.OpNotEquals, start, 2);
+                    try self.output.pushToken(.OpNotEquals, start, 2);
                } else {
                    self.cursor.pos += 1;
-                    try self.output.pushToken(T.OpBang, start, 1);
+                    try self.output.pushToken(.OpBang, start, 1);
                }
            },
 
@@ -882,88 +889,88 @@ pub const Tokenizer = struct {
            '&' => {
                if (self.cursor.peekAt(1) == '&') {
                    self.cursor.pos += 2;
-                    try self.output.pushToken(T.OpAnd, start, 2);
+                    try self.output.pushToken(.OpAnd, start, 2);
                } else {
                    self.cursor.pos += 1;
-                    try self.output.pushToken(T.OpAmpersand, start, 1);
+                    try self.output.pushToken(.OpAmpersand, start, 1);
                }
            },
 
            // Comma (,)
            ',' => {
                self.cursor.pos += 1;
-                try self.output.pushToken(T.Comma, start, 1);
+                try self.output.pushToken(.Comma, start, 1);
            },
 
            // Question mark (?)
            '?' => {
                self.cursor.pos += 1;
-                try self.output.pushToken(T.OpQuestion, start, 1);
+                try self.output.pushToken(.OpQuestion, start, 1);
            },
 
            // Pipe (|)
            '|' => {
                if (self.cursor.peekAt(1) == '|') {
                    self.cursor.pos += 2;
-                    try self.output.pushToken(T.OpOr, start, 2);
+                    try self.output.pushToken(.OpOr, start, 2);
                } else if (self.cursor.peekAt(1) == '>') {
                    self.cursor.pos += 2;
-                    try self.output.pushToken(T.OpPizza, start, 2);
+                    try self.output.pushToken(.OpPizza, start, 2);
                } else {
                    self.cursor.pos += 1;
-                    try self.output.pushToken(T.OpBar, start, 1);
+                    try self.output.pushToken(.OpBar, start, 1);
                }
            },
 
            // Plus (+)
            '+' => {
                self.cursor.pos += 1;
-                try self.output.pushToken(T.OpPlus, start, 1);
+                try self.output.pushToken(.OpPlus, start, 1);
            },
 
            // Star (*)
            '*' => {
                self.cursor.pos += 1;
-                try self.output.pushToken(T.OpStar, start, 1);
+                try self.output.pushToken(.OpStar, start, 1);
            },
 
            // Slash (/)
            '/' => {
                if (self.cursor.peekAt(1) == '/') {
                    self.cursor.pos += 2;
-                    try self.output.pushToken(T.OpDoubleSlash, start, 2);
+                    try self.output.pushToken(.OpDoubleSlash, start, 2);
                } else {
                    self.cursor.pos += 1;
-                    try self.output.pushToken(T.OpSlash, start, 1);
+                    try self.output.pushToken(.OpSlash, start, 1);
                }
            },
 
            // Backslash (\)
            '\\' => {
                self.cursor.pos += 1;
-                try self.output.pushToken(T.OpBackslash, start, 1);
+                try self.output.pushToken(.OpBackslash, start, 1);
            },
 
            // Percent (%)
            '%' => {
                self.cursor.pos += 1;
-                try self.output.pushToken(T.OpPercent, start, 1);
+                try self.output.pushToken(.OpPercent, start, 1);
            },
 
            // Caret (^)
            '^' => {
                self.cursor.pos += 1;
-                try self.output.pushToken(T.OpCaret, start, 1);
+                try self.output.pushToken(.OpCaret, start, 1);
            },
 
            // Greater-than (>)
            '>' => {
                if (self.cursor.peekAt(1) == '=') {
                    self.cursor.pos += 2;
-                    try self.output.pushToken(T.OpGreaterThanOrEq, start, 2);
+                    try self.output.pushToken(.OpGreaterThanOrEq, start, 2);
                } else {
                    self.cursor.pos += 1;
-                    try self.output.pushToken(T.OpGreaterThan, start, 1);
+                    try self.output.pushToken(.OpGreaterThan, start, 1);
                }
            },
 
@@ -971,13 +978,13 @@ pub const Tokenizer = struct {
            '<' => {
                if (self.cursor.peekAt(1) == '=') {
                    self.cursor.pos += 2;
-                    try self.output.pushToken(T.OpLessThanOrEq, start, 2);
+                    try self.output.pushToken(.OpLessThanOrEq, start, 2);
                } else if (self.cursor.peekAt(1) == '-') {
                    self.cursor.pos += 2;
-                    try self.output.pushToken(T.OpBackArrow, start, 2);
+                    try self.output.pushToken(.OpBackArrow, start, 2);
                } else {
                    self.cursor.pos += 1;
-                    try self.output.pushToken(T.OpLessThan, start, 1);
+                    try self.output.pushToken(.OpLessThan, start, 1);
                }
            },
 
@@ -985,10 +992,10 @@ pub const Tokenizer = struct {
            '=' => {
                if (self.cursor.peekAt(1) == '=') {
                    self.cursor.pos += 2;
-                    try self.output.pushToken(T.OpEquals, start, 2);
+                    try self.output.pushToken(.OpEquals, start, 2);
                } else {
                    self.cursor.pos += 1;
-                    try self.output.pushToken(T.OpAssign, start, 1);
+                    try self.output.pushToken(.OpAssign, start, 1);
                }
            },
 
@@ -996,53 +1003,42 @@ pub const Tokenizer = struct {
            ':' => {
                if (self.cursor.peekAt(1) == '=') {
                    self.cursor.pos += 2;
-                    try self.output.pushToken(T.OpColonEqual, start, 2);
+                    try self.output.pushToken(.OpColonEqual, start, 2);
                } else {
                    self.cursor.pos += 1;
-                    try self.output.pushToken(T.OpColon, start, 1);
+                    try self.output.pushToken(.OpColon, start, 1);
                }
            },
 
-            // Open parenthesis (()
            '(' => {
                self.cursor.pos += 1;
-                try self.stack.append(BraceKind.Round);
-                try self.output.pushToken(T.OpenRound, start, 1);
-            },
-
-            // Close parenthesis ())
-            ')' => {
-                try self.output.pushToken(T.CloseRound, start, 1);
-                try self.consumeBraceCloseAndContinueStringInterp(.Round);
+                try self.stack.append(.Round);
+                try self.output.pushToken(.OpenRound, start, 1);
            },
-
-            // Open square bracket ([)
            '[' => {
                self.cursor.pos += 1;
-                try self.stack.append(BraceKind.Square);
-                try self.output.pushToken(T.OpenSquare, start, 1);
+                try self.stack.append(.Square);
+                try self.output.pushToken(.OpenSquare, start, 1);
            },
-
-            // Close square bracket (])
-            ']' => {
-                try self.output.pushToken(T.CloseSquare, start, 1);
-                try self.consumeBraceCloseAndContinueStringInterp(.Square);
-            },
-
-            // Open curly brace ({)
            '{' => {
                self.cursor.pos += 1;
-                try self.stack.append(BraceKind.Curly);
-                try self.output.pushToken(T.OpenCurly, start, 1);
+                try self.stack.append(.Curly);
+                try self.output.pushToken(.OpenCurly, start, 1);
            },
 
-            // Close curly brace (})
+            ')' => {
+                try self.output.pushToken(.CloseRound, start, 1);
+                try self.consumeBraceCloseAndContinueStringInterp(.Round);
+            },
+            ']' => {
+                try self.output.pushToken(.CloseSquare, start, 1);
+                try self.consumeBraceCloseAndContinueStringInterp(.Square);
+            },
            '}' => {
-                try self.output.pushToken(T.CloseCurly, start, 1);
+                try self.output.pushToken(.CloseCurly, start, 1);
                try self.consumeBraceCloseAndContinueStringInterp(.Curly);
            },
 
-            // Underscore (_)
            '_' => {
                const next = self.cursor.peekAt(1);
                if (next) |n| {
@@ -1050,18 +1046,17 @@ pub const Tokenizer = struct {
                        self.cursor.pos += 2;
                        self.cursor.chompIdentGeneral();
                        const len = self.cursor.pos - start;
-                        try self.output.pushToken(T.NamedUnderscore, start, len);
+                        try self.output.pushToken(.NamedUnderscore, start, len);
                    } else {
                        self.cursor.pos += 1;
-                        try self.output.pushToken(T.Underscore, start, 1);
+                        try self.output.pushToken(.Underscore, start, 1);
                    }
                } else {
                    self.cursor.pos += 1;
-                    try self.output.pushToken(T.Underscore, start, 1);
+                    try self.output.pushToken(.Underscore, start, 1);
                }
            },
 
-            // At-sign (@)
            '@' => {
                const next = self.cursor.peekAt(1);
                if (next) |n| {
@@ -1069,16 +1064,16 @@ pub const Tokenizer = struct {
                        self.cursor.pos += 2;
                        self.cursor.chompIdentGeneral();
                        const len = self.cursor.pos - start;
-                        try self.output.pushToken(T.OpaqueName, start, len);
+                        try self.output.pushToken(.OpaqueName, start, len);
                    } else {
                        self.cursor.pushMessageHere(.OpaqueNameWithoutName);
                        self.cursor.pos += 1;
-                        try self.output.pushToken(T.OpaqueName, start, 1);
+                        try self.output.pushToken(.OpaqueName, start, 1);
                    }
                } else {
                    self.cursor.pushMessageHere(.OpaqueNameWithoutName);
                    self.cursor.pos += 1;
-                    try self.output.pushToken(T.OpaqueName, start, 1);
+                    try self.output.pushToken(.OpaqueName, start, 1);
                }
            },
 
@@ -1086,21 +1081,23 @@ pub const Tokenizer = struct {
            '0'...'9' => {
-                _ = self.cursor.chompNumber(b);
+                const tag = self.cursor.chompNumber(b);
                const len = self.cursor.pos - start;
-                try self.output.pushToken(T.Int, start, len);
+                try self.output.pushToken(tag, start, len);
            },
 
            // Lowercase identifiers
            'a'...'z' => {
-                _ = self.cursor.chompIdentLower();
+                const tag = self.cursor.chompIdentLower();
                const len = self.cursor.pos - start;
-                try self.output.pushToken(T.LowerIdent, start, len);
+                try self.output.pushToken(tag, start, len);
            },
 
            // Uppercase identifiers
            'A'...'Z' => {
-                _ = self.cursor.chompIdentUpper();
+                self.cursor.pos += 1;
+                _ = self.cursor.chompIdentGeneral();
                const len = self.cursor.pos - start;
-                try self.output.pushToken(T.UpperIdent, start, len);
+                try self.output.pushToken(.UpperIdent, start, len);
            },
 
            // String-like literal starting with a single or double quote
@@ -1113,19 +1110,19 @@ pub const Tokenizer = struct {
 
            // first byte of a UTF-8 sequence
            0b11000000...0xff => {
-                const info = self.cursor.peekUnicode();
-                switch (info.kind) {
+                const info = self.cursor.decodeUnicode(b);
+                switch (info.tag) {
                    .LetterUpper => {
                        self.cursor.pos += info.length;
                        self.cursor.chompIdentGeneral();
                        const len = self.cursor.pos - start;
-                        try self.output.pushToken(T.UpperIdent, start, len);
+                        try self.output.pushToken(.UpperIdent, start, len);
                    },
                    .LetterNotUpper => {
                        self.cursor.pos += info.length;
                        self.cursor.chompIdentGeneral();
                        const len = self.cursor.pos - start;
-                        try self.output.pushToken(T.LowerIdent, start, len);
+                        try self.output.pushToken(.LowerIdent, start, len);
                    },
                    else => {
                        self.cursor.pos += info.length;
@@ -1146,10 +1143,10 @@ pub const Tokenizer = struct {
            _ = self.output.lines.pop();
        }
 
-        try self.pushToken(T.EndOfFile, 0);
+        try self.pushToken(.EndOfFile, 0);
    }
 
-    pub fn tokenizeStringLikeLiteral(self: *Tokenizer, term: u8) !T {
+    pub fn tokenizeStringLikeLiteral(self: *Tokenizer, term: u8) !Token.Tag {
        const start = self.cursor.pos;
        // Skip the initial quote.
        self.cursor.pos += 1;
@@ -1161,7 +1158,7 @@ pub const Tokenizer = struct {
        return try self.tokenizeStringLikeLiteralBody(false, term, start, multiline);
    }
 
-    pub fn tokenizeStringLikeLiteralBody(self: *Tokenizer, already_started: bool, term: u8, start: usize, multiline: bool) !T {
+    pub fn tokenizeStringLikeLiteralBody(self: *Tokenizer, already_started: bool, term: u8, start: u32, multiline: bool) !Token.Tag {
        var escape: bool = false;
        while (self.cursor.pos < self.cursor.buf.len) {
            const c = self.cursor.buf[self.cursor.pos];
@@ -1174,7 +1171,7 @@ pub const Tokenizer = struct {
                    'u' => {
                        escape = false;
                        self.cursor.pos += 1;
-                        self.cursor.require('(', MessageKind.InvalidUnicodeEscapeSequence);
+                        self.cursor.require('(', .InvalidUnicodeEscapeSequence);
                        while (true) {
                            if (self.cursor.peek() == ')') {
                                self.cursor.pos += 1;
@@ -1224,53 +1221,45 @@ pub const Tokenizer = struct {
                    try self.stack.append(brace);
                    if (term == '"') {
                        if (already_started) {
-                            return T.StringPart;
+                            return .StringPart;
                        } else {
-                            return T.StringBegin;
+                            return .StringBegin;
                        }
                    } else {
                        std.debug.assert(term == '\'');
                        if (already_started) {
-                            return T.SingleQuotePart;
+                            return .SingleQuotePart;
                        } else {
-                            return T.SingleQuoteBegin;
+                            return .SingleQuoteBegin;
                        }
                    }
                } else if (c == '\n') {
                    if (!multiline) {
                        self.cursor.pushMessage(.UnclosedString, @intCast(start), @intCast(self.cursor.pos));
-                        return T.String;
+                        return .String;
                    } else {
                        self.cursor.pos += 1;
                    }
                } else {
                    if (!multiline and c == term) {
                        self.cursor.pos += 1;
-                        return T.String;
+                        return .String;
                    } else if (multiline and c == term and self.cursor.peekAt(1) == term and self.cursor.peekAt(2) == term) {
                        self.cursor.pos += 3;
-                        return T.String;
+                        return .String;
                    }
                    self.cursor.pos += 1;
                }
            }
        }
-        const kind: MessageKind = if (term == '"') .UnclosedString else .UnclosedSingleQuote;
-        self.cursor.pushMessage(kind, @intCast(start), @intCast(self.cursor.pos));
+        const diag: Diagnostic.Tag = if (term == '"') .UnclosedString else .UnclosedSingleQuote;
+        self.cursor.pushMessage(diag, start, self.cursor.pos);
        if (already_started) {
-            return if (term == '"') T.StringPart else T.SingleQuotePart;
+            return if (term == '"') .StringPart else .SingleQuotePart;
        } else {
-            return if (term == '"') T.String else T.SingleQuote;
+            return if (term == '"') .String else .SingleQuote;
        }
    }
-
-    pub fn finish(self: Tokenizer) TokenOutput {
-        return .{
-            .tokens = self.output,
-            .messages = self.cursor.messages,
-            .message_count = self.cursor.message_count,
-        };
-    }
 };
 
 fn tokenizeAndCheckMessages(gc: *GenCatData, base_dir: std.fs.Dir, file_path: []const u8) !void {
@@ -1289,11 +1278,12 @@ fn tokenizeAndCheckMessages(gc: *GenCatData, base_dir: std.fs.Dir, file_path: []
    }
 
    const alloc = std.heap.page_allocator;
-    var messages: [128]Message = undefined;
+    var messages: [128]Diagnostic = undefined;
    const messages_slice = messages[0..];
    var tokenizer = try Tokenizer.init(buffer, messages_slice, gc, alloc);
    try tokenizer.tokenize();
-    const result = tokenizer.finish();
+    var result = tokenizer.finish_and_deinit();
+    defer result.tokens.deinit();
 
    if (result.message_count != 0) {
        try stdout.print("Messages:\n", .{});
@@ -1304,7 +1294,7 @@ fn tokenizeAndCheckMessages(gc: *GenCatData, base_dir: std.fs.Dir, file_path: []
                break;
            }
            i += 1;
-            try stdout.print("  {s} from {d} to {d}\n", .{ @tagName(msg.kind), msg.begin, msg.end });
+            try stdout.print("  {s} from {d} to {d}\n", .{ @tagName(msg.tag), msg.begin, msg.end });
        }
 
        // Print the whole file with error caret markers inserted.
@@ -1354,7 +1344,7 @@ fn tokenizeAndCheckMessages(gc: *GenCatData, base_dir: std.fs.Dir, file_path: []
            @memset(caret_line[highlight_start..caret_line_len], '^');
            // Write the caret line.
            try stdout.writeAll(caret_line);
-            try stdout.print(" {s}", .{@tagName(result.messages[msg_index].kind)});
+            try stdout.print(" {s}", .{@tagName(result.messages[msg_index].tag)});
            try stdout.writeAll("\n");
 
            std.heap.page_allocator.free(caret_line);
diff --git a/src/check/parse/tokenize/src/root.zig b/src/check/parse/tokenize/src/root.zig
deleted file mode 100644
index ecfeade1a3..0000000000
--- a/src/check/parse/tokenize/src/root.zig
+++ /dev/null
@@ -1,10 +0,0 @@
-const std = @import("std");
-const testing = std.testing;
-
-export fn add(a: i32, b: i32) i32 {
-    return a + b;
-}
-
-test "basic add functionality" {
-    try testing.expect(add(3, 7) == 10);
-}
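
Usage note (illustrative, not part of the patch): the Cursor doc comment describes a two-phase contract — tokenize into a caller-provided diagnostics slice, and if more diagnostics were produced than fit, allocate a larger slice and tokenize again. Since pushMessage now increments message_count even past the end of the slice, overflow is directly observable. A minimal Zig sketch of a driver, assuming this file is imported as "main.zig" and a GenCatData instance is already set up:

const std = @import("std");
const GenCatData = @import("GenCatData");
const tokenizer_mod = @import("main.zig"); // assumed import path for the module in this diff

fn reportDiagnostics(alloc: std.mem.Allocator, gc: *GenCatData, src: []const u8) !void {
    // Fixed-size diagnostics buffer: pushMessage stores at most messages.len
    // entries but keeps counting, so truncation is detectable afterwards.
    var messages: [64]tokenizer_mod.Diagnostic = undefined;
    var tokenizer = try tokenizer_mod.Tokenizer.init(src, messages[0..], gc, alloc);
    try tokenizer.tokenize();
    var result = tokenizer.finish_and_deinit();
    defer result.tokens.deinit();

    const stored = @min(result.message_count, result.messages.len);
    for (result.messages[0..stored]) |msg| {
        std.debug.print("{s} at {d}..{d}\n", .{ @tagName(msg.tag), msg.begin, msg.end });
    }
    if (result.message_count > result.messages.len) {
        // Here the caller could allocate a larger slice and tokenize again.
        std.debug.print("(dropped {d} diagnostics)\n", .{result.message_count - result.messages.len});
    }
}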
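
The switch from a chain of std.mem.eql checks to Token.valid_number_suffixes, and from the hand-rolled getKeyword to Token.keywords.get, both rely on std.StaticStringMap building its lookup table at compile time. A self-contained sketch of the same pattern (the Kw enum here is a stand-in, not the patch's Token.Tag):

const std = @import("std");

const Kw = enum { KwIf, KwElse };

// Built entirely at comptime; lookups are O(length-bucketed comparisons), no allocation.
const keywords = std.StaticStringMap(Kw).initComptime(.{
    .{ "if", .KwIf },
    .{ "else", .KwElse },
});

test "comptime string map lookup" {
    try std.testing.expectEqual(@as(?Kw, .KwIf), keywords.get("if"));
    // Misses return null - the same signal chompNumberSuffix uses to emit BadNumberSuffix.
    try std.testing.expect(keywords.get("iff") == null);
}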
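
decodeUnicode composes two std.unicode helpers before consulting the GenCatData tables: utf8ByteSequenceLength on the first byte to learn the sequence length, then utf8Decode over that many bytes to get the codepoint. A small standalone check of those two calls (the lambda example is illustrative):

const std = @import("std");

test "utf8 decoding steps used by Cursor.decodeUnicode" {
    const buf = "λx"; // U+03BB takes 2 bytes in UTF-8
    const len = try std.unicode.utf8ByteSequenceLength(buf[0]);
    try std.testing.expectEqual(@as(u3, 2), len);
    const cp = try std.unicode.utf8Decode(buf[0..len]);
    try std.testing.expectEqual(@as(u21, 0x03BB), cp);
}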