Merge pull request #7569 from joshuawarner32/zig-tokenizer

lukewilliamboswell · web-flow · commit 25b076b9454d · 2025-02-06T11:07:42.000+11:00
Implement initial roc tokenizer in zig
diff --git a/build.zig b/build.zig
@@ -9,12 +9,16 @@ pub fn build(b: *std.Build) void {
     const target = b.standardTargetOptions(.{});
     const optimize = b.standardOptimizeOption(.{});
 
+    // Zig unicode library - https://codeberg.org/atman/zg
+    const zg = b.dependency("zg", .{});
+
     const exe = b.addExecutable(.{
         .name = "roc",
         .root_source_file = b.path("src/main.zig"),
         .target = target,
         .optimize = optimize,
     });
+    exe.root_module.addImport("GenCatData", zg.module("GenCatData"));
 
     b.installArtifact(exe);
 
@@ -34,6 +38,7 @@ pub fn build(b: *std.Build) void {
         .target = target,
         .optimize = optimize,
     });
+    all_tests.root_module.addImport("GenCatData", zg.module("GenCatData"));
 
     // Install the test binary so we can run separately
     // ```sh
diff --git a/build.zig.zon b/build.zig.zon
@@ -7,6 +7,10 @@
             .url = "git+https://github.com/kristoff-it/zig-afl-kit#88c6b71377767c1b8d26979b0adfa12a58d988dd",
             .hash = "1220796f7d2d9a2d4d7f8339ee0b14aa4bf133a15ae9ba39c941cc68e08d5c5ce9a2",
         },
+        .zg = .{
+            .url = "https://codeberg.org/dude_the_builder/zg/archive/v0.13.2.tar.gz",
+            .hash = "122055beff332830a391e9895c044d33b15ea21063779557024b46169fb1984c6e40",
+        },
     },
     .paths = .{
         "build.zig",
diff --git a/crates/compiler/test_syntax/tests/snapshots/pass/tuple_function_annotation.expr.formatted.roc b/crates/compiler/test_syntax/tests/snapshots/pass/tuple_function_annotation.expr.formatted.roc
@@ -1,4 +1,4 @@
-1P : (
+11 : (
     I,
     s,
     Mw
diff --git a/crates/compiler/test_syntax/tests/snapshots/pass/tuple_function_annotation.expr.result-ast b/crates/compiler/test_syntax/tests/snapshots/pass/tuple_function_annotation.expr.result-ast
@@ -1,74 +1,79 @@
-@0-21 Defs(
-    Defs {
-        tags: [
-            EitherIndex(2147483648),
-        ],
-        regions: [
-            @0-17,
-        ],
-        space_before: [
-            Slice<roc_parse::ast::CommentOrNewline> { start: 0, length: 0 },
-        ],
-        space_after: [
-            Slice<roc_parse::ast::CommentOrNewline> { start: 0, length: 0 },
-        ],
-        spaces: [],
-        type_defs: [],
-        value_defs: [
-            Annotation(
-                @0-2 SpaceAfter(
-                    NumLiteral(
-                        "1P",
+@0-21 SpaceAfter(
+    Defs(
+        Defs {
+            tags: [
+                EitherIndex(2147483648),
+            ],
+            regions: [
+                @0-17,
+            ],
+            space_before: [
+                Slice<roc_parse::ast::CommentOrNewline> { start: 0, length: 0 },
+            ],
+            space_after: [
+                Slice<roc_parse::ast::CommentOrNewline> { start: 0, length: 0 },
+            ],
+            spaces: [],
+            type_defs: [],
+            value_defs: [
+                Annotation(
+                    @0-2 SpaceAfter(
+                        NumLiteral(
+                            "11",
+                        ),
+                        [
+                            Newline,
+                        ],
                     ),
-                    [
-                        Newline,
-                    ],
-                ),
-                @4-17 Tuple {
-                    elems: [
-                        @5-15 Function(
-                            [
-                                @5-6 Apply(
-                                    "",
-                                    "I",
-                                    [],
-                                ),
-                                @7-8 SpaceAfter(
-                                    BoundVariable(
-                                        "s",
+                    @4-17 Tuple {
+                        elems: [
+                            @5-15 Function(
+                                [
+                                    @5-6 Apply(
+                                        "",
+                                        "I",
+                                        [],
                                     ),
-                                    [
-                                        Newline,
-                                    ],
-                                ),
-                                @10-12 Apply(
-                                    "",
-                                    "Mw",
-                                    [],
+                                    @7-8 SpaceAfter(
+                                        BoundVariable(
+                                            "s",
+                                        ),
+                                        [
+                                            Newline,
+                                        ],
+                                    ),
+                                    @10-12 Apply(
+                                        "",
+                                        "Mw",
+                                        [],
+                                    ),
+                                ],
+                                Pure,
+                                @14-15 BoundVariable(
+                                    "r",
                                 ),
-                            ],
-                            Pure,
-                            @14-15 BoundVariable(
-                                "r",
+                            ),
+                        ],
+                        ext: Some(
+                            @16-17 BoundVariable(
+                                "l",
                             ),
                         ),
-                    ],
-                    ext: Some(
-                        @16-17 BoundVariable(
-                            "l",
-                        ),
-                    ),
-                },
-            ),
-        ],
-    },
-    @18-21 SpaceBefore(
-        Var {
-            module_name: "",
-            ident: "asl",
+                    },
+                ),
+            ],
         },
-        [
-            Newline,
-        ],
+        @18-21 SpaceBefore(
+            Var {
+                module_name: "",
+                ident: "asl",
+            },
+            [
+                Newline,
+            ],
+        ),
     ),
+    [
+        Newline,
+    ],
 )
diff --git a/crates/compiler/test_syntax/tests/snapshots/pass/tuple_function_annotation.expr.roc b/crates/compiler/test_syntax/tests/snapshots/pass/tuple_function_annotation.expr.roc
@@ -1,4 +1,4 @@
-1P
+11
 :(I,s
 ,Mw->r)l
-asl
+asl
diff --git a/src/check/parse.zig b/src/check/parse.zig
@@ -0,0 +1,233 @@
+//! Early scaffold of the Roc parser. It walks a `tokenize.TokenizedBuffer`,
+//! follows the indentation-based statement structure, and traces what it sees
+//! to stderr; no AST nodes are produced yet.
+const std = @import("std");
+const tokenize = @import("tokenize.zig");
+
+/// A span delimited by a start and an end position.
+pub const Region = struct {
+    start: usize,
+    end: usize,
+};
+
+/// One parse-tree node; `Parser.nodes` stores these in a `std.MultiArrayList`.
+pub const Node = struct {
+    tag: Tag,
+    data: Data,
+    region: Region,
+
+    pub const Tag = enum {
+        Unary,
+        Binary,
+        // TODO
+    };
+
+    /// Untagged payload. NOTE(review): presumably the active field is the one
+    /// named by the sibling `tag` -- confirm once nodes are actually built.
+    pub const Data = union {
+        Unary: UnaryOpData,
+        Binary: BinaryOpData,
+        // Add more node data as needed
+    };
+
+    pub const UnaryOpData = struct {
+        // TODO
+    };
+
+    pub const BinaryOpData = struct {
+        // TODO
+    };
+};
+
+/// A parse problem: its kind (`tag`) and where it occurred (`region`).
+/// NOTE(review): `Parser.diagnostics` stores `tokenize.Diagnostic`, not this
+/// type -- confirm which of the two is intended.
+pub const Diagnostic = struct {
+    tag: Tag,
+    region: Region,
+
+    pub const Tag = enum {
+        // TODO
+    };
+};
+
+/// Cursor over a token buffer, plus the node and diagnostic lists that parsing
+/// will eventually fill in.
+pub const Parser = struct {
+    /// Index of the next token to consume.
+    pos: usize,
+    tokens: tokenize.TokenizedBuffer,
+    nodes: std.MultiArrayList(Node),
+    diagnostics: std.ArrayList(tokenize.Diagnostic),
+    allocator: std.mem.Allocator,
+
+    /// Creates a parser positioned at the first token of `tokens`. Pair with
+    /// `deinit` to release the node and diagnostic lists.
+    pub fn init(tokens: tokenize.TokenizedBuffer, allocator: std.mem.Allocator) Parser {
+        return Parser{
+            .pos = 0,
+            .tokens = tokens,
+            .nodes = std.MultiArrayList(Node){},
+            .diagnostics = std.ArrayList(tokenize.Diagnostic).init(allocator),
+            .allocator = allocator,
+        };
+    }
+
+    /// Frees `nodes` and `diagnostics`. The token buffer is not touched here;
+    /// releasing it is left to whoever created it.
+    pub fn deinit(self: *Parser) void {
+        self.nodes.deinit(self.allocator);
+        self.diagnostics.deinit();
+    }
+
+    /// Steps past the current token. Does nothing once every token is consumed.
+    pub fn advance(self: *Parser) void {
+        if (self.pos >= self.tokens.tokens.len) {
+            return;
+        }
+        std.debug.print("advance {s}\n", .{@tagName(self.tokens.tokens.items(.tag)[self.pos])});
+        self.pos += 1;
+    }
+
+    /// Returns the tag of the current token without consuming it, or
+    /// `.EndOfFile` when the cursor is past the last token.
+    pub fn peek(self: *Parser) tokenize.Token.Tag {
+        if (self.pos >= self.tokens.tokens.len) {
+            return .EndOfFile;
+        }
+        return self.tokens.tokens.items(.tag)[self.pos];
+    }
+
+    /// If the next token is a newline, consumes it and returns the indent level
+    /// of the line that follows; otherwise returns null and consumes nothing.
+    pub fn consumeNewline(self: *Parser) ?u16 {
+        const indent = self.peekNewline() orelse return null;
+        self.advance();
+        return indent;
+    }
+
+    /// Returns the indent level of the next line if the next token is a
+    /// newline, otherwise null. Never consumes anything.
+    pub fn peekNewline(self: *Parser) ?u16 {
+        if (self.peek() != .Newline) {
+            return null;
+        }
+        // A newline token keeps the following line's indent in its `offset` field.
+        const indent = self.tokens.tokens.items(.offset)[self.pos];
+        return @intCast(indent);
+    }
+
+    /// Parses top-level statements until `.EndOfFile` is reached.
+    pub fn parseFile(self: *Parser) !void {
+        while (self.peek() != .EndOfFile) {
+            if (self.consumeNewline()) |indent| {
+                std.debug.print("parseFile indent {d}\n", .{indent});
+                std.debug.assert(indent == 0); // TODO: report an error
+            }
+            if (self.peek() == .EndOfFile) {
+                break;
+            }
+            self.parseStmt(0);
+        }
+    }
+
+    /// Parses one statement that starts with a lower-case identifier: an
+    /// assignment when `=` follows, otherwise a bare identifier expression.
+    /// `base_indent` is the indent of the line the statement starts on. Any
+    /// other leading token panics for now.
+    pub fn parseStmt(self: *Parser, base_indent: u16) void {
+        switch (self.peek()) {
+            .LowerIdent => {
+                self.advance();
+                if (self.peek() == .OpEquals) {
+                    self.finishParseAssign(base_indent);
+                    std.debug.print("parseStmt assign\n", .{});
+                } else {
+                    std.debug.print("parseStmt expr\n", .{});
+                }
+            },
+            else => {
+                std.debug.panic("todo: emit error, unexpected token {s}", .{@tagName(self.peek())});
+            },
+        }
+    }
+
+    /// Consumes a single identifier or integer token. Any other token panics
+    /// for now.
+    pub fn parseExpr(self: *Parser) void {
+        switch (self.peek()) {
+            .LowerIdent, .Int => {
+                self.advance();
+                // Printed after advancing, so this names the token that follows.
+                std.debug.print("parseExpr {s}\n", .{@tagName(self.peek())});
+                // TODO: add node
+            },
+            else => {
+                std.debug.panic("todo: emit error", .{});
+            },
+        }
+    }
+
+    /// Parses the right-hand side of an assignment; the cursor must be on `=`.
+    /// With a newline right after `=`, the body is a block of statements that
+    /// continues until a line whose indent is at or below `base_indent`;
+    /// otherwise the body is a single expression on the same line.
+    pub fn finishParseAssign(self: *Parser, base_indent: u16) void {
+        std.debug.assert(self.peek() == .OpEquals);
+        self.advance();
+        if (self.consumeNewline()) |indent| {
+            std.debug.print("startParseAssign indent {d}\n", .{indent});
+            if (indent <= base_indent) {
+                std.debug.panic("todo: emit error", .{});
+            }
+
+            self.parseStmt(indent);
+
+            // Keep taking statements while the upcoming line is still indented
+            // past the assignment itself.
+            while (self.peekNewline()) |next_indent| {
+                if (next_indent <= base_indent) {
+                    break;
+                }
+                self.advance();
+                self.parseStmt(indent);
+            }
+        } else {
+            self.parseExpr();
+        }
+
+        std.debug.print("finishParseAssign\n", .{});
+    }
+};
+
+test "Parser advance and peek" {
+    // Arena over the testing allocator: everything the token buffer and the
+    // parser allocate is released in one go when the test ends.
+    var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
+    defer arena.deinit();
+    const allocator = arena.allocator();
+
+    var tokens = try tokenize.TokenizedBuffer.init(allocator);
+    // x =
+    //     y = 1
+    //     y
+    try tokens.pushToken(.LowerIdent, 0, 1);
+    try tokens.pushToken(.OpEquals, 0, 0);
+    try tokens.pushNewline(4);
+    try tokens.pushToken(.LowerIdent, 0, 0);
+    try tokens.pushToken(.OpEquals, 0, 0);
+    try tokens.pushToken(.Int, 0, 0);
+    try tokens.pushNewline(4);
+    try tokens.pushToken(.LowerIdent, 0, 0);
+    try tokens.pushNewline(0);
+    try tokens.pushToken(.EndOfFile, 0, 0);
+
+    var parser = Parser.init(tokens, allocator);
+    defer parser.deinit();
+
+    try parser.parseFile();
+
+    // Every token should have been walked, leaving the cursor on the final one.
+    try std.testing.expectEqual(tokenize.Token.Tag.EndOfFile, parser.peek());
+    // TODO: assert on parser.nodes once the parser emits nodes.
+}
diff --git a/src/check/tokenize.zig b/src/check/tokenize.zig
diff --git a/src/main.zig b/src/main.zig

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-1P : (`
	`1`	`+11 : (`
`2`	`2`	`I,`
`3`	`3`	`s,`
`4`	`4`	`Mw`
-Original file line number
+Diff line change
@@ @@ -1,4 +1,4 @@ @@
 -1P
 +11
 :(I,s
 ,Mw->r)l
 -asl
 +asl