Skip to content

Commit

Permalink
refactor json parser
Browse files Browse the repository at this point in the history
  • Loading branch information
Yu-zh committed Jan 27, 2025
1 parent 9dbb833 commit a1799ff
Show file tree
Hide file tree
Showing 7 changed files with 231 additions and 150 deletions.
19 changes: 15 additions & 4 deletions json/internal_types.mbt
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,25 @@

///|
/// Lexer state shared by the JSON lexing functions: the input being scanned
/// together with the current scan position.
priv struct ParseContext {
mut offset : Int
input : String
end_offset : Int
mut view : @string.StringView
mut line : Int
mut column : Int
}

///|
/// Pairs a `StringBuilder` with the source position recorded when the
/// builder was created.
priv struct Builder {
// Position handed to `Builder::new`; the visible callers pass
// `ctx.position()` taken just before consuming the first character.
start_pos : Position
// Accumulator that `ParseContext::bump` appends consumed characters to.
builder : StringBuilder
}

///|
/// Creates a `Builder` that remembers `start_pos` and owns a newly created
/// `StringBuilder`.
fn Builder::new(start_pos : Position) -> Builder {
  let builder = StringBuilder::new()
  Builder::{ start_pos, builder }
}

///|
/// Creates a parse context positioned at the very beginning of `input`.
fn ParseContext::make(input : String) -> ParseContext {
{ offset: 0, input, end_offset: input.length() }
{ view: input[:], line: 1, column: 0 }
}

///|
Expand Down
52 changes: 38 additions & 14 deletions json/lex_main.mbt
Original file line number Diff line number Diff line change
Expand Up @@ -30,67 +30,91 @@ fn lex_value(
allow_rbracket~ : Bool = false
) -> Token!ParseError {
for {
match read_char(ctx) {
Some('\t' | ' ' | '\n' | '\r') => continue
Some('{') => return LBrace
Some('[') => return LBracket
Some(']') =>
match ctx.peek() {
Some('\t' | ' ' | '\n' | '\r') => {
ctx.bump()
continue
}
Some('{') => {
ctx.bump()
return LBrace
}
Some('[') => {
ctx.bump()
return LBracket
}
Some(']' as ch) =>
if allow_rbracket {
ctx.bump()
return RBracket
} else {
invalid_char!(ctx, shift=-1)
invalid_char!(ctx, ch)
}
Some('n') => {
ctx.bump()
lex_assert_char!(ctx, 'u')
lex_assert_char!(ctx, 'l')
lex_assert_char!(ctx, 'l')
return Null
}
Some('t') => {
ctx.bump()
lex_assert_char!(ctx, 'r')
lex_assert_char!(ctx, 'u')
lex_assert_char!(ctx, 'e')
return True
}
Some('f') => {
ctx.bump()
lex_assert_char!(ctx, 'a')
lex_assert_char!(ctx, 'l')
lex_assert_char!(ctx, 's')
lex_assert_char!(ctx, 'e')
return False
}
Some('-') =>
match read_char(ctx) {
Some('-') => {
let builder = Builder::new(ctx.position())
ctx.bump(builder~)
match ctx.peek() {
Some('0') => {
let n = lex_zero!(ctx, start=ctx.offset - 2)
ctx.bump(builder~)
let n = lex_zero!(ctx, builder)
return Number(n)
}
Some(c2) => {
if c2 >= '1' && c2 <= '9' {
let n = lex_decimal_integer!(ctx, start=ctx.offset - 2)
ctx.bump(builder~)
let n = lex_decimal_integer!(ctx, builder)
return Number(n)
}
invalid_char!(ctx, shift=-1)
invalid_char!(ctx, c2)
}
None => raise InvalidEof
}
}
Some('0') => {
let n = lex_zero!(ctx, start=ctx.offset - 1)
let builder = Builder::new(ctx.position())
ctx.bump(builder~)
let n = lex_zero!(ctx, builder)
return Number(n)
}
Some('1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9') => {
let n = lex_decimal_integer!(ctx, start=ctx.offset - 1)
let builder = Builder::new(ctx.position())
ctx.bump(builder~)
let n = lex_decimal_integer!(ctx, builder)
return Number(n)
}
Some('"') => {
ctx.bump()
let s = lex_string!(ctx)
return String(s)
}
Some(c) => {
if c > '\x7f' && non_ascii_whitespace.contains(c) {
ctx.bump()
continue
}
invalid_char!(ctx, shift=-1)
invalid_char!(ctx, c)
}
None => raise InvalidEof
}
Expand Down
110 changes: 73 additions & 37 deletions json/lex_misc.mbt
Original file line number Diff line number Diff line change
Expand Up @@ -13,37 +13,53 @@
// limitations under the License.

///|
fn read_char(ctx : ParseContext) -> Char? {
if ctx.offset < ctx.end_offset {
let c = ctx.input[ctx.offset]
ctx.offset += 1
let c1 = c.to_int()
if c1 >= 0xD800 && c1 <= 0xDBFF {
if ctx.offset < ctx.end_offset {
let c2 = ctx.input[ctx.offset].to_int()
if c2 >= 0xDC00 && c2 <= 0xDFFF {
ctx.offset += 1
let c3 = (c1 << 10) + c2 - 0x35fdc00
return Some(Char::from_int(c3))
}
/// Returns the next character of the remaining input without consuming it,
/// or `None` when the view is empty.
fn peek(self : ParseContext) -> Char? {
  match self.view {
    [next, ..] => Some(next)
    [] => None
  }
}

///|
/// Consumes the character at the front of the view and updates the
/// line/column counters accordingly.
///
/// When `builder` is supplied, the consumed character is also appended to it.
/// Aborts if the view is empty, so callers are expected to `peek` first.
fn bump(self : ParseContext, builder? : Builder) -> Unit {
match self.view {
[] =>
// this should be a misuse of the peek & bump api
abort("empty view")
[c, .. rest] => {
// A newline starts a new line and resets the column; any other
// character advances the column by one.
if c == '\n' {
self.line += 1
self.column = 0
} else {
self.column += 1
}
// Drop the consumed character from the remaining input.
self.view = rest
// Mirror the consumed character into the caller's builder, if any.
match builder {
None => ()
Some(builder) => builder.builder.write_char(c)
}
}
Some(c)
} else {
None
}
}

///|
/// Snapshots the context's current line and column as a `Position`.
fn position(self : ParseContext) -> Position {
  let line = self.line
  let column = self.column
  Position::{ line, column }
}

///|
fn lex_skip_whitespace(ctx : ParseContext) -> Unit {
for {
match read_char(ctx) {
Some('\t' | '\u000B' | '\u000C' | ' ' | '\n' | '\r') => continue
match ctx.peek() {
Some('\t' | '\u000B' | '\u000C' | ' ' | '\n' | '\r') => {
ctx.bump()
continue
}
Some(c) => {
if c > '\x7f' && non_ascii_whitespace.contains(c) {
ctx.bump()
continue
}
ctx.offset -= 1
break
}
None => break
Expand All @@ -54,66 +70,86 @@ fn lex_skip_whitespace(ctx : ParseContext) -> Unit {
///|
/// Lexes the token that follows an array element.
///
/// Skips whitespace, then yields `RBracket` for `]` or `Comma` for `,`.
/// Any other character is reported through `invalid_char`, and running out
/// of input raises `InvalidEof`.
fn lex_after_array_value(ctx : ParseContext) -> Token!ParseError {
lex_skip_whitespace(ctx)
match read_char(ctx) {
Some(']') => RBracket
Some(',') => Comma
Some(_) => invalid_char!(ctx, shift=-1)
match ctx.peek() {
Some(']') => {
ctx.bump()
RBracket
}
Some(',') => {
ctx.bump()
Comma
}
Some(c) => invalid_char!(ctx, c)
None => raise InvalidEof
}
}

///|
/// Lexes the token that follows an object property name.
///
/// Skips whitespace, then yields `Colon` for `:`. Any other character is
/// reported through `invalid_char`, and running out of input raises
/// `InvalidEof`.
fn lex_after_property_name(ctx : ParseContext) -> Token!ParseError {
lex_skip_whitespace(ctx)
match read_char(ctx) {
Some(':') => Colon
Some(_) => invalid_char!(ctx, shift=-1)
match ctx.peek() {
Some(':') => {
ctx.bump()
Colon
}
Some(c) => invalid_char!(ctx, c)
None => raise InvalidEof
}
}

///|
/// Lexes the token that follows an object property value.
///
/// Skips whitespace, then yields `RBrace` for `}` or `Comma` for `,`.
/// Any other character is reported through `invalid_char`, and running out
/// of input raises `InvalidEof`.
fn lex_after_object_value(ctx : ParseContext) -> Token!ParseError {
lex_skip_whitespace(ctx)
match read_char(ctx) {
Some('}') => Token::RBrace
Some(',') => Token::Comma
Some(_) => invalid_char!(ctx, shift=-1)
match ctx.peek() {
Some('}') => {
ctx.bump()
RBrace
}
Some(',') => {
ctx.bump()
Comma
}
Some(c) => invalid_char!(ctx, c)
None => raise InvalidEof
}
}

///|
/// Requires the next input character to be exactly `c`.
///
/// A matching character is consumed. A different character is reported
/// through `invalid_char`, and running out of input raises `InvalidEof`.
fn lex_assert_char(ctx : ParseContext, c : Char) -> Unit!ParseError {
match read_char(ctx) {
Some(c2) => if c == c2 { () } else { invalid_char!(ctx, shift=-1) }
match ctx.peek() {
Some(c2) => if c == c2 { ctx.bump() } else { invalid_char!(ctx, c2) }
None => raise InvalidEof
}
}

///|
/// Lexes what may appear where an object property name is expected and the
/// object is still allowed to close.
///
/// Skips whitespace, then yields `RBrace` for `}` or, after an opening `"`,
/// a `String` token produced by `lex_string`. Any other character is reported
/// through `invalid_char`, and running out of input raises `InvalidEof`.
fn lex_property_name(ctx : ParseContext) -> Token!ParseError {
lex_skip_whitespace(ctx)
match read_char(ctx) {
Some('}') => RBrace
match ctx.peek() {
Some('}') => {
ctx.bump()
RBrace
}
Some('"') => {
ctx.bump()
let s = lex_string!(ctx)
String(s)
}
Some(_) => invalid_char!(ctx, shift=-1)
Some(c) => invalid_char!(ctx, c)
None => raise InvalidEof
}
}

///|
/// Lexes a mandatory object property name.
///
/// Skips whitespace, then requires an opening `"` and yields the `String`
/// token produced by `lex_string`. Unlike `lex_property_name`, a closing `}`
/// is not accepted here. Any other character is reported through
/// `invalid_char`, and running out of input raises `InvalidEof`.
fn lex_property_name2(ctx : ParseContext) -> Token!ParseError {
lex_skip_whitespace(ctx)
match read_char(ctx) {
match ctx.peek() {
Some('"') => {
ctx.bump()
let s = lex_string!(ctx)
String(s)
}
Some(_) => invalid_char!(ctx, shift=-1)
Some(c) => invalid_char!(ctx, c)
None => raise InvalidEof
}
}
Loading

0 comments on commit a1799ff

Please sign in to comment.