Skip to content

Commit a1799ff

Browse files
committed
refactor json parser
1 parent 9dbb833 commit a1799ff

File tree

7 files changed

+231
-150
lines changed

7 files changed

+231
-150
lines changed

json/internal_types.mbt

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,25 @@
1414

1515
///|
1616
priv struct ParseContext {
17-
mut offset : Int
18-
input : String
19-
end_offset : Int
17+
mut view : @string.StringView
18+
mut line : Int
19+
mut column : Int
20+
}
21+
22+
///|
23+
priv struct Builder {
24+
start_pos : Position
25+
builder : StringBuilder
26+
}
27+
28+
///|
29+
fn Builder::new(start_pos : Position) -> Builder {
30+
{ start_pos, builder: StringBuilder::new() }
2031
}
2132

2233
///|
2334
fn ParseContext::make(input : String) -> ParseContext {
24-
{ offset: 0, input, end_offset: input.length() }
35+
{ view: input[:], line: 1, column: 0 }
2536
}
2637

2738
///|

json/lex_main.mbt

Lines changed: 38 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -30,67 +30,91 @@ fn lex_value(
3030
allow_rbracket~ : Bool = false
3131
) -> Token!ParseError {
3232
for {
33-
match read_char(ctx) {
34-
Some('\t' | ' ' | '\n' | '\r') => continue
35-
Some('{') => return LBrace
36-
Some('[') => return LBracket
37-
Some(']') =>
33+
match ctx.peek() {
34+
Some('\t' | ' ' | '\n' | '\r') => {
35+
ctx.bump()
36+
continue
37+
}
38+
Some('{') => {
39+
ctx.bump()
40+
return LBrace
41+
}
42+
Some('[') => {
43+
ctx.bump()
44+
return LBracket
45+
}
46+
Some(']' as ch) =>
3847
if allow_rbracket {
48+
ctx.bump()
3949
return RBracket
4050
} else {
41-
invalid_char!(ctx, shift=-1)
51+
invalid_char!(ctx, ch)
4252
}
4353
Some('n') => {
54+
ctx.bump()
4455
lex_assert_char!(ctx, 'u')
4556
lex_assert_char!(ctx, 'l')
4657
lex_assert_char!(ctx, 'l')
4758
return Null
4859
}
4960
Some('t') => {
61+
ctx.bump()
5062
lex_assert_char!(ctx, 'r')
5163
lex_assert_char!(ctx, 'u')
5264
lex_assert_char!(ctx, 'e')
5365
return True
5466
}
5567
Some('f') => {
68+
ctx.bump()
5669
lex_assert_char!(ctx, 'a')
5770
lex_assert_char!(ctx, 'l')
5871
lex_assert_char!(ctx, 's')
5972
lex_assert_char!(ctx, 'e')
6073
return False
6174
}
62-
Some('-') =>
63-
match read_char(ctx) {
75+
Some('-') => {
76+
let builder = Builder::new(ctx.position())
77+
ctx.bump(builder~)
78+
match ctx.peek() {
6479
Some('0') => {
65-
let n = lex_zero!(ctx, start=ctx.offset - 2)
80+
ctx.bump(builder~)
81+
let n = lex_zero!(ctx, builder)
6682
return Number(n)
6783
}
6884
Some(c2) => {
6985
if c2 >= '1' && c2 <= '9' {
70-
let n = lex_decimal_integer!(ctx, start=ctx.offset - 2)
86+
ctx.bump(builder~)
87+
let n = lex_decimal_integer!(ctx, builder)
7188
return Number(n)
7289
}
73-
invalid_char!(ctx, shift=-1)
90+
invalid_char!(ctx, c2)
7491
}
7592
None => raise InvalidEof
7693
}
94+
}
7795
Some('0') => {
78-
let n = lex_zero!(ctx, start=ctx.offset - 1)
96+
let builder = Builder::new(ctx.position())
97+
ctx.bump(builder~)
98+
let n = lex_zero!(ctx, builder)
7999
return Number(n)
80100
}
81101
Some('1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9') => {
82-
let n = lex_decimal_integer!(ctx, start=ctx.offset - 1)
102+
let builder = Builder::new(ctx.position())
103+
ctx.bump(builder~)
104+
let n = lex_decimal_integer!(ctx, builder)
83105
return Number(n)
84106
}
85107
Some('"') => {
108+
ctx.bump()
86109
let s = lex_string!(ctx)
87110
return String(s)
88111
}
89112
Some(c) => {
90113
if c > '\x7f' && non_ascii_whitespace.contains(c) {
114+
ctx.bump()
91115
continue
92116
}
93-
invalid_char!(ctx, shift=-1)
117+
invalid_char!(ctx, c)
94118
}
95119
None => raise InvalidEof
96120
}

json/lex_misc.mbt

Lines changed: 73 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -13,37 +13,53 @@
1313
// limitations under the License.
1414

1515
///|
16-
fn read_char(ctx : ParseContext) -> Char? {
17-
if ctx.offset < ctx.end_offset {
18-
let c = ctx.input[ctx.offset]
19-
ctx.offset += 1
20-
let c1 = c.to_int()
21-
if c1 >= 0xD800 && c1 <= 0xDBFF {
22-
if ctx.offset < ctx.end_offset {
23-
let c2 = ctx.input[ctx.offset].to_int()
24-
if c2 >= 0xDC00 && c2 <= 0xDFFF {
25-
ctx.offset += 1
26-
let c3 = (c1 << 10) + c2 - 0x35fdc00
27-
return Some(Char::from_int(c3))
28-
}
16+
fn peek(self : ParseContext) -> Char? {
17+
match self.view {
18+
[] => None
19+
[c, ..] => Some(c)
20+
}
21+
}
22+
23+
///|
24+
fn bump(self : ParseContext, builder? : Builder) -> Unit {
25+
match self.view {
26+
[] =>
27+
// this should be a misuse of the peek & bump api
28+
abort("empty view")
29+
[c, .. rest] => {
30+
if c == '\n' {
31+
self.line += 1
32+
self.column = 0
33+
} else {
34+
self.column += 1
35+
}
36+
self.view = rest
37+
match builder {
38+
None => ()
39+
Some(builder) => builder.builder.write_char(c)
2940
}
3041
}
31-
Some(c)
32-
} else {
33-
None
3442
}
3543
}
3644

45+
///|
46+
fn position(self : ParseContext) -> Position {
47+
Position::{ line: self.line, column: self.column }
48+
}
49+
3750
///|
3851
fn lex_skip_whitespace(ctx : ParseContext) -> Unit {
3952
for {
40-
match read_char(ctx) {
41-
Some('\t' | '\u000B' | '\u000C' | ' ' | '\n' | '\r') => continue
53+
match ctx.peek() {
54+
Some('\t' | '\u000B' | '\u000C' | ' ' | '\n' | '\r') => {
55+
ctx.bump()
56+
continue
57+
}
4258
Some(c) => {
4359
if c > '\x7f' && non_ascii_whitespace.contains(c) {
60+
ctx.bump()
4461
continue
4562
}
46-
ctx.offset -= 1
4763
break
4864
}
4965
None => break
@@ -54,66 +70,86 @@ fn lex_skip_whitespace(ctx : ParseContext) -> Unit {
5470
///|
5571
fn lex_after_array_value(ctx : ParseContext) -> Token!ParseError {
5672
lex_skip_whitespace(ctx)
57-
match read_char(ctx) {
58-
Some(']') => RBracket
59-
Some(',') => Comma
60-
Some(_) => invalid_char!(ctx, shift=-1)
73+
match ctx.peek() {
74+
Some(']') => {
75+
ctx.bump()
76+
RBracket
77+
}
78+
Some(',') => {
79+
ctx.bump()
80+
Comma
81+
}
82+
Some(c) => invalid_char!(ctx, c)
6183
None => raise InvalidEof
6284
}
6385
}
6486

6587
///|
6688
fn lex_after_property_name(ctx : ParseContext) -> Token!ParseError {
6789
lex_skip_whitespace(ctx)
68-
match read_char(ctx) {
69-
Some(':') => Colon
70-
Some(_) => invalid_char!(ctx, shift=-1)
90+
match ctx.peek() {
91+
Some(':') => {
92+
ctx.bump()
93+
Colon
94+
}
95+
Some(c) => invalid_char!(ctx, c)
7196
None => raise InvalidEof
7297
}
7398
}
7499

75100
///|
76101
fn lex_after_object_value(ctx : ParseContext) -> Token!ParseError {
77102
lex_skip_whitespace(ctx)
78-
match read_char(ctx) {
79-
Some('}') => Token::RBrace
80-
Some(',') => Token::Comma
81-
Some(_) => invalid_char!(ctx, shift=-1)
103+
match ctx.peek() {
104+
Some('}') => {
105+
ctx.bump()
106+
RBrace
107+
}
108+
Some(',') => {
109+
ctx.bump()
110+
Comma
111+
}
112+
Some(c) => invalid_char!(ctx, c)
82113
None => raise InvalidEof
83114
}
84115
}
85116

86117
///|
87118
fn lex_assert_char(ctx : ParseContext, c : Char) -> Unit!ParseError {
88-
match read_char(ctx) {
89-
Some(c2) => if c == c2 { () } else { invalid_char!(ctx, shift=-1) }
119+
match ctx.peek() {
120+
Some(c2) => if c == c2 { ctx.bump() } else { invalid_char!(ctx, c2) }
90121
None => raise InvalidEof
91122
}
92123
}
93124

94125
///|
95126
fn lex_property_name(ctx : ParseContext) -> Token!ParseError {
96127
lex_skip_whitespace(ctx)
97-
match read_char(ctx) {
98-
Some('}') => RBrace
128+
match ctx.peek() {
129+
Some('}') => {
130+
ctx.bump()
131+
RBrace
132+
}
99133
Some('"') => {
134+
ctx.bump()
100135
let s = lex_string!(ctx)
101136
String(s)
102137
}
103-
Some(_) => invalid_char!(ctx, shift=-1)
138+
Some(c) => invalid_char!(ctx, c)
104139
None => raise InvalidEof
105140
}
106141
}
107142

108143
///|
109144
fn lex_property_name2(ctx : ParseContext) -> Token!ParseError {
110145
lex_skip_whitespace(ctx)
111-
match read_char(ctx) {
146+
match ctx.peek() {
112147
Some('"') => {
148+
ctx.bump()
113149
let s = lex_string!(ctx)
114150
String(s)
115151
}
116-
Some(_) => invalid_char!(ctx, shift=-1)
152+
Some(c) => invalid_char!(ctx, c)
117153
None => raise InvalidEof
118154
}
119155
}

0 commit comments

Comments
 (0)