diff --git a/json/internal_types.mbt b/json/internal_types.mbt index 3c8b2174e..1376f1736 100644 --- a/json/internal_types.mbt +++ b/json/internal_types.mbt @@ -14,14 +14,25 @@ ///| priv struct ParseContext { - mut offset : Int - input : String - end_offset : Int + mut view : @string.StringView + mut line : Int + mut column : Int +} + +///| +priv struct Builder { + start_pos : Position + builder : StringBuilder +} + +///| +fn Builder::new(start_pos : Position) -> Builder { + { start_pos, builder: StringBuilder::new() } } ///| fn ParseContext::make(input : String) -> ParseContext { - { offset: 0, input, end_offset: input.length() } + { view: input[:], line: 1, column: 0 } } ///| diff --git a/json/lex_main.mbt b/json/lex_main.mbt index 7ea0f2c5e..5e1c0ae00 100644 --- a/json/lex_main.mbt +++ b/json/lex_main.mbt @@ -30,67 +30,91 @@ fn lex_value( allow_rbracket~ : Bool = false ) -> Token!ParseError { for { - match read_char(ctx) { - Some('\t' | ' ' | '\n' | '\r') => continue - Some('{') => return LBrace - Some('[') => return LBracket - Some(']') => + match ctx.peek() { + Some('\t' | ' ' | '\n' | '\r') => { + ctx.bump() + continue + } + Some('{') => { + ctx.bump() + return LBrace + } + Some('[') => { + ctx.bump() + return LBracket + } + Some(']' as ch) => if allow_rbracket { + ctx.bump() return RBracket } else { - invalid_char!(ctx, shift=-1) + invalid_char!(ctx, ch) } Some('n') => { + ctx.bump() lex_assert_char!(ctx, 'u') lex_assert_char!(ctx, 'l') lex_assert_char!(ctx, 'l') return Null } Some('t') => { + ctx.bump() lex_assert_char!(ctx, 'r') lex_assert_char!(ctx, 'u') lex_assert_char!(ctx, 'e') return True } Some('f') => { + ctx.bump() lex_assert_char!(ctx, 'a') lex_assert_char!(ctx, 'l') lex_assert_char!(ctx, 's') lex_assert_char!(ctx, 'e') return False } - Some('-') => - match read_char(ctx) { + Some('-') => { + let builder = Builder::new(ctx.position()) + ctx.bump(builder~) + match ctx.peek() { Some('0') => { - let n = lex_zero!(ctx, start=ctx.offset - 2) + ctx.bump(builder~) + let n = lex_zero!(ctx, builder) return Number(n) } Some(c2) => { if c2 >= '1' && c2 <= '9' { - let n = lex_decimal_integer!(ctx, start=ctx.offset - 2) + ctx.bump(builder~) + let n = lex_decimal_integer!(ctx, builder) return Number(n) } - invalid_char!(ctx, shift=-1) + invalid_char!(ctx, c2) } None => raise InvalidEof } + } Some('0') => { - let n = lex_zero!(ctx, start=ctx.offset - 1) + let builder = Builder::new(ctx.position()) + ctx.bump(builder~) + let n = lex_zero!(ctx, builder) return Number(n) } Some('1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9') => { - let n = lex_decimal_integer!(ctx, start=ctx.offset - 1) + let builder = Builder::new(ctx.position()) + ctx.bump(builder~) + let n = lex_decimal_integer!(ctx, builder) return Number(n) } Some('"') => { + ctx.bump() let s = lex_string!(ctx) return String(s) } Some(c) => { if c > '\x7f' && non_ascii_whitespace.contains(c) { + ctx.bump() continue } - invalid_char!(ctx, shift=-1) + invalid_char!(ctx, c) } None => raise InvalidEof } diff --git a/json/lex_misc.mbt b/json/lex_misc.mbt index 115ba1543..c3f604293 100644 --- a/json/lex_misc.mbt +++ b/json/lex_misc.mbt @@ -13,37 +13,53 @@ // limitations under the License. ///| -fn read_char(ctx : ParseContext) -> Char? { - if ctx.offset < ctx.end_offset { - let c = ctx.input[ctx.offset] - ctx.offset += 1 - let c1 = c.to_int() - if c1 >= 0xD800 && c1 <= 0xDBFF { - if ctx.offset < ctx.end_offset { - let c2 = ctx.input[ctx.offset].to_int() - if c2 >= 0xDC00 && c2 <= 0xDFFF { - ctx.offset += 1 - let c3 = (c1 << 10) + c2 - 0x35fdc00 - return Some(Char::from_int(c3)) - } +fn peek(self : ParseContext) -> Char? { + match self.view { + [] => None + [c, ..] => Some(c) + } +} + +///| +fn bump(self : ParseContext, builder? : Builder) -> Unit { + match self.view { + [] => + // this should be a misuse of the peek & bump api + abort("empty view") + [c, .. rest] => { + if c == '\n' { + self.line += 1 + self.column = 0 + } else { + self.column += 1 + } + self.view = rest + match builder { + None => () + Some(builder) => builder.builder.write_char(c) } } - Some(c) - } else { - None } } +///| +fn position(self : ParseContext) -> Position { + Position::{ line: self.line, column: self.column } +} + ///| fn lex_skip_whitespace(ctx : ParseContext) -> Unit { for { - match read_char(ctx) { - Some('\t' | '\u000B' | '\u000C' | ' ' | '\n' | '\r') => continue + match ctx.peek() { + Some('\t' | '\u000B' | '\u000C' | ' ' | '\n' | '\r') => { + ctx.bump() + continue + } Some(c) => { if c > '\x7f' && non_ascii_whitespace.contains(c) { + ctx.bump() continue } - ctx.offset -= 1 break } None => break @@ -54,10 +70,16 @@ fn lex_skip_whitespace(ctx : ParseContext) -> Unit { ///| fn lex_after_array_value(ctx : ParseContext) -> Token!ParseError { lex_skip_whitespace(ctx) - match read_char(ctx) { - Some(']') => RBracket - Some(',') => Comma - Some(_) => invalid_char!(ctx, shift=-1) + match ctx.peek() { + Some(']') => { + ctx.bump() + RBracket + } + Some(',') => { + ctx.bump() + Comma + } + Some(c) => invalid_char!(ctx, c) None => raise InvalidEof } } @@ -65,9 +87,12 @@ fn lex_after_array_value(ctx : ParseContext) -> Token!ParseError { ///| fn lex_after_property_name(ctx : ParseContext) -> Token!ParseError { lex_skip_whitespace(ctx) - match read_char(ctx) { - Some(':') => Colon - Some(_) => invalid_char!(ctx, shift=-1) + match ctx.peek() { + Some(':') => { + ctx.bump() + Colon + } + Some(c) => invalid_char!(ctx, c) None => raise InvalidEof } } @@ -75,18 +100,24 @@ fn lex_after_property_name(ctx : ParseContext) -> Token!ParseError { ///| fn lex_after_object_value(ctx : ParseContext) -> Token!ParseError { lex_skip_whitespace(ctx) - match read_char(ctx) { - Some('}') => Token::RBrace - Some(',') => Token::Comma - Some(_) => invalid_char!(ctx, shift=-1) + match ctx.peek() { + Some('}') => { + ctx.bump() + RBrace + } + Some(',') => { + ctx.bump() + Comma + } + Some(c) => invalid_char!(ctx, c) None => raise InvalidEof } } ///| fn lex_assert_char(ctx : ParseContext, c : Char) -> Unit!ParseError { - match read_char(ctx) { - Some(c2) => if c == c2 { () } else { invalid_char!(ctx, shift=-1) } + match ctx.peek() { + Some(c2) => if c == c2 { ctx.bump() } else { invalid_char!(ctx, c2) } None => raise InvalidEof } } @@ -94,13 +125,17 @@ fn lex_assert_char(ctx : ParseContext, c : Char) -> Unit!ParseError { ///| fn lex_property_name(ctx : ParseContext) -> Token!ParseError { lex_skip_whitespace(ctx) - match read_char(ctx) { - Some('}') => RBrace + match ctx.peek() { + Some('}') => { + ctx.bump() + RBrace + } Some('"') => { + ctx.bump() let s = lex_string!(ctx) String(s) } - Some(_) => invalid_char!(ctx, shift=-1) + Some(c) => invalid_char!(ctx, c) None => raise InvalidEof } } @@ -108,12 +143,13 @@ fn lex_property_name(ctx : ParseContext) -> Token!ParseError { ///| fn lex_property_name2(ctx : ParseContext) -> Token!ParseError { lex_skip_whitespace(ctx) - match read_char(ctx) { + match ctx.peek() { Some('"') => { + ctx.bump() let s = lex_string!(ctx) String(s) } - Some(_) => invalid_char!(ctx, shift=-1) + Some(c) => invalid_char!(ctx, c) None => raise InvalidEof } } diff --git a/json/lex_number.mbt b/json/lex_number.mbt index 9db17665d..8f6ec0277 100644 --- a/json/lex_number.mbt +++ b/json/lex_number.mbt @@ -13,63 +13,88 @@ // limitations under the License. ///| -fn lex_decimal_integer(ctx : ParseContext, start~ : Int) -> Double!ParseError { +fn lex_decimal_integer( + ctx : ParseContext, + builder : Builder +) -> Double!ParseError { for { - match read_char(ctx) { - Some('.') => return lex_decimal_point!(ctx, start~) - Some('e' | 'E') => return lex_decimal_exponent!(ctx, start~) + match ctx.peek() { + Some('.') => { + ctx.bump(builder~) + return lex_decimal_point!(ctx, builder) + } + Some('e' | 'E') => { + ctx.bump(builder~) + return lex_decimal_exponent!(ctx, builder) + } Some(c) => { if c >= '0' && c <= '9' { + ctx.bump(builder~) continue } - ctx.offset -= 1 - return lex_number_end!(ctx, start, ctx.offset) + return build_number!(builder) } - None => return lex_number_end!(ctx, start, ctx.offset) + None => return build_number!(builder) } } } ///| -fn lex_decimal_point(ctx : ParseContext, start~ : Int) -> Double!ParseError { - match read_char(ctx) { +fn lex_decimal_point( + ctx : ParseContext, + builder : Builder +) -> Double!ParseError { + match ctx.peek() { Some(c) => if c >= '0' && c <= '9' { - lex_decimal_fraction!(ctx, start~) + ctx.bump(builder~) + lex_decimal_fraction!(ctx, builder) } else { - invalid_char!(ctx, shift=-1) + invalid_char!(ctx, c) } None => raise InvalidEof } } ///| -fn lex_decimal_fraction(ctx : ParseContext, start~ : Int) -> Double!ParseError { +fn lex_decimal_fraction( + ctx : ParseContext, + builder : Builder +) -> Double!ParseError { for { - match read_char(ctx) { - Some('e' | 'E') => return lex_decimal_exponent!(ctx, start~) + match ctx.peek() { + Some('e' | 'E') => { + ctx.bump(builder~) + return lex_decimal_exponent!(ctx, builder) + } Some(c) => { if c >= '0' && c <= '9' { + ctx.bump(builder~) continue } - ctx.offset -= 1 - return lex_number_end!(ctx, start, ctx.offset) + return build_number!(builder) } - None => return lex_number_end!(ctx, start, ctx.offset) + None => return build_number!(builder) } } } ///| -fn lex_decimal_exponent(ctx : ParseContext, start~ : Int) -> Double!ParseError { - match read_char(ctx) { - Some('+') | Some('-') => return lex_decimal_exponent_sign!(ctx, start~) +fn lex_decimal_exponent( + ctx : ParseContext, + builder : Builder +) -> Double!ParseError { + match ctx.peek() { + Some('+') | Some('-') => { + ctx.bump(builder~) + return lex_decimal_exponent_sign!(ctx, builder) + } Some(c) => { if c >= '0' && c <= '9' { - return lex_decimal_exponent_integer!(ctx, start~) + ctx.bump(builder~) + return lex_decimal_exponent_integer!(ctx, builder) } - ctx.offset -= 1 - invalid_char!(ctx) + invalid_char!(ctx, c) } None => raise InvalidEof } @@ -78,15 +103,15 @@ fn lex_decimal_exponent(ctx : ParseContext, start~ : Int) -> Double!ParseError { ///| fn lex_decimal_exponent_sign( ctx : ParseContext, - start~ : Int + builder : Builder ) -> Double!ParseError { - match read_char(ctx) { + match ctx.peek() { Some(c) => { if c >= '0' && c <= '9' { - return lex_decimal_exponent_integer!(ctx, start~) + ctx.bump(builder~) + return lex_decimal_exponent_integer!(ctx, builder) } - ctx.offset -= 1 - invalid_char!(ctx) + invalid_char!(ctx, c) } None => raise InvalidEof } @@ -95,49 +120,49 @@ fn lex_decimal_exponent_sign( ///| fn lex_decimal_exponent_integer( ctx : ParseContext, - start~ : Int + builder : Builder ) -> Double!ParseError { for { - match read_char(ctx) { + match ctx.peek() { Some(c) => { if c >= '0' && c <= '9' { + ctx.bump(builder~) continue } - ctx.offset -= 1 - return lex_number_end!(ctx, start, ctx.offset) + return build_number!(builder) } - None => return lex_number_end!(ctx, start, ctx.offset) + None => return build_number!(builder) } } } ///| -fn lex_zero(ctx : ParseContext, start~ : Int) -> Double!ParseError { - match read_char(ctx) { - Some('.') => lex_decimal_point!(ctx, start~) - Some('e' | 'E') => lex_decimal_exponent!(ctx, start~) +fn lex_zero(ctx : ParseContext, builder : Builder) -> Double!ParseError { + match ctx.peek() { + Some('.') => { + ctx.bump(builder~) + lex_decimal_point!(ctx, builder) + } + Some('e' | 'E') => { + ctx.bump(builder~) + lex_decimal_exponent!(ctx, builder) + } Some(c) => { if c >= '0' && c <= '9' { - ctx.offset -= 1 - invalid_char!(ctx) + invalid_char!(ctx, c) } - ctx.offset -= 1 - return lex_number_end!(ctx, start, ctx.offset) + return build_number!(builder) } - None => return lex_number_end!(ctx, start, ctx.offset) + None => return build_number!(builder) } } ///| -fn lex_number_end( - ctx : ParseContext, - start : Int, - end : Int -) -> Double!ParseError { - let s = ctx.input.substring(start~, end~) +fn build_number(builder : Builder) -> Double!ParseError { + let s = builder.builder.to_string() try { @strconv.parse_double!(s) } catch { - _ => raise InvalidNumber(offset_to_position(ctx.input, start), s) + _ => raise InvalidNumber(builder.start_pos, s) } } diff --git a/json/lex_string.mbt b/json/lex_string.mbt index cd13e0801..3d719a09c 100644 --- a/json/lex_string.mbt +++ b/json/lex_string.mbt @@ -14,45 +14,45 @@ ///| fn lex_string(ctx : ParseContext) -> String!ParseError { + // the first quotation mark is already consumed let buf = StringBuilder::new() - let mut start = ctx.offset - fn flush(end : Int) { - if start > 0 && end > start { - buf.write_substring(ctx.input, start, end - start) - } + fn bump_and_write(char : Char) { + ctx.bump() + buf.write_char(char) } for { - match read_char(ctx) { + match ctx.peek() { Some('"') => { - flush(ctx.offset - 1) + ctx.bump() break } - Some('\n' | '\r') => invalid_char!(ctx, shift=-1) + Some('\n' | '\r' as ch) => invalid_char!(ctx, ch) Some('\\') => { - flush(ctx.offset - 1) - match read_char(ctx) { - Some('b') => buf.write_char('\b') - Some('f') => buf.write_char('\x0C') - Some('n') => buf.write_char('\n') - Some('r') => buf.write_char('\r') - Some('t') => buf.write_char('\t') - Some('"') => buf.write_char('"') - Some('\\') => buf.write_char('\\') - Some('/') => buf.write_char('/') + ctx.bump() + match ctx.peek() { + Some('b') => bump_and_write('\b') + Some('f') => bump_and_write('\x0C') + Some('n') => bump_and_write('\n') + Some('r') => bump_and_write('\r') + Some('t') => bump_and_write('\t') + Some('"') => bump_and_write('"') + Some('\\') => bump_and_write('\\') + Some('/') => bump_and_write('/') Some('u') => { + ctx.bump() let c = lex_hex_digits!(ctx, 4) buf.write_char(Char::from_int(c)) } - Some(_) => invalid_char!(ctx, shift=-1) + Some(c) => invalid_char!(ctx, c) None => raise InvalidEof } - start = ctx.offset } Some(ch) => if ch.to_int() < 32 { - invalid_char!(ctx, shift=-1) + invalid_char!(ctx, ch) } else { + bump_and_write(ch) continue } None => raise InvalidEof @@ -65,22 +65,24 @@ fn lex_string(ctx : ParseContext) -> String!ParseError { fn lex_hex_digits(ctx : ParseContext, n : Int) -> Int!ParseError { let mut r = 0 for i = 0; i < n; i = i + 1 { - match read_char(ctx) { + match ctx.peek() { Some(c) => if c >= 'A' { let d = (c.to_int() & (32).lnot()) - 'A'.to_int() + 10 if d > 15 { - invalid_char!(ctx, shift=-1) + invalid_char!(ctx, c) } + ctx.bump() r = (r << 4) | d } else if c >= '0' { let d = c.to_int() - '0'.to_int() if d > 9 { - invalid_char!(ctx, shift=-1) + invalid_char!(ctx, c) } + ctx.bump() r = (r << 4) | d } else { - invalid_char!(ctx, shift=-1) + invalid_char!(ctx, c) } None => raise InvalidEof } diff --git a/json/parse.mbt b/json/parse.mbt index a9b3ae2c7..86fb0b2d2 100644 --- a/json/parse.mbt +++ b/json/parse.mbt @@ -27,10 +27,10 @@ pub fn parse(input : String) -> JsonValue!ParseError { let ctx = ParseContext::make(input) let val = parse_value!(ctx) lex_skip_whitespace(ctx) - if ctx.offset >= ctx.end_offset { + if ctx.view.is_empty() { val } else { - invalid_char!(ctx) + invalid_char!(ctx, ctx.view[0]) } } diff --git a/json/utils.mbt b/json/utils.mbt index 43adf0a84..e9fc53501 100644 --- a/json/utils.mbt +++ b/json/utils.mbt @@ -13,23 +13,6 @@ // limitations under the License. ///| -fn offset_to_position(input : String, offset : Int) -> Position { - let mut line = 1 - let mut column = 0 - for i = 0; i < offset; i = i + 1 { - let c = input[i] - if c == '\n' { - line += 1 - column = 0 - } else { - column += 1 - } - } - return Position::{ line, column } -} - -///| -fn invalid_char[T](ctx : ParseContext, shift~ : Int = 0) -> T!ParseError { - let offset = ctx.offset + shift - raise InvalidChar(offset_to_position(ctx.input, offset), ctx.input[offset]) +fn invalid_char[T](ctx : ParseContext, char : Char) -> T!ParseError { + raise InvalidChar(ctx.position(), char) }