Skip to content

Commit fc81591

Browse files
committed
Parser: Use lexer for quoted strings in code block metadata
1 parent 12716ae commit fc81591

File tree

3 files changed

+116
-115
lines changed

3 files changed

+116
-115
lines changed

src/parser/lexer.mll

Lines changed: 66 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,8 @@ type input = {
4242
lexbuf : Lexing.lexbuf;
4343
}
4444

45+
let string_buffer = Buffer.create 256
46+
4547
let with_location_adjustments
4648
k input ?start_offset ?adjust_start_by ?end_offset ?adjust_end_by value =
4749

@@ -83,40 +85,6 @@ let warning_loc =
8385
let warning =
8486
with_location_adjustments warning_loc
8587

86-
let unescape_tag s offset input =
87-
let start_loc = Lexing.lexeme_start input.lexbuf + String.length offset in
88-
let warn n c =
89-
let start = input.offset_to_location @@ start_loc + n in
90-
let end_ = input.offset_to_location @@ start_loc + n + 2 in
91-
let loc =
92-
{
93-
Loc.file = input.file; start; end_;
94-
}
95-
in
96-
warning_loc input loc (Parse_error.should_not_be_escaped c)
97-
in
98-
(* The common case is that there are no escape sequences. *)
99-
match String.index s '\\' with
100-
| exception Not_found -> s
101-
| _ ->
102-
let maybe_warn index = function '\\' | '"' -> () | _ as c -> warn index c in
103-
let buffer = Buffer.create (String.length s) in
104-
let rec scan_word index =
105-
if index >= String.length s then ()
106-
else
107-
let c, increment =
108-
match s.[index] with
109-
| '\\' when index + 1 < String.length s ->
110-
maybe_warn index s.[index + 1];
111-
(s.[index + 1], 2)
112-
| _ as c -> (c, 1)
113-
in
114-
Buffer.add_char buffer c;
115-
scan_word (index + increment)
116-
in
117-
scan_word 0;
118-
Buffer.contents buffer
119-
12088
let reference_token media start target input lexbuf =
12189
match start with
12290
| "{!" -> `Simple_reference target
@@ -402,7 +370,7 @@ and token input = parse
402370
code_block allow_result_block start_offset content_offset metadata
403371
prefix delim input lexbuf
404372
in
405-
match code_block_metadata_tail input [] lexbuf with
373+
match code_block_metadata_tail input None [] lexbuf with
406374
| `Ok metadata -> code_block_with_metadata metadata
407375
| `Eof ->
408376
warning input ~start_offset Parse_error.truncated_code_block_meta;
@@ -724,8 +692,6 @@ and verbatim buffer last_false_terminator start_offset input = parse
724692
{ Buffer.add_char buffer c;
725693
verbatim buffer last_false_terminator start_offset input lexbuf }
726694

727-
728-
729695
and bad_markup_recovery start_offset input = parse
730696
| [^ '}']+ as text '}' as rest
731697
{ let suggestion =
@@ -736,44 +702,72 @@ and bad_markup_recovery start_offset input = parse
736702
(Parse_error.bad_markup ("{" ^ rest) ~suggestion);
737703
emit input (`Code_span text) ~start_offset}
738704

705+
(* Based on OCaml's parsing/lexer.mll
706+
We're missing a bunch of cases here, and can add them
707+
if necessary. Using the missing cases will cause a warning *)
708+
and string input = parse
709+
| '\"'
710+
{ Buffer.contents string_buffer }
711+
| '\\' newline [' ' '\t']*
712+
{ string input lexbuf }
713+
| '\\' (['\\' '\'' '\"' 'n' 't' 'b' 'r' ' '] as c)
714+
{ Buffer.add_char string_buffer
715+
(match c with
716+
| '\\' -> '\\'
717+
| '\'' -> '\''
718+
| '\"' -> '\"'
719+
| 'n' -> '\n'
720+
| 't' -> '\t'
721+
| 'b' -> '\b'
722+
| 'r' -> '\r'
723+
| ' ' -> ' '
724+
| _ -> assert false);
725+
string input lexbuf }
726+
| '\\' (_ as c)
727+
{ warning input (Parse_error.should_not_be_escaped c);
728+
Buffer.add_char string_buffer c;
729+
string input lexbuf }
730+
| eof
731+
{ warning input Parse_error.truncated_string;
732+
Buffer.contents string_buffer }
733+
| (_ as c)
734+
{ Buffer.add_char string_buffer c;
735+
string input lexbuf }
736+
739737
and code_block_metadata_atom input = parse
740-
| '"' (tag_quoted_atom as value) '"'
738+
| '"'
741739
{
742-
let adjust_start_by = "\"" in
743-
with_location_adjustments ~adjust_start_by ~adjust_end_by:"\"" (fun _ -> Loc.at) input (unescape_tag value adjust_start_by input) }
740+
let start_offset = Lexing.lexeme_start input.lexbuf + 1 in
741+
Buffer.clear string_buffer;
742+
let s = string input lexbuf in
743+
with_location_adjustments ~start_offset ~adjust_end_by:"\"" (fun _ -> Loc.at) input s }
744744
| (tag_unquoted_atom as value)
745-
{ with_location_adjustments (fun _ -> Loc.at) input value }
746-
747-
and code_block_metadata_tail input acc = parse
748-
| (space_char* '[') { `Ok (List.rev acc) }
749-
| (space_char+ as prefix)
750-
'"' (tag_quoted_atom as key) '"' '='
751-
{
752-
let adjust_start_by = prefix ^ "\"" in
753-
let key = with_location_adjustments ~adjust_start_by ~adjust_end_by:"\"="
754-
(fun _ -> Loc.at) input (unescape_tag key adjust_start_by input) in
755-
let value = code_block_metadata_atom input lexbuf in
756-
code_block_metadata_tail input (`Binding (key, value) :: acc) lexbuf }
757-
| (space_char+ as prefix)
758-
'"' (tag_quoted_atom as tag) '"' {
759-
let adjust_start_by = prefix ^ "\"" in
760-
let tag = with_location_adjustments ~adjust_start_by ~adjust_end_by:"\""
761-
(fun _ -> Loc.at) input (unescape_tag tag adjust_start_by input) in
762-
code_block_metadata_tail input (`Tag tag :: acc) lexbuf }
763-
| (space_char+ as _prefix)
764-
(tag_unquoted_atom as key) '='
765-
{
766-
let key = with_location_adjustments (fun _ -> Loc.at) input key in
767-
let value = code_block_metadata_atom input lexbuf in
768-
code_block_metadata_tail input (`Binding (key, value) :: acc) lexbuf }
769-
| (space_char+ as _prefix)
770-
(tag_unquoted_atom as tag)
771-
{
772-
let tag = with_location_adjustments (fun _ -> Loc.at) input tag in
773-
code_block_metadata_tail input (`Tag tag :: acc) lexbuf }
774-
| _ as c
775-
{ `Invalid_char c }
776-
| eof
745+
{ with_location_adjustments (fun _ -> Loc.at) input value }
746+
| (_ as c)
747+
{ warning input (Parse_error.code_block_tag_invalid_char c);
748+
with_location_adjustments (fun _ -> Loc.at) input "" }
749+
750+
and code_block_metadata_tail input tag acc = parse
751+
| space_char+
752+
{ let acc = match tag with | Some t -> `Tag t :: acc | None -> acc in
753+
let tag = code_block_metadata_atom input lexbuf in
754+
code_block_metadata_tail input (Some tag) acc lexbuf }
755+
| space_char* '['
756+
{
757+
let acc = match tag with | Some t -> `Tag t :: acc | None -> acc in
758+
`Ok (List.rev acc) }
759+
| '='
760+
{ match tag with
761+
| Some t ->
762+
let value = code_block_metadata_atom input lexbuf in
763+
code_block_metadata_tail input None (`Binding (t, value) :: acc) lexbuf
764+
| None ->
765+
warning input (Parse_error.code_block_tag_invalid_char '=');
766+
code_block_metadata_tail input None acc lexbuf }
767+
| (_ # space_char # '[' # '=' as c) [^ '[']* '['
768+
{ warning input (Parse_error.code_block_tag_invalid_char c);
769+
`Ok (List.rev acc)}
770+
| eof
777771
{ `Eof }
778772

779773
and code_block allow_result_block start_offset content_offset metadata prefix delim input = parse

src/parser/parse_error.ml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,9 @@ let truncated_see : Loc.span -> Warning.t =
6565
Warning.make
6666
"'@see' should be followed by <url>, 'file', or \"document title\"."
6767

68+
let truncated_string : Loc.span -> Warning.t =
69+
Warning.make "Truncated string literal"
70+
6871
let unknown_tag : string -> Loc.span -> Warning.t =
6972
Warning.make "Unknown tag '%s'."
7073

@@ -82,6 +85,9 @@ let language_tag_invalid_char lang_tag : char -> Loc.span -> Warning.t =
8285
let suggestion = "try '{@" ^ lang_tag ^ "[ ... ]}'." in
8386
Warning.make ~suggestion "Invalid character '%c' in language tag."
8487

88+
let code_block_tag_invalid_char : char -> Loc.span -> Warning.t =
89+
Warning.make "Invalid character in code block metadata tag '%c'."
90+
8591
let truncated_code_block_meta : Loc.span -> Warning.t =
8692
Warning.make ~suggestion:"try '{@ocaml[ ... ]}'." "Missing end of code block."
8793

0 commit comments

Comments
 (0)