Skip to content

Commit 7009dae

Browse files
author
Hugo Heuzard
committed
POC: Add utf8 support for string literal
1 parent c9ea65b commit 7009dae

File tree

3 files changed

+147
-30
lines changed

3 files changed

+147
-30
lines changed

src/syntax/ppx_sedlex.ml

Lines changed: 69 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -285,14 +285,35 @@ let codepoint i =
285285
failwith (Printf.sprintf "Invalid Unicode code point: %i" i);
286286
i
287287

288-
let regexp_for_char c = Sedlex.chars (Cset.singleton (Char.code c))
288+
let fold_bytes ~f acc s =
289+
let rec loop acc n =
290+
if n = String.length s then acc
291+
else (
292+
let acc = f acc s.[n] in
293+
loop acc (succ n))
294+
in
295+
loop acc 0
289296

290-
let regexp_for_string s =
291-
let rec aux n =
292-
if n = String.length s then Sedlex.eps
293-
else Sedlex.seq (regexp_for_char s.[n]) (aux (succ n))
297+
let regexp_for_char c = Sedlex.chars (Cset.singleton (Char.code c))
298+
let regexp_for_uchar c = Sedlex.chars (Cset.singleton (Uchar.to_int c))
299+
300+
let regexp_for_string ~utf8 s =
301+
let l =
302+
if utf8 then
303+
List.rev
304+
(Utf8.fold
305+
~f:(fun acc _ uchar ->
306+
match uchar with
307+
| `Malformed _ -> assert false
308+
| `Uchar uchar -> uchar :: acc)
309+
[] s)
310+
else List.rev (fold_bytes ~f:(fun acc c -> Uchar.of_char c :: acc) [] s)
311+
in
312+
let rec aux = function
313+
| [] -> Sedlex.eps
314+
| x :: xs -> Sedlex.seq (regexp_for_uchar x) (aux xs)
294315
in
295-
aux 0
316+
aux l
296317

297318
let err loc s =
298319
raise (Location.Error (Location.Error.createf ~loc "Sedlex: %s" s))
@@ -303,11 +324,11 @@ let rec repeat r = function
303324
| n, m -> Sedlex.seq r (repeat r (n - 1, m - 1))
304325

305326
let regexp_of_pattern env =
306-
let rec char_pair_op func name p tuple =
327+
let rec char_pair_op func name ~utf8 p tuple =
307328
(* Construct something like Sub(a,b) *)
308329
match tuple with
309330
| Some { ppat_desc = Ppat_tuple [p0; p1] } -> begin
310-
match func (aux p0) (aux p1) with
331+
match func (aux ~utf8 p0) (aux ~utf8 p1) with
311332
| Some r -> r
312333
| None ->
313334
err p.ppat_loc @@ "the " ^ name
@@ -317,16 +338,20 @@ let regexp_of_pattern env =
317338
| _ ->
318339
err p.ppat_loc @@ "the " ^ name
319340
^ " operator requires two arguments, like " ^ name ^ "(a,b)"
320-
and aux p =
341+
and aux ~utf8 p =
321342
(* interpret one pattern node *)
322343
match p.ppat_desc with
323-
| Ppat_or (p1, p2) -> Sedlex.alt (aux p1) (aux p2)
344+
| Ppat_or (p1, p2) -> Sedlex.alt (aux ~utf8 p1) (aux ~utf8 p2)
324345
| Ppat_tuple (p :: pl) ->
325-
List.fold_left (fun r p -> Sedlex.seq r (aux p)) (aux p) pl
346+
List.fold_left
347+
(fun r p -> Sedlex.seq r (aux ~utf8 p))
348+
(aux ~utf8 p) pl
326349
| Ppat_construct ({ txt = Lident "Star" }, Some (_, p)) ->
327-
Sedlex.rep (aux p)
350+
Sedlex.rep (aux ~utf8 p)
328351
| Ppat_construct ({ txt = Lident "Plus" }, Some (_, p)) ->
329-
Sedlex.plus (aux p)
352+
Sedlex.plus (aux ~utf8 p)
353+
| Ppat_construct ({ txt = Lident "Utf8" }, Some (_, p)) ->
354+
aux ~utf8:true p
330355
| Ppat_construct
331356
( { txt = Lident "Rep" },
332357
Some
@@ -346,19 +371,19 @@ let regexp_of_pattern env =
346371
| Pconst_integer (i1, _), Pconst_integer (i2, _) ->
347372
let i1 = int_of_string i1 in
348373
let i2 = int_of_string i2 in
349-
if 0 <= i1 && i1 <= i2 then repeat (aux p0) (i1, i2)
374+
if 0 <= i1 && i1 <= i2 then repeat (aux ~utf8 p0) (i1, i2)
350375
else err p.ppat_loc "Invalid range for Rep operator"
351376
| _ ->
352377
err p.ppat_loc "Rep must take an integer constant or interval"
353378
end
354379
| Ppat_construct ({ txt = Lident "Rep" }, _) ->
355380
err p.ppat_loc "the Rep operator takes 2 arguments"
356381
| Ppat_construct ({ txt = Lident "Opt" }, Some (_, p)) ->
357-
Sedlex.alt Sedlex.eps (aux p)
382+
Sedlex.alt Sedlex.eps (aux ~utf8 p)
358383
| Ppat_construct ({ txt = Lident "Compl" }, arg) -> begin
359384
match arg with
360385
| Some (_, p0) -> begin
361-
match Sedlex.compl (aux p0) with
386+
match Sedlex.compl (aux ~utf8 p0) with
362387
| Some r -> r
363388
| None ->
364389
err p.ppat_loc
@@ -368,26 +393,40 @@ let regexp_of_pattern env =
368393
| _ -> err p.ppat_loc "the Compl operator requires an argument"
369394
end
370395
| Ppat_construct ({ txt = Lident "Sub" }, arg) ->
371-
char_pair_op Sedlex.subtract "Sub" p
396+
char_pair_op ~utf8 Sedlex.subtract "Sub" p
372397
(Option.map (fun (_, arg) -> arg) arg)
373398
| Ppat_construct ({ txt = Lident "Intersect" }, arg) ->
374-
char_pair_op Sedlex.intersection "Intersect" p
399+
char_pair_op ~utf8 Sedlex.intersection "Intersect" p
375400
(Option.map (fun (_, arg) -> arg) arg)
376-
| Ppat_construct ({ txt = Lident "Chars" }, arg) -> (
401+
| Ppat_construct ({ txt = Lident "Chars" }, arg) ->
377402
let const =
378403
match arg with
379404
| Some (_, { ppat_desc = Ppat_constant const }) -> Some const
380405
| _ -> None
381406
in
382-
match const with
383-
| Some (Pconst_string (s, _, _)) ->
384-
let c = ref Cset.empty in
385-
for i = 0 to String.length s - 1 do
386-
c := Cset.union !c (Cset.singleton (Char.code s.[i]))
387-
done;
388-
Sedlex.chars !c
389-
| _ ->
390-
err p.ppat_loc "the Chars operator requires a string argument")
407+
begin
408+
match const with
409+
| Some (Pconst_string (s, _, _)) ->
410+
let chars =
411+
if utf8 then
412+
Utf8.fold
413+
~f:(fun acc _ uchar ->
414+
match uchar with
415+
| `Malformed _ -> assert false
416+
| `Uchar uchar ->
417+
Cset.union acc
418+
(Cset.singleton (Uchar.to_int uchar)))
419+
Cset.empty s
420+
else
421+
fold_bytes
422+
~f:(fun acc c ->
423+
Cset.union acc (Cset.singleton (Char.code c)))
424+
Cset.empty s
425+
in
426+
Sedlex.chars chars
427+
| _ ->
428+
err p.ppat_loc "the Chars operator requires a string argument"
429+
end
391430
| Ppat_interval (i_start, i_end) -> begin
392431
match (i_start, i_end) with
393432
| Pconst_char c1, Pconst_char c2 ->
@@ -401,7 +440,7 @@ let regexp_of_pattern env =
401440
end
402441
| Ppat_constant const -> begin
403442
match const with
404-
| Pconst_string (s, _, _) -> regexp_for_string s
443+
| Pconst_string (s, _, _) -> regexp_for_string ~utf8 s
405444
| Pconst_char c -> regexp_for_char c
406445
| Pconst_integer (i, _) ->
407446
Sedlex.chars (Cset.singleton (codepoint (int_of_string i)))
@@ -414,7 +453,7 @@ let regexp_of_pattern env =
414453
end
415454
| _ -> err p.ppat_loc "this pattern is not a valid regexp"
416455
in
417-
aux
456+
aux ~utf8:false
418457

419458
let previous = ref []
420459
let regexps = ref []

src/syntax/utf8.ml

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
let unsafe_byte s j = Char.code (String.unsafe_get s j)
2+
let malformed s j l = `Malformed (String.sub s j l)
3+
4+
let width = function
5+
| '\000' .. '\127' -> 1
6+
| '\192' .. '\223' -> 2
7+
| '\224' .. '\239' -> 3
8+
| '\240' .. '\247' -> 4
9+
| _ -> 0
10+
11+
let r_utf_8 s j l =
12+
(* assert (0 <= j && 0 <= l && j + l <= String.length s); *)
13+
let uchar c = `Uchar (Uchar.unsafe_of_int c) in
14+
match l with
15+
| 1 -> uchar (unsafe_byte s j)
16+
| 2 ->
17+
let b0 = unsafe_byte s j in
18+
let b1 = unsafe_byte s (j + 1) in
19+
if b1 lsr 6 != 0b10 then malformed s j l
20+
else uchar (((b0 land 0x1F) lsl 6) lor (b1 land 0x3F))
21+
| 3 ->
22+
let b0 = unsafe_byte s j in
23+
let b1 = unsafe_byte s (j + 1) in
24+
let b2 = unsafe_byte s (j + 2) in
25+
let c =
26+
((b0 land 0x0F) lsl 12) lor ((b1 land 0x3F) lsl 6) lor (b2 land 0x3F)
27+
in
28+
if b2 lsr 6 != 0b10 then malformed s j l
29+
else begin
30+
match b0 with
31+
| 0xE0 ->
32+
if b1 < 0xA0 || 0xBF < b1 then malformed s j l else uchar c
33+
| 0xED ->
34+
if b1 < 0x80 || 0x9F < b1 then malformed s j l else uchar c
35+
| _ -> if b1 lsr 6 != 0b10 then malformed s j l else uchar c
36+
end
37+
| 4 ->
38+
let b0 = unsafe_byte s j in
39+
let b1 = unsafe_byte s (j + 1) in
40+
let b2 = unsafe_byte s (j + 2) in
41+
let b3 = unsafe_byte s (j + 3) in
42+
let c =
43+
((b0 land 0x07) lsl 18)
44+
lor ((b1 land 0x3F) lsl 12)
45+
lor ((b2 land 0x3F) lsl 6)
46+
lor (b3 land 0x3F)
47+
in
48+
if b3 lsr 6 != 0b10 || b2 lsr 6 != 0b10 then malformed s j l
49+
else begin
50+
match b0 with
51+
| 0xF0 ->
52+
if b1 < 0x90 || 0xBF < b1 then malformed s j l else uchar c
53+
| 0xF4 ->
54+
if b1 < 0x80 || 0x8F < b1 then malformed s j l else uchar c
55+
| _ -> if b1 lsr 6 != 0b10 then malformed s j l else uchar c
56+
end
57+
| _ -> assert false
58+
59+
let fold ~f acc s =
60+
let rec loop acc f s i last =
61+
if i > last then acc
62+
else (
63+
let need = width (String.unsafe_get s i) in
64+
if need = 0 then loop (f acc i (malformed s i 1)) f s (i + 1) last
65+
else (
66+
let rem = last - i + 1 in
67+
if rem < need then f acc i (malformed s i rem)
68+
else loop (f acc i (r_utf_8 s i need)) f s (i + need) last))
69+
in
70+
let pos = 0 in
71+
let len = String.length s in
72+
let last = pos + len - 1 in
73+
loop acc f s pos last

src/syntax/utf8.mli

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
val fold :
2+
f:('a -> int -> [> `Malformed of string | `Uchar of Uchar.t ] -> 'a) ->
3+
'a ->
4+
string ->
5+
'a

0 commit comments

Comments
 (0)