@@ -285,14 +285,35 @@ let codepoint i =
285
285
failwith (Printf. sprintf " Invalid Unicode code point: %i" i);
286
286
i
287
287
288
- let regexp_for_char c = Sedlex. chars (Cset. singleton (Char. code c))
288
+ let fold_bytes ~f acc s =
289
+ let rec loop acc n =
290
+ if n = String. length s then acc
291
+ else (
292
+ let acc = f acc s.[n] in
293
+ loop acc (succ n))
294
+ in
295
+ loop acc 0
289
296
290
- let regexp_for_string s =
291
- let rec aux n =
292
- if n = String. length s then Sedlex. eps
293
- else Sedlex. seq (regexp_for_char s.[n]) (aux (succ n))
297
+ let regexp_for_char c = Sedlex. chars (Cset. singleton (Char. code c))
298
+ let regexp_for_uchar c = Sedlex. chars (Cset. singleton (Uchar. to_int c))
299
+
300
+ let regexp_for_string ~utf8 s =
301
+ let l =
302
+ if utf8 then
303
+ List. rev
304
+ (Utf8. fold
305
+ ~f: (fun acc _ uchar ->
306
+ match uchar with
307
+ | `Malformed _ -> assert false
308
+ | `Uchar uchar -> uchar :: acc)
309
+ [] s)
310
+ else List. rev (fold_bytes ~f: (fun acc c -> Uchar. of_char c :: acc) [] s)
311
+ in
312
+ let rec aux = function
313
+ | [] -> Sedlex. eps
314
+ | x :: xs -> Sedlex. seq (regexp_for_uchar x) (aux xs)
294
315
in
295
- aux 0
316
+ aux l
296
317
297
318
let err loc s =
298
319
raise (Location. Error (Location.Error. createf ~loc " Sedlex: %s" s))
@@ -303,11 +324,11 @@ let rec repeat r = function
303
324
| n , m -> Sedlex. seq r (repeat r (n - 1 , m - 1 ))
304
325
305
326
let regexp_of_pattern env =
306
- let rec char_pair_op func name p tuple =
327
+ let rec char_pair_op func name ~ utf8 p tuple =
307
328
(* Construct something like Sub(a,b) *)
308
329
match tuple with
309
330
| Some { ppat_desc = Ppat_tuple [p0; p1] } -> begin
310
- match func (aux p0) (aux p1) with
331
+ match func (aux ~utf8 p0) (aux ~utf8 p1) with
311
332
| Some r -> r
312
333
| None ->
313
334
err p.ppat_loc @@ " the " ^ name
@@ -317,16 +338,20 @@ let regexp_of_pattern env =
317
338
| _ ->
318
339
err p.ppat_loc @@ " the " ^ name
319
340
^ " operator requires two arguments, like " ^ name ^ " (a,b)"
320
- and aux p =
341
+ and aux ~ utf8 p =
321
342
(* interpret one pattern node *)
322
343
match p.ppat_desc with
323
- | Ppat_or (p1 , p2 ) -> Sedlex. alt (aux p1) (aux p2)
344
+ | Ppat_or (p1 , p2 ) -> Sedlex. alt (aux ~utf8 p1) (aux ~utf8 p2)
324
345
| Ppat_tuple (p :: pl ) ->
325
- List. fold_left (fun r p -> Sedlex. seq r (aux p)) (aux p) pl
346
+ List. fold_left
347
+ (fun r p -> Sedlex. seq r (aux ~utf8 p))
348
+ (aux ~utf8 p) pl
326
349
| Ppat_construct ({ txt = Lident "Star" } , Some (_ , p )) ->
327
- Sedlex. rep (aux p)
350
+ Sedlex. rep (aux ~utf8 p)
328
351
| Ppat_construct ({ txt = Lident "Plus" } , Some (_ , p )) ->
329
- Sedlex. plus (aux p)
352
+ Sedlex. plus (aux ~utf8 p)
353
+ | Ppat_construct ({ txt = Lident "Utf8" } , Some (_ , p )) ->
354
+ aux ~utf8: true p
330
355
| Ppat_construct
331
356
( { txt = Lident " Rep" },
332
357
Some
@@ -346,19 +371,19 @@ let regexp_of_pattern env =
346
371
| Pconst_integer (i1 , _ ), Pconst_integer (i2 , _ ) ->
347
372
let i1 = int_of_string i1 in
348
373
let i2 = int_of_string i2 in
349
- if 0 < = i1 && i1 < = i2 then repeat (aux p0) (i1, i2)
374
+ if 0 < = i1 && i1 < = i2 then repeat (aux ~utf8 p0) (i1, i2)
350
375
else err p.ppat_loc " Invalid range for Rep operator"
351
376
| _ ->
352
377
err p.ppat_loc " Rep must take an integer constant or interval"
353
378
end
354
379
| Ppat_construct ({ txt = Lident "Rep" } , _ ) ->
355
380
err p.ppat_loc " the Rep operator takes 2 arguments"
356
381
| Ppat_construct ({ txt = Lident "Opt" } , Some (_ , p )) ->
357
- Sedlex. alt Sedlex. eps (aux p)
382
+ Sedlex. alt Sedlex. eps (aux ~utf8 p)
358
383
| Ppat_construct ({ txt = Lident "Compl" } , arg ) -> begin
359
384
match arg with
360
385
| Some (_ , p0 ) -> begin
361
- match Sedlex. compl (aux p0) with
386
+ match Sedlex. compl (aux ~utf8 p0) with
362
387
| Some r -> r
363
388
| None ->
364
389
err p.ppat_loc
@@ -368,26 +393,40 @@ let regexp_of_pattern env =
368
393
| _ -> err p.ppat_loc " the Compl operator requires an argument"
369
394
end
370
395
| Ppat_construct ({ txt = Lident "Sub" } , arg ) ->
371
- char_pair_op Sedlex. subtract " Sub" p
396
+ char_pair_op ~utf8 Sedlex. subtract " Sub" p
372
397
(Option. map (fun (_ , arg ) -> arg) arg)
373
398
| Ppat_construct ({ txt = Lident "Intersect" } , arg ) ->
374
- char_pair_op Sedlex. intersection " Intersect" p
399
+ char_pair_op ~utf8 Sedlex. intersection " Intersect" p
375
400
(Option. map (fun (_ , arg ) -> arg) arg)
376
- | Ppat_construct ({ txt = Lident "Chars" } , arg ) -> (
401
+ | Ppat_construct ({ txt = Lident "Chars" } , arg ) ->
377
402
let const =
378
403
match arg with
379
404
| Some (_ , { ppat_desc = Ppat_constant const } ) -> Some const
380
405
| _ -> None
381
406
in
382
- match const with
383
- | Some (Pconst_string (s , _ , _ )) ->
384
- let c = ref Cset. empty in
385
- for i = 0 to String. length s - 1 do
386
- c := Cset. union ! c (Cset. singleton (Char. code s.[i]))
387
- done ;
388
- Sedlex. chars ! c
389
- | _ ->
390
- err p.ppat_loc " the Chars operator requires a string argument" )
407
+ begin
408
+ match const with
409
+ | Some (Pconst_string (s , _ , _ )) ->
410
+ let chars =
411
+ if utf8 then
412
+ Utf8. fold
413
+ ~f: (fun acc _ uchar ->
414
+ match uchar with
415
+ | `Malformed _ -> assert false
416
+ | `Uchar uchar ->
417
+ Cset. union acc
418
+ (Cset. singleton (Uchar. to_int uchar)))
419
+ Cset. empty s
420
+ else
421
+ fold_bytes
422
+ ~f: (fun acc c ->
423
+ Cset. union acc (Cset. singleton (Char. code c)))
424
+ Cset. empty s
425
+ in
426
+ Sedlex. chars chars
427
+ | _ ->
428
+ err p.ppat_loc " the Chars operator requires a string argument"
429
+ end
391
430
| Ppat_interval (i_start , i_end ) -> begin
392
431
match (i_start, i_end) with
393
432
| Pconst_char c1 , Pconst_char c2 ->
@@ -401,7 +440,7 @@ let regexp_of_pattern env =
401
440
end
402
441
| Ppat_constant const -> begin
403
442
match const with
404
- | Pconst_string (s , _ , _ ) -> regexp_for_string s
443
+ | Pconst_string (s , _ , _ ) -> regexp_for_string ~utf8 s
405
444
| Pconst_char c -> regexp_for_char c
406
445
| Pconst_integer (i , _ ) ->
407
446
Sedlex. chars (Cset. singleton (codepoint (int_of_string i)))
@@ -414,7 +453,7 @@ let regexp_of_pattern env =
414
453
end
415
454
| _ -> err p.ppat_loc " this pattern is not a valid regexp"
416
455
in
417
- aux
456
+ aux ~utf8: false
418
457
419
458
let previous = ref []
420
459
let regexps = ref []
0 commit comments