6
6
import com .semmle .js .ast .regexp .Caret ;
7
7
import com .semmle .js .ast .regexp .CharacterClass ;
8
8
import com .semmle .js .ast .regexp .CharacterClassEscape ;
9
+ import com .semmle .js .ast .regexp .CharacterClassQuotedString ;
9
10
import com .semmle .js .ast .regexp .CharacterClassRange ;
11
+ import com .semmle .js .ast .regexp .CharacterClassSubtraction ;
10
12
import com .semmle .js .ast .regexp .Constant ;
11
13
import com .semmle .js .ast .regexp .ControlEscape ;
12
14
import com .semmle .js .ast .regexp .ControlLetter ;
18
20
import com .semmle .js .ast .regexp .Group ;
19
21
import com .semmle .js .ast .regexp .HexEscapeSequence ;
20
22
import com .semmle .js .ast .regexp .IdentityEscape ;
23
+ import com .semmle .js .ast .regexp .CharacterClassIntersection ;
21
24
import com .semmle .js .ast .regexp .NamedBackReference ;
22
25
import com .semmle .js .ast .regexp .NonWordBoundary ;
23
26
import com .semmle .js .ast .regexp .OctalEscape ;
36
39
import com .semmle .js .ast .regexp .ZeroWidthPositiveLookbehind ;
37
40
import java .util .ArrayList ;
38
41
import java .util .Arrays ;
42
+ import java .util .Collections ;
39
43
import java .util .List ;
40
44
41
45
/** A parser for ECMAScript regular expressions (ES2018 syntax, plus the ES2024 {@code v} flag extensions). */
@@ -67,6 +71,8 @@ public List<Error> getErrors() {
67
71
private List <Error > errors ;
68
72
private List <BackReference > backrefs ;
69
73
private int maxbackref ;
74
+ private boolean vFlagEnabled = false ;
75
+ private boolean uFlagEnabled = false ;
70
76
71
77
/** Parse the given string as a regular expression. */
72
78
public Result parse (String src ) {
@@ -82,6 +88,12 @@ public Result parse(String src) {
82
88
return new Result (root , errors );
83
89
}
84
90
91
+ public Result parse (String src , String flags ) {
92
+ vFlagEnabled = flags != null && flags .contains ("v" );
93
+ uFlagEnabled = flags != null && flags .contains ("u" );
94
+ return parse (src );
95
+ }
96
+
85
97
private static String fromCodePoint (int codepoint ) {
86
98
if (Character .isValidCodePoint (codepoint )) return new String (Character .toChars (codepoint ));
87
99
// replacement character
@@ -277,6 +289,43 @@ private RegExpTerm parseTerm() {
277
289
return this .finishTerm (this .parseQuantifierOpt (loc , this .parseAtom ()));
278
290
}
279
291
292
+ private RegExpTerm parseDisjunctionInsideQuotedString () {
293
+ SourceLocation loc = new SourceLocation (pos ());
294
+ List <RegExpTerm > disjuncts = new ArrayList <>();
295
+ disjuncts .add (this .parseAlternativeInsideQuotedString ());
296
+ while (this .match ("|" )) {
297
+ disjuncts .add (this .parseAlternativeInsideQuotedString ());
298
+ }
299
+ if (disjuncts .size () == 1 ) return disjuncts .get (0 );
300
+ return this .finishTerm (new Disjunction (loc , disjuncts ));
301
+ }
302
+
303
+ private RegExpTerm parseAlternativeInsideQuotedString () {
304
+ SourceLocation loc = new SourceLocation (pos ());
305
+ int startPos = this .pos ;
306
+ boolean escaped = false ;
307
+ while (true ) {
308
+ // If we're at the end of the string, something went wrong.
309
+ if (this .atEOS ()) {
310
+ this .error (Error .UNEXPECTED_EOS );
311
+ break ;
312
+ }
313
+ // We can end parsing if we're not escaped and we see a `|` which would mean Alternation
314
+ // or `}` which would mean the end of the Quoted String.
315
+ if (!escaped && this .lookahead (null , "|" , "}" )){
316
+ break ;
317
+ }
318
+ char c = this .nextChar ();
319
+ // Track whether the character is an escape character.
320
+ escaped = !escaped && (c == '\\' );
321
+ }
322
+ String literal = src .substring (startPos , pos );
323
+ loc .setEnd (pos ());
324
+ loc .setSource (literal );
325
+
326
+ return new Constant (loc , literal );
327
+ }
328
+
280
329
private RegExpTerm parseQuantifierOpt (SourceLocation loc , RegExpTerm atom ) {
281
330
if (this .match ("*" )) return this .finishTerm (new Star (loc , atom , !this .match ("?" )));
282
331
if (this .match ("+" )) return this .finishTerm (new Plus (loc , atom , !this .match ("?" )));
@@ -421,7 +470,13 @@ private RegExpTerm parseAtomEscape(SourceLocation loc, boolean inCharClass) {
421
470
return this .finishTerm (new NamedBackReference (loc , name , "\\ k<" + name + ">" ));
422
471
}
423
472
424
- if (this .match ("p{" , "P{" )) {
473
+ if (vFlagEnabled && this .match ("q{" )) {
474
+ RegExpTerm term = parseDisjunctionInsideQuotedString ();
475
+ this .expectRBrace ();
476
+ return this .finishTerm (new CharacterClassQuotedString (loc , term ));
477
+ }
478
+
479
+ if ((vFlagEnabled || uFlagEnabled ) && this .match ("p{" , "P{" )) {
425
480
String name = this .readIdentifier ();
426
481
if (this .match ("=" )) {
427
482
value = this .readIdentifier ();
@@ -493,6 +548,7 @@ private RegExpTerm parseAtomEscape(SourceLocation loc, boolean inCharClass) {
493
548
}
494
549
495
550
private RegExpTerm parseCharacterClass () {
551
+ if (vFlagEnabled ) return parseNestedCharacterClass ();
496
552
SourceLocation loc = new SourceLocation (pos ());
497
553
List <RegExpTerm > elements = new ArrayList <>();
498
554
@@ -508,6 +564,43 @@ private RegExpTerm parseCharacterClass() {
508
564
return this .finishTerm (new CharacterClass (loc , elements , inverted ));
509
565
}
510
566
567
+ private enum CharacterClassType {
568
+ STANDARD ,
569
+ INTERSECTION ,
570
+ SUBTRACTION
571
+ }
572
+
573
+ // ECMA 2024 `v` flag allows nested character classes.
574
+ private RegExpTerm parseNestedCharacterClass () {
575
+ SourceLocation loc = new SourceLocation (pos ());
576
+ List <RegExpTerm > elements = new ArrayList <>();
577
+ CharacterClassType classType = CharacterClassType .STANDARD ;
578
+
579
+ this .match ("[" );
580
+ boolean inverted = this .match ("^" );
581
+ while (!this .match ("]" )) {
582
+ if (this .atEOS ()) {
583
+ this .error (Error .EXPECTED_RBRACKET );
584
+ break ;
585
+ }
586
+ if (lookahead ("[" )) elements .add (parseNestedCharacterClass ());
587
+ else if (this .match ("&&" )) classType = CharacterClassType .INTERSECTION ;
588
+ else if (this .match ("--" )) classType = CharacterClassType .SUBTRACTION ;
589
+ else elements .add (this .parseCharacterClassElement ());
590
+ }
591
+
592
+ // Create appropriate RegExpTerm based on the detected class type
593
+ switch (classType ) {
594
+ case INTERSECTION :
595
+ return this .finishTerm (new CharacterClass (loc , Collections .singletonList (new CharacterClassIntersection (loc , elements )), inverted ));
596
+ case SUBTRACTION :
597
+ return this .finishTerm (new CharacterClass (loc , Collections .singletonList (new CharacterClassSubtraction (loc , elements )), inverted ));
598
+ case STANDARD :
599
+ default :
600
+ return this .finishTerm (new CharacterClass (loc , elements , inverted ));
601
+ }
602
+ }
603
+
511
604
private static final List <String > escapeClasses = Arrays .asList ("d" , "D" , "s" , "S" , "w" , "W" );
512
605
513
606
private RegExpTerm parseCharacterClassElement () {
@@ -519,7 +612,7 @@ private RegExpTerm parseCharacterClassElement() {
519
612
return atom ;
520
613
}
521
614
}
522
- if (!this .lookahead ("-]" ) && this .match ("-" ) && !(atom instanceof CharacterClassEscape ))
615
+ if (!this .lookahead ("-]" ) && ! this . lookahead ( "--" ) && this .match ("-" ) && !(atom instanceof CharacterClassEscape ))
523
616
return this .finishTerm (new CharacterClassRange (loc , atom , this .parseCharacterClassAtom ()));
524
617
return atom ;
525
618
}
0 commit comments