Skip to content

Commit a4f2264

Browse files
authored
Merge pull request #18899 from Napalys/js/ecma-2024-regex
JS: Add ECMAScript 2024 `v` Flag Operators for Regex Parsing
2 parents 22b36a8 + a900f2c commit a4f2264

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

44 files changed

+8106
-6
lines changed

javascript/downgrades/5b5db607d20c7b449cef2d1c926b24d77c69bebb/old.dbscheme

+1,193
Large diffs are not rendered by default.

javascript/downgrades/5b5db607d20c7b449cef2d1c926b24d77c69bebb/semmlecode.javascript.dbscheme

+1,190
Large diffs are not rendered by default.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
description: Add support for quoted string, intersection and subtraction
2+
compatibility: backwards
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
package com.semmle.js.ast.regexp;
2+
3+
import com.semmle.js.ast.SourceLocation;
4+
import java.util.List;
5+
6+
/**
7+
* A character class intersection in a regular expression available only with the `v` flag.
8+
* Example: [[abc]&&[ab]&&[b]] matches character `b` only.
9+
*/
10+
public class CharacterClassIntersection extends RegExpTerm {
11+
private final List<RegExpTerm> elements;
12+
13+
public CharacterClassIntersection(SourceLocation loc, List<RegExpTerm> elements) {
14+
super(loc, "CharacterClassIntersection");
15+
this.elements = elements;
16+
}
17+
18+
@Override
19+
public void accept(Visitor v) {
20+
v.visit(this);
21+
}
22+
23+
public List<RegExpTerm> getElements() {
24+
return elements;
25+
}
26+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
package com.semmle.js.ast.regexp;
2+
3+
import com.semmle.js.ast.SourceLocation;
4+
5+
/**
6+
* A quoted string escape sequence '\q{}' in a regular expression.
7+
* This feature is a non-standard extension that requires the 'v' flag.
8+
*
9+
* Example: [\q{abc|def}] creates a character class that matches either the string
10+
* "abc" or "def". Within the quoted string, only the alternation operator '|' is supported.
11+
*/
12+
public class CharacterClassQuotedString extends RegExpTerm {
13+
private final RegExpTerm term;
14+
15+
public CharacterClassQuotedString(SourceLocation loc, RegExpTerm term) {
16+
super(loc, "CharacterClassQuotedString");
17+
this.term = term;
18+
}
19+
20+
public RegExpTerm getTerm() {
21+
return term;
22+
}
23+
24+
@Override
25+
public void accept(Visitor v) {
26+
v.visit(this);
27+
}
28+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
package com.semmle.js.ast.regexp;
2+
3+
import com.semmle.js.ast.SourceLocation;
4+
import java.util.List;
5+
6+
/**
7+
* A character class subtraction in a regular expression available only with the `v` flag.
8+
* Example: [[abc]--[a]--[b]] matches character `c` only.
9+
*/
10+
public class CharacterClassSubtraction extends RegExpTerm {
11+
private final List<RegExpTerm> elements;
12+
13+
public CharacterClassSubtraction(SourceLocation loc, List<RegExpTerm> elements) {
14+
super(loc, "CharacterClassSubtraction");
15+
this.elements = elements;
16+
}
17+
18+
@Override
19+
public void accept(Visitor v) {
20+
v.visit(this);
21+
}
22+
23+
public List<RegExpTerm> getElements() {
24+
return elements;
25+
}
26+
}

javascript/extractor/src/com/semmle/js/ast/regexp/Visitor.java

+6
Original file line numberDiff line numberDiff line change
@@ -61,4 +61,10 @@ public interface Visitor {
6161
public void visit(ZeroWidthNegativeLookbehind nd);
6262

6363
public void visit(UnicodePropertyEscape nd);
64+
65+
public void visit(CharacterClassQuotedString nd);
66+
67+
public void visit(CharacterClassIntersection nd);
68+
69+
public void visit(CharacterClassSubtraction nd);
6470
}

javascript/extractor/src/com/semmle/js/extractor/ASTExtractor.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -600,7 +600,7 @@ public Label visit(Literal nd, Context c) {
600600
SourceMap sourceMap =
601601
SourceMap.legacyWithStartPos(
602602
SourceMap.fromString(nd.getRaw()).offsetBy(0, offsets), startPos);
603-
regexpExtractor.extract(source.substring(1, source.lastIndexOf('/')), sourceMap, nd, false);
603+
regexpExtractor.extract(source.substring(1, source.lastIndexOf('/')), sourceMap, nd, false, source.substring(source.lastIndexOf('/'), source.length()));
604604
} else if (nd.isStringLiteral()
605605
&& !c.isInsideType()
606606
&& nd.getRaw().length() < 1000

javascript/extractor/src/com/semmle/js/extractor/RegExpExtractor.java

+34-2
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,9 @@
1010
import com.semmle.js.ast.regexp.Caret;
1111
import com.semmle.js.ast.regexp.CharacterClass;
1212
import com.semmle.js.ast.regexp.CharacterClassEscape;
13+
import com.semmle.js.ast.regexp.CharacterClassQuotedString;
1314
import com.semmle.js.ast.regexp.CharacterClassRange;
15+
import com.semmle.js.ast.regexp.CharacterClassSubtraction;
1416
import com.semmle.js.ast.regexp.Constant;
1517
import com.semmle.js.ast.regexp.ControlEscape;
1618
import com.semmle.js.ast.regexp.ControlLetter;
@@ -22,6 +24,7 @@
2224
import com.semmle.js.ast.regexp.Group;
2325
import com.semmle.js.ast.regexp.HexEscapeSequence;
2426
import com.semmle.js.ast.regexp.IdentityEscape;
27+
import com.semmle.js.ast.regexp.CharacterClassIntersection;
2528
import com.semmle.js.ast.regexp.Literal;
2629
import com.semmle.js.ast.regexp.NamedBackReference;
2730
import com.semmle.js.ast.regexp.NonWordBoundary;
@@ -92,6 +95,9 @@ public RegExpExtractor(TrapWriter trapwriter, LocationManager locationManager) {
9295
termkinds.put("ZeroWidthPositiveLookbehind", 25);
9396
termkinds.put("ZeroWidthNegativeLookbehind", 26);
9497
termkinds.put("UnicodePropertyEscape", 27);
98+
termkinds.put("CharacterClassQuotedString", 28);
99+
termkinds.put("CharacterClassIntersection", 29);
100+
termkinds.put("CharacterClassSubtraction", 30);
95101
}
96102

97103
private static final String[] errmsgs =
@@ -344,10 +350,32 @@ public void visit(CharacterClassRange nd) {
344350
visit(nd.getLeft(), lbl, 0);
345351
visit(nd.getRight(), lbl, 1);
346352
}
353+
354+
@Override
355+
public void visit(CharacterClassQuotedString nd) {
356+
Label lbl = extractTerm(nd, parent, idx);
357+
visit(nd.getTerm(), lbl, 0);
358+
}
359+
360+
@Override
361+
public void visit(CharacterClassIntersection nd) {
362+
Label lbl = extractTerm(nd, parent, idx);
363+
int i = 0;
364+
for (RegExpTerm element : nd.getElements())
365+
visit(element, lbl, i++);
366+
}
367+
368+
@Override
369+
public void visit(CharacterClassSubtraction nd) {
370+
Label lbl = extractTerm(nd, parent, idx);
371+
int i = 0;
372+
for (RegExpTerm element : nd.getElements())
373+
visit(element, lbl, i++);
374+
}
347375
}
348376

349-
public void extract(String src, SourceMap sourceMap, Node parent, boolean isSpeculativeParsing) {
350-
Result res = parser.parse(src);
377+
public void extract(String src, SourceMap sourceMap, Node parent, boolean isSpeculativeParsing, String flags) {
378+
Result res = parser.parse(src, flags);
351379
if (isSpeculativeParsing && res.getErrors().size() > 0) {
352380
return;
353381
}
@@ -364,4 +392,8 @@ public void extract(String src, SourceMap sourceMap, Node parent, boolean isSpec
364392
this.emitLocation(err, lbl);
365393
}
366394
}
395+
396+
public void extract(String src, SourceMap sourceMap, Node parent, boolean isSpeculativeParsing) {
397+
extract(src, sourceMap, parent, isSpeculativeParsing, "");
398+
}
367399
}

javascript/extractor/src/com/semmle/js/parser/RegExpParser.java

+95-2
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,9 @@
66
import com.semmle.js.ast.regexp.Caret;
77
import com.semmle.js.ast.regexp.CharacterClass;
88
import com.semmle.js.ast.regexp.CharacterClassEscape;
9+
import com.semmle.js.ast.regexp.CharacterClassQuotedString;
910
import com.semmle.js.ast.regexp.CharacterClassRange;
11+
import com.semmle.js.ast.regexp.CharacterClassSubtraction;
1012
import com.semmle.js.ast.regexp.Constant;
1113
import com.semmle.js.ast.regexp.ControlEscape;
1214
import com.semmle.js.ast.regexp.ControlLetter;
@@ -18,6 +20,7 @@
1820
import com.semmle.js.ast.regexp.Group;
1921
import com.semmle.js.ast.regexp.HexEscapeSequence;
2022
import com.semmle.js.ast.regexp.IdentityEscape;
23+
import com.semmle.js.ast.regexp.CharacterClassIntersection;
2124
import com.semmle.js.ast.regexp.NamedBackReference;
2225
import com.semmle.js.ast.regexp.NonWordBoundary;
2326
import com.semmle.js.ast.regexp.OctalEscape;
@@ -36,6 +39,7 @@
3639
import com.semmle.js.ast.regexp.ZeroWidthPositiveLookbehind;
3740
import java.util.ArrayList;
3841
import java.util.Arrays;
42+
import java.util.Collections;
3943
import java.util.List;
4044

4145
/** A parser for ECMAScript 2018 regular expressions, extended with the ECMAScript 2024 `v`-flag character class syntax. */
@@ -67,6 +71,8 @@ public List<Error> getErrors() {
6771
private List<Error> errors;
6872
private List<BackReference> backrefs;
6973
private int maxbackref;
74+
private boolean vFlagEnabled = false;
75+
private boolean uFlagEnabled = false;
7076

7177
/** Parse the given string as a regular expression. */
7278
public Result parse(String src) {
@@ -82,6 +88,12 @@ public Result parse(String src) {
8288
return new Result(root, errors);
8389
}
8490

91+
public Result parse(String src, String flags) {
92+
vFlagEnabled = flags != null && flags.contains("v");
93+
uFlagEnabled = flags != null && flags.contains("u");
94+
return parse(src);
95+
}
96+
8597
private static String fromCodePoint(int codepoint) {
8698
if (Character.isValidCodePoint(codepoint)) return new String(Character.toChars(codepoint));
8799
// replacement character
@@ -277,6 +289,43 @@ private RegExpTerm parseTerm() {
277289
return this.finishTerm(this.parseQuantifierOpt(loc, this.parseAtom()));
278290
}
279291

292+
private RegExpTerm parseDisjunctionInsideQuotedString() {
293+
SourceLocation loc = new SourceLocation(pos());
294+
List<RegExpTerm> disjuncts = new ArrayList<>();
295+
disjuncts.add(this.parseAlternativeInsideQuotedString());
296+
while (this.match("|")) {
297+
disjuncts.add(this.parseAlternativeInsideQuotedString());
298+
}
299+
if (disjuncts.size() == 1) return disjuncts.get(0);
300+
return this.finishTerm(new Disjunction(loc, disjuncts));
301+
}
302+
303+
private RegExpTerm parseAlternativeInsideQuotedString() {
304+
SourceLocation loc = new SourceLocation(pos());
305+
int startPos = this.pos;
306+
boolean escaped = false;
307+
while (true) {
308+
// If we're at the end of the string, something went wrong.
309+
if (this.atEOS()) {
310+
this.error(Error.UNEXPECTED_EOS);
311+
break;
312+
}
313+
// We can end parsing if we're not escaped and we see a `|` which would mean Alternation
314+
// or `}` which would mean the end of the Quoted String.
315+
if(!escaped && this.lookahead(null, "|", "}")){
316+
break;
317+
}
318+
char c = this.nextChar();
319+
// Track whether the character is an escape character.
320+
escaped = !escaped && (c == '\\');
321+
}
322+
String literal = src.substring(startPos, pos);
323+
loc.setEnd(pos());
324+
loc.setSource(literal);
325+
326+
return new Constant(loc, literal);
327+
}
328+
280329
private RegExpTerm parseQuantifierOpt(SourceLocation loc, RegExpTerm atom) {
281330
if (this.match("*")) return this.finishTerm(new Star(loc, atom, !this.match("?")));
282331
if (this.match("+")) return this.finishTerm(new Plus(loc, atom, !this.match("?")));
@@ -421,7 +470,13 @@ private RegExpTerm parseAtomEscape(SourceLocation loc, boolean inCharClass) {
421470
return this.finishTerm(new NamedBackReference(loc, name, "\\k<" + name + ">"));
422471
}
423472

424-
if (this.match("p{", "P{")) {
473+
if (vFlagEnabled && this.match("q{")) {
474+
RegExpTerm term = parseDisjunctionInsideQuotedString();
475+
this.expectRBrace();
476+
return this.finishTerm(new CharacterClassQuotedString(loc, term));
477+
}
478+
479+
if ((vFlagEnabled || uFlagEnabled) && this.match("p{", "P{")) {
425480
String name = this.readIdentifier();
426481
if (this.match("=")) {
427482
value = this.readIdentifier();
@@ -493,6 +548,7 @@ private RegExpTerm parseAtomEscape(SourceLocation loc, boolean inCharClass) {
493548
}
494549

495550
private RegExpTerm parseCharacterClass() {
551+
if (vFlagEnabled) return parseNestedCharacterClass();
496552
SourceLocation loc = new SourceLocation(pos());
497553
List<RegExpTerm> elements = new ArrayList<>();
498554

@@ -508,6 +564,43 @@ private RegExpTerm parseCharacterClass() {
508564
return this.finishTerm(new CharacterClass(loc, elements, inverted));
509565
}
510566

567+
private enum CharacterClassType {
568+
STANDARD,
569+
INTERSECTION,
570+
SUBTRACTION
571+
}
572+
573+
// ECMA 2024 `v` flag allows nested character classes.
574+
private RegExpTerm parseNestedCharacterClass() {
575+
SourceLocation loc = new SourceLocation(pos());
576+
List<RegExpTerm> elements = new ArrayList<>();
577+
CharacterClassType classType = CharacterClassType.STANDARD;
578+
579+
this.match("[");
580+
boolean inverted = this.match("^");
581+
while (!this.match("]")) {
582+
if (this.atEOS()) {
583+
this.error(Error.EXPECTED_RBRACKET);
584+
break;
585+
}
586+
if (lookahead("[")) elements.add(parseNestedCharacterClass());
587+
else if (this.match("&&")) classType = CharacterClassType.INTERSECTION;
588+
else if (this.match("--")) classType = CharacterClassType.SUBTRACTION;
589+
else elements.add(this.parseCharacterClassElement());
590+
}
591+
592+
// Create appropriate RegExpTerm based on the detected class type
593+
switch (classType) {
594+
case INTERSECTION:
595+
return this.finishTerm(new CharacterClass(loc, Collections.singletonList(new CharacterClassIntersection(loc, elements)), inverted));
596+
case SUBTRACTION:
597+
return this.finishTerm(new CharacterClass(loc, Collections.singletonList(new CharacterClassSubtraction(loc, elements)), inverted));
598+
case STANDARD:
599+
default:
600+
return this.finishTerm(new CharacterClass(loc, elements, inverted));
601+
}
602+
}
603+
511604
private static final List<String> escapeClasses = Arrays.asList("d", "D", "s", "S", "w", "W");
512605

513606
private RegExpTerm parseCharacterClassElement() {
@@ -519,7 +612,7 @@ private RegExpTerm parseCharacterClassElement() {
519612
return atom;
520613
}
521614
}
522-
if (!this.lookahead("-]") && this.match("-") && !(atom instanceof CharacterClassEscape))
615+
if (!this.lookahead("-]") && !this.lookahead("--") && this.match("-") && !(atom instanceof CharacterClassEscape))
523616
return this.finishTerm(new CharacterClassRange(loc, atom, this.parseCharacterClassAtom()));
524617
return atom;
525618
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
/^p(ost)?[ |\.]*o(ffice)?[ |\.]*(box)?[ 0-9]*[^[a-z ]]*/g;
2+
/([ ]*[a-z0-9&#*=?@\\><:,()$[\]_.{}!+%^-]+)+X/;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
/[[abc]&&[bcd]]/v; // Valid use of intersection operator, matches b or c
2+
/abc&&bcd/v; //Valid regex, but no intersection operation: Matches the literal string "abc&&bcd"
3+
/[abc]&&[bcd]/v; // Valid regex, but incorrect intersection operation:
4+
// - Matches a single character from [abc]
5+
// - Then the literal "&&"
6+
// - Then a single character from [bcd]
7+
/[[abc]&&[bcd]&&[c]]/v; // Valid use of intersection operator, matches c
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
/[[]]/v; //Previously not allowed to nest character classes now completely valid with v flag.
2+
/[[a]]/v;
3+
/[ [] [ [] [] ] ]/v;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
/[\q{abc}]/v;
2+
/[\q{abc|cbd|dcb}]/v;
3+
/[\q{\}}]/v;
4+
/[\q{\{}]/v;
5+
/[\q{cc|\}a|cc}]/v;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
/[\p{Script_Extensions=Greek}--\p{Letter}]/v;
2+
/[[abc]--[cbd]]/v;
3+
/[[abc]--[cbd]--[bde]]/v;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
const regex = /\b(?:https?:\/\/|mailto:|www\.)(?:[\S--[\p{P}<>]]|\/|[\S--[\[\]]]+[\S--[\p{P}<>]])+|\b[\S--[@\p{Ps}\p{Pe}<>]]+@([\S--[\p{P}<>]]+(?:\.[\S--[\p{P}<>]]+)+)/gmv;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
/[\p{Script_Extensions=Greek}\p{RGI_Emoji}]/v;
2+
/[[abc][cbd]]/v;
3+
/[\p{Emoji}\q{a&}byz]/v;
4+
/[\q{\\\}a&}byz]/v;
5+
/[\q{\\}]/v;
6+
/[\q{abc|cbd|\}}]/v;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{
2+
"experimental": true
3+
}

0 commit comments

Comments
 (0)