Skip to content

Commit 84027b8

Browse files
fix: capture groups with quantifiers are not repeated
fixes: 31
1 parent bead49e commit 84027b8

File tree

8 files changed

+170
-46
lines changed

8 files changed

+170
-46
lines changed

assembly/__spec_tests__/generated.spec.ts

+110-19
Original file line numberDiff line numberDiff line change
@@ -373,17 +373,41 @@ it("line: 51 - matches ^(b+?|a){1,2}?c against 'bc'", () => {
373373
expect(match.matches[0]).toBe("bc".substring(0, 2));
374374
expect(match.matches[1]).toBe("bc".substring(0, 1));
375375
});
376-
xit("line: 52 - issues with repeated capture groups", () => {});
377-
xit("line: 53 - issues with repeated capture groups", () => {});
378-
xit("line: 54 - issues with repeated capture groups", () => {});
379-
xit("line: 55 - issues with repeated capture groups", () => {});
376+
it("line: 52 - matches ^(b+?|a){1,2}?c against 'bbc'", () => {
377+
const match = exec("^(b+?|a){1,2}?c", "bbc", "s");
378+
expect(match.matches[0]).toBe("bbc".substring(0, 3));
379+
expect(match.matches[1]).toBe("bbc".substring(1, 2));
380+
});
381+
it("line: 53 - matches ^(b+?|a){1,2}?c against 'bbbc'", () => {
382+
const match = exec("^(b+?|a){1,2}?c", "bbbc", "s");
383+
expect(match.matches[0]).toBe("bbbc".substring(0, 4));
384+
expect(match.matches[1]).toBe("bbbc".substring(1, 3));
385+
});
386+
it("line: 54 - matches ^(b+?|a){1,2}?c against 'bac'", () => {
387+
const match = exec("^(b+?|a){1,2}?c", "bac", "s");
388+
expect(match.matches[0]).toBe("bac".substring(0, 3));
389+
expect(match.matches[1]).toBe("bac".substring(1, 2));
390+
});
391+
it("line: 55 - matches ^(b+?|a){1,2}?c against 'bbac'", () => {
392+
const match = exec("^(b+?|a){1,2}?c", "bbac", "s");
393+
expect(match.matches[0]).toBe("bbac".substring(0, 4));
394+
expect(match.matches[1]).toBe("bbac".substring(2, 3));
395+
});
380396
it("line: 56 - matches ^(b+?|a){1,2}?c against 'aac'", () => {
381397
const match = exec("^(b+?|a){1,2}?c", "aac", "s");
382398
expect(match.matches[0]).toBe("aac".substring(0, 3));
383399
expect(match.matches[1]).toBe("aac".substring(1, 2));
384400
});
385-
xit("line: 57 - issues with repeated capture groups", () => {});
386-
xit("line: 58 - issues with repeated capture groups", () => {});
401+
it("line: 57 - matches ^(b+?|a){1,2}?c against 'abbbbbbbbbbbc'", () => {
402+
const match = exec("^(b+?|a){1,2}?c", "abbbbbbbbbbbc", "s");
403+
expect(match.matches[0]).toBe("abbbbbbbbbbbc".substring(0, 13));
404+
expect(match.matches[1]).toBe("abbbbbbbbbbbc".substring(1, 12));
405+
});
406+
it("line: 58 - matches ^(b+?|a){1,2}?c against 'bbbbbbbbbbbac'", () => {
407+
const match = exec("^(b+?|a){1,2}?c", "bbbbbbbbbbbac", "s");
408+
expect(match.matches[0]).toBe("bbbbbbbbbbbac".substring(0, 13));
409+
expect(match.matches[1]).toBe("bbbbbbbbbbbac".substring(11, 12));
410+
});
387411
it("line: 59 - matches ^(b+?|a){1,2}?c against 'aaac'", () => {
388412
expectNotMatch("^(b+?|a){1,2}?c", ["aaac"]);
389413
});
@@ -400,12 +424,36 @@ it("line: 62 - matches ^(b+|a){1,2}c against 'bbc'", () => {
400424
expect(match.matches[0]).toBe("bbc".substring(0, 3));
401425
expect(match.matches[1]).toBe("bbc".substring(0, 2));
402426
});
403-
xit("line: 63 - issues with repeated capture groups", () => {});
404-
xit("line: 64 - issues with repeated capture groups", () => {});
405-
xit("line: 65 - issues with repeated capture groups", () => {});
406-
xit("line: 66 - issues with repeated capture groups", () => {});
407-
xit("line: 67 - issues with repeated capture groups", () => {});
408-
xit("line: 68 - issues with repeated capture groups", () => {});
427+
it("line: 63 - matches ^(b+|a){1,2}c against 'bbbc'", () => {
428+
const match = exec("^(b+|a){1,2}c", "bbbc", "s");
429+
expect(match.matches[0]).toBe("bbbc".substring(0, 4));
430+
expect(match.matches[1]).toBe("bbbc".substring(0, 3));
431+
});
432+
it("line: 64 - matches ^(b+|a){1,2}c against 'bac'", () => {
433+
const match = exec("^(b+|a){1,2}c", "bac", "s");
434+
expect(match.matches[0]).toBe("bac".substring(0, 3));
435+
expect(match.matches[1]).toBe("bac".substring(1, 2));
436+
});
437+
it("line: 65 - matches ^(b+|a){1,2}c against 'bbac'", () => {
438+
const match = exec("^(b+|a){1,2}c", "bbac", "s");
439+
expect(match.matches[0]).toBe("bbac".substring(0, 4));
440+
expect(match.matches[1]).toBe("bbac".substring(2, 3));
441+
});
442+
it("line: 66 - matches ^(b+|a){1,2}c against 'aac'", () => {
443+
const match = exec("^(b+|a){1,2}c", "aac", "s");
444+
expect(match.matches[0]).toBe("aac".substring(0, 3));
445+
expect(match.matches[1]).toBe("aac".substring(1, 2));
446+
});
447+
it("line: 67 - matches ^(b+|a){1,2}c against 'abbbbbbbbbbbc'", () => {
448+
const match = exec("^(b+|a){1,2}c", "abbbbbbbbbbbc", "s");
449+
expect(match.matches[0]).toBe("abbbbbbbbbbbc".substring(0, 13));
450+
expect(match.matches[1]).toBe("abbbbbbbbbbbc".substring(1, 12));
451+
});
452+
it("line: 68 - matches ^(b+|a){1,2}c against 'bbbbbbbbbbbac'", () => {
453+
const match = exec("^(b+|a){1,2}c", "bbbbbbbbbbbac", "s");
454+
expect(match.matches[0]).toBe("bbbbbbbbbbbac".substring(0, 13));
455+
expect(match.matches[1]).toBe("bbbbbbbbbbbac".substring(11, 12));
456+
});
409457
it("line: 69 - matches ^(b+|a){1,2}c against 'aaac'", () => {
410458
expectNotMatch("^(b+|a){1,2}c", ["aaac"]);
411459
});
@@ -417,8 +465,16 @@ it("line: 71 - matches ^(b+|a){1,2}?bc against 'bbc'", () => {
417465
expect(match.matches[0]).toBe("bbc".substring(0, 3));
418466
expect(match.matches[1]).toBe("bbc".substring(0, 1));
419467
});
420-
xit("line: 72 - issues with repeated capture groups", () => {});
421-
xit("line: 73 - issues with repeated capture groups", () => {});
468+
it("line: 72 - matches ^(b*|ba){1,2}?bc against 'babc'", () => {
469+
const match = exec("^(b*|ba){1,2}?bc", "babc", "s");
470+
expect(match.matches[0]).toBe("babc".substring(0, 4));
471+
expect(match.matches[1]).toBe("babc".substring(0, 2));
472+
});
473+
it("line: 73 - matches ^(b*|ba){1,2}?bc against 'bbabc'", () => {
474+
const match = exec("^(b*|ba){1,2}?bc", "bbabc", "s");
475+
expect(match.matches[0]).toBe("bbabc".substring(0, 5));
476+
expect(match.matches[1]).toBe("bbabc".substring(1, 3));
477+
});
422478
it("line: 74 - matches ^(b*|ba){1,2}?bc against 'bababc'", () => {
423479
const match = exec("^(b*|ba){1,2}?bc", "bababc", "s");
424480
expect(match.matches[0]).toBe("bababc".substring(0, 6));
@@ -435,7 +491,11 @@ it("line: 77 - matches ^(ba|b*){1,2}?bc against 'babc'", () => {
435491
expect(match.matches[0]).toBe("babc".substring(0, 4));
436492
expect(match.matches[1]).toBe("babc".substring(0, 2));
437493
});
438-
xit("line: 78 - issues with repeated capture groups", () => {});
494+
it("line: 78 - matches ^(ba|b*){1,2}?bc against 'bbabc'", () => {
495+
const match = exec("^(ba|b*){1,2}?bc", "bbabc", "s");
496+
expect(match.matches[0]).toBe("bbabc".substring(0, 5));
497+
expect(match.matches[1]).toBe("bbabc".substring(1, 3));
498+
});
439499
it("line: 79 - matches ^(ba|b*){1,2}?bc against 'bababc'", () => {
440500
const match = exec("^(ba|b*){1,2}?bc", "bababc", "s");
441501
expect(match.matches[0]).toBe("bababc".substring(0, 6));
@@ -1199,8 +1259,32 @@ it("line: 261 - matches ^From +([^ ]+) +[a-zA-Z][a-zA-Z][a-zA-Z] +[a-zA-Z][a-zA-
11991259
"From abcd Mon Sep 01 12:33:02 1997".substring(5, 9)
12001260
);
12011261
});
1202-
xit("line: 262 - issues with repeated capture groups", () => {});
1203-
xit("line: 263 - issues with repeated capture groups", () => {});
1262+
it("line: 262 - matches ^From\\s+\\S+\\s+([a-zA-Z]{3}\\s+){2}\\d{1,2}\\s+\\d\\d:\\d\\d against 'From abcd Mon Sep 01 12:33:02 1997'", () => {
1263+
const match = exec(
1264+
"^From\\s+\\S+\\s+([a-zA-Z]{3}\\s+){2}\\d{1,2}\\s+\\d\\d:\\d\\d",
1265+
"From abcd Mon Sep 01 12:33:02 1997",
1266+
"s"
1267+
);
1268+
expect(match.matches[0]).toBe(
1269+
"From abcd Mon Sep 01 12:33:02 1997".substring(0, 27)
1270+
);
1271+
expect(match.matches[1]).toBe(
1272+
"From abcd Mon Sep 01 12:33:02 1997".substring(15, 19)
1273+
);
1274+
});
1275+
it("line: 263 - matches ^From\\s+\\S+\\s+([a-zA-Z]{3}\\s+){2}\\d{1,2}\\s+\\d\\d:\\d\\d against 'From abcd Mon Sep 1 12:33:02 1997'", () => {
1276+
const match = exec(
1277+
"^From\\s+\\S+\\s+([a-zA-Z]{3}\\s+){2}\\d{1,2}\\s+\\d\\d:\\d\\d",
1278+
"From abcd Mon Sep 1 12:33:02 1997",
1279+
"s"
1280+
);
1281+
expect(match.matches[0]).toBe(
1282+
"From abcd Mon Sep 1 12:33:02 1997".substring(0, 27)
1283+
);
1284+
expect(match.matches[1]).toBe(
1285+
"From abcd Mon Sep 1 12:33:02 1997".substring(15, 20)
1286+
);
1287+
});
12041288
it("line: 264 - matches ^From\\s+\\S+\\s+([a-zA-Z]{3}\\s+){2}\\d{1,2}\\s+\\d\\d:\\d\\d against 'From abcd Sep 01 12:33:02 1997'", () => {
12051289
expectNotMatch(
12061290
"^From\\s+\\S+\\s+([a-zA-Z]{3}\\s+){2}\\d{1,2}\\s+\\d\\d:\\d\\d",
@@ -2089,8 +2173,15 @@ it("line: 1390 - matches ^[abc]{12} against 'abcabcabcabc'", () => {
20892173
const match = exec("^[abc]{12}", "abcabcabcabc", "s");
20902174
expect(match.matches[0]).toBe("abcabcabcabc".substring(0, 12));
20912175
});
2092-
xit("line: 1391 - issues with repeated capture groups", () => {});
2093-
xit("line: 1392 - issues with repeated capture groups", () => {});
2176+
it("line: 1391 - matches ^[a-c]{12} against 'abcabcabcabc'", () => {
2177+
const match = exec("^[a-c]{12}", "abcabcabcabc", "s");
2178+
expect(match.matches[0]).toBe("abcabcabcabc".substring(0, 12));
2179+
});
2180+
it("line: 1392 - matches ^(a|b|c){12} against 'abcabcabcabc '", () => {
2181+
const match = exec("^(a|b|c){12}", "abcabcabcabc ", "s");
2182+
expect(match.matches[0]).toBe("abcabcabcabc ".substring(0, 12));
2183+
expect(match.matches[1]).toBe("abcabcabcabc ".substring(11, 12));
2184+
});
20942185
it("line: 1393 - matches ^[abcdefghijklmnopqrstuvwxy0123456789] against 'n'", () => {
20952186
const match = exec("^[abcdefghijklmnopqrstuvwxy0123456789]", "n", "s");
20962187
expect(match.matches[0]).toBe("n".substring(0, 1));

assembly/__tests__/capture-group.spec.ts

+6
Original file line numberDiff line numberDiff line change
@@ -39,3 +39,9 @@ it("repeated capture groups should return the last match", () => {
3939
expect(match.matches[0]).toBe("ac");
4040
expect(match.matches[1]).toBe("c");
4141
});
42+
43+
it("range repitition capture groups should return the last match", () => {
44+
const match = exec("([a-c]){2}", "ac");
45+
expect(match.matches[0]).toBe("ac");
46+
expect(match.matches[1]).toBe("c");
47+
});

assembly/nfa/nfa.ts

+4-4
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ export class GroupStartMarkerState extends State {
4040
// captures from the path through the NFA that reaches the end are flagged
4141
flagged: bool = false;
4242

43-
constructor(next: State) {
43+
constructor(next: State, public id: i32) {
4444
super();
4545
this.transitions.push(next);
4646
}
@@ -162,10 +162,10 @@ function oneOrMore(nfa: Automata, greedy: bool): Automata {
162162
return new Automata(start, end);
163163
}
164164

165-
function group(nfa: Automata): Automata {
165+
function group(nfa: Automata, id: i32): Automata {
166166
// groups are implemented by wrapping the automata with
167167
// a pair of markers that record matches
168-
const startMarker = new GroupStartMarkerState(nfa.start);
168+
const startMarker = new GroupStartMarkerState(nfa.start, id);
169169
const end = new State();
170170
const endMarker = new GroupEndMarkerState(end, startMarker);
171171
nfa.end.transitions.push(endMarker);
@@ -236,7 +236,7 @@ class AutomataFactor {
236236
);
237237
case NodeType.Group: {
238238
const node = expression as GroupNode;
239-
return group(this.automataForNode(node.expression));
239+
return group(this.automataForNode(node.expression), node.id);
240240
}
241241
case NodeType.Assertion:
242242
return Automata.fromEpsilon();

assembly/parser/node.ts

+7-2
Original file line numberDiff line numberDiff line change
@@ -206,17 +206,22 @@ export class AlternationNode extends Node {
206206
}
207207
}
208208

209+
let _id = 0;
210+
209211
export class GroupNode extends Node {
210-
constructor(public expression: Node) {
212+
constructor(public expression: Node, public id: i32 = -1) {
211213
super(NodeType.Group);
214+
if (id == -1) {
215+
this.id = _id++;
216+
}
212217
}
213218

214219
children(): Node[] {
215220
return [this.expression];
216221
}
217222

218223
clone(): Node {
219-
return new GroupNode(this.expression.clone());
224+
return new GroupNode(this.expression.clone(), this.id);
220225
}
221226

222227
replace(node: Node, replacement: Node): void {

assembly/parser/walker.ts

+5-2
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,11 @@ export function expandRepetitions(visitor: NodeVisitor): void {
7575
// create multiple clones
7676
const clones = new Array<Node>(from);
7777
// a{4} => aaaa
78-
for (let i = 0; i < from; i++) {
79-
clones[i] = expression.clone();
78+
if (from > 0) {
79+
clones[0] = expression;
80+
for (let i = 1; i < from; i++) {
81+
clones[i] = expression.clone();
82+
}
8083
}
8184

8285
if (rangeRepNode.to == -1) {

assembly/regexp.ts

+32-3
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,30 @@ export class Flags {
8383
}
8484
}
8585

86+
// capture groups are implemented as GroupStart / GroupEnd states that record (capture)
87+
// the value of the current state of the string being matched.
88+
// Repeated capture groups, via range repetitions (e.g. {2,3}) share the same 'id'. The
89+
// returned regex should only return the value of the final repetition.
90+
function filterCaptures(groupMarkers: GroupStartMarkerState[]): string[] { // collapses runs of markers that share an id into one capture string per run
91+
if (!groupMarkers.length) {
92+
return []; // no group markers, so there are no captures to report
93+
}
94+
const values = [first(groupMarkers).capture]; // seed with the capture of first(groupMarkers) — presumably the first element; confirm against the helper
95+
let currrentId = first(groupMarkers).id;
96+
for (let i = 0; i < groupMarkers.length; i++) { // i starts at 0, so the seeding marker is visited again and lands in the else branch
97+
const gm = groupMarkers[i];
98+
if (gm.id != currrentId) { // NOTE(review): only compared with the preceding id, so markers sharing an id are assumed to be adjacent — confirm
99+
currrentId = gm.id;
100+
values.push(gm.capture); // a new id opens a new slot; pushed whether or not gm is flagged
101+
} else {
102+
if (gm.flagged) { // same id as the preceding marker: only a flagged marker may replace the stored value
103+
values[values.length - 1] = gm.capture; // overwrites the newest slot, so a later flagged marker wins over an earlier one
104+
}
105+
}
106+
}
107+
return values;
108+
}
109+
86110
export class RegExp {
87111
lastIndex: i32 = 0;
88112
private flags: Flags;
@@ -143,15 +167,20 @@ export class RegExp {
143167
this.nfa.start,
144168
str.substr(matchIndex)
145169
);
170+
146171
// we have found a match
147172
if (matchStr != null) {
173+
// remove any non-flagged captures
174+
groupMarkers.forEach((gm) => {
175+
gm.capture = gm.flagged ? gm.capture : "";
176+
});
177+
148178
const match = new Match(
149-
[matchStr!].concat(
150-
groupMarkers.map<string>((m) => (m.flagged ? m.capture : ""))
151-
),
179+
[matchStr!].concat(filterCaptures(groupMarkers)),
152180
matchIndex,
153181
str
154182
);
183+
155184
// return this match (checking end of input condition)
156185
const matchEndIndex = match.index + match.matches[0].length;
157186
if (!this.endOfInput || (this.endOfInput && matchEndIndex == len)) {

spec/test-generator.js

-13
Original file line numberDiff line numberDiff line change
@@ -16,19 +16,6 @@ const knownIssues = {
1616
...range(487, 494),
1717
...range(1077, 1082),
1818
],
19-
"issues with repeated capture groups": [
20-
262,
21-
263,
22-
...range(63, 68),
23-
1391,
24-
1392,
25-
...range(52, 55),
26-
57,
27-
58,
28-
72,
29-
73,
30-
78,
31-
],
3219
"lazy quantifiers should still yield the longest overall regex match": [
3320
...range(141, 143),
3421
1288,

ts/index.ts

+6-3
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,10 @@ globalAny.log = console.log;
55

66
import { RegExp } from "../assembly/regexp";
77

8-
const regexObj = new RegExp("ba{0}b");
9-
const match = regexObj.exec("bb");
8+
const regexObj = new RegExp("^(a){1,3}");
9+
const match = regexObj.exec("abc");
10+
console.log(JSON.stringify(match, null, 2));
1011

11-
console.log(match);
12+
const regexObj2 = new RegExp("(a|b)c|a(b|c)");
13+
const match2 = regexObj2.exec("ab");
14+
console.log(JSON.stringify(match2, null, 2));

0 commit comments

Comments
 (0)