Skip to content

Commit 6fec3ef

Browse files
feat: implemented non-capturing groups
1 parent 8a039f4 commit 6fec3ef

File tree

10 files changed

+97
-40
lines changed

10 files changed

+97
-40
lines changed

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ Based on the classification within the [MDN cheatsheet](https://developer.mozilla
7777
- [x] (x) capturing group
7878
- [ ] \n back reference
7979
- [ ] (?<Name>x) named capturing group
80-
- [ ] (?:x) Non-capturing group
80+
- [x] (?:x) Non-capturing group
8181

8282
**Quantifiers**
8383

assembly/__spec_tests__/generated.spec.ts

+43-10
Original file line numberDiff line numberDiff line change
@@ -1076,7 +1076,22 @@ it("line: 207 - matches ^(a(b(c)))(d(e(f)))(h(i(j)))(k(l(m)))$ against 'abcdefhi
10761076
expect(match.matches[11]).toBe("abcdefhijklm".substring(10, 12));
10771077
expect(match.matches[12]).toBe("abcdefhijklm".substring(11, 12));
10781078
});
1079-
xit("line: 208 - non capturing groups not supported", () => {});
1079+
it("line: 208 - matches ^(?:a(b(c)))(?:d(e(f)))(?:h(i(j)))(?:k(l(m)))$ against 'abcdefhijklm'", () => {
1080+
const match = exec(
1081+
"^(?:a(b(c)))(?:d(e(f)))(?:h(i(j)))(?:k(l(m)))$",
1082+
"abcdefhijklm",
1083+
"ms"
1084+
);
1085+
expect(match.matches[0]).toBe("abcdefhijklm".substring(0, 12));
1086+
expect(match.matches[1]).toBe("abcdefhijklm".substring(1, 3));
1087+
expect(match.matches[2]).toBe("abcdefhijklm".substring(2, 3));
1088+
expect(match.matches[3]).toBe("abcdefhijklm".substring(4, 6));
1089+
expect(match.matches[4]).toBe("abcdefhijklm".substring(5, 6));
1090+
expect(match.matches[5]).toBe("abcdefhijklm".substring(7, 9));
1091+
expect(match.matches[6]).toBe("abcdefhijklm".substring(8, 9));
1092+
expect(match.matches[7]).toBe("abcdefhijklm".substring(10, 12));
1093+
expect(match.matches[8]).toBe("abcdefhijklm".substring(11, 12));
1094+
});
10801095
xit("line: 209 - back references are not supported", () => {});
10811096
it("line: 210 - matches ^[.^$|()*+?{,}]+ against '.^$(*+)|{?,?}'", () => {
10821097
const match = exec("^[.^$|()*+?{,}]+", ".^$(*+)|{?,?}", "ms");
@@ -1305,10 +1320,10 @@ it("line: 266 - matches ^12.34 against '12\r34'", () => {
13051320
});
13061321
xit("line: 267 - lookaheads not supported", () => {});
13071322
xit("line: 268 - lookaheads not supported", () => {});
1308-
xit("line: 269 - non capturing groups not supported", () => {});
1309-
xit("line: 270 - non capturing groups not supported", () => {});
1310-
xit("line: 271 - non capturing groups not supported", () => {});
1311-
xit("line: 272 - non capturing groups not supported", () => {});
1323+
xit("line: 269 - lookaheads not supported", () => {});
1324+
xit("line: 270 - lookaheads not supported", () => {});
1325+
xit("line: 271 - lookaheads not supported", () => {});
1326+
xit("line: 272 - lookaheads not supported", () => {});
13121327
xit("line: 273 - lookaheads not supported", () => {});
13131328
xit("line: 274 - lookaheads not supported", () => {});
13141329
xit("line: 281 - test regex contains syntax not supported in JS", () => {});
@@ -1564,8 +1579,14 @@ it("line: 1162 - matches \\Aabc\\Z against 'qqq\nabc\nzzz'", () => {
15641579
});
15651580
xit("line: 1163 - JS does not support the A Z syntax for start and end of string", () => {});
15661581
xit("line: 1164 - JS does not support the A Z syntax for start and end of string", () => {});
1567-
xit("line: 1165 - non capturing groups not supported", () => {});
1568-
xit("line: 1166 - non capturing groups not supported", () => {});
1582+
it("line: 1165 - matches (?:b)|(?::+) against 'b::c'", () => {
1583+
const match = exec("(?:b)|(?::+)", "b::c", "ms");
1584+
expect(match.matches[0]).toBe("b::c".substring(0, 1));
1585+
});
1586+
it("line: 1166 - matches (?:b)|(?::+) against 'c::b'", () => {
1587+
const match = exec("(?:b)|(?::+)", "c::b", "ms");
1588+
expect(match.matches[0]).toBe("c::b".substring(1, 3));
1589+
});
15691590
it("line: 1167 - matches [-az]+ against 'az-'", () => {
15701591
const match = exec("[-az]+", "az-", "ms");
15711592
expect(match.matches[0]).toBe("az-".substring(0, 3));
@@ -1954,9 +1975,21 @@ it("line: 1311 - matches \\d\\d\\/\\d\\d\\/\\d\\d\\d\\d against '01/01/2000'", (
19541975
const match = exec("\\d\\d\\/\\d\\d\\/\\d\\d\\d\\d", "01/01/2000", "ms");
19551976
expect(match.matches[0]).toBe("01/01/2000".substring(0, 10));
19561977
});
1957-
xit("line: 1312 - non capturing groups not supported", () => {});
1958-
xit("line: 1313 - non capturing groups not supported", () => {});
1959-
xit("line: 1314 - non capturing groups not supported", () => {});
1978+
it("line: 1312 - matches word (?:[a-zA-Z0-9]+ ){0,10}otherword against 'word cat dog elephant mussel cow horse canary baboon snake shark otherword'", () => {
1979+
const match = exec(
1980+
"word (?:[a-zA-Z0-9]+ ){0,10}otherword",
1981+
"word cat dog elephant mussel cow horse canary baboon snake shark otherword",
1982+
"ms"
1983+
);
1984+
expect(match.matches[0]).toBe(
1985+
"word cat dog elephant mussel cow horse canary baboon snake shark otherword".substring(
1986+
0,
1987+
74
1988+
)
1989+
);
1990+
});
1991+
xit("line: 1313 - peformance issue", () => {});
1992+
xit("line: 1314 - peformance issue", () => {});
19601993
it("line: 1315 - matches ^(a){0,0} against 'bcd'", () => {
19611994
const match = exec("^(a){0,0}", "bcd", "ms");
19621995
expect(match.matches[0]).toBe("bcd".substring(0, 0));

assembly/__tests__/capture-group.spec.ts

+6
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,9 @@ it("range repitition capture groups should return the last match", () => {
4545
expect(match.matches[0]).toBe("ac");
4646
expect(match.matches[1]).toBe("c");
4747
});
48+
49+
it("non-capturing groups should not capture", () => {
50+
const match = exec("(?:foo)bar(baz)", "foobarbaz");
51+
expect(match.matches[0]).toBe("foobarbaz");
52+
expect(match.matches[1]).toBe("baz");
53+
});

assembly/char.ts

+1
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ export const enum Char {
1616
Dot = 0x2e, // "."
1717
Zero = 0x30,
1818
Nine = 0x39,
19+
Colon = 0x3a,
1920
Question = 0x3f, // "?"
2021
A = 0x41,
2122
D = 0x44,

assembly/nfa/nfa.ts

+14-8
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ export class GroupStartMarkerState extends State {
4242
// captures from the path through the NFA that reaches the end are flagged
4343
flagged: bool = false;
4444

45-
constructor(next: State, public groupId: i32) {
45+
constructor(next: State, public capturing: bool, public groupId: i32) {
4646
super();
4747
this.transitions.push(next);
4848
}
@@ -60,10 +60,12 @@ export class GroupEndMarkerState extends State {
6060
}
6161

6262
matches(input: string, position: u32): MatchResult {
63-
this.startMarker.capture = input.substring(
64-
this.startMarker.location,
65-
position
66-
);
63+
if (this.startMarker.capturing) {
64+
this.startMarker.capture = input.substring(
65+
this.startMarker.location,
66+
position
67+
);
68+
}
6769
return MatchResult.Ignore;
6870
}
6971
}
@@ -164,10 +166,10 @@ function oneOrMore(nfa: Automata, greedy: bool): Automata {
164166
return new Automata(start, end);
165167
}
166168

167-
function group(nfa: Automata, id: i32): Automata {
169+
function group(nfa: Automata, capturing: bool, id: i32): Automata {
168170
// groups are implemented by wrapping the automata with
169171
// a pair of markers that record matches
170-
const startMarker = new GroupStartMarkerState(nfa.start, id);
172+
const startMarker = new GroupStartMarkerState(nfa.start, capturing, id);
171173
const end = new State();
172174
const endMarker = new GroupEndMarkerState(end, startMarker);
173175
nfa.end.transitions.push(endMarker);
@@ -238,7 +240,11 @@ class AutomataFactor {
238240
);
239241
case NodeType.Group: {
240242
const node = expression as GroupNode;
241-
return group(this.automataForNode(node.expression), node.id);
243+
return group(
244+
this.automataForNode(node.expression),
245+
node.capturing,
246+
node.id
247+
);
242248
}
243249
case NodeType.Assertion:
244250
return Automata.fromEpsilon();

assembly/parser/node.ts

+6-2
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,11 @@ export class AlternationNode extends Node {
209209
let _id = 0;
210210

211211
export class GroupNode extends Node {
212-
constructor(public expression: Node, public id: i32 = -1) {
212+
constructor(
213+
public expression: Node,
214+
public capturing: bool,
215+
public id: i32 = -1
216+
) {
213217
super(NodeType.Group);
214218
if (id == -1) {
215219
this.id = _id++;
@@ -221,7 +225,7 @@ export class GroupNode extends Node {
221225
}
222226

223227
clone(): Node {
224-
return new GroupNode(this.expression.clone(), this.id);
228+
return new GroupNode(this.expression.clone(), this.capturing, this.id);
225229
}
226230

227231
replace(node: Node, replacement: Node): void {

assembly/parser/parser.ts

+14-1
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,18 @@ export class Parser {
204204
return true;
205205
}
206206

207+
private isCapturing(): bool {
208+
if (
209+
this.iterator.current == Char.Question &&
210+
this.iterator.lookahead(1) == Char.Colon
211+
) {
212+
this.eatToken(Char.Question);
213+
this.eatToken(Char.Colon);
214+
return false;
215+
}
216+
return true;
217+
}
218+
207219
// parses a sequence of chars
208220
private parseSequence(): Node {
209221
let nodes = new Array<Node>();
@@ -218,7 +230,8 @@ export class Parser {
218230
// @ts-ignore
219231
} else if (token == Char.LeftParenthesis) {
220232
this.eatToken(Char.LeftParenthesis);
221-
nodes.push(new GroupNode(this.parseSequence()));
233+
const capturing = this.isCapturing();
234+
nodes.push(new GroupNode(this.parseSequence(), capturing));
222235
this.eatToken(Char.RightParenthesis);
223236
// @ts-ignore
224237
} else if (token == Char.LeftCurlyBrace) {

assembly/regexp.ts

+7-4
Original file line numberDiff line numberDiff line change
@@ -89,9 +89,9 @@ export class Flags {
8989

9090
// capture groups are implemented as GroupStart / GroupEnd states that record (capture)
9191
// the value of the current state of the string being matched.
92-
// Repeated capture groups, via rage repetitions (e.g. {2,3}) share the same 'id'. The
92+
// Repeated capture groups, via range repetitions (e.g. {2,3}) share the same 'id'. The
9393
// returned regex should only return the value of the final repetition.
94-
function filterCaptures(groupMarkers: GroupStartMarkerState[]): string[] {
94+
function lastCapturesForGroup(groupMarkers: GroupStartMarkerState[]): string[] {
9595
if (!groupMarkers.length) {
9696
return [];
9797
}
@@ -139,7 +139,10 @@ export class RegExp {
139139
gm = new Array<GroupStartMarkerState>();
140140
nfaWalker(this.nfa.start, (state) => {
141141
if (state instanceof GroupStartMarkerState) {
142-
gm.push(state as GroupStartMarkerState);
142+
const startMarker = state as GroupStartMarkerState;
143+
if (startMarker.capturing) {
144+
gm.push(state as GroupStartMarkerState);
145+
}
143146
}
144147
});
145148
this.groupMarkers = gm;
@@ -181,7 +184,7 @@ export class RegExp {
181184
});
182185

183186
const match = new Match(
184-
[matchStr!].concat(filterCaptures(groupMarkers)),
187+
[matchStr!].concat(lastCapturesForGroup(groupMarkers)),
185188
matchIndex,
186189
str
187190
);

spec/test-generator.js

+1-5
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ const knownIssues = {
2222
...range(141, 143),
2323
1288,
2424
],
25+
"peformance issue": [1313, 1314],
2526

2627
/* -------- issues with the tests ------------ */
2728
"test appears to be incorrect?": [203, 204],
@@ -108,11 +109,6 @@ lines.forEach((line, index) => {
108109
return;
109110
}
110111

111-
if (["(?:"].some((f) => regex.includes(f))) {
112-
testCase += `xit("line: ${index} - non capturing groups not supported", () => {});`;
113-
return;
114-
}
115-
116112
if (["(?!", "(?="].some((f) => regex.includes(f))) {
117113
testCase += `xit("line: ${index} - lookaheads not supported", () => {});`;
118114
return;

ts/index.ts

+4-9
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,8 @@ globalAny.log = console.log;
55

66
import { RegExp } from "../assembly/regexp";
77

8-
const regexObj = new RegExp("abc$", "m");
9-
let match = regexObj.exec("abc\n");
8+
const regexObj = new RegExp("word (?:[a-zA-Z0-9]+ ){0,300}otherword", "");
9+
let match = regexObj.exec(
10+
"word cat dog elephant mussel cow horse canary baboon snake shark the quick brown fox and the lazy dog and several other words getting close to thirty by now I hope"
11+
);
1012
console.log(JSON.stringify(match, null, 2));
11-
// match = regexObj.exec("f1\nbar\nbaz\nf2");
12-
// console.log(JSON.stringify(match, null, 2));
13-
14-
// const regex = new RegExp("^f\\d{1}$", "gm");
15-
16-
// let match = regex.exec("f1\nbar\nbaz\nf2");
17-
// expect(match!.matches[0]).toBe("f1");

0 commit comments

Comments
 (0)