Skip to content

Commit 2faef6d

Browse files
authored
Factor positive lookaheads better into find optimizations (#112107)
* Factor positive lookaheads better into find optimizations. A positive lookahead at the start of a pattern can be used to determine find optimizations even when the non-zero-width portions of the pattern cannot. This helps particularly in cases where the positive lookahead contains an anchor or a literal. This change also extends the existing alternation reduction optimization to factor out anchors that begin every branch of an alternation.
1 parent bcf880c commit 2faef6d

File tree

6 files changed

+235
-22
lines changed

6 files changed

+235
-22
lines changed

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs

+42-12
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
// Licensed to the .NET Foundation under one or more agreements.
22
// The .NET Foundation licenses this file to you under the MIT license.
33

4+
#if SYSTEM_TEXT_REGULAREXPRESSIONS
45
using System.Buffers;
6+
#endif
57
using System.Collections.Generic;
68
using System.Diagnostics;
79

@@ -10,28 +12,55 @@ namespace System.Text.RegularExpressions
1012
/// <summary>Contains state and provides operations related to finding the next location a match could possibly begin.</summary>
1113
internal sealed class RegexFindOptimizations
1214
{
13-
/// <summary>True if the input should be processed right-to-left rather than left-to-right.</summary>
14-
private readonly bool _rightToLeft;
1515
/// <summary>Lookup table used for optimizing ASCII when doing set queries.</summary>
1616
private readonly uint[]?[]? _asciiLookups;
1717

18-
public RegexFindOptimizations(RegexNode root, RegexOptions options)
18+
/// <summary>
/// Creates the find optimizations for the pattern rooted at <paramref name="root"/>, falling back to
/// a leading positive lookahead as the source of optimizations when analyzing the whole pattern
/// does not produce useful ones.
/// </summary>
/// <param name="root">The root of the pattern node tree.</param>
/// <param name="options">Options used when creating the regex.</param>
/// <returns>
/// The optimizations computed from the whole pattern, or, for left-to-right patterns where those are not
/// useful (per <c>IsUseful</c>) and a leading positive lookahead exists, the optimizations computed from
/// that lookahead's child with the min/max lengths fixed up from the whole-pattern analysis.
/// </returns>
public static RegexFindOptimizations Create(RegexNode root, RegexOptions options)
19+
{
20+
RegexFindOptimizations opts = new(root, options, isLeadingPartial: false);
21+
22+
// The lookahead fallback is only attempted for left-to-right patterns; the private constructor
// asserts that isLeadingPartial is never combined with RightToLeft.
if ((options & RegexOptions.RightToLeft) == 0 &&
23+
!opts.IsUseful &&
24+
RegexPrefixAnalyzer.FindLeadingPositiveLookahead(root) is RegexNode positiveLookahead)
25+
{
26+
RegexFindOptimizations positiveLookaheadOpts = new(positiveLookahead.Child(0), options, isLeadingPartial: true);
27+
28+
// Fixups to incorporate relevant information from the original optimizations.
29+
// - If the original has a larger minimum length than the lookahead, use it. Lookaheads don't currently factor into
30+
// the computation of the minimum as it complicates the logic due to them possibly overlapping with other portions.
31+
// - Use whatever max came from the original, if any. We shouldn't have computed a max for the lookahead because
32+
// it's partial.
33+
positiveLookaheadOpts.MinRequiredLength = Math.Max(opts.MinRequiredLength, positiveLookaheadOpts.MinRequiredLength);
34+
positiveLookaheadOpts.MaxPossibleLength = opts.MaxPossibleLength;
35+
36+
// From here on, the lookahead-derived optimizations stand in for the whole-pattern ones.
opts = positiveLookaheadOpts;
37+
}
38+
39+
return opts;
40+
}
41+
42+
/// <summary>Creates optimization information for searching with the pattern represented by <paramref name="root"/>.</summary>
43+
/// <param name="root">The root of the pattern node tree.</param>
44+
/// <param name="options">Options used when creating the regex.</param>
45+
/// <param name="isLeadingPartial">true if <paramref name="root"/> may not represent the whole pattern, only a leading node in it.</param>
46+
private RegexFindOptimizations(RegexNode root, RegexOptions options, bool isLeadingPartial)
1947
{
20-
_rightToLeft = (options & RegexOptions.RightToLeft) != 0;
48+
bool rightToLeft = (options & RegexOptions.RightToLeft) != 0;
49+
Debug.Assert(!isLeadingPartial || !rightToLeft, "RightToLeft unexpected when isLeadingPartial");
2150

2251
MinRequiredLength = root.ComputeMinLength();
2352

2453
// Compute any anchor starting the expression. If there is one, we won't need to search for anything,
2554
// as we can just match at that single location.
2655
LeadingAnchor = RegexPrefixAnalyzer.FindLeadingAnchor(root);
27-
if (_rightToLeft && LeadingAnchor == RegexNodeKind.Bol)
56+
if (rightToLeft && LeadingAnchor == RegexNodeKind.Bol)
2857
{
2958
// Filter out Bol for RightToLeft, as we don't currently optimize for it.
3059
LeadingAnchor = RegexNodeKind.Unknown;
3160
}
3261
if (LeadingAnchor is RegexNodeKind.Beginning or RegexNodeKind.Start or RegexNodeKind.EndZ or RegexNodeKind.End)
3362
{
34-
FindMode = (LeadingAnchor, _rightToLeft) switch
63+
FindMode = (LeadingAnchor, rightToLeft) switch
3564
{
3665
(RegexNodeKind.Beginning, false) => FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning,
3766
(RegexNodeKind.Beginning, true) => FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Beginning,
@@ -47,7 +76,8 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
4776

4877
// Compute any anchor trailing the expression. If there is one, and we can also compute a fixed length
4978
// for the whole expression, we can use that to quickly jump to the right location in the input.
50-
if (!_rightToLeft) // haven't added FindNextStartingPositionMode trailing anchor support for RTL
79+
if (!rightToLeft && // haven't added FindNextStartingPositionMode trailing anchor support for RTL
80+
!isLeadingPartial) // trailing anchors in a partial root aren't relevant
5181
{
5282
TrailingAnchor = RegexPrefixAnalyzer.FindTrailingAnchor(root);
5383
if (TrailingAnchor is RegexNodeKind.End or RegexNodeKind.EndZ &&
@@ -70,7 +100,7 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
70100
if (prefix.Length > 1)
71101
{
72102
LeadingPrefix = prefix;
73-
FindMode = _rightToLeft ?
103+
FindMode = rightToLeft ?
74104
FindNextStartingPositionMode.LeadingString_RightToLeft :
75105
FindNextStartingPositionMode.LeadingString_LeftToRight;
76106
return;
@@ -89,7 +119,7 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
89119
// more expensive; someone who wants to pay to do more work can specify Compiled. So for the interpreter
90120
// we focus only on creating a set for the first character. Same for right-to-left, which is used very
91121
// rarely and thus we don't need to invest in special-casing it.
92-
if (_rightToLeft)
122+
if (rightToLeft)
93123
{
94124
// Determine a set for anything that can possibly start the expression.
95125
if (RegexPrefixAnalyzer.FindFirstCharClass(root) is string charClass)
@@ -253,21 +283,21 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
253283
public FindNextStartingPositionMode FindMode { get; } = FindNextStartingPositionMode.NoSearch;
254284

255285
/// <summary>Gets the leading anchor (e.g. RegexNodeKind.Bol) if one exists and was computed.</summary>
256-
public RegexNodeKind LeadingAnchor { get; }
286+
public RegexNodeKind LeadingAnchor { get; private set; }
257287

258288
/// <summary>Gets the trailing anchor (e.g. RegexNodeKind.EndZ) if one exists and was computed.</summary>
259289
public RegexNodeKind TrailingAnchor { get; }
260290

261291
/// <summary>Gets the minimum required length an input need be to match the pattern.</summary>
262292
/// <remarks>0 is a valid minimum length. This value may also be the max (and hence fixed) length of the expression.</remarks>
263-
public int MinRequiredLength { get; }
293+
public int MinRequiredLength { get; private set; }
264294

265295
/// <summary>The maximum possible length an input could be to match the pattern.</summary>
266296
/// <remarks>
267297
/// This is currently only set when <see cref="TrailingAnchor"/> is found to be an end anchor.
268298
/// That can be expanded in the future as needed.
269299
/// </remarks>
270-
public int? MaxPossibleLength { get; }
300+
public int? MaxPossibleLength { get; private set; }
271301

272302
/// <summary>Gets the leading prefix. May be an empty string.</summary>
273303
public string LeadingPrefix { get; } = string.Empty;

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs

+7-3
Original file line numberDiff line numberDiff line change
@@ -940,7 +940,7 @@ private RegexNode ReduceAlternation()
940940
node = ExtractCommonPrefixText(node);
941941
if (node.Kind == RegexNodeKind.Alternate)
942942
{
943-
node = ExtractCommonPrefixOneNotoneSet(node);
943+
node = ExtractCommonPrefixNode(node);
944944
if (node.Kind == RegexNodeKind.Alternate)
945945
{
946946
node = RemoveRedundantEmptiesAndNothings(node);
@@ -1072,7 +1072,7 @@ void ReduceSingleLetterAndNestedAlternations()
10721072
// This function optimizes out prefix nodes from alternation branches that are
10731073
// the same across multiple contiguous branches.
10741074
// e.g. \w12|\d34|\d56|\w78|\w90 => \w12|\d(?:34|56)|\w(?:78|90)
1075-
static RegexNode ExtractCommonPrefixOneNotoneSet(RegexNode alternation)
1075+
static RegexNode ExtractCommonPrefixNode(RegexNode alternation)
10761076
{
10771077
Debug.Assert(alternation.Kind == RegexNodeKind.Alternate);
10781078
Debug.Assert(alternation.Children is List<RegexNode> { Count: >= 2 });
@@ -1097,7 +1097,7 @@ static RegexNode ExtractCommonPrefixOneNotoneSet(RegexNode alternation)
10971097
{
10981098
Debug.Assert(children[startingIndex].Children is List<RegexNode> { Count: >= 2 });
10991099

1100-
// Only handle the case where each branch begins with the same One, Notone, or Set (individual or loop).
1100+
// Only handle the case where each branch begins with the same One, Notone, Set (individual or loop), or Anchor.
11011101
// Note that while we can do this for individual characters, fixed length loops, and atomic loops, doing
11021102
// it for non-atomic variable length loops could change behavior as each branch could otherwise have a
11031103
// different number of characters consumed by the loop based on what's after it.
@@ -1107,6 +1107,10 @@ static RegexNode ExtractCommonPrefixOneNotoneSet(RegexNode alternation)
11071107
case RegexNodeKind.One or RegexNodeKind.Notone or RegexNodeKind.Set:
11081108
case RegexNodeKind.Oneloopatomic or RegexNodeKind.Notoneloopatomic or RegexNodeKind.Setloopatomic:
11091109
case RegexNodeKind.Oneloop or RegexNodeKind.Notoneloop or RegexNodeKind.Setloop or RegexNodeKind.Onelazy or RegexNodeKind.Notonelazy or RegexNodeKind.Setlazy when required.M == required.N:
1110+
case RegexNodeKind.Beginning or RegexNodeKind.Start or RegexNodeKind.Bol
1111+
or RegexNodeKind.End or RegexNodeKind.EndZ or RegexNodeKind.Eol
1112+
or RegexNodeKind.Boundary or RegexNodeKind.ECMABoundary
1113+
or RegexNodeKind.NonBoundary or RegexNodeKind.NonECMABoundary:
11101114
break;
11111115

11121116
default:

0 commit comments

Comments
 (0)