-
Notifications
You must be signed in to change notification settings - Fork 461
/
Copy pathstring.rs
149 lines (144 loc) · 5.34 KB
/
string.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
use {
anyhow::Result,
regex_lite::{Regex, RegexBuilder},
regex_test::{
CompiledRegex, Match, RegexTest, Span, TestResult, TestRunner,
},
};
/// Runs the shared regex test suite against `regex_lite::Regex`, using
/// `compiler` to build the regex for each test.
///
/// Each test that reports `compiles()` is expanded under the names
/// `is_match`, `find` and `captures`, which are exactly the names that
/// `run_test` dispatches on. Tests listed in `super::BLACKLIST` are excluded,
/// and the final `assert()` reports any failures.
///
/// # Errors
///
/// Returns an error if the test runner cannot be created or if the test
/// suite fails to load.
#[test]
fn default() -> Result<()> {
    let mut runner = TestRunner::new()?;
    runner
        .expand(&["is_match", "find", "captures"], |test| test.compiles())
        .blacklist_iter(super::BLACKLIST)
        .test_iter(crate::suite()?.iter(), compiler)
        .assert();
    Ok(())
}
/// Executes one (already expanded) test against `re` and reports what the
/// regex produced.
///
/// The haystack must be valid UTF-8 because the searches below operate on
/// `&str`; a haystack that is not valid UTF-8 is reported as a test failure.
/// The search performed is selected by `test.additional_name()`:
///
/// * `is_match` reports whether the haystack matches at all.
/// * `find` reports the span of each match, up to the test's match limit.
/// * `captures` reports the capture groups of each match, up to the test's
///   match limit.
///
/// Any other name is reported as a test failure.
fn run_test(re: &Regex, test: &RegexTest) -> TestResult {
    let haystack = match std::str::from_utf8(test.haystack()) {
        Err(err) => {
            return TestResult::fail(&format!(
                "haystack is not valid UTF-8: {err}"
            ));
        }
        Ok(valid) => valid,
    };
    // A test without an explicit limit places no bound on the match count.
    let limit = test.match_limit().unwrap_or(usize::MAX);
    match test.additional_name() {
        "is_match" => TestResult::matched(re.is_match(haystack)),
        "find" => {
            let found = re.find_iter(haystack).take(limit).map(|m| {
                let span = Span { start: m.start(), end: m.end() };
                Match { id: 0, span }
            });
            TestResult::matches(found)
        }
        "captures" => {
            let found = re
                .captures_iter(haystack)
                .take(limit)
                .map(|caps| testify_captures(&caps));
            TestResult::captures(found)
        }
        name => TestResult::fail(&format!("unrecognized test name: {name}")),
    }
}
/// Converts the given regex test to a closure that searches with a
/// `regex_lite::Regex`. If the test configuration is unsupported (as decided
/// by `skip_or_get_pattern`), then a `CompiledRegex` that skips the test is
/// returned.
///
/// The `_patterns` argument is ignored: the single pattern to compile is
/// taken from `test` itself, and the only option carried over from the test
/// to the builder is case insensitivity.
///
/// # Errors
///
/// Returns an error if the pattern fails to build.
fn compiler(
    test: &RegexTest,
    _patterns: &[String],
) -> anyhow::Result<CompiledRegex> {
    let Some(pattern) = skip_or_get_pattern(test) else {
        return Ok(CompiledRegex::skip());
    };
    let re = RegexBuilder::new(pattern)
        .case_insensitive(test.case_insensitive())
        .build()?;
    // The closure owns the compiled regex so it can be invoked for each
    // expanded variant of the test.
    Ok(CompiledRegex::compiled(move |test| run_test(&re, test)))
}
/// Decides whether `test` can be run against a single-pattern, `&str`-only
/// regex. Returns the test's one pattern when it can, and `None` when the
/// test should be skipped.
fn skip_or_get_pattern(test: &RegexTest) -> Option<&str> {
    // Only Regex is tested here, and it supports exactly one pattern.
    let patterns = test.regexes();
    if patterns.len() != 1 {
        return None;
    }
    let pattern: &str = &patterns[0];

    // Tests whose name contains 'regex-lite/' were written specifically for
    // regex-lite, so they ALWAYS run. If one of them fails, something needs
    // attention.
    if test.full_name().contains("regex-lite/") {
        return Some(pattern);
    }

    let contains_any = |escapes: &[&str]| {
        escapes.iter().any(|escape| pattern.contains(*escape))
    };

    // A \p (or \P) almost certainly means a Unicode class, which we don't
    // support. This probably skips more than intended, but there are likely
    // very few tests that contain a \p that isn't also a Unicode class.
    if contains_any(&[r"\p", r"\P"]) {
        return None;
    }

    // Perl classes and word boundaries get the same treatment, except that
    // we can abide them when the haystack is ASCII-only.
    let ascii_haystack = test.haystack().is_ascii();
    let ascii_only_escapes =
        [r"\d", r"\D", r"\s", r"\S", r"\w", r"\W", r"\b", r"\B"];
    if !ascii_haystack && contains_any(&ascii_only_escapes) {
        return None;
    }

    // Only is_match, find_iter and captures_iter are tested, and all of
    // those are leftmost searches.
    let leftmost =
        matches!(test.search_kind(), regex_test::SearchKind::Leftmost);
    // The top-level single-pattern regex API always uses leftmost-first.
    let leftmost_first =
        matches!(test.match_kind(), regex_test::MatchKind::LeftmostFirst);
    if !leftmost || !leftmost_first {
        return None;
    }

    // The top-level regex API always runs unanchored searches, but an
    // anchored test that asks for only one match can still be handled.
    if test.anchored() && test.match_limit() != Some(1) {
        return None;
    }

    // Tests with explicit search bounds aren't supported. They probably
    // could be by using the 'find_at' (and such) APIs.
    let bounds = test.bounds();
    if bounds.start != 0 || bounds.end != test.haystack().len() {
        return None;
    }

    // The Regex API specifically does not support disabling UTF-8 mode
    // because it can only search &str, which is always valid UTF-8.
    if !test.utf8() {
        return None;
    }

    // regex-lite doesn't support Unicode-aware case insensitive matching.
    if test.case_insensitive() && !(pattern.is_ascii() && ascii_haystack) {
        return None;
    }

    Some(pattern)
}
/// Convert `Captures` into the test suite's capture values.
fn testify_captures(caps: ®ex_lite::Captures<'_>) -> regex_test::Captures {
let spans = caps.iter().map(|group| {
group.map(|m| regex_test::Span { start: m.start(), end: m.end() })
});
// This unwrap is OK because we assume our 'caps' represents a match, and
// a match always gives a non-zero number of groups with the first group
// being non-None.
regex_test::Captures::new(0, spans).unwrap()
}