Skip to content

Commit 5881ac3

Browse files
committed
automata: Fix broken universal start states with sparse DFA
The universal start state IDs were not remapped when building the sparse DFA's start table, which would usually result in an index-out-of-range error. Add a test, based on the doctest for `is_special_state`, that validates the start state's behavior with a custom searcher.
1 parent ddeb85e commit 5881ac3

File tree

2 files changed

+100
-1
lines changed

2 files changed

+100
-1
lines changed

regex-automata/src/dfa/sparse.rs

+8
Original file line numberDiff line numberDiff line change
@@ -1846,6 +1846,14 @@ impl StartTable<Vec<u8>> {
18461846
let new_start_id = remap[dfa.to_index(old_start_id)];
18471847
sl.set_start(anchored, sty, new_start_id);
18481848
}
1849+
for ustart in [
1850+
&mut sl.universal_start_unanchored,
1851+
&mut sl.universal_start_anchored,
1852+
] {
1853+
if let Some(id) = ustart {
1854+
*id = remap[dfa.to_index(*id)];
1855+
}
1856+
}
18491857
Ok(sl)
18501858
}
18511859
}

regex-automata/tests/dfa/api.rs

+92-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ use std::error::Error;
33
use regex_automata::{
44
dfa::{dense, Automaton, OverlappingState},
55
nfa::thompson,
6-
HalfMatch, Input, MatchError,
6+
Anchored, HalfMatch, Input, MatchError,
77
};
88

99
// Tests that quit bytes in the forward direction work correctly.
@@ -67,3 +67,94 @@ fn unicode_word_implicitly_works() -> Result<(), Box<dyn Error>> {
6767
assert_eq!(Ok(Some(expected)), dfa.try_search_fwd(&Input::new(b" a")));
6868
Ok(())
6969
}
70+
71+
// A variant of the doctest for [`Automaton::is_special_state`], except that
// the search begins from a universal start state. Every case is exercised
// against both the dense DFA and the sparse DFA produced by `to_sparse`.
#[test]
fn universal_start_search() -> Result<(), Box<dyn Error>> {
    // A hand-written forward search that begins at the unanchored universal
    // start state and returns the last match observed, if any.
    fn find<A: Automaton>(
        dfa: &A,
        haystack: &[u8],
    ) -> Result<Option<HalfMatch>, MatchError> {
        let mut sid = dfa
            .universal_start_state(Anchored::No)
            .expect("regex should not require lookbehind");
        assert!(dfa.is_start_state(sid));
        let mut found = None;
        // Feed every haystack byte to the automaton. A dead state means no
        // further transitions can change the outcome, and a quit state means
        // the search failed, so both allow stopping before the end.
        for (at, &byte) in haystack.iter().enumerate() {
            sid = dfa.next_state(sid, byte);
            // Only special states need attention. Start and accel states are
            // special too, but treating them differently is purely a
            // performance concern and is not needed for correctness.
            if !dfa.is_special_state(sid) {
                continue;
            }
            if dfa.is_match_state(sid) {
                found = Some(HalfMatch::new(dfa.match_pattern(sid, 0), at));
            } else if dfa.is_dead_state(sid) {
                return Ok(found);
            } else if dfa.is_quit_state(sid) {
                // A quit state may be entered after a match has already been
                // seen; in that case the match wins over the error.
                return match found {
                    Some(_) => Ok(found),
                    None => Err(MatchError::quit(byte, at)),
                };
            }
        }
        // Matches are reported one byte late, so the special end-of-input
        // transition has to be taken explicitly once the haystack is
        // exhausted.
        sid = dfa.next_eoi_state(sid);
        if dfa.is_match_state(sid) {
            let end = haystack.len();
            found = Some(HalfMatch::new(dfa.match_pattern(sid, 0), end));
        }
        Ok(found)
    }

    // Runs `find` with one automaton and asserts on the pattern index and
    // the offset of the reported match. Panics if no match is reported.
    fn check_impl(
        dfa: impl Automaton,
        haystack: &str,
        pat: usize,
        offset: usize,
    ) -> Result<(), Box<dyn Error>> {
        let mat = find(&dfa, haystack.as_bytes())?.unwrap();
        assert_eq!(mat.pattern().as_usize(), pat);
        assert_eq!(mat.offset(), offset);
        Ok(())
    }

    // Checks the dense DFA first, then the sparse DFA converted from it.
    fn check(
        dfa: &dense::DFA<Vec<u32>>,
        haystack: &str,
        pat: usize,
        offset: usize,
    ) -> Result<(), Box<dyn Error>> {
        check_impl(dfa, haystack, pat, offset)?;
        let sparse = dfa.to_sparse()?;
        check_impl(sparse, haystack, pat, offset)
    }

    // All cases search the same text (or a suffix of it).
    let haystack = "123 foobar 4567";

    let letters = dense::DFA::new(r"[a-z]+")?;
    check(&letters, haystack, 0, 10)?;

    let four_digits = dense::DFA::new(r"[0-9]{4}")?;
    check(&four_digits, haystack, 0, 15)?;

    let multi = dense::DFA::new_many(&[r"[a-z]+", r"[0-9]+"])?;
    check(&multi, haystack, 1, 3)?;
    check(&multi, &haystack[3..], 0, 7)?;
    check(&multi, &haystack[10..], 1, 5)?;

    Ok(())
}

0 commit comments

Comments
 (0)