Skip to content

Commit 35cee1b

Browse files
committed
automata: Fix broken universal start states with sparse DFA
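When a dense DFA was converted to a sparse DFA, the universal start state IDs were not remapped to the sparse DFA's state IDs, so using them would usually result in an index out of range error. Add a test based on `Automaton::is_special_state`'s doctest, which validates the universal start state's behavior with a custom searcher on both the dense DFA and its sparse conversion.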
1 parent ddeb85e commit 35cee1b

File tree

2 files changed

+99
-1
lines changed

2 files changed

+99
-1
lines changed

regex-automata/src/dfa/sparse.rs

+8
Original file line numberDiff line numberDiff line change
@@ -1846,6 +1846,14 @@ impl StartTable<Vec<u8>> {
18461846
let new_start_id = remap[dfa.to_index(old_start_id)];
18471847
sl.set_start(anchored, sty, new_start_id);
18481848
}
1849+
for ustart in [
1850+
&mut sl.universal_start_unanchored,
1851+
&mut sl.universal_start_anchored,
1852+
] {
1853+
if let Some(id) = ustart {
1854+
*id = remap[dfa.to_index(*id)];
1855+
}
1856+
}
18491857
Ok(sl)
18501858
}
18511859
}

regex-automata/tests/dfa/api.rs

+91-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ use std::error::Error;
33
use regex_automata::{
44
dfa::{dense, Automaton, OverlappingState},
55
nfa::thompson,
6-
HalfMatch, Input, MatchError,
6+
Anchored, HalfMatch, Input, MatchError,
77
};
88

99
// Tests that quit bytes in the forward direction work correctly.
@@ -67,3 +67,93 @@ fn unicode_word_implicitly_works() -> Result<(), Box<dyn Error>> {
6767
assert_eq!(Ok(Some(expected)), dfa.try_search_fwd(&Input::new(b" a")));
6868
Ok(())
6969
}
70+
71+
// Mirrors the doctest for [`Automaton::is_special_state`], except that the
// search is seeded from the universal start state. Every case is executed
// against the dense DFA and against its sparse conversion.
#[test]
fn universal_start_search() -> Result<(), Box<dyn Error>> {
    // A hand-rolled unanchored forward search that only uses the low level
    // transition API exposed by `Automaton`.
    fn find<A: Automaton>(
        dfa: &A,
        haystack: &[u8],
    ) -> Result<Option<HalfMatch>, MatchError> {
        let mut sid = dfa
            .universal_start_state(Anchored::No)
            .expect("regex should not require lookbehind");
        let mut found = None;
        // Feed the haystack to the automaton one byte at a time. Reaching a
        // dead state (no other state is reachable from it) or a quit state
        // (the search itself has failed) lets us stop early.
        for (at, &byte) in haystack.iter().enumerate() {
            sid = dfa.next_state(sid, byte);
            // Start and accel states are special too, and a faster searcher
            // could exploit them, but ignoring them is still correct.
            if !dfa.is_special_state(sid) {
                continue;
            }
            if dfa.is_match_state(sid) {
                let pid = dfa.match_pattern(sid, 0);
                found = Some(HalfMatch::new(pid, at));
            } else if dfa.is_dead_state(sid) {
                return Ok(found);
            } else if dfa.is_quit_state(sid) {
                // A quit state seen after a match was already recorded
                // reports that match rather than an error.
                return match found {
                    Some(_) => Ok(found),
                    None => Err(MatchError::quit(byte, at)),
                };
            }
        }
        // Since matches are reported one byte late, the search must finish
        // by following the special end-of-input transition.
        let eoi = dfa.next_eoi_state(sid);
        if dfa.is_match_state(eoi) {
            let pid = dfa.match_pattern(eoi, 0);
            found = Some(HalfMatch::new(pid, haystack.len()));
        }
        Ok(found)
    }

    // Runs `find` with one automaton and asserts on the pattern index and
    // end offset of the reported match.
    fn check_impl(
        dfa: impl Automaton,
        haystack: &str,
        pat: usize,
        offset: usize,
    ) -> Result<(), Box<dyn Error>> {
        let got = find(&dfa, haystack.as_bytes())?.unwrap();
        assert_eq!(got.pattern().as_usize(), pat);
        assert_eq!(got.offset(), offset);
        Ok(())
    }

    // Applies the same expectation to the dense DFA first and then to the
    // sparse DFA produced from it.
    fn check(
        dfa: &dense::DFA<Vec<u32>>,
        haystack: &str,
        pat: usize,
        offset: usize,
    ) -> Result<(), Box<dyn Error>> {
        check_impl(dfa, haystack, pat, offset)?;
        let sparse = dfa.to_sparse()?;
        check_impl(sparse, haystack, pat, offset)
    }

    let haystack = "123 foobar 4567";

    let letters = dense::DFA::new(r"[a-z]+")?;
    check(&letters, haystack, 0, 10)?;

    let four_digits = dense::DFA::new(r"[0-9]{4}")?;
    check(&four_digits, haystack, 0, 15)?;

    let multi = dense::DFA::new_many(&[r"[a-z]+", r"[0-9]+"])?;
    check(&multi, haystack, 1, 3)?;
    check(&multi, &haystack[3..], 0, 7)?;
    check(&multi, &haystack[10..], 1, 5)?;

    Ok(())
}

0 commit comments

Comments
 (0)