@@ -3,7 +3,7 @@ use std::error::Error;
3
3
use regex_automata:: {
4
4
dfa:: { dense, Automaton , OverlappingState } ,
5
5
nfa:: thompson,
6
- HalfMatch , Input , MatchError ,
6
+ Anchored , HalfMatch , Input , MatchError ,
7
7
} ;
8
8
9
9
// Tests that quit bytes in the forward direction work correctly.
@@ -67,3 +67,93 @@ fn unicode_word_implicitly_works() -> Result<(), Box<dyn Error>> {
67
67
assert_eq ! ( Ok ( Some ( expected) ) , dfa. try_search_fwd( & Input :: new( b" a" ) ) ) ;
68
68
Ok ( ( ) )
69
69
}
70
+
71
// Mirrors the doctest for [`Automaton::is_special_state`], except that the
// hand-rolled search begins from a universal start state instead of a start
// state computed from the input.
#[test]
fn universal_start_search() -> Result<(), Box<dyn Error>> {
    // Manual unanchored forward search that drives the automaton one byte at
    // a time. Returns the last match observed (if any), or a quit error when
    // the automaton enters a quit state before any match was seen.
    fn find<A: Automaton>(
        dfa: &A,
        haystack: &[u8],
    ) -> Result<Option<HalfMatch>, MatchError> {
        let mut sid = dfa
            .universal_start_state(Anchored::No)
            .expect("regex should not require lookbehind");
        let mut found = None;

        for (at, &byte) in haystack.iter().enumerate() {
            sid = dfa.next_state(sid, byte);
            // Only special states need any further inspection.
            if !dfa.is_special_state(sid) {
                continue;
            }
            if dfa.is_match_state(sid) {
                found = Some(HalfMatch::new(dfa.match_pattern(sid, 0), at));
                continue;
            }
            if dfa.is_dead_state(sid) {
                // A dead state never transitions anywhere else, so whatever
                // has been recorded so far is the final answer.
                return Ok(found);
            }
            if dfa.is_quit_state(sid) {
                // A quit state can be reached after a match was already
                // observed; in that case the match wins over the error.
                return match found {
                    Some(m) => Ok(Some(m)),
                    None => Err(MatchError::quit(byte, at)),
                };
            }
            // Start and accel states could also be special-cased here for
            // speed, but correctness does not depend on it.
        }

        // Matches are reported one byte late, so the end-of-input transition
        // has to be taken explicitly to catch a match ending at the very end
        // of the haystack.
        let eoi = dfa.next_eoi_state(sid);
        if dfa.is_match_state(eoi) {
            found = Some(HalfMatch::new(
                dfa.match_pattern(eoi, 0),
                haystack.len(),
            ));
        }
        Ok(found)
    }

    // Searches `haystack` with `dfa` and asserts that a match was found with
    // the given pattern index and end offset.
    fn check_impl(
        dfa: impl Automaton,
        haystack: &str,
        pat: usize,
        offset: usize,
    ) -> Result<(), Box<dyn Error>> {
        let got = find(&dfa, haystack.as_bytes())?.unwrap();
        assert_eq!(got.pattern().as_usize(), pat);
        assert_eq!(got.offset(), offset);
        Ok(())
    }

    // Runs the same expectation against the dense DFA and against its sparse
    // conversion.
    fn check(
        dfa: &dense::DFA<Vec<u32>>,
        haystack: &str,
        pat: usize,
        offset: usize,
    ) -> Result<(), Box<dyn Error>> {
        check_impl(dfa, haystack, pat, offset)?;
        let sparse = dfa.to_sparse()?;
        check_impl(sparse, haystack, pat, offset)
    }

    let haystack = "123 foobar 4567";

    let letters = dense::DFA::new(r"[a-z]+")?;
    check(&letters, haystack, 0, 10)?;

    let four_digits = dense::DFA::new(r"[0-9]{4}")?;
    check(&four_digits, haystack, 0, 15)?;

    let multi = dense::DFA::new_many(&[r"[a-z]+", r"[0-9]+"])?;
    check(&multi, haystack, 1, 3)?;
    check(&multi, &haystack[3..], 0, 7)?;
    check(&multi, &haystack[10..], 1, 5)
}
0 commit comments