@@ -3,7 +3,7 @@ use std::error::Error;
3
3
use regex_automata:: {
4
4
dfa:: { dense, Automaton , OverlappingState } ,
5
5
nfa:: thompson,
6
- HalfMatch , Input , MatchError ,
6
+ Anchored , HalfMatch , Input , MatchError ,
7
7
} ;
8
8
9
9
// Tests that quit bytes in the forward direction work correctly.
@@ -67,3 +67,94 @@ fn unicode_word_implicitly_works() -> Result<(), Box<dyn Error>> {
67
67
assert_eq ! ( Ok ( Some ( expected) ) , dfa. try_search_fwd( & Input :: new( b" a" ) ) ) ;
68
68
Ok ( ( ) )
69
69
}
70
+
71
// A variant of the doctest for [`Automaton::is_special_state`], except that
// the search begins from a universal start state.
#[test]
fn universal_start_search() -> Result<(), Box<dyn Error>> {
    // Drives `dfa` over `haystack` one byte at a time and reports the last
    // match observed, if any. A quit state is surfaced as an error unless a
    // match was already recorded.
    fn find<A: Automaton>(
        dfa: &A,
        haystack: &[u8],
    ) -> Result<Option<HalfMatch>, MatchError> {
        let start = dfa.universal_start_state(Anchored::No);
        let mut sid = start.expect("regex should not require lookbehind");
        assert!(dfa.is_start_state(sid));

        let mut found = None;
        // Feed every byte to the automaton. Only "special" states need any
        // further inspection; everything else just keeps scanning.
        for (at, &byte) in haystack.iter().enumerate() {
            sid = dfa.next_state(sid, byte);
            if !dfa.is_special_state(sid) {
                continue;
            }
            if dfa.is_match_state(sid) {
                let pid = dfa.match_pattern(sid, 0);
                found = Some(HalfMatch::new(pid, at));
                continue;
            }
            if dfa.is_dead_state(sid) {
                // A dead state never transitions anywhere else, so whatever
                // has been recorded so far is the final answer.
                return Ok(found);
            }
            if dfa.is_quit_state(sid) {
                // A quit state means the search failed, but it may be
                // reached after a match was already seen. Prefer reporting
                // that match over reporting the error.
                return match found {
                    Some(hm) => Ok(Some(hm)),
                    None => Err(MatchError::quit(byte, at)),
                };
            }
            // Start and accel states could be special-cased here for speed,
            // but correctness does not depend on it.
        }

        // Matches are reported one byte late, so the end-of-input
        // transition has to be taken explicitly once the haystack is
        // exhausted.
        let eoi = dfa.next_eoi_state(sid);
        if dfa.is_match_state(eoi) {
            let pid = dfa.match_pattern(eoi, 0);
            found = Some(HalfMatch::new(pid, haystack.len()));
        }
        Ok(found)
    }

    // Runs `find` and asserts that it reports pattern `pat` ending at
    // `offset`. Panics if no match is found at all.
    fn check_impl(
        dfa: impl Automaton,
        haystack: &str,
        pat: usize,
        offset: usize,
    ) -> Result<(), Box<dyn Error>> {
        let mat = find(&dfa, haystack.as_bytes())?.unwrap();
        assert_eq!(mat.pattern().as_usize(), pat);
        assert_eq!(mat.offset(), offset);
        Ok(())
    }

    // Verifies the expectation against the dense DFA and then against its
    // sparse conversion.
    fn check(
        dfa: &dense::DFA<Vec<u32>>,
        haystack: &str,
        pat: usize,
        offset: usize,
    ) -> Result<(), Box<dyn Error>> {
        check_impl(dfa, haystack, pat, offset)?;
        let sparse = dfa.to_sparse()?;
        check_impl(sparse, haystack, pat, offset)
    }

    let haystack = "123 foobar 4567";

    let letters = dense::DFA::new(r"[a-z]+")?;
    check(&letters, haystack, 0, 10)?;

    let four_digits = dense::DFA::new(r"[0-9]{4}")?;
    check(&four_digits, haystack, 0, 15)?;

    let multi = dense::DFA::new_many(&[r"[a-z]+", r"[0-9]+"])?;
    check(&multi, haystack, 1, 3)?;
    check(&multi, &haystack[3..], 0, 7)?;
    check(&multi, &haystack[10..], 1, 5)?;

    Ok(())
}
0 commit comments