@@ -5,7 +5,6 @@ extern crate alloc;
5
5
6
6
use alloc:: collections:: BTreeMap ;
7
7
use alloc:: {
8
- boxed:: Box ,
9
8
string:: { String , ToString } ,
10
9
vec:: Vec ,
11
10
} ;
@@ -133,12 +132,22 @@ macro_rules! regex_m {
133
132
}
134
133
135
134
/// A single input token paired with the lexical category it was tagged with.
///
/// Every variant owns the token's text as a `String`. The tag selects which
/// normalizer is applied to the token later on.
#[derive(Debug, PartialEq, Eq)]
enum TaggedWord {
    /// An ordinary word; also the category used when no other tag applies.
    Word(String),
    /// A numeric token.
    Number(String),
    /// A symbol. A comment elsewhere in this file describes all-uppercase
    /// words as symbols that are spoken letter by letter.
    Symbol(String),
    /// An abbreviation.
    Abbr(String),
}
141
+ impl Into < String > for TaggedWord {
142
+ fn into ( self ) -> String {
143
+ match self {
144
+ TaggedWord :: Word ( word) => word,
145
+ TaggedWord :: Number ( word) => word,
146
+ TaggedWord :: Symbol ( word) => word,
147
+ TaggedWord :: Abbr ( word) => word,
148
+ }
149
+ }
150
+ }
142
151
impl TaggedWord {
143
152
fn from_str < S : AsRef < str > > ( s : S ) -> Self {
144
153
let s: & str = s. as_ref ( ) ;
@@ -149,21 +158,14 @@ impl TaggedWord {
149
158
}
150
159
TaggedWord :: Word ( s. to_string ( ) )
151
160
}
152
- fn into_plain_word ( self ) -> Self {
153
- match self {
154
- Self :: Word ( word) => Self :: Word ( word) ,
155
- Self :: Number ( word) => Self :: Word ( word) ,
156
- Self :: Symbol ( word) => Self :: Word ( word) ,
157
- Self :: Abbr ( word) => Self :: Word ( word) ,
158
- }
159
- }
160
161
fn normalize ( self ) -> Self {
161
162
match self {
162
- Self :: Word ( word) => Self :: Word ( word) ,
163
- Self :: Number ( word) => normalize_number ( & word) . unwrap_or ( Self :: Word ( word ) ) ,
164
- Self :: Symbol ( word) => normalize_symbol ( & word) . unwrap_or ( Self :: Word ( word ) ) ,
165
- Self :: Abbr ( word) => normalize_abbr ( & word) . unwrap_or ( Self :: Word ( word ) ) ,
163
+ Self :: Word ( ref word) => normalize_word ( & word) ,
164
+ Self :: Number ( ref word) => normalize_number ( & word) ,
165
+ Self :: Symbol ( ref word) => normalize_symbol ( & word) ,
166
+ Self :: Abbr ( ref word) => normalize_abbr ( & word) ,
166
167
}
168
+ . unwrap_or ( Self :: Word ( self . into ( ) ) )
167
169
}
168
170
fn to_string ( self ) -> String {
169
171
match self {
@@ -177,7 +179,6 @@ impl TaggedWord {
177
179
178
180
// Pattern for numeric tokens: a run of ASCII digits and commas followed by an
// optional `st`, `nd` or `th` suffix. The prefix is presumably an optional
// leading `$` (the space after the backslash looks like a rendering artifact
// -- TODO confirm against the real file).
// NOTE(review): `rd` is not among the suffixes, so the suffix group does not
// cover "3rd" -- confirm whether that is intended.
const NUMBER_REGEX_STR : & str = "\\ $?[0-9,]+((st)|(nd)|(th))?" ;
179
181
// Compiled form of NUMBER_REGEX_STR; `unwrap` panics on first use if the
// pattern fails to compile.
// NOTE(review): this is a `const`, not a `static`. Constants are instantiated
// at every use site, so each mention gets its own `LazyCell` and the regex is
// compiled again there instead of once -- confirm whether a shared,
// once-initialized value was intended.
const NUMBER_REGEX : LazyCell < Regex > = LazyCell :: new ( || Regex :: new ( NUMBER_REGEX_STR ) . unwrap ( ) ) ;
180
- const WORD_REGEX : & str = "[a-zA-Z]?[a-z']+" ;
181
182
// All-uppercase words are symbols and are spoken letter by letter
182
183
// Pattern for symbol tokens: two or more characters, each an uppercase ASCII
// letter or a dot. The pattern itself carries no anchors.
const SYMBOL_REGEX_STR : & str = "[A-Z.]{2,}" ;
183
184
// Compiled form of SYMBOL_REGEX_STR; `unwrap` panics on first use if the
// pattern fails to compile.
// NOTE(review): this is a `const`, not a `static`. Constants are instantiated
// at every use site, so each mention gets its own `LazyCell` and the regex is
// compiled again there instead of once -- confirm whether a shared,
// once-initialized value was intended.
const SYMBOL_REGEX : LazyCell < Regex > = LazyCell :: new ( || Regex :: new ( SYMBOL_REGEX_STR ) . unwrap ( ) ) ;
@@ -191,11 +192,14 @@ const ABBR_DICT: LazyCell<BTreeMap<&'static str, &'static str>> = LazyCell::new(
191
192
abbr_dict
192
193
} ) ;
193
194
194
- fn tag_words ( input : & str ) -> Vec < TaggedWord > {
195
+ pub fn normalize ( input : & str ) -> String {
195
196
input
196
197
. split_whitespace ( )
197
- . map ( |word| TaggedWord :: from_str ( word) )
198
- . collect :: < Vec < TaggedWord > > ( )
198
+ . map ( TaggedWord :: from_str)
199
+ . map ( |s| s. normalize ( ) )
200
+ . map ( |n| n. to_string ( ) )
201
+ . collect :: < Vec < String > > ( )
202
+ . join ( " " )
199
203
}
200
204
201
205
#[ cfg( test) ]
0 commit comments