@@ -5,7 +5,6 @@ extern crate alloc;
55
66use alloc:: collections:: BTreeMap ;
77use alloc:: {
8- boxed:: Box ,
98 string:: { String , ToString } ,
109 vec:: Vec ,
1110} ;
@@ -133,12 +132,22 @@ macro_rules! regex_m {
133132}
134133
135134#[ derive( Debug , PartialEq ) ]
136- pub enum TaggedWord {
135+ enum TaggedWord {
137136 Word ( String ) ,
138137 Number ( String ) ,
139138 Symbol ( String ) ,
140139 Abbr ( String ) ,
141140}
141+ impl Into < String > for TaggedWord {
142+ fn into ( self ) -> String {
143+ match self {
144+ TaggedWord :: Word ( word) => word,
145+ TaggedWord :: Number ( word) => word,
146+ TaggedWord :: Symbol ( word) => word,
147+ TaggedWord :: Abbr ( word) => word,
148+ }
149+ }
150+ }
142151impl TaggedWord {
143152 fn from_str < S : AsRef < str > > ( s : S ) -> Self {
144153 let s: & str = s. as_ref ( ) ;
@@ -149,21 +158,14 @@ impl TaggedWord {
149158 }
150159 TaggedWord :: Word ( s. to_string ( ) )
151160 }
152- fn into_plain_word ( self ) -> Self {
153- match self {
154- Self :: Word ( word) => Self :: Word ( word) ,
155- Self :: Number ( word) => Self :: Word ( word) ,
156- Self :: Symbol ( word) => Self :: Word ( word) ,
157- Self :: Abbr ( word) => Self :: Word ( word) ,
158- }
159- }
160161 fn normalize ( self ) -> Self {
161162 match self {
162- Self :: Word ( word) => Self :: Word ( word) ,
163- Self :: Number ( word) => normalize_number ( & word) . unwrap_or ( Self :: Word ( word ) ) ,
164- Self :: Symbol ( word) => normalize_symbol ( & word) . unwrap_or ( Self :: Word ( word ) ) ,
165- Self :: Abbr ( word) => normalize_abbr ( & word) . unwrap_or ( Self :: Word ( word ) ) ,
163+ Self :: Word ( ref word) => normalize_word ( & word) ,
164+ Self :: Number ( ref word) => normalize_number ( & word) ,
165+ Self :: Symbol ( ref word) => normalize_symbol ( & word) ,
166+ Self :: Abbr ( ref word) => normalize_abbr ( & word) ,
166167 }
168+ . unwrap_or ( Self :: Word ( self . into ( ) ) )
167169 }
168170 fn to_string ( self ) -> String {
169171 match self {
@@ -177,7 +179,6 @@ impl TaggedWord {
177179
178180const NUMBER_REGEX_STR : & str = "\\ $?[0-9,]+((st)|(nd)|(th))?" ;
179181const NUMBER_REGEX : LazyCell < Regex > = LazyCell :: new ( || Regex :: new ( NUMBER_REGEX_STR ) . unwrap ( ) ) ;
180- const WORD_REGEX : & str = "[a-zA-Z]?[a-z']+" ;
181182// All uppercase words are symbols and are spoken letter by letter
182183const SYMBOL_REGEX_STR : & str = "[A-Z.]{2,}" ;
183184const SYMBOL_REGEX : LazyCell < Regex > = LazyCell :: new ( || Regex :: new ( SYMBOL_REGEX_STR ) . unwrap ( ) ) ;
@@ -191,11 +192,14 @@ const ABBR_DICT: LazyCell<BTreeMap<&'static str, &'static str>> = LazyCell::new(
191192 abbr_dict
192193} ) ;
193194
194- fn tag_words ( input : & str ) -> Vec < TaggedWord > {
195+ pub fn normalize ( input : & str ) -> String {
195196 input
196197 . split_whitespace ( )
197- . map ( |word| TaggedWord :: from_str ( word) )
198- . collect :: < Vec < TaggedWord > > ( )
198+ . map ( TaggedWord :: from_str)
199+ . map ( |s| s. normalize ( ) )
200+ . map ( |n| n. to_string ( ) )
201+ . collect :: < Vec < String > > ( )
202+ . join ( " " )
199203}
200204
201205#[ cfg( test) ]
0 commit comments