@@ -27,8 +27,10 @@ impl CharNormalizer for LowercaseNormalizer {
2727
2828 fn should_normalize ( & self , token : & Token ) -> bool {
2929 // https://en.wikipedia.org/wiki/Letter_case#Capitalisation
30- matches ! ( token. script, Script :: Latin | Script :: Cyrillic | Script :: Greek | Script :: Georgian )
31- && token. lemma . chars ( ) . any ( char:: is_uppercase)
30+ matches ! (
31+ token. script,
32+ Script :: Latin | Script :: Cyrillic | Script :: Greek | Script :: Georgian | Script :: Armenian
33+ ) && token. lemma . chars ( ) . any ( char:: is_uppercase)
3234 }
3335}
3436
@@ -41,58 +43,106 @@ mod test {
4143 use crate :: token:: TokenKind ;
4244
4345 fn tokens ( ) -> Vec < Token < ' static > > {
44- vec ! [ Token {
45- lemma: Owned ( "PascalCase" . to_string( ) ) ,
46- char_end: 10 ,
47- byte_end: 10 ,
48- script: Script :: Latin ,
49- ..Default :: default ( )
50- } ]
46+ vec ! [
47+ Token {
48+ lemma: Owned ( "PascalCase" . to_string( ) ) ,
49+ char_end: 10 ,
50+ byte_end: 10 ,
51+ script: Script :: Latin ,
52+ ..Default :: default ( )
53+ } ,
54+ Token {
55+ lemma: Owned ( "ՀայասՏան" . to_string( ) ) ,
56+ char_end: 8 ,
57+ byte_end: 16 ,
58+ script: Script :: Armenian ,
59+ ..Default :: default ( )
60+ } ,
61+ ]
5162 }
5263
5364 fn normalizer_result ( ) -> Vec < Token < ' static > > {
54- vec ! [ Token {
55- lemma: Owned ( "pascalcase" . to_string( ) ) ,
56- char_end: 10 ,
57- byte_end: 10 ,
58- script: Script :: Latin ,
59- char_map: Some ( vec![
60- ( 1 , 1 ) ,
61- ( 1 , 1 ) ,
62- ( 1 , 1 ) ,
63- ( 1 , 1 ) ,
64- ( 1 , 1 ) ,
65- ( 1 , 1 ) ,
66- ( 1 , 1 ) ,
67- ( 1 , 1 ) ,
68- ( 1 , 1 ) ,
69- ( 1 , 1 ) ,
70- ] ) ,
71- ..Default :: default ( )
72- } ]
65+ vec ! [
66+ Token {
67+ lemma: Owned ( "pascalcase" . to_string( ) ) ,
68+ char_end: 10 ,
69+ byte_end: 10 ,
70+ script: Script :: Latin ,
71+ char_map: Some ( vec![
72+ ( 1 , 1 ) ,
73+ ( 1 , 1 ) ,
74+ ( 1 , 1 ) ,
75+ ( 1 , 1 ) ,
76+ ( 1 , 1 ) ,
77+ ( 1 , 1 ) ,
78+ ( 1 , 1 ) ,
79+ ( 1 , 1 ) ,
80+ ( 1 , 1 ) ,
81+ ( 1 , 1 ) ,
82+ ] ) ,
83+ ..Default :: default ( )
84+ } ,
85+ Token {
86+ lemma: Owned ( "հայաստան" . to_string( ) ) ,
87+ char_end: 8 ,
88+ byte_end: 16 ,
89+ script: Script :: Armenian ,
90+ char_map: Some ( vec![
91+ ( 2 , 2 ) ,
92+ ( 2 , 2 ) ,
93+ ( 2 , 2 ) ,
94+ ( 2 , 2 ) ,
95+ ( 2 , 2 ) ,
96+ ( 2 , 2 ) ,
97+ ( 2 , 2 ) ,
98+ ( 2 , 2 ) ,
99+ ] ) ,
100+ ..Default :: default ( )
101+ } ,
102+ ]
73103 }
74104
75105 fn normalized_tokens ( ) -> Vec < Token < ' static > > {
76- vec ! [ Token {
77- lemma: Owned ( "pascalcase" . to_string( ) ) ,
78- char_end: 10 ,
79- byte_end: 10 ,
80- script: Script :: Latin ,
81- kind: TokenKind :: Word ,
82- char_map: Some ( vec![
83- ( 1 , 1 ) ,
84- ( 1 , 1 ) ,
85- ( 1 , 1 ) ,
86- ( 1 , 1 ) ,
87- ( 1 , 1 ) ,
88- ( 1 , 1 ) ,
89- ( 1 , 1 ) ,
90- ( 1 , 1 ) ,
91- ( 1 , 1 ) ,
92- ( 1 , 1 ) ,
93- ] ) ,
94- ..Default :: default ( )
95- } ]
106+ vec ! [
107+ Token {
108+ lemma: Owned ( "pascalcase" . to_string( ) ) ,
109+ char_end: 10 ,
110+ byte_end: 10 ,
111+ script: Script :: Latin ,
112+ kind: TokenKind :: Word ,
113+ char_map: Some ( vec![
114+ ( 1 , 1 ) ,
115+ ( 1 , 1 ) ,
116+ ( 1 , 1 ) ,
117+ ( 1 , 1 ) ,
118+ ( 1 , 1 ) ,
119+ ( 1 , 1 ) ,
120+ ( 1 , 1 ) ,
121+ ( 1 , 1 ) ,
122+ ( 1 , 1 ) ,
123+ ( 1 , 1 ) ,
124+ ] ) ,
125+ ..Default :: default ( )
126+ } ,
127+ Token {
128+ lemma: Owned ( "հայաստան" . to_string( ) ) ,
129+ char_end: 8 ,
130+ byte_end: 16 ,
131+ script: Script :: Armenian ,
132+ kind: TokenKind :: Word ,
133+ char_map: Some ( vec![
134+ ( 2 , 2 ) ,
135+ ( 2 , 2 ) ,
136+ ( 2 , 2 ) ,
137+ ( 2 , 2 ) ,
138+ ( 2 , 2 ) ,
139+ ( 2 , 2 ) ,
140+ ( 2 , 2 ) ,
141+ ( 2 , 2 ) ,
142+ ] ) ,
143+ ..Default :: default ( )
144+ } ,
145+ ]
96146 }
97147
98148 test_normalizer ! ( LowercaseNormalizer , tokens( ) , normalizer_result( ) , normalized_tokens( ) ) ;
0 commit comments