@@ -27,8 +27,10 @@ impl CharNormalizer for LowercaseNormalizer {

    fn should_normalize(&self, token: &Token) -> bool {
        // https://en.wikipedia.org/wiki/Letter_case#Capitalisation
-        matches!(token.script, Script::Latin | Script::Cyrillic | Script::Greek | Script::Georgian)
-            && token.lemma.chars().any(char::is_uppercase)
+        matches!(
+            token.script,
+            Script::Latin | Script::Cyrillic | Script::Greek | Script::Georgian | Script::Armenian
+        ) && token.lemma.chars().any(char::is_uppercase)
    }
}

@@ -41,58 +43,106 @@ mod test {
    use crate::token::TokenKind;

    fn tokens() -> Vec<Token<'static>> {
-        vec![Token {
-            lemma: Owned("PascalCase".to_string()),
-            char_end: 10,
-            byte_end: 10,
-            script: Script::Latin,
-            ..Default::default()
-        }]
+        vec![
+            Token {
+                lemma: Owned("PascalCase".to_string()),
+                char_end: 10,
+                byte_end: 10,
+                script: Script::Latin,
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("ՀայասՏան".to_string()),
+                char_end: 8,
+                byte_end: 16,
+                script: Script::Armenian,
+                ..Default::default()
+            },
+        ]
    }

    fn normalizer_result() -> Vec<Token<'static>> {
-        vec![Token {
-            lemma: Owned("pascalcase".to_string()),
-            char_end: 10,
-            byte_end: 10,
-            script: Script::Latin,
-            char_map: Some(vec![
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-            ]),
-            ..Default::default()
-        }]
+        vec![
+            Token {
+                lemma: Owned("pascalcase".to_string()),
+                char_end: 10,
+                byte_end: 10,
+                script: Script::Latin,
+                char_map: Some(vec![
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                ]),
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("հայաստան".to_string()),
+                char_end: 8,
+                byte_end: 16,
+                script: Script::Armenian,
+                char_map: Some(vec![
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                ]),
+                ..Default::default()
+            },
+        ]
    }

    fn normalized_tokens() -> Vec<Token<'static>> {
-        vec![Token {
-            lemma: Owned("pascalcase".to_string()),
-            char_end: 10,
-            byte_end: 10,
-            script: Script::Latin,
-            kind: TokenKind::Word,
-            char_map: Some(vec![
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-            ]),
-            ..Default::default()
-        }]
+        vec![
+            Token {
+                lemma: Owned("pascalcase".to_string()),
+                char_end: 10,
+                byte_end: 10,
+                script: Script::Latin,
+                kind: TokenKind::Word,
+                char_map: Some(vec![
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                ]),
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("հայաստան".to_string()),
+                char_end: 8,
+                byte_end: 16,
+                script: Script::Armenian,
+                kind: TokenKind::Word,
+                char_map: Some(vec![
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                ]),
+                ..Default::default()
+            },
+        ]
    }

    test_normalizer!(LowercaseNormalizer, tokens(), normalizer_result(), normalized_tokens());
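
A quick sanity check of why extending the script match is enough (a standalone sketch using only the Rust standard library, not the crate's normalizer API): Rust's Unicode-aware case handling already covers Armenian, so the existing `any(char::is_uppercase)` guard and the lowercasing step behave exactly as the new test tokens expect.

```rust
fn main() {
    // Armenian capital Tiwn ('Տ', U+054F) is reported as uppercase by the
    // Unicode-aware char::is_uppercase, so the
    // `token.lemma.chars().any(char::is_uppercase)` guard fires for a
    // mixed-case Armenian lemma such as "ՀայասՏան".
    assert!('Տ'.is_uppercase());

    // Lowercasing yields "հայաստան", the lemma expected in
    // normalizer_result(); every Armenian character stays 2 bytes in UTF-8,
    // which is why each char_map entry for this token is (2, 2).
    assert_eq!("ՀայասՏան".to_lowercase(), "հայաստան");
    assert!("հայաստան".chars().all(|c| c.len_utf8() == 2));
}
```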