Skip to content

Commit a8c970a

Browse files
Merge #328
328: Armenian letters should be lowercased

r=ManyTheFish a=NarHakobyan

Fixes #325

Co-authored-by: Narek <[email protected]>
2 parents af5d046 + 98a479c commit a8c970a

File tree

4 files changed

+101
-50
lines changed

4 files changed

+101
-50
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,4 @@ charabia/target
99
/data.ms
1010
Cargo.lock
1111
.idea
12+
.DS_Store

charabia/src/normalizer/lowercase.rs

Lines changed: 98 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,10 @@ impl CharNormalizer for LowercaseNormalizer {
2727

2828
fn should_normalize(&self, token: &Token) -> bool {
2929
// https://en.wikipedia.org/wiki/Letter_case#Capitalisation
30-
matches!(token.script, Script::Latin | Script::Cyrillic | Script::Greek | Script::Georgian)
31-
&& token.lemma.chars().any(char::is_uppercase)
30+
matches!(
31+
token.script,
32+
Script::Latin | Script::Cyrillic | Script::Greek | Script::Georgian | Script::Armenian
33+
) && token.lemma.chars().any(char::is_uppercase)
3234
}
3335
}
3436

@@ -41,58 +43,106 @@ mod test {
4143
use crate::token::TokenKind;
4244

4345
fn tokens() -> Vec<Token<'static>> {
44-
vec![Token {
45-
lemma: Owned("PascalCase".to_string()),
46-
char_end: 10,
47-
byte_end: 10,
48-
script: Script::Latin,
49-
..Default::default()
50-
}]
46+
vec![
47+
Token {
48+
lemma: Owned("PascalCase".to_string()),
49+
char_end: 10,
50+
byte_end: 10,
51+
script: Script::Latin,
52+
..Default::default()
53+
},
54+
Token {
55+
lemma: Owned("ՀայասՏան".to_string()),
56+
char_end: 8,
57+
byte_end: 16,
58+
script: Script::Armenian,
59+
..Default::default()
60+
},
61+
]
5162
}
5263

5364
fn normalizer_result() -> Vec<Token<'static>> {
54-
vec![Token {
55-
lemma: Owned("pascalcase".to_string()),
56-
char_end: 10,
57-
byte_end: 10,
58-
script: Script::Latin,
59-
char_map: Some(vec![
60-
(1, 1),
61-
(1, 1),
62-
(1, 1),
63-
(1, 1),
64-
(1, 1),
65-
(1, 1),
66-
(1, 1),
67-
(1, 1),
68-
(1, 1),
69-
(1, 1),
70-
]),
71-
..Default::default()
72-
}]
65+
vec![
66+
Token {
67+
lemma: Owned("pascalcase".to_string()),
68+
char_end: 10,
69+
byte_end: 10,
70+
script: Script::Latin,
71+
char_map: Some(vec![
72+
(1, 1),
73+
(1, 1),
74+
(1, 1),
75+
(1, 1),
76+
(1, 1),
77+
(1, 1),
78+
(1, 1),
79+
(1, 1),
80+
(1, 1),
81+
(1, 1),
82+
]),
83+
..Default::default()
84+
},
85+
Token {
86+
lemma: Owned("հայաստան".to_string()),
87+
char_end: 8,
88+
byte_end: 16,
89+
script: Script::Armenian,
90+
char_map: Some(vec![
91+
(2, 2),
92+
(2, 2),
93+
(2, 2),
94+
(2, 2),
95+
(2, 2),
96+
(2, 2),
97+
(2, 2),
98+
(2, 2),
99+
]),
100+
..Default::default()
101+
},
102+
]
73103
}
74104

75105
fn normalized_tokens() -> Vec<Token<'static>> {
76-
vec![Token {
77-
lemma: Owned("pascalcase".to_string()),
78-
char_end: 10,
79-
byte_end: 10,
80-
script: Script::Latin,
81-
kind: TokenKind::Word,
82-
char_map: Some(vec![
83-
(1, 1),
84-
(1, 1),
85-
(1, 1),
86-
(1, 1),
87-
(1, 1),
88-
(1, 1),
89-
(1, 1),
90-
(1, 1),
91-
(1, 1),
92-
(1, 1),
93-
]),
94-
..Default::default()
95-
}]
106+
vec![
107+
Token {
108+
lemma: Owned("pascalcase".to_string()),
109+
char_end: 10,
110+
byte_end: 10,
111+
script: Script::Latin,
112+
kind: TokenKind::Word,
113+
char_map: Some(vec![
114+
(1, 1),
115+
(1, 1),
116+
(1, 1),
117+
(1, 1),
118+
(1, 1),
119+
(1, 1),
120+
(1, 1),
121+
(1, 1),
122+
(1, 1),
123+
(1, 1),
124+
]),
125+
..Default::default()
126+
},
127+
Token {
128+
lemma: Owned("հայաստան".to_string()),
129+
char_end: 8,
130+
byte_end: 16,
131+
script: Script::Armenian,
132+
kind: TokenKind::Word,
133+
char_map: Some(vec![
134+
(2, 2),
135+
(2, 2),
136+
(2, 2),
137+
(2, 2),
138+
(2, 2),
139+
(2, 2),
140+
(2, 2),
141+
(2, 2),
142+
]),
143+
..Default::default()
144+
},
145+
]
96146
}
97147

98148
test_normalizer!(LowercaseNormalizer, tokens(), normalizer_result(), normalized_tokens());

charabia/src/separators.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ pub const CONTEXT_SEPARATORS: &[&str] = &[
8181
"᠆", // Mongolian Todo Soft Hyphen, mark the end of a paragraph.
8282
"᚛", "᚜", // Oghams, mark start and end of text
8383
"!", ". ", ", ", ";", "?", "¡", "§", "¶", "¿", ";", // Latin
84-
"՜", // Armenian exclamation mark
84+
"՜", "´", // Armenian exclamation mark
8585
"՝", // Armenian comma
8686
"՞", // Armenian question mark
8787
"։", // Armenian full stop or period, used to indicate the end of a sentence

charabia/src/token.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ impl Token<'_> {
113113

114114
/// Returns true if the current token is a separator.
115115
pub fn is_separator(&self) -> bool {
116-
self.separator_kind().map_or(false, |_| true)
116+
self.separator_kind().is_some_and(|_| true)
117117
}
118118

119119
/// Returns Some([`SeparatorKind`]) if the token is a separator and None if it's a word or a stop word.

0 commit comments

Comments (0)