diff --git a/.gitignore b/.gitignore
index 0abaf59..20eddca 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,3 +9,4 @@ charabia/target
 /data.ms
 Cargo.lock
 .idea
+.DS_Store
diff --git a/charabia/src/normalizer/lowercase.rs b/charabia/src/normalizer/lowercase.rs
index 8e2ed03..bf58be7 100644
--- a/charabia/src/normalizer/lowercase.rs
+++ b/charabia/src/normalizer/lowercase.rs
@@ -27,8 +27,10 @@ impl CharNormalizer for LowercaseNormalizer {
 
     fn should_normalize(&self, token: &Token) -> bool {
         // https://en.wikipedia.org/wiki/Letter_case#Capitalisation
-        matches!(token.script, Script::Latin | Script::Cyrillic | Script::Greek | Script::Georgian)
-            && token.lemma.chars().any(char::is_uppercase)
+        matches!(
+            token.script,
+            Script::Latin | Script::Cyrillic | Script::Greek | Script::Georgian | Script::Armenian
+        ) && token.lemma.chars().any(char::is_uppercase)
     }
 }
 
@@ -41,58 +43,106 @@ mod test {
     use crate::token::TokenKind;
 
     fn tokens() -> Vec<Token<'static>> {
-        vec![Token {
-            lemma: Owned("PascalCase".to_string()),
-            char_end: 10,
-            byte_end: 10,
-            script: Script::Latin,
-            ..Default::default()
-        }]
+        vec![
+            Token {
+                lemma: Owned("PascalCase".to_string()),
+                char_end: 10,
+                byte_end: 10,
+                script: Script::Latin,
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("ՀայասՏան".to_string()),
+                char_end: 8,
+                byte_end: 16,
+                script: Script::Armenian,
+                ..Default::default()
+            },
+        ]
     }
 
     fn normalizer_result() -> Vec<Token<'static>> {
-        vec![Token {
-            lemma: Owned("pascalcase".to_string()),
-            char_end: 10,
-            byte_end: 10,
-            script: Script::Latin,
-            char_map: Some(vec![
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-            ]),
-            ..Default::default()
-        }]
+        vec![
+            Token {
+                lemma: Owned("pascalcase".to_string()),
+                char_end: 10,
+                byte_end: 10,
+                script: Script::Latin,
+                char_map: Some(vec![
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                ]),
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("հայաստան".to_string()),
+                char_end: 8,
+                byte_end: 16,
+                script: Script::Armenian,
+                char_map: Some(vec![
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                ]),
+                ..Default::default()
+            },
+        ]
     }
 
     fn normalized_tokens() -> Vec<Token<'static>> {
-        vec![Token {
-            lemma: Owned("pascalcase".to_string()),
-            char_end: 10,
-            byte_end: 10,
-            script: Script::Latin,
-            kind: TokenKind::Word,
-            char_map: Some(vec![
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-            ]),
-            ..Default::default()
-        }]
+        vec![
+            Token {
+                lemma: Owned("pascalcase".to_string()),
+                char_end: 10,
+                byte_end: 10,
+                script: Script::Latin,
+                kind: TokenKind::Word,
+                char_map: Some(vec![
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                ]),
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("հայաստան".to_string()),
+                char_end: 8,
+                byte_end: 16,
+                script: Script::Armenian,
+                kind: TokenKind::Word,
+                char_map: Some(vec![
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                ]),
+                ..Default::default()
+            },
+        ]
     }
 
     test_normalizer!(LowercaseNormalizer, tokens(), normalizer_result(), normalized_tokens());
diff --git a/charabia/src/separators.rs b/charabia/src/separators.rs
index 29fc26a..eba5d42 100644
--- a/charabia/src/separators.rs
+++ b/charabia/src/separators.rs
@@ -81,7 +81,7 @@ pub const CONTEXT_SEPARATORS: &[&str] = &[
     "᠆", // Mongolian Todo Soft Hyphen, mark the end of a paragraph.
     "᚛", "᚜", // Oghams, mark start and end of text
     "!", ". ", ", ", ";", "?", "¡", "§", "¶", "¿", ";", // Latin
-    "՜", // Armenian exclamation mark
+    "՜", "՛", // Armenian exclamation mark and emphasis mark
     "՝", // Armenian comma
     "՞", // Armenian question mark
     "։", // Armenian full stop or period, used to indicate the end of a sentence
diff --git a/charabia/src/token.rs b/charabia/src/token.rs
index 60923da..9476ab4 100644
--- a/charabia/src/token.rs
+++ b/charabia/src/token.rs
@@ -113,7 +113,7 @@ impl Token<'_> {
 
     /// Returns true if the current token is a separator.
     pub fn is_separator(&self) -> bool {
-        self.separator_kind().map_or(false, |_| true)
+        self.separator_kind().is_some()
     }
 
     /// Returns Some([`SeparatorKind`]) if the token is a separator and None if it's a word or a stop word.