From 88aadbd140f2d9ceace79c1b74a4ba242ff28063 Mon Sep 17 00:00:00 2001
From: Narek
Date: Fri, 7 Feb 2025 19:13:39 +0400
Subject: [PATCH 1/6] Armenian letters should be lowercased

Fixes #325
---
 charabia/src/normalizer/lowercase.rs | 2 +-
 charabia/src/separators.rs           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/charabia/src/normalizer/lowercase.rs b/charabia/src/normalizer/lowercase.rs
index 8e2ed031..6ff5df00 100644
--- a/charabia/src/normalizer/lowercase.rs
+++ b/charabia/src/normalizer/lowercase.rs
@@ -27,7 +27,7 @@ impl CharNormalizer for LowercaseNormalizer {
 
     fn should_normalize(&self, token: &Token) -> bool {
         // https://en.wikipedia.org/wiki/Letter_case#Capitalisation
-        matches!(token.script, Script::Latin | Script::Cyrillic | Script::Greek | Script::Georgian)
+        matches!(token.script, Script::Latin | Script::Cyrillic | Script::Greek | Script::Georgian | Script::Armenian)
             && token.lemma.chars().any(char::is_uppercase)
     }
 }
diff --git a/charabia/src/separators.rs b/charabia/src/separators.rs
index 29fc26ac..eba5d422 100644
--- a/charabia/src/separators.rs
+++ b/charabia/src/separators.rs
@@ -81,7 +81,7 @@ pub const CONTEXT_SEPARATORS: &[&str] = &[
     "᠆", // Mongolian Todo Soft Hyphen, mark the end of a paragraph.
     "᚛", "᚜", // Oghams, mark start and end of text
     "!", ". ", ", ", ";", "?", "¡", "§", "¶", "¿", ";", // Latin
-    "՜", // Armenian exclamation mark
+    "՜", "´", // Armenian exclamation mark
     "՝", // Armenian comma
     "՞", // Armenian question mark
     "։", // Armenian full stop or period, used to indicate the end of a sentence

From 99f1841fc358effeb4deadcae11e2946480768c7 Mon Sep 17 00:00:00 2001
From: Narek
Date: Mon, 10 Feb 2025 18:53:32 +0400
Subject: [PATCH 2/6] add .DS_Store gitignore

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 0abaf594..20eddcac 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,3 +9,4 @@ charabia/target
 /data.ms
 Cargo.lock
 .idea
+.DS_Store

From d929c011711585ac50d3677c36f71718ec144d24 Mon Sep 17 00:00:00 2001
From: Narek
Date: Mon, 10 Feb 2025 19:03:27 +0400
Subject: [PATCH 3/6] fix linter errors

---
 charabia/src/normalizer/lowercase.rs | 6 ++++--
 charabia/src/token.rs                | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/charabia/src/normalizer/lowercase.rs b/charabia/src/normalizer/lowercase.rs
index 6ff5df00..dc543fc7 100644
--- a/charabia/src/normalizer/lowercase.rs
+++ b/charabia/src/normalizer/lowercase.rs
@@ -27,8 +27,10 @@ impl CharNormalizer for LowercaseNormalizer {
 
     fn should_normalize(&self, token: &Token) -> bool {
         // https://en.wikipedia.org/wiki/Letter_case#Capitalisation
-        matches!(token.script, Script::Latin | Script::Cyrillic | Script::Greek | Script::Georgian | Script::Armenian)
-            && token.lemma.chars().any(char::is_uppercase)
+        matches!(
+            token.script,
+            Script::Latin | Script::Cyrillic | Script::Greek | Script::Georgian | Script::Armenian
+        ) && token.lemma.chars().any(char::is_uppercase)
     }
 }
 
diff --git a/charabia/src/token.rs b/charabia/src/token.rs
index 60923dab..9476ab4b 100644
--- a/charabia/src/token.rs
+++ b/charabia/src/token.rs
@@ -113,7 +113,7 @@ impl Token<'_> {
 
     /// Returns true if the current token is a separator.
     pub fn is_separator(&self) -> bool {
-        self.separator_kind().map_or(false, |_| true)
+        self.separator_kind().is_some_and(|_| true)
     }
 
     /// Returns Some([`SeparatorKind`]) if the token is a separator and None if it's a word or a stop word.
From 98c1db81cd98ff301f0b0ee10ffc3be5d4c90b58 Mon Sep 17 00:00:00 2001
From: Narek
Date: Wed, 12 Feb 2025 14:52:33 +0400
Subject: [PATCH 4/6] add test for Armenian

---
 charabia/src/normalizer/lowercase.rs | 106 ++++++++++++++++-----------
 1 file changed, 63 insertions(+), 43 deletions(-)

diff --git a/charabia/src/normalizer/lowercase.rs b/charabia/src/normalizer/lowercase.rs
index dc543fc7..5b013ca0 100644
--- a/charabia/src/normalizer/lowercase.rs
+++ b/charabia/src/normalizer/lowercase.rs
@@ -44,57 +44,77 @@ mod test {
 
     fn tokens() -> Vec<Token<'static>> {
         vec![Token {
-            lemma: Owned("PascalCase".to_string()),
-            char_end: 10,
-            byte_end: 10,
-            script: Script::Latin,
-            ..Default::default()
-        }]
+                lemma: Owned("PascalCase".to_string()),
+                char_end: 10,
+                byte_end: 10,
+                script: Script::Latin,
+                ..Default::default()
+            }, Token {
+                lemma: Owned("ֆիզիկոսը".to_string()),
+                char_end: 8,
+                byte_end: 16,
+                script: Script::Armenian,
+                ..Default::default()
+            }]
     }
 
     fn normalizer_result() -> Vec<Token<'static>> {
         vec![Token {
-            lemma: Owned("pascalcase".to_string()),
-            char_end: 10,
-            byte_end: 10,
-            script: Script::Latin,
-            char_map: Some(vec![
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-            ]),
-            ..Default::default()
-        }]
+                lemma: Owned("pascalcase".to_string()),
+                char_end: 10,
+                byte_end: 10,
+                script: Script::Latin,
+                char_map: Some(vec![
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                ]),
+                ..Default::default()
+            }, Token {
+                lemma: Owned("ֆիզիկոսը".to_string()),
+                char_end: 8,
+                byte_end: 16,
+                script: Script::Armenian,
+                ..Default::default()
+            }]
     }
 
     fn normalized_tokens() -> Vec<Token<'static>> {
         vec![Token {
-            lemma: Owned("pascalcase".to_string()),
-            char_end: 10,
-            byte_end: 10,
-            script: Script::Latin,
-            kind: TokenKind::Word,
-            char_map: Some(vec![
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-            ]),
-            ..Default::default()
-        }]
+                lemma: Owned("pascalcase".to_string()),
+                char_end: 10,
+                byte_end: 10,
+                script: Script::Latin,
+                kind: TokenKind::Word,
+                char_map: Some(vec![
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                ]),
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("ֆիզիկոսը".to_string()),
+                char_end: 8,
+                byte_end: 16,
+                script: Script::Armenian,
+                kind: TokenKind::Word,
+                ..Default::default()
+            }]
     }
 
     test_normalizer!(LowercaseNormalizer, tokens(), normalizer_result(), normalized_tokens());

From 5f50203c3bf58bbc7efb51fc2447243f9cba1bec Mon Sep 17 00:00:00 2001
From: Narek
Date: Thu, 13 Feb 2025 14:30:03 +0400
Subject: [PATCH 5/6] fix linter issues

---
 charabia/src/normalizer/lowercase.rs | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/charabia/src/normalizer/lowercase.rs b/charabia/src/normalizer/lowercase.rs
index 5b013ca0..f80e2328 100644
--- a/charabia/src/normalizer/lowercase.rs
+++ b/charabia/src/normalizer/lowercase.rs
@@ -43,23 +43,27 @@ mod test {
     use crate::token::TokenKind;
 
     fn tokens() -> Vec<Token<'static>> {
-        vec![Token {
+        vec![
+            Token {
                 lemma: Owned("PascalCase".to_string()),
                 char_end: 10,
                 byte_end: 10,
                 script: Script::Latin,
                 ..Default::default()
-            }, Token {
+            },
+            Token {
                 lemma: Owned("ֆիզիկոսը".to_string()),
                 char_end: 8,
                 byte_end: 16,
                 script: Script::Armenian,
                 ..Default::default()
-            }]
+            },
+        ]
     }
 
     fn normalizer_result() -> Vec<Token<'static>> {
-        vec![Token {
+        vec![
+            Token {
                 lemma: Owned("pascalcase".to_string()),
                 char_end: 10,
                 byte_end: 10,
                 script: Script::Latin,
                 char_map: Some(vec![
                     (1, 1),
                     (1, 1),
                     (1, 1),
                     (1, 1),
                     (1, 1),
                     (1, 1),
                     (1, 1),
                     (1, 1),
                     (1, 1),
                     (1, 1),
                 ]),
                 ..Default::default()
-            }, Token {
+            },
+            Token {
                 lemma: Owned("ֆիզիկոսը".to_string()),
                 char_end: 8,
                 byte_end: 16,
                 script: Script::Armenian,
                 ..Default::default()
-            }]
+            },
+        ]
     }
 
     fn normalized_tokens() -> Vec<Token<'static>> {
-        vec![Token {
+        vec![
+            Token {
                 lemma: Owned("pascalcase".to_string()),
                 char_end: 10,
                 byte_end: 10,
                 script: Script::Latin,
                 kind: TokenKind::Word,
                 char_map: Some(vec![
                     (1, 1),
                     (1, 1),
                     (1, 1),
                     (1, 1),
                     (1, 1),
                     (1, 1),
                     (1, 1),
                     (1, 1),
                     (1, 1),
                     (1, 1),
                 ]),
                 ..Default::default()
             },
             Token {
                 lemma: Owned("ֆիզիկոսը".to_string()),
                 char_end: 8,
                 byte_end: 16,
                 script: Script::Armenian,
                 kind: TokenKind::Word,
                 ..Default::default()
-            }]
+            },
+        ]
     }
 
     test_normalizer!(LowercaseNormalizer, tokens(), normalizer_result(), normalized_tokens());

From 98a479c48d8056a1263d404ee7abf2b5eececb35 Mon Sep 17 00:00:00 2001
From: Narek
Date: Mon, 17 Feb 2025 13:44:53 +0400
Subject: [PATCH 6/6] change Armenian word for test

---
 charabia/src/normalizer/lowercase.rs | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/charabia/src/normalizer/lowercase.rs b/charabia/src/normalizer/lowercase.rs
index f80e2328..bf58be7a 100644
--- a/charabia/src/normalizer/lowercase.rs
+++ b/charabia/src/normalizer/lowercase.rs
@@ -52,7 +52,7 @@ mod test {
                 ..Default::default()
             },
             Token {
-                lemma: Owned("ֆիզիկոսը".to_string()),
+                lemma: Owned("ՀայասՏան".to_string()),
                 char_end: 8,
                 byte_end: 16,
                 script: Script::Armenian,
@@ -83,10 +83,20 @@ mod test {
                 ..Default::default()
             },
             Token {
-                lemma: Owned("ֆիզիկոսը".to_string()),
+                lemma: Owned("հայաստան".to_string()),
                 char_end: 8,
                 byte_end: 16,
                 script: Script::Armenian,
+                char_map: Some(vec![
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                ]),
                 ..Default::default()
             },
         ]
@@ -115,11 +125,21 @@ mod test {
                 ..Default::default()
             },
             Token {
-                lemma: Owned("ֆիզիկոսը".to_string()),
+                lemma: Owned("հայաստան".to_string()),
                 char_end: 8,
                 byte_end: 16,
                 script: Script::Armenian,
                 kind: TokenKind::Word,
+                char_map: Some(vec![
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                    (2, 2),
+                ]),
                 ..Default::default()
             },
         ]
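
Reviewer note (not part of the patch series): a minimal sketch of how the behaviour added by these commits can be exercised end to end, assuming the `Tokenize` trait from charabia's README; the Armenian sample text is illustrative and is not taken from the test suite above.

use charabia::Tokenize;

fn main() {
    // Mixed-case Armenian input; with Script::Armenian now accepted by
    // LowercaseNormalizer, no uppercase letters should survive in the lemmas.
    let text = "Հայաստան ՖԻԶԻԿԱ";
    for token in text.tokenize() {
        if !token.is_separator() {
            assert!(!token.lemma().chars().any(char::is_uppercase));
            println!("{}", token.lemma());
        }
    }
}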