Skip to content

Commit

Permalink
Merge #328
Browse files Browse the repository at this point in the history
328: Armenian letters should be lowercased r=ManyTheFish a=NarHakobyan

Fixes #325

Co-authored-by: Narek <[email protected]>
  • Loading branch information
meili-bors[bot] and NarHakobyan authored Feb 24, 2025
2 parents af5d046 + 98a479c commit a8c970a
Show file tree
Hide file tree
Showing 4 changed files with 101 additions and 50 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ charabia/target
/data.ms
Cargo.lock
.idea
.DS_Store
146 changes: 98 additions & 48 deletions charabia/src/normalizer/lowercase.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,10 @@ impl CharNormalizer for LowercaseNormalizer {

/// Returns true when the token's script is one of the scripts listed in the `matches!`
/// pattern and the token's lemma contains at least one uppercase character.
fn should_normalize(&self, token: &Token) -> bool {
// https://en.wikipedia.org/wiki/Letter_case#Capitalisation
// NOTE(review): this is a rendered diff; the next two lines appear to be the removed
// version of the condition (Latin, Cyrillic, Greek, Georgian only).
matches!(token.script, Script::Latin | Script::Cyrillic | Script::Greek | Script::Georgian)
&& token.lemma.chars().any(char::is_uppercase)
// NOTE(review): presumably the added version; same condition with `Script::Armenian`
// appended to the accepted scripts.
matches!(
token.script,
Script::Latin | Script::Cyrillic | Script::Greek | Script::Georgian | Script::Armenian
) && token.lemma.chars().any(char::is_uppercase)
}
}

Expand All @@ -41,58 +43,106 @@ mod test {
use crate::token::TokenKind;

// Builds the input tokens handed to `test_normalizer!` as its second argument.
// Lemmas still contain uppercase letters and no `char_map` is set.
fn tokens() -> Vec<Token<'static>> {
// NOTE(review): rendered diff; this first `vec!` appears to be the removed version
// holding only the Latin token.
vec![Token {
lemma: Owned("PascalCase".to_string()),
char_end: 10,
byte_end: 10,
script: Script::Latin,
..Default::default()
}]
// NOTE(review): presumably the added version; the same Latin token plus a mixed-case
// Armenian token.
vec![
Token {
lemma: Owned("PascalCase".to_string()),
char_end: 10,
byte_end: 10,
script: Script::Latin,
..Default::default()
},
Token {
lemma: Owned("ՀայասՏան".to_string()),
// 8 characters spanning 16 bytes.
char_end: 8,
byte_end: 16,
script: Script::Armenian,
..Default::default()
},
]
}

// Builds the tokens handed to `test_normalizer!` as its third argument: the lemmas from
// `tokens()` in lowercase, each with a `char_map` holding one pair per character.
// NOTE(review): the pairs look like (original byte length, normalized byte length) per
// character; confirm against the `Token::char_map` definition.
fn normalizer_result() -> Vec<Token<'static>> {
// NOTE(review): rendered diff; this first `vec!` appears to be the removed version
// holding only the Latin token.
vec![Token {
lemma: Owned("pascalcase".to_string()),
char_end: 10,
byte_end: 10,
script: Script::Latin,
char_map: Some(vec![
(1, 1),
(1, 1),
(1, 1),
(1, 1),
(1, 1),
(1, 1),
(1, 1),
(1, 1),
(1, 1),
(1, 1),
]),
..Default::default()
}]
// NOTE(review): presumably the added version; the Latin token plus the lowercased
// Armenian token.
vec![
Token {
lemma: Owned("pascalcase".to_string()),
char_end: 10,
byte_end: 10,
script: Script::Latin,
// Ten entries, one per character of the lemma.
char_map: Some(vec![
(1, 1),
(1, 1),
(1, 1),
(1, 1),
(1, 1),
(1, 1),
(1, 1),
(1, 1),
(1, 1),
(1, 1),
]),
..Default::default()
},
Token {
lemma: Owned("հայաստան".to_string()),
char_end: 8,
byte_end: 16,
script: Script::Armenian,
// Eight entries, one per character of the lemma.
char_map: Some(vec![
(2, 2),
(2, 2),
(2, 2),
(2, 2),
(2, 2),
(2, 2),
(2, 2),
(2, 2),
]),
..Default::default()
},
]
}

// Builds the tokens handed to `test_normalizer!` as its fourth argument. The values match
// `normalizer_result()` except that every token also sets `kind: TokenKind::Word`.
fn normalized_tokens() -> Vec<Token<'static>> {
// NOTE(review): rendered diff; this first `vec!` appears to be the removed version
// holding only the Latin token.
vec![Token {
lemma: Owned("pascalcase".to_string()),
char_end: 10,
byte_end: 10,
script: Script::Latin,
kind: TokenKind::Word,
char_map: Some(vec![
(1, 1),
(1, 1),
(1, 1),
(1, 1),
(1, 1),
(1, 1),
(1, 1),
(1, 1),
(1, 1),
(1, 1),
]),
..Default::default()
}]
// NOTE(review): presumably the added version; the Latin token plus the lowercased
// Armenian token.
vec![
Token {
lemma: Owned("pascalcase".to_string()),
char_end: 10,
byte_end: 10,
script: Script::Latin,
kind: TokenKind::Word,
// Ten entries, one per character of the lemma.
char_map: Some(vec![
(1, 1),
(1, 1),
(1, 1),
(1, 1),
(1, 1),
(1, 1),
(1, 1),
(1, 1),
(1, 1),
(1, 1),
]),
..Default::default()
},
Token {
lemma: Owned("հայաստան".to_string()),
char_end: 8,
byte_end: 16,
script: Script::Armenian,
kind: TokenKind::Word,
// Eight entries, one per character of the lemma.
char_map: Some(vec![
(2, 2),
(2, 2),
(2, 2),
(2, 2),
(2, 2),
(2, 2),
(2, 2),
(2, 2),
]),
..Default::default()
},
]
}

// Invokes the `test_normalizer!` macro with the normalizer type and the three fixture
// builders. NOTE(review): the macro is defined elsewhere; presumably it generates the
// tests comparing the normalizer's output to these fixtures. Confirm at its definition.
test_normalizer!(LowercaseNormalizer, tokens(), normalizer_result(), normalized_tokens());
Expand Down
2 changes: 1 addition & 1 deletion charabia/src/separators.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ pub const CONTEXT_SEPARATORS: &[&str] = &[
"᠆", // Mongolian Todo Soft Hyphen, mark the end of a paragraph.
"᚛", "᚜", // Oghams, mark start and end of text
"!", ". ", ", ", ";", "?", "¡", "§", "¶", "¿", ";", // Latin
"՜", // Armenian exclamation mark
"՜", "´", // Armenian exclamation mark
"՝", // Armenian comma
"՞", // Armenian question mark
"։", // Armenian full stop or period, used to indicate the end of a sentence
Expand Down
2 changes: 1 addition & 1 deletion charabia/src/token.rs
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ impl Token<'_> {

/// Returns true if the current token is a separator.
///
/// The result is true exactly when `separator_kind()` yields `Some(_)`.
pub fn is_separator(&self) -> bool {
// NOTE(review): rendered diff; the next line appears to be the removed form and the
// one after it the replacement.
self.separator_kind().map_or(false, |_| true)
// NOTE(review): the closure ignores its argument and always returns true, so this is
// equivalent to `self.separator_kind().is_some()`.
self.separator_kind().is_some_and(|_| true)
}

/// Returns Some([`SeparatorKind`]) if the token is a separator and None if it's a word or a stop word.
Expand Down

0 comments on commit a8c970a

Please sign in to comment.