From 98c1db81cd98ff301f0b0ee10ffc3be5d4c90b58 Mon Sep 17 00:00:00 2001
From: Narek
Date: Wed, 12 Feb 2025 14:52:33 +0400
Subject: [PATCH] add test for Armenian

---
 charabia/src/normalizer/lowercase.rs | 106 ++++++++++++++++-----
 1 file changed, 63 insertions(+), 43 deletions(-)

diff --git a/charabia/src/normalizer/lowercase.rs b/charabia/src/normalizer/lowercase.rs
index dc543fc..5b013ca 100644
--- a/charabia/src/normalizer/lowercase.rs
+++ b/charabia/src/normalizer/lowercase.rs
@@ -44,57 +44,77 @@ mod test {
 
     fn tokens() -> Vec<Token<'static>> {
         vec![Token {
-            lemma: Owned("PascalCase".to_string()),
-            char_end: 10,
-            byte_end: 10,
-            script: Script::Latin,
-            ..Default::default()
-        }]
+            lemma: Owned("PascalCase".to_string()),
+            char_end: 10,
+            byte_end: 10,
+            script: Script::Latin,
+            ..Default::default()
+        }, Token {
+            lemma: Owned("ֆիզիկոսը".to_string()),
+            char_end: 8,
+            byte_end: 16,
+            script: Script::Armenian,
+            ..Default::default()
+        }]
     }
 
     fn normalizer_result() -> Vec<Token<'static>> {
         vec![Token {
-            lemma: Owned("pascalcase".to_string()),
-            char_end: 10,
-            byte_end: 10,
-            script: Script::Latin,
-            char_map: Some(vec![
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-            ]),
-            ..Default::default()
-        }]
+            lemma: Owned("pascalcase".to_string()),
+            char_end: 10,
+            byte_end: 10,
+            script: Script::Latin,
+            char_map: Some(vec![
+                (1, 1),
+                (1, 1),
+                (1, 1),
+                (1, 1),
+                (1, 1),
+                (1, 1),
+                (1, 1),
+                (1, 1),
+                (1, 1),
+                (1, 1),
+            ]),
+            ..Default::default()
+        }, Token {
+            lemma: Owned("ֆիզիկոսը".to_string()),
+            char_end: 8,
+            byte_end: 16,
+            script: Script::Armenian,
+            ..Default::default()
+        }]
     }
 
     fn normalized_tokens() -> Vec<Token<'static>> {
         vec![Token {
-            lemma: Owned("pascalcase".to_string()),
-            char_end: 10,
-            byte_end: 10,
-            script: Script::Latin,
-            kind: TokenKind::Word,
-            char_map: Some(vec![
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-                (1, 1),
-            ]),
-            ..Default::default()
-        }]
+            lemma: Owned("pascalcase".to_string()),
+            char_end: 10,
+            byte_end: 10,
+            script: Script::Latin,
+            kind: TokenKind::Word,
+            char_map: Some(vec![
+                (1, 1),
+                (1, 1),
+                (1, 1),
+                (1, 1),
+                (1, 1),
+                (1, 1),
+                (1, 1),
+                (1, 1),
+                (1, 1),
+                (1, 1),
+            ]),
+            ..Default::default()
+        },
+        Token {
+            lemma: Owned("ֆիզիկոսը".to_string()),
+            char_end: 8,
+            byte_end: 16,
+            script: Script::Armenian,
+            kind: TokenKind::Word,
+            ..Default::default()
+        }]
     }
 
     test_normalizer!(LowercaseNormalizer, tokens(), normalizer_result(), normalized_tokens());