From 0bccf7b4c030e2683f175dfa715f4e739511eb88 Mon Sep 17 00:00:00 2001
From: Arty I
Date: Sat, 22 Jun 2024 22:50:56 +0200
Subject: [PATCH 1/2] Normalizer for Russian

---
 charabia/Cargo.toml                           |   5 +-
 .../normalizer/compatibility_decomposition.rs |  24 ++++
 charabia/src/normalizer/mod.rs                |   6 +
 charabia/src/normalizer/russian.rs            | 134 ++++++++++++++++++
 4 files changed, 168 insertions(+), 1 deletion(-)
 create mode 100644 charabia/src/normalizer/russian.rs

diff --git a/charabia/Cargo.toml b/charabia/Cargo.toml
index 15e01d5a..d3315027 100644
--- a/charabia/Cargo.toml
+++ b/charabia/Cargo.toml
@@ -35,7 +35,7 @@ litemap = "0.7.2"
 zerovec = "0.10.1"
 
 [features]
-default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese"]
+default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese", "russian"]
 
 # allow chinese specialized tokenization
 chinese = ["chinese-segmentation", "chinese-normalization"]
@@ -61,6 +61,9 @@ thai = []
 # allow greek specialized tokenization
 greek = []
 
+# allow russian specialized tokenization
+russian = []
+
 # allow splitting camelCase latin words
 latin-camelcase = ["dep:finl_unicode"]
diff --git a/charabia/src/normalizer/compatibility_decomposition.rs b/charabia/src/normalizer/compatibility_decomposition.rs
index 84b5d390..661a458e 100644
--- a/charabia/src/normalizer/compatibility_decomposition.rs
+++ b/charabia/src/normalizer/compatibility_decomposition.rs
@@ -52,6 +52,13 @@ mod test {
     // base tokens to normalize.
     fn tokens() -> Vec<Token<'static>> {
         vec![
+            Token {
+                lemma: Owned("Ёё".to_string()),
+                char_end: 2,
+                byte_end: 2,
+                script: Script::Cyrillic,
+                ..Default::default()
+            },
             Token {
                 // Decompose 1E69 to 0073 0323 0307
                 lemma: Owned("ṩ ṩ".to_string()),
                 char_end: 2,
@@ -74,6 +81,14 @@ mod test {
     // expected result of the current Normalizer.
     fn normalizer_result() -> Vec<Token<'static>> {
         vec![
+            Token {
+                lemma: Owned("Е\u{308}е\u{308}".to_string()),
+                char_end: 2,
+                byte_end: 2,
+                char_map: Some(vec![(2, 4), (2, 4)]),
+                script: Script::Cyrillic,
+                ..Default::default()
+            },
             Token {
                 lemma: Owned("s\u{0323}\u{0307} s\u{0323}\u{0307}".to_string()),
                 char_end: 2,
@@ -108,6 +123,15 @@ mod test {
     // expected result of the complete Normalizer pipeline.
     fn normalized_tokens() -> Vec<Token<'static>> {
         vec![
+            Token {
+                lemma: Owned("ее".to_string()),
+                char_end: 2,
+                byte_end: 2,
+                script: Script::Cyrillic,
+                char_map: Some(vec![(2, 2), (2, 2)]),
+                kind: TokenKind::Word,
+                ..Default::default()
+            },
             Token {
                 lemma: Owned("s s".to_string()),
                 char_end: 2,
diff --git a/charabia/src/normalizer/mod.rs b/charabia/src/normalizer/mod.rs
index e4551961..5bd14f0b 100644
--- a/charabia/src/normalizer/mod.rs
+++ b/charabia/src/normalizer/mod.rs
@@ -19,6 +19,8 @@ use self::quote::QuoteNormalizer;
 use self::swedish_recomposition::SwedishRecompositionNormalizer;
 #[cfg(feature = "vietnamese")]
 pub use self::vietnamese::VietnameseNormalizer;
+#[cfg(feature = "russian")]
+pub use self::russian::RussianNormalizer;
 use crate::segmenter::SegmentedTokenIter;
 use crate::Token;
@@ -41,6 +43,8 @@ mod quote;
 mod swedish_recomposition;
 #[cfg(feature = "vietnamese")]
 mod vietnamese;
+#[cfg(feature = "russian")]
+mod russian;
 
 mod ae_oe_normalizer;
@@ -71,6 +75,8 @@ pub static LOSSY_NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
     Box::new(NonspacingMarkNormalizer),
     #[cfg(feature = "vietnamese")]
     Box::new(VietnameseNormalizer),
+    #[cfg(feature = "russian")]
+    Box::new(RussianNormalizer)
 ]
 });
diff --git a/charabia/src/normalizer/russian.rs b/charabia/src/normalizer/russian.rs
new file mode 100644
index 00000000..da6987ed
--- /dev/null
+++ b/charabia/src/normalizer/russian.rs
@@ -0,0 +1,134 @@
+use std::borrow::Cow;
+
+use super::{Normalizer, NormalizerOption};
+use aho_corasick::AhoCorasick;
+use once_cell::sync::Lazy;
+use crate::{Script, Token};
+
+pub struct RussianNormalizer;
+
+static MATCHING_STR: Lazy<AhoCorasick> = Lazy::new(|| {
+    AhoCorasick::new(["Е\u{308}", "е\u{308}"]).unwrap()
+});
+
+impl Normalizer for RussianNormalizer {
+    fn normalize<'o>(&self, mut token: Token<'o>, options: &NormalizerOption) -> Token<'o> {
+        match token.char_map.take() {
+            Some(mut char_map) => {
+                // if a char_map already exists, iterate over it to reconstruct sub-strings.
+                let mut lemma = String::new();
+                let mut tail = token.lemma.as_ref();
+                let mut normalized = String::new();
+                for (_, normalized_len) in char_map.iter_mut() {
+                    let (head, t) = tail.split_at(*normalized_len as usize);
+                    tail = t;
+                    normalized.clear();
+                    // then normalize each sub-string, recomputing the size in the char_map.
+                    let mut peekable = head.chars().peekable();
+                    while let Some(c) = peekable.next() {
+                        let (c, peek_consumed) = normalize_russian(c, peekable.peek());
+
+                        if peek_consumed {
+                            peekable.next();
+                        }
+
+                        normalized.push(c);
+                    }
+
+                    *normalized_len = normalized.len() as u8;
+                    lemma.push_str(normalized.as_ref());
+                }
+
+                token.lemma = Cow::Owned(lemma);
+                token.char_map = Some(char_map);
+            }
+            None => {
+                // if no char_map exists, iterate over the lemma recomposing characters.
+                let mut char_map = Vec::new();
+                let mut lemma = String::new();
+                let mut peekable = token.lemma.chars().peekable();
+                while let Some(c) = peekable.next() {
+                    let (normalized, peek_consumed) = normalize_russian(c, peekable.peek());
+
+                    if peek_consumed {
+                        peekable.next();
+                    }
+
+                    if options.create_char_map {
+                        char_map.push((c.len_utf8() as u8, normalized.len_utf8() as u8));
+                    }
+                    lemma.push(normalized);
+                }
+                token.lemma = Cow::Owned(lemma);
+                if options.create_char_map {
+                    token.char_map = Some(char_map);
+                }
+            }
+        }
+
+        token
+    }
+
+    fn should_normalize(&self, token: &Token) -> bool {
+        token.script == Script::Cyrillic && MATCHING_STR.is_match(token.lemma())
+    }
+}
+
+// https://en.wikipedia.org/wiki/Russian_alphabet
+// Only the decomposed forms are handled here: compatibility decomposition runs
+// first and has already split the 1-codepoint forms Ё/ё into base letter + U+0308.
+fn normalize_russian(current: char, next: Option<&char>) -> (char, bool) {
+    match (current, next) {
+        // ё -> е, grammatically permissible and common in writing
+        ('Е', Some('\u{308}')) => ('Е', true),
+        ('е', Some('\u{308}')) => ('е', true),
+
+        (c, _) => (c, false),
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use std::borrow::Cow::Owned;
+
+    use crate::normalizer::test::test_normalizer;
+    use crate::normalizer::Normalizer;
+    use crate::token::TokenKind;
+
+    // base tokens to normalize.
+    fn tokens() -> Vec<Token<'static>> {
+        vec![Token {
+            lemma: Owned("Ёё".to_string()),
+            char_end: 2,
+            byte_end: 2,
+            script: Script::Cyrillic,
+            ..Default::default()
+        }]
+    }
+
+    // expected result of the current Normalizer.
+    fn normalizer_result() -> Vec<Token<'static>> {
+        vec![Token {
+            lemma: Owned("Ёё".to_string()),
+            char_end: 2,
+            byte_end: 2,
+            script: Script::Cyrillic,
+            char_map: None,
+            ..Default::default()
+        }]
+    }
+
+    // expected result of the complete Normalizer pipeline.
+    fn normalized_tokens() -> Vec<Token<'static>> {
+        vec![Token {
+            lemma: Owned("ее".to_string()),
+            char_end: 2,
+            byte_end: 2,
+            script: Script::Cyrillic,
+            char_map: Some(vec![(2, 2), (2, 2)]),
+            kind: TokenKind::Word,
+            ..Default::default()
+        }]
+    }
+
+    test_normalizer!(RussianNormalizer, tokens(), normalizer_result(), normalized_tokens());
+}

From 1872257491991f43732de352e5c9b0e9a8307ab3 Mon Sep 17 00:00:00 2001
From: Arty I
Date: Mon, 8 Jul 2024 12:52:44 +0200
Subject: [PATCH 2/2] Applying Rustfmt

---
 charabia/src/normalizer/mod.rs     | 10 +++++-----
 charabia/src/normalizer/russian.rs | 11 +++++------
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/charabia/src/normalizer/mod.rs b/charabia/src/normalizer/mod.rs
index 5bd14f0b..fbd2ec2d 100644
--- a/charabia/src/normalizer/mod.rs
+++ b/charabia/src/normalizer/mod.rs
@@ -15,12 +15,12 @@ pub use self::japanese::JapaneseNormalizer;
 pub use self::lowercase::LowercaseNormalizer;
 use self::nonspacing_mark::NonspacingMarkNormalizer;
 use self::quote::QuoteNormalizer;
+#[cfg(feature = "russian")]
+pub use self::russian::RussianNormalizer;
 #[cfg(feature = "swedish-recomposition")]
 use self::swedish_recomposition::SwedishRecompositionNormalizer;
 #[cfg(feature = "vietnamese")]
 pub use self::vietnamese::VietnameseNormalizer;
-#[cfg(feature = "russian")]
-pub use self::russian::RussianNormalizer;
 use crate::segmenter::SegmentedTokenIter;
 use crate::Token;
@@ -39,12 +39,12 @@ mod japanese;
 mod lowercase;
 mod nonspacing_mark;
 mod quote;
+#[cfg(feature = "russian")]
+mod russian;
 #[cfg(feature = "swedish-recomposition")]
 mod swedish_recomposition;
 #[cfg(feature = "vietnamese")]
 mod vietnamese;
-#[cfg(feature = "russian")]
-mod russian;
 
 mod ae_oe_normalizer;
@@ -76,7 +76,7 @@ pub static LOSSY_NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
     #[cfg(feature = "vietnamese")]
     Box::new(VietnameseNormalizer),
     #[cfg(feature = "russian")]
-    Box::new(RussianNormalizer)
+    Box::new(RussianNormalizer),
 ]
 });
diff --git a/charabia/src/normalizer/russian.rs b/charabia/src/normalizer/russian.rs
index da6987ed..5a4cf1b1 100644
--- a/charabia/src/normalizer/russian.rs
+++ b/charabia/src/normalizer/russian.rs
@@ -1,15 +1,14 @@
 use std::borrow::Cow;
 
 use super::{Normalizer, NormalizerOption};
+use crate::{Script, Token};
 use aho_corasick::AhoCorasick;
 use once_cell::sync::Lazy;
-use crate::{Script, Token};
 
 pub struct RussianNormalizer;
 
-static MATCHING_STR: Lazy<AhoCorasick> = Lazy::new(|| {
-    AhoCorasick::new(["Е\u{308}", "е\u{308}"]).unwrap()
-});
+static MATCHING_STR: Lazy<AhoCorasick> =
+    Lazy::new(|| AhoCorasick::new(["Е\u{308}", "е\u{308}"]).unwrap());
 
 impl Normalizer for RussianNormalizer {
     fn normalize<'o>(&self, mut token: Token<'o>, options: &NormalizerOption) -> Token<'o> {
@@ -111,8 +110,8 @@ mod test {
             lemma: Owned("Ёё".to_string()),
             char_end: 2,
             byte_end: 2,
-            script: Script::Cyrillic,
-            char_map: None,
+            script: Script::Cyrillic,
+            char_map: None,
             ..Default::default()
         }]
     }
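
Note (editor's sketch, not part of the patches): the normalizer only matches the
decomposed sequences "Е\u{308}" / "е\u{308}" because the compatibility-decomposition
step runs earlier in the pipeline and splits the precomposed Ё (U+0401) / ё (U+0451)
into a base letter plus combining diaeresis (U+0308); the Russian normalizer then
drops the diaeresis after "Е"/"е". A minimal standalone sketch of that two-step
folding, assuming the `unicode-normalization` crate as a stand-in for charabia's
internal decomposition normalizer:

    use unicode_normalization::UnicodeNormalization;

    fn fold_yo(input: &str) -> String {
        // Step 1: compatibility decomposition, e.g. 'ё' -> 'е' + U+0308.
        let decomposed: String = input.nfkd().collect();

        // Step 2: consume the combining diaeresis when it follows 'е'/'Е',
        // mirroring the peek-and-consume loop in `normalize_russian`.
        let mut out = String::with_capacity(decomposed.len());
        let mut chars = decomposed.chars().peekable();
        while let Some(c) = chars.next() {
            out.push(c);
            if (c == 'е' || c == 'Е') && chars.peek() == Some(&'\u{308}') {
                chars.next();
            }
        }
        // Other NFKD decompositions (e.g. 'й' -> 'и' + U+0306) are left
        // decomposed by this sketch; it only demonstrates the ё folding.
        out
    }

    fn main() {
        assert_eq!(fold_yo("Ёлка"), "Елка");
        assert_eq!(fold_yo("всё ещё"), "все еще");
    }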