From 0bccf7b4c030e2683f175dfa715f4e739511eb88 Mon Sep 17 00:00:00 2001
From: Arty I
Date: Sat, 22 Jun 2024 22:50:56 +0200
Subject: [PATCH 1/2] Normalizer for Russian

---
 charabia/Cargo.toml                           |   5 +-
 .../normalizer/compatibility_decomposition.rs |  24 ++++
 charabia/src/normalizer/mod.rs                |   6 +
 charabia/src/normalizer/russian.rs            | 134 ++++++++++++++++++
 4 files changed, 168 insertions(+), 1 deletion(-)
 create mode 100644 charabia/src/normalizer/russian.rs

diff --git a/charabia/Cargo.toml b/charabia/Cargo.toml
index 15e01d5a..d3315027 100644
--- a/charabia/Cargo.toml
+++ b/charabia/Cargo.toml
@@ -35,7 +35,7 @@ litemap = "0.7.2"
 zerovec = "0.10.1"
 
 [features]
-default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese"]
+default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese", "russian"]
 
 # allow chinese specialized tokenization
 chinese = ["chinese-segmentation", "chinese-normalization"]
@@ -61,6 +61,9 @@ thai = []
 # allow greek specialized tokenization
 greek = []
 
+# allow russian specialized tokenization
+russian = []
+
 # allow splitting camelCase latin words
 latin-camelcase = ["dep:finl_unicode"]
diff --git a/charabia/src/normalizer/compatibility_decomposition.rs b/charabia/src/normalizer/compatibility_decomposition.rs
index 84b5d390..661a458e 100644
--- a/charabia/src/normalizer/compatibility_decomposition.rs
+++ b/charabia/src/normalizer/compatibility_decomposition.rs
@@ -52,6 +52,13 @@ mod test {
     // base tokens to normalize.
     fn tokens() -> Vec<Token<'static>> {
         vec![
+            Token {
+                lemma: Owned("Ёё".to_string()),
+                char_end: 2,
+                byte_end: 2,
+                script: Script::Cyrillic,
+                ..Default::default()
+            },
             Token {
                 // Decompose 1E69 to 0073 0323 0307
                 lemma: Owned("ṩ ṩ".to_string()),
                 char_end: 2,
@@ -74,6 +81,14 @@ mod test {
     // expected result of the current Normalizer.
     fn normalizer_result() -> Vec<Token<'static>> {
         vec![
+            Token {
+                lemma: Owned("Е\u{308}е\u{308}".to_string()),
+                char_end: 2,
+                byte_end: 2,
+                char_map: Some(vec![(2, 4), (2, 4)]),
+                script: Script::Cyrillic,
+                ..Default::default()
+            },
             Token {
                 lemma: Owned("s\u{0323}\u{0307} s\u{0323}\u{0307}".to_string()),
                 char_end: 2,
@@ -108,6 +123,15 @@ mod test {
     // expected result of the complete Normalizer pipeline.
     fn normalized_tokens() -> Vec<Token<'static>> {
         vec![
+            Token {
+                lemma: Owned("ее".to_string()),
+                char_end: 2,
+                byte_end: 2,
+                script: Script::Cyrillic,
+                char_map: Some(vec![(2, 2), (2, 2)]),
+                kind: TokenKind::Word,
+                ..Default::default()
+            },
             Token {
                 lemma: Owned("s s".to_string()),
                 char_end: 2,
diff --git a/charabia/src/normalizer/mod.rs b/charabia/src/normalizer/mod.rs
index e4551961..5bd14f0b 100644
--- a/charabia/src/normalizer/mod.rs
+++ b/charabia/src/normalizer/mod.rs
@@ -19,6 +19,8 @@ use self::quote::QuoteNormalizer;
 use self::swedish_recomposition::SwedishRecompositionNormalizer;
 #[cfg(feature = "vietnamese")]
 pub use self::vietnamese::VietnameseNormalizer;
+#[cfg(feature = "russian")]
+pub use self::russian::RussianNormalizer;
 use crate::segmenter::SegmentedTokenIter;
 use crate::Token;
@@ -41,6 +43,8 @@ mod quote;
 mod swedish_recomposition;
 #[cfg(feature = "vietnamese")]
 mod vietnamese;
+#[cfg(feature = "russian")]
+mod russian;
 
 mod ae_oe_normalizer;
@@ -71,6 +75,8 @@ pub static LOSSY_NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
     Box::new(NonspacingMarkNormalizer),
     #[cfg(feature = "vietnamese")]
     Box::new(VietnameseNormalizer),
+    #[cfg(feature = "russian")]
+    Box::new(RussianNormalizer)
 ]
 });
diff --git a/charabia/src/normalizer/russian.rs b/charabia/src/normalizer/russian.rs
new file mode 100644
index 00000000..da6987ed
--- /dev/null
+++ b/charabia/src/normalizer/russian.rs
@@ -0,0 +1,134 @@
+use std::borrow::Cow;
+
+use super::{Normalizer, NormalizerOption};
+use aho_corasick::AhoCorasick;
+use once_cell::sync::Lazy;
+use crate::{Script, Token};
+
+pub struct RussianNormalizer;
+
+static MATCHING_STR: Lazy<AhoCorasick> = Lazy::new(|| {
+    AhoCorasick::new(["Е\u{308}", "е\u{308}"]).unwrap()
+});
+
+impl Normalizer for RussianNormalizer {
+    fn normalize<'o>(&self, mut token: Token<'o>, options: &NormalizerOption) -> Token<'o> {
+        match token.char_map.take() {
+            Some(mut char_map) => {
+                // if a char_map already exists, iterate over it to reconstruct sub-strings.
+                let mut lemma = String::new();
+                let mut tail = token.lemma.as_ref();
+                let mut normalized = String::new();
+                for (_, normalized_len) in char_map.iter_mut() {
+                    let (head, t) = tail.split_at(*normalized_len as usize);
+                    tail = t;
+                    normalized.clear();
+                    // then normalize each sub-string, recomputing the size in the char_map.
+                    let mut peekable = head.chars().peekable();
+                    while let Some(c) = peekable.next() {
+                        let (c, peek_consumed) = normalize_russian(c, peekable.peek());
+
+                        if peek_consumed {
+                            peekable.next();
+                        }
+
+                        normalized.push(c);
+                    }
+
+                    *normalized_len = normalized.len() as u8;
+                    lemma.push_str(normalized.as_ref());
+                }
+
+                token.lemma = Cow::Owned(lemma);
+                token.char_map = Some(char_map);
+            }
+            None => {
+                // if no char_map exists, iterate over the lemma recomposing characters.
+                let mut char_map = Vec::new();
+                let mut lemma = String::new();
+                let mut peekable = token.lemma.chars().peekable();
+                while let Some(c) = peekable.next() {
+                    let (normalized, peek_consumed) = normalize_russian(c, peekable.peek());
+
+                    if peek_consumed {
+                        peekable.next();
+                    }
+
+                    if options.create_char_map {
+                        char_map.push((c.len_utf8() as u8, normalized.len_utf8() as u8));
+                    }
+                    lemma.push(normalized);
+                }
+                token.lemma = Cow::Owned(lemma);
+                if options.create_char_map {
+                    token.char_map = Some(char_map);
+                }
+            }
+        }
+
+        token
+    }
+
+    fn should_normalize(&self, token: &Token) -> bool {
+        token.script == Script::Cyrillic && MATCHING_STR.is_match(token.lemma())
+    }
+}
+
+// https://en.wikipedia.org/wiki/Russian_alphabet
+// Only the decomposed forms are handled here: compatibility decomposition runs
+// first and has already split the 1-codepoint forms Ё/ё into base letter + U+0308.
+fn normalize_russian(current: char, next: Option<&char>) -> (char, bool) {
+    match (current, next) {
+        // ё -> е, grammatically permissible and common in writing
+        ('Е', Some('\u{308}')) => ('Е', true),
+        ('е', Some('\u{308}')) => ('е', true),
+
+        (c, _) => (c, false),
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use std::borrow::Cow::Owned;
+
+    use crate::normalizer::test::test_normalizer;
+    use crate::normalizer::Normalizer;
+    use crate::token::TokenKind;
+
+    // base tokens to normalize.
+    fn tokens() -> Vec<Token<'static>> {
+        vec![Token {
+            lemma: Owned("Ёё".to_string()),
+            char_end: 2,
+            byte_end: 2,
+            script: Script::Cyrillic,
+            ..Default::default()
+        }]
+    }
+
+    // expected result of the current Normalizer.
+    fn normalizer_result() -> Vec<Token<'static>> {
+        vec![Token {
+            lemma: Owned("Ёё".to_string()),
+            char_end: 2,
+            byte_end: 2,
+            script: Script::Cyrillic,
+            char_map: None,
+            ..Default::default()
+        }]
+    }
+
+    // expected result of the complete Normalizer pipeline.
+    fn normalized_tokens() -> Vec<Token<'static>> {
+        vec![Token {
+            lemma: Owned("ее".to_string()),
+            char_end: 2,
+            byte_end: 2,
+            script: Script::Cyrillic,
+            char_map: Some(vec![(2, 2), (2, 2)]),
+            kind: TokenKind::Word,
+            ..Default::default()
+        }]
+    }
+
+    test_normalizer!(RussianNormalizer, tokens(), normalizer_result(), normalized_tokens());
+}

From 1872257491991f43732de352e5c9b0e9a8307ab3 Mon Sep 17 00:00:00 2001
From: Arty I
Date: Mon, 8 Jul 2024 12:52:44 +0200
Subject: [PATCH 2/2] Applying Rustfmt

---
 charabia/src/normalizer/mod.rs     | 10 +++++-----
 charabia/src/normalizer/russian.rs | 11 +++++------
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/charabia/src/normalizer/mod.rs b/charabia/src/normalizer/mod.rs
index 5bd14f0b..fbd2ec2d 100644
--- a/charabia/src/normalizer/mod.rs
+++ b/charabia/src/normalizer/mod.rs
@@ -15,12 +15,12 @@ pub use self::japanese::JapaneseNormalizer;
 pub use self::lowercase::LowercaseNormalizer;
 use self::nonspacing_mark::NonspacingMarkNormalizer;
 use self::quote::QuoteNormalizer;
+#[cfg(feature = "russian")]
+pub use self::russian::RussianNormalizer;
 #[cfg(feature = "swedish-recomposition")]
 use self::swedish_recomposition::SwedishRecompositionNormalizer;
 #[cfg(feature = "vietnamese")]
 pub use self::vietnamese::VietnameseNormalizer;
-#[cfg(feature = "russian")]
-pub use self::russian::RussianNormalizer;
 use crate::segmenter::SegmentedTokenIter;
 use crate::Token;
@@ -39,12 +39,12 @@ mod japanese;
 mod lowercase;
 mod nonspacing_mark;
 mod quote;
+#[cfg(feature = "russian")]
+mod russian;
 #[cfg(feature = "swedish-recomposition")]
 mod swedish_recomposition;
 #[cfg(feature = "vietnamese")]
 mod vietnamese;
-#[cfg(feature = "russian")]
-mod russian;
 
 mod ae_oe_normalizer;
@@ -76,7 +76,7 @@ pub static LOSSY_NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
     #[cfg(feature = "vietnamese")]
     Box::new(VietnameseNormalizer),
     #[cfg(feature = "russian")]
-    Box::new(RussianNormalizer)
+    Box::new(RussianNormalizer),
 ]
 });
diff --git a/charabia/src/normalizer/russian.rs b/charabia/src/normalizer/russian.rs
index da6987ed..5a4cf1b1 100644
--- a/charabia/src/normalizer/russian.rs
+++ b/charabia/src/normalizer/russian.rs
@@ -1,15 +1,14 @@
 use std::borrow::Cow;
 
 use super::{Normalizer, NormalizerOption};
+use crate::{Script, Token};
 use aho_corasick::AhoCorasick;
 use once_cell::sync::Lazy;
-use crate::{Script, Token};
 
 pub struct RussianNormalizer;
 
-static MATCHING_STR: Lazy<AhoCorasick> = Lazy::new(|| {
-    AhoCorasick::new(["Е\u{308}", "е\u{308}"]).unwrap()
-});
+static MATCHING_STR: Lazy<AhoCorasick> =
+    Lazy::new(|| AhoCorasick::new(["Е\u{308}", "е\u{308}"]).unwrap());
 
 impl Normalizer for RussianNormalizer {
     fn normalize<'o>(&self, mut token: Token<'o>, options: &NormalizerOption) -> Token<'o> {
@@ -111,8 +110,8 @@ mod test {
             lemma: Owned("Ёё".to_string()),
             char_end: 2,
             byte_end: 2,
-            script: Script::Cyrillic,
-            char_map: None,
+            script: Script::Cyrillic,
+            char_map: None,
             ..Default::default()
         }]
     }
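
Note (editor's sketch, not part of the patches): the normalizer only matches the
decomposed sequences "Е\u{308}" / "е\u{308}" because the compatibility-decomposition
step runs earlier in the pipeline and splits the precomposed Ё (U+0401) / ё (U+0451)
into a base letter plus combining diaeresis (U+0308); the Russian normalizer then
drops the diaeresis after "Е"/"е". A minimal standalone sketch of that two-step
folding, assuming the `unicode-normalization` crate as a stand-in for charabia's
internal decomposition normalizer:

    use unicode_normalization::UnicodeNormalization;

    fn fold_yo(input: &str) -> String {
        // Step 1: compatibility decomposition, e.g. 'ё' -> 'е' + U+0308.
        let decomposed: String = input.nfkd().collect();

        // Step 2: consume the combining diaeresis when it follows 'е'/'Е',
        // mirroring the peek-and-consume loop in `normalize_russian`.
        let mut out = String::with_capacity(decomposed.len());
        let mut chars = decomposed.chars().peekable();
        while let Some(c) = chars.next() {
            out.push(c);
            if (c == 'е' || c == 'Е') && chars.peek() == Some(&'\u{308}') {
                chars.next();
            }
        }
        // Other NFKD decompositions (e.g. 'й' -> 'и' + U+0306) are left
        // decomposed by this sketch; it only demonstrates the ё folding.
        out
    }

    fn main() {
        assert_eq!(fold_yo("Ёлка"), "Елка");
        assert_eq!(fold_yo("всё ещё"), "все еще");
    }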