Skip to content

Commit

Permalink
Merge #278
Browse files Browse the repository at this point in the history
278: Adds a new normalizer to normalize œ to oe and æ to ae r=ManyTheFish a=Soham1803

# Pull Request

## Related issue
Fixes #268

## What does this PR do?
- Creates a new normalizer *ae_oe_normalizer*
- Normalizes `œ` and `Œ` to `oe`, and `æ` and `Æ` to `ae`.

## PR checklist
Please check if your PR fulfills the following requirements:
- [x] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)?
- [x] Have you read the contributing guidelines?
- [x] Have you made sure that the title is accurate and descriptive of the changes?

Thank you so much for contributing to Meilisearch!


Co-authored-by: Soham <[email protected]>
Co-authored-by: Clémentine <[email protected]>
  • Loading branch information
3 people authored May 21, 2024
2 parents eb3326d + ddda698 commit 6b270fc
Show file tree
Hide file tree
Showing 2 changed files with 159 additions and 0 deletions.
154 changes: 154 additions & 0 deletions charabia/src/normalizer/ae_oe_normalizer.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
use super::{CharNormalizer, CharOrStr};
use crate::{Script, Token};

/// Character normalizer that expands the Latin ligatures 'œ'/'Œ' into "oe"
/// and 'æ'/'Æ' into "ae".
///
/// Note that the uppercase ligatures are also mapped to the *lowercase*
/// replacements ("oe"/"ae"); this normalizer is registered among the lossy
/// normalizers, which run after the lowercasing step.
pub struct AeOeNormalizer;

// All normalizers only need to implement the method `normalize_char` and the
// method `should_normalize` of the `CharNormalizer` trait.
impl CharNormalizer for AeOeNormalizer {
    /// Produces the normalized form of a single character.
    ///
    /// The four handled ligatures are expanded to their two-letter lowercase
    /// equivalents; every other character passes through unchanged.
    fn normalize_char(&self, c: char) -> Option<CharOrStr> {
        let expanded = match c {
            'œ' | 'Œ' => "oe",
            'æ' | 'Æ' => "ae",
            // Not a ligature this normalizer handles: forward it untouched.
            _ => return Some(c.into()),
        };
        Some(expanded.to_string().into())
    }

    /// Only Latin-script tokens that actually contain at least one of the
    /// target ligatures need to go through this normalizer.
    fn should_normalize(&self, token: &Token) -> bool {
        token.script == Script::Latin && token.lemma.chars().any(is_should_normalize)
    }
}
/// Returns `true` when `c` is one of the four ligatures expanded by the
/// normalizer ('œ', 'æ', 'Œ', 'Æ').
fn is_should_normalize(c: char) -> bool {
    c == 'œ' || c == 'æ' || c == 'Œ' || c == 'Æ'
}

// Tests for `AeOeNormalizer`, driven by the shared `test_normalizer!` macro:
// it checks this normalizer in isolation (`normalizer_result`) and as part of
// the full normalizer pipeline (`normalized_tokens`).
#[cfg(test)]
mod test {
use std::borrow::Cow::Owned;

use crate::normalizer::test::test_normalizer;
use crate::normalizer::{Normalizer, NormalizerOption};
use crate::token::TokenKind;

// NOTE(review): `Token` and `Script` are referenced below but no `use` for
// them is visible inside this module — confirm they are brought into scope
// (e.g. via a `use crate::{Script, Token};` not shown in this view).

// base tokens to normalize: one single-ligature token per handled variant.
// NOTE(review): `char_end: 2` for a one-character lemma looks like it tracks
// bytes rather than chars — confirm against `Token`'s field contract.
fn tokens() -> Vec<Token<'static>> {
vec![
Token {
lemma: Owned("œ".to_string()),
char_end: 2,
byte_end: 2,
script: Script::Latin,
..Default::default()
},
Token {
lemma: Owned("Œ".to_string()),
char_end: 2,
byte_end: 2,
script: Script::Latin,
..Default::default()
},
Token {
lemma: Owned("æ".to_string()),
char_end: 2,
byte_end: 2,
script: Script::Latin,
..Default::default()
},
Token {
lemma: Owned("Æ".to_string()),
char_end: 2,
byte_end: 2,
script: Script::Latin,
..Default::default()
},
]
}

// expected result of the current Normalizer alone: both uppercase and
// lowercase ligatures expand to lowercase "oe"/"ae", and the char_map
// records the 2-byte ligature mapping to a 2-byte replacement.
fn normalizer_result() -> Vec<Token<'static>> {
vec![
Token {
lemma: Owned("oe".to_string()),
char_end: 2,
byte_end: 2,
script: Script::Latin,
char_map: Some(vec![(2, 2)]),
..Default::default()
},
Token {
lemma: Owned("oe".to_string()),
char_end: 2,
byte_end: 2,
script: Script::Latin,
char_map: Some(vec![(2, 2)]),
..Default::default()
},
Token {
lemma: Owned("ae".to_string()),
char_end: 2,
byte_end: 2,
script: Script::Latin,
char_map: Some(vec![(2, 2)]),
..Default::default()
},
Token {
lemma: Owned("ae".to_string()),
char_end: 2,
byte_end: 2,
script: Script::Latin,
char_map: Some(vec![(2, 2)]),
..Default::default()
},
]
}

// expected result of the complete Normalizer pipeline: same lemmas as
// above, with the token kind classified as Word.
fn normalized_tokens() -> Vec<Token<'static>> {
vec![
Token {
lemma: Owned("oe".to_string()),
char_end: 2,
byte_end: 2,
script: Script::Latin,
char_map: Some(vec![(2, 2)]),
kind: TokenKind::Word,
..Default::default()
},
Token {
lemma: Owned("oe".to_string()),
char_end: 2,
byte_end: 2,
script: Script::Latin,
char_map: Some(vec![(2, 2)]),
kind: TokenKind::Word,
..Default::default()
},
Token {
lemma: Owned("ae".to_string()),
char_end: 2,
byte_end: 2,
script: Script::Latin,
char_map: Some(vec![(2, 2)]),
kind: TokenKind::Word,
..Default::default()
},
Token {
lemma: Owned("ae".to_string()),
char_end: 2,
byte_end: 2,
script: Script::Latin,
char_map: Some(vec![(2, 2)]),
kind: TokenKind::Word,
..Default::default()
},
]
}

test_normalizer!(AeOeNormalizer, tokens(), normalizer_result(), normalized_tokens());
}
5 changes: 5 additions & 0 deletions charabia/src/normalizer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ pub use self::vietnamese::VietnameseNormalizer;
use crate::segmenter::SegmentedTokenIter;
use crate::Token;

pub use self::ae_oe_normalizer::AeOeNormalizer;

mod arabic;
#[cfg(feature = "chinese-normalization")]
mod chinese;
Expand All @@ -40,6 +42,8 @@ mod swedish_recomposition;
#[cfg(feature = "vietnamese")]
mod vietnamese;

mod ae_oe_normalizer;

/// List of [`Normalizer`]s used by [`Normalize::normalize`] that are not considered lossy.
pub static NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
vec![
Expand All @@ -56,6 +60,7 @@ pub static LOSSY_NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
vec![
Box::new(LowercaseNormalizer),
Box::new(QuoteNormalizer),
Box::new(AeOeNormalizer),
#[cfg(feature = "chinese-normalization")]
Box::new(ChineseNormalizer),
#[cfg(feature = "japanese-transliteration")]
Expand Down

0 comments on commit 6b270fc

Please sign in to comment.