Merge #211
211: Enhance Quotation marks support r=ManyTheFish a=ManyTheFish

# Pull Request
This PR now segments and normalizes all single high quotation marks as single quotes (`'`).
## Related issue
Related to [meilisearch#3689](meilisearch/meilisearch#3689), this PR is a hotfix for Meilisearch v1.2, shipped before the `separator customization feature` that will refactor the Latin segmenter in v1.3.
## What does this PR do?
- Make the Latin Segmenter segment on the other kinds of single high quotation marks (`’`, `‘`, `‛`)
- Normalize Unicode single high quotation marks into single quotes (`'`); see the usage sketch below.
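
As an illustration, a minimal usage sketch (assuming charabia's top-level `Tokenize` trait and the `Token::lemma()` accessor, as shown in the crate README); the expected lemma follows the segmenter and normalizer behaviour introduced by this PR:

```rust
use charabia::Tokenize;

// `it’s` contains a curly apostrophe (U+2019): the Latin segmenter now splits
// on it like the ASCII `'`, and the QuoteNormalizer rewrites it to `'`.
let mut tokens = "it’s".tokenize();

let token = tokens.next().unwrap();
// the first token ("it’") is normalized to "it'".
assert_eq!(token.lemma(), "it'");
```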

Co-authored-by: ManyTheFish <[email protected]>
bors[bot] and ManyTheFish authored Apr 26, 2023
2 parents 37e495a + d3a724c commit cd1ad65
Showing 3 changed files with 77 additions and 3 deletions.
3 changes: 3 additions & 0 deletions charabia/src/normalizer/mod.rs
@@ -14,6 +14,7 @@ use crate::classifier::ClassifiedTokenIter;
#[cfg(feature = "greek")]
use crate::normalizer::greek::GreekNormalizer;
use crate::normalizer::nonspacing_mark::NonspacingMarkNormalizer;
use crate::normalizer::quote::QuoteNormalizer;
use crate::Token;

mod arabic;
@@ -27,6 +28,7 @@ mod greek;
mod japanese;
mod lowercase;
mod nonspacing_mark;
mod quote;

/// List of [`Normalizer`]s used by [`Normalize::normalize`].
pub static NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
@@ -40,6 +42,7 @@ pub static NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
        #[cfg(feature = "greek")]
        Box::new(GreekNormalizer),
        Box::new(ControlCharNormalizer),
        Box::new(QuoteNormalizer),
        Box::new(NonspacingMarkNormalizer),
        Box::new(ArabicNormalizer),
    ]
71 changes: 71 additions & 0 deletions charabia/src/normalizer/quote.rs
@@ -0,0 +1,71 @@
use super::{CharNormalizer, CharOrStr};
use crate::detection::Script;
use crate::Token;

/// Latin specialized [`Normalizer`].
///
/// This Normalizer replaces Unicode single high quotation marks with an ASCII single quote (`'`).
pub struct QuoteNormalizer;

impl CharNormalizer for QuoteNormalizer {
    fn normalize_char(&self, c: char) -> Option<CharOrStr> {
        if is_unicode_high_quotation_mark(c) {
            Some('\''.into())
        } else {
            Some(c.into())
        }
    }

    fn should_normalize(&self, token: &Token) -> bool {
        token.script == Script::Latin && token.lemma.chars().any(is_unicode_high_quotation_mark)
    }
}

/// Returns `true` for the Unicode single high quotation marks
/// U+2019 (’), U+2018 (‘), and U+201B (‛).
fn is_unicode_high_quotation_mark(c: char) -> bool {
    matches!(c, '’' | '‘' | '‛')
}

#[cfg(test)]
mod test {
    use std::borrow::Cow::Owned;

    use crate::normalizer::test::test_normalizer;
    use crate::normalizer::{Normalizer, NormalizerOption};

    // base tokens to normalize.
    fn tokens() -> Vec<Token<'static>> {
        vec![Token {
            lemma: Owned("l'l’l‘l‛".to_string()),
            char_end: 8,
            byte_end: 14,
            script: Script::Latin,
            ..Default::default()
        }]
    }

    // expected result of the current Normalizer.
    fn normalizer_result() -> Vec<Token<'static>> {
        vec![Token {
            lemma: Owned("l'l'l'l'".to_string()),
            char_end: 8,
            byte_end: 14,
            script: Script::Latin,
            // each pair is (original byte length, normalized byte length):
            // the 3-byte curly quotes are rewritten as the 1-byte `'`.
            char_map: Some(vec![(1, 1), (1, 1), (1, 1), (3, 1), (1, 1), (3, 1), (1, 1), (3, 1)]),
            ..Default::default()
        }]
    }

    // expected result of the complete Normalizer pipeline.
    fn normalized_tokens() -> Vec<Token<'static>> {
        vec![Token {
            lemma: Owned("l'l'l'l'".to_string()),
            char_end: 8,
            byte_end: 14,
            script: Script::Latin,
            char_map: Some(vec![(1, 1), (1, 1), (1, 1), (3, 1), (1, 1), (3, 1), (1, 1), (3, 1)]),
            ..Default::default()
        }]
    }

    test_normalizer!(QuoteNormalizer, tokens(), normalizer_result(), normalized_tokens());
}
6 changes: 3 additions & 3 deletions charabia/src/segmenter/latin.rs
@@ -20,7 +20,7 @@ impl Segmenter for LatinSegmenter {
    fn segment_str<'o>(&self, s: &'o str) -> Box<dyn Iterator<Item = &'o str> + 'o> {
        let lemmas = s
            .split_word_bounds()
-            .flat_map(|lemma| lemma.split_inclusive('\''))
+            .flat_map(|lemma| lemma.split_inclusive(['\'', '’', '‘', '‛']))
            .flat_map(split_camel_case_bounds);

        Box::new(lemmas)
@@ -32,9 +32,9 @@ mod test {
    use crate::segmenter::test::test_segmenter;

    const TEXT: &str =
-        "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F! camelCase";
+        "The quick (\"brown\") fox can’t jump 32.3 feet, right? Brr, it's 29.3°F! camelCase";
    const SEGMENTED: &[&str] = &[
-        "The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can'", "t", " ",
+        "The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can’", "t", " ",
        "jump", " ", "32.3", " ", "feet", ",", " ", "right", "?", " ", "Brr", ",", " ", "it'", "s",
        " ", "29.3", "°", "F", "!", " ", "camel", "Case",
    ];
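
For a quick end-to-end check of the segmenter change, a sketch assuming charabia's `Segment` trait (which exposes `segment_str()` on `&str`, as in the crate README):

```rust
use charabia::Segment;

// the curly apostrophe is now an inclusive split point, just like the ASCII `'`:
let segments: Vec<&str> = "can’t stop".segment_str().collect();
assert_eq!(segments, ["can’", "t", " ", "stop"]);
```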
