Merge #222
222: Add helper methods for the integration in Meilisearch r=ManyTheFish a=ManyTheFish

- [Add a method that consumes the builder when creating a tokenizer](d01eb4d)
- [Add a `normalize` method to `str`](5e4cee7)

Co-authored-by: ManyTheFish <[email protected]>
Co-authored-by: Many the fish <[email protected]>
meili-bors[bot] and ManyTheFish authored Jun 29, 2023
2 parents f96cfa4 + 5fe758b commit f3cc03b
Showing 3 changed files with 55 additions and 14 deletions.
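
Taken together, the two helpers cover the integration pattern named in the title: build a `Tokenizer` that owns its options, and normalize raw strings outside of a token stream. The sketch below is illustrative rather than Meilisearch code; it assumes that `TokenizerBuilder` and `NormalizerOption` both provide `Default` implementations and that `NormalizerOption` is reachable under `charabia::normalizer`.

```rust
use charabia::normalizer::NormalizerOption;
use charabia::{Normalize, TokenizerBuilder};

fn main() {
    // New helper 1: consume the builder so the tokenizer can be stored on its own.
    let tokenizer = TokenizerBuilder::default().into_tokenizer();
    let token_count = tokenizer.tokenize("add helper methods").count();
    println!("{token_count} tokens");

    // New helper 2: normalize a bare &str without segmenting it first.
    let options = NormalizerOption::default();
    println!("{}", "Canapé".normalize(&options));
}
```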
1 change: 1 addition & 0 deletions charabia/src/lib.rs
@@ -51,6 +51,7 @@ mod token;
mod tokenizer;

pub use detection::{Language, Script};
pub use normalizer::Normalize;
pub use segmenter::Segment;
pub use token::{SeparatorKind, Token, TokenKind};

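With this re-export, downstream code can bring the new trait into scope from the crate root instead of spelling out the normalizer module path:

```rust
use charabia::Normalize; // now available from the crate root
```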
33 changes: 30 additions & 3 deletions charabia/src/normalizer/mod.rs
@@ -224,11 +224,18 @@ impl<'o, 'tb> SegmentedTokenIter<'o, 'tb> {
}
}

impl Token<'_> {
pub trait Normalize {
type Item;
fn normalize(self, options: &NormalizerOption) -> Self::Item;
}

impl Normalize for Token<'_> {
type Item = Self;

/// Normalize [`Token`] using all the compatible Normalizers.
///
/// A Latin `Token` would not be normalized the same as a Chinese `Token`.
pub fn normalize(mut self, options: &NormalizerOption) -> Self {
fn normalize(mut self, options: &NormalizerOption) -> Self::Item {
for normalizer in NORMALIZERS.iter() {
if normalizer.should_normalize(&self) {
self = normalizer.normalize(self, options);
@@ -247,12 +254,32 @@ impl Token<'_> {
}
}

impl<'o> Normalize for &'o str {
type Item = Cow<'o, str>;

/// Normalize an str.
fn normalize(self, options: &NormalizerOption) -> Self::Item {
let mut normalized = Token { lemma: Cow::Borrowed(self), ..Default::default() };
for normalizer in NORMALIZERS.iter() {
normalized = normalizer.normalize(normalized, options);
}

if options.lossy {
for normalizer in LOSSY_NORMALIZERS.iter() {
normalized = normalizer.normalize(normalized, options);
}
}

normalized.lemma
}
}

#[cfg(test)]
mod test {
macro_rules! test_normalizer {
($normalizer:expr, $tokens:expr, $normalizer_result:expr, $global_result:expr) => {
use super::*;
use crate::Token;
use crate::{Token, Normalize};

const TEST_NORMALIZER_OPTIONS: NormalizerOption = NormalizerOption {
create_char_map: true,
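A minimal sketch of the new trait in use. It assumes `NormalizerOption` exposes a `Default` implementation and is reachable at `charabia::normalizer::NormalizerOption`, and that `Token`'s fields are public with a `Default` implementation, as the `&str` implementation above suggests; none of that is part of this diff.

```rust
use std::borrow::Cow;

use charabia::normalizer::NormalizerOption;
use charabia::{Normalize, Token};

fn main() {
    let options = NormalizerOption::default();

    // `&str` implementation: the result is a `Cow<str>`, borrowed when no normalizer
    // changed the text and owned otherwise.
    let lemma: Cow<str> = "Café Royal".normalize(&options);
    println!("{lemma}");

    // `Token` implementation: same trait, but it returns the normalized `Token` itself.
    let token = Token { lemma: Cow::Borrowed("Café Royal"), ..Default::default() };
    let token = token.normalize(&options);
    println!("{}", token.lemma());
}
```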
35 changes: 24 additions & 11 deletions charabia/src/tokenizer.rs
@@ -1,3 +1,4 @@
use std::borrow::Cow;
use std::collections::HashMap;

use aho_corasick::{AhoCorasick, MatchKind};
@@ -98,32 +99,32 @@ impl Tokenize<'_> for &str {
///
/// See [`TokenizerBuilder`] to know how to build a [`Tokenizer`].
pub struct Tokenizer<'tb> {
segmenter_option: &'tb SegmenterOption<'tb>,
normalizer_option: &'tb NormalizerOption<'tb>,
segmenter_option: Cow<'tb, SegmenterOption<'tb>>,
normalizer_option: Cow<'tb, NormalizerOption<'tb>>,
}

impl<'tb> Tokenizer<'tb> {
/// Creates an Iterator over [`Token`]s.
///
/// The provided text is segmented creating tokens,
/// then tokens are normalized and classified depending on the list of normalizers and classifiers in [`normalizer::NORMALIZERS`].
pub fn tokenize<'o>(&self, original: &'o str) -> NormalizedTokenIter<'o, 'tb> {
original.segment_with_option(self.segmenter_option).normalize(self.normalizer_option)
pub fn tokenize<'t, 'o>(&'t self, original: &'o str) -> NormalizedTokenIter<'o, 't> {
original.segment_with_option(&self.segmenter_option).normalize(&self.normalizer_option)
}

/// Same as [`tokenize`] but attaches each [`Token`] to its corresponding portion of the original text.
pub fn reconstruct<'o>(&self, original: &'o str) -> ReconstructedTokenIter<'o, 'tb> {
pub fn reconstruct<'t, 'o>(&'t self, original: &'o str) -> ReconstructedTokenIter<'o, 't> {
ReconstructedTokenIter { original, token_iter: self.tokenize(original) }
}

/// Segments the provided text creating an Iterator over [`Token`].
pub fn segment<'o>(&self, original: &'o str) -> SegmentedTokenIter<'o, 'tb> {
original.segment_with_option(self.segmenter_option)
pub fn segment<'t, 'o>(&'t self, original: &'o str) -> SegmentedTokenIter<'o, 't> {
original.segment_with_option(&self.segmenter_option)
}

/// Segments the provided text creating an Iterator over `&str`.
pub fn segment_str<'o>(&self, original: &'o str) -> SegmentedStrIter<'o, 'tb> {
original.segment_str_with_option(self.segmenter_option)
pub fn segment_str<'t, 'o>(&'t self, original: &'o str) -> SegmentedStrIter<'o, 't> {
original.segment_str_with_option(&self.segmenter_option)
}
}
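
The iterators returned by `tokenize`, `reconstruct`, `segment`, and `segment_str` now borrow from the `Tokenizer` itself (lifetime `'t`) rather than from the builder's `'tb`, which is what makes the `Cow`-backed options above possible. A minimal sketch of the borrowed path, using the builder's default configuration (an assumption; any configured builder would do):

```rust
use charabia::TokenizerBuilder;

fn main() {
    let mut builder = TokenizerBuilder::default();
    // `build()` still hands out `Cow::Borrowed` options,
    // so this tokenizer cannot outlive `builder`.
    let tokenizer = builder.build();

    for token in tokenizer.tokenize("The quick brown fox") {
        // Tokens reference the original input; the iterator borrows the tokenizer's options.
        println!("{:?}: {}", token.kind, token.lemma());
    }
}
```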

@@ -337,8 +338,20 @@ impl<'tb, A: AsRef<[u8]>> TokenizerBuilder<'tb, A> {
}

Tokenizer {
normalizer_option: &self.normalizer_option,
segmenter_option: &self.segmenter_option,
normalizer_option: Cow::Borrowed(&self.normalizer_option),
segmenter_option: Cow::Borrowed(&self.segmenter_option),
}
}

/// Build the configured `Tokenizer`, consuming self.
///
/// This method allows the tokenizer builder to be dropped without having to drop the Tokenizer itself.
pub fn into_tokenizer(mut self) -> Tokenizer<'tb> {
drop(self.build());

Tokenizer {
normalizer_option: Cow::Owned(self.normalizer_option),
segmenter_option: Cow::Owned(self.segmenter_option),
}
}
}
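
A sketch of the new consuming path. `into_tokenizer` calls `build()` for its side effects on the stored options (the borrowed tokenizer it returns is immediately dropped), then moves the options into the `Tokenizer` as `Cow::Owned`, so the builder does not have to be kept alive. The `make_tokenizer` helper, the `'static` lifetime, and the default builder below are illustrative assumptions, not code from this diff.

```rust
use charabia::{Tokenizer, TokenizerBuilder};

// Hypothetical helper: returning a `Tokenizer` without keeping the builder around
// was not possible with `build()` alone, since it hands out `Cow::Borrowed` options.
fn make_tokenizer() -> Tokenizer<'static> {
    TokenizerBuilder::default().into_tokenizer()
}

fn main() {
    let tokenizer = make_tokenizer();
    for word in tokenizer.segment_str("helper methods for Meilisearch") {
        println!("{word}");
    }
}
```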
