diff --git a/charabia/src/lib.rs b/charabia/src/lib.rs
index 593d0cb4..347dd9d5 100644
--- a/charabia/src/lib.rs
+++ b/charabia/src/lib.rs
@@ -51,6 +51,7 @@ mod token;
 mod tokenizer;
 
 pub use detection::{Language, Script};
+pub use normalizer::Normalize;
 pub use segmenter::Segment;
 pub use token::{SeparatorKind, Token, TokenKind};
 
diff --git a/charabia/src/normalizer/mod.rs b/charabia/src/normalizer/mod.rs
index 095036a2..df989541 100644
--- a/charabia/src/normalizer/mod.rs
+++ b/charabia/src/normalizer/mod.rs
@@ -224,11 +224,18 @@ impl<'o, 'tb> SegmentedTokenIter<'o, 'tb> {
     }
 }
 
-impl Token<'_> {
+pub trait Normalize {
+    type Item;
+    fn normalize(self, options: &NormalizerOption) -> Self::Item;
+}
+
+impl Normalize for Token<'_> {
+    type Item = Self;
+
     /// Normalize [`Token`] using all the compatible Normalizers.
     ///
     /// A Latin `Token` would not be normalized the same as a Chinese `Token`.
-    pub fn normalize(mut self, options: &NormalizerOption) -> Self {
+    fn normalize(mut self, options: &NormalizerOption) -> Self::Item {
         for normalizer in NORMALIZERS.iter() {
             if normalizer.should_normalize(&self) {
                 self = normalizer.normalize(self, options);
@@ -247,12 +254,32 @@ impl Token<'_> {
     }
 }
 
+impl<'o> Normalize for &'o str {
+    type Item = Cow<'o, str>;
+
+    /// Normalize a str.
+    fn normalize(self, options: &NormalizerOption) -> Self::Item {
+        let mut normalized = Token { lemma: Cow::Borrowed(self), ..Default::default() };
+        for normalizer in NORMALIZERS.iter() {
+            normalized = normalizer.normalize(normalized, options);
+        }
+
+        if options.lossy {
+            for normalizer in LOSSY_NORMALIZERS.iter() {
+                normalized = normalizer.normalize(normalized, options);
+            }
+        }
+
+        normalized.lemma
+    }
+}
+
 #[cfg(test)]
 mod test {
     macro_rules! test_normalizer {
         ($normalizer:expr, $tokens:expr, $normalizer_result:expr, $global_result:expr) => {
             use super::*;
-            use crate::Token;
+            use crate::{Token, Normalize};
 
             const TEST_NORMALIZER_OPTIONS: NormalizerOption = NormalizerOption {
                 create_char_map: true,
diff --git a/charabia/src/tokenizer.rs b/charabia/src/tokenizer.rs
index 08d7e7fe..77db5e31 100644
--- a/charabia/src/tokenizer.rs
+++ b/charabia/src/tokenizer.rs
@@ -1,3 +1,4 @@
+use std::borrow::Cow;
 use std::collections::HashMap;
 
 use aho_corasick::{AhoCorasick, MatchKind};
@@ -98,8 +99,8 @@ impl Tokenize<'_> for &str {
 ///
 /// See [`TokenizerBuilder`] to know how to build a [`Tokenizer`].
 pub struct Tokenizer<'tb> {
-    segmenter_option: &'tb SegmenterOption<'tb>,
-    normalizer_option: &'tb NormalizerOption<'tb>,
+    segmenter_option: Cow<'tb, SegmenterOption<'tb>>,
+    normalizer_option: Cow<'tb, NormalizerOption<'tb>>,
 }
 
 impl<'tb> Tokenizer<'tb> {
@@ -107,23 +108,23 @@ impl<'tb> Tokenizer<'tb> {
     ///
     /// The provided text is segmented creating tokens,
     /// then tokens are normalized and classified depending on the list of normalizers and classifiers in [`normalizer::NORMALIZERS`].
-    pub fn tokenize<'o>(&self, original: &'o str) -> NormalizedTokenIter<'o, 'tb> {
-        original.segment_with_option(self.segmenter_option).normalize(self.normalizer_option)
+    pub fn tokenize<'t, 'o>(&'t self, original: &'o str) -> NormalizedTokenIter<'o, 't> {
+        original.segment_with_option(&self.segmenter_option).normalize(&self.normalizer_option)
     }
 
     /// Same as [`tokenize`] but attaches each [`Token`] to its corresponding portion of the original text.
-    pub fn reconstruct<'o>(&self, original: &'o str) -> ReconstructedTokenIter<'o, 'tb> {
+    pub fn reconstruct<'t, 'o>(&'t self, original: &'o str) -> ReconstructedTokenIter<'o, 't> {
         ReconstructedTokenIter { original, token_iter: self.tokenize(original) }
     }
 
     /// Segments the provided text creating an Iterator over [`Token`].
-    pub fn segment<'o>(&self, original: &'o str) -> SegmentedTokenIter<'o, 'tb> {
-        original.segment_with_option(self.segmenter_option)
+    pub fn segment<'t, 'o>(&'t self, original: &'o str) -> SegmentedTokenIter<'o, 't> {
+        original.segment_with_option(&self.segmenter_option)
     }
 
     /// Segments the provided text creating an Iterator over `&str`.
-    pub fn segment_str<'o>(&self, original: &'o str) -> SegmentedStrIter<'o, 'tb> {
-        original.segment_str_with_option(self.segmenter_option)
+    pub fn segment_str<'t, 'o>(&'t self, original: &'o str) -> SegmentedStrIter<'o, 't> {
+        original.segment_str_with_option(&self.segmenter_option)
     }
 }
 
@@ -337,8 +338,20 @@ impl<'tb, A: AsRef<[u8]>> TokenizerBuilder<'tb, A> {
         }
 
         Tokenizer {
-            normalizer_option: &self.normalizer_option,
-            segmenter_option: &self.segmenter_option,
+            normalizer_option: Cow::Borrowed(&self.normalizer_option),
+            segmenter_option: Cow::Borrowed(&self.segmenter_option),
+        }
+    }
+
+    /// Build the configured `Tokenizer` consuming self.
+    ///
+    /// This method allows dropping the tokenizer builder without having to drop the Tokenizer itself.
+    pub fn into_tokenizer(mut self) -> Tokenizer<'tb> {
+        drop(self.build());
+
+        Tokenizer {
+            normalizer_option: Cow::Owned(self.normalizer_option),
+            segmenter_option: Cow::Owned(self.segmenter_option),
         }
     }
 }
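Not part of the patch, but a rough usage sketch of the two additions (the new `Normalize` impl for `&str` and `TokenizerBuilder::into_tokenizer`) might look like the following. It assumes `NormalizerOption` is publicly reachable as `charabia::normalizer::NormalizerOption` and implements `Default`; the sample strings are arbitrary.

```rust
use charabia::normalizer::NormalizerOption;
use charabia::{Normalize, TokenizerBuilder};

fn main() {
    // `Normalize` for `&str`: runs the normalizer pipeline without segmenting,
    // returning a `Cow<str>` holding the normalized lemma.
    // Assumption: `NormalizerOption::default()` is available and sensible here.
    let options = NormalizerOption::default();
    let lemma = "Ĺorem IPSUM".normalize(&options);
    println!("{lemma}");

    // `into_tokenizer` consumes the builder, so the resulting `Tokenizer` owns
    // its options (`Cow::Owned`) and can outlive the `TokenizerBuilder`,
    // even one created as a temporary like here.
    let tokenizer = TokenizerBuilder::default().into_tokenizer();
    for token in tokenizer.tokenize("The quick brown fox") {
        println!("{:?}", token.lemma());
    }
}
```

The `Cow`-backed option fields are what make this possible: `build()` still hands out a `Tokenizer` that borrows the builder's options (`Cow::Borrowed`), while `into_tokenizer()` moves them in as `Cow::Owned`, so dropping the builder no longer invalidates the `Tokenizer`.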