Merge #222
222: Add helper methods for the integration in Meilisearch r=ManyTheFish a=ManyTheFish

- [Add a method that consumes the builder when creating a tokenizer](d01eb4d)
- [Add a `normalize` method to `str`](5e4cee7)

Co-authored-by: ManyTheFish <[email protected]>
Co-authored-by: Many the fish <[email protected]>
meili-bors[bot] and ManyTheFish authored Jun 29, 2023
2 parents f96cfa4 + 5fe758b commit f3cc03b
Showing 3 changed files with 55 additions and 14 deletions.
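
Taken together, the two helpers cover the integration pattern named in the title: build a `Tokenizer` that owns its options, and normalize raw strings outside of a token stream. The sketch below is illustrative rather than Meilisearch code; it assumes that `TokenizerBuilder` and `NormalizerOption` both provide `Default` implementations and that `NormalizerOption` is reachable under `charabia::normalizer`.

```rust
use charabia::normalizer::NormalizerOption;
use charabia::{Normalize, TokenizerBuilder};

fn main() {
    // New helper 1: consume the builder so the tokenizer can be stored on its own.
    let tokenizer = TokenizerBuilder::default().into_tokenizer();
    let token_count = tokenizer.tokenize("add helper methods").count();
    println!("{token_count} tokens");

    // New helper 2: normalize a bare &str without segmenting it first.
    let options = NormalizerOption::default();
    println!("{}", "Canapé".normalize(&options));
}
```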
1 change: 1 addition & 0 deletions charabia/src/lib.rs
@@ -51,6 +51,7 @@ mod token;
mod tokenizer;

pub use detection::{Language, Script};
pub use normalizer::Normalize;
pub use segmenter::Segment;
pub use token::{SeparatorKind, Token, TokenKind};

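With this re-export, downstream code can bring the new trait into scope from the crate root instead of spelling out the normalizer module path:

```rust
use charabia::Normalize; // now available from the crate root
```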
33 changes: 30 additions & 3 deletions charabia/src/normalizer/mod.rs
@@ -224,11 +224,18 @@ impl<'o, 'tb> SegmentedTokenIter<'o, 'tb> {
}
}

impl Token<'_> {
pub trait Normalize {
type Item;
fn normalize(self, options: &NormalizerOption) -> Self::Item;
}

impl Normalize for Token<'_> {
type Item = Self;

/// Normalize [`Token`] using all the compatible Normalizers.
///
/// A Latin `Token` would not be normalized the same as a Chinese `Token`.
pub fn normalize(mut self, options: &NormalizerOption) -> Self {
fn normalize(mut self, options: &NormalizerOption) -> Self::Item {
for normalizer in NORMALIZERS.iter() {
if normalizer.should_normalize(&self) {
self = normalizer.normalize(self, options);
@@ -247,12 +254,32 @@ impl Token<'_> {
}
}

impl<'o> Normalize for &'o str {
type Item = Cow<'o, str>;

/// Normalize an str.
fn normalize(self, options: &NormalizerOption) -> Self::Item {
let mut normalized = Token { lemma: Cow::Borrowed(self), ..Default::default() };
for normalizer in NORMALIZERS.iter() {
normalized = normalizer.normalize(normalized, options);
}

if options.lossy {
for normalizer in LOSSY_NORMALIZERS.iter() {
normalized = normalizer.normalize(normalized, options);
}
}

normalized.lemma
}
}

#[cfg(test)]
mod test {
macro_rules! test_normalizer {
($normalizer:expr, $tokens:expr, $normalizer_result:expr, $global_result:expr) => {
use super::*;
use crate::Token;
use crate::{Token, Normalize};

const TEST_NORMALIZER_OPTIONS: NormalizerOption = NormalizerOption {
create_char_map: true,
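A minimal sketch of the new trait in use. It assumes `NormalizerOption` exposes a `Default` implementation and is reachable at `charabia::normalizer::NormalizerOption`, and that `Token`'s fields are public with a `Default` implementation, as the `&str` implementation above suggests; none of that is part of this diff.

```rust
use std::borrow::Cow;

use charabia::normalizer::NormalizerOption;
use charabia::{Normalize, Token};

fn main() {
    let options = NormalizerOption::default();

    // `&str` implementation: the result is a `Cow<str>`, borrowed when no normalizer
    // changed the text and owned otherwise.
    let lemma: Cow<str> = "Café Royal".normalize(&options);
    println!("{lemma}");

    // `Token` implementation: same trait, but it returns the normalized `Token` itself.
    let token = Token { lemma: Cow::Borrowed("Café Royal"), ..Default::default() };
    let token = token.normalize(&options);
    println!("{}", token.lemma());
}
```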
35 changes: 24 additions & 11 deletions charabia/src/tokenizer.rs
@@ -1,3 +1,4 @@
use std::borrow::Cow;
use std::collections::HashMap;

use aho_corasick::{AhoCorasick, MatchKind};
@@ -98,32 +99,32 @@ impl Tokenize<'_> for &str {
///
/// See [`TokenizerBuilder`] to know how to build a [`Tokenizer`].
pub struct Tokenizer<'tb> {
segmenter_option: &'tb SegmenterOption<'tb>,
normalizer_option: &'tb NormalizerOption<'tb>,
segmenter_option: Cow<'tb, SegmenterOption<'tb>>,
normalizer_option: Cow<'tb, NormalizerOption<'tb>>,
}

impl<'tb> Tokenizer<'tb> {
/// Creates an Iterator over [`Token`]s.
///
/// The provided text is segmented creating tokens,
/// then tokens are normalized and classified depending on the list of normalizers and classifiers in [`normalizer::NORMALIZERS`].
pub fn tokenize<'o>(&self, original: &'o str) -> NormalizedTokenIter<'o, 'tb> {
original.segment_with_option(self.segmenter_option).normalize(self.normalizer_option)
pub fn tokenize<'t, 'o>(&'t self, original: &'o str) -> NormalizedTokenIter<'o, 't> {
original.segment_with_option(&self.segmenter_option).normalize(&self.normalizer_option)
}

/// Same as [`tokenize`] but attaches each [`Token`] to its corresponding portion of the original text.
pub fn reconstruct<'o>(&self, original: &'o str) -> ReconstructedTokenIter<'o, 'tb> {
pub fn reconstruct<'t, 'o>(&'t self, original: &'o str) -> ReconstructedTokenIter<'o, 't> {
ReconstructedTokenIter { original, token_iter: self.tokenize(original) }
}

/// Segments the provided text creating an Iterator over [`Token`].
pub fn segment<'o>(&self, original: &'o str) -> SegmentedTokenIter<'o, 'tb> {
original.segment_with_option(self.segmenter_option)
pub fn segment<'t, 'o>(&'t self, original: &'o str) -> SegmentedTokenIter<'o, 't> {
original.segment_with_option(&self.segmenter_option)
}

/// Segments the provided text creating an Iterator over `&str`.
pub fn segment_str<'o>(&self, original: &'o str) -> SegmentedStrIter<'o, 'tb> {
original.segment_str_with_option(self.segmenter_option)
pub fn segment_str<'t, 'o>(&'t self, original: &'o str) -> SegmentedStrIter<'o, 't> {
original.segment_str_with_option(&self.segmenter_option)
}
}
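
The iterators returned by `tokenize`, `reconstruct`, `segment`, and `segment_str` now borrow from the `Tokenizer` itself (lifetime `'t`) rather than from the builder's `'tb`, which is what makes the `Cow`-backed options above possible. A minimal sketch of the borrowed path, using the builder's default configuration (an assumption; any configured builder would do):

```rust
use charabia::TokenizerBuilder;

fn main() {
    let mut builder = TokenizerBuilder::default();
    // `build()` still hands out `Cow::Borrowed` options,
    // so this tokenizer cannot outlive `builder`.
    let tokenizer = builder.build();

    for token in tokenizer.tokenize("The quick brown fox") {
        // Tokens reference the original input; the iterator borrows the tokenizer's options.
        println!("{:?}: {}", token.kind, token.lemma());
    }
}
```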

@@ -337,8 +338,20 @@ impl<'tb, A: AsRef<[u8]>> TokenizerBuilder<'tb, A> {
}

Tokenizer {
normalizer_option: &self.normalizer_option,
segmenter_option: &self.segmenter_option,
normalizer_option: Cow::Borrowed(&self.normalizer_option),
segmenter_option: Cow::Borrowed(&self.segmenter_option),
}
}

/// Build the configured `Tokenizer`, consuming self.
///
/// This method allows the tokenizer builder to be dropped without having to drop the Tokenizer itself.
pub fn into_tokenizer(mut self) -> Tokenizer<'tb> {
drop(self.build());

Tokenizer {
normalizer_option: Cow::Owned(self.normalizer_option),
segmenter_option: Cow::Owned(self.segmenter_option),
}
}
}
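
A sketch of the new consuming path. `into_tokenizer` calls `build()` for its side effects on the stored options (the borrowed tokenizer it returns is immediately dropped), then moves the options into the `Tokenizer` as `Cow::Owned`, so the builder does not have to be kept alive. The `make_tokenizer` helper, the `'static` lifetime, and the default builder below are illustrative assumptions, not code from this diff.

```rust
use charabia::{Tokenizer, TokenizerBuilder};

// Hypothetical helper: returning a `Tokenizer` without keeping the builder around
// was not possible with `build()` alone, since it hands out `Cow::Borrowed` options.
fn make_tokenizer() -> Tokenizer<'static> {
    TokenizerBuilder::default().into_tokenizer()
}

fn main() {
    let tokenizer = make_tokenizer();
    for word in tokenizer.segment_str("helper methods for Meilisearch") {
        println!("{word}");
    }
}
```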
