Commit f3cc03b

Merge #222
222: Add helper methods for the integration in Meilisearch r=ManyTheFish a=ManyTheFish

- Add a method allowing to consume the builder when creating a tokenizer (d01eb4d)
- Add a normalize method to str (5e4cee7)

Co-authored-by: ManyTheFish <[email protected]>
Co-authored-by: Many the fish <[email protected]>
2 parents f96cfa4 + 5fe758b commit f3cc03b
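
For orientation, a minimal sketch of how the two helpers are meant to be used from a downstream crate such as Meilisearch. `TokenizerBuilder::default()`, `NormalizerOption::default()` and the `charabia::normalizer` path are assumptions not shown in this diff, and the printed output depends on which normalizers are enabled.

    use charabia::normalizer::NormalizerOption;
    use charabia::{Normalize, TokenizerBuilder};

    fn main() {
        // New: consume the builder so the returned Tokenizer owns its options.
        let tokenizer = TokenizerBuilder::default().into_tokenizer();
        let tokens: Vec<_> = tokenizer.tokenize("Hello Wörld!").collect();

        // New: normalize a bare &str without segmenting it first.
        let normalized = "Hello Wörld!".normalize(&NormalizerOption::default());
        println!("{} tokens; normalized: {}", tokens.len(), normalized);
    }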

3 files changed, +55 -14 lines changed

3 files changed

+55
-14
lines changed

charabia/src/lib.rs

Lines changed: 1 addition & 0 deletions
@@ -51,6 +51,7 @@ mod token;
 mod tokenizer;
 
 pub use detection::{Language, Script};
+pub use normalizer::Normalize;
 pub use segmenter::Segment;
 pub use token::{SeparatorKind, Token, TokenKind};

charabia/src/normalizer/mod.rs

Lines changed: 30 additions & 3 deletions
@@ -224,11 +224,18 @@ impl<'o, 'tb> SegmentedTokenIter<'o, 'tb> {
     }
 }
 
-impl Token<'_> {
+pub trait Normalize {
+    type Item;
+    fn normalize(self, options: &NormalizerOption) -> Self::Item;
+}
+
+impl Normalize for Token<'_> {
+    type Item = Self;
+
     /// Normalize [`Token`] using all the compatible Normalizers.
     ///
     /// A Latin `Token` would not be normalized the same as a Chinese `Token`.
-    pub fn normalize(mut self, options: &NormalizerOption) -> Self {
+    fn normalize(mut self, options: &NormalizerOption) -> Self::Item {
         for normalizer in NORMALIZERS.iter() {
             if normalizer.should_normalize(&self) {
                 self = normalizer.normalize(self, options);

@@ -247,12 +254,32 @@ impl Token<'_> {
     }
 }
 
+impl<'o> Normalize for &'o str {
+    type Item = Cow<'o, str>;
+
+    /// Normalize an str.
+    fn normalize(self, options: &NormalizerOption) -> Self::Item {
+        let mut normalized = Token { lemma: Cow::Borrowed(self), ..Default::default() };
+        for normalizer in NORMALIZERS.iter() {
+            normalized = normalizer.normalize(normalized, options);
+        }
+
+        if options.lossy {
+            for normalizer in LOSSY_NORMALIZERS.iter() {
+                normalized = normalizer.normalize(normalized, options);
+            }
+        }
+
+        normalized.lemma
+    }
+}
+
 #[cfg(test)]
 mod test {
     macro_rules! test_normalizer {
         ($normalizer:expr, $tokens:expr, $normalizer_result:expr, $global_result:expr) => {
             use super::*;
-            use crate::Token;
+            use crate::{Token, Normalize};
 
             const TEST_NORMALIZER_OPTIONS: NormalizerOption = NormalizerOption {
                 create_char_map: true,
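
The `impl Normalize for &str` above lets callers normalize a plain string without building a `Token` by hand: the lemma is wrapped in a default `Token`, run through `NORMALIZERS` (and `LOSSY_NORMALIZERS` when `options.lossy` is set), and returned as a `Cow<'o, str>` tied to the input's lifetime. A small sketch of the call shape, assuming `NormalizerOption` implements `Default` and is reachable through a public `normalizer` module:

    use std::borrow::Cow;

    use charabia::normalizer::NormalizerOption;
    use charabia::Normalize;

    fn normalize_query(query: &str) -> Cow<'_, str> {
        // Hypothetical defaults; a real integration would tune fields such as `lossy`.
        let options = NormalizerOption::default();
        query.normalize(&options)
    }

    fn main() {
        println!("{}", normalize_query("Ôrigïnal Tëxt"));
    }

Returning a `Cow` lets the implementation hand back the borrowed input when nothing was rewritten, and an owned `String` otherwise.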

charabia/src/tokenizer.rs

Lines changed: 24 additions & 11 deletions
@@ -1,3 +1,4 @@
+use std::borrow::Cow;
 use std::collections::HashMap;
 
 use aho_corasick::{AhoCorasick, MatchKind};

@@ -98,32 +99,32 @@ impl Tokenize<'_> for &str {
 ///
 /// See [`TokenizerBuilder`] to know how to build a [`Tokenizer`].
 pub struct Tokenizer<'tb> {
-    segmenter_option: &'tb SegmenterOption<'tb>,
-    normalizer_option: &'tb NormalizerOption<'tb>,
+    segmenter_option: Cow<'tb, SegmenterOption<'tb>>,
+    normalizer_option: Cow<'tb, NormalizerOption<'tb>>,
 }
 
 impl<'tb> Tokenizer<'tb> {
     /// Creates an Iterator over [`Token`]s.
     ///
     /// The provided text is segmented creating tokens,
     /// then tokens are normalized and classified depending on the list of normalizers and classifiers in [`normalizer::NORMALIZERS`].
-    pub fn tokenize<'o>(&self, original: &'o str) -> NormalizedTokenIter<'o, 'tb> {
-        original.segment_with_option(self.segmenter_option).normalize(self.normalizer_option)
+    pub fn tokenize<'t, 'o>(&'t self, original: &'o str) -> NormalizedTokenIter<'o, 't> {
+        original.segment_with_option(&self.segmenter_option).normalize(&self.normalizer_option)
     }
 
     /// Same as [`tokenize`] but attaches each [`Token`] to its corresponding portion of the original text.
-    pub fn reconstruct<'o>(&self, original: &'o str) -> ReconstructedTokenIter<'o, 'tb> {
+    pub fn reconstruct<'t, 'o>(&'t self, original: &'o str) -> ReconstructedTokenIter<'o, 't> {
         ReconstructedTokenIter { original, token_iter: self.tokenize(original) }
     }
 
     /// Segments the provided text creating an Iterator over [`Token`].
-    pub fn segment<'o>(&self, original: &'o str) -> SegmentedTokenIter<'o, 'tb> {
-        original.segment_with_option(self.segmenter_option)
+    pub fn segment<'t, 'o>(&'t self, original: &'o str) -> SegmentedTokenIter<'o, 't> {
+        original.segment_with_option(&self.segmenter_option)
     }
 
     /// Segments the provided text creating an Iterator over `&str`.
-    pub fn segment_str<'o>(&self, original: &'o str) -> SegmentedStrIter<'o, 'tb> {
-        original.segment_str_with_option(self.segmenter_option)
+    pub fn segment_str<'t, 'o>(&'t self, original: &'o str) -> SegmentedStrIter<'o, 't> {
+        original.segment_str_with_option(&self.segmenter_option)
     }
 }

@@ -337,8 +338,20 @@ impl<'tb, A: AsRef<[u8]>> TokenizerBuilder<'tb, A> {
         }
 
         Tokenizer {
-            normalizer_option: &self.normalizer_option,
-            segmenter_option: &self.segmenter_option,
+            normalizer_option: Cow::Borrowed(&self.normalizer_option),
+            segmenter_option: Cow::Borrowed(&self.segmenter_option),
+        }
+    }
+
+    /// Build the configured `Tokenizer`, consuming self.
+    ///
+    /// This method allows dropping the tokenizer builder without having to drop the `Tokenizer` itself.
+    pub fn into_tokenizer(mut self) -> Tokenizer<'tb> {
+        drop(self.build());
+
+        Tokenizer {
+            normalizer_option: Cow::Owned(self.normalizer_option),
+            segmenter_option: Cow::Owned(self.segmenter_option),
         }
     }
 }
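
Because the options are now held in a `Cow`, `build()` keeps its borrowing behaviour (`Cow::Borrowed`) while the new `into_tokenizer()` consumes the builder and moves the options in as `Cow::Owned`. The practical difference is sketched below, under the assumption that `TokenizerBuilder::default()` exists and that `build()` borrows the builder as in the code above.

    use charabia::TokenizerBuilder;

    fn main() {
        // Before: `build()` borrows the builder, so the builder has to stay
        // in a named binding for as long as the tokenizer is used.
        let mut builder = TokenizerBuilder::default();
        let borrowed = builder.build();
        let _ = borrowed.tokenize("old style").count();

        // After: `into_tokenizer()` consumes the builder, which can even be a
        // temporary; the tokenizer owns its options and outlives the builder.
        let owned = TokenizerBuilder::default().into_tokenizer();
        let _ = owned.tokenize("new style").count();
    }

This is what the PR description means by consuming the builder when creating a tokenizer: the caller can drop, or never even name, the builder and keep only the `Tokenizer`.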
