|
| 1 | +use std::borrow::Cow; |
1 | 2 | use std::collections::HashMap; |
2 | 3 |
|
3 | 4 | use aho_corasick::{AhoCorasick, MatchKind}; |
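The new `std::borrow::Cow` import is the crux of the patch: it lets the `Tokenizer` below hold its options either by reference (borrowed from the builder) or by value (moved out of it) behind a single type. A minimal sketch of the pattern, with illustrative `Options`/`Holder` names that are not from the crate:

```rust
use std::borrow::Cow;

// `Cow<'a, T>` requires `T: ToOwned`; deriving `Clone` provides that
// via the blanket `impl<T: Clone> ToOwned for T`.
#[derive(Clone)]
struct Options {
    lossy: bool,
}

struct Holder<'a> {
    options: Cow<'a, Options>,
}

// Borrowed variant: the holder is tied to the lifetime of `opts`.
fn borrowed(opts: &Options) -> Holder<'_> {
    Holder { options: Cow::Borrowed(opts) }
}

// Owned variant: the holder is self-contained and may live arbitrarily long.
fn owned() -> Holder<'static> {
    Holder { options: Cow::Owned(Options { lossy: false }) }
}

fn main() {
    let opts = Options { lossy: true };
    let b = borrowed(&opts);
    let o = owned();
    // Field access auto-derefs through the `Cow` in both cases.
    assert!(b.options.lossy && !o.options.lossy);
}
```

Either way, `&self.options` derefs to `&Options`, which is why the method bodies in the hunks below only gain a leading `&`.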
@@ -98,32 +99,32 @@ impl Tokenize<'_> for &str { |
98 | 99 | /// |
99 | 100 | /// See [`TokenizerBuilder`] to know how to build a [`Tokenizer`]. |
100 | 101 | pub struct Tokenizer<'tb> { |
101 | | - segmenter_option: &'tb SegmenterOption<'tb>, |
102 | | - normalizer_option: &'tb NormalizerOption<'tb>, |
| 102 | + segmenter_option: Cow<'tb, SegmenterOption<'tb>>, |
| 103 | + normalizer_option: Cow<'tb, NormalizerOption<'tb>>, |
103 | 104 | } |
104 | 105 |
|
105 | 106 | impl<'tb> Tokenizer<'tb> { |
106 | 107 | /// Creates an Iterator over [`Token`]s. |
107 | 108 | /// |
108 | 109 | /// The provided text is segmented creating tokens, |
109 | 110 | /// then tokens are normalized and classified depending on the list of normalizers and classifiers in [`normalizer::NORMALIZERS`]. |
110 | | - pub fn tokenize<'o>(&self, original: &'o str) -> NormalizedTokenIter<'o, 'tb> { |
111 | | - original.segment_with_option(self.segmenter_option).normalize(self.normalizer_option) |
| 111 | + pub fn tokenize<'t, 'o>(&'t self, original: &'o str) -> NormalizedTokenIter<'o, 't> { |
| 112 | + original.segment_with_option(&self.segmenter_option).normalize(&self.normalizer_option) |
112 | 113 | } |
113 | 114 |
|
114 | 115 | /// Same as [`tokenize`] but attaches each [`Token`] to its corresponding portion of the original text. |
115 | | - pub fn reconstruct<'o>(&self, original: &'o str) -> ReconstructedTokenIter<'o, 'tb> { |
| 116 | + pub fn reconstruct<'t, 'o>(&'t self, original: &'o str) -> ReconstructedTokenIter<'o, 't> { |
116 | 117 | ReconstructedTokenIter { original, token_iter: self.tokenize(original) } |
117 | 118 | } |
118 | 119 |
|
119 | 120 | /// Segments the provided text creating an Iterator over [`Token`]. |
120 | | - pub fn segment<'o>(&self, original: &'o str) -> SegmentedTokenIter<'o, 'tb> { |
121 | | - original.segment_with_option(self.segmenter_option) |
| 121 | + pub fn segment<'t, 'o>(&'t self, original: &'o str) -> SegmentedTokenIter<'o, 't> { |
| 122 | + original.segment_with_option(&self.segmenter_option) |
122 | 123 | } |
123 | 124 |
|
124 | 125 | /// Segments the provided text creating an Iterator over `&str`. |
125 | | - pub fn segment_str<'o>(&self, original: &'o str) -> SegmentedStrIter<'o, 'tb> { |
126 | | - original.segment_str_with_option(self.segmenter_option) |
| 126 | + pub fn segment_str<'t, 'o>(&'t self, original: &'o str) -> SegmentedStrIter<'o, 't> { |
| 127 | + original.segment_str_with_option(&self.segmenter_option) |
127 | 128 | } |
128 | 129 | } |
129 | 130 |
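The lifetime change above is the subtle part of this hunk: the returned iterators used to borrow from the builder (`'tb`) and now borrow from the `Tokenizer` itself (`'t`), which is what keeps the owned (`Cow::Owned`) variant sound once no builder is left to borrow from. A hedged usage sketch, assuming charabia's public `TokenizerBuilder::default`, `build`, `Token::kind`, and `Token::lemma`:

```rust
use charabia::TokenizerBuilder;

fn main() {
    let mut builder = TokenizerBuilder::default();
    let tokenizer = builder.build();

    // The iterator borrows `tokenizer` (and the original text),
    // no longer the builder's options directly.
    for token in tokenizer.tokenize("The quick brown fox.") {
        println!("{:?}: {:?}", token.kind, token.lemma());
    }
}
```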
|
@@ -337,8 +338,20 @@ impl<'tb, A: AsRef<[u8]>> TokenizerBuilder<'tb, A> { |
337 | 338 | } |
338 | 339 |
|
339 | 340 | Tokenizer { |
340 | | - normalizer_option: &self.normalizer_option, |
341 | | - segmenter_option: &self.segmenter_option, |
| 341 | + normalizer_option: Cow::Borrowed(&self.normalizer_option), |
| 342 | + segmenter_option: Cow::Borrowed(&self.segmenter_option), |
| 343 | + } |
| 344 | + } |
| 345 | + |
| 346 | + /// Builds the configured `Tokenizer`, consuming `self`.
| 347 | + ///
| 348 | + /// This allows the builder to be dropped without having to drop the `Tokenizer` itself.
| 349 | + pub fn into_tokenizer(mut self) -> Tokenizer<'tb> { |
| 350 | + drop(self.build()); |
| 351 | + |
| 352 | + Tokenizer { |
| 353 | + normalizer_option: Cow::Owned(self.normalizer_option), |
| 354 | + segmenter_option: Cow::Owned(self.segmenter_option), |
342 | 355 | } |
343 | 356 | } |
344 | 357 | } |
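`into_tokenizer` calls `build()` and immediately discards the result; presumably `build` is invoked for its side effects of finalizing the options (for instance compiling the Aho-Corasick automata hinted at by the new import) before they are moved into an owned `Tokenizer`. The payoff is that the tokenizer can outlive its builder. A sketch of the intended use, assuming the default builder's lifetime parameter can be `'static`:

```rust
use charabia::{Tokenizer, TokenizerBuilder};

// Hypothetical helper: the builder is a local and is dropped on return,
// yet the tokenizer stays valid because its options are `Cow::Owned`.
fn make_tokenizer() -> Tokenizer<'static> {
    TokenizerBuilder::default().into_tokenizer()
}

fn main() {
    let tokenizer = make_tokenizer();
    for word in tokenizer.segment_str("hello world") {
        println!("{word}");
    }
}
```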
|