@@ -1,3 +1,4 @@
+use std::borrow::Cow;
 use std::collections::HashMap;
 
 use aho_corasick::{AhoCorasick, MatchKind};
@@ -98,32 +99,32 @@ impl Tokenize<'_> for &str {
 ///
 /// See [`TokenizerBuilder`] to know how to build a [`Tokenizer`].
 pub struct Tokenizer<'tb> {
-    segmenter_option: &'tb SegmenterOption<'tb>,
-    normalizer_option: &'tb NormalizerOption<'tb>,
+    segmenter_option: Cow<'tb, SegmenterOption<'tb>>,
+    normalizer_option: Cow<'tb, NormalizerOption<'tb>>,
 }
 
 impl<'tb> Tokenizer<'tb> {
     /// Creates an Iterator over [`Token`]s.
     ///
     /// The provided text is segmented creating tokens,
     /// then tokens are normalized and classified depending on the list of normalizers and classifiers in [`normalizer::NORMALIZERS`].
-    pub fn tokenize<'o>(&self, original: &'o str) -> NormalizedTokenIter<'o, 'tb> {
-        original.segment_with_option(self.segmenter_option).normalize(self.normalizer_option)
+    pub fn tokenize<'t, 'o>(&'t self, original: &'o str) -> NormalizedTokenIter<'o, 't> {
+        original.segment_with_option(&self.segmenter_option).normalize(&self.normalizer_option)
     }
 
     /// Same as [`tokenize`] but attaches each [`Token`] to its corresponding portion of the original text.
-    pub fn reconstruct<'o>(&self, original: &'o str) -> ReconstructedTokenIter<'o, 'tb> {
+    pub fn reconstruct<'t, 'o>(&'t self, original: &'o str) -> ReconstructedTokenIter<'o, 't> {
         ReconstructedTokenIter { original, token_iter: self.tokenize(original) }
     }
 
     /// Segments the provided text creating an Iterator over [`Token`].
-    pub fn segment<'o>(&self, original: &'o str) -> SegmentedTokenIter<'o, 'tb> {
-        original.segment_with_option(self.segmenter_option)
+    pub fn segment<'t, 'o>(&'t self, original: &'o str) -> SegmentedTokenIter<'o, 't> {
+        original.segment_with_option(&self.segmenter_option)
     }
 
     /// Segments the provided text creating an Iterator over `&str`.
-    pub fn segment_str<'o>(&self, original: &'o str) -> SegmentedStrIter<'o, 'tb> {
-        original.segment_str_with_option(self.segmenter_option)
+    pub fn segment_str<'t, 'o>(&'t self, original: &'o str) -> SegmentedStrIter<'o, 't> {
+        original.segment_str_with_option(&self.segmenter_option)
     }
 }
 
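The switch from plain references to `Cow` is what forces the new lifetimes in this hunk: once the options may live inside the `Tokenizer` itself (the `Cow::Owned` case), the iterators returned by `tokenize`, `reconstruct`, `segment`, and `segment_str` have to borrow from `&'t self` rather than from the builder lifetime `'tb`. Below is a minimal, self-contained sketch of that signature change; `Opts`, `Tok`, and `Iter` are hypothetical stand-ins for the crate's option, tokenizer, and iterator types.

use std::borrow::Cow;

#[derive(Clone, Default)]
struct Opts;

struct Tok<'tb> {
    opts: Cow<'tb, Opts>,
}

// Stand-in for NormalizedTokenIter<'o, 't>: it borrows the input text
// ('o) and, now that the options may be stored inside the tokenizer,
// the tokenizer itself ('t).
struct Iter<'o, 't> {
    text: &'o str,
    _opts: &'t Opts,
}

impl<'tb> Tok<'tb> {
    // Mirrors the new `tokenize` signature: the returned iterator is tied
    // to `&'t self`, not to the builder lifetime 'tb, because with
    // Cow::Owned there may be no 'tb-lived options to point at.
    fn tokenize<'t, 'o>(&'t self, original: &'o str) -> Iter<'o, 't> {
        Iter { text: original, _opts: &self.opts }
    }
}

fn main() {
    let tok = Tok { opts: Cow::Owned(Opts) };
    let iter = tok.tokenize("The quick brown fox");
    assert_eq!(iter.text, "The quick brown fox");
}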
@@ -337,8 +338,20 @@ impl<'tb, A: AsRef<[u8]>> TokenizerBuilder<'tb, A> {
         }
 
         Tokenizer {
-            normalizer_option: &self.normalizer_option,
-            segmenter_option: &self.segmenter_option,
+            normalizer_option: Cow::Borrowed(&self.normalizer_option),
+            segmenter_option: Cow::Borrowed(&self.segmenter_option),
+        }
+    }
+
+    /// Builds the configured `Tokenizer`, consuming `self`.
+    ///
+    /// This method allows dropping the tokenizer builder without having to drop the `Tokenizer` itself.
+    pub fn into_tokenizer(mut self) -> Tokenizer<'tb> {
+        drop(self.build());
+
+        Tokenizer {
+            normalizer_option: Cow::Owned(self.normalizer_option),
+            segmenter_option: Cow::Owned(self.segmenter_option),
         }
     }
 }
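The `drop(self.build())` call is the subtle part of `into_tokenizer`: judging from the `mut self` receiver, `build` mutates the builder, presumably finalizing cached state inside the options, so it must run before they are moved; yet the `Tokenizer` it returns borrows `self` and has to be dropped before `self.normalizer_option` and `self.segmenter_option` can be moved out. Here is a runnable sketch of the same finalize-then-move pattern, assuming a hypothetical `Builder`/`Tok` pair with a plain `String` standing in for whatever state `build` caches.

use std::borrow::Cow;

#[derive(Clone, Default)]
struct Opts {
    // Hypothetical stand-in for state that `build` caches inside the
    // options (in the real code, e.g. an aho_corasick automaton).
    cache: Option<String>,
}

struct Tok<'b> {
    opts: Cow<'b, Opts>,
}

#[derive(Default)]
struct Builder {
    opts: Opts,
}

impl Builder {
    // Borrowing build: finalizes the options in place, then hands out a
    // Tok that borrows them, like the existing `build`.
    fn build(&mut self) -> Tok<'_> {
        self.opts.cache.get_or_insert_with(|| "compiled".to_string());
        Tok { opts: Cow::Borrowed(&self.opts) }
    }

    // Consuming build, mirroring `into_tokenizer`: run `build` for its
    // side effects, drop the borrowed Tok it returns (ending the borrow
    // of `self`), then move the finalized options into an owned Tok.
    fn into_tok(mut self) -> Tok<'static> {
        drop(self.build());
        Tok { opts: Cow::Owned(self.opts) }
    }
}

// The owned Tok can escape the builder's scope, which the borrowing
// `build` would not allow.
fn make_tok() -> Tok<'static> {
    Builder::default().into_tok()
}

fn main() {
    let tok = make_tok();
    assert_eq!(tok.opts.cache.as_deref(), Some("compiled"));
}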