@@ -3,11 +3,13 @@ use fst::raw::{Fst, Output};
3
3
/// Final-state-transducer (FST) Segmenter
4
4
pub ( crate ) struct FstSegmenter < ' fst > {
5
5
words_fst : & ' fst Fst < & ' fst [ u8 ] > ,
6
+ min_length : Option < usize > , // Optional minimum length for a word to be segmented
7
+ allow_char_split : bool , // Flag to allow or disallow splitting words into characters
6
8
}
7
9
8
10
impl < ' fst > FstSegmenter < ' fst > {
9
- pub ( crate ) fn new ( words_fst : & ' fst Fst < & ' fst [ u8 ] > ) -> Self {
10
- Self { words_fst }
11
+ pub ( crate ) fn new ( words_fst : & ' fst Fst < & ' fst [ u8 ] > , min_length : Option < usize > , allow_char_split : bool ) -> Self {
12
+ Self { words_fst, min_length , allow_char_split }
11
13
}
12
14
13
15
pub fn segment_str < ' o > (
@@ -23,34 +25,48 @@ impl<'fst> FstSegmenter<'fst> {
23
25
return None ;
24
26
}
25
27
26
- let length = match find_longest_prefix ( self . words_fst , to_segment. as_bytes ( ) ) {
28
+ let mut length = match find_longest_prefix ( self . words_fst , to_segment. as_bytes ( ) ) {
27
29
Some ( ( _, length) ) => length,
28
30
None => {
29
- // if no sequence matches, we return the next character as a lemma.
30
- let first = to_segment. chars ( ) . next ( ) . unwrap ( ) ;
31
- first. len_utf8 ( )
31
+ if self . allow_char_split {
32
+ // if no sequence matches, we return the next character as a lemma.
33
+ to_segment. chars ( ) . next ( ) . unwrap ( ) . len_utf8 ( )
34
+ } else {
35
+ // if splitting is not allowed, return the whole input
36
+ let result = to_segment;
37
+ to_segment = "" ;
38
+ return Some ( result) ;
39
+ }
32
40
}
33
41
} ;
34
42
43
+ if let Some ( min_len) = self . min_length {
44
+ // enforce minimum lemma length if specified
45
+ if length < min_len && to_segment. len ( ) > length {
46
+ length = min_len. min ( to_segment. len ( ) ) ;
47
+ }
48
+
49
+ // prevent left over lemmas with a length fewer than min_len
50
+ if to_segment. len ( ) - length < min_len {
51
+ length = to_segment. len ( ) ;
52
+ }
53
+ }
54
+
55
+ // ensure the length is a valid character boundary
56
+ length = to_segment
57
+ . char_indices ( )
58
+ . nth ( length)
59
+ . map ( |( idx, _) | idx)
60
+ . unwrap_or ( to_segment. len ( ) ) ;
61
+
35
62
let ( left, right) = to_segment. split_at ( length) ;
36
63
to_segment = right;
37
64
Some ( left)
38
65
} ) ;
39
66
40
67
Box :: new ( iter)
41
68
}
42
- }
43
-
44
}

/// Thanks to @llogiq for this function
/// https://github.com/BurntSushi/fst/pull/104/files
///
/// Finds the longest key that is a prefix of the given value.
///
/// If such a key exists, then `Some((value, key_len))` is returned, where
/// `value` is the value associated with the key, and `key_len` is the
/// length of the found key. Otherwise `None` is returned.
///
/// This can be used to e.g. build tokenizing functions.
54
70
#[ inline]
55
71
fn find_longest_prefix ( fst : & Fst < & [ u8 ] > , value : & [ u8 ] ) -> Option < ( u64 , usize ) > {
56
72
let mut node = fst. root ( ) ;
@@ -69,4 +85,4 @@ fn find_longest_prefix(fst: &Fst<&[u8]>, value: &[u8]) -> Option<(u64, usize)> {
69
85
}
70
86
}
71
87
last_match
72
- }
88
+ }
0 commit comments