Skip to content

Commit 83089b9

Browse files
committed
Introduces new options in fst segmenter to allow character splitting and min lemma length definition
1 parent 3882ec4 commit 83089b9

File tree

3 files changed

+37
-21
lines changed

3 files changed

+37
-21
lines changed

charabia/src/segmenter/khmer.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ use once_cell::sync::Lazy;
1818
static WORDS_FST: Lazy<Fst<&[u8]>> =
1919
Lazy::new(|| Fst::new(&include_bytes!("../../dictionaries/fst/khmer/words.fst")[..]).unwrap());
2020

21-
static FST_SEGMENTER: Lazy<FstSegmenter> = Lazy::new(|| FstSegmenter::new(&WORDS_FST));
21+
static FST_SEGMENTER: Lazy<FstSegmenter> = Lazy::new(|| FstSegmenter::new(&WORDS_FST, None, true));
2222

2323
// Make a small documentation of the specialized Segmenter like below.
2424
/// <Script/Language> specialized [`Segmenter`].

charabia/src/segmenter/thai.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ pub struct ThaiSegmenter;
1414
static WORDS_FST: Lazy<Fst<&[u8]>> =
1515
Lazy::new(|| Fst::new(&include_bytes!("../../dictionaries/fst/thai/words.fst")[..]).unwrap());
1616

17-
static FST_SEGMENTER: Lazy<FstSegmenter> = Lazy::new(|| FstSegmenter::new(&WORDS_FST));
17+
static FST_SEGMENTER: Lazy<FstSegmenter> = Lazy::new(|| FstSegmenter::new(&WORDS_FST, None, true));
1818

1919
impl Segmenter for ThaiSegmenter {
2020
fn segment_str<'o>(&self, to_segment: &'o str) -> Box<dyn Iterator<Item = &'o str> + 'o> {

charabia/src/segmenter/utils.rs

Lines changed: 35 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,13 @@ use fst::raw::{Fst, Output};
33
/// Finite-state-transducer (FST) Segmenter
44
pub(crate) struct FstSegmenter<'fst> {
55
words_fst: &'fst Fst<&'fst [u8]>,
6+
min_length: Option<usize>, // Optional minimum length for a word to be segmented
7+
allow_char_split: bool, // Flag to allow or disallow splitting words into characters
68
}
79

810
impl<'fst> FstSegmenter<'fst> {
9-
pub(crate) fn new(words_fst: &'fst Fst<&'fst [u8]>) -> Self {
10-
Self { words_fst }
11+
pub(crate) fn new(words_fst: &'fst Fst<&'fst [u8]>, min_length: Option<usize>, allow_char_split: bool) -> Self {
12+
Self { words_fst, min_length, allow_char_split }
1113
}
1214

1315
pub fn segment_str<'o>(
@@ -23,34 +25,48 @@ impl<'fst> FstSegmenter<'fst> {
2325
return None;
2426
}
2527

26-
let length = match find_longest_prefix(self.words_fst, to_segment.as_bytes()) {
28+
let mut length = match find_longest_prefix(self.words_fst, to_segment.as_bytes()) {
2729
Some((_, length)) => length,
2830
None => {
29-
// if no sequence matches, we return the next character as a lemma.
30-
let first = to_segment.chars().next().unwrap();
31-
first.len_utf8()
31+
if self.allow_char_split {
32+
// if no sequence matches, we return the next character as a lemma.
33+
to_segment.chars().next().unwrap().len_utf8()
34+
} else {
35+
// if splitting is not allowed, return the whole input
36+
let result = to_segment;
37+
to_segment = "";
38+
return Some(result);
39+
}
3240
}
3341
};
3442

43+
if let Some(min_len) = self.min_length {
44+
// enforce minimum lemma length if specified
45+
if length < min_len && to_segment.len() > length {
46+
length = min_len.min(to_segment.len());
47+
}
48+
49+
// prevent left over lemmas with a length fewer than min_len
50+
if to_segment.len() - length < min_len {
51+
length = to_segment.len();
52+
}
53+
}
54+
55+
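// ensure the length is a valid character boundary
// NOTE(review): up to this point `length` is a byte count (FST key length / `len_utf8` / `to_segment.len()`), but `nth(length)` below treats it as a character count — confirm this is intended for multi-byte scripts.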
56+
length = to_segment
57+
.char_indices()
58+
.nth(length)
59+
.map(|(idx, _)| idx)
60+
.unwrap_or(to_segment.len());
61+
3562
let (left, right) = to_segment.split_at(length);
3663
to_segment = right;
3764
Some(left)
3865
});
3966

4067
Box::new(iter)
4168
}
42-
}
43-
44-
/// Thanks to @llogiq for this function
45-
/// https://github.com/BurntSushi/fst/pull/104/files
46-
///
47-
/// find the longest key that is prefix of the given value.
48-
///
49-
/// If the key exists, then `Some((value, key_len))` is returned, where
50-
/// `value` is the value associated with the key, and `key_len` is the
51-
/// length of the found key. Otherwise `None` is returned.
52-
///
53-
/// This can be used to e.g. build tokenizing functions.
69+
}

/// find the longest key that is a prefix of the given value.
5470
#[inline]
5571
fn find_longest_prefix(fst: &Fst<&[u8]>, value: &[u8]) -> Option<(u64, usize)> {
5672
let mut node = fst.root();
@@ -69,4 +85,4 @@ fn find_longest_prefix(fst: &Fst<&[u8]>, value: &[u8]) -> Option<(u64, usize)> {
6985
}
7086
}
7187
last_match
72-
}
88+
}

0 commit comments

Comments
 (0)