Skip to content

Commit

Permalink
Adds some more test cases and updates dictionary
Browse files: browse the repository at this point in the history
  • Loading branch information
luflow committed Sep 10, 2024
1 parent 38b8529 commit aad5073
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 4 deletions.
Binary file modified charabia/dictionaries/fst/german/words.fst
Binary file not shown.
6 changes: 4 additions & 2 deletions charabia/dictionaries/txt/german/words.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6192,7 +6192,7 @@ Glas
Glaser
Glasnost
Glasur
Glatteis
Glatt
Glatze
Glaube
Glauben
Expand Down Expand Up @@ -8306,6 +8306,7 @@ Kuli
Kulisse
Kult
Kultur
Kulturalität
Kultus
Kummer
Kumpan
Expand Down Expand Up @@ -21691,7 +21692,7 @@ glas
glaser
glasnost
glasur
glatteis
glatt
glatze
glaube
glauben
Expand Down Expand Up @@ -23805,6 +23806,7 @@ kuli
kulisse
kult
kultur
kulturalität
kultus
kummer
kumpan
Expand Down
31 changes: 29 additions & 2 deletions charabia/src/segmenter/german.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ static WORDS_FST: Lazy<Fst<&[u8]>> =
Lazy::new(|| Fst::new(&include_bytes!("../../dictionaries/fst/german/words.fst")[..]).unwrap());

static FST_SEGMENTER: Lazy<FstSegmenter> =
Lazy::new(|| FstSegmenter::new(&WORDS_FST, Some(4), true));
Lazy::new(|| FstSegmenter::new(&WORDS_FST, Some(2), true));

impl Segmenter for GermanSegmenter {
fn segment_str<'o>(&self, to_segment: &'o str) -> Box<dyn Iterator<Item = &'o str> + 'o> {
Expand Down Expand Up @@ -100,7 +100,34 @@ mod test {
word1
);
test_segmentation!("Schreibprozess", &["Schreib", "prozess"], word2);
test_segmentation!("Interkulturalität", &["Inter", "kultur", "alität"], word3);
test_segmentation!("Interkulturalität", &["Inter", "kulturalität"], word3);
test_segmentation!("Wissensorganisation", &["Wissens", "organisation"], word4);
test_segmentation!("Aufgabenplanung", &["Aufgaben", "planung"], word5);
test_segmentation!("Eisbrecher", &["Eis", "brecher"], word6);
test_segmentation!("Zuckerei", &["Zucker", "ei"], word7);
test_segmentation!("Glatteis", &["Glatt", "eis"], word8);
test_segmentation!("Sinnfindung", &["Sinn", "findung"], word9);
test_segmentation!(
"Donaudampfschifffahrtsgesellschaftskapitän",
&["Donau", "dampf", "schifffahrts", "gesellschafts", "kapitän"],
word10
);
test_segmentation!(
"Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz",
&[
"Rind",
"fleisch",
"etikettierungs",
"überwachungs",
"aufgaben",
"übertragungs",
"gesetz"
],
word11
);
test_segmentation!(
"Nahrungsmittelunverträglichkeitsdiagnoseverfahren",
&["Nahrungs", "mittel", "un", "verträglichkeits", "diagnose", "verfahren"],
word12
);
}

0 comments on commit aad5073

Please sign in to comment.