Skip to content

Commit

Permalink
Fix tests
Browse files Browse the repository at this point in the history
  • Loading branch information
ManyTheFish committed Sep 19, 2024
1 parent c75c335 commit 45f66ea
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 3 deletions.
24 changes: 24 additions & 0 deletions charabia/src/segmenter/latin/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,18 +27,42 @@ mod test {

/// Input sentence handed to `test_segmenter!` for the Latin segmenter tests.
///
/// It mixes parentheses, escaped double quotes, a typographic apostrophe (`’`),
/// a straight apostrophe (`'`), decimal numbers, a degree sign, and
/// camelCase / kebab-case / snake_case words.
const TEXT: &str =
"The quick (\"brown\") fox can’t jump 32.3 feet, right? Brr, it's 29.3°F! camelCase kebab-case snake_case";

/// Expected segments of `TEXT` when the `latin-camelcase` feature is enabled.
///
/// Original casing and the typographic apostrophe are kept as-is; the word
/// `camelCase` is expected to be split into `"camel"` and `"Case"`.
/// Note that `", "` (comma followed by a space) is listed as a single segment.
#[rustfmt::skip]
#[cfg(feature = "latin-camelcase")]
const SEGMENTED: &[&str] = &[
"The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can", "’", "t",
" ", "jump", " ", "32", ".", "3", " ", "feet", ", ", "right", "?", " ", "Brr", ", ", "it",
"'", "s", " ", "29", ".", "3°F", "!", " ", "camel", "Case", " ", "kebab", "-", "case", " ",
"snake", "_", "case",
];

/// Expected token lemmas for `TEXT` when the `latin-camelcase` feature is enabled.
///
/// Compared to the matching `SEGMENTED` list, every entry is lowercased
/// (`"The"` -> `"the"`, `"3°F"` -> `"3°f"`, `"Case"` -> `"case"`) and the
/// typographic apostrophe `’` appears as a straight `'`.
#[rustfmt::skip]
#[cfg(feature = "latin-camelcase")]
const TOKENIZED: &[&str] = &[
"the", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can", "'", "t",
" ", "jump", " ", "32", ".", "3", " ", "feet", ", ", "right", "?", " ", "brr", ", ", "it",
"'", "s", " ", "29", ".", "3°f", "!", " ", "camel", "case", " ", "kebab", "-", "case", " ",
"snake", "_", "case",
];

/// Expected segments of `TEXT` when the `latin-camelcase` feature is disabled.
///
/// Original casing and the typographic apostrophe are kept as-is; the word
/// `camelCase` is expected to stay a single segment.
/// Note that `", "` (comma followed by a space) is listed as a single segment.
#[rustfmt::skip]
#[cfg(not(feature = "latin-camelcase"))]
const SEGMENTED: &[&str] = &[
"The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can", "’", "t",
" ", "jump", " ", "32", ".", "3", " ", "feet", ", ", "right", "?", " ", "Brr", ", ", "it",
"'", "s", " ", "29", ".", "3°F", "!", " ", "camelCase", " ", "kebab", "-", "case", " ",
"snake", "_", "case",
];

/// Expected token lemmas for `TEXT` when the `latin-camelcase` feature is disabled.
///
/// Compared to the matching `SEGMENTED` list, every entry is lowercased
/// (`"The"` -> `"the"`, `"3°F"` -> `"3°f"`, `"camelCase"` -> `"camelcase"`)
/// and the typographic apostrophe `’` appears as a straight `'`.
#[rustfmt::skip]
#[cfg(not(feature = "latin-camelcase"))]
const TOKENIZED: &[&str] = &[
"the", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can", "'", "t",
" ", "jump", " ", "32", ".", "3", " ", "feet", ", ", "right", "?", " ", "brr", ", ", "it",
"'", "s", " ", "29", ".", "3°f", "!", " ", "camelcase", " ", "kebab", "-", "case", " ",
"snake", "_", "case",
];

// Instantiate the shared segmenter tests for `LatinSegmenter`, using `TEXT` as
// input and `SEGMENTED` / `TOKENIZED` as the expected outputs, with
// `Script::Latin` and `Language::Eng` as the expected script and language.
// Whichever `SEGMENTED` / `TOKENIZED` pair is compiled in depends on the
// `latin-camelcase` feature.
test_segmenter!(LatinSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Latin, Language::Eng);
}
6 changes: 3 additions & 3 deletions charabia/src/segmenter/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -397,7 +397,6 @@ mod test {
($segmenter:expr, $text:expr, $segmented:expr, $tokenized:expr, $script:expr, $language:expr) => {
use crate::{Token, Language, Script};
use crate::segmenter::{Segment, AhoSegmentedStrIter, MatchType, DEFAULT_SEPARATOR_AHO};
use crate::tokenizer::Tokenize;
use super::*;

#[test]
Expand Down Expand Up @@ -427,7 +426,7 @@ Check if the expected Script/Language corresponds to the detected Script/Languag

#[test]
fn segment() {
let segmented_text: Vec<_> = $text.segment_str().collect();
let segmented_text: Vec<_> = $text.segment_str_with_option(None, Some(&[$language])).collect();
assert_eq!(&segmented_text[..], $segmented, r#"
Segmenter chosen by global segment() function, didn't segment the text as expected.
Expand All @@ -438,7 +437,8 @@ Check if the tested segmenter is assigned to the good Script/Language in `SEGMEN

#[test]
fn tokenize() {
let tokens: Vec<_> = $text.tokenize().collect();
let tokenizer = crate::TokenizerBuilder::default().into_tokenizer();
let tokens: Vec<_> = tokenizer.tokenize_with_allow_list($text, Some(&[$language])).collect();
let tokenized_text: Vec<_> = tokens.iter().map(|t| t.lemma()).collect();

assert_eq!(&tokenized_text[..], $tokenized, r#"
Expand Down

0 comments on commit 45f66ea

Please sign in to comment.