diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index e98a3fc8..c0fb7664 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -26,6 +26,8 @@ jobs:
         run: cargo test --verbose
       - name: Run tests with japanese-transliteration on
         run: cargo test --verbose --features japanese-transliteration
+      - name: Run tests with chinese-normalization-pinyin on
+        run: cargo test --verbose --features chinese chinese-normalization-pinyin
       - name: Run irg-kvariants tests
         run: cargo test -p irg-kvariants --verbose
 
diff --git a/charabia/Cargo.toml b/charabia/Cargo.toml
index f4ef47f4..f1d59800 100644
--- a/charabia/Cargo.toml
+++ b/charabia/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "charabia"
-version = "0.8.8"
+version = "0.8.9"
 license = "MIT"
 authors = ["Many "]
 edition = "2021"
@@ -24,9 +24,7 @@ once_cell = "1.19.0"
 serde = "1.0"
 slice-group-by = "0.3.1"
 whatlang = "0.16.4"
-lindera-core = "=0.28.0"
-lindera-dictionary = "=0.28.0"
-lindera-tokenizer = { version = "=0.28.0", default-features = false, optional = true }
+lindera = { version = "=0.30.0", default-features = false, optional = true }
 pinyin = { version = "0.10", default-features = false, features = [
     "with_tone",
 ], optional = true }
@@ -41,19 +39,22 @@ jemalloc-sys = "0.5.4"
 default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese"]
 
 # allow chinese specialized tokenization
-chinese = ["dep:pinyin", "dep:jieba-rs"]
+chinese = ["chinese-segmentation", "chinese-normalization"]
+chinese-segmentation = ["dep:jieba-rs"]
+chinese-normalization = []
+chinese-normalization-pinyin = ["dep:pinyin", "chinese-normalization"]
 
 # allow hebrew specialized tokenization
 hebrew = []
 
 # allow japanese specialized tokenization
 japanese = ["japanese-segmentation-unidic"]
-japanese-segmentation-ipadic = ["lindera-tokenizer/ipadic", "lindera-tokenizer/ipadic-compress"]
-japanese-segmentation-unidic = ["lindera-tokenizer/unidic", "lindera-tokenizer/unidic-compress"]
+japanese-segmentation-ipadic = ["lindera/ipadic", "lindera/compress"]
+japanese-segmentation-unidic = ["lindera/unidic", "lindera/compress"]
 japanese-transliteration = ["dep:wana_kana"]
 
 # allow korean specialized tokenization
-korean = ["lindera-tokenizer/ko-dic", "lindera-tokenizer/ko-dic-compress"]
+korean = ["lindera/ko-dic", "lindera/compress"]
 
 # allow thai specialized tokenization
 thai = []
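Aside on the feature split above: `chinese` is now just an umbrella over `chinese-segmentation` and `chinese-normalization`, and the lossy pinyin conversion only comes in through the new `chinese-normalization-pinyin` flag, so enabling `chinese` alone no longer pulls in the `pinyin` crate. A minimal sketch of how code can branch on the new flags — the helper below is hypothetical, only the feature names come from the manifest above:

```rust
// Hypothetical helper, not part of the patch: illustrates which normalization
// path each feature combination selects.
#[cfg(feature = "chinese-normalization-pinyin")]
fn chinese_normalization_mode() -> &'static str {
    "kvariant + pinyin" // lossy: Han characters become tone-marked pinyin
}

#[cfg(all(feature = "chinese-normalization", not(feature = "chinese-normalization-pinyin")))]
fn chinese_normalization_mode() -> &'static str {
    "kvariant only" // lemmas stay in Han script
}
```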
diff --git a/charabia/README.md b/charabia/README.md
index 50a76848..9bc0d01b 100644
--- a/charabia/README.md
+++ b/charabia/README.md
@@ -19,7 +19,7 @@ Charabia provides a simple API to segment, normalize, or tokenize (segment + nor
 | **Latin** | ✅ CamelCase segmentation | ✅ [compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + [nonspacing-marks](https://www.compart.com/en/unicode/category/Mn) removal + `Ð vs Đ` spoofing normalization | 🟩 ~23MiB/sec | 🟨 ~9MiB/sec |
 | **Greek** | ❌ | ✅ [compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + final sigma normalization | 🟩 ~27MiB/sec | 🟨 ~8MiB/sec |
 | **Cyrillic** - **Georgian** | ❌ | ✅ [compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase | 🟩 ~27MiB/sec | 🟨 ~9MiB/sec |
-| **Chinese** **CMN** 🇨🇳 | ✅ [jieba](https://github.com/messense/jieba-rs) | ✅ [compatibility decomposition](https://unicode.org/reports/tr15/) + pinyin conversion | 🟨 ~10MiB/sec | 🟧 ~5MiB/sec |
+| **Chinese** **CMN** 🇨🇳 | ✅ [jieba](https://github.com/messense/jieba-rs) | ✅ [compatibility decomposition](https://unicode.org/reports/tr15/) + kvariant conversion | 🟨 ~10MiB/sec | 🟧 ~5MiB/sec |
 | **Hebrew** 🇮🇱 | ❌ | ✅ [compatibility decomposition](https://unicode.org/reports/tr15/) + [nonspacing-marks](https://www.compart.com/en/unicode/category/Mn) removal | 🟩 ~33MiB/sec | 🟨 ~11MiB/sec |
 | **Arabic** | ✅ `ال` segmentation | ✅ [compatibility decomposition](https://unicode.org/reports/tr15/) + [nonspacing-marks](https://www.compart.com/en/unicode/category/Mn) removal + [Tatweel, Alef, Yeh, and Taa Marbuta normalization] | 🟩 ~36MiB/sec | 🟨 ~11MiB/sec |
 | **Japanese** 🇯🇵 | ✅ [lindera](https://github.com/lindera-morphology/lindera) IPA-dict | ❌ [compatibility decomposition](https://unicode.org/reports/tr15/) | 🟧 ~3MiB/sec | 🟧 ~3MiB/sec |
diff --git a/charabia/src/normalizer/chinese.rs b/charabia/src/normalizer/chinese.rs
index 4739903c..3b3d8ec0 100644
--- a/charabia/src/normalizer/chinese.rs
+++ b/charabia/src/normalizer/chinese.rs
@@ -1,3 +1,4 @@
+#[cfg(feature = "chinese-normalization-pinyin")]
 use pinyin::ToPinyin;
 
 use super::CharNormalizer;
@@ -23,14 +24,17 @@ impl CharNormalizer for ChineseNormalizer {
         // Normalize to Pinyin
         // If we don't manage to convert the kvariant, we try to convert the original character.
         // If none of them are converted, we return the kvariant.
-        match kvariant.to_pinyin().or_else(|| c.to_pinyin()) {
+        #[cfg(feature = "chinese-normalization-pinyin")]
+        let kvariant = match kvariant.to_pinyin().or_else(|| c.to_pinyin()) {
             Some(converted) => {
                 let with_tone = converted.with_tone();
 
-                Some(with_tone.to_string().into())
+                with_tone.to_string()
             }
-            None => Some(kvariant.into()), // e.g. ๆค
-        }
+            None => kvariant, // e.g. ๆค
+        };
+
+        Some(kvariant.into())
     }
 
     fn should_normalize(&self, token: &Token) -> bool {
@@ -77,6 +81,7 @@ mod test {
     }
 
     // expected result of the current Normalizer.
+    #[cfg(feature = "chinese-normalization-pinyin")]
     fn normalizer_result() -> Vec<Token<'static>> {
         vec![
             Token {
@@ -113,6 +118,7 @@
     }
 
     // expected result of the complete Normalizer pieline.
+    #[cfg(feature = "chinese-normalization-pinyin")]
     fn normalized_tokens() -> Vec<Token<'static>> {
         vec![
             Token {
@@ -148,5 +154,79 @@
         ]
     }
 
+    // expected result of the current Normalizer.
+    #[cfg(not(feature = "chinese-normalization-pinyin"))]
+    fn normalizer_result() -> Vec<Token<'static>> {
+        vec![
+            Token {
+                lemma: Owned("尊嚴".to_string()),
+                char_end: 2,
+                byte_end: 6,
+                char_map: Some(vec![(3, 3), (3, 3)]),
+                script: Script::Cj,
+                language: Some(Language::Cmn),
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("生而自由".to_string()),
+                char_end: 4,
+                byte_end: 12,
+                char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 3)]),
+                script: Script::Cj,
+                language: Some(Language::Cmn),
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("ๆพณไˆไบžๆœฌๅˆƒ๐ฃœœ".to_string()),
+                char_end: 5,
+                byte_end: 15,
+                char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 4)]),
+                script: Script::Cj,
+                language: Some(Language::Cmn),
+                ..Default::default()
+            },
+        ]
+    }
+
+    // expected result of the complete Normalizer pieline.
+    #[cfg(not(feature = "chinese-normalization-pinyin"))]
+    fn normalized_tokens() -> Vec<Token<'static>> {
+        vec![
+            Token {
+                kind: TokenKind::Word,
+                lemma: Owned("尊嚴".to_string()),
+                char_start: 0,
+                char_end: 2,
+                byte_start: 0,
+                byte_end: 6,
+                char_map: Some(vec![(3, 3), (3, 3)]),
+                script: Script::Cj,
+                language: Some(Language::Cmn),
+            },
+            Token {
+                kind: TokenKind::Word,
+                lemma: Owned("生而自由".to_string()),
+                char_start: 0,
+                char_end: 4,
+                byte_start: 0,
+                byte_end: 12,
+                char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 3)]),
+                script: Script::Cj,
+                language: Some(Language::Cmn),
+            },
+            Token {
+                kind: TokenKind::Word,
+                lemma: Owned("ๆพณไˆไบžๆœฌๅˆƒ๐ฃœœ".to_string()),
+                char_start: 0,
+                char_end: 5,
+                byte_start: 0,
+                byte_end: 15,
+                char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 4)]),
+                script: Script::Cj,
+                language: Some(Language::Cmn),
+            },
+        ]
+    }
+
     test_normalizer!(ChineseNormalizer, tokens(), normalizer_result(), normalized_tokens());
 }
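One detail worth noting in the hunk above: under `chinese-normalization-pinyin` the `kvariant` binding is shadowed by its pinyin rendering, so the single `Some(kvariant.into())` at the end serves both builds. A self-contained sketch of that cfg-gated shadowing pattern (illustrative only, independent of the `pinyin` crate):

```rust
// Illustrative only: the shadowing pattern used by the normalizer body above.
fn label(c: char) -> String {
    let label = c.to_string();

    // When the feature is enabled, refine the binding; otherwise keep it as-is.
    // The final expression below is shared by both configurations.
    #[cfg(feature = "chinese-normalization-pinyin")]
    let label = format!("{label} (pinyin)");

    label
}
```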
diff --git a/charabia/src/normalizer/control_char.rs b/charabia/src/normalizer/control_char.rs
index 94f4dd5f..c0d0d478 100644
--- a/charabia/src/normalizer/control_char.rs
+++ b/charabia/src/normalizer/control_char.rs
@@ -103,6 +103,7 @@ mod test {
     }
 
     // expected result of the complete Normalizer pieline.
+    #[cfg(feature = "chinese-normalization-pinyin")]
     fn normalized_tokens() -> Vec<Token<'static>> {
         vec![
             Token {
@@ -146,5 +147,50 @@
         ]
     }
 
+    // expected result of the complete Normalizer pieline.
+    #[cfg(not(feature = "chinese-normalization-pinyin"))]
+    fn normalized_tokens() -> Vec<Token<'static>> {
+        vec![
+            Token {
+                lemma: Owned("生而自由oo".to_string()),
+                char_end: 9,
+                byte_end: 17,
+                script: Script::Cj,
+                char_map: Some(vec![
+                    (1, 0),
+                    (3, 3),
+                    (3, 3),
+                    (3, 3),
+                    (3, 3),
+                    (1, 0),
+                    (1, 1),
+                    (1, 1),
+                    (1, 0),
+                ]),
+                kind: TokenKind::Word,
+                ..Default::default()
+            },
+            Token {
+                lemma: Owned("生而自由oo".to_string()),
+                char_end: 9,
+                byte_end: 17,
+                script: Script::Cj,
+                char_map: Some(vec![
+                    (1, 0),
+                    (3, 3),
+                    (3, 3),
+                    (3, 3),
+                    (3, 3),
+                    (1, 0),
+                    (1, 1),
+                    (1, 1),
+                    (1, 0),
+                ]),
+                kind: TokenKind::Word,
+                ..Default::default()
+            },
+        ]
+    }
+
     test_normalizer!(ControlCharNormalizer, tokens(), normalizer_result(), normalized_tokens());
 }
diff --git a/charabia/src/normalizer/mod.rs b/charabia/src/normalizer/mod.rs
index 09ae81bc..6fb3e667 100644
--- a/charabia/src/normalizer/mod.rs
+++ b/charabia/src/normalizer/mod.rs
@@ -3,7 +3,7 @@ use std::borrow::Cow;
 use once_cell::sync::Lazy;
 
 pub use self::arabic::ArabicNormalizer;
-#[cfg(feature = "chinese")]
+#[cfg(feature = "chinese-normalization")]
 pub use self::chinese::ChineseNormalizer;
 pub use self::classify::{Classifier, ClassifierOption};
 pub use self::compatibility_decomposition::CompatibilityDecompositionNormalizer;
@@ -23,7 +23,7 @@ use crate::Token;
 pub use self::ae_oe_normalizer::AeOeNormalizer;
 
 mod arabic;
-#[cfg(feature = "chinese")]
+#[cfg(feature = "chinese-normalization")]
 mod chinese;
 mod classify;
 mod compatibility_decomposition;
@@ -55,7 +55,7 @@ pub static LOSSY_NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
         Box::new(LowercaseNormalizer),
         Box::new(QuoteNormalizer),
         Box::new(AeOeNormalizer),
-        #[cfg(feature = "chinese")]
+        #[cfg(feature = "chinese-normalization")]
         Box::new(ChineseNormalizer),
         #[cfg(feature = "japanese-transliteration")]
         Box::new(JapaneseNormalizer),
diff --git a/charabia/src/segmenter/chinese.rs b/charabia/src/segmenter/chinese.rs
index 631936ff..9af1ad11 100644
--- a/charabia/src/segmenter/chinese.rs
+++ b/charabia/src/segmenter/chinese.rs
@@ -64,6 +64,7 @@ mod test {
     ];
 
     // Segmented and normalized version of the text.
+    #[cfg(feature = "chinese-normalization-pinyin")]
     const TOKENIZED: &[&str] = &[
         "rénrén",
         "shēngérzìyóu",
@@ -99,6 +100,42 @@ mod test {
         "。",
     ];
 
+    #[cfg(not(feature = "chinese-normalization-pinyin"))]
+    const TOKENIZED: &[&str] = &[
+        "人人",
+        "生而自由",
+        ",",
+        "在",
+        "尊",
+        "嚴",
+        "和",
+        "權",
+        "利",
+        "上",
+        "一律平等",
+        "。",
+        "他",
+        "們",
+        "賦",
+        "有",
+        "理性",
+        "和",
+        "良心",
+        ",",
+        "並",
+        "應",
+        "以",
+        "兄弟",
+        "關",
+        "係",
+        "的",
+        "精神",
+        "互相",
+        "對",
+        "待",
+        "。",
+    ];
+
     // Macro that run several tests on the Segmenter.
     test_segmenter!(ChineseSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Cj, Language::Cmn);
 }
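Taken together with the normalizer change, the two `TOKENIZED` variants above boil down to the following crate-level behaviour — a rough sketch assuming the default `chinese` feature set and the public `Tokenize` trait, with both expected lemmas copied from the test constants in this patch:

```rust
use charabia::Tokenize;

fn main() {
    let text = "人人生而自由,在尊嚴和權利上一律平等。";
    let first = text.tokenize().next().unwrap();

    // Without `chinese-normalization-pinyin`, lemmas keep their (kvariant-normalized) Han form.
    #[cfg(not(feature = "chinese-normalization-pinyin"))]
    assert_eq!(first.lemma(), "人人");

    // With the feature enabled, the pre-0.8.9 pinyin lemmas are preserved.
    #[cfg(feature = "chinese-normalization-pinyin")]
    assert_eq!(first.lemma(), "rénrén");
}
```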
diff --git a/charabia/src/segmenter/japanese.rs b/charabia/src/segmenter/japanese.rs
index da256718..e19c4e94 100644
--- a/charabia/src/segmenter/japanese.rs
+++ b/charabia/src/segmenter/japanese.rs
@@ -1,8 +1,6 @@
-use lindera_core::mode::Mode;
 #[cfg(feature = "japanese-segmentation-ipadic")]
-use lindera_core::mode::Penalty;
-use lindera_dictionary::{DictionaryConfig, DictionaryKind};
-use lindera_tokenizer::tokenizer::{Tokenizer, TokenizerConfig};
+use lindera::Penalty;
+use lindera::{DictionaryConfig, DictionaryKind, Mode, Tokenizer, TokenizerConfig};
 use once_cell::sync::Lazy;
 
 use crate::segmenter::Segmenter;
diff --git a/charabia/src/segmenter/korean.rs b/charabia/src/segmenter/korean.rs
index b0c001b5..31604929 100644
--- a/charabia/src/segmenter/korean.rs
+++ b/charabia/src/segmenter/korean.rs
@@ -1,6 +1,4 @@
-use lindera_core::mode::{Mode, Penalty};
-use lindera_dictionary::{DictionaryConfig, DictionaryKind};
-use lindera_tokenizer::tokenizer::{Tokenizer, TokenizerConfig};
+use lindera::{DictionaryConfig, DictionaryKind, Mode, Penalty, Tokenizer, TokenizerConfig};
 use once_cell::sync::Lazy;
 
 use crate::segmenter::Segmenter;
diff --git a/charabia/src/segmenter/mod.rs b/charabia/src/segmenter/mod.rs
index 0cff67d9..579d3389 100644
--- a/charabia/src/segmenter/mod.rs
+++ b/charabia/src/segmenter/mod.rs
@@ -3,11 +3,13 @@ use std::collections::HashMap;
 
 use aho_corasick::{AhoCorasick, FindIter, MatchKind};
 pub use arabic::ArabicSegmenter;
-#[cfg(feature = "chinese")]
+#[cfg(feature = "chinese-segmentation")]
 pub use chinese::ChineseSegmenter;
 use either::Either;
 #[cfg(feature = "japanese")]
 pub use japanese::JapaneseSegmenter;
+#[cfg(feature = "khmer")]
+pub use khmer::KhmerSegmenter;
 #[cfg(feature = "korean")]
 pub use korean::KoreanSegmenter;
 pub use latin::LatinSegmenter;
@@ -16,15 +18,12 @@ use slice_group_by::StrGroupBy;
 #[cfg(feature = "thai")]
 pub use thai::ThaiSegmenter;
 
-#[cfg(feature = "khmer")]
-pub use khmer::KhmerSegmenter;
-
 use crate::detection::{Detect, Language, Script, StrDetection};
 use crate::separators::DEFAULT_SEPARATORS;
 use crate::token::Token;
 
 mod arabic;
-#[cfg(feature = "chinese")]
+#[cfg(feature = "chinese-segmentation")]
 mod chinese;
 #[cfg(feature = "japanese")]
 mod japanese;
@@ -54,7 +53,7 @@ pub static SEGMENTERS: Lazy<HashMap<(Script, Language), Box<dyn Segmenter>>> = Lazy::new(|| {
     // latin segmenter
     ((Script::Latin, Language::Other), Box::new(LatinSegmenter) as Box<dyn Segmenter>),
     // chinese segmenter
-    #[cfg(feature = "chinese")]
+    #[cfg(feature = "chinese-segmentation")]
     ((Script::Cj, Language::Cmn), Box::new(ChineseSegmenter) as Box<dyn Segmenter>),
     // japanese segmenter
     #[cfg(feature = "japanese")]
diff --git a/charabia/src/separators.rs b/charabia/src/separators.rs
index 3d4a8498..60a00483 100644
--- a/charabia/src/separators.rs
+++ b/charabia/src/separators.rs
@@ -59,7 +59,7 @@ pub const DEFAULT_SEPARATORS: &[&str] = &[
     "๐‘ช ", "๐‘ชก", "๐‘ชข", "๐‘ฑ", "๐‘ฑ‚", "๐‘ฑƒ", "๐‘ฑ„", "๐‘ฑ…", "๐‘ฑฐ", "๐‘ฑฑ", "๐‘ปท", "๐‘ปธ", "๐‘ฟฟ", "๐’‘ฐ", "๐’‘ฑ", "๐’‘ฒ", "๐’‘ณ", "๐’‘ด",
     "๐–ฉฎ", "๐–ฉฏ", "๐–ซต", "๐–ฌท", "๐–ฌธ", "๐–ฌน", "๐–ฌบ", "๐–ฌป", "๐–ญ„", "๐–บ—", "๐–บ˜", "๐–บ™", "๐–บš", "๐–ฟข", "๐›ฒŸ", "๐ช‡", "๐ชˆ", "๐ช‰", "๐ชŠ", "๐ช‹", "๐žฅž", "๐žฅŸ",
     "\n", "\r", "\u{2029}", " ", " ", " ", " ", " ", " ", " ", " ", " ", " ", " ", " ",
-    " ", "　", "`"
+    " ", "　", "`", "\t"
 ];
 
 #[rustfmt::skip]
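The only functional change in the list above is the trailing `"\t"`: a tab now sits in the default separator set alongside the other whitespace entries. A small sanity-check sketch (not taken from the test suite, assuming the default features):

```rust
use charabia::Tokenize;

fn main() {
    // The tab between the two words acts as a separator, so only "foo" and
    // "bar" come out as word tokens.
    let words: Vec<String> = "foo\tbar"
        .tokenize()
        .filter(|token| token.is_word())
        .map(|token| token.lemma().to_string())
        .collect();
    assert_eq!(words, ["foo", "bar"]);
}
```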
diff --git a/charabia/src/tokenizer.rs b/charabia/src/tokenizer.rs
index fc1e1aac..e9b5f4dd 100644
--- a/charabia/src/tokenizer.rs
+++ b/charabia/src/tokenizer.rs
@@ -313,26 +313,25 @@ impl<'tb, A: AsRef<[u8]>> TokenizerBuilder<'tb, A> {
         // TODO: avoid recreating the automaton if nothing changed
         match (self.normalizer_option.classifier.separators, self.words_dict) {
             (Some(separators), None) => {
+                let pattern = separators.iter().filter(|s| !s.is_empty());
                 let aho = AhoCorasick::builder()
                     .match_kind(MatchKind::LeftmostLongest)
-                    .build(separators)
+                    .build(pattern)
                     .unwrap();
-                self.segmenter_option.aho = Some(aho);
+                self.segmenter_option.aho = Some(aho).filter(|aho| aho.patterns_len() != 0);
             }
             (separators, Some(words)) => {
                 // use the default separators' list if a custom words' list is given but no custom separators' list.
                 let separators = separators.unwrap_or(DEFAULT_SEPARATORS);
                 // merge both lists together and create the Aho-Corasick automaton.
-                let mut vec = Vec::with_capacity(separators.len() + words.len());
-                vec.extend_from_slice(words);
-                vec.extend_from_slice(separators);
+                let pattern = words.iter().chain(separators).filter(|s| !s.is_empty());
                 let aho = AhoCorasick::builder()
                     .match_kind(MatchKind::LeftmostLongest)
-                    .build(vec)
+                    .build(pattern)
                     .unwrap();
-                self.segmenter_option.aho = Some(aho);
+                self.segmenter_option.aho = Some(aho).filter(|aho| aho.patterns_len() != 0);
             }
             // reset the state in case the builder is reused.
             (None, None) => self.segmenter_option.aho = None,
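In the last hunk, empty strings are filtered out of the separator and words-dict patterns before the Aho-Corasick automaton is built, and when nothing is left the automaton is simply not stored (`Some(aho).filter(|aho| aho.patterns_len() != 0)`), presumably so the segmenter can fall back to its defaults rather than carry a pattern-less automaton. A hedged usage sketch — assuming the builder's `separators`, `build`, and `tokenize` methods behave as in the released crate, and without asserting the exact pre-fix failure mode:

```rust
use charabia::TokenizerBuilder;

fn main() {
    // A custom separator list that is empty (or contains only empty strings)
    // no longer turns into a degenerate automaton inside the tokenizer.
    let mut builder = TokenizerBuilder::new();
    builder.separators(&[]);
    let tokenizer = builder.build();

    // Tokenization still proceeds normally.
    let token_count = tokenizer.tokenize("The quick brown fox").count();
    assert!(token_count > 1);
}
```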