From 7acd9838cecb86d55b3b95a31a61e653048e55c6 Mon Sep 17 00:00:00 2001 From: Minoru OSUKA Date: Sat, 13 Apr 2024 22:51:44 +0900 Subject: [PATCH 1/8] Update Lindera to 0.30.0 --- charabia/Cargo.toml | 10 ++++------ charabia/src/segmenter/japanese.rs | 6 ++---- charabia/src/segmenter/korean.rs | 4 +--- 3 files changed, 7 insertions(+), 13 deletions(-) diff --git a/charabia/Cargo.toml b/charabia/Cargo.toml index ac738b4c..4b9819a0 100644 --- a/charabia/Cargo.toml +++ b/charabia/Cargo.toml @@ -24,9 +24,7 @@ once_cell = "1.19.0" serde = "1.0" slice-group-by = "0.3.1" whatlang = "0.16.4" -lindera-core = "=0.28.0" -lindera-dictionary = "=0.28.0" -lindera-tokenizer = { version = "=0.28.0", default-features = false, optional = true } +lindera = { version = "=0.30.0", default-features = false, optional = true } pinyin = { version = "0.10", default-features = false, features = [ "with_tone", ], optional = true } @@ -47,12 +45,12 @@ hebrew = [] # allow japanese specialized tokenization japanese = ["japanese-segmentation-unidic"] -japanese-segmentation-ipadic = ["lindera-tokenizer/ipadic", "lindera-tokenizer/ipadic-compress"] -japanese-segmentation-unidic = ["lindera-tokenizer/unidic", "lindera-tokenizer/unidic-compress"] +japanese-segmentation-ipadic = ["lindera/ipadic", "lindera/compress"] +japanese-segmentation-unidic = ["lindera/unidic", "lindera/compress"] japanese-transliteration = ["dep:wana_kana"] # allow korean specialized tokenization -korean = ["lindera-tokenizer/ko-dic", "lindera-tokenizer/ko-dic-compress"] +korean = ["lindera/ko-dic", "lindera/compress"] # allow thai specialized tokenization thai = [] diff --git a/charabia/src/segmenter/japanese.rs b/charabia/src/segmenter/japanese.rs index da256718..e19c4e94 100644 --- a/charabia/src/segmenter/japanese.rs +++ b/charabia/src/segmenter/japanese.rs @@ -1,8 +1,6 @@ -use lindera_core::mode::Mode; #[cfg(feature = "japanese-segmentation-ipadic")] -use lindera_core::mode::Penalty; -use lindera_dictionary::{DictionaryConfig, DictionaryKind}; -use lindera_tokenizer::tokenizer::{Tokenizer, TokenizerConfig}; +use lindera::Penalty; +use lindera::{DictionaryConfig, DictionaryKind, Mode, Tokenizer, TokenizerConfig}; use once_cell::sync::Lazy; use crate::segmenter::Segmenter; diff --git a/charabia/src/segmenter/korean.rs b/charabia/src/segmenter/korean.rs index b0c001b5..31604929 100644 --- a/charabia/src/segmenter/korean.rs +++ b/charabia/src/segmenter/korean.rs @@ -1,6 +1,4 @@ -use lindera_core::mode::{Mode, Penalty}; -use lindera_dictionary::{DictionaryConfig, DictionaryKind}; -use lindera_tokenizer::tokenizer::{Tokenizer, TokenizerConfig}; +use lindera::{DictionaryConfig, DictionaryKind, Mode, Penalty, Tokenizer, TokenizerConfig}; use once_cell::sync::Lazy; use crate::segmenter::Segmenter; From e41dfd5ede94c399cd6f9b0c0df198ece0e508b2 Mon Sep 17 00:00:00 2001 From: Gusted Date: Mon, 15 Apr 2024 23:54:47 +0200 Subject: [PATCH 2/8] Add `\t` as a recognized separator Currently, `\t` isn't seen as a recognized separator. This was causing issues for meilisearch: when it searched for a keyword (fuzzy or exact match) that was present in a document but preceded by a `\t`, charabia would create a token that was `\t`, which in turn led to meilisearch returning the document as part of the search results but not returning the positions of the matches (the `_matchesPosition` field).
The actual reproducer for this bug was code files from the Linux kernel (such as `fs/ext4/readpage.c`), which use tabs for indentation; searching for keywords like `while` would usually hit occurrences 'prefixed' by a tab, causing the described issue. Making `\t` a separator fixed this issue. --- charabia/src/separators.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charabia/src/separators.rs b/charabia/src/separators.rs index 3d4a8498..60a00483 100644 --- a/charabia/src/separators.rs +++ b/charabia/src/separators.rs @@ -59,7 +59,7 @@ pub const DEFAULT_SEPARATORS: &[&str] = &[ "๐‘ช ", "๐‘ชก", "๐‘ชข", "๐‘ฑ", "๐‘ฑ‚", "๐‘ฑƒ", "๐‘ฑ„", "๐‘ฑ…", "๐‘ฑฐ", "๐‘ฑฑ", "๐‘ปท", "๐‘ปธ", "๐‘ฟฟ", "๐’‘ฐ", "๐’‘ฑ", "๐’‘ฒ", "๐’‘ณ", "๐’‘ด", "๐–ฉฎ", "๐–ฉฏ", "๐–ซต", "๐–ฌท", "๐–ฌธ", "๐–ฌน", "๐–ฌบ", "๐–ฌป", "๐–ญ„", "๐–บ—", "๐–บ˜", "๐–บ™", "๐–บš", "๐–ฟข", "๐›ฒŸ", "๐ช‡", "๐ชˆ", "๐ช‰", "๐ชŠ", "๐ช‹", "๐žฅž", "๐žฅŸ", "\n", "\r", "\u{2029}", " ", "แš€", "โ€‚", "โ€ƒ", "โ€‚", "โ€ƒ", "โ€„", "โ€…", "โ€†", "โ€‡", "โ€ˆ", "โ€‰", - "โ€Š", "ใ€€", "`" + "โ€Š", "ใ€€", "`", "\t" ]; #[rustfmt::skip] From b7d1c991868f6f0c50a8f73cf0a8c50c01ae45ba Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 17 Apr 2024 15:31:13 +0200 Subject: [PATCH 3/8] Filter empty tokens before inserting them into the AhoCorasick automaton, avoiding a char boundary panic --- charabia/src/tokenizer.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/charabia/src/tokenizer.rs b/charabia/src/tokenizer.rs index fc1e1aac..7e4c5a93 100644 --- a/charabia/src/tokenizer.rs +++ b/charabia/src/tokenizer.rs @@ -313,26 +313,26 @@ impl<'tb, A: AsRef<[u8]>> TokenizerBuilder<'tb, A> { // TODO: avoid recreating the automaton if nothing changed match (self.normalizer_option.classifier.separators, self.words_dict) { (Some(separators), None) => { + let pattern = separators.into_iter().filter(|s| !s.is_empty()); let aho = AhoCorasick::builder() .match_kind(MatchKind::LeftmostLongest) - .build(separators) + .build(pattern) .unwrap(); - self.segmenter_option.aho = Some(aho); + self.segmenter_option.aho = Some(aho).filter(|aho| aho.patterns_len() != 0); } (separators, Some(words)) => { // use the default separators' list if a custom words' list is given but no custom separators' list. let separators = separators.unwrap_or(DEFAULT_SEPARATORS); // merge both lists together and create the Aho-Corasick automaton. - let mut vec = Vec::with_capacity(separators.len() + words.len()); - vec.extend_from_slice(words); - vec.extend_from_slice(separators); + let pattern = + words.into_iter().chain(separators.into_iter()).filter(|s| !s.is_empty()); let aho = AhoCorasick::builder() .match_kind(MatchKind::LeftmostLongest) - .build(vec) + .build(pattern) .unwrap(); - self.segmenter_option.aho = Some(aho); + self.segmenter_option.aho = Some(aho).filter(|aho| aho.patterns_len() != 0); } // reset the state in case the builder is reused.
(None, None) => self.segmenter_option.aho = None, From 5f2c7377eeb5ae0f40983398a978941abb0c6438 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 17 Apr 2024 16:56:12 +0200 Subject: [PATCH 4/8] Make the pinyin-normalization optional --- .github/workflows/rust.yml | 2 + charabia/Cargo.toml | 5 +- charabia/src/normalizer/chinese.rs | 88 +++++++++++++++++++++++-- charabia/src/normalizer/control_char.rs | 46 +++++++++++++ charabia/src/normalizer/mod.rs | 6 +- charabia/src/segmenter/chinese.rs | 37 +++++++++++ charabia/src/segmenter/mod.rs | 11 ++-- 7 files changed, 181 insertions(+), 14 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index e98a3fc8..c0fb7664 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -26,6 +26,8 @@ jobs: run: cargo test --verbose - name: Run tests with japanese-transliteration on run: cargo test --verbose --features japanese-transliteration + - name: Run tests with chinese-normalization-pinyin on + run: cargo test --verbose --features chinese chinese-normalization-pinyin - name: Run irg-kvariants tests run: cargo test -p irg-kvariants --verbose diff --git a/charabia/Cargo.toml b/charabia/Cargo.toml index ac738b4c..3c17b8e4 100644 --- a/charabia/Cargo.toml +++ b/charabia/Cargo.toml @@ -40,7 +40,10 @@ zerovec = "0.10.1" default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese"] # allow chinese specialized tokenization -chinese = ["dep:pinyin", "dep:jieba-rs"] +chinese = ["chinese-segmentation", "chinese-normalization"] +chinese-segmentation = ["dep:jieba-rs"] +chinese-normalization = [] +chinese-normalization-pinyin = ["dep:pinyin", "chinese-normalization"] # allow hebrew specialized tokenization hebrew = [] diff --git a/charabia/src/normalizer/chinese.rs b/charabia/src/normalizer/chinese.rs index 4739903c..3b3d8ec0 100644 --- a/charabia/src/normalizer/chinese.rs +++ b/charabia/src/normalizer/chinese.rs @@ -1,3 +1,4 @@ +#[cfg(feature = "chinese-normalization-pinyin")] use pinyin::ToPinyin; use super::CharNormalizer; @@ -23,14 +24,17 @@ impl CharNormalizer for ChineseNormalizer { // Normalize to Pinyin // If we don't manage to convert the kvariant, we try to convert the original character. // If none of them are converted, we return the kvariant. - match kvariant.to_pinyin().or_else(|| c.to_pinyin()) { + #[cfg(feature = "chinese-normalization-pinyin")] + let kvariant = match kvariant.to_pinyin().or_else(|| c.to_pinyin()) { Some(converted) => { let with_tone = converted.with_tone(); - Some(with_tone.to_string().into()) + with_tone.to_string() } - None => Some(kvariant.into()), // e.g. ๆค - } + None => kvariant, // e.g. ๆค + }; + + Some(kvariant.into()) } fn should_normalize(&self, token: &Token) -> bool { @@ -77,6 +81,7 @@ mod test { } // expected result of the current Normalizer. + #[cfg(feature = "chinese-normalization-pinyin")] fn normalizer_result() -> Vec> { vec![ Token { @@ -113,6 +118,7 @@ mod test { } // expected result of the complete Normalizer pieline. + #[cfg(feature = "chinese-normalization-pinyin")] fn normalized_tokens() -> Vec> { vec![ Token { @@ -148,5 +154,79 @@ mod test { ] } + // expected result of the current Normalizer. 
+ #[cfg(not(feature = "chinese-normalization-pinyin"))] + fn normalizer_result() -> Vec> { + vec![ + Token { + lemma: Owned("ๅฐŠๅšด".to_string()), + char_end: 2, + byte_end: 6, + char_map: Some(vec![(3, 3), (3, 3)]), + script: Script::Cj, + language: Some(Language::Cmn), + ..Default::default() + }, + Token { + lemma: Owned("็”Ÿ่€Œ่‡ช็”ฑ".to_string()), + char_end: 4, + byte_end: 12, + char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 3)]), + script: Script::Cj, + language: Some(Language::Cmn), + ..Default::default() + }, + Token { + lemma: Owned("ๆพณไˆไบžๆœฌๅˆƒ๐ฃœœ".to_string()), + char_end: 5, + byte_end: 15, + char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 4)]), + script: Script::Cj, + language: Some(Language::Cmn), + ..Default::default() + }, + ] + } + + // expected result of the complete Normalizer pieline. + #[cfg(not(feature = "chinese-normalization-pinyin"))] + fn normalized_tokens() -> Vec> { + vec![ + Token { + kind: TokenKind::Word, + lemma: Owned("ๅฐŠๅšด".to_string()), + char_start: 0, + char_end: 2, + byte_start: 0, + byte_end: 6, + char_map: Some(vec![(3, 3), (3, 3)]), + script: Script::Cj, + language: Some(Language::Cmn), + }, + Token { + kind: TokenKind::Word, + lemma: Owned("็”Ÿ่€Œ่‡ช็”ฑ".to_string()), + char_start: 0, + char_end: 4, + byte_start: 0, + byte_end: 12, + char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 3)]), + script: Script::Cj, + language: Some(Language::Cmn), + }, + Token { + kind: TokenKind::Word, + lemma: Owned("ๆพณไˆไบžๆœฌๅˆƒ๐ฃœœ".to_string()), + char_start: 0, + char_end: 5, + byte_start: 0, + byte_end: 15, + char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 4)]), + script: Script::Cj, + language: Some(Language::Cmn), + }, + ] + } + test_normalizer!(ChineseNormalizer, tokens(), normalizer_result(), normalized_tokens()); } diff --git a/charabia/src/normalizer/control_char.rs b/charabia/src/normalizer/control_char.rs index 94f4dd5f..c0d0d478 100644 --- a/charabia/src/normalizer/control_char.rs +++ b/charabia/src/normalizer/control_char.rs @@ -103,6 +103,7 @@ mod test { } // expected result of the complete Normalizer pieline. + #[cfg(feature = "chinese-normalization-pinyin")] fn normalized_tokens() -> Vec> { vec![ Token { @@ -146,5 +147,50 @@ mod test { ] } + // expected result of the complete Normalizer pieline. 
+ #[cfg(not(feature = "chinese-normalization-pinyin"))] + fn normalized_tokens() -> Vec> { + vec![ + Token { + lemma: Owned("็”Ÿ่€Œ่‡ช็”ฑoo".to_string()), + char_end: 9, + byte_end: 17, + script: Script::Cj, + char_map: Some(vec![ + (1, 0), + (3, 3), + (3, 3), + (3, 3), + (3, 3), + (1, 0), + (1, 1), + (1, 1), + (1, 0), + ]), + kind: TokenKind::Word, + ..Default::default() + }, + Token { + lemma: Owned("็”Ÿ่€Œ่‡ช็”ฑoo".to_string()), + char_end: 9, + byte_end: 17, + script: Script::Cj, + char_map: Some(vec![ + (1, 0), + (3, 3), + (3, 3), + (3, 3), + (3, 3), + (1, 0), + (1, 1), + (1, 1), + (1, 0), + ]), + kind: TokenKind::Word, + ..Default::default() + }, + ] + } + test_normalizer!(ControlCharNormalizer, tokens(), normalizer_result(), normalized_tokens()); } diff --git a/charabia/src/normalizer/mod.rs b/charabia/src/normalizer/mod.rs index 3f105518..31c783b3 100644 --- a/charabia/src/normalizer/mod.rs +++ b/charabia/src/normalizer/mod.rs @@ -3,7 +3,7 @@ use std::borrow::Cow; use once_cell::sync::Lazy; pub use self::arabic::ArabicNormalizer; -#[cfg(feature = "chinese")] +#[cfg(feature = "chinese-normalization")] pub use self::chinese::ChineseNormalizer; pub use self::classify::{Classifier, ClassifierOption}; pub use self::compatibility_decomposition::CompatibilityDecompositionNormalizer; @@ -21,7 +21,7 @@ use crate::segmenter::SegmentedTokenIter; use crate::Token; mod arabic; -#[cfg(feature = "chinese")] +#[cfg(feature = "chinese-normalization")] mod chinese; mod classify; mod compatibility_decomposition; @@ -50,7 +50,7 @@ pub static LOSSY_NORMALIZERS: Lazy>> = Lazy::new(|| { vec![ Box::new(LowercaseNormalizer), Box::new(QuoteNormalizer), - #[cfg(feature = "chinese")] + #[cfg(feature = "chinese-normalization")] Box::new(ChineseNormalizer), #[cfg(feature = "japanese-transliteration")] Box::new(JapaneseNormalizer), diff --git a/charabia/src/segmenter/chinese.rs b/charabia/src/segmenter/chinese.rs index 631936ff..9af1ad11 100644 --- a/charabia/src/segmenter/chinese.rs +++ b/charabia/src/segmenter/chinese.rs @@ -64,6 +64,7 @@ mod test { ]; // Segmented and normalized version of the text. + #[cfg(feature = "chinese-normalization-pinyin")] const TOKENIZED: &[&str] = &[ "rรฉnrรฉn", "shฤ“ngรฉrzรฌyรณu", @@ -99,6 +100,42 @@ mod test { "ใ€‚", ]; + #[cfg(not(feature = "chinese-normalization-pinyin"))] + const TOKENIZED: &[&str] = &[ + "ไบบไบบ", + "็”Ÿ่€Œ่‡ช็”ฑ", + ",", + "ๅœจ", + "ๅฐŠ", + "ๅšด", + "ๅ’Œ", + "ๆฌŠ", + "ๅˆฉ", + "ไธŠ", + "ไธ€ๅพ‹ๅนณ็ญ‰", + "ใ€‚", + "ไป–", + "ๅ€‘", + "่ณฆ", + "ๆœ‰", + "็†ๆ€ง", + "ๅ’Œ", + "่‰ฏๅฟƒ", + ",", + "ไธฆ", + "ๆ‡‰", + "ไปฅ", + "ๅ…„ๅผŸ", + "้—œ", + "ไฟ‚", + "็š„", + "็ฒพ็ฅž", + "ไบ’็›ธ", + "ๅฐ", + "ๅพ…", + "ใ€‚", + ]; + // Macro that run several tests on the Segmenter. 
test_segmenter!(ChineseSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Cj, Language::Cmn); } diff --git a/charabia/src/segmenter/mod.rs b/charabia/src/segmenter/mod.rs index 0cff67d9..579d3389 100644 --- a/charabia/src/segmenter/mod.rs +++ b/charabia/src/segmenter/mod.rs @@ -3,11 +3,13 @@ use std::collections::HashMap; use aho_corasick::{AhoCorasick, FindIter, MatchKind}; pub use arabic::ArabicSegmenter; -#[cfg(feature = "chinese")] +#[cfg(feature = "chinese-segmentation")] pub use chinese::ChineseSegmenter; use either::Either; #[cfg(feature = "japanese")] pub use japanese::JapaneseSegmenter; +#[cfg(feature = "khmer")] +pub use khmer::KhmerSegmenter; #[cfg(feature = "korean")] pub use korean::KoreanSegmenter; pub use latin::LatinSegmenter; @@ -16,15 +18,12 @@ use slice_group_by::StrGroupBy; #[cfg(feature = "thai")] pub use thai::ThaiSegmenter; -#[cfg(feature = "khmer")] -pub use khmer::KhmerSegmenter; - use crate::detection::{Detect, Language, Script, StrDetection}; use crate::separators::DEFAULT_SEPARATORS; use crate::token::Token; mod arabic; -#[cfg(feature = "chinese")] +#[cfg(feature = "chinese-segmentation")] mod chinese; #[cfg(feature = "japanese")] mod japanese; @@ -54,7 +53,7 @@ pub static SEGMENTERS: Lazy>> = L // latin segmenter ((Script::Latin, Language::Other), Box::new(LatinSegmenter) as Box), // chinese segmenter - #[cfg(feature = "chinese")] + #[cfg(feature = "chinese-segmentation")] ((Script::Cj, Language::Cmn), Box::new(ChineseSegmenter) as Box), // japanese segmenter #[cfg(feature = "japanese")] From 2b61ed2fa1a0a6cca00ee1ce04d0aa62d7bfc09e Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 17 Apr 2024 17:04:32 +0200 Subject: [PATCH 5/8] clippy --- charabia/src/tokenizer.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/charabia/src/tokenizer.rs b/charabia/src/tokenizer.rs index 7e4c5a93..2f706e9f 100644 --- a/charabia/src/tokenizer.rs +++ b/charabia/src/tokenizer.rs @@ -313,7 +313,7 @@ impl<'tb, A: AsRef<[u8]>> TokenizerBuilder<'tb, A> { // TODO: avoid recreating the automaton if nothing changed match (self.normalizer_option.classifier.separators, self.words_dict) { (Some(separators), None) => { - let pattern = separators.into_iter().filter(|s| !s.is_empty()); + let pattern = separators.iter().filter(|s| !s.is_empty()); let aho = AhoCorasick::builder() .match_kind(MatchKind::LeftmostLongest) .build(pattern) @@ -326,7 +326,7 @@ impl<'tb, A: AsRef<[u8]>> TokenizerBuilder<'tb, A> { let separators = separators.unwrap_or(DEFAULT_SEPARATORS); // merge both lists together and create the Aho-Corasick automaton. let pattern = - words.into_iter().chain(separators.into_iter()).filter(|s| !s.is_empty()); + words.iter().chain(separators).filter(|s| !s.is_empty()); let aho = AhoCorasick::builder() .match_kind(MatchKind::LeftmostLongest) .build(pattern) From 654a0c970c85ac79b773bb892626f6a58aacd3d3 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 17 Apr 2024 17:07:28 +0200 Subject: [PATCH 6/8] rustfmt --- charabia/src/tokenizer.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/charabia/src/tokenizer.rs b/charabia/src/tokenizer.rs index 2f706e9f..e9b5f4dd 100644 --- a/charabia/src/tokenizer.rs +++ b/charabia/src/tokenizer.rs @@ -325,8 +325,7 @@ impl<'tb, A: AsRef<[u8]>> TokenizerBuilder<'tb, A> { // use the default separators' list if a custom words' list is given but no custom separators' list. let separators = separators.unwrap_or(DEFAULT_SEPARATORS); // merge both lists together and create the Aho-Corasick automaton. 
- let pattern = - words.iter().chain(separators).filter(|s| !s.is_empty()); + let pattern = words.iter().chain(separators).filter(|s| !s.is_empty()); let aho = AhoCorasick::builder() .match_kind(MatchKind::LeftmostLongest) .build(pattern) From 2796c4ff46c640cc8f67a1005825ae52912d444e Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 18 Apr 2024 08:13:38 +0000 Subject: [PATCH 7/8] Update version for the next release (v0.8.9) in Cargo.toml files --- charabia/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charabia/Cargo.toml b/charabia/Cargo.toml index a3071075..0b6628e8 100644 --- a/charabia/Cargo.toml +++ b/charabia/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "charabia" -version = "0.8.8" +version = "0.8.9" license = "MIT" authors = ["Many "] edition = "2021" From 0547264431d2f4ae05d222863a9947e547d623bf Mon Sep 17 00:00:00 2001 From: Many the fish Date: Thu, 18 Apr 2024 10:18:16 +0200 Subject: [PATCH 8/8] Update README.md --- charabia/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charabia/README.md b/charabia/README.md index 50a76848..9bc0d01b 100644 --- a/charabia/README.md +++ b/charabia/README.md @@ -19,7 +19,7 @@ Charabia provides a simple API to segment, normalize, or tokenize (segment + nor | **Latin** | โœ… CamelCase segmentation | โœ… [compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + [nonspacing-marks](https://www.compart.com/en/unicode/category/Mn) removal + `ร vs ฤ` spoofing normalization | ๐ŸŸฉ ~23MiB/sec | ๐ŸŸจ ~9MiB/sec | | **Greek** | โŒ | โœ… [compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + final sigma normalization | ๐ŸŸฉ ~27MiB/sec | ๐ŸŸจ ~8MiB/sec | | **Cyrillic** - **Georgian** | โŒ | โœ… [compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase | ๐ŸŸฉ ~27MiB/sec | ๐ŸŸจ ~9MiB/sec | -| **Chinese** **CMN** ๐Ÿ‡จ๐Ÿ‡ณ | โœ… [jieba](https://github.com/messense/jieba-rs) | โœ… [compatibility decomposition](https://unicode.org/reports/tr15/) + pinyin conversion | ๐ŸŸจ ~10MiB/sec | ๐ŸŸง ~5MiB/sec | +| **Chinese** **CMN** ๐Ÿ‡จ๐Ÿ‡ณ | โœ… [jieba](https://github.com/messense/jieba-rs) | โœ… [compatibility decomposition](https://unicode.org/reports/tr15/) + kvariant conversion | ๐ŸŸจ ~10MiB/sec | ๐ŸŸง ~5MiB/sec | | **Hebrew** ๐Ÿ‡ฎ๐Ÿ‡ฑ | โŒ | โœ… [compatibility decomposition](https://unicode.org/reports/tr15/) + [nonspacing-marks](https://www.compart.com/en/unicode/category/Mn) removal | ๐ŸŸฉ ~33MiB/sec | ๐ŸŸจ ~11MiB/sec | | **Arabic** | โœ… `ุงู„` segmentation | โœ… [compatibility decomposition](https://unicode.org/reports/tr15/) + [nonspacing-marks](https://www.compart.com/en/unicode/category/Mn) removal + [Tatweel, Alef, Yeh, and Taa Marbuta normalization] | ๐ŸŸฉ ~36MiB/sec | ๐ŸŸจ ~11MiB/sec | | **Japanese** ๐Ÿ‡ฏ๐Ÿ‡ต | โœ… [lindera](https://github.com/lindera-morphology/lindera) IPA-dict | โŒ [compatibility decomposition](https://unicode.org/reports/tr15/) | ๐ŸŸง ~3MiB/sec | ๐ŸŸง ~3MiB/sec |
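Note: with the feature split introduced in PATCH 4/8, downstream crates can enable Chinese segmentation and kvariant normalization without pulling in the lossy pinyin conversion. A minimal downstream Cargo.toml sketch follows; the consuming crate is hypothetical, while the version number is taken from PATCH 7/8 and the feature names from PATCH 4/8:

[dependencies]
# "chinese" now only implies "chinese-segmentation" (jieba) and
# "chinese-normalization" (kvariant conversion); uncomment the extra
# feature below to opt back into the pinyin normalization.
charabia = { version = "0.8.9", default-features = false, features = [
    "chinese",
    # "chinese-normalization-pinyin",
] }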