Merge branch 'main' of github.com:Soham1803/charabia into feat/normalize-ae-oe
Soham1803 committed Apr 26, 2024
2 parents 4f29a36 + 5a64163 commit 2ff1080
Showing 12 changed files with 197 additions and 37 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/rust.yml
@@ -26,6 +26,8 @@ jobs:
run: cargo test --verbose
- name: Run tests with japanese-transliteration on
run: cargo test --verbose --features japanese-transliteration
- name: Run tests with chinese-normalization-pinyin on
run: cargo test --verbose --features chinese chinese-normalization-pinyin
- name: Run irg-kvariants tests
run: cargo test -p irg-kvariants --verbose

17 changes: 9 additions & 8 deletions charabia/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "charabia"
version = "0.8.8"
version = "0.8.9"
license = "MIT"
authors = ["Many <[email protected]>"]
edition = "2021"
@@ -24,9 +24,7 @@ once_cell = "1.19.0"
serde = "1.0"
slice-group-by = "0.3.1"
whatlang = "0.16.4"
lindera-core = "=0.28.0"
lindera-dictionary = "=0.28.0"
lindera-tokenizer = { version = "=0.28.0", default-features = false, optional = true }
lindera = { version = "=0.30.0", default-features = false, optional = true }
pinyin = { version = "0.10", default-features = false, features = [
"with_tone",
], optional = true }
@@ -41,19 +39,22 @@ jemalloc-sys = "0.5.4"
default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese"]

# allow chinese specialized tokenization
chinese = ["dep:pinyin", "dep:jieba-rs"]
chinese = ["chinese-segmentation", "chinese-normalization"]
chinese-segmentation = ["dep:jieba-rs"]
chinese-normalization = []
chinese-normalization-pinyin = ["dep:pinyin", "chinese-normalization"]

# allow hebrew specialized tokenization
hebrew = []

# allow japanese specialized tokenization
japanese = ["japanese-segmentation-unidic"]
japanese-segmentation-ipadic = ["lindera-tokenizer/ipadic", "lindera-tokenizer/ipadic-compress"]
japanese-segmentation-unidic = ["lindera-tokenizer/unidic", "lindera-tokenizer/unidic-compress"]
japanese-segmentation-ipadic = ["lindera/ipadic", "lindera/compress"]
japanese-segmentation-unidic = ["lindera/unidic", "lindera/compress"]
japanese-transliteration = ["dep:wana_kana"]

# allow korean specialized tokenization
korean = ["lindera-tokenizer/ko-dic", "lindera-tokenizer/ko-dic-compress"]
korean = ["lindera/ko-dic", "lindera/compress"]

# allow thai specialized tokenization
thai = []
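The feature graph above is the core of this change: `chinese` is now an umbrella over `chinese-segmentation` (jieba) and `chinese-normalization`, pinyin conversion moves behind the new opt-in `chinese-normalization-pinyin` flag, and the three lindera crates collapse into a single pinned `lindera = "=0.30.0"` dependency. A rough sketch of what this means for a downstream user, assuming charabia's public `Tokenize` trait and `Token::lemma()` accessor (not part of this diff); the expected lemmas come from the test fixtures later in this commit:

```rust
use charabia::Tokenize;

fn main() {
    // Built with the default `chinese` features (segmentation + normalization),
    // lemmas stay as kvariant-normalized CJK text, e.g. "人人".
    // Rebuilding with `--features "chinese chinese-normalization-pinyin"` should
    // restore the previous pinyin lemmas, e.g. "rénrén", per the TOKENIZED fixtures below.
    for token in "人人生而自由".tokenize() {
        println!("{}", token.lemma());
    }
}
```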
2 changes: 1 addition & 1 deletion charabia/README.md
@@ -19,7 +19,7 @@ Charabia provides a simple API to segment, normalize, or tokenize (segment + nor
| **Latin** | ✅ CamelCase segmentation |[compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + [nonspacing-marks](https://www.compart.com/en/unicode/category/Mn) removal + `Ð vs Đ` spoofing normalization | 🟩 ~23MiB/sec | 🟨 ~9MiB/sec |
| **Greek** ||[compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + final sigma normalization | 🟩 ~27MiB/sec | 🟨 ~8MiB/sec |
| **Cyrillic** - **Georgian** ||[compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase | 🟩 ~27MiB/sec | 🟨 ~9MiB/sec |
| **Chinese** **CMN** 🇨🇳 |[jieba](https://github.com/messense/jieba-rs) |[compatibility decomposition](https://unicode.org/reports/tr15/) + pinyin conversion | 🟨 ~10MiB/sec | 🟧 ~5MiB/sec |
| **Chinese** **CMN** 🇨🇳 |[jieba](https://github.com/messense/jieba-rs) |[compatibility decomposition](https://unicode.org/reports/tr15/) + kvariant conversion | 🟨 ~10MiB/sec | 🟧 ~5MiB/sec |
| **Hebrew** 🇮🇱 ||[compatibility decomposition](https://unicode.org/reports/tr15/) + [nonspacing-marks](https://www.compart.com/en/unicode/category/Mn) removal | 🟩 ~33MiB/sec | 🟨 ~11MiB/sec |
| **Arabic** |`ال` segmentation |[compatibility decomposition](https://unicode.org/reports/tr15/) + [nonspacing-marks](https://www.compart.com/en/unicode/category/Mn) removal + [Tatweel, Alef, Yeh, and Taa Marbuta normalization] | 🟩 ~36MiB/sec | 🟨 ~11MiB/sec |
| **Japanese** 🇯🇵 |[lindera](https://github.com/lindera-morphology/lindera) IPA-dict |[compatibility decomposition](https://unicode.org/reports/tr15/) | 🟧 ~3MiB/sec | 🟧 ~3MiB/sec |
88 changes: 84 additions & 4 deletions charabia/src/normalizer/chinese.rs
@@ -1,3 +1,4 @@
#[cfg(feature = "chinese-normalization-pinyin")]
use pinyin::ToPinyin;

use super::CharNormalizer;
@@ -23,14 +24,17 @@ impl CharNormalizer for ChineseNormalizer {
// Normalize to Pinyin
// If we don't manage to convert the kvariant, we try to convert the original character.
// If none of them are converted, we return the kvariant.
match kvariant.to_pinyin().or_else(|| c.to_pinyin()) {
#[cfg(feature = "chinese-normalization-pinyin")]
let kvariant = match kvariant.to_pinyin().or_else(|| c.to_pinyin()) {
Some(converted) => {
let with_tone = converted.with_tone();

Some(with_tone.to_string().into())
with_tone.to_string()
}
None => Some(kvariant.into()), // e.g. 杤
}
None => kvariant, // e.g. 杤
};

Some(kvariant.into())
}

fn should_normalize(&self, token: &Token) -> bool {
@@ -77,6 +81,7 @@ mod test {
}

// expected result of the current Normalizer.
#[cfg(feature = "chinese-normalization-pinyin")]
fn normalizer_result() -> Vec<Token<'static>> {
vec![
Token {
@@ -113,6 +118,7 @@
}

// expected result of the complete Normalizer pipeline.
#[cfg(feature = "chinese-normalization-pinyin")]
fn normalized_tokens() -> Vec<Token<'static>> {
vec![
Token {
@@ -148,5 +154,79 @@
]
}

// expected result of the current Normalizer.
#[cfg(not(feature = "chinese-normalization-pinyin"))]
fn normalizer_result() -> Vec<Token<'static>> {
vec![
Token {
lemma: Owned("尊嚴".to_string()),
char_end: 2,
byte_end: 6,
char_map: Some(vec![(3, 3), (3, 3)]),
script: Script::Cj,
language: Some(Language::Cmn),
..Default::default()
},
Token {
lemma: Owned("生而自由".to_string()),
char_end: 4,
byte_end: 12,
char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 3)]),
script: Script::Cj,
language: Some(Language::Cmn),
..Default::default()
},
Token {
lemma: Owned("澳䁈亞本刃𣜜".to_string()),
char_end: 5,
byte_end: 15,
char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 4)]),
script: Script::Cj,
language: Some(Language::Cmn),
..Default::default()
},
]
}

// expected result of the complete Normalizer pipeline.
#[cfg(not(feature = "chinese-normalization-pinyin"))]
fn normalized_tokens() -> Vec<Token<'static>> {
vec![
Token {
kind: TokenKind::Word,
lemma: Owned("尊嚴".to_string()),
char_start: 0,
char_end: 2,
byte_start: 0,
byte_end: 6,
char_map: Some(vec![(3, 3), (3, 3)]),
script: Script::Cj,
language: Some(Language::Cmn),
},
Token {
kind: TokenKind::Word,
lemma: Owned("生而自由".to_string()),
char_start: 0,
char_end: 4,
byte_start: 0,
byte_end: 12,
char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 3)]),
script: Script::Cj,
language: Some(Language::Cmn),
},
Token {
kind: TokenKind::Word,
lemma: Owned("澳䁈亞本刃𣜜".to_string()),
char_start: 0,
char_end: 5,
byte_start: 0,
byte_end: 15,
char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 4)]),
script: Script::Cj,
language: Some(Language::Cmn),
},
]
}

test_normalizer!(ChineseNormalizer, tokens(), normalizer_result(), normalized_tokens());
}
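For clarity, here is the feature-gated flow from the `ChineseNormalizer` hunk above reduced to a free function. This is a sketch, not the actual code: the kvariant table lookup is passed in as an argument and the return type is simplified to `String`, but the pinyin calls mirror the diff exactly.

```rust
#[cfg(feature = "chinese-normalization-pinyin")]
use pinyin::ToPinyin;

// `c` is the original character, `kvariant` the character chosen by the
// kvariant lookup. Without the pinyin feature, the kvariant is returned as-is.
#[allow(unused_variables)] // `c` is only read when the pinyin feature is on
fn normalize(c: char, kvariant: char) -> String {
    // With the feature enabled, shadow `kvariant` with its toned pinyin reading,
    // trying the kvariant first, then the original character.
    #[cfg(feature = "chinese-normalization-pinyin")]
    let kvariant = match kvariant.to_pinyin().or_else(|| c.to_pinyin()) {
        Some(converted) => converted.with_tone().to_string(),
        None => kvariant.to_string(), // e.g. 杤 has no pinyin reading
    };

    kvariant.to_string()
}
```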
46 changes: 46 additions & 0 deletions charabia/src/normalizer/control_char.rs
@@ -103,6 +103,7 @@ mod test {
}

// expected result of the complete Normalizer pipeline.
#[cfg(feature = "chinese-normalization-pinyin")]
fn normalized_tokens() -> Vec<Token<'static>> {
vec![
Token {
@@ -146,5 +147,50 @@
]
}

// expected result of the complete Normalizer pipeline.
#[cfg(not(feature = "chinese-normalization-pinyin"))]
fn normalized_tokens() -> Vec<Token<'static>> {
vec![
Token {
lemma: Owned("生而自由oo".to_string()),
char_end: 9,
byte_end: 17,
script: Script::Cj,
char_map: Some(vec![
(1, 0),
(3, 3),
(3, 3),
(3, 3),
(3, 3),
(1, 0),
(1, 1),
(1, 1),
(1, 0),
]),
kind: TokenKind::Word,
..Default::default()
},
Token {
lemma: Owned("生而自由oo".to_string()),
char_end: 9,
byte_end: 17,
script: Script::Cj,
char_map: Some(vec![
(1, 0),
(3, 3),
(3, 3),
(3, 3),
(3, 3),
(1, 0),
(1, 1),
(1, 1),
(1, 0),
]),
kind: TokenKind::Word,
..Default::default()
},
]
}

test_normalizer!(ControlCharNormalizer, tokens(), normalizer_result(), normalized_tokens());
}
6 changes: 3 additions & 3 deletions charabia/src/normalizer/mod.rs
@@ -3,7 +3,7 @@ use std::borrow::Cow;
use once_cell::sync::Lazy;

pub use self::arabic::ArabicNormalizer;
#[cfg(feature = "chinese")]
#[cfg(feature = "chinese-normalization")]
pub use self::chinese::ChineseNormalizer;
pub use self::classify::{Classifier, ClassifierOption};
pub use self::compatibility_decomposition::CompatibilityDecompositionNormalizer;
@@ -23,7 +23,7 @@ use crate::Token;
pub use self::ae_oe_normalizer::AeOeNormalizer;

mod arabic;
#[cfg(feature = "chinese")]
#[cfg(feature = "chinese-normalization")]
mod chinese;
mod classify;
mod compatibility_decomposition;
@@ -55,7 +55,7 @@ pub static LOSSY_NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
Box::new(LowercaseNormalizer),
Box::new(QuoteNormalizer),
Box::new(AeOeNormalizer),
#[cfg(feature = "chinese")]
#[cfg(feature = "chinese-normalization")]
Box::new(ChineseNormalizer),
#[cfg(feature = "japanese-transliteration")]
Box::new(JapaneseNormalizer),
37 changes: 37 additions & 0 deletions charabia/src/segmenter/chinese.rs
@@ -64,6 +64,7 @@ mod test {
];

// Segmented and normalized version of the text.
#[cfg(feature = "chinese-normalization-pinyin")]
const TOKENIZED: &[&str] = &[
"rénrén",
"shēngérzìyóu",
@@ -99,6 +100,42 @@
"。",
];

#[cfg(not(feature = "chinese-normalization-pinyin"))]
const TOKENIZED: &[&str] = &[
"人人",
"生而自由",
",",
"在",
"尊",
"嚴",
"和",
"權",
"利",
"上",
"一律平等",
"。",
"他",
"們",
"賦",
"有",
"理性",
"和",
"良心",
",",
"並",
"應",
"以",
"兄弟",
"關",
"係",
"的",
"精神",
"互相",
"對",
"待",
"。",
];

// Macro that run several tests on the Segmenter.
test_segmenter!(ChineseSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Cj, Language::Cmn);
}
6 changes: 2 additions & 4 deletions charabia/src/segmenter/japanese.rs
@@ -1,8 +1,6 @@
use lindera_core::mode::Mode;
#[cfg(feature = "japanese-segmentation-ipadic")]
use lindera_core::mode::Penalty;
use lindera_dictionary::{DictionaryConfig, DictionaryKind};
use lindera_tokenizer::tokenizer::{Tokenizer, TokenizerConfig};
use lindera::Penalty;
use lindera::{DictionaryConfig, DictionaryKind, Mode, Tokenizer, TokenizerConfig};
use once_cell::sync::Lazy;

use crate::segmenter::Segmenter;
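With the three lindera crates merged into one, the segmenter only changes its import paths; the tokenizer construction itself is not shown in this hunk. Below is a sketch of what the construction presumably looks like with the consolidated crate. The `DictionaryConfig`/`TokenizerConfig` fields and `Tokenizer::from_config` are assumptions carried over from the lindera 0.28 API and may differ slightly in 0.30; the Korean segmenter in the next file follows the same pattern with the ko-dic dictionary kind.

```rust
use lindera::{DictionaryConfig, DictionaryKind, Mode, Tokenizer, TokenizerConfig};
use once_cell::sync::Lazy;

// Assumed construction for the unidic-backed Japanese segmenter; field names
// and the `from_config` constructor are carried over from lindera 0.28.
static JAPANESE_TOKENIZER: Lazy<Tokenizer> = Lazy::new(|| {
    let dictionary = DictionaryConfig { kind: Some(DictionaryKind::UniDic), path: None };
    let config = TokenizerConfig { dictionary, user_dictionary: None, mode: Mode::Normal };
    Tokenizer::from_config(config).expect("failed to build the lindera tokenizer")
});
```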
4 changes: 1 addition & 3 deletions charabia/src/segmenter/korean.rs
@@ -1,6 +1,4 @@
use lindera_core::mode::{Mode, Penalty};
use lindera_dictionary::{DictionaryConfig, DictionaryKind};
use lindera_tokenizer::tokenizer::{Tokenizer, TokenizerConfig};
use lindera::{DictionaryConfig, DictionaryKind, Mode, Penalty, Tokenizer, TokenizerConfig};
use once_cell::sync::Lazy;

use crate::segmenter::Segmenter;
11 changes: 5 additions & 6 deletions charabia/src/segmenter/mod.rs
@@ -3,11 +3,13 @@ use std::collections::HashMap;

use aho_corasick::{AhoCorasick, FindIter, MatchKind};
pub use arabic::ArabicSegmenter;
#[cfg(feature = "chinese")]
#[cfg(feature = "chinese-segmentation")]
pub use chinese::ChineseSegmenter;
use either::Either;
#[cfg(feature = "japanese")]
pub use japanese::JapaneseSegmenter;
#[cfg(feature = "khmer")]
pub use khmer::KhmerSegmenter;
#[cfg(feature = "korean")]
pub use korean::KoreanSegmenter;
pub use latin::LatinSegmenter;
@@ -16,15 +18,12 @@ use slice_group_by::StrGroupBy;
#[cfg(feature = "thai")]
pub use thai::ThaiSegmenter;

#[cfg(feature = "khmer")]
pub use khmer::KhmerSegmenter;

use crate::detection::{Detect, Language, Script, StrDetection};
use crate::separators::DEFAULT_SEPARATORS;
use crate::token::Token;

mod arabic;
#[cfg(feature = "chinese")]
#[cfg(feature = "chinese-segmentation")]
mod chinese;
#[cfg(feature = "japanese")]
mod japanese;
@@ -54,7 +53,7 @@ pub static SEGMENTERS: Lazy<HashMap<(Script, Language), Box<dyn Segmenter>>> = L
// latin segmenter
((Script::Latin, Language::Other), Box::new(LatinSegmenter) as Box<dyn Segmenter>),
// chinese segmenter
#[cfg(feature = "chinese")]
#[cfg(feature = "chinese-segmentation")]
((Script::Cj, Language::Cmn), Box::new(ChineseSegmenter) as Box<dyn Segmenter>),
// japanese segmenter
#[cfg(feature = "japanese")]
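Because segmentation and normalization are now gated separately, a build with `default-features = false` and only `chinese-segmentation` still registers the `ChineseSegmenter`. A minimal sketch, assuming charabia's public `Segment` trait and its `segment_str` iterator (names not part of this diff); the expected segments match the SEGMENTED fixture earlier in this commit:

```rust
use charabia::Segment;

fn main() {
    // Segmentation only: jieba splits the text, no normalization is applied.
    let segments: Vec<&str> = "人人生而自由".segment_str().collect();
    println!("{segments:?}"); // expected: ["人人", "生而自由"]
}
```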
2 changes: 1 addition & 1 deletion charabia/src/separators.rs
@@ -59,7 +59,7 @@ pub const DEFAULT_SEPARATORS: &[&str] = &[
"𑪠", "𑪡", "𑪢", "𑱁", "𑱂", "𑱃", "𑱄", "𑱅", "𑱰", "𑱱", "𑻷", "𑻸", "𑿿", "𒑰", "𒑱", "𒑲", "𒑳", "𒑴", "𖩮",
"𖩯", "𖫵", "𖬷", "𖬸", "𖬹", "𖬺", "𖬻", "𖭄", "𖺗", "𖺘", "𖺙", "𖺚", "𖿢", "𛲟", "𝪇", "𝪈", "𝪉", "𝪊", "𝪋",
"𞥞", "𞥟", "\n", "\r", "\u{2029}", " ", " ", " ", " ", " ", " ", " ", " ", " ", " ", " ", " ",
" ", " ", "`"
" ", " ", "`", "\t"
];

#[rustfmt::skip]
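The only functional change in this file is adding `"\t"` to `DEFAULT_SEPARATORS`, so tab-separated input now splits the same way space-separated input does. A quick check, again assuming the public `Tokenize` trait and `Token::is_word()`/`lemma()` (names not part of this diff):

```rust
use charabia::Tokenize;

fn main() {
    // With "\t" in DEFAULT_SEPARATORS, the tab is classified as a separator
    // and "foo\tbar" yields the two words "foo" and "bar".
    let words: Vec<String> = "foo\tbar"
        .tokenize()
        .filter(|token| token.is_word())
        .map(|token| token.lemma().to_string())
        .collect();

    assert_eq!(words, ["foo", "bar"]);
}
```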