From 7acd9838cecb86d55b3b95a31a61e653048e55c6 Mon Sep 17 00:00:00 2001 From: Minoru OSUKA Date: Sat, 13 Apr 2024 22:51:44 +0900 Subject: [PATCH 1/8] Update Lindera to 0.30.0 --- charabia/Cargo.toml | 10 ++++------ charabia/src/segmenter/japanese.rs | 6 ++---- charabia/src/segmenter/korean.rs | 4 +--- 3 files changed, 7 insertions(+), 13 deletions(-) diff --git a/charabia/Cargo.toml b/charabia/Cargo.toml index ac738b4c..4b9819a0 100644 --- a/charabia/Cargo.toml +++ b/charabia/Cargo.toml @@ -24,9 +24,7 @@ once_cell = "1.19.0" serde = "1.0" slice-group-by = "0.3.1" whatlang = "0.16.4" -lindera-core = "=0.28.0" -lindera-dictionary = "=0.28.0" -lindera-tokenizer = { version = "=0.28.0", default-features = false, optional = true } +lindera = { version = "=0.30.0", default-features = false, optional = true } pinyin = { version = "0.10", default-features = false, features = [ "with_tone", ], optional = true } @@ -47,12 +45,12 @@ hebrew = [] # allow japanese specialized tokenization japanese = ["japanese-segmentation-unidic"] -japanese-segmentation-ipadic = ["lindera-tokenizer/ipadic", "lindera-tokenizer/ipadic-compress"] -japanese-segmentation-unidic = ["lindera-tokenizer/unidic", "lindera-tokenizer/unidic-compress"] +japanese-segmentation-ipadic = ["lindera/ipadic", "lindera/compress"] +japanese-segmentation-unidic = ["lindera/unidic", "lindera/compress"] japanese-transliteration = ["dep:wana_kana"] # allow korean specialized tokenization -korean = ["lindera-tokenizer/ko-dic", "lindera-tokenizer/ko-dic-compress"] +korean = ["lindera/ko-dic", "lindera/compress"] # allow thai specialized tokenization thai = [] diff --git a/charabia/src/segmenter/japanese.rs b/charabia/src/segmenter/japanese.rs index da256718..e19c4e94 100644 --- a/charabia/src/segmenter/japanese.rs +++ b/charabia/src/segmenter/japanese.rs @@ -1,8 +1,6 @@ -use lindera_core::mode::Mode; #[cfg(feature = "japanese-segmentation-ipadic")] -use lindera_core::mode::Penalty; -use lindera_dictionary::{DictionaryConfig, DictionaryKind}; -use lindera_tokenizer::tokenizer::{Tokenizer, TokenizerConfig}; +use lindera::Penalty; +use lindera::{DictionaryConfig, DictionaryKind, Mode, Tokenizer, TokenizerConfig}; use once_cell::sync::Lazy; use crate::segmenter::Segmenter; diff --git a/charabia/src/segmenter/korean.rs b/charabia/src/segmenter/korean.rs index b0c001b5..31604929 100644 --- a/charabia/src/segmenter/korean.rs +++ b/charabia/src/segmenter/korean.rs @@ -1,6 +1,4 @@ -use lindera_core::mode::{Mode, Penalty}; -use lindera_dictionary::{DictionaryConfig, DictionaryKind}; -use lindera_tokenizer::tokenizer::{Tokenizer, TokenizerConfig}; +use lindera::{DictionaryConfig, DictionaryKind, Mode, Penalty, Tokenizer, TokenizerConfig}; use once_cell::sync::Lazy; use crate::segmenter::Segmenter; From e41dfd5ede94c399cd6f9b0c0df198ece0e508b2 Mon Sep 17 00:00:00 2001 From: Gusted Date: Mon, 15 Apr 2024 23:54:47 +0200 Subject: [PATCH 2/8] Add `\t` as a recognized separator Currently, `\t` isn't seen as a recognized separator. This was causing issues for meilisearch: when it searched for a keyword (fuzzy or exact match) that was present in a document but preceded by a `\t`, charabia would create a token that was `\t`, which in turn led to meilisearch returning the document as part of the search results but not returning the positions of the matches (the `_matchesPosition` field).
The actual reproducer for this bug was code files from the Linux kernel (such as `fs/ext4/readpage.c`), which use tabs for indentation; searching for keywords like `while` would usually hit occurrences 'prefixed' by a tab, causing the described issue. Making `\t` a separator fixed this issue. --- charabia/src/separators.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charabia/src/separators.rs b/charabia/src/separators.rs index 3d4a8498..60a00483 100644 --- a/charabia/src/separators.rs +++ b/charabia/src/separators.rs @@ -59,7 +59,7 @@ pub const DEFAULT_SEPARATORS: &[&str] = &[ "๐‘ช ", "๐‘ชก", "๐‘ชข", "๐‘ฑ", "๐‘ฑ‚", "๐‘ฑƒ", "๐‘ฑ„", "๐‘ฑ…", "๐‘ฑฐ", "๐‘ฑฑ", "๐‘ปท", "๐‘ปธ", "๐‘ฟฟ", "๐’‘ฐ", "๐’‘ฑ", "๐’‘ฒ", "๐’‘ณ", "๐’‘ด", "๐–ฉฎ", "๐–ฉฏ", "๐–ซต", "๐–ฌท", "๐–ฌธ", "๐–ฌน", "๐–ฌบ", "๐–ฌป", "๐–ญ„", "๐–บ—", "๐–บ˜", "๐–บ™", "๐–บš", "๐–ฟข", "๐›ฒŸ", "๐ช‡", "๐ชˆ", "๐ช‰", "๐ชŠ", "๐ช‹", "๐žฅž", "๐žฅŸ", "\n", "\r", "\u{2029}", " ", "แš€", "โ€‚", "โ€ƒ", "โ€‚", "โ€ƒ", "โ€„", "โ€…", "โ€†", "โ€‡", "โ€ˆ", "โ€‰", - "โ€Š", "ใ€€", "`" + "โ€Š", "ใ€€", "`", "\t" ]; #[rustfmt::skip] From b7d1c991868f6f0c50a8f73cf0a8c50c01ae45ba Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 17 Apr 2024 15:31:13 +0200 Subject: [PATCH 3/8] Filter empty tokens before inserting them into the AhoCorasick automaton, avoiding a char boundary panic --- charabia/src/tokenizer.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/charabia/src/tokenizer.rs b/charabia/src/tokenizer.rs index fc1e1aac..7e4c5a93 100644 --- a/charabia/src/tokenizer.rs +++ b/charabia/src/tokenizer.rs @@ -313,26 +313,26 @@ impl<'tb, A: AsRef<[u8]>> TokenizerBuilder<'tb, A> { // TODO: avoid recreating the automaton if nothing changed match (self.normalizer_option.classifier.separators, self.words_dict) { (Some(separators), None) => { + let pattern = separators.into_iter().filter(|s| !s.is_empty()); let aho = AhoCorasick::builder() .match_kind(MatchKind::LeftmostLongest) - .build(separators) + .build(pattern) .unwrap(); - self.segmenter_option.aho = Some(aho); + self.segmenter_option.aho = Some(aho).filter(|aho| aho.patterns_len() != 0); } (separators, Some(words)) => { // use the default separators' list if a custom words' list is given but no custom separators' list. let separators = separators.unwrap_or(DEFAULT_SEPARATORS); // merge both lists together and create the Aho-Corasick automaton. - let mut vec = Vec::with_capacity(separators.len() + words.len()); - vec.extend_from_slice(words); - vec.extend_from_slice(separators); + let pattern = + words.into_iter().chain(separators.into_iter()).filter(|s| !s.is_empty()); let aho = AhoCorasick::builder() .match_kind(MatchKind::LeftmostLongest) - .build(vec) + .build(pattern) .unwrap(); - self.segmenter_option.aho = Some(aho); + self.segmenter_option.aho = Some(aho).filter(|aho| aho.patterns_len() != 0); } // reset the state in case the builder is reused.
(None, None) => self.segmenter_option.aho = None, From 5f2c7377eeb5ae0f40983398a978941abb0c6438 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 17 Apr 2024 16:56:12 +0200 Subject: [PATCH 4/8] Make the pinyin-normalization optional --- .github/workflows/rust.yml | 2 + charabia/Cargo.toml | 5 +- charabia/src/normalizer/chinese.rs | 88 +++++++++++++++++++++++-- charabia/src/normalizer/control_char.rs | 46 +++++++++++++ charabia/src/normalizer/mod.rs | 6 +- charabia/src/segmenter/chinese.rs | 37 +++++++++++ charabia/src/segmenter/mod.rs | 11 ++-- 7 files changed, 181 insertions(+), 14 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index e98a3fc8..c0fb7664 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -26,6 +26,8 @@ jobs: run: cargo test --verbose - name: Run tests with japanese-transliteration on run: cargo test --verbose --features japanese-transliteration + - name: Run tests with chinese-normalization-pinyin on + run: cargo test --verbose --features chinese chinese-normalization-pinyin - name: Run irg-kvariants tests run: cargo test -p irg-kvariants --verbose diff --git a/charabia/Cargo.toml b/charabia/Cargo.toml index ac738b4c..3c17b8e4 100644 --- a/charabia/Cargo.toml +++ b/charabia/Cargo.toml @@ -40,7 +40,10 @@ zerovec = "0.10.1" default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese"] # allow chinese specialized tokenization -chinese = ["dep:pinyin", "dep:jieba-rs"] +chinese = ["chinese-segmentation", "chinese-normalization"] +chinese-segmentation = ["dep:jieba-rs"] +chinese-normalization = [] +chinese-normalization-pinyin = ["dep:pinyin", "chinese-normalization"] # allow hebrew specialized tokenization hebrew = [] diff --git a/charabia/src/normalizer/chinese.rs b/charabia/src/normalizer/chinese.rs index 4739903c..3b3d8ec0 100644 --- a/charabia/src/normalizer/chinese.rs +++ b/charabia/src/normalizer/chinese.rs @@ -1,3 +1,4 @@ +#[cfg(feature = "chinese-normalization-pinyin")] use pinyin::ToPinyin; use super::CharNormalizer; @@ -23,14 +24,17 @@ impl CharNormalizer for ChineseNormalizer { // Normalize to Pinyin // If we don't manage to convert the kvariant, we try to convert the original character. // If none of them are converted, we return the kvariant. - match kvariant.to_pinyin().or_else(|| c.to_pinyin()) { + #[cfg(feature = "chinese-normalization-pinyin")] + let kvariant = match kvariant.to_pinyin().or_else(|| c.to_pinyin()) { Some(converted) => { let with_tone = converted.with_tone(); - Some(with_tone.to_string().into()) + with_tone.to_string() } - None => Some(kvariant.into()), // e.g. ๆค - } + None => kvariant, // e.g. ๆค + }; + + Some(kvariant.into()) } fn should_normalize(&self, token: &Token) -> bool { @@ -77,6 +81,7 @@ mod test { } // expected result of the current Normalizer. + #[cfg(feature = "chinese-normalization-pinyin")] fn normalizer_result() -> Vec> { vec![ Token { @@ -113,6 +118,7 @@ mod test { } // expected result of the complete Normalizer pieline. + #[cfg(feature = "chinese-normalization-pinyin")] fn normalized_tokens() -> Vec> { vec![ Token { @@ -148,5 +154,79 @@ mod test { ] } + // expected result of the current Normalizer. 
+ #[cfg(not(feature = "chinese-normalization-pinyin"))] + fn normalizer_result() -> Vec> { + vec![ + Token { + lemma: Owned("ๅฐŠๅšด".to_string()), + char_end: 2, + byte_end: 6, + char_map: Some(vec![(3, 3), (3, 3)]), + script: Script::Cj, + language: Some(Language::Cmn), + ..Default::default() + }, + Token { + lemma: Owned("็”Ÿ่€Œ่‡ช็”ฑ".to_string()), + char_end: 4, + byte_end: 12, + char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 3)]), + script: Script::Cj, + language: Some(Language::Cmn), + ..Default::default() + }, + Token { + lemma: Owned("ๆพณไˆไบžๆœฌๅˆƒ๐ฃœœ".to_string()), + char_end: 5, + byte_end: 15, + char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 4)]), + script: Script::Cj, + language: Some(Language::Cmn), + ..Default::default() + }, + ] + } + + // expected result of the complete Normalizer pieline. + #[cfg(not(feature = "chinese-normalization-pinyin"))] + fn normalized_tokens() -> Vec> { + vec![ + Token { + kind: TokenKind::Word, + lemma: Owned("ๅฐŠๅšด".to_string()), + char_start: 0, + char_end: 2, + byte_start: 0, + byte_end: 6, + char_map: Some(vec![(3, 3), (3, 3)]), + script: Script::Cj, + language: Some(Language::Cmn), + }, + Token { + kind: TokenKind::Word, + lemma: Owned("็”Ÿ่€Œ่‡ช็”ฑ".to_string()), + char_start: 0, + char_end: 4, + byte_start: 0, + byte_end: 12, + char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 3)]), + script: Script::Cj, + language: Some(Language::Cmn), + }, + Token { + kind: TokenKind::Word, + lemma: Owned("ๆพณไˆไบžๆœฌๅˆƒ๐ฃœœ".to_string()), + char_start: 0, + char_end: 5, + byte_start: 0, + byte_end: 15, + char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 4)]), + script: Script::Cj, + language: Some(Language::Cmn), + }, + ] + } + test_normalizer!(ChineseNormalizer, tokens(), normalizer_result(), normalized_tokens()); } diff --git a/charabia/src/normalizer/control_char.rs b/charabia/src/normalizer/control_char.rs index 94f4dd5f..c0d0d478 100644 --- a/charabia/src/normalizer/control_char.rs +++ b/charabia/src/normalizer/control_char.rs @@ -103,6 +103,7 @@ mod test { } // expected result of the complete Normalizer pieline. + #[cfg(feature = "chinese-normalization-pinyin")] fn normalized_tokens() -> Vec> { vec![ Token { @@ -146,5 +147,50 @@ mod test { ] } + // expected result of the complete Normalizer pieline. 
+ #[cfg(not(feature = "chinese-normalization-pinyin"))] + fn normalized_tokens() -> Vec> { + vec![ + Token { + lemma: Owned("็”Ÿ่€Œ่‡ช็”ฑoo".to_string()), + char_end: 9, + byte_end: 17, + script: Script::Cj, + char_map: Some(vec![ + (1, 0), + (3, 3), + (3, 3), + (3, 3), + (3, 3), + (1, 0), + (1, 1), + (1, 1), + (1, 0), + ]), + kind: TokenKind::Word, + ..Default::default() + }, + Token { + lemma: Owned("็”Ÿ่€Œ่‡ช็”ฑoo".to_string()), + char_end: 9, + byte_end: 17, + script: Script::Cj, + char_map: Some(vec![ + (1, 0), + (3, 3), + (3, 3), + (3, 3), + (3, 3), + (1, 0), + (1, 1), + (1, 1), + (1, 0), + ]), + kind: TokenKind::Word, + ..Default::default() + }, + ] + } + test_normalizer!(ControlCharNormalizer, tokens(), normalizer_result(), normalized_tokens()); } diff --git a/charabia/src/normalizer/mod.rs b/charabia/src/normalizer/mod.rs index 3f105518..31c783b3 100644 --- a/charabia/src/normalizer/mod.rs +++ b/charabia/src/normalizer/mod.rs @@ -3,7 +3,7 @@ use std::borrow::Cow; use once_cell::sync::Lazy; pub use self::arabic::ArabicNormalizer; -#[cfg(feature = "chinese")] +#[cfg(feature = "chinese-normalization")] pub use self::chinese::ChineseNormalizer; pub use self::classify::{Classifier, ClassifierOption}; pub use self::compatibility_decomposition::CompatibilityDecompositionNormalizer; @@ -21,7 +21,7 @@ use crate::segmenter::SegmentedTokenIter; use crate::Token; mod arabic; -#[cfg(feature = "chinese")] +#[cfg(feature = "chinese-normalization")] mod chinese; mod classify; mod compatibility_decomposition; @@ -50,7 +50,7 @@ pub static LOSSY_NORMALIZERS: Lazy>> = Lazy::new(|| { vec![ Box::new(LowercaseNormalizer), Box::new(QuoteNormalizer), - #[cfg(feature = "chinese")] + #[cfg(feature = "chinese-normalization")] Box::new(ChineseNormalizer), #[cfg(feature = "japanese-transliteration")] Box::new(JapaneseNormalizer), diff --git a/charabia/src/segmenter/chinese.rs b/charabia/src/segmenter/chinese.rs index 631936ff..9af1ad11 100644 --- a/charabia/src/segmenter/chinese.rs +++ b/charabia/src/segmenter/chinese.rs @@ -64,6 +64,7 @@ mod test { ]; // Segmented and normalized version of the text. + #[cfg(feature = "chinese-normalization-pinyin")] const TOKENIZED: &[&str] = &[ "rรฉnrรฉn", "shฤ“ngรฉrzรฌyรณu", @@ -99,6 +100,42 @@ mod test { "ใ€‚", ]; + #[cfg(not(feature = "chinese-normalization-pinyin"))] + const TOKENIZED: &[&str] = &[ + "ไบบไบบ", + "็”Ÿ่€Œ่‡ช็”ฑ", + ",", + "ๅœจ", + "ๅฐŠ", + "ๅšด", + "ๅ’Œ", + "ๆฌŠ", + "ๅˆฉ", + "ไธŠ", + "ไธ€ๅพ‹ๅนณ็ญ‰", + "ใ€‚", + "ไป–", + "ๅ€‘", + "่ณฆ", + "ๆœ‰", + "็†ๆ€ง", + "ๅ’Œ", + "่‰ฏๅฟƒ", + ",", + "ไธฆ", + "ๆ‡‰", + "ไปฅ", + "ๅ…„ๅผŸ", + "้—œ", + "ไฟ‚", + "็š„", + "็ฒพ็ฅž", + "ไบ’็›ธ", + "ๅฐ", + "ๅพ…", + "ใ€‚", + ]; + // Macro that run several tests on the Segmenter. 
test_segmenter!(ChineseSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Cj, Language::Cmn); } diff --git a/charabia/src/segmenter/mod.rs b/charabia/src/segmenter/mod.rs index 0cff67d9..579d3389 100644 --- a/charabia/src/segmenter/mod.rs +++ b/charabia/src/segmenter/mod.rs @@ -3,11 +3,13 @@ use std::collections::HashMap; use aho_corasick::{AhoCorasick, FindIter, MatchKind}; pub use arabic::ArabicSegmenter; -#[cfg(feature = "chinese")] +#[cfg(feature = "chinese-segmentation")] pub use chinese::ChineseSegmenter; use either::Either; #[cfg(feature = "japanese")] pub use japanese::JapaneseSegmenter; +#[cfg(feature = "khmer")] +pub use khmer::KhmerSegmenter; #[cfg(feature = "korean")] pub use korean::KoreanSegmenter; pub use latin::LatinSegmenter; @@ -16,15 +18,12 @@ use slice_group_by::StrGroupBy; #[cfg(feature = "thai")] pub use thai::ThaiSegmenter; -#[cfg(feature = "khmer")] -pub use khmer::KhmerSegmenter; - use crate::detection::{Detect, Language, Script, StrDetection}; use crate::separators::DEFAULT_SEPARATORS; use crate::token::Token; mod arabic; -#[cfg(feature = "chinese")] +#[cfg(feature = "chinese-segmentation")] mod chinese; #[cfg(feature = "japanese")] mod japanese; @@ -54,7 +53,7 @@ pub static SEGMENTERS: Lazy>> = L // latin segmenter ((Script::Latin, Language::Other), Box::new(LatinSegmenter) as Box), // chinese segmenter - #[cfg(feature = "chinese")] + #[cfg(feature = "chinese-segmentation")] ((Script::Cj, Language::Cmn), Box::new(ChineseSegmenter) as Box), // japanese segmenter #[cfg(feature = "japanese")] From 2b61ed2fa1a0a6cca00ee1ce04d0aa62d7bfc09e Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 17 Apr 2024 17:04:32 +0200 Subject: [PATCH 5/8] clippy --- charabia/src/tokenizer.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/charabia/src/tokenizer.rs b/charabia/src/tokenizer.rs index 7e4c5a93..2f706e9f 100644 --- a/charabia/src/tokenizer.rs +++ b/charabia/src/tokenizer.rs @@ -313,7 +313,7 @@ impl<'tb, A: AsRef<[u8]>> TokenizerBuilder<'tb, A> { // TODO: avoid recreating the automaton if nothing changed match (self.normalizer_option.classifier.separators, self.words_dict) { (Some(separators), None) => { - let pattern = separators.into_iter().filter(|s| !s.is_empty()); + let pattern = separators.iter().filter(|s| !s.is_empty()); let aho = AhoCorasick::builder() .match_kind(MatchKind::LeftmostLongest) .build(pattern) @@ -326,7 +326,7 @@ impl<'tb, A: AsRef<[u8]>> TokenizerBuilder<'tb, A> { let separators = separators.unwrap_or(DEFAULT_SEPARATORS); // merge both lists together and create the Aho-Corasick automaton. let pattern = - words.into_iter().chain(separators.into_iter()).filter(|s| !s.is_empty()); + words.iter().chain(separators).filter(|s| !s.is_empty()); let aho = AhoCorasick::builder() .match_kind(MatchKind::LeftmostLongest) .build(pattern) From 654a0c970c85ac79b773bb892626f6a58aacd3d3 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 17 Apr 2024 17:07:28 +0200 Subject: [PATCH 6/8] rustfmt --- charabia/src/tokenizer.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/charabia/src/tokenizer.rs b/charabia/src/tokenizer.rs index 2f706e9f..e9b5f4dd 100644 --- a/charabia/src/tokenizer.rs +++ b/charabia/src/tokenizer.rs @@ -325,8 +325,7 @@ impl<'tb, A: AsRef<[u8]>> TokenizerBuilder<'tb, A> { // use the default separators' list if a custom words' list is given but no custom separators' list. let separators = separators.unwrap_or(DEFAULT_SEPARATORS); // merge both lists together and create the Aho-Corasick automaton. 
- let pattern = - words.iter().chain(separators).filter(|s| !s.is_empty()); + let pattern = words.iter().chain(separators).filter(|s| !s.is_empty()); let aho = AhoCorasick::builder() .match_kind(MatchKind::LeftmostLongest) .build(pattern) From 2796c4ff46c640cc8f67a1005825ae52912d444e Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 18 Apr 2024 08:13:38 +0000 Subject: [PATCH 7/8] Update version for the next release (v0.8.9) in Cargo.toml files --- charabia/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charabia/Cargo.toml b/charabia/Cargo.toml index a3071075..0b6628e8 100644 --- a/charabia/Cargo.toml +++ b/charabia/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "charabia" -version = "0.8.8" +version = "0.8.9" license = "MIT" authors = ["Many "] edition = "2021" From 0547264431d2f4ae05d222863a9947e547d623bf Mon Sep 17 00:00:00 2001 From: Many the fish Date: Thu, 18 Apr 2024 10:18:16 +0200 Subject: [PATCH 8/8] Update README.md --- charabia/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charabia/README.md b/charabia/README.md index 50a76848..9bc0d01b 100644 --- a/charabia/README.md +++ b/charabia/README.md @@ -19,7 +19,7 @@ Charabia provides a simple API to segment, normalize, or tokenize (segment + nor | **Latin** | โœ… CamelCase segmentation | โœ… [compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + [nonspacing-marks](https://www.compart.com/en/unicode/category/Mn) removal + `ร vs ฤ` spoofing normalization | ๐ŸŸฉ ~23MiB/sec | ๐ŸŸจ ~9MiB/sec | | **Greek** | โŒ | โœ… [compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + final sigma normalization | ๐ŸŸฉ ~27MiB/sec | ๐ŸŸจ ~8MiB/sec | | **Cyrillic** - **Georgian** | โŒ | โœ… [compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase | ๐ŸŸฉ ~27MiB/sec | ๐ŸŸจ ~9MiB/sec | -| **Chinese** **CMN** ๐Ÿ‡จ๐Ÿ‡ณ | โœ… [jieba](https://github.com/messense/jieba-rs) | โœ… [compatibility decomposition](https://unicode.org/reports/tr15/) + pinyin conversion | ๐ŸŸจ ~10MiB/sec | ๐ŸŸง ~5MiB/sec | +| **Chinese** **CMN** ๐Ÿ‡จ๐Ÿ‡ณ | โœ… [jieba](https://github.com/messense/jieba-rs) | โœ… [compatibility decomposition](https://unicode.org/reports/tr15/) + kvariant conversion | ๐ŸŸจ ~10MiB/sec | ๐ŸŸง ~5MiB/sec | | **Hebrew** ๐Ÿ‡ฎ๐Ÿ‡ฑ | โŒ | โœ… [compatibility decomposition](https://unicode.org/reports/tr15/) + [nonspacing-marks](https://www.compart.com/en/unicode/category/Mn) removal | ๐ŸŸฉ ~33MiB/sec | ๐ŸŸจ ~11MiB/sec | | **Arabic** | โœ… `ุงู„` segmentation | โœ… [compatibility decomposition](https://unicode.org/reports/tr15/) + [nonspacing-marks](https://www.compart.com/en/unicode/category/Mn) removal + [Tatweel, Alef, Yeh, and Taa Marbuta normalization] | ๐ŸŸฉ ~36MiB/sec | ๐ŸŸจ ~11MiB/sec | | **Japanese** ๐Ÿ‡ฏ๐Ÿ‡ต | โœ… [lindera](https://github.com/lindera-morphology/lindera) IPA-dict | โŒ [compatibility decomposition](https://unicode.org/reports/tr15/) | ๐ŸŸง ~3MiB/sec | ๐ŸŸง ~3MiB/sec |
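Note: with the feature split introduced in PATCH 4/8, downstream crates can enable Chinese segmentation and kvariant normalization without pulling in the lossy pinyin conversion. A minimal downstream Cargo.toml sketch follows; the consuming crate is hypothetical, while the version number is taken from PATCH 7/8 and the feature names from PATCH 4/8:

[dependencies]
# "chinese" now only implies "chinese-segmentation" (jieba) and
# "chinese-normalization" (kvariant conversion); uncomment the extra
# feature below to opt back into the pinyin normalization.
charabia = { version = "0.8.9", default-features = false, features = [
    "chinese",
    # "chinese-normalization-pinyin",
] }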