Merge #279
279: Update Lindera to 0.30.0 r=ManyTheFish a=mosuka

# Pull Request

## Related issue
#265 

## What does this PR do?
- Update Lindera to version 0.30.0. This version downloads the dictionary from a GitHub repository, which seems to be more stable.
- Optimize the size of the built dict.words file: for UniDic, it decreases from 200.1 MB to 106.3 MB.

## PR checklist
Please check if your PR fulfills the following requirements:
- [x] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)?
- [x] Have you read the contributing guidelines?
- [x] Have you made sure that the title is accurate and descriptive of the changes?

Thank you so much for contributing to Meilisearch!


Co-authored-by: Minoru OSUKA <[email protected]>
meili-bors[bot] and mosuka authored Apr 16, 2024
2 parents e3df008 + 7acd983 commit a44a213
Showing 3 changed files with 7 additions and 13 deletions.
10 changes: 4 additions & 6 deletions charabia/Cargo.toml
```diff
@@ -24,9 +24,7 @@ once_cell = "1.19.0"
 serde = "1.0"
 slice-group-by = "0.3.1"
 whatlang = "0.16.4"
-lindera-core = "=0.28.0"
-lindera-dictionary = "=0.28.0"
-lindera-tokenizer = { version = "=0.28.0", default-features = false, optional = true }
+lindera = { version = "=0.30.0", default-features = false, optional = true }
 pinyin = { version = "0.10", default-features = false, features = [
     "with_tone",
 ], optional = true }
@@ -47,12 +45,12 @@ hebrew = []

 # allow japanese specialized tokenization
 japanese = ["japanese-segmentation-unidic"]
-japanese-segmentation-ipadic = ["lindera-tokenizer/ipadic", "lindera-tokenizer/ipadic-compress"]
-japanese-segmentation-unidic = ["lindera-tokenizer/unidic", "lindera-tokenizer/unidic-compress"]
+japanese-segmentation-ipadic = ["lindera/ipadic", "lindera/compress"]
+japanese-segmentation-unidic = ["lindera/unidic", "lindera/compress"]
 japanese-transliteration = ["dep:wana_kana"]

 # allow korean specialized tokenization
-korean = ["lindera-tokenizer/ko-dic", "lindera-tokenizer/ko-dic-compress"]
+korean = ["lindera/ko-dic", "lindera/compress"]

 # allow thai specialized tokenization
 thai = []
```
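Note that charabia's public feature names (`japanese-segmentation-ipadic`, `japanese-segmentation-unidic`, `korean`) are unchanged; only the crates they forward to differ. A downstream manifest would therefore not need to change — a sketch, with an illustrative charabia version:

```toml
[dependencies]
# Same feature names as before the Lindera 0.30.0 migration;
# the version number here is only an example.
charabia = { version = "0.8", features = ["japanese", "korean"] }
```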
6 changes: 2 additions & 4 deletions charabia/src/segmenter/japanese.rs
```diff
@@ -1,8 +1,6 @@
-use lindera_core::mode::Mode;
 #[cfg(feature = "japanese-segmentation-ipadic")]
-use lindera_core::mode::Penalty;
-use lindera_dictionary::{DictionaryConfig, DictionaryKind};
-use lindera_tokenizer::tokenizer::{Tokenizer, TokenizerConfig};
+use lindera::Penalty;
+use lindera::{DictionaryConfig, DictionaryKind, Mode, Tokenizer, TokenizerConfig};
 use once_cell::sync::Lazy;
 
 use crate::segmenter::Segmenter;
```
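With the migration, everything the segmenter needs comes from the single `lindera` facade crate instead of three `lindera-*` crates. A minimal sketch of the resulting setup, using only the types imported in the diff above (field names assume the `DictionaryConfig`/`TokenizerConfig` shapes those imports imply, with the `japanese-segmentation-unidic` feature enabled):

```rust
use lindera::{DictionaryConfig, DictionaryKind, Mode, Tokenizer, TokenizerConfig};
use once_cell::sync::Lazy;

// Lazily built tokenizer, mirroring the segmenter's static setup.
static TOKENIZER: Lazy<Tokenizer> = Lazy::new(|| {
    // `path: None` selects the dictionary bundled in via the
    // `lindera/unidic` cargo feature rather than an on-disk one.
    let dictionary = DictionaryConfig { kind: Some(DictionaryKind::UniDic), path: None };
    let config = TokenizerConfig { dictionary, user_dictionary: None, mode: Mode::Normal };
    Tokenizer::from_config(config).expect("failed to build Lindera tokenizer")
});
```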
4 changes: 1 addition & 3 deletions charabia/src/segmenter/korean.rs
```diff
@@ -1,6 +1,4 @@
-use lindera_core::mode::{Mode, Penalty};
-use lindera_dictionary::{DictionaryConfig, DictionaryKind};
-use lindera_tokenizer::tokenizer::{Tokenizer, TokenizerConfig};
+use lindera::{DictionaryConfig, DictionaryKind, Mode, Penalty, Tokenizer, TokenizerConfig};
 use once_cell::sync::Lazy;
 
 use crate::segmenter::Segmenter;
```
