meilisearch · ManyTheFish · Jul 25, 2024 · Jul 25, 2024 · Jul 25, 2024 · Jul 25, 2024
diff --git a/charabia/Cargo.toml b/charabia/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "charabia"
-version = "0.8.12"
+version = "0.9.0"
 license = "MIT"
 authors = ["Many <[email protected]>"]
 edition = "2021"
@@ -31,7 +31,7 @@ unicode-normalization = "0.1.23"
 irg-kvariants = { path = "../irg-kvariants", version = "=0.1.1" }
 
 [features]
-default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese"]
+default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese", "swedish-recomposition"]
 
 # allow chinese specialized tokenization
 chinese = ["chinese-segmentation", "chinese-normalization"]

diff --git a/charabia/src/normalizer/arabic.rs b/charabia/src/normalizer/arabic.rs
@@ -8,8 +8,8 @@ use crate::{Script, Token};
 /// - normalizing the arabic Alef 'أ','إ','آ','ٱ' to 'ا'
 /// - normalizing the arabic Yeh 'ى' to 'ي'
 /// - Normalizing the arabic Taa Marbuta 'ة' to 'ه'
-/// https://en.wikipedia.org/wiki/Arabic_alphabet
-/// https://en.wikipedia.org/wiki/Kashida
+///   <https://en.wikipedia.org/wiki/Arabic_alphabet>
+///   <https://en.wikipedia.org/wiki/Kashida>
 
 pub struct ArabicNormalizer;
 

diff --git a/charabia/src/normalizer/swedish_recomposition.rs b/charabia/src/normalizer/swedish_recomposition.rs
@@ -8,7 +8,7 @@ use crate::normalizer::NormalizerOption;
 use crate::{Language, Token};
 
 static MATCHING_STR: Lazy<AhoCorasick> = Lazy::new(|| {
-    AhoCorasick::new(&["A\u{30a}", "a\u{30a}", "A\u{308}", "a\u{308}", "O\u{308}", "o\u{308}"])
+    AhoCorasick::new(["A\u{30a}", "a\u{30a}", "A\u{308}", "a\u{308}", "O\u{308}", "o\u{308}"])
         .unwrap()
 });
 

diff --git a/charabia/src/separators.rs b/charabia/src/separators.rs
@@ -12,8 +12,8 @@
 /// - Zl Line Separator
 /// - Zp Paragraph Separator
 /// - Zs Space Separator
-/// plus "\0", ". ", ", " and ។ល។" (៘ decomposition) to categorize them as hard separators
-/// and "`" to understand markdown formatted text
+///   plus "\0", ". ", ", " and "។ល។" (៘ decomposition) to categorize them as hard separators
+///   and "`" to understand markdown formatted text
 #[rustfmt::skip]
 pub const DEFAULT_SEPARATORS: &[&str] = &[
     "\0", ". ", ", ", "_", "‿", "⁀", "⁔", "︳", "︴", "﹍", "﹎", "﹏", "＿", "-", "֊", "־", "᐀", "᠆", "‐", "‒", "–",