diff --git a/charabia/Cargo.toml b/charabia/Cargo.toml index 69562d5..6bf0568 100644 --- a/charabia/Cargo.toml +++ b/charabia/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "charabia" -version = "0.8.12" +version = "0.9.0" license = "MIT" authors = ["Many "] edition = "2021" @@ -31,7 +31,7 @@ unicode-normalization = "0.1.23" irg-kvariants = { path = "../irg-kvariants", version = "=0.1.1" } [features] -default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese"] +default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese", "swedish-recomposition"] # allow chinese specialized tokenization chinese = ["chinese-segmentation", "chinese-normalization"] diff --git a/charabia/src/normalizer/arabic.rs b/charabia/src/normalizer/arabic.rs index b12812f..306b06e 100644 --- a/charabia/src/normalizer/arabic.rs +++ b/charabia/src/normalizer/arabic.rs @@ -8,8 +8,8 @@ use crate::{Script, Token}; /// - normalizing the arabic Alef 'أ','إ','آ','ٱ' to 'ا' /// - normalizing the arabic Yeh 'ى' to 'ي' /// - Normalizing the arabic Taa Marbuta 'ة' to 'ه' -/// https://en.wikipedia.org/wiki/Arabic_alphabet -/// https://en.wikipedia.org/wiki/Kashida +/// https://en.wikipedia.org/wiki/Arabic_alphabet +/// https://en.wikipedia.org/wiki/Kashida pub struct ArabicNormalizer; diff --git a/charabia/src/normalizer/swedish_recomposition.rs b/charabia/src/normalizer/swedish_recomposition.rs index 9a38da4..cd77e28 100644 --- a/charabia/src/normalizer/swedish_recomposition.rs +++ b/charabia/src/normalizer/swedish_recomposition.rs @@ -8,7 +8,7 @@ use crate::normalizer::NormalizerOption; use crate::{Language, Token}; static MATCHING_STR: Lazy = Lazy::new(|| { - AhoCorasick::new(&["A\u{30a}", "a\u{30a}", "A\u{308}", "a\u{308}", "O\u{308}", "o\u{308}"]) + AhoCorasick::new(["A\u{30a}", "a\u{30a}", "A\u{308}", "a\u{308}", "O\u{308}", "o\u{308}"]) .unwrap() }); diff --git a/charabia/src/separators.rs b/charabia/src/separators.rs index c821204..29fc26a 100644 --- a/charabia/src/separators.rs +++ b/charabia/src/separators.rs @@ -12,8 +12,8 @@ /// - Zl Line Separator /// - Zp Paragraph Separator /// - Zs Space Separator -/// plus "\0", ". ", ", " and ។ល។" (៘ decomposition) to categorize them as hard separators -/// and "`" to understand markdown formatted text +/// plus "\0", ". ", ", " and ។ល។" (៘ decomposition) to categorize them as hard separators +/// and "`" to understand markdown formatted text #[rustfmt::skip] pub const DEFAULT_SEPARATORS: &[&str] = &[ "\0", ". ", ", ", "_", "‿", "⁀", "⁔", "︳", "︴", "﹍", "﹎", "﹏", "_", "-", "֊", "־", "᐀", "᠆", "‐", "‒", "–",