From 4189dad13198896f9b28734f14bc8c13215ecc53 Mon Sep 17 00:00:00 2001 From: ngdbao Date: Wed, 17 Jan 2024 00:46:56 +0700 Subject: [PATCH 1/3] =?UTF-8?q?normalize=20=C3=90=20and=20=C4=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- charabia/Cargo.toml | 5 ++++- charabia/src/normalizer/mod.rs | 5 +++++ charabia/src/normalizer/vietnamese.rs | 23 +++++++++++++++++++++++ 3 files changed, 32 insertions(+), 1 deletion(-) create mode 100644 charabia/src/normalizer/vietnamese.rs diff --git a/charabia/Cargo.toml b/charabia/Cargo.toml index 6c3bdeb9..6dd4e1cd 100644 --- a/charabia/Cargo.toml +++ b/charabia/Cargo.toml @@ -37,7 +37,7 @@ litemap = "0.6.1" zerovec = "0.9.3" [features] -default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer"] +default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese"] # allow chinese specialized tokenization chinese = ["dep:pinyin", "dep:jieba-rs"] @@ -65,6 +65,9 @@ latin-camelcase = ["dep:finl_unicode"] khmer = [] +# allow vietnamese specialized tokenization +vietnamese = [] + # allow splitting snake_case latin words latin-snakecase = ["dep:finl_unicode"] diff --git a/charabia/src/normalizer/mod.rs b/charabia/src/normalizer/mod.rs index 4fe13de9..cd07b618 100644 --- a/charabia/src/normalizer/mod.rs +++ b/charabia/src/normalizer/mod.rs @@ -13,6 +13,7 @@ use self::greek::GreekNormalizer; #[cfg(feature = "japanese-transliteration")] pub use self::japanese::JapaneseNormalizer; pub use self::lowercase::LowercaseNormalizer; +pub use self::vietnamese::VietnameseNormalizer; use self::nonspacing_mark::NonspacingMarkNormalizer; use self::quote::QuoteNormalizer; use crate::segmenter::SegmentedTokenIter; @@ -31,6 +32,8 @@ mod japanese; mod lowercase; mod nonspacing_mark; mod quote; +#[cfg(feature = "vietnamese")] +mod vietnamese; /// List of [`Normalizer`]s used by 
[`Normalize::normalize`] that are not considered lossy. pub static NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| { @@ -54,6 +57,8 @@ pub static LOSSY_NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| { Box::new(GreekNormalizer), Box::new(ArabicNormalizer), Box::new(NonspacingMarkNormalizer), + #[cfg(feature = "vietnamese")] + Box::new(VietnameseNormalizer), ] }); diff --git a/charabia/src/normalizer/vietnamese.rs b/charabia/src/normalizer/vietnamese.rs new file mode 100644 index 00000000..84c28bc9 --- /dev/null +++ b/charabia/src/normalizer/vietnamese.rs @@ -0,0 +1,23 @@ +use super::{CharNormalizer, CharOrStr}; +use crate::Token; +use crate::Script; + +pub struct VietnameseNormalizer; + +impl CharNormalizer for VietnameseNormalizer { + fn normalize_char(&self, c: char) -> Option<CharOrStr> { + match c { + 'Ð' | 'Đ' | 'đ' => Some("d".to_string().into()), // not only Vietnamese, but also many European countries use these letters + _ => None, + } + } + + fn should_normalize(&self, token: &Token) -> bool { + token.script == Script::Latin && token.lemma.chars().any(is_should_normalize) + } + +} + +fn is_should_normalize(c: char) -> bool { + matches!(c, 'Ð' | 'Đ' | 'đ') +} From 14be492e7beff2326ef37dedd194a73fbeae96c2 Mon Sep 17 00:00:00 2001 From: ngdbao Date: Wed, 17 Jan 2024 01:27:02 +0700 Subject: [PATCH 2/3] format code --- charabia/src/normalizer/mod.rs | 2 +- charabia/src/normalizer/vietnamese.rs | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/charabia/src/normalizer/mod.rs b/charabia/src/normalizer/mod.rs index cd07b618..404346be 100644 --- a/charabia/src/normalizer/mod.rs +++ b/charabia/src/normalizer/mod.rs @@ -13,9 +13,9 @@ use self::greek::GreekNormalizer; #[cfg(feature = "japanese-transliteration")] pub use self::japanese::JapaneseNormalizer; pub use self::lowercase::LowercaseNormalizer; -pub use self::vietnamese::VietnameseNormalizer; use self::nonspacing_mark::NonspacingMarkNormalizer; use self::quote::QuoteNormalizer; +pub use self::vietnamese::VietnameseNormalizer; use
crate::segmenter::SegmentedTokenIter; use crate::Token; diff --git a/charabia/src/normalizer/vietnamese.rs b/charabia/src/normalizer/vietnamese.rs index 84c28bc9..78218bf7 100644 --- a/charabia/src/normalizer/vietnamese.rs +++ b/charabia/src/normalizer/vietnamese.rs @@ -1,6 +1,6 @@ use super::{CharNormalizer, CharOrStr}; -use crate::Token; use crate::Script; +use crate::Token; pub struct VietnameseNormalizer; @@ -15,7 +15,6 @@ impl CharNormalizer for VietnameseNormalizer { fn should_normalize(&self, token: &Token) -> bool { token.script == Script::Latin && token.lemma.chars().any(is_should_normalize) } - } fn is_should_normalize(c: char) -> bool { From 4103a7345b39916f5da543dd881fc33fc94f894a Mon Sep 17 00:00:00 2001 From: ngdbao Date: Mon, 22 Jan 2024 18:57:03 +0700 Subject: [PATCH 3/3] fix format comply CI --- charabia/src/normalizer/vietnamese.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charabia/src/normalizer/vietnamese.rs b/charabia/src/normalizer/vietnamese.rs index 78218bf7..31e0836f 100644 --- a/charabia/src/normalizer/vietnamese.rs +++ b/charabia/src/normalizer/vietnamese.rs @@ -14,7 +14,7 @@ impl CharNormalizer for VietnameseNormalizer { fn should_normalize(&self, token: &Token) -> bool { token.script == Script::Latin && token.lemma.chars().any(is_should_normalize) - } + } } fn is_should_normalize(c: char) -> bool {