normalize Ð and Đ into d #257

Merged · 3 commits · Jan 24, 2024
Changes from 1 commit
5 changes: 4 additions & 1 deletion charabia/Cargo.toml
@@ -37,7 +37,7 @@ litemap = "0.6.1"
zerovec = "0.9.3"

[features]
default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer"]
default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese"]

# allow chinese specialized tokenization
chinese = ["dep:pinyin", "dep:jieba-rs"]
@@ -65,6 +65,9 @@ latin-camelcase = ["dep:finl_unicode"]

khmer = []

# allow vietnamese specialized tokenization
vietnamese = []

# allow splitting snake_case latin words
latin-snakecase = ["dep:finl_unicode"]

5 changes: 5 additions & 0 deletions charabia/src/normalizer/mod.rs
@@ -13,6 +13,7 @@ use self::greek::GreekNormalizer;
#[cfg(feature = "japanese-transliteration")]
pub use self::japanese::JapaneseNormalizer;
pub use self::lowercase::LowercaseNormalizer;
#[cfg(feature = "vietnamese")]
pub use self::vietnamese::VietnameseNormalizer;
use self::nonspacing_mark::NonspacingMarkNormalizer;
use self::quote::QuoteNormalizer;
use crate::segmenter::SegmentedTokenIter;
@@ -31,6 +32,8 @@ mod japanese;
mod lowercase;
mod nonspacing_mark;
mod quote;
#[cfg(feature = "vietnamese")]
mod vietnamese;

/// List of [`Normalizer`]s used by [`Normalize::normalize`] that are not considered lossy.
pub static NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
@@ -54,6 +57,8 @@ pub static LOSSY_NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
Box::new(GreekNormalizer),
Box::new(ArabicNormalizer),
Box::new(NonspacingMarkNormalizer),
#[cfg(feature = "vietnamese")]
Box::new(VietnameseNormalizer),
]
});

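The mod.rs diff above registers the normalizer behind a Cargo feature flag. As a self-contained sketch of that pattern (the `Normalizer` trait and the type names below are placeholders for illustration, not charabia's actual API), a feature can gate an entry in the normalizer list like this:

```rust
// Sketch of feature-gated registration. `Normalizer`, `LowercaseNorm`, and
// `VietnameseNorm` are stand-in names, not charabia's real types.
trait Normalizer {
    fn name(&self) -> &'static str;
}

struct LowercaseNorm;
impl Normalizer for LowercaseNorm {
    fn name(&self) -> &'static str {
        "lowercase"
    }
}

#[cfg(feature = "vietnamese")]
struct VietnameseNorm;
#[cfg(feature = "vietnamese")]
impl Normalizer for VietnameseNorm {
    fn name(&self) -> &'static str {
        "vietnamese"
    }
}

fn normalizers() -> Vec<Box<dyn Normalizer>> {
    let mut list: Vec<Box<dyn Normalizer>> = vec![Box::new(LowercaseNorm)];
    // This entry only exists when the crate is built with `--features vietnamese`;
    // otherwise the statement is removed at compile time.
    #[cfg(feature = "vietnamese")]
    list.push(Box::new(VietnameseNorm));
    list
}
```

Because the gating happens at compile time, users who don't enable the feature pay no runtime or binary-size cost for the extra normalizer.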
23 changes: 23 additions & 0 deletions charabia/src/normalizer/vietnamese.rs
@@ -0,0 +1,23 @@
use super::{CharNormalizer, CharOrStr};
use crate::Script;
use crate::Token;

pub struct VietnameseNormalizer;

impl CharNormalizer for VietnameseNormalizer {
    fn normalize_char(&self, c: char) -> Option<CharOrStr> {
        match c {
            // Not only Vietnamese: several other European languages use these letters too.
            'Ð' | 'Đ' | 'đ' => Some("d".to_string().into()),

[Inline review thread on the line above]

Reviewer:
This should say:

    'Ð' | 'Đ' | 'đ' => Some("đ".to_string().into()),

since "d" is a different letter, no?

Member:
I think you are right. I would even prefer something like:

    'Ð' => Some("Đ".into()),
    'ð' => Some("đ".into()),

https://www.compart.com/en/unicode/U+0111

Contributor (author):
I'm not sure how Slovenian, Croatian, or other languages handle this, but Vietnamese people type on US-layout keyboards with Unicode input software that has to be installed manually, and not everyone has it.

People would be happy if typing "Da Lat" produced the same results as "Đà Lạt", with "D" treated the same as "Đ" in digital text.

This is from Airbnb:
[Screenshot, 2024-01-18 22:42]

This is from Skyscanner:
[Screenshot, 2024-01-18 22:46]

Member:
Understood, let's keep your implementation then. Could you make the CI happy? That way I will be able to merge your PR :)

@jzabroski (Jan 22, 2024):
@ManyTheFish Doesn't the normalizer come before the tokenizer? Given that Vietnamese is an n-gram language, I would have thought throwing away the d-with-stroke metadata might hurt the n-gram part of the code downstream. Also, if you normalize it here to just "d", don't you also need to wait for all indexed documents to be reindexed for this to work?

Anyway, I think the real question is whether this solution actually meets users' needs. It's possible it does.

Contributor (author):
@jzabroski Yeah, what I expected may not be quite the same as what the PR does, and I'm not sure how it impacts things overall.

Member:
Hello @jzabroski and @ngdbao, the normalizers are processed after the word segmentation, so we already have the token at this step.

Contributor (author):
@ManyTheFish Sounds great now, can't wait to see it get merged :)
            _ => None,
        }
    }

    fn should_normalize(&self, token: &Token) -> bool {
        token.script == Script::Latin && token.lemma.chars().any(should_normalize_char)
    }
}

fn should_normalize_char(c: char) -> bool {
    matches!(c, 'Ð' | 'Đ' | 'đ')
}
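As a standalone illustration of the folding rule this PR implements (independent of charabia's `CharNormalizer` trait; the helper names here are made up for the sketch), the character mapping can be expressed as:

```rust
/// Fold the Latin letters Eth (Ð) and D-with-stroke (Đ/đ) to plain "d",
/// mirroring the mapping in VietnameseNormalizer. This is lossy on purpose:
/// it lets a query like "Da Lat" match the indexed form of "Đà Lạt".
fn fold_d_with_stroke(c: char) -> char {
    match c {
        'Ð' | 'Đ' | 'đ' => 'd',
        _ => c,
    }
}

/// Apply the fold to every character of a word (hypothetical helper;
/// the other diacritics are left for charabia's separate normalizers).
fn fold_word(word: &str) -> String {
    word.chars().map(fold_d_with_stroke).collect()
}
```

Note that uppercase Ð and Đ fold to lowercase "d": in the real pipeline a lowercase normalizer runs as well, so case is folded elsewhere.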