From 4189dad13198896f9b28734f14bc8c13215ecc53 Mon Sep 17 00:00:00 2001
From: ngdbao <ngdbao94@Gmail.com>
Date: Wed, 17 Jan 2024 00:46:56 +0700
Subject: [PATCH 1/3] =?UTF-8?q?normalize=20=C3=90=20and=20=C4=90?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 charabia/Cargo.toml                   |  5 ++++-
 charabia/src/normalizer/mod.rs        |  5 +++++
 charabia/src/normalizer/vietnamese.rs | 23 +++++++++++++++++++++++
 3 files changed, 32 insertions(+), 1 deletion(-)
 create mode 100644 charabia/src/normalizer/vietnamese.rs
diff --git a/charabia/Cargo.toml b/charabia/Cargo.toml
index 6c3bdeb9..6dd4e1cd 100644
--- a/charabia/Cargo.toml
+++ b/charabia/Cargo.toml
@@ -37,7 +37,7 @@ litemap = "0.6.1"
 zerovec = "0.9.3"
 
 [features]
-default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer"]
+default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese"]
 
 # allow chinese specialized tokenization
 chinese = ["dep:pinyin", "dep:jieba-rs"]
@@ -65,6 +65,9 @@ latin-camelcase = ["dep:finl_unicode"]
 
 khmer = []
 
+# allow vietnamese specialized normalization
+vietnamese = []
+
 # allow splitting snake_case latin words
 latin-snakecase = ["dep:finl_unicode"]
 
diff --git a/charabia/src/normalizer/mod.rs b/charabia/src/normalizer/mod.rs
index 4fe13de9..cd07b618 100644
--- a/charabia/src/normalizer/mod.rs
+++ b/charabia/src/normalizer/mod.rs
@@ -13,6 +13,7 @@ use self::greek::GreekNormalizer;
 #[cfg(feature = "japanese-transliteration")]
 pub use self::japanese::JapaneseNormalizer;
 pub use self::lowercase::LowercaseNormalizer;
+pub use self::vietnamese::VietnameseNormalizer;
 use self::nonspacing_mark::NonspacingMarkNormalizer;
 use self::quote::QuoteNormalizer;
 use crate::segmenter::SegmentedTokenIter;
@@ -31,6 +32,8 @@ mod japanese;
 mod lowercase;
 mod nonspacing_mark;
 mod quote;
+#[cfg(feature = "vietnamese")]
+mod vietnamese;
 
 /// List of [`Normalizer`]s used by [`Normalize::normalize`] that are not considered lossy.
 pub static NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
@@ -54,6 +57,8 @@ pub static LOSSY_NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
         Box::new(GreekNormalizer),
         Box::new(ArabicNormalizer),
         Box::new(NonspacingMarkNormalizer),
+        #[cfg(feature = "vietnamese")]
+        Box::new(VietnameseNormalizer),
     ]
 });
 
diff --git a/charabia/src/normalizer/vietnamese.rs b/charabia/src/normalizer/vietnamese.rs
new file mode 100644
index 00000000..84c28bc9
--- /dev/null
+++ b/charabia/src/normalizer/vietnamese.rs
@@ -0,0 +1,23 @@
+use super::{CharNormalizer, CharOrStr};
+use crate::Token;
+use crate::Script;
+
+pub struct VietnameseNormalizer;
+
+impl CharNormalizer for VietnameseNormalizer {
+    fn normalize_char(&self, c: char) -> Option<CharOrStr> {
+        match c {
+            'Ð' | 'Đ' | 'đ' => Some('d'.into()), // fold Eth / D-with-stroke to "d"; not only Vietnamese, several European languages use these letters
+            _ => Some(c.into()), // keep every other char as-is; NOTE(review): `None` appears to drop the char from the lemma, confirm against `CharNormalizer`
+        }
+    }
+
+    fn should_normalize(&self, token: &Token) -> bool {
+        token.script == Script::Latin && token.lemma.chars().any(is_should_normalize)
+    } 
+
+}
+
+fn is_should_normalize(c: char) -> bool {
+    matches!(c, 'Ð' | 'Đ' | 'đ') // true for U+00D0 (Eth), U+0110 and U+0111 (D with stroke); the two capitals look alike but are distinct code points
+}

From 14be492e7beff2326ef37dedd194a73fbeae96c2 Mon Sep 17 00:00:00 2001
From: ngdbao <ngdbao94@Gmail.com>
Date: Wed, 17 Jan 2024 01:27:02 +0700
Subject: [PATCH 2/3] format code

---
 charabia/src/normalizer/mod.rs        | 3 ++-
 charabia/src/normalizer/vietnamese.rs | 3 +--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/charabia/src/normalizer/mod.rs b/charabia/src/normalizer/mod.rs
index cd07b618..404346be 100644
--- a/charabia/src/normalizer/mod.rs
+++ b/charabia/src/normalizer/mod.rs
@@ -13,9 +13,10 @@ use self::greek::GreekNormalizer;
 #[cfg(feature = "japanese-transliteration")]
 pub use self::japanese::JapaneseNormalizer;
 pub use self::lowercase::LowercaseNormalizer;
-pub use self::vietnamese::VietnameseNormalizer;
 use self::nonspacing_mark::NonspacingMarkNormalizer;
 use self::quote::QuoteNormalizer;
+#[cfg(feature = "vietnamese")]
+pub use self::vietnamese::VietnameseNormalizer;
 use crate::segmenter::SegmentedTokenIter;
 use crate::Token;
 
diff --git a/charabia/src/normalizer/vietnamese.rs b/charabia/src/normalizer/vietnamese.rs
index 84c28bc9..78218bf7 100644
--- a/charabia/src/normalizer/vietnamese.rs
+++ b/charabia/src/normalizer/vietnamese.rs
@@ -1,6 +1,6 @@
 use super::{CharNormalizer, CharOrStr};
-use crate::Token;
 use crate::Script;
+use crate::Token;
 
 pub struct VietnameseNormalizer;
 
@@ -15,7 +15,6 @@ impl CharNormalizer for VietnameseNormalizer {
     fn should_normalize(&self, token: &Token) -> bool {
         token.script == Script::Latin && token.lemma.chars().any(is_should_normalize)
     } 
-
 }
 
 fn is_should_normalize(c: char) -> bool {

From 4103a7345b39916f5da543dd881fc33fc94f894a Mon Sep 17 00:00:00 2001
From: ngdbao <ngdbao94@Gmail.com>
Date: Mon, 22 Jan 2024 18:57:03 +0700
Subject: [PATCH 3/3] fix formatting to comply with CI

---
 charabia/src/normalizer/vietnamese.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/charabia/src/normalizer/vietnamese.rs b/charabia/src/normalizer/vietnamese.rs
index 78218bf7..31e0836f 100644
--- a/charabia/src/normalizer/vietnamese.rs
+++ b/charabia/src/normalizer/vietnamese.rs
@@ -14,7 +14,7 @@ impl CharNormalizer for VietnameseNormalizer {
 
     fn should_normalize(&self, token: &Token) -> bool {
         token.script == Script::Latin && token.lemma.chars().any(is_should_normalize)
-    } 
+    }
 }
 
 fn is_should_normalize(c: char) -> bool {