Merge branch 'main' into feature/german-compound-words

luflow · web-flow · commit 61634c97db7d · 2024-08-28T13:15:41.000+02:00
diff --git a/charabia/Cargo.toml b/charabia/Cargo.toml
@@ -31,7 +31,7 @@ unicode-normalization = "0.1.23"
 irg-kvariants = { path = "../irg-kvariants", version = "=0.1.1" }
 
 [features]
-default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese", "swedish-recomposition", "german-segmentation"]
+default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese", "swedish-recomposition", "turkish", "german-segmentation"]
 
 # allow chinese specialized tokenization
 chinese = ["chinese-segmentation", "chinese-normalization"]
@@ -71,7 +71,10 @@ latin-snakecase = ["dep:finl_unicode"]
 # force Charabia to recompose Swedish characters
 swedish-recomposition = []
 
-# force Charabia to decompose German composite words
+# allow turkish specialized tokenization
+turkish = []
+
+# allow decomposition of German composite words
 german-segmentation = []
 
 [dev-dependencies]
diff --git a/charabia/README.md b/charabia/README.md
@@ -16,7 +16,7 @@ Charabia provides a simple API to segment, normalize, or tokenize (segment + nor
 
 |  Script / Language  |                           specialized segmentation                            | specialized normalization | Segmentation Performance level | Tokenization Performance level |
 |---------------------|-------------------------------------------------------------------------------|---------------------------|-------------------|---|
-| **Latin** | ✅ CamelCase segmentation | ✅ [compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + [nonspacing-marks](https://www.compart.com/en/unicode/category/Mn) removal + `Ð vs Đ` spoofing normalization         | 🟩 ~23MiB/sec    | 🟨 ~9MiB/sec    |
+| **Latin** | ✅ CamelCase segmentation | ✅ [compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + [nonspacing-marks](https://www.compart.com/en/unicode/category/Mn) removal + `Ð vs Đ` spoofing normalization + `ı` normalization | 🟩 ~23MiB/sec    | 🟨 ~9MiB/sec    |
 | **Greek** | ❌ | ✅ [compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + final sigma normalization         | 🟩 ~27MiB/sec    | 🟨 ~8MiB/sec    |
 | **Cyrillic** - **Georgian** | ❌ | ✅ [compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase          | 🟩 ~27MiB/sec    | 🟨 ~9MiB/sec    |
 | **Chinese** **CMN** 🇨🇳 | ✅ [jieba](https://github.com/messense/jieba-rs) | ✅ [compatibility decomposition](https://unicode.org/reports/tr15/) + kvariant conversion | 🟨 ~10MiB/sec    | 🟧 ~5MiB/sec    |
diff --git a/charabia/src/normalizer/mod.rs b/charabia/src/normalizer/mod.rs
@@ -17,6 +17,8 @@ use self::nonspacing_mark::NonspacingMarkNormalizer;
 use self::quote::QuoteNormalizer;
 #[cfg(feature = "swedish-recomposition")]
 use self::swedish_recomposition::SwedishRecompositionNormalizer;
+#[cfg(feature = "turkish")]
+pub use self::turkish::TurkishNormalizer;
 #[cfg(feature = "vietnamese")]
 pub use self::vietnamese::VietnameseNormalizer;
 use crate::segmenter::SegmentedTokenIter;
@@ -39,6 +41,8 @@ mod nonspacing_mark;
 mod quote;
 #[cfg(feature = "swedish-recomposition")]
 mod swedish_recomposition;
+#[cfg(feature = "turkish")]
+mod turkish;
 #[cfg(feature = "vietnamese")]
 mod vietnamese;
 
@@ -71,6 +75,8 @@ pub static LOSSY_NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
         Box::new(NonspacingMarkNormalizer),
         #[cfg(feature = "vietnamese")]
         Box::new(VietnameseNormalizer),
+        #[cfg(feature = "turkish")]
+        Box::new(TurkishNormalizer),
     ]
 });
 
diff --git a/charabia/src/normalizer/turkish.rs b/charabia/src/normalizer/turkish.rs