Skip to content

Commit 61634c9

Browse files
authored
Merge branch 'main' into feature/german-compound-words
2 parents f6999c6 + dd260b9 commit 61634c9

File tree

4 files changed

+433
-3
lines changed

4 files changed

+433
-3
lines changed

charabia/Cargo.toml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ unicode-normalization = "0.1.23"
3131
irg-kvariants = { path = "../irg-kvariants", version = "=0.1.1" }
3232

3333
[features]
34-
default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese", "swedish-recomposition", "german-segmentation"]
34+
default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese", "swedish-recomposition", "turkish", "german-segmentation"]
3535

3636
# allow Chinese specialized tokenization
3737
chinese = ["chinese-segmentation", "chinese-normalization"]
@@ -71,7 +71,10 @@ latin-snakecase = ["dep:finl_unicode"]
7171
# force Charabia to recompose Swedish characters
7272
swedish-recomposition = []
7373

74-
# force Charabia to decompose German composite words
74+
# allow Turkish specialized tokenization
75+
turkish = []
76+
77+
# allow decomposition of German composite words
7578
german-segmentation = []
7679

7780
[dev-dependencies]

charabia/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ Charabia provides a simple API to segment, normalize, or tokenize (segment + nor
1616

1717
| Script / Language | specialized segmentation | specialized normalization | Segmentation Performance level | Tokenization Performance level |
1818
|---------------------|-------------------------------------------------------------------------------|---------------------------|-------------------|---|
19-
| **Latin** | ✅ CamelCase segmentation |[compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + [nonspacing-marks](https://www.compart.com/en/unicode/category/Mn) removal + `Ð vs Đ` spoofing normalization | 🟩 ~23MiB/sec | 🟨 ~9MiB/sec |
19+
| **Latin** | ✅ CamelCase segmentation |[compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + [nonspacing-marks](https://www.compart.com/en/unicode/category/Mn) removal + `Ð vs Đ` spoofing normalization + `ı` normalization | 🟩 ~23MiB/sec | 🟨 ~9MiB/sec |
2020
| **Greek** ||[compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + final sigma normalization | 🟩 ~27MiB/sec | 🟨 ~8MiB/sec |
2121
| **Cyrillic** - **Georgian** ||[compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase | 🟩 ~27MiB/sec | 🟨 ~9MiB/sec |
2222
| **Chinese** **CMN** 🇨🇳 |[jieba](https://github.com/messense/jieba-rs) |[compatibility decomposition](https://unicode.org/reports/tr15/) + kvariant conversion | 🟨 ~10MiB/sec | 🟧 ~5MiB/sec |

charabia/src/normalizer/mod.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ use self::nonspacing_mark::NonspacingMarkNormalizer;
1717
use self::quote::QuoteNormalizer;
1818
#[cfg(feature = "swedish-recomposition")]
1919
use self::swedish_recomposition::SwedishRecompositionNormalizer;
20+
#[cfg(feature = "turkish")]
21+
pub use self::turkish::TurkishNormalizer;
2022
#[cfg(feature = "vietnamese")]
2123
pub use self::vietnamese::VietnameseNormalizer;
2224
use crate::segmenter::SegmentedTokenIter;
@@ -39,6 +41,8 @@ mod nonspacing_mark;
3941
mod quote;
4042
#[cfg(feature = "swedish-recomposition")]
4143
mod swedish_recomposition;
44+
#[cfg(feature = "turkish")]
45+
mod turkish;
4246
#[cfg(feature = "vietnamese")]
4347
mod vietnamese;
4448

@@ -71,6 +75,8 @@ pub static LOSSY_NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
7175
Box::new(NonspacingMarkNormalizer),
7276
#[cfg(feature = "vietnamese")]
7377
Box::new(VietnameseNormalizer),
78+
#[cfg(feature = "turkish")]
79+
Box::new(TurkishNormalizer),
7480
]
7581
});
7682

0 commit comments

Comments
 (0)