Skip to content

Commit

Permalink
Merge branch 'main' into feature/german-compound-words
Browse files Browse the repository at this point in the history
  • Loading branch information
luflow authored Aug 28, 2024
2 parents f6999c6 + dd260b9 commit 61634c9
Show file tree
Hide file tree
Showing 4 changed files with 433 additions and 3 deletions.
7 changes: 5 additions & 2 deletions charabia/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ unicode-normalization = "0.1.23"
irg-kvariants = { path = "../irg-kvariants", version = "=0.1.1" }

[features]
default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese", "swedish-recomposition", "german-segmentation"]
default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese", "swedish-recomposition", "turkish", "german-segmentation"]

# allow Chinese specialized tokenization
chinese = ["chinese-segmentation", "chinese-normalization"]
Expand Down Expand Up @@ -71,7 +71,10 @@ latin-snakecase = ["dep:finl_unicode"]
# force Charabia to recompose Swedish characters
swedish-recomposition = []

# force Charabia to decompose German composite words
# allow Turkish specialized tokenization
turkish = []

# allow decomposition of German composite words
german-segmentation = []

[dev-dependencies]
Expand Down
2 changes: 1 addition & 1 deletion charabia/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ Charabia provides a simple API to segment, normalize, or tokenize (segment + nor

| Script / Language | specialized segmentation | specialized normalization | Segmentation Performance level | Tokenization Performance level |
|---------------------|-------------------------------------------------------------------------------|---------------------------|-------------------|---|
| **Latin** | ✅ CamelCase segmentation |[compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + [nonspacing-marks](https://www.compart.com/en/unicode/category/Mn) removal + `Ð vs Đ` spoofing normalization | 🟩 ~23MiB/sec | 🟨 ~9MiB/sec |
| **Latin** | ✅ CamelCase segmentation |[compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + [nonspacing-marks](https://www.compart.com/en/unicode/category/Mn) removal + `Ð vs Đ` spoofing normalization + `ı` normalization | 🟩 ~23MiB/sec | 🟨 ~9MiB/sec |
| **Greek** ||[compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + final sigma normalization | 🟩 ~27MiB/sec | 🟨 ~8MiB/sec |
| **Cyrillic** - **Georgian** ||[compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase | 🟩 ~27MiB/sec | 🟨 ~9MiB/sec |
| **Chinese** **CMN** 🇨🇳 |[jieba](https://github.com/messense/jieba-rs) |[compatibility decomposition](https://unicode.org/reports/tr15/) + kvariant conversion | 🟨 ~10MiB/sec | 🟧 ~5MiB/sec |
Expand Down
6 changes: 6 additions & 0 deletions charabia/src/normalizer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ use self::nonspacing_mark::NonspacingMarkNormalizer;
use self::quote::QuoteNormalizer;
#[cfg(feature = "swedish-recomposition")]
use self::swedish_recomposition::SwedishRecompositionNormalizer;
#[cfg(feature = "turkish")]
pub use self::turkish::TurkishNormalizer;
#[cfg(feature = "vietnamese")]
pub use self::vietnamese::VietnameseNormalizer;
use crate::segmenter::SegmentedTokenIter;
Expand All @@ -39,6 +41,8 @@ mod nonspacing_mark;
mod quote;
#[cfg(feature = "swedish-recomposition")]
mod swedish_recomposition;
#[cfg(feature = "turkish")]
mod turkish;
#[cfg(feature = "vietnamese")]
mod vietnamese;

Expand Down Expand Up @@ -71,6 +75,8 @@ pub static LOSSY_NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
Box::new(NonspacingMarkNormalizer),
#[cfg(feature = "vietnamese")]
Box::new(VietnameseNormalizer),
#[cfg(feature = "turkish")]
Box::new(TurkishNormalizer),
]
});

Expand Down
Loading

0 comments on commit 61634c9

Please sign in to comment.