Merge branch 'main' of github.com:Soham1803/charabia into feat/normalize-ae-oe
Soham1803 committed Apr 26, 2024
2 parents 4f29a36 + 5a64163 commit 2ff1080
Showing 12 changed files with 197 additions and 37 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/rust.yml
@@ -26,6 +26,8 @@ jobs:
run: cargo test --verbose
- name: Run tests with japanese-transliteration on
run: cargo test --verbose --features japanese-transliteration
- name: Run tests with chinese-normalization-pinyin on
run: cargo test --verbose --features chinese chinese-normalization-pinyin
- name: Run irg-kvariants tests
run: cargo test -p irg-kvariants --verbose

17 changes: 9 additions & 8 deletions charabia/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "charabia"
version = "0.8.8"
version = "0.8.9"
license = "MIT"
authors = ["Many <[email protected]>"]
edition = "2021"
@@ -24,9 +24,7 @@ once_cell = "1.19.0"
serde = "1.0"
slice-group-by = "0.3.1"
whatlang = "0.16.4"
lindera-core = "=0.28.0"
lindera-dictionary = "=0.28.0"
lindera-tokenizer = { version = "=0.28.0", default-features = false, optional = true }
lindera = { version = "=0.30.0", default-features = false, optional = true }
pinyin = { version = "0.10", default-features = false, features = [
"with_tone",
], optional = true }
@@ -41,19 +39,22 @@ jemalloc-sys = "0.5.4"
default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese"]

# allow chinese specialized tokenization
chinese = ["dep:pinyin", "dep:jieba-rs"]
chinese = ["chinese-segmentation", "chinese-normalization"]
chinese-segmentation = ["dep:jieba-rs"]
chinese-normalization = []
chinese-normalization-pinyin = ["dep:pinyin", "chinese-normalization"]

# allow hebrew specialized tokenization
hebrew = []

# allow japanese specialized tokenization
japanese = ["japanese-segmentation-unidic"]
japanese-segmentation-ipadic = ["lindera-tokenizer/ipadic", "lindera-tokenizer/ipadic-compress"]
japanese-segmentation-unidic = ["lindera-tokenizer/unidic", "lindera-tokenizer/unidic-compress"]
japanese-segmentation-ipadic = ["lindera/ipadic", "lindera/compress"]
japanese-segmentation-unidic = ["lindera/unidic", "lindera/compress"]
japanese-transliteration = ["dep:wana_kana"]

# allow korean specialized tokenization
korean = ["lindera-tokenizer/ko-dic", "lindera-tokenizer/ko-dic-compress"]
korean = ["lindera/ko-dic", "lindera/compress"]

# allow thai specialized tokenization
thai = []
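The feature graph above is the core of this change: `chinese` is now an umbrella over `chinese-segmentation` (jieba) and `chinese-normalization`, pinyin conversion moves behind the new opt-in `chinese-normalization-pinyin` flag, and the three lindera crates collapse into a single pinned `lindera = "=0.30.0"` dependency. A rough sketch of what this means for a downstream user, assuming charabia's public `Tokenize` trait and `Token::lemma()` accessor (not part of this diff); the expected lemmas come from the test fixtures later in this commit:

```rust
use charabia::Tokenize;

fn main() {
    // Built with the default `chinese` features (segmentation + normalization),
    // lemmas stay as kvariant-normalized CJK text, e.g. "人人".
    // Rebuilding with `--features "chinese chinese-normalization-pinyin"` should
    // restore the previous pinyin lemmas, e.g. "rénrén", per the TOKENIZED fixtures below.
    for token in "人人生而自由".tokenize() {
        println!("{}", token.lemma());
    }
}
```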
2 changes: 1 addition & 1 deletion charabia/README.md
@@ -19,7 +19,7 @@ Charabia provides a simple API to segment, normalize, or tokenize (segment + nor
| **Latin** | ✅ CamelCase segmentation |[compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + [nonspacing-marks](https://www.compart.com/en/unicode/category/Mn) removal + `Ð vs Đ` spoofing normalization | 🟩 ~23MiB/sec | 🟨 ~9MiB/sec |
| **Greek** ||[compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + final sigma normalization | 🟩 ~27MiB/sec | 🟨 ~8MiB/sec |
| **Cyrillic** - **Georgian** ||[compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase | 🟩 ~27MiB/sec | 🟨 ~9MiB/sec |
| **Chinese** **CMN** 🇨🇳 |[jieba](https://github.com/messense/jieba-rs) |[compatibility decomposition](https://unicode.org/reports/tr15/) + pinyin conversion | 🟨 ~10MiB/sec | 🟧 ~5MiB/sec |
| **Chinese** **CMN** 🇨🇳 |[jieba](https://github.com/messense/jieba-rs) |[compatibility decomposition](https://unicode.org/reports/tr15/) + kvariant conversion | 🟨 ~10MiB/sec | 🟧 ~5MiB/sec |
| **Hebrew** 🇮🇱 ||[compatibility decomposition](https://unicode.org/reports/tr15/) + [nonspacing-marks](https://www.compart.com/en/unicode/category/Mn) removal | 🟩 ~33MiB/sec | 🟨 ~11MiB/sec |
| **Arabic** |`ال` segmentation |[compatibility decomposition](https://unicode.org/reports/tr15/) + [nonspacing-marks](https://www.compart.com/en/unicode/category/Mn) removal + [Tatweel, Alef, Yeh, and Taa Marbuta normalization] | 🟩 ~36MiB/sec | 🟨 ~11MiB/sec |
| **Japanese** 🇯🇵 |[lindera](https://github.com/lindera-morphology/lindera) IPA-dict |[compatibility decomposition](https://unicode.org/reports/tr15/) | 🟧 ~3MiB/sec | 🟧 ~3MiB/sec |
88 changes: 84 additions & 4 deletions charabia/src/normalizer/chinese.rs
@@ -1,3 +1,4 @@
#[cfg(feature = "chinese-normalization-pinyin")]
use pinyin::ToPinyin;

use super::CharNormalizer;
@@ -23,14 +24,17 @@ impl CharNormalizer for ChineseNormalizer {
// Normalize to Pinyin
// If we don't manage to convert the kvariant, we try to convert the original character.
// If none of them are converted, we return the kvariant.
match kvariant.to_pinyin().or_else(|| c.to_pinyin()) {
#[cfg(feature = "chinese-normalization-pinyin")]
let kvariant = match kvariant.to_pinyin().or_else(|| c.to_pinyin()) {
Some(converted) => {
let with_tone = converted.with_tone();

Some(with_tone.to_string().into())
with_tone.to_string()
}
None => Some(kvariant.into()), // e.g. 杤
}
None => kvariant, // e.g. 杤
};

Some(kvariant.into())
}

fn should_normalize(&self, token: &Token) -> bool {
@@ -77,6 +81,7 @@ mod test {
}

// expected result of the current Normalizer.
#[cfg(feature = "chinese-normalization-pinyin")]
fn normalizer_result() -> Vec<Token<'static>> {
vec![
Token {
@@ -113,6 +118,7 @@
}

// expected result of the complete Normalizer pipeline.
#[cfg(feature = "chinese-normalization-pinyin")]
fn normalized_tokens() -> Vec<Token<'static>> {
vec![
Token {
@@ -148,5 +154,79 @@
]
}

// expected result of the current Normalizer.
#[cfg(not(feature = "chinese-normalization-pinyin"))]
fn normalizer_result() -> Vec<Token<'static>> {
vec![
Token {
lemma: Owned("尊嚴".to_string()),
char_end: 2,
byte_end: 6,
char_map: Some(vec![(3, 3), (3, 3)]),
script: Script::Cj,
language: Some(Language::Cmn),
..Default::default()
},
Token {
lemma: Owned("生而自由".to_string()),
char_end: 4,
byte_end: 12,
char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 3)]),
script: Script::Cj,
language: Some(Language::Cmn),
..Default::default()
},
Token {
lemma: Owned("澳䁈亞本刃𣜜".to_string()),
char_end: 5,
byte_end: 15,
char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 4)]),
script: Script::Cj,
language: Some(Language::Cmn),
..Default::default()
},
]
}

// expected result of the complete Normalizer pipeline.
#[cfg(not(feature = "chinese-normalization-pinyin"))]
fn normalized_tokens() -> Vec<Token<'static>> {
vec![
Token {
kind: TokenKind::Word,
lemma: Owned("尊嚴".to_string()),
char_start: 0,
char_end: 2,
byte_start: 0,
byte_end: 6,
char_map: Some(vec![(3, 3), (3, 3)]),
script: Script::Cj,
language: Some(Language::Cmn),
},
Token {
kind: TokenKind::Word,
lemma: Owned("生而自由".to_string()),
char_start: 0,
char_end: 4,
byte_start: 0,
byte_end: 12,
char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 3)]),
script: Script::Cj,
language: Some(Language::Cmn),
},
Token {
kind: TokenKind::Word,
lemma: Owned("澳䁈亞本刃𣜜".to_string()),
char_start: 0,
char_end: 5,
byte_start: 0,
byte_end: 15,
char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 4)]),
script: Script::Cj,
language: Some(Language::Cmn),
},
]
}

test_normalizer!(ChineseNormalizer, tokens(), normalizer_result(), normalized_tokens());
}
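For clarity, here is the feature-gated flow from the `ChineseNormalizer` hunk above reduced to a free function. This is a sketch, not the actual code: the kvariant table lookup is passed in as an argument and the return type is simplified to `String`, but the pinyin calls mirror the diff exactly.

```rust
#[cfg(feature = "chinese-normalization-pinyin")]
use pinyin::ToPinyin;

// `c` is the original character, `kvariant` the character chosen by the
// kvariant lookup. Without the pinyin feature, the kvariant is returned as-is.
#[allow(unused_variables)] // `c` is only read when the pinyin feature is on
fn normalize(c: char, kvariant: char) -> String {
    // With the feature enabled, shadow `kvariant` with its toned pinyin reading,
    // trying the kvariant first, then the original character.
    #[cfg(feature = "chinese-normalization-pinyin")]
    let kvariant = match kvariant.to_pinyin().or_else(|| c.to_pinyin()) {
        Some(converted) => converted.with_tone().to_string(),
        None => kvariant.to_string(), // e.g. 杤 has no pinyin reading
    };

    kvariant.to_string()
}
```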
46 changes: 46 additions & 0 deletions charabia/src/normalizer/control_char.rs
@@ -103,6 +103,7 @@ mod test {
}

// expected result of the complete Normalizer pipeline.
#[cfg(feature = "chinese-normalization-pinyin")]
fn normalized_tokens() -> Vec<Token<'static>> {
vec![
Token {
@@ -146,5 +147,50 @@
]
}

// expected result of the complete Normalizer pipeline.
#[cfg(not(feature = "chinese-normalization-pinyin"))]
fn normalized_tokens() -> Vec<Token<'static>> {
vec![
Token {
lemma: Owned("生而自由oo".to_string()),
char_end: 9,
byte_end: 17,
script: Script::Cj,
char_map: Some(vec![
(1, 0),
(3, 3),
(3, 3),
(3, 3),
(3, 3),
(1, 0),
(1, 1),
(1, 1),
(1, 0),
]),
kind: TokenKind::Word,
..Default::default()
},
Token {
lemma: Owned("生而自由oo".to_string()),
char_end: 9,
byte_end: 17,
script: Script::Cj,
char_map: Some(vec![
(1, 0),
(3, 3),
(3, 3),
(3, 3),
(3, 3),
(1, 0),
(1, 1),
(1, 1),
(1, 0),
]),
kind: TokenKind::Word,
..Default::default()
},
]
}

test_normalizer!(ControlCharNormalizer, tokens(), normalizer_result(), normalized_tokens());
}
6 changes: 3 additions & 3 deletions charabia/src/normalizer/mod.rs
@@ -3,7 +3,7 @@ use std::borrow::Cow;
use once_cell::sync::Lazy;

pub use self::arabic::ArabicNormalizer;
#[cfg(feature = "chinese")]
#[cfg(feature = "chinese-normalization")]
pub use self::chinese::ChineseNormalizer;
pub use self::classify::{Classifier, ClassifierOption};
pub use self::compatibility_decomposition::CompatibilityDecompositionNormalizer;
@@ -23,7 +23,7 @@ use crate::Token;
pub use self::ae_oe_normalizer::AeOeNormalizer;

mod arabic;
#[cfg(feature = "chinese")]
#[cfg(feature = "chinese-normalization")]
mod chinese;
mod classify;
mod compatibility_decomposition;
@@ -55,7 +55,7 @@ pub static LOSSY_NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
Box::new(LowercaseNormalizer),
Box::new(QuoteNormalizer),
Box::new(AeOeNormalizer),
#[cfg(feature = "chinese")]
#[cfg(feature = "chinese-normalization")]
Box::new(ChineseNormalizer),
#[cfg(feature = "japanese-transliteration")]
Box::new(JapaneseNormalizer),
37 changes: 37 additions & 0 deletions charabia/src/segmenter/chinese.rs
@@ -64,6 +64,7 @@ mod test {
];

// Segmented and normalized version of the text.
#[cfg(feature = "chinese-normalization-pinyin")]
const TOKENIZED: &[&str] = &[
"rénrén",
"shēngérzìyóu",
@@ -99,6 +100,42 @@
"。",
];

#[cfg(not(feature = "chinese-normalization-pinyin"))]
const TOKENIZED: &[&str] = &[
"人人",
"生而自由",
",",
"在",
"尊",
"嚴",
"和",
"權",
"利",
"上",
"一律平等",
"。",
"他",
"們",
"賦",
"有",
"理性",
"和",
"良心",
",",
"並",
"應",
"以",
"兄弟",
"關",
"係",
"的",
"精神",
"互相",
"對",
"待",
"。",
];

// Macro that run several tests on the Segmenter.
test_segmenter!(ChineseSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Cj, Language::Cmn);
}
6 changes: 2 additions & 4 deletions charabia/src/segmenter/japanese.rs
@@ -1,8 +1,6 @@
use lindera_core::mode::Mode;
#[cfg(feature = "japanese-segmentation-ipadic")]
use lindera_core::mode::Penalty;
use lindera_dictionary::{DictionaryConfig, DictionaryKind};
use lindera_tokenizer::tokenizer::{Tokenizer, TokenizerConfig};
use lindera::Penalty;
use lindera::{DictionaryConfig, DictionaryKind, Mode, Tokenizer, TokenizerConfig};
use once_cell::sync::Lazy;

use crate::segmenter::Segmenter;
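With the three lindera crates merged into one, the segmenter only changes its import paths; the tokenizer construction itself is not shown in this hunk. Below is a sketch of what the construction presumably looks like with the consolidated crate. The `DictionaryConfig`/`TokenizerConfig` fields and `Tokenizer::from_config` are assumptions carried over from the lindera 0.28 API and may differ slightly in 0.30; the Korean segmenter in the next file follows the same pattern with the ko-dic dictionary kind.

```rust
use lindera::{DictionaryConfig, DictionaryKind, Mode, Tokenizer, TokenizerConfig};
use once_cell::sync::Lazy;

// Assumed construction for the unidic-backed Japanese segmenter; field names
// and the `from_config` constructor are carried over from lindera 0.28.
static JAPANESE_TOKENIZER: Lazy<Tokenizer> = Lazy::new(|| {
    let dictionary = DictionaryConfig { kind: Some(DictionaryKind::UniDic), path: None };
    let config = TokenizerConfig { dictionary, user_dictionary: None, mode: Mode::Normal };
    Tokenizer::from_config(config).expect("failed to build the lindera tokenizer")
});
```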
4 changes: 1 addition & 3 deletions charabia/src/segmenter/korean.rs
@@ -1,6 +1,4 @@
use lindera_core::mode::{Mode, Penalty};
use lindera_dictionary::{DictionaryConfig, DictionaryKind};
use lindera_tokenizer::tokenizer::{Tokenizer, TokenizerConfig};
use lindera::{DictionaryConfig, DictionaryKind, Mode, Penalty, Tokenizer, TokenizerConfig};
use once_cell::sync::Lazy;

use crate::segmenter::Segmenter;
11 changes: 5 additions & 6 deletions charabia/src/segmenter/mod.rs
@@ -3,11 +3,13 @@ use std::collections::HashMap;

use aho_corasick::{AhoCorasick, FindIter, MatchKind};
pub use arabic::ArabicSegmenter;
#[cfg(feature = "chinese")]
#[cfg(feature = "chinese-segmentation")]
pub use chinese::ChineseSegmenter;
use either::Either;
#[cfg(feature = "japanese")]
pub use japanese::JapaneseSegmenter;
#[cfg(feature = "khmer")]
pub use khmer::KhmerSegmenter;
#[cfg(feature = "korean")]
pub use korean::KoreanSegmenter;
pub use latin::LatinSegmenter;
@@ -16,15 +18,12 @@ use slice_group_by::StrGroupBy;
#[cfg(feature = "thai")]
pub use thai::ThaiSegmenter;

#[cfg(feature = "khmer")]
pub use khmer::KhmerSegmenter;

use crate::detection::{Detect, Language, Script, StrDetection};
use crate::separators::DEFAULT_SEPARATORS;
use crate::token::Token;

mod arabic;
#[cfg(feature = "chinese")]
#[cfg(feature = "chinese-segmentation")]
mod chinese;
#[cfg(feature = "japanese")]
mod japanese;
@@ -54,7 +53,7 @@ pub static SEGMENTERS: Lazy<HashMap<(Script, Language), Box<dyn Segmenter>>> = L
// latin segmenter
((Script::Latin, Language::Other), Box::new(LatinSegmenter) as Box<dyn Segmenter>),
// chinese segmenter
#[cfg(feature = "chinese")]
#[cfg(feature = "chinese-segmentation")]
((Script::Cj, Language::Cmn), Box::new(ChineseSegmenter) as Box<dyn Segmenter>),
// japanese segmenter
#[cfg(feature = "japanese")]
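Because segmentation and normalization are now gated separately, a build with `default-features = false` and only `chinese-segmentation` still registers the `ChineseSegmenter`. A minimal sketch, assuming charabia's public `Segment` trait and its `segment_str` iterator (names not part of this diff); the expected segments match the SEGMENTED fixture earlier in this commit:

```rust
use charabia::Segment;

fn main() {
    // Segmentation only: jieba splits the text, no normalization is applied.
    let segments: Vec<&str> = "人人生而自由".segment_str().collect();
    println!("{segments:?}"); // expected: ["人人", "生而自由"]
}
```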
2 changes: 1 addition & 1 deletion charabia/src/separators.rs
@@ -59,7 +59,7 @@ pub const DEFAULT_SEPARATORS: &[&str] = &[
"𑪠", "𑪡", "𑪢", "𑱁", "𑱂", "𑱃", "𑱄", "𑱅", "𑱰", "𑱱", "𑻷", "𑻸", "𑿿", "𒑰", "𒑱", "𒑲", "𒑳", "𒑴", "𖩮",
"𖩯", "𖫵", "𖬷", "𖬸", "𖬹", "𖬺", "𖬻", "𖭄", "𖺗", "𖺘", "𖺙", "𖺚", "𖿢", "𛲟", "𝪇", "𝪈", "𝪉", "𝪊", "𝪋",
"𞥞", "𞥟", "\n", "\r", "\u{2029}", " ", " ", " ", " ", " ", " ", " ", " ", " ", " ", " ", " ",
" ", " ", "`"
" ", " ", "`", "\t"
];

#[rustfmt::skip]
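The only functional change in this file is adding `"\t"` to `DEFAULT_SEPARATORS`, so tab-separated input now splits the same way space-separated input does. A quick check, again assuming the public `Tokenize` trait and `Token::is_word()`/`lemma()` (names not part of this diff):

```rust
use charabia::Tokenize;

fn main() {
    // With "\t" in DEFAULT_SEPARATORS, the tab is classified as a separator
    // and "foo\tbar" yields the two words "foo" and "bar".
    let words: Vec<String> = "foo\tbar"
        .tokenize()
        .filter(|token| token.is_word())
        .map(|token| token.lemma().to_string())
        .collect();

    assert_eq!(words, ["foo", "bar"]);
}
```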