Skip to content

Commit

Permalink
Merge #278
Browse files Browse the repository at this point in the history
278: Adds a new normalizer to normalize œ to oe and æ to ae r=ManyTheFish a=Soham1803

# Pull Request

## Related issue
Fixes #268

## What does this PR do?
- Creates a new normalizer *ae_oe_normalizer*
- Normalizes `œ` and `Œ` to `oe`, and `æ` and `Æ` to `ae`.

## PR checklist
Please check if your PR fulfills the following requirements:
- [x] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)?
- [x] Have you read the contributing guidelines?
- [x] Have you made sure that the title is accurate and descriptive of the changes?

Thank you so much for contributing to Meilisearch!


Co-authored-by: Soham <[email protected]>
Co-authored-by: Clémentine <[email protected]>
  • Loading branch information
3 people authored May 21, 2024
2 parents eb3326d + ddda698 commit 6b270fc
Show file tree
Hide file tree
Showing 2 changed files with 159 additions and 0 deletions.
154 changes: 154 additions & 0 deletions charabia/src/normalizer/ae_oe_normalizer.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
use super::{CharNormalizer, CharOrStr};
use crate::{Script, Token};

/// Character normalizer that expands the Latin ligatures 'œ'/'Œ' into "oe"
/// and 'æ'/'Æ' into "ae".
///
/// Note that the uppercase ligatures are also mapped to the *lowercase*
/// replacements ("oe"/"ae"); this normalizer is registered among the lossy
/// normalizers, which run after the lowercasing step.
pub struct AeOeNormalizer;

// All normalizers only need to implement the method `normalize_char` and the
// method `should_normalize` of the `CharNormalizer` trait.
impl CharNormalizer for AeOeNormalizer {
    /// Produces the normalized form of a single character.
    ///
    /// The four handled ligatures are expanded to their two-letter lowercase
    /// equivalents; every other character passes through unchanged.
    fn normalize_char(&self, c: char) -> Option<CharOrStr> {
        let expanded = match c {
            'œ' | 'Œ' => "oe",
            'æ' | 'Æ' => "ae",
            // Not a ligature this normalizer handles: forward it untouched.
            _ => return Some(c.into()),
        };
        Some(expanded.to_string().into())
    }

    /// Only Latin-script tokens that actually contain at least one of the
    /// target ligatures need to go through this normalizer.
    fn should_normalize(&self, token: &Token) -> bool {
        token.script == Script::Latin && token.lemma.chars().any(is_should_normalize)
    }
}
/// Returns `true` when `c` is one of the four ligatures expanded by the
/// normalizer ('œ', 'æ', 'Œ', 'Æ').
fn is_should_normalize(c: char) -> bool {
    c == 'œ' || c == 'æ' || c == 'Œ' || c == 'Æ'
}

// Tests for `AeOeNormalizer`, driven by the shared `test_normalizer!` macro:
// it checks this normalizer in isolation (`normalizer_result`) and as part of
// the full normalizer pipeline (`normalized_tokens`).
#[cfg(test)]
mod test {
use std::borrow::Cow::Owned;

use crate::normalizer::test::test_normalizer;
use crate::normalizer::{Normalizer, NormalizerOption};
use crate::token::TokenKind;

// NOTE(review): `Token` and `Script` are referenced below but no `use` for
// them is visible inside this module — confirm they are brought into scope
// (e.g. via a `use crate::{Script, Token};` not shown in this view).

// base tokens to normalize: one single-ligature token per handled variant.
// NOTE(review): `char_end: 2` for a one-character lemma looks like it tracks
// bytes rather than chars — confirm against `Token`'s field contract.
fn tokens() -> Vec<Token<'static>> {
vec![
Token {
lemma: Owned("œ".to_string()),
char_end: 2,
byte_end: 2,
script: Script::Latin,
..Default::default()
},
Token {
lemma: Owned("Œ".to_string()),
char_end: 2,
byte_end: 2,
script: Script::Latin,
..Default::default()
},
Token {
lemma: Owned("æ".to_string()),
char_end: 2,
byte_end: 2,
script: Script::Latin,
..Default::default()
},
Token {
lemma: Owned("Æ".to_string()),
char_end: 2,
byte_end: 2,
script: Script::Latin,
..Default::default()
},
]
}

// expected result of the current Normalizer alone: both uppercase and
// lowercase ligatures expand to lowercase "oe"/"ae", and the char_map
// records the 2-byte ligature mapping to a 2-byte replacement.
fn normalizer_result() -> Vec<Token<'static>> {
vec![
Token {
lemma: Owned("oe".to_string()),
char_end: 2,
byte_end: 2,
script: Script::Latin,
char_map: Some(vec![(2, 2)]),
..Default::default()
},
Token {
lemma: Owned("oe".to_string()),
char_end: 2,
byte_end: 2,
script: Script::Latin,
char_map: Some(vec![(2, 2)]),
..Default::default()
},
Token {
lemma: Owned("ae".to_string()),
char_end: 2,
byte_end: 2,
script: Script::Latin,
char_map: Some(vec![(2, 2)]),
..Default::default()
},
Token {
lemma: Owned("ae".to_string()),
char_end: 2,
byte_end: 2,
script: Script::Latin,
char_map: Some(vec![(2, 2)]),
..Default::default()
},
]
}

// expected result of the complete Normalizer pipeline: same lemmas as
// above, with the token kind classified as Word.
fn normalized_tokens() -> Vec<Token<'static>> {
vec![
Token {
lemma: Owned("oe".to_string()),
char_end: 2,
byte_end: 2,
script: Script::Latin,
char_map: Some(vec![(2, 2)]),
kind: TokenKind::Word,
..Default::default()
},
Token {
lemma: Owned("oe".to_string()),
char_end: 2,
byte_end: 2,
script: Script::Latin,
char_map: Some(vec![(2, 2)]),
kind: TokenKind::Word,
..Default::default()
},
Token {
lemma: Owned("ae".to_string()),
char_end: 2,
byte_end: 2,
script: Script::Latin,
char_map: Some(vec![(2, 2)]),
kind: TokenKind::Word,
..Default::default()
},
Token {
lemma: Owned("ae".to_string()),
char_end: 2,
byte_end: 2,
script: Script::Latin,
char_map: Some(vec![(2, 2)]),
kind: TokenKind::Word,
..Default::default()
},
]
}

test_normalizer!(AeOeNormalizer, tokens(), normalizer_result(), normalized_tokens());
}
5 changes: 5 additions & 0 deletions charabia/src/normalizer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ pub use self::vietnamese::VietnameseNormalizer;
use crate::segmenter::SegmentedTokenIter;
use crate::Token;

pub use self::ae_oe_normalizer::AeOeNormalizer;

mod arabic;
#[cfg(feature = "chinese-normalization")]
mod chinese;
Expand All @@ -40,6 +42,8 @@ mod swedish_recomposition;
#[cfg(feature = "vietnamese")]
mod vietnamese;

mod ae_oe_normalizer;

/// List of [`Normalizer`]s used by [`Normalize::normalize`] that are not considered lossy.
pub static NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
vec![
Expand All @@ -56,6 +60,7 @@ pub static LOSSY_NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
vec![
Box::new(LowercaseNormalizer),
Box::new(QuoteNormalizer),
Box::new(AeOeNormalizer),
#[cfg(feature = "chinese-normalization")]
Box::new(ChineseNormalizer),
#[cfg(feature = "japanese-transliteration")]
Expand Down

0 comments on commit 6b270fc

Please sign in to comment.