Merge #211
211: Enhance Quotation marks support r=ManyTheFish a=ManyTheFish

# Pull Request
This PR now segments and normalizes all single high quotation marks as single quotes (`'`).
## Related issue
Related to [meilisearch#3689](meilisearch/meilisearch#3689), this PR is a hotfix for Meilisearch v1.2, shipped before the `separator customization feature` that will refactor the Latin segmenter in v1.3.
## What does this PR do?
- Make the Latin Segmenter segment on the other kinds of single high quotation marks (`’`, `‘`, `‛`)
- Normalize Unicode single high quotation marks into single quotes (`'`); see the usage sketch below.
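
As an illustration, a minimal usage sketch (assuming charabia's top-level `Tokenize` trait and the `Token::lemma()` accessor, as shown in the crate README); the expected lemma follows the segmenter and normalizer behaviour introduced by this PR:

```rust
use charabia::Tokenize;

// `it’s` contains a curly apostrophe (U+2019): the Latin segmenter now splits
// on it like the ASCII `'`, and the QuoteNormalizer rewrites it to `'`.
let mut tokens = "it’s".tokenize();

let token = tokens.next().unwrap();
// the first token ("it’") is normalized to "it'".
assert_eq!(token.lemma(), "it'");
```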

Co-authored-by: ManyTheFish <[email protected]>
bors[bot] and ManyTheFish authored Apr 26, 2023
2 parents 37e495a + d3a724c commit cd1ad65
Showing 3 changed files with 77 additions and 3 deletions.
3 changes: 3 additions & 0 deletions charabia/src/normalizer/mod.rs
@@ -14,6 +14,7 @@ use crate::classifier::ClassifiedTokenIter;
#[cfg(feature = "greek")]
use crate::normalizer::greek::GreekNormalizer;
use crate::normalizer::nonspacing_mark::NonspacingMarkNormalizer;
use crate::normalizer::quote::QuoteNormalizer;
use crate::Token;

mod arabic;
@@ -27,6 +28,7 @@ mod greek;
mod japanese;
mod lowercase;
mod nonspacing_mark;
mod quote;

/// List of [`Normalizer`]s used by [`Normalize::normalize`].
pub static NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
@@ -40,6 +42,7 @@ pub static NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
        #[cfg(feature = "greek")]
        Box::new(GreekNormalizer),
        Box::new(ControlCharNormalizer),
        Box::new(QuoteNormalizer),
        Box::new(NonspacingMarkNormalizer),
        Box::new(ArabicNormalizer),
    ]
71 changes: 71 additions & 0 deletions charabia/src/normalizer/quote.rs
@@ -0,0 +1,71 @@
use super::{CharNormalizer, CharOrStr};
use crate::detection::Script;
use crate::Token;

/// Latin specialized [`Normalizer`].
///
/// This Normalizer replaces Unicode single high quotation marks with an ASCII single quote (`'`).
pub struct QuoteNormalizer;

impl CharNormalizer for QuoteNormalizer {
    fn normalize_char(&self, c: char) -> Option<CharOrStr> {
        if is_unicode_high_quotation_mark(c) {
            Some('\''.into())
        } else {
            Some(c.into())
        }
    }

    fn should_normalize(&self, token: &Token) -> bool {
        token.script == Script::Latin && token.lemma.chars().any(is_unicode_high_quotation_mark)
    }
}

/// Returns `true` for the Unicode single high quotation marks
/// U+2019 (’), U+2018 (‘), and U+201B (‛).
fn is_unicode_high_quotation_mark(c: char) -> bool {
    matches!(c, '’' | '‘' | '‛')
}

#[cfg(test)]
mod test {
    use std::borrow::Cow::Owned;

    use crate::normalizer::test::test_normalizer;
    use crate::normalizer::{Normalizer, NormalizerOption};

    // base tokens to normalize.
    fn tokens() -> Vec<Token<'static>> {
        vec![Token {
            lemma: Owned("l'l’l‘l‛".to_string()),
            char_end: 8,
            byte_end: 14,
            script: Script::Latin,
            ..Default::default()
        }]
    }

    // expected result of the current Normalizer.
    fn normalizer_result() -> Vec<Token<'static>> {
        vec![Token {
            lemma: Owned("l'l'l'l'".to_string()),
            char_end: 8,
            byte_end: 14,
            script: Script::Latin,
            // each pair is (original byte length, normalized byte length):
            // the 3-byte curly quotes are rewritten as the 1-byte `'`.
            char_map: Some(vec![(1, 1), (1, 1), (1, 1), (3, 1), (1, 1), (3, 1), (1, 1), (3, 1)]),
            ..Default::default()
        }]
    }

    // expected result of the complete Normalizer pipeline.
    fn normalized_tokens() -> Vec<Token<'static>> {
        vec![Token {
            lemma: Owned("l'l'l'l'".to_string()),
            char_end: 8,
            byte_end: 14,
            script: Script::Latin,
            char_map: Some(vec![(1, 1), (1, 1), (1, 1), (3, 1), (1, 1), (3, 1), (1, 1), (3, 1)]),
            ..Default::default()
        }]
    }

    test_normalizer!(QuoteNormalizer, tokens(), normalizer_result(), normalized_tokens());
}
6 changes: 3 additions & 3 deletions charabia/src/segmenter/latin.rs
@@ -20,7 +20,7 @@ impl Segmenter for LatinSegmenter {
    fn segment_str<'o>(&self, s: &'o str) -> Box<dyn Iterator<Item = &'o str> + 'o> {
        let lemmas = s
            .split_word_bounds()
-            .flat_map(|lemma| lemma.split_inclusive('\''))
+            .flat_map(|lemma| lemma.split_inclusive(['\'', '’', '‘', '‛']))
            .flat_map(split_camel_case_bounds);

        Box::new(lemmas)
@@ -32,9 +32,9 @@ mod test {
    use crate::segmenter::test::test_segmenter;

    const TEXT: &str =
-        "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F! camelCase";
+        "The quick (\"brown\") fox can’t jump 32.3 feet, right? Brr, it's 29.3°F! camelCase";
    const SEGMENTED: &[&str] = &[
-        "The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can'", "t", " ",
+        "The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can’", "t", " ",
        "jump", " ", "32.3", " ", "feet", ",", " ", "right", "?", " ", "Brr", ",", " ", "it'", "s",
        " ", "29.3", "°", "F", "!", " ", "camel", "Case",
    ];
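
For a quick end-to-end check of the segmenter change, a sketch assuming charabia's `Segment` trait (which exposes `segment_str()` on `&str`, as in the crate README):

```rust
use charabia::Segment;

// the curly apostrophe is now an inclusive split point, just like the ASCII `'`:
let segments: Vec<&str> = "can’t stop".segment_str().collect();
assert_eq!(segments, ["can’", "t", " ", "stop"]);
```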
