Skip to content
This repository has been archived by the owner on Jul 15, 2024. It is now read-only.

Commit

Permalink
[WIP] split normalization and tagging into separate enums
Browse files Browse the repository at this point in the history
  • Loading branch information
TTWNO committed Jun 5, 2024
1 parent 6327ead commit 3eb4391
Show file tree
Hide file tree
Showing 2 changed files with 70 additions and 19 deletions.
10 changes: 10 additions & 0 deletions data/abbr.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
Mrs. = misses
Mr. = mister
Dr. = doctor
Dr = drive
St = street
St. = saint
Jr. = junior
Prof. = professor
Ave. = avenue
Ave = avenue
79 changes: 60 additions & 19 deletions fry_normalize/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#![no_std]
#![feature(lazy_cell)]

extern crate alloc;

Expand All @@ -9,10 +10,12 @@ use num2words::{
Currency
};
use alloc::{
collections::BTreeMap,
string::{String, ToString},
boxed::Box,
vec::Vec,
};
use core::cell::LazyCell;
use regex::Regex;
use num_bigfloat::BigFloat;

Expand Down Expand Up @@ -50,7 +53,7 @@ macro_rules! regex_get_all {
macro_rules! regex_match {
($regex:expr, $func:ident, $list:ident) => {
{
let boxed: Box<dyn NormalizationHandler> = Box::new($func);
let boxed: Box<dyn TaggingHandler<'static>> = Box::new($func);
let set = (
Regex::new($regex).unwrap(),
boxed
Expand Down Expand Up @@ -79,20 +82,31 @@ pub enum Error {
auto_into_enum!(Error, Error::Regex, regex::Error);
auto_into_enum!(Error, Error::Num, Num2Err);

pub trait NormalizationHandler {
fn normalize(&self, input: &str) -> String;
pub trait NormalizationHandler<'a> {
fn normalize(&self, input: &'a str) -> TaggedWord<'a>;
}
impl<F> NormalizationHandler for F
impl<'a, F> NormalizationHandler<'a> for F
where
F: Fn(&str) -> Result<String, Error> {
fn normalize(&self, input: &str) -> String {
self(input).unwrap_or(input.to_string())
F: Fn(&'a str) -> Result<TaggedWord<'a>, Error> {
fn normalize(&self, input: &'a str) -> TaggedWord<'a> {
self(input).unwrap_or(TaggedWord::Word(input))
}
}

type NormalizationItems = Vec<(Regex, Box<dyn NormalizationHandler>)>;
/// Classifies a raw token into a [`TaggedWord`] category.
///
/// Blanket-implemented for any `Fn(&'a str) -> Result<TaggedWord<'a>, Error>`,
/// so plain functions like `normalize_word` can be boxed as handlers.
pub trait TaggingHandler<'a> {
    /// Tag `input`. Never fails: a handler error degrades to the untouched
    /// word, `TaggedWord::Word(input)`.
    fn tag(&self, input: &'a str) -> TaggedWord<'a>;
}
impl<'a, F> TaggingHandler<'a> for F
where
    F: Fn(&'a str) -> Result<TaggedWord<'a>, Error> {
    fn tag(&self, input: &'a str) -> TaggedWord<'a> {
        // Best-effort: on error the word passes through untagged.
        self(input).unwrap_or(TaggedWord::Word(input))
    }
}

// Ordered table of (pattern, handler) pairs; `normalize` tries them in order
// and the first full match wins.
type TaggingItems<'a> = Vec<(Regex, Box<dyn TaggingHandler<'a>>)>;

fn normalize_number(input: &str) -> Result<TaggedWord, Error> {
let ordinal = input.ends_with("th") || input.ends_with("st") || input.ends_with("nd");
let currency = input.starts_with("$");
// remove all non-number values
Expand All @@ -104,43 +118,70 @@ fn normalize_number(input: &str) -> Result<String, Error> {
if ordinal {
ntw = ntw.ordinal();
}
Ok(ntw.to_words()?)
Ok(TaggedWord::Number(ntw))
}
fn normalize_word(word: &str) -> Result<String, Error> {
Ok(word.to_string())
fn normalize_word(word: &str) -> Result<TaggedWord, Error> {
Ok(TaggedWord::Word(word))
}
fn normalize_symbol(sym: &str) -> Result<String, Error> {
fn normalize_symbol(sym: &str) -> Result<TaggedWord, Error> {
todo!()
}

// Digits (commas allowed) with optional "$" prefix and ordinal suffix,
// e.g. "1,000", "$5", "2nd". Matching is unanchored; `normalize` requires a
// full-token match.
const NUMBER_REGEX: &str = "\\$?[0-9,]+((st)|(nd)|(th))?";
// Ordinary words: optional leading letter (any case), then lowercase letters
// and apostrophes.
const REGEX_WORD: &str = "[a-zA-Z]?[a-z']+";
// All uppercase words are symbols and are spoken letter by letter
const SYMBOL_REGEX: &str = "[A-Z.]+";
// Abbreviation expansion table parsed from `data/abbr.txt`, one
// "Abbr. = expansion" pair per line; lines without " = " are silently skipped.
//
// NOTE(review): a `const` item is inlined at every use site, so this
// `LazyCell` — and the parsed map — is re-created on each access; the
// laziness buys nothing here (clippy: `declare_interior_mutable_const`).
// A `static` with a sync lazy type would fix it, but needs a
// no_std-compatible once-cell — confirm before changing.
const ABBR_LIST: LazyCell<BTreeMap<&'static str, &'static str>> = LazyCell::new(|| {
    let data = include_str!("../../data/abbr.txt");
    let mut map = BTreeMap::new();
    for line in data.lines() {
        if let Some((def, res)) = line.split_once(" = ") {
            map.insert(def, res);
        }
    }
    map
});

fn regex_map() -> Result<NormalizationItems, Error> {
fn regex_map() -> Result<TaggingItems<'static>, Error> {
let mut resp = Vec::new();
regex_match!(NUMBER_REGEX, normalize_number, resp);
regex_match!(REGEX_WORD, normalize_word, resp);
regex_match!(SYMBOL_REGEX, normalize_symbol, resp);
Ok(resp)
}

fn normalize(s: &str) -> String {
/// An abbreviation and its spoken expansion, both borrowed from the static
/// `ABBR_LIST` table.
#[derive(Debug)]
pub struct Abbreviation(&'static str, &'static str);

impl Abbreviation {
    /// Looks `from` up in `ABBR_LIST`; `None` when it is not a known
    /// abbreviation.
    fn from_str(from: &str) -> Option<Self> {
        // `get_key_value` yields `(&&'static str, &&'static str)`; destructure
        // one level of reference so `k`/`v` are the `&'static str`s stored in
        // the map. Binding plain `(k, v)` would produce `&&'static str`, which
        // does not coerce to the struct's `'static` fields.
        ABBR_LIST.get_key_value(from)
            .map(|(&k, &v)| Abbreviation(k, v))
    }
}

/// A token classified by the tagging pass, borrowing from the input text.
#[derive(Debug, Eq, PartialEq)]
pub enum TaggedWord<'a> {
    /// An ordinary word, passed through verbatim.
    Word(&'a str),
    /// An all-uppercase symbol/initialism, to be spoken letter by letter.
    Symbol(&'a str),
    /// Presumably a known abbreviation from `ABBR_LIST` — confirm intended use.
    Abbr(&'a str),
    /// A numeric token. NOTE(review): `normalize_number` builds
    /// `TaggedWord::Number(ntw)` from a num2words builder, which does not
    /// match this `&str` payload — one of the two needs updating (WIP).
    Number(&'a str),
}

fn normalize(s: &'static str) -> Vec<TaggedWord> {
let rm = regex_map().unwrap();
s.split_whitespace()
.map(|word| {
for (regex, func) in rm.iter() {
if let Some(mtc) = regex.find(word) {
if mtc.start() == 0 && mtc.end() == word.len() {
return func.normalize(word);
return func.tag(word);
}
}
}
word.to_string()
TaggedWord::Word(word)
})
.collect::<Vec<String>>()
.join(" ")
.collect::<Vec<TaggedWord>>()
}

#[cfg(test)]
Expand Down

0 comments on commit 3eb4391

Please sign in to comment.