-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnormalizer.py
48 lines (34 loc) · 1.49 KB
/
normalizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
from re import compile as re_compile
from re import escape as re_escape
from re import sub as re_sub
from string import punctuation
class Normalizer:
    """Insert or remove whitespace around ASCII punctuation in text.

    The punctuation set is ``string.punctuation``. The regex fragment and
    the two compiled patterns are built lazily on first access and cached
    on the instance, so constructing a ``Normalizer`` does no regex work.
    """

    @property
    def _punct(self) -> str:
        """Return ``string.punctuation`` escaped for use inside a regex."""
        if not hasattr(self, '_punct_initialized'):
            self._punct_initialized = re_escape(punctuation)
        return self._punct_initialized

    @property
    def _punct_re(self):
        """Return the compiled pattern matching one punctuation character.

        Used by ``normalize_symbol_boundaries``; group 1 is the character.
        """
        if not hasattr(self, '_punct_re_initialized'):
            self._punct_re_initialized = re_compile(
                r'([{}])'.format(self._punct)
            )
        return self._punct_re_initialized

    @property
    def _s_punct_re(self):
        """Return the compiled pattern matching whitespace then punctuation.

        Used by ``denormalize_symbol_boundaries``; group 1 is the whole
        match (the whitespace run plus the punctuation character).
        """
        if not hasattr(self, '_s_punct_re_initialized'):
            # ``\s+`` rather than ``\s``: the entire run of whitespace in
            # front of the punctuation must be consumed, otherwise only the
            # last whitespace character would be removed per call.
            self._s_punct_re_initialized = re_compile(
                r'(\s+[{}])'.format(self._punct)
            )
        return self._s_punct_re_initialized

    def normalize_symbol_boundaries(self, text: str) -> str:
        """Surround every punctuation character in *text* with spaces.

        After padding, each run of whitespace is collapsed to one space and
        the result is stripped of leading and trailing whitespace.

        Args:
            text: The string to normalize.

        Returns:
            The normalized string, e.g. ``"Hello, world!"`` becomes
            ``"Hello , world !"``.
        """
        padded = self._punct_re.sub(r' \1 ', text)
        return re_sub(r'\s+', ' ', padded).strip()

    def denormalize_symbol_boundaries(self, text: str) -> str:
        """Remove all whitespace directly preceding punctuation in *text*.

        Whitespace that follows a punctuation character is left untouched.

        Args:
            text: The string to denormalize.

        Returns:
            The string with whitespace before each punctuation character
            removed, e.g. ``"Hello , world !"`` becomes ``"Hello, world!"``.
        """
        # The character class matches exactly one character, so the last
        # character of each match is the punctuation mark to keep.
        return self._s_punct_re.sub(lambda match: match.group()[-1], text)