From 6eb66ba35db8648e0d7722bd029976357dbfec5a Mon Sep 17 00:00:00 2001 From: aikimark Date: Sat, 20 Mar 2021 11:36:23 -0400 Subject: [PATCH 1/4] regex changes * changed regexes from set to list * added labels/tags as third item in tuples * changed processing of regexes input to TupleManager * added filtered list for nickname patterns * added/changed regexes --- nameparser/config/__init__.py | 2 +- nameparser/config/regexes.py | 27 ++++++++++++++---- nameparser/config/testREGEXES.py | 49 ++++++++++++++++++++++++++++++++ nameparser/parser.py | 25 ++++++++++------ 4 files changed, 88 insertions(+), 15 deletions(-) create mode 100644 nameparser/config/testREGEXES.py diff --git a/nameparser/config/__init__.py b/nameparser/config/__init__.py index 4f1e4f2..4eaf15b 100644 --- a/nameparser/config/__init__.py +++ b/nameparser/config/__init__.py @@ -231,7 +231,7 @@ def __init__(self, self.first_name_titles = SetManager(first_name_titles) self.conjunctions = SetManager(conjunctions) self.capitalization_exceptions = TupleManager(capitalization_exceptions) - self.regexes = TupleManager(regexes) + self.regexes = TupleManager([tpl[:2] for tpl in REGEXES]) self._pst = None @property diff --git a/nameparser/config/regexes.py b/nameparser/config/regexes.py index bd4b320..e520169 100644 --- a/nameparser/config/regexes.py +++ b/nameparser/config/regexes.py @@ -18,20 +18,37 @@ '[\u2600-\u26FF\u2700-\u27BF])+', re.UNICODE) -REGEXES = set([ +REGEXES = [ ("spaces", re.compile(r"\s+", re.U)), ("word", re.compile(r"(\w|\.)+", re.U)), ("mac", re.compile(r'^(ma?c)(\w{2,})', re.I | re.U)), ("initial", re.compile(r'^(\w\.|[A-Z])?$', re.U)), - ("quoted_word", re.compile(r'(? Date: Sun, 21 Mar 2021 08:40:02 -0400 Subject: [PATCH 2/4] remove testREGEXES remove testREGEXES.py from repository --- nameparser/config/testREGEXES.py | 49 -------------------------------- nameparser/config/titles.py | 2 ++ 2 files changed, 2 insertions(+), 49 deletions(-) delete mode 100644 nameparser/config/testREGEXES.py diff --git a/nameparser/config/testREGEXES.py b/nameparser/config/testREGEXES.py deleted file mode 100644 index 91ea6ce..0000000 --- a/nameparser/config/testREGEXES.py +++ /dev/null @@ -1,49 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Thu Mar 18 04:54:12 2021 - -@author: New User -""" -import re - -class TupleManager(dict): - ''' - A dictionary with dot.notation access. Subclass of ``dict``. Makes the tuple constants - more friendly. - ''' - def __getattr__(self, attr): - return self.get(attr) - __setattr__= dict.__setitem__ - __delattr__= dict.__delitem__ - - def __getstate__(self): - return dict(self) - - def __setstate__(self, state): - self.__init__(state) - - def __reduce__(self): - return (TupleManager, (), self.__getstate__()) - -REGEXES = [ - ("spaces", re.compile(r"\s+", re.U)), - ("word", re.compile(r"(\w|\.)+", re.U)), - ("mac", re.compile(r'^(ma?c)(\w{2,})', re.I | re.U)), - ("initial", re.compile(r'^(\w\.|[A-Z])?$', re.U)), - ("quoted_word", re.compile(r'(? Date: Mon, 22 Mar 2021 14:11:19 -0400 Subject: [PATCH 3/4] added nickname tests * test for adding nickname * test for multiple nicknames --- nameparser/config/regexes.py | 1 + nameparser/config/suffixes.py | 2 + nameparser/parser.py | 75 +++++++++++++++++++++++++++++------ tests.py | 45 +++++++++++++++++++++ 4 files changed, 110 insertions(+), 13 deletions(-) diff --git a/nameparser/config/regexes.py b/nameparser/config/regexes.py index e520169..ab2e8bf 100644 --- a/nameparser/config/regexes.py +++ b/nameparser/config/regexes.py @@ -36,6 +36,7 @@ ("period_not_at_end",re.compile(r'.*\..+$', re.I | re.U)), ("emoji",re_emoji), ("phd", re.compile(r'\s(ph\.?\s+d\.?)', re.I | re.U)), + ("nn_sep_safe", re.compile(r'[^ ,]', re.U)), ] """ All regular expressions used by the parser are precompiled and stored in the config. diff --git a/nameparser/config/suffixes.py b/nameparser/config/suffixes.py index 9765b92..7af82b8 100644 --- a/nameparser/config/suffixes.py +++ b/nameparser/config/suffixes.py @@ -6,6 +6,7 @@ 'esq', 'esquire', 'jr', + 'jr.', 'jnr', 'junior', 'sr', @@ -25,6 +26,7 @@ """ SUFFIX_ACRONYMS = set([ '(ret)', + '(ret.)', '(vet)', '8-vsb', 'aas', diff --git a/nameparser/parser.py b/nameparser/parser.py index e49fd8f..4b9abd9 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -254,7 +254,11 @@ def nickname(self): The person's nicknames. Any text found inside of quotes (``""``) or parenthesis (``()``) """ - return " ".join(self.nickname_list) or self.C.empty_attribute_default + if len(self.nickname_list) <= 1: + f_string = '{0}' + else: + f_string = '"{0}"' + return ", ".join([f_string.format(nn) for nn in self.nickname_list]) or self.C.empty_attribute_default @property def surnames_list(self): @@ -408,11 +412,14 @@ def pre_process(self): def post_process(self): """ This happens at the end of the :py:func:`parse_full_name` after - all other processing has taken place. Runs :py:func:`handle_firstnames` - and :py:func:`handle_capitalization`. + all other processing has taken place. Runs + :py:func:`handle_firstnames` + :py:func:`handle_capitalization` + :py:func:`check_suffixes_in_nicknames` #skipping this feature """ self.handle_firstnames() self.handle_capitalization() + #self.check_suffixes_in_nicknames() def fix_phd(self): _re = self.C.regexes.phd @@ -423,21 +430,49 @@ def fix_phd(self): def parse_nicknames(self): """ - The content of parenthesis or quotes in the name will be added to the + The content of defined nickname regex patterns in the name will be added to the nicknames list. This happens before any other processing of the name. - - Single quotes cannot span white space characters and must border - white space to allow for quotes in names like O'Connor and Kawai'ae'a. - Double quotes and parenthesis can span white space. - + + Some basic rules for nickname processing: + * Nicknames must begin with a word character. + * Nickname patterns should include an outer (not processed) + delimiter that excludes word characters. + Loops through :py:data:`~nameparser.config.regexes.REGEXES` with label/tag like "nickname" """ - + #ToDo: + # * create a list of matches + # * sort the list by span + # * check inter-match strings for commas + # * remove the commas if safe to remove + # safe = character(s) between matches are ONLY + # spaces and commas + # * iterate the matches, collecting the nicknames + # and removing the matches from self._full_name + nn_matches = [] + nn_sep = self.C.regexes.nn_sep_safe + _fn = self._full_name for _re in self._nickname_regexes: - if _re.search(self._full_name): - self.nickname_list += [x for x in _re.findall(self._full_name)] - self._full_name = _re.sub(' ', self._full_name) + if _re.search(_fn): + nn_matches.extend( _re.finditer(_fn) ) + #remove matches from string + for _match in _re.finditer(_fn): + _fn = (' ' * (_match.end() - _match.start())).join([_fn[:_match.start()], _fn[_match.end():]]) + + if len(nn_matches) == 0: + return #"empty matches" + + nn_matches.sort(key=lambda x: x.span()) + + #remove any inter-match commas, if safe to do so + for low, high in zip(nn_matches[0:-1], nn_matches[1:]): + if nn_sep.search(self._full_name[low.span()[1]:high.span()[0]]) is None: + self._full_name = ' '.join([self._full_name[:low.span()[1]], self._full_name[high.span()[0]:] ]) + + for nn_match in nn_matches: + self.nickname_list.append( nn_match.groups(0)[0] ) + self._full_name = nn_match.re.sub(' ', self._full_name, 1) def squash_emoji(self): """ @@ -459,6 +494,20 @@ def handle_firstnames(self): and not lc(self.title) in self.C.first_name_titles: self.last, self.first = self.first, self.last + def check_suffixes_in_nicknames(self): + """ + Iterate the nicknames, testing whether any of them are suffixes. + If there isn't (also) an identical suffix, then move that nickname + to the suffix_list + """ + for _nn in self.nickname_list: + if (_nn.lower() in self.C.suffix_acronyms or \ + _nn.lower() in self.C.suffix_not_acronyms) and \ + _nn not in self.suffix_list: + self.suffix_list.append(_nn) + self.nickname_list.remove(_nn) + + def parse_full_name(self): """ diff --git a/tests.py b/tests.py index 5f976b8..5c88c6a 100644 --- a/tests.py +++ b/tests.py @@ -27,6 +27,7 @@ from nameparser import HumanName from nameparser.util import u from nameparser.config import Constants +import re log = logging.getLogger('HumanName') @@ -1491,7 +1492,36 @@ def test_nickname_and_last_name_with_title(self): self.m(hn.last, "Edmonds", hn) self.m(hn.nickname, "Rick", hn) + def test_append_nickname(self): + hn = HumanName() + new_rgx = re.compile(r'(?!\w)\(_open(\w[^)]*?)\):close(?!\w)', re.UNICODE) + hn._nickname_regexes.append(new_rgx) + self.assertEqual(hn._nickname_regexes[-1], new_rgx) + hn.full_name = r"Benjamin (_openBen):close Franklin" + self.m(hn.first, "Benjamin", hn) + self.m(hn.middle, ":close", hn) + self.m(hn.last, "Franklin", hn) + self.m(hn.nickname, "_openBen", hn) + def test_prepend_nickname(self): + hn = HumanName() + new_rgx = re.compile(r'(?!\w)\(_open(\w[^)]*?)\):close(?!\w)', re.UNICODE) + hn._nickname_regexes.insert(0, new_rgx) + self.assertEqual(hn._nickname_regexes[0], new_rgx) + hn.full_name = r"Benjamin (_openBen):close Franklin" + self.m(hn.first, "Benjamin", hn) + self.m(hn.middle, "", hn) + self.m(hn.last, "Franklin", hn) + self.m(hn.nickname, "Ben", hn) + + def test_multiple_nicknames(self): + hn = HumanName('Chief Justice John (JR), "No Glove, No Love" Glover Roberts, Jr.') + self.m(hn.title, 'Chief Justice', hn) + self.m(hn.first, "John", hn) + self.m(hn.middle, "Glover", hn) + self.m(hn.last, "Roberts", hn) + self.m(hn.suffix, "Jr.", hn) + self.m(hn.nickname, '"JR", "No Glove, No Love"', hn) # class MaidenNameTestCase(HumanNameTestBase): # @@ -1766,6 +1796,21 @@ def test_suffix_with_periods_with_lastname_comma(self): self.m(hn.last, "Doe", hn) self.m(hn.suffix, "Msc.Ed.", hn) + @unittest.SkipTest + def test_suffix_in_nickname_dup(self): + hn = HumanName("John (JR) Roberts, JR") + self.m(hn.first, "John", hn) + self.m(hn.last, "Roberts", hn) + self.m(hn.suffix, "JR", hn) + self.m(hn.nickname, "JR", hn) + + @unittest.SkipTest + def test_suffix_in_nickname_solo(self): + hn = HumanName("John (JR) Roberts") + self.m(hn.first, "John", hn) + self.m(hn.last, "Roberts", hn) + self.m(hn.suffix, "JR", hn) + self.m(hn.nickname, "", hn) class TitleTestCase(HumanNameTestBase): From 129667cbfb893d9f2439ca4245ce0c075bf1501a Mon Sep 17 00:00:00 2001 From: aikimark Date: Thu, 25 Mar 2021 10:21:00 -0400 Subject: [PATCH 4/4] added suffix preprocessing * parse parenthesized suffixes (ret, vet) * converted unused suffix processing routine to preprocess the fullname ahead of nickname processing, since nickname patterns include a parenthesis-delimited pattern --- nameparser/config/regexes.py | 1 + nameparser/parser.py | 29 +++++++++++++++-------------- tests.py | 23 ++++++++--------------- 3 files changed, 24 insertions(+), 29 deletions(-) diff --git a/nameparser/config/regexes.py b/nameparser/config/regexes.py index ab2e8bf..9be2f1e 100644 --- a/nameparser/config/regexes.py +++ b/nameparser/config/regexes.py @@ -37,6 +37,7 @@ ("emoji",re_emoji), ("phd", re.compile(r'\s(ph\.?\s+d\.?)', re.I | re.U)), ("nn_sep_safe", re.compile(r'[^ ,]', re.U)), + ("paren_suffix", re.compile(r'(?!\w)(\((?:ret|vet)\.?\))(?!\w)', re.I | re.U)), ] """ All regular expressions used by the parser are precompiled and stored in the config. diff --git a/nameparser/parser.py b/nameparser/parser.py index 4b9abd9..eb319b3 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -402,10 +402,15 @@ def pre_process(self): This method happens at the beginning of the :py:func:`parse_full_name` before any other processing of the string aside from unicode normalization, so it's a good place to do any custom handling in a - subclass. Runs :py:func:`parse_nicknames` and :py:func:`squash_emoji`. + subclass. Runs + :py:func:`fix_phd` + :py:func:`parse_parenthesized_suffixes` + :py:func:`parse_nicknames` + :py:func:`squash_emoji`. """ self.fix_phd() + self.parse_parenthesized_suffixes() self.parse_nicknames() self.squash_emoji() @@ -415,11 +420,9 @@ def post_process(self): all other processing has taken place. Runs :py:func:`handle_firstnames` :py:func:`handle_capitalization` - :py:func:`check_suffixes_in_nicknames` #skipping this feature """ self.handle_firstnames() self.handle_capitalization() - #self.check_suffixes_in_nicknames() def fix_phd(self): _re = self.C.regexes.phd @@ -471,7 +474,7 @@ def parse_nicknames(self): self._full_name = ' '.join([self._full_name[:low.span()[1]], self._full_name[high.span()[0]:] ]) for nn_match in nn_matches: - self.nickname_list.append( nn_match.groups(0)[0] ) + self.nickname_list.append( nn_match.group(1) ) self._full_name = nn_match.re.sub(' ', self._full_name, 1) def squash_emoji(self): @@ -494,18 +497,16 @@ def handle_firstnames(self): and not lc(self.title) in self.C.first_name_titles: self.last, self.first = self.first, self.last - def check_suffixes_in_nicknames(self): + def parse_parenthesized_suffixes(self): """ - Iterate the nicknames, testing whether any of them are suffixes. - If there isn't (also) an identical suffix, then move that nickname - to the suffix_list + Extract any parenthesized suffixes: (ret. | ret | vet. | vet) """ - for _nn in self.nickname_list: - if (_nn.lower() in self.C.suffix_acronyms or \ - _nn.lower() in self.C.suffix_not_acronyms) and \ - _nn not in self.suffix_list: - self.suffix_list.append(_nn) - self.nickname_list.remove(_nn) + _re = self.C.regexes.paren_suffix + if _re.search(self._full_name): + for _match in _re.finditer(self._full_name): + self.suffix_list.append(_match.group(1)) + + self._full_name = _re.sub(' ', self._full_name) def parse_full_name(self): diff --git a/tests.py b/tests.py index 5c88c6a..b19a0cc 100644 --- a/tests.py +++ b/tests.py @@ -1796,21 +1796,14 @@ def test_suffix_with_periods_with_lastname_comma(self): self.m(hn.last, "Doe", hn) self.m(hn.suffix, "Msc.Ed.", hn) - @unittest.SkipTest - def test_suffix_in_nickname_dup(self): - hn = HumanName("John (JR) Roberts, JR") - self.m(hn.first, "John", hn) - self.m(hn.last, "Roberts", hn) - self.m(hn.suffix, "JR", hn) - self.m(hn.nickname, "JR", hn) - - @unittest.SkipTest - def test_suffix_in_nickname_solo(self): - hn = HumanName("John (JR) Roberts") - self.m(hn.first, "John", hn) - self.m(hn.last, "Roberts", hn) - self.m(hn.suffix, "JR", hn) - self.m(hn.nickname, "", hn) + def test_suffix_parenthesized_with_nickname(self): + hn = HumanName("Gen Dwight David (Ike) Eisenhower (ret.) KG") + self.m(hn.title, "Gen", hn) + self.m(hn.first, "Dwight", hn) + self.m(hn.middle, "David", hn) + self.m(hn.last, "Eisenhower", hn) + self.m(hn.suffix, "(ret.), KG", hn) + self.m(hn.nickname, "Ike", hn) class TitleTestCase(HumanNameTestBase):