diff --git a/nameparser/config/__init__.py b/nameparser/config/__init__.py index 7b2baef..ae4cbe8 100644 --- a/nameparser/config/__init__.py +++ b/nameparser/config/__init__.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """ The :py:mod:`nameparser.config` module manages the configuration of the -nameparser. +nameparser. A module-level instance of :py:class:`~nameparser.config.Constants` is created and used by default for all HumanName instances. You can adjust the entire module's @@ -25,11 +25,12 @@ >>> hn.parse_full_name() # need to run this again after config changes **Potential Gotcha**: If you do not pass ``None`` as the second argument, -``hn.C`` will be a reference to the module config, possibly yielding +``hn.C`` will be a reference to the module config, possibly yielding unexpected results. See `Customizing the Parser `_. """ from __future__ import unicode_literals import sys + try: # Python 3.3+ from collections.abc import Set @@ -46,6 +47,7 @@ from nameparser.config.titles import TITLES from nameparser.config.titles import FIRST_NAME_TITLES from nameparser.config.regexes import REGEXES +from nameparser.config.affixes import AFFIXES DEFAULT_ENCODING = 'UTF-8' @@ -57,7 +59,7 @@ class SetManager(Set): Only special functionality beyond that provided by set() is to normalize constants for comparison (lower case, no periods) - when they are add()ed and remove()d and allow passing multiple + when they are add()ed and remove()d and allow passing multiple string arguments to the :py:func:`add()` and :py:func:`remove()` methods. ''' @@ -125,7 +127,7 @@ def remove(self, *strings): class TupleManager(dict): ''' - A dictionary with dot.notation access. Subclass of ``dict``. Makes the tuple constants + A dictionary with dot.notation access. Subclass of ``dict``. Makes the tuple constants more friendly. ''' @@ -148,23 +150,25 @@ class Constants(object): """ An instance of this class hold all of the configuration constants for the parser. 
- :param set prefixes: + :param set prefixes: + :py:attr:`prefixes` wrapped with :py:class:`SetManager`. + :param set family_affixes: :py:attr:`~affixes.AFFIXES` wrapped with :py:class:`SetManager`. - :param set titles: + :param set titles: :py:attr:`titles` wrapped with :py:class:`SetManager`. - :param set first_name_titles: + :param set first_name_titles: :py:attr:`~titles.FIRST_NAME_TITLES` wrapped with :py:class:`SetManager`. - :param set suffix_acronyms: + :param set suffix_acronyms: :py:attr:`~suffixes.SUFFIX_ACRONYMS` wrapped with :py:class:`SetManager`. - :param set suffix_not_acronyms: + :param set suffix_not_acronyms: :py:attr:`~suffixes.SUFFIX_NOT_ACRONYMS` wrapped with :py:class:`SetManager`. - :param set conjunctions: + :param set conjunctions: :py:attr:`conjunctions` wrapped with :py:class:`SetManager`. :type capitalization_exceptions: tuple or dict - :param capitalization_exceptions: + :param capitalization_exceptions: :py:attr:`~capitalization.CAPITALIZATION_EXCEPTIONS` wrapped with :py:class:`TupleManager`. :type regexes: tuple or dict - :param regexes: + :param regexes: :py:attr:`regexes` wrapped with :py:class:`TupleManager`. """ @@ -187,9 +191,9 @@ class Constants(object): empty_attribute_default = '' """ Default return value for empty attributes. - + ..
doctest:: - + >>> from nameparser.config import CONSTANTS >>> CONSTANTS.empty_attribute_default = None >>> name = HumanName("John Doe") @@ -197,7 +201,7 @@ class Constants(object): None >>>name.first 'John' - + """ capitalize_name = False @@ -233,6 +237,7 @@ class Constants(object): def __init__(self, prefixes=PREFIXES, + family_affixes=AFFIXES, suffix_acronyms=SUFFIX_ACRONYMS, suffix_not_acronyms=SUFFIX_NOT_ACRONYMS, titles=TITLES, @@ -242,6 +247,7 @@ def __init__(self, regexes=REGEXES ): self.prefixes = SetManager(prefixes) + self.family_affixes = SetManager(family_affixes) self.suffix_acronyms = SetManager(suffix_acronyms) self.suffix_not_acronyms = SetManager(suffix_not_acronyms) self.titles = SetManager(titles) diff --git a/nameparser/config/affixes.py b/nameparser/config/affixes.py new file mode 100644 index 0000000..fead9e3 --- /dev/null +++ b/nameparser/config/affixes.py @@ -0,0 +1,123 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +# https://en.wikipedia.org/wiki/List_of_family_name_affixes + +AFFIXES = set([ + 'a', + 'ab', + 'af', + 'av', + 'ap', + 'abu', + 'ait', + 'aït', + 'alam', + 'at', + 'ath', + 'aust', + 'austre', + 'bar', + 'bat', + 'bath', + 'ben', + 'bin', + 'ibn', + 'bert', + 'bet', + 'bint', + 'da', + 'das', + 'de', + 'degli', + 'del', + 'dele', + 'della', + 'den', + 'der', + 'di', + 'dos', + 'du', + 'e', + 'el', + 'fetch', + 'vetch', + 'fitz', + 'i', + 'kil', + 'gil', + 'la', + 'le', + 'lille', + 'lu', + 'm\'', + 'mc', + 'mac', + 'mck', + 'mhic', + 'mic', + 'mala', + 'mellom', + 'myljom', + 'na', + 'ned', + 'nedre', + 'neder', + 'nic', + 'ni', + 'ní', + 'nin', + 'nord', + 'norr', + 'nord', + 'nordre', + 'ny', + 'o', + 'ua', + 'ua', + 'ui', + 'uí', + 'opp', + 'upp', + 'ofver', + 'ost', + 'oster', + 'over', + 'ovste', + 'ovre', + 'oz', + 'pour', + 'putra', + 'putera', + 'putri', + 'putera', + 'setia', + 'setya', + 'stor', + 'soder', + 'sor', + 'sonder', + 'syd', + 'sondre', + 'syndre', + 'sore', + 'ter', + '\'t', + 
@property
def family(self):
    """
    The person's family name(s), including any family-name affixes.

    Built from ``family_list``, whose entries are ``[affix, family]``
    pairs of word lists. Each pair is rendered as its affix words followed
    by its family words, and the rendered pairs are joined with single
    spaces.

    :return: the joined family name, or
        ``self.C.empty_attribute_default`` when there is nothing to join
        (the same fallback :py:attr:`last` uses).
    """
    rendered = []
    for affix, family in self.family_list:
        # Plain concatenation rather than ``[*affix, *family]``: starred
        # unpacking inside a list display is a syntax error before
        # Python 3.5.
        words = " ".join(list(affix) + list(family))
        if words:
            rendered.append(words)
    # Apply the fallback once, to the final result. Applying it per pair
    # glued pairs together with no separator and raised TypeError when the
    # configured default is None.
    return " ".join(rendered) or self.C.empty_attribute_default


def is_family_affix(self, piece):
    """
    Lowercase and no periods version of piece is in the
    :py:data:`~nameparser.config.affixes.AFFIXES` set.

    :param piece: a single name piece, or a list of pieces.
    :return: for a list, ``True`` if any item is a family affix and
        ``False`` otherwise; for a single piece, whether its normalized
        form is in ``self.C.family_affixes``.
    """
    if isinstance(piece, list):
        # any() always yields a bool, so a list with no affix gives False
        # instead of falling off the end and returning None.
        return any(self.is_family_affix(item) for item in piece)
    return lc(piece) in self.C.family_affixes
""" - + empty_re = re.compile("") - + re_quoted_word = self.C.regexes.quoted_word or empty_re re_double_quotes = self.C.regexes.double_quotes or empty_re re_parenthesis = self.C.regexes.parenthesis or empty_re @@ -563,6 +588,7 @@ def parse_full_name(self): self.first_list = [] self.middle_list = [] self.last_list = [] + self.family_list = [] self.suffix_list = [] self.nickname_list = [] self.unparsable = True @@ -699,6 +725,19 @@ def parse_full_name(self): except IndexError: pass + for last in self.last_list: + if " " in last: + affix = [] + family = [] + for part in last.split(" "): + if self.is_family_affix(part): + affix.append(part) + else: + family.append(part) + self.family_list.append([affix, family]) + else: + self.family_list.append([[], [last]]) + if len(self) < 0: log.info("Unparsable: \"%s\" ", self.original) else: @@ -968,6 +1007,7 @@ def capitalize(self, force=None): self.first_list = self.cap_piece(self.first, 'first').split(' ') self.middle_list = self.cap_piece(self.middle, 'middle').split(' ') self.last_list = self.cap_piece(self.last, 'last').split(' ') + # self.family_list = self.cap_piece(self.family, 'family').split(' ') self.suffix_list = self.cap_piece(self.suffix, 'suffix').split(', ') def handle_capitalization(self): diff --git a/tests.py b/tests.py index 91917a4..b639f7e 100644 --- a/tests.py +++ b/tests.py @@ -187,6 +187,20 @@ def test_prefix_names(self): self.m(hn.first, "vai", hn) self.m(hn.last, "la", hn) + def test_family_name_and_prefix(self): + hn = HumanName("Vincent van Gogh") + self.m(hn.family, "van Gogh", hn) + self.assertEqual(hn.family_list, [ + [["van"], ["Gogh"]] + ]) + + def test_family_name_and_double_prefix(self): + hn = HumanName("Vincent van der Gogh") + self.m(hn.family, "van der Gogh", hn) + self.assertEqual(hn.family_list, [ + [["van", "der"], ["Gogh"]], + ]) + def test_blank_name(self): hn = HumanName() self.m(hn.first, "", hn)