From 675f1a86164e1a6b3e8d488c24ca8a80aa862f9a Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Thu, 2 Jun 2016 11:51:11 -0700 Subject: [PATCH 001/163] remove `suffixes` from config, fix #49 --- nameparser/config/__init__.py | 5 +---- nameparser/config/suffixes.py | 4 ---- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/nameparser/config/__init__.py b/nameparser/config/__init__.py index 37bb338..7bddf90 100644 --- a/nameparser/config/__init__.py +++ b/nameparser/config/__init__.py @@ -37,7 +37,6 @@ from nameparser.config.prefixes import PREFIXES from nameparser.config.capitalization import CAPITALIZATION_EXCEPTIONS from nameparser.config.conjunctions import CONJUNCTIONS -from nameparser.config.suffixes import SUFFIXES from nameparser.config.suffixes import SUFFIX_ACRONYMS from nameparser.config.suffixes import SUFFIX_NOT_ACRONYMS from nameparser.config.titles import TITLES @@ -171,7 +170,6 @@ class Constants(object): def __init__(self, prefixes=PREFIXES, - suffixes=SUFFIXES, suffix_acronyms=SUFFIX_ACRONYMS, suffix_not_acronyms=SUFFIX_NOT_ACRONYMS, titles=TITLES, @@ -181,7 +179,6 @@ def __init__(self, regexes=REGEXES ): self.prefixes = SetManager(prefixes) - self.suffixes = SetManager(suffixes) self.suffix_acronyms = SetManager(suffix_acronyms) self.suffix_not_acronyms = SetManager(suffix_not_acronyms) self.titles = SetManager(titles) @@ -194,7 +191,7 @@ def __init__(self, @property def suffixes_prefixes_titles(self): if not self._pst: - self._pst = self.prefixes | self.suffixes | self.titles + self._pst = self.prefixes | self.suffix_acronyms | self.suffix_not_acronyms | self.titles return self._pst def __repr__(self): diff --git a/nameparser/config/suffixes.py b/nameparser/config/suffixes.py index b0f1751..b8c4f4b 100644 --- a/nameparser/config/suffixes.py +++ b/nameparser/config/suffixes.py @@ -122,7 +122,3 @@ when matching against these pieces. 
""" -SUFFIXES = SUFFIX_ACRONYMS | SUFFIX_NOT_ACRONYMS -""" -A union of the sets :py:attr:`SUFFIX_ACRONYMS` and :py:attr:`SUFFIX_NOT_ACRONYMS` -""" From e585122fef77852fcdb777f747584c5e066ae9c7 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Thu, 2 Jun 2016 11:51:31 -0700 Subject: [PATCH 002/163] add "du" to prefixes --- nameparser/config/prefixes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nameparser/config/prefixes.py b/nameparser/config/prefixes.py index d01d99f..64731e8 100644 --- a/nameparser/config/prefixes.py +++ b/nameparser/config/prefixes.py @@ -15,6 +15,7 @@ 'de', 'di', 'dí', + 'du', 'ibn', 'la', 'le', From 46fdebcff59b444b66ec17301a69fe5048cca60c Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Thu, 2 Jun 2016 11:52:04 -0700 Subject: [PATCH 003/163] Add "sheikh" variations to titles --- nameparser/config/titles.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/nameparser/config/titles.py b/nameparser/config/titles.py index f718a3b..0fb78e6 100644 --- a/nameparser/config/titles.py +++ b/nameparser/config/titles.py @@ -16,6 +16,14 @@ 'sir', 'sister', 'uncle', + 'sheikh', + 'sheik', + 'shaik', + 'shayk', + 'shaykh', + 'shaikh', + 'cheikh', + 'shekh', ]) """ When these titles appear with a single other name, that name is a first name, e.g. 
From 93bb7861747078b62f8a36bb393ed92a4abc2ed1 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Thu, 2 Jun 2016 11:52:33 -0700 Subject: [PATCH 004/163] update release log --- docs/release_log.rst | 4 ++++ nameparser/__init__.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/release_log.rst b/docs/release_log.rst index 9e092e7..c7579e3 100644 --- a/docs/release_log.rst +++ b/docs/release_log.rst @@ -1,5 +1,9 @@ Release Log =========== +* 0.4.0 - June 2, 2016 + - Remove "CONSTANTS.suffixes", replaced by "suffix_acronyms" and "suffix_not_acronyms" (#49) + - Add "du" to prefixes + - Add "sheikh" variations to titles * 0.3.16 - March 24, 2016 - Clarify LGPL licence version (#47) - Skip pickle tests if pickle not installed (#48) diff --git a/nameparser/__init__.py b/nameparser/__init__.py index 9e05702..80ae1e7 100644 --- a/nameparser/__init__.py +++ b/nameparser/__init__.py @@ -1,4 +1,4 @@ -VERSION = (0, 3, 16) +VERSION = (0, 4, 0) __version__ = '.'.join(map(str, VERSION)) __author__ = "Derek Gulbranson" __author_email__ = 'derek73@gmail.com' From ad3ef650ef31bbb75702c79abb1b4c5ea1a5269f Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Thu, 2 Jun 2016 12:16:05 -0700 Subject: [PATCH 005/163] fix support for multiple suffixes separated by commas, add tests e.g. "John Doe Msc.Ed." 
--- nameparser/parser.py | 2 +- tests.py | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/nameparser/parser.py b/nameparser/parser.py index dd86174..07f15bd 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -559,7 +559,7 @@ def parse_pieces(self, parts, additional_parts_count=0): self.C.titles.add(part) continue if len(list(suffixes)): - self.C.suffixes.add(part) + self.C.suffix_not_acronyms.add(part) continue return self.join_on_conjunctions(output, additional_parts_count) diff --git a/tests.py b/tests.py index 0ca9677..2b0c872 100644 --- a/tests.py +++ b/tests.py @@ -1517,6 +1517,13 @@ def test_king(self): self.m(hn.last, "King", hn) self.m(hn.suffix, "Jr", hn) + def test_suffix_with_periods(self): + hn = HumanName("John Doe Msc.Ed.") + self.m(hn.first,"John", hn) + self.m(hn.last,"Doe", hn) + self.m(hn.suffix,"Msc.Ed.", hn) + + class TitleTestCase(HumanNameTestBase): def test_last_name_is_also_title(self): @@ -1730,6 +1737,11 @@ def test_title_as_suffix(self): self.m(hn.first, "J.", hn) self.m(hn.last, "Smith", hn) + def test_title_with_periods(self): + hn = HumanName("Lt.Gov. 
John Doe") + self.m(hn.title,"Lt.Gov.", hn) + self.m(hn.first,"John", hn) + self.m(hn.last,"Doe", hn) class HumanNameCapitalizationTestCase(HumanNameTestBase): def test_capitalization_exception_for_III(self): From 9d9e83b698f5fd0f5b8e4f9b72ba60c0ec20d93c Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Thu, 2 Jun 2016 12:23:20 -0700 Subject: [PATCH 006/163] more tests --- tests.py | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/tests.py b/tests.py index 2b0c872..1825997 100644 --- a/tests.py +++ b/tests.py @@ -1523,6 +1523,18 @@ def test_suffix_with_periods(self): self.m(hn.last,"Doe", hn) self.m(hn.suffix,"Msc.Ed.", hn) + def test_suffix_with_periods_with_comma(self): + hn = HumanName("John Doe, Msc.Ed.") + self.m(hn.first,"John", hn) + self.m(hn.last,"Doe", hn) + self.m(hn.suffix,"Msc.Ed.", hn) + + def test_suffix_with_periods_with_lastname_comma(self): + hn = HumanName("Doe, John Msc.Ed.") + self.m(hn.first,"John", hn) + self.m(hn.last,"Doe", hn) + self.m(hn.suffix,"Msc.Ed.", hn) + class TitleTestCase(HumanNameTestBase): @@ -1726,23 +1738,18 @@ def test_last_name_also_prefix(self): self.m(hn.first, "Jane", hn) self.m(hn.last, "Doctor", hn) - @unittest.expectedFailure - def test_title_as_suffix(self): - """ - Semantically, PhD is a title, not a suffix. - http://code.google.com/p/python-nameparser/issues/detail?id=7 - """ - hn = HumanName("J. Smith, PhD") - self.m(hn.title, "PhD", hn) - self.m(hn.first, "J.", hn) - self.m(hn.last, "Smith", hn) - def test_title_with_periods(self): hn = HumanName("Lt.Gov. John Doe") self.m(hn.title,"Lt.Gov.", hn) self.m(hn.first,"John", hn) self.m(hn.last,"Doe", hn) + def test_title_with_periods_lastname_comma(self): + hn = HumanName("Doe, Lt.Gov. John") + self.m(hn.title,"Lt.Gov.", hn) + self.m(hn.first,"John", hn) + self.m(hn.last,"Doe", hn) + class HumanNameCapitalizationTestCase(HumanNameTestBase): def test_capitalization_exception_for_III(self): hn = HumanName('juan q. 
xavier velasquez y garcia iii') From bf1e0a55c456722ada3109e6ccfefce454ab43a6 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Thu, 2 Jun 2016 12:39:48 -0700 Subject: [PATCH 007/163] Add parameter to force capitalization of mixed case strings --- docs/release_log.rst | 1 + nameparser/parser.py | 18 ++++++++++++------ tests.py | 5 +++++ 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/docs/release_log.rst b/docs/release_log.rst index c7579e3..d3cecb4 100644 --- a/docs/release_log.rst +++ b/docs/release_log.rst @@ -4,6 +4,7 @@ Release Log - Remove "CONSTANTS.suffixes", replaced by "suffix_acronyms" and "suffix_not_acronyms" (#49) - Add "du" to prefixes - Add "sheikh" variations to titles + - Add parameter to force capitalization of mixed case strings * 0.3.16 - March 24, 2016 - Clarify LGPL licence version (#47) - Skip pickle tests if pickle not installed (#48) diff --git a/nameparser/parser.py b/nameparser/parser.py index 07f15bd..215ae56 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -544,8 +544,8 @@ def parse_pieces(self, parts, additional_parts_count=0): raise TypeError("Name parts must be strings. Got {0}".format(type(part))) output += [x.strip(' ,') for x in part.split(' ')] - # If there's periods, check if it's titles without spaces and add spaces - # so they get picked up later as titles. + # If part contains periods, check if it's multiple titles or suffixes together without spaces + # if so, add the new part with periods to the constants so they get parsed correctly later for part in output: # if this part has a period not at the beginning or end if self.C.regexes.period_not_at_end.match(part): @@ -684,12 +684,15 @@ def cap_piece(self, piece): replacement = lambda m: self.cap_word(m.group(0)) return self.C.regexes.word.sub(replacement, piece) - def capitalize(self): + def capitalize(self, force=False): """ The HumanName class can try to guess the correct capitalization - of name entered in all upper or lower case. 
It will not adjust - the case of names entered in mixed case. + of name entered in all upper or lower case. By default, it will not adjust + the case of names entered in mixed case. To run capitalization on all names + pass the parameter `force=True`. + :param bool force: force capitalization of strings that include mixed case + **Usage** .. doctest:: capitalize @@ -703,10 +706,13 @@ def capitalize(self): >>> name.capitalize() >>> str(name) 'Shirley Maclaine' + >>> name.capitalize(force=True) + >>> str(name) + 'Shirley MacLaine' """ name = u(self) - if not (name == name.upper() or name == name.lower()): + if not force and not (name == name.upper() or name == name.lower()): return self.title_list = self.cap_piece(self.title ).split(' ') self.first_list = self.cap_piece(self.first ).split(' ') diff --git a/tests.py b/tests.py index 1825997..579bf13 100644 --- a/tests.py +++ b/tests.py @@ -1791,6 +1791,11 @@ def test_no_change_to_mixed_chase(self): hn.capitalize() self.m(str(hn), 'Shirley Maclaine', hn) + def test_force_capitalization(self): + hn = HumanName('Shirley Maclaine') + hn.capitalize(force=True) + self.m(str(hn), 'Shirley MacLaine', hn) + def test_capitalize_diacritics(self): hn = HumanName('matthëus schmidt') hn.capitalize() From 62779b314a865bdcdf825549d61b213b5cf41f15 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Thu, 2 Jun 2016 12:47:47 -0700 Subject: [PATCH 008/163] adjust repr() to not show 'None' in quotes when CONSTANTS.empty_attribute_default = None. It looks like an error because None has quotes around it. 
--- nameparser/parser.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/nameparser/parser.py b/nameparser/parser.py index 215ae56..fc2c173 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -134,12 +134,12 @@ def __repr__(self): else: _string = "<%(class)s : [\n\ttitle: '%(title)s' \n\tfirst: '%(first)s' \n\tmiddle: '%(middle)s' \n\tlast: '%(last)s' \n\tsuffix: '%(suffix)s'\n\tnickname: '%(nickname)s'\n]>" % { 'class': self.__class__.__name__, - 'title': self.title, - 'first': self.first, - 'middle': self.middle, - 'last': self.last, - 'suffix': self.suffix, - 'nickname': self.nickname, + 'title': self.title or '', + 'first': self.first or '', + 'middle': self.middle or '', + 'last': self.last or '', + 'suffix': self.suffix or '', + 'nickname': self.nickname or '', } if sys.version >= '3': return _string From 50370b468bbb46a6a6a4330130df40620171dc00 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Thu, 2 Jun 2016 13:09:04 -0700 Subject: [PATCH 009/163] update readme --- README.rst | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 6422810..da2265c 100644 --- a/README.rst +++ b/README.rst @@ -16,7 +16,11 @@ individual components. * hn.suffix * hn.nickname -Supports 3 different comma placement variations in the input string. +Supported Name Structures +~~~~~~~~~~~~~~~~~~~~~~~~~ + +The supported name structure is generally "Title First Middle Last Suffix", where all pieces +are optional. Comma-separated format like "Last, First" is also supported. 1. Title Firstname "Nickname" Middle Middle Lastname Suffix 2. Lastname [Suffix], Title Firstname (Nickname) Middle Middle[,] Suffix [, Suffix] @@ -33,7 +37,7 @@ of names that are all upper- or lowercase names. It attempts the best guess that can be made with a simple, rule-based approach. Its main use case is English and it is not likely to be useful for languages -that do not share the same structure as English names. 
It's not perfect, but it +that do not conform to the supported name structure. It's not perfect, but it gets you pretty far. Installation @@ -73,6 +77,8 @@ Quick Start Example 'de la Vega' >>> name.as_dict() {'last': 'de la Vega', 'suffix': 'III', 'title': 'Dr.', 'middle': 'Q. Xavier', 'nickname': 'Doc Vega', 'first': 'Juan'} + >>> str(name) + 'Dr. Juan Q. Xavier de la Vega III (Doc Vega)' >>> name.string_format = "{first} {last}" >>> str(name) 'Juan de la Vega' From 4c6cd93ea56f3c8c6a9d99552817a84bdace26c8 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Thu, 2 Jun 2016 13:32:06 -0700 Subject: [PATCH 010/163] update capitalization docs --- docs/usage.rst | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/docs/usage.rst b/docs/usage.rst index af33ae4..9711331 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -63,21 +63,15 @@ The examples use Python 3, but Python 2.6+ is supported. ['Dr.', 'Juan', 'Q. Xavier', 'de la Vega', 'III'] >>> name[1:-2] ['Juan', 'Q. Xavier', 'de la Vega'] - >>> name = HumanName('bob v. de la macdole-eisenhower phd') - >>> name.capitalize() - >>> str(name) - 'Bob V. de la MacDole-Eisenhower Ph.D.' - >>> # Don't touch mixed case names - >>> name = HumanName('Shirley Maclaine') - >>> name.capitalize() - >>> str(name) - 'Shirley Maclaine' + Capitalization Support ---------------------- The HumanName class can try to guess the correct capitalization of name -entered in all upper or lower case. +entered in all upper or lower case. By default, it will not adjust +the case of names entered in mixed case. To run capitalization on all names +pass the parameter `force=True`. Capitalize the name. @@ -90,8 +84,13 @@ entered in all upper or lower case. >>> name.capitalize() >>> str(name) 'Bob V. de la MacDole-Eisenhower Ph.D.' - -It will not adjust the case of mixed case names. 
+ >>> name = HumanName('Shirley Maclaine') # Don't change mixed case names + >>> name.capitalize() + >>> str(name) + 'Shirley Maclaine' + >>> name.capitalize(force=True) + >>> str(name) + 'Shirley MacLaine' Nickname Handling From f2842bd377eff4f6bc9bf8d2c0e63e90c1c8a4a4 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Thu, 2 Jun 2016 14:06:16 -0700 Subject: [PATCH 011/163] fix missing headings on docs --- docs/customize.rst | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/docs/customize.rst b/docs/customize.rst index 7c92aca..845b81f 100644 --- a/docs/customize.rst +++ b/docs/customize.rst @@ -63,6 +63,9 @@ Other editable attributes Parser Customization Examples ----------------------------- +Removing a Title +~~~~~~~~~~~~~~~~ + Take a look at the :py:mod:`nameparser.config` documentation to see what's in the constants. Here's a quick walk through of some examples where you might want to adjust them. @@ -104,6 +107,8 @@ constant so that "Hon" can be parsed as a first name. nickname: '' ]> +Adding a Title +~~~~~~~~~~~~~~~~ "Dean" is a common first name so it is not included in the default titles constant. But in some contexts it is more common as a title. If you would @@ -134,8 +139,8 @@ making them lower case and removing periods. ]> -Parser Customizations Are Module-Wide -+++++++++++++++++++++++++++++++++++++ +Module-level Shared Configuration Instance +------------------------------------------ When you modify the configuration, by default this will modify the behavior all HumanName instances. This could be a handy way to set it up for your entire @@ -191,7 +196,7 @@ reference to the module-level config values with the behavior described above. Config Changes May Need Parse Refresh -+++++++++++++++++++++++++++++++++++++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The full name is parsed upon assignment to the ``full_name`` attribute or instantiation. 
Sometimes after making changes to configuration or other inner @@ -237,7 +242,7 @@ those changes with ``repr()``. Adjusting names after parsing them -=================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Each attribute has a corresponding ordered list of name pieces. If you're doing pre- or post-processing you may wish to manipulate these lists directly. From 45972f6fb845280e2633536b3cdfcf71c484e729 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Thu, 2 Jun 2016 16:45:45 -0700 Subject: [PATCH 012/163] update docs --- docs/customize.rst | 64 ++++++++++------------------------------------ 1 file changed, 14 insertions(+), 50 deletions(-) diff --git a/docs/customize.rst b/docs/customize.rst index 845b81f..cd53e8e 100644 --- a/docs/customize.rst +++ b/docs/customize.rst @@ -39,24 +39,24 @@ instantiate the :py:class:`~nameparser.parser.HumanName` class (see below). Editable attributes of nameparser.config.CONSTANTS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -* :py:attr:`~nameparser.config.Constants.titles` - Pieces that come before the name. Cannot include things that may be first names -* :py:attr:`~nameparser.config.Constants.first_name_titles` - Titles that, when followed by a single name, that name is a first name, e.g. "King David" -* :py:attr:`~nameparser.config.Constants.suffix_acronyms` - Pieces that come at the end of the name that may or may not have periods separating the letters, e.g. "m.d." -* :py:attr:`~nameparser.config.Constants.suffix_not_acronyms` - Pieces that come at the end of the name that never have periods separating the letters, e.g. "Jr." -* :py:attr:`~nameparser.config.Constants.conjunctions` - Connectors like "and" that join the preceeding piece to the following piece. 
-* :py:attr:`~nameparser.config.Constants.prefixes` - Connectors like "del" and "bin" that join to the following piece but not the preceeding -* :py:attr:`~nameparser.config.Constants.capitalization_exceptions` - Dictionary of pieces that do not capitalize the first letter, e.g. "Ph.D" -* :py:attr:`~nameparser.config.Constants.regexes` - Regular expressions used to find words, initials, nicknames, etc. - -Each set of constants comes with `add()` and `remove()` methods for tuning +* :py:obj:`~nameparser.config.Constants.titles` - Pieces that come before the name. Cannot include things that may be first names +* :py:obj:`~nameparser.config.Constants.first_name_titles` - Titles that, when followed by a single name, that name is a first name, e.g. "King David" +* :py:obj:`~nameparser.config.Constants.suffix_acronyms` - Pieces that come at the end of the name that may or may not have periods separating the letters, e.g. "m.d." +* :py:obj:`~nameparser.config.Constants.suffix_not_acronyms` - Pieces that come at the end of the name that never have periods separating the letters, e.g. "Jr." +* :py:obj:`~nameparser.config.Constants.conjunctions` - Connectors like "and" that join the preceeding piece to the following piece. +* :py:obj:`~nameparser.config.Constants.prefixes` - Connectors like "del" and "bin" that join to the following piece but not the preceeding +* :py:obj:`~nameparser.config.Constants.capitalization_exceptions` - Dictionary of pieces that do not capitalize the first letter, e.g. "Ph.D" +* :py:obj:`~nameparser.config.Constants.regexes` - Regular expressions used to find words, initials, nicknames, etc. + +Each set of constants comes with :py:func:`~nameparser.config.SetManager.add` and :py:func:`~nameparser.config.SetManager.remove` methods for tuning the constants for your project. These methods automatically lower case and remove punctuation to normalize them for comparison. 
Other editable attributes ~~~~~~~~~~~~~~~~~~~~~~~~~~ -* :py:attr:`~nameparser.config.Constants.string_format` -* :py:attr:`~nameparser.config.Constants.empty_attribute_default` +* :py:obj:`~nameparser.config.Constants.string_format` - controls output from `str()` +* :py:obj:`~nameparser.config.Constants.empty_attribute_default` - value returned by empty attributes, defaults to empty string @@ -204,42 +204,6 @@ data after assigning the full name, the name will need to be re-parsed with the :py:func:`~nameparser.parser.HumanName.parse_full_name()` method before you see those changes with ``repr()``. -:: - - >>> from nameparser import HumanName - >>> from nameparser.config import CONSTANTS - >>> hn = HumanName("Dean Robert Johns") - >>> hn - - >>> CONSTANTS.titles.add('dean') - SetManager({'right', ..., 'tax'}) - >>> hn - - >>> hn.parse_full_name() - >>> hn - - Adjusting names after parsing them ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -260,10 +224,10 @@ The strings returned by the attribute names just join these lists with spaces. >>> hn = HumanName("Juan Q. 
Xavier Velasquez y Garcia, Jr.") >>> hn.middle_list - [u'Q.', u'Xavier'] + ['Q.', 'Xavier'] >>> hn.middle_list += ["Ricardo"] >>> hn.middle_list - [u'Q.', u'Xavier', 'Ricardo'] + ['Q.', 'Xavier', 'Ricardo'] You can also replace any name bucket's contents by assigning a string or a list From 886318aa9f389b68cd45909aaa89915edd1a62b4 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Thu, 2 Jun 2016 16:46:47 -0700 Subject: [PATCH 013/163] make has_own_config a property method --- nameparser/parser.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/nameparser/parser.py b/nameparser/parser.py index fc2c173..1bcbd16 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -60,12 +60,9 @@ class HumanName(object): def __init__(self, full_name="", constants=CONSTANTS, encoding=ENCODING, string_format=None): - global CONSTANTS self.C = constants - if not self.C: + if type(self.C) is not type(CONSTANTS): self.C = Constants() - if self.C is not CONSTANTS: - self.has_own_config = True self.ENCODING = encoding self.string_format = string_format or self.C.string_format @@ -170,7 +167,11 @@ def as_dict(self, include_empty=True): if val: d[m] = val return d - + + @property + def has_own_config(self): + return self.C is not CONSTANTS + ### attributes @property From 51569b04d701740f601218eae66dc9e5140bfdda Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 25 Jul 2016 17:48:34 -0700 Subject: [PATCH 014/163] remove "bishop" from titles, fix #51 --- docs/release_log.rst | 2 ++ nameparser/config/titles.py | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/release_log.rst b/docs/release_log.rst index d3cecb4..d69de9a 100644 --- a/docs/release_log.rst +++ b/docs/release_log.rst @@ -1,5 +1,7 @@ Release Log =========== +* 0.4.1 - July 25, 2016 + - Remove "bishop" from titles because it also could be a first name * 0.4.0 - June 2, 2016 - Remove "CONSTANTS.suffixes", replaced by "suffix_acronyms" and "suffix_not_acronyms" 
(#49) - Add "du" to prefixes diff --git a/nameparser/config/titles.py b/nameparser/config/titles.py index 0fb78e6..28c6d01 100644 --- a/nameparser/config/titles.py +++ b/nameparser/config/titles.py @@ -93,7 +93,6 @@ 'bench', 'bg', 'bgen', - 'bishop', 'blessed', 'bodhisattva', 'brigadier', From 95975f2513ff1ce0f1bfbe99b1e7ef22086c55e4 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 25 Jul 2016 17:59:15 -0700 Subject: [PATCH 015/163] add note for removing all titles, fix #52 --- docs/customize.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/customize.rst b/docs/customize.rst index cd53e8e..4e8ba48 100644 --- a/docs/customize.rst +++ b/docs/customize.rst @@ -107,6 +107,12 @@ constant so that "Hon" can be parsed as a first name. nickname: '' ]> + +If you don't want to detect any titles at all, you can remove all of them: + + >>> CONSTANTS.titles.remove(*CONSTANTS.titles) + + Adding a Title ~~~~~~~~~~~~~~~~ From a017a60d9744fa901cb1c0d5627cfaccf7d3a8ca Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 25 Jul 2016 18:21:36 -0700 Subject: [PATCH 016/163] fix handling of prefixes with periods in them, fix #50 --- docs/release_log.rst | 1 + nameparser/parser.py | 4 ++-- tests.py | 5 +++++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/docs/release_log.rst b/docs/release_log.rst index d69de9a..1bb754e 100644 --- a/docs/release_log.rst +++ b/docs/release_log.rst @@ -2,6 +2,7 @@ Release Log =========== * 0.4.1 - July 25, 2016 - Remove "bishop" from titles because it also could be a first name + - Fix handling of lastname prefixes with periods, e.g. "Jane St. 
John" (#50) * 0.4.0 - June 2, 2016 - Remove "CONSTANTS.suffixes", replaced by "suffix_acronyms" and "suffix_not_acronyms" (#49) - Add "du" to prefixes diff --git a/nameparser/parser.py b/nameparser/parser.py index 1bcbd16..44e2287 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -274,8 +274,8 @@ def is_conjunction(self, piece): return piece.lower() in self.C.conjunctions and not self.is_an_initial(piece) def is_prefix(self, piece): - """Is in the prefixes set and not :py:func:`is_an_initial()`.""" - return piece.lower() in self.C.prefixes and not self.is_an_initial(piece) + """Lowercase and no periods version of piece is in the `~nameparser.config.titles.PREFIXES` set.""" + return lc(piece) in self.C.prefixes def is_roman_numeral(self, value): """ diff --git a/tests.py b/tests.py index 579bf13..b442fb1 100644 --- a/tests.py +++ b/tests.py @@ -1378,6 +1378,11 @@ def test_prefix(self): self.m(hn.first, "Juan", hn) self.m(hn.last, "del Sur", hn) + def test_prefix_with_period(self): + hn = HumanName("Jill St. John") + self.m(hn.first, "Jill", hn) + self.m(hn.last, "St. John", hn) + def test_prefix_before_two_part_last_name(self): hn = HumanName("pennie von bergen wessels") self.m(hn.first, "pennie", hn) From 51df0253cfc68be49f0e37b5885eb142836883a0 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 25 Jul 2016 18:41:16 -0700 Subject: [PATCH 017/163] v0.4.1, add publishing instructions --- CONTRIBUTING.md | 18 +++++------------- docs/contributing.rst | 9 --------- nameparser/__init__.py | 2 +- 3 files changed, 6 insertions(+), 23 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b4a0208..2e1b8ae 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -64,19 +64,11 @@ comma variations of these names automatically and make sure things don't blow up, so it can be a helpful regression indicator. 
-Provide Example Data ----------------------- +New Releases +------------ -We humans are the learning machine behind this code, and we can't do -it without real world data. If it doesn't work, start a new issue -because we probably don't know. +[https://hynek.me/articles/sharing-your-labor-of-love-pypi-quick-and-dirty/](Publishing to Pypi Guide) -If you have a dataset that has lots of issues, add the data to a -[gist](https://gist.github.com) and [create a new -issue](https://github.com/derek73/python-nameparser/issues) so we can -try to get it working as expected. - -Feel free to update this documentation to address any questions that I -missed. GitHub makes it pretty easy to edit it right on the web site -now. + $ python setup.py sdist bdist_wheel + $ twine upload dist/* diff --git a/docs/contributing.rst b/docs/contributing.rst index 90f53d4..2a3f6c2 100644 --- a/docs/contributing.rst +++ b/docs/contributing.rst @@ -9,12 +9,3 @@ Find more information about running tests and contributing the project at the pr https://github.com/derek73/python-nameparser/blob/master/CONTRIBUTING.md -Providing Example Data ----------------------- - -We humans are the learning machine behind this code, and we can't do it without real world data. If it doesn't work, start a new issue because we probably don't know. - -If you have a dataset that has lots of issues, add the data to a [gist](https://gist.github.com) and [create a new issue](https://github.com/derek73/python-nameparser/issues) so we can try to get it working as expected. - -Feel free to update this documentation to address any questions that I missed. GitHub makes it pretty easy to edit it right on the web site now. 
- diff --git a/nameparser/__init__.py b/nameparser/__init__.py index 80ae1e7..29467bc 100644 --- a/nameparser/__init__.py +++ b/nameparser/__init__.py @@ -1,4 +1,4 @@ -VERSION = (0, 4, 0) +VERSION = (0, 4, 1) __version__ = '.'.join(map(str, VERSION)) __author__ = "Derek Gulbranson" __author_email__ = 'derek73@gmail.com' From 5f3da6c8322cbfb9b8da1693ebd14688c29592a7 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Thu, 4 Aug 2016 21:13:11 -0700 Subject: [PATCH 018/163] refactor join_on_conjunctions(), works on python 2.6 --- nameparser/parser.py | 139 ++++++++++++++++++++++++++----------------- tests.py | 10 +++- 2 files changed, 92 insertions(+), 57 deletions(-) diff --git a/nameparser/parser.py b/nameparser/parser.py index 44e2287..2ea605d 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -2,6 +2,9 @@ from __future__ import unicode_literals import sys +from operator import itemgetter +from itertools import groupby + from nameparser.util import u from nameparser.util import text_types, binary_type from nameparser.util import lc @@ -9,9 +12,19 @@ from nameparser.config import CONSTANTS from nameparser.config import Constants - ENCODING = 'utf-8' +def group_contiguous_integers(data): + """ + return list of tuples containing first and last index + position of contiguous numbers in a series + """ + ranges = [] + for key, group in groupby(enumerate(data), lambda (index, item): index - item): + group = map(itemgetter(1), group) + if len(group) > 1: + ranges.append((group[0], group[-1])) + return ranges class HumanName(object): """ @@ -568,7 +581,15 @@ def parse_pieces(self, parts, additional_parts_count=0): def join_on_conjunctions(self, pieces, additional_parts_count=0): """ Join conjunctions to surrounding pieces, e.g.: - ['Mr. and Mrs.'], ['King of the Hill'], ['Jack and Jill'], ['Velasquez y Garcia'] + + ['Mr.', 'and'. 'Mrs.', 'John', 'Doe'] + v + ['Mr. 
and Mrs.', 'John', 'Doe'] + + + ['The', 'Secretary', 'of', 'State', 'Hillary', 'Clinton'] + v + ['The Secretary of State', 'Hillary', 'Clinton'] :param list pieces: name pieces strings after split on spaces :param int additional_parts_count: @@ -580,70 +601,76 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): # don't join on conjuctions if there's only 2 parts if length < 3: return pieces - - for conj in filter(self.is_conjunction, pieces[::-1]): # reverse sorted list - - # loop through the pieces backwards, starting at the end of the list. - # Join conjunctions to the pieces on either side of them. - - rootname_pieces = [p for p in pieces if self.is_rootname(p)] - total_length= len(rootname_pieces) + additional_parts_count - if len(conj) == 1 and total_length < 4: - # if there are only 3 total parts (minus known titles, suffixes and prefixes) - # and this conjunction is a single letter, prefer treating it as an initial - # rather than a conjunction. - # http://code.google.com/p/python-nameparser/issues/detail?id=11 - continue + rootname_pieces = [p for p in pieces if self.is_rootname(p)] + total_length= len(rootname_pieces) + additional_parts_count + + # find all the conjunctions, join any conjunctions that are next to each other, then join those newly joined conjunctions and any single conjunctions to the piece before and after it + conj_index = [i for i, piece in enumerate(pieces) if self.is_conjunction(piece)] + + contiguous_conj_i = [] + for i, val in enumerate(conj_index): try: - i = pieces.index((conj)) - except ValueError: - log.error("Couldn't find '{conj}' in pieces. 
i={i}, pieces={pieces}".format(**locals())) + if conj_index[i+1] == val+1: + contiguous_conj_i += [val] + except IndexError: + pass + + contiguous_conj_i = group_contiguous_integers(conj_index) + + delete_i = [] + for i in contiguous_conj_i: + if type(i) == tuple: + new_piece = " ".join(pieces[ i[0] : i[1]+1] ) + delete_i += list(xrange( i[0]+1, i[1]+1 )) + pieces[i[0]] = new_piece + else: + new_piece = " ".join(pieces[ i : i+2 ]) + delete_i += [i+1] + pieces[i] = new_piece + #add newly joined conjunctions to constants to be found later + self.C.conjunctions.add(new_piece) + + for i in reversed(delete_i): + # delete pieces in reverse order or the index changes on each delete + del pieces[i] + + # refresh conjunction index locations + conj_index = [i for i, piece in enumerate(pieces) if self.is_conjunction(piece)] + + for i in conj_index: + if len(pieces[i]) == 1 and total_length < 4: + # if there are only 3 total parts (minus known titles, suffixes + # and prefixes) and this conjunction is a single letter, prefer + # treating it as an initial rather than a conjunction. + # http://code.google.com/p/python-nameparser/issues/detail?id=11 continue - if i < len(pieces) - 1: - # if this is not the last piece - - if i is 0: - # if this is the first piece and it's a conjunction - nxt = pieces[i+1] - const = self.C.conjunctions - if self.is_title(nxt): - const = self.C.titles - new_piece = ' '.join(pieces[0:2]) - const.add(new_piece) - pieces[i] = new_piece - pieces.pop(i+1) - continue - - if self.is_conjunction(pieces[i-1]): - - # if the piece in front of this one is a conjunction too, - # add new_piece (this conjuction and the following piece) - # to the conjuctions constant so that it is recognized - # as a conjunction in the next loop. - # e.g. for ["Lord","of","the Universe"], put "the Universe" - # into the conjunctions constant. 
- - new_piece = ' '.join(pieces[i:i+2]) - self.C.conjunctions.add(new_piece) - pieces[i] = new_piece - pieces.pop(i+1) - continue + if i is 0: + new_piece = " ".join(pieces[i:i+2]) + if self.is_title(pieces[i+1]): + # when joining to a title, make new_piece a title too + self.C.titles.add(new_piece) + pieces[i] = new_piece + pieces.pop(i+1) + # subtract 1 from the index of all the remaining conjunctions + for j,val in enumerate(conj_index): + if val > i: + conj_index[j]=val-1 - new_piece = ' '.join(pieces[i-1:i+2]) + else: + new_piece = " ".join(pieces[i-1:i+2]) if self.is_title(pieces[i-1]): - - # if the second name is a title, assume the first one is too and add the - # two titles with the conjunction between them to the titles constant - # so the combo we just created gets parsed as a title. - # e.g. "Mr. and Mrs." becomes a title. - + # when joining to a title, make new_piece a title too self.C.titles.add(new_piece) - pieces[i-1] = new_piece pieces.pop(i) pieces.pop(i) + # subtract 2 from the index of all the remaining conjunctions + for j,val in enumerate(conj_index): + if val > i: + conj_index[j]=val-2 + # join prefixes to following lastnames: ['de la Vega'], ['van Buren'] prefixes = list(filter(self.is_prefix, pieces)) diff --git a/tests.py b/tests.py index b442fb1..6572ddc 100644 --- a/tests.py +++ b/tests.py @@ -1061,11 +1061,19 @@ def test119(self): class HumanNameConjunctionTestCase(HumanNameTestBase): # Last name with conjunction - def test117(self): + def test_last_name_with_conjunction(self): hn = HumanName('Jose Aznar y Lopez') self.m(hn.first, "Jose", hn) self.m(hn.last, "Aznar y Lopez", hn) + def test_multiple_conjunctions(self): + hn = HumanName("part1 of The part2 of the part3 and part4") + self.m(hn.first, "part1 of The part2 of the part3 and part4", hn) + + def test_multiple_conjunctions2(self): + hn = HumanName("part1 of and The part2 of the part3 And part4") + self.m(hn.first, "part1 of and The part2 of the part3 And part4", hn) + # Potential 
conjunction/prefix treated as initial (because uppercase) def test_uppercase_middle_initial_conflict_with_conjunction(self): hn = HumanName('John E Smith') From 67a07d566bb7b8d121c3feafd87973f51b56129b Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Thu, 4 Aug 2016 21:31:17 -0700 Subject: [PATCH 019/163] fix lambda to work on python 3.5 --- nameparser/parser.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/nameparser/parser.py b/nameparser/parser.py index 2ea605d..257660f 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -20,8 +20,8 @@ def group_contiguous_integers(data): position of contiguous numbers in a series """ ranges = [] - for key, group in groupby(enumerate(data), lambda (index, item): index - item): - group = map(itemgetter(1), group) + for key, group in groupby(enumerate(data), lambda i: i[0] - i[1]): + group = list(map(itemgetter(1), group)) if len(group) > 1: ranges.append((group[0], group[-1])) return ranges @@ -565,8 +565,8 @@ def parse_pieces(self, parts, additional_parts_count=0): if self.C.regexes.period_not_at_end.match(part): # split on periods, any of the split pieces titles or suffixes? 
("Lt.Gov.") period_chunks = part.split(".") - titles = filter(self.is_title, period_chunks) - suffixes = filter(self.is_suffix, period_chunks) + titles = list(filter(self.is_title, period_chunks)) + suffixes = list(filter(self.is_suffix, period_chunks)) # add the part to the constant so it will be found if len(list(titles)): @@ -622,7 +622,7 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): for i in contiguous_conj_i: if type(i) == tuple: new_piece = " ".join(pieces[ i[0] : i[1]+1] ) - delete_i += list(xrange( i[0]+1, i[1]+1 )) + delete_i += list(range( i[0]+1, i[1]+1 )) pieces[i[0]] = new_piece else: new_piece = " ".join(pieces[ i : i+2 ]) From f21e7beab58f55836057d19b8cdce24d9a33a301 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Thu, 4 Aug 2016 21:43:13 -0700 Subject: [PATCH 020/163] v0.5.0, update docs --- docs/release_log.rst | 2 ++ nameparser/__init__.py | 2 +- nameparser/parser.py | 18 ++++++++++-------- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/docs/release_log.rst b/docs/release_log.rst index 1bb754e..3f5c949 100644 --- a/docs/release_log.rst +++ b/docs/release_log.rst @@ -1,5 +1,7 @@ Release Log =========== +* 0.5.0 - August 4, 2016 + - Refactor join_on_conjunctions(), fix #53 * 0.4.1 - July 25, 2016 - Remove "bishop" from titles because it also could be a first name - Fix handling of lastname prefixes with periods, e.g. "Jane St. 
John" (#50) diff --git a/nameparser/__init__.py b/nameparser/__init__.py index 29467bc..33706c5 100644 --- a/nameparser/__init__.py +++ b/nameparser/__init__.py @@ -1,4 +1,4 @@ -VERSION = (0, 4, 1) +VERSION = (0, 5, 0) __version__ = '.'.join(map(str, VERSION)) __author__ = "Derek Gulbranson" __author_email__ = 'derek73@gmail.com' diff --git a/nameparser/parser.py b/nameparser/parser.py index 257660f..2baf892 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -580,16 +580,18 @@ def parse_pieces(self, parts, additional_parts_count=0): def join_on_conjunctions(self, pieces, additional_parts_count=0): """ - Join conjunctions to surrounding pieces, e.g.: + Join conjunctions to surrounding pieces. Title- and prefix-aware. e.g.: - ['Mr.', 'and'. 'Mrs.', 'John', 'Doe'] - v - ['Mr. and Mrs.', 'John', 'Doe'] + ['Mr.', 'and'. 'Mrs.', 'John', 'Doe'] ==> + ['Mr. and Mrs.', 'John', 'Doe'] - - ['The', 'Secretary', 'of', 'State', 'Hillary', 'Clinton'] - v - ['The Secretary of State', 'Hillary', 'Clinton'] + ['The', 'Secretary', 'of', 'State', 'Hillary', 'Clinton'] ==> + ['The Secretary of State', 'Hillary', 'Clinton'] + + When joining titles, saves newly formed piece to the instance's titles + constant so they will be parsed correctly later. E.g. after parsing the + example names above, 'The Secretary of State' and 'Mr. and Mrs.' would + be present in the titles constant set. 
:param list pieces: name pieces strings after split on spaces :param int additional_parts_count: From 0f1d4267909356c860b5f085ae28378920abd784 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Fri, 12 Aug 2016 13:16:59 -0700 Subject: [PATCH 021/163] Fix #54 error for names that end with conjunction --- docs/release_log.rst | 2 ++ nameparser/__init__.py | 2 +- nameparser/parser.py | 16 ++++++++++++---- tests.py | 20 ++++++++++++++++++++ 4 files changed, 35 insertions(+), 5 deletions(-) diff --git a/docs/release_log.rst b/docs/release_log.rst index 3f5c949..28f1f85 100644 --- a/docs/release_log.rst +++ b/docs/release_log.rst @@ -1,5 +1,7 @@ Release Log =========== +* 0.5.1 - August 12, 2016 + - Fix error for names that end with conjunction (#54) * 0.5.0 - August 4, 2016 - Refactor join_on_conjunctions(), fix #53 * 0.4.1 - July 25, 2016 diff --git a/nameparser/__init__.py b/nameparser/__init__.py index 33706c5..71c4f7a 100644 --- a/nameparser/__init__.py +++ b/nameparser/__init__.py @@ -1,4 +1,4 @@ -VERSION = (0, 5, 0) +VERSION = (0, 5, 1) __version__ = '.'.join(map(str, VERSION)) __author__ = "Derek Gulbranson" __author_email__ = 'derek73@gmail.com' diff --git a/nameparser/parser.py b/nameparser/parser.py index 2baf892..d697a69 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -607,7 +607,9 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): rootname_pieces = [p for p in pieces if self.is_rootname(p)] total_length= len(rootname_pieces) + additional_parts_count - # find all the conjunctions, join any conjunctions that are next to each other, then join those newly joined conjunctions and any single conjunctions to the piece before and after it + # find all the conjunctions, join any conjunctions that are next to each + # other, then join those newly joined conjunctions and any single + # conjunctions to the piece before and after it conj_index = [i for i, piece in enumerate(pieces) if self.is_conjunction(piece)] contiguous_conj_i = [] 
@@ -667,11 +669,17 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): self.C.titles.add(new_piece) pieces[i-1] = new_piece pieces.pop(i) - pieces.pop(i) - # subtract 2 from the index of all the remaining conjunctions + rm_count = 2 + try: + pieces.pop(i) + except IndexError: + rm_count = 1 + pass + # subtract the number of removed pieces from the index + # of all the remaining conjunctions for j,val in enumerate(conj_index): if val > i: - conj_index[j]=val-2 + conj_index[j] = val - rm_count # join prefixes to following lastnames: ['de la Vega'], ['van Buren'] diff --git a/tests.py b/tests.py index 6572ddc..32e0b74 100644 --- a/tests.py +++ b/tests.py @@ -1073,6 +1073,26 @@ def test_multiple_conjunctions(self): def test_multiple_conjunctions2(self): hn = HumanName("part1 of and The part2 of the part3 And part4") self.m(hn.first, "part1 of and The part2 of the part3 And part4", hn) + + def test_ends_with_conjunction(self): + hn = HumanName("Jon Dough and") + self.m(hn.first, "Jon", hn) + self.m(hn.last, "Dough and", hn) + + def test_ends_with_two_conjunctions(self): + hn = HumanName("Jon Dough and of") + self.m(hn.first, "Jon", hn) + self.m(hn.last, "Dough and of", hn) + + def test_starts_with_conjunction(self): + hn = HumanName("and Jon Dough") + self.m(hn.first, "and Jon", hn) + self.m(hn.last, "Dough", hn) + + def test_starts_with_two_conjunctions(self): + hn = HumanName("the and Jon Dough") + self.m(hn.first, "the and Jon", hn) + self.m(hn.last, "Dough", hn) # Potential conjunction/prefix treated as initial (because uppercase) def test_uppercase_middle_initial_conflict_with_conjunction(self): From 9b3233966174cadbddbc600ce458322be3441d18 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Fri, 12 Aug 2016 21:40:52 -0700 Subject: [PATCH 022/163] whitespace, remove duplicate has_own_config --- nameparser/parser.py | 136 ++++++++++++++++++++++++++----------------- 1 file changed, 84 insertions(+), 52 deletions(-) diff --git a/nameparser/parser.py 
b/nameparser/parser.py index d697a69..55f574a 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -30,9 +30,9 @@ class HumanName(object): """ Parse a person's name into individual components. - Instantiation assigns to ``full_name``, and assignment to :py:attr:`full_name` - triggers :py:func:`parse_full_name`. After parsing the name, these instance - attributes are available. + Instantiation assigns to ``full_name``, and assignment to + :py:attr:`full_name` triggers :py:func:`parse_full_name`. After parsing the + name, these instance attributes are available. **HumanName Instance Attributes** @@ -51,14 +51,12 @@ class HumanName(object): :param str string_format: python string formatting """ - has_own_config = False - """True if this instance is not using the shared module-level configuration. Read only.""" - C = CONSTANTS """ - A reference to the configuration for this instance, which may or may not be a - reference to the shared, module-wide instance at :py:mod:`~nameparser.config.CONSTANTS`. - See `Customizing the Parser `_. + A reference to the configuration for this instance, which may or may not be + a reference to the shared, module-wide instance at + :py:mod:`~nameparser.config.CONSTANTS`. See `Customizing the Parser + `_. """ original = '' @@ -183,6 +181,10 @@ def as_dict(self, include_empty=True): @property def has_own_config(self): + """ + True if this instance is not using the shared module-level + configuration. + """ return self.C is not CONSTANTS ### attributes @@ -191,7 +193,8 @@ def has_own_config(self): def title(self): """ The person's titles. Any string of consecutive pieces in - :py:mod:`~nameparser.config.titles` or :py:mod:`~nameparser.config.conjunctions` + :py:mod:`~nameparser.config.titles` or + :py:mod:`~nameparser.config.conjunctions` at the beginning of :py:attr:`full_name`. 
""" return " ".join(self.title_list) or self.C.empty_attribute_default @@ -207,8 +210,8 @@ def first(self): @property def middle(self): """ - The person's middle names. All name pieces after the first name and before - the last name parsed from :py:attr:`full_name`. + The person's middle names. All name pieces after the first name and + before the last name parsed from :py:attr:`full_name`. """ return " ".join(self.middle_list) or self.C.empty_attribute_default @@ -225,8 +228,9 @@ def suffix(self): """ The persons's suffixes. Pieces at the end of the name that are found in :py:mod:`~nameparser.config.suffixes`, or pieces that are at the end - of comma separated formats, e.g. "Lastname, Title Firstname Middle[,] Suffix - [, Suffix]" parsed from :py:attr:`full_name`. + of comma separated formats, e.g. + "Lastname, Title Firstname Middle[,] Suffix [, Suffix]" parsed + from :py:attr:`full_name`. """ return ", ".join(self.suffix_list) or self.C.empty_attribute_default @@ -248,8 +252,9 @@ def _set_list(self, attr, value): elif value is None: val = [] else: - raise TypeError("Can only assign strings, lists or None to name attributes. " - "Got {0}".format(type(value))) + raise TypeError( + "Can only assign strings, lists or None to name attributes." + " Got {0}".format(type(value))) setattr(self, attr+"_list", self.parse_pieces(val)) @title.setter @@ -287,7 +292,10 @@ def is_conjunction(self, piece): return piece.lower() in self.C.conjunctions and not self.is_an_initial(piece) def is_prefix(self, piece): - """Lowercase and no periods version of piece is in the `~nameparser.config.titles.PREFIXES` set.""" + """ + Lowercase and no periods version of piece is in the + `~nameparser.config.titles.PREFIXES` set. + """ return lc(piece) in self.C.prefixes def is_roman_numeral(self, value): @@ -318,7 +326,9 @@ def are_suffixes(self, pieces): return True def is_rootname(self, piece): - '''Is not a known title, suffix or prefix. 
Just first, middle, last names.''' + """ + Is not a known title, suffix or prefix. Just first, middle, last names. + """ return lc(piece) not in self.C.suffixes_prefixes_titles \ and not self.is_an_initial(piece) @@ -353,10 +363,12 @@ def collapse_whitespace(self, string): def pre_process(self): """ - This method happens at the beginning of the :py:func:`parse_full_name` before - any other processing of the string aside from unicode normalization, so - it's a good place to do any custom handling in a subclass. - Runs :py:func:`parse_nicknames`. + + This method happens at the beginning of the :py:func:`parse_full_name` + before any other processing of the string aside from unicode + normalization, so it's a good place to do any custom handling in a + subclass. Runs :py:func:`parse_nicknames`. + """ self.parse_nicknames() @@ -394,11 +406,14 @@ def handle_firstnames(self): def parse_full_name(self): """ - The main parse method for the parser. This method is run upon assignment to the - :py:attr:`full_name` attribute or instantiation. + + The main parse method for the parser. This method is run upon + assignment to the :py:attr:`full_name` attribute or instantiation. - Basic flow is to hand off to :py:func:`pre_process` to handle nicknames. It - then splits on commas and chooses a code path depending on the number of commas. + Basic flow is to hand off to :py:func:`pre_process` to handle + nicknames. It then splits on commas and chooses a code path depending + on the number of commas. + :py:func:`parse_pieces` then splits those parts on spaces and :py:func:`join_on_conjunctions` joins any pieces next to conjunctions. 
""" @@ -436,7 +451,9 @@ def parse_full_name(self): nxt = None # title must have a next piece, unless it's just a title - if self.is_title(piece) and (nxt or p_len == 1) and not self.first: + if self.is_title(piece) \ + and (nxt or p_len == 1) \ + and not self.first: self.title_list.append(piece) continue if not self.first: @@ -444,8 +461,8 @@ def parse_full_name(self): continue if self.are_suffixes(pieces[i+1:]) or \ ( - # if the next piece is the last piece and a roman numeral - # but this piece is not an initial + # if the next piece is the last piece and a roman + # numeral but this piece is not an initial self.is_roman_numeral(nxt) and i == p_len - 2 and not self.is_an_initial(piece) ): @@ -458,13 +475,16 @@ def parse_full_name(self): self.middle_list.append(piece) else: - # if all the end parts are suffixes and there is more than one piece in - # the first part. (Suffixes will never appear after last names only, and - # allows potential first names to be in suffixes, e.g. "Johnson, Bart" - if self.are_suffixes(parts[1].split(' ')) and len(parts[0].split(' ')) > 1: + # if all the end parts are suffixes and there is more than one piece + # in the first part. (Suffixes will never appear after last names + # only, and allows potential first names to be in suffixes, e.g. + # "Johnson, Bart" + if self.are_suffixes(parts[1].split(' ')) \ + and len(parts[0].split(' ')) > 1: - # suffix comma: title first middle last [suffix], suffix [suffix] [, suffix] - # parts[0], parts[1:...] + # suffix comma: + # title first middle last [suffix], suffix [suffix] [, suffix] + # parts[0], parts[1:...] 
self.suffix_list += parts[1:] @@ -476,7 +496,9 @@ def parse_full_name(self): except IndexError: nxt = None - if self.is_title(piece) and (nxt or len(pieces) == 1) and not self.first: + if self.is_title(piece) \ + and (nxt or len(pieces) == 1) \ + and not self.first: self.title_list.append(piece) continue if not self.first: @@ -492,8 +514,9 @@ def parse_full_name(self): self.middle_list.append(piece) else: - # lastname comma: last [suffix], title first middles[,] suffix [,suffix] - # parts[0], parts[1], parts[2:...] + # lastname comma: + # last [suffix], title first middles[,] suffix [,suffix] + # parts[0], parts[1], parts[2:...] pieces = self.parse_pieces(parts[1].split(' '), 1) log.debug("pieces: {0}".format(u(pieces))) @@ -501,7 +524,8 @@ def parse_full_name(self): # lastname part may have suffixes in it lastname_pieces = self.parse_pieces(parts[0].split(' '), 1) for piece in lastname_pieces: - # the first one is always a last name, even if it look like a suffix + # the first one is always a last name, even if it look like + # a suffix if self.is_suffix(piece) and len(self.last_list) > 0: self.suffix_list.append(piece) else: @@ -513,7 +537,9 @@ def parse_full_name(self): except IndexError: nxt = None - if self.is_title(piece) and (nxt or len(pieces) == 1) and not self.first: + if self.is_title(piece) \ + and (nxt or len(pieces) == 1) \ + and not self.first: self.title_list.append(piece) continue if not self.first: @@ -547,7 +573,8 @@ def parse_pieces(self, parts, additional_parts_count=0): :param int additional_parts_count: if the comma format contains other parts, we need to know - how many there are to decide if things should be considered a conjunction. + how many there are to decide if things should be considered a + conjunction. 
:return: pieces split on spaces and joined on conjunctions :rtype: list """ @@ -555,15 +582,18 @@ def parse_pieces(self, parts, additional_parts_count=0): output = [] for part in parts: if not isinstance(part, text_types): - raise TypeError("Name parts must be strings. Got {0}".format(type(part))) + raise TypeError("Name parts must be strings. " + "Got {0}".format(type(part))) output += [x.strip(' ,') for x in part.split(' ')] - # If part contains periods, check if it's multiple titles or suffixes together without spaces - # if so, add the new part with periods to the constants so they get parsed correctly later + # If part contains periods, check if it's multiple titles or suffixes + # together without spaces if so, add the new part with periods to the + # constants so they get parsed correctly later for part in output: # if this part has a period not at the beginning or end if self.C.regexes.period_not_at_end.match(part): - # split on periods, any of the split pieces titles or suffixes? ("Lt.Gov.") + # split on periods, any of the split pieces titles or suffixes? + # ("Lt.Gov.") period_chunks = part.split(".") titles = list(filter(self.is_title, period_chunks)) suffixes = list(filter(self.is_suffix, period_chunks)) @@ -595,7 +625,8 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): :param list pieces: name pieces strings after split on spaces :param int additional_parts_count: - :return: new list with piece next to conjunctions merged into one piece with spaces in it. + :return: new list with piece next to conjunctions merged into one piece + with spaces in it. 
:rtype: list """ @@ -605,12 +636,13 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): return pieces rootname_pieces = [p for p in pieces if self.is_rootname(p)] - total_length= len(rootname_pieces) + additional_parts_count + total_length = len(rootname_pieces) + additional_parts_count # find all the conjunctions, join any conjunctions that are next to each # other, then join those newly joined conjunctions and any single # conjunctions to the piece before and after it - conj_index = [i for i, piece in enumerate(pieces) if self.is_conjunction(piece)] + conj_index = [i for i, piece in enumerate(pieces) + if self.is_conjunction(piece)] contiguous_conj_i = [] for i, val in enumerate(conj_index): @@ -674,7 +706,7 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): pieces.pop(i) except IndexError: rm_count = 1 - pass + # subtract the number of removed pieces from the index # of all the remaining conjunctions for j,val in enumerate(conj_index): @@ -724,9 +756,9 @@ def cap_piece(self, piece): def capitalize(self, force=False): """ - The HumanName class can try to guess the correct capitalization - of name entered in all upper or lower case. By default, it will not adjust - the case of names entered in mixed case. To run capitalization on all names + The HumanName class can try to guess the correct capitalization of name + entered in all upper or lower case. By default, it will not adjust the + case of names entered in mixed case. To run capitalization on all names pass the parameter `force=True`. 
:param bool force: force capitalization of strings that include mixed case From 04101ac1c037f03473c8046a8adacf05949b6f46 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Tue, 14 Feb 2017 19:59:10 -0800 Subject: [PATCH 023/163] Add common titles from #57 --- nameparser/__init__.py | 2 +- nameparser/config/titles.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/nameparser/__init__.py b/nameparser/__init__.py index 71c4f7a..7661fab 100644 --- a/nameparser/__init__.py +++ b/nameparser/__init__.py @@ -1,4 +1,4 @@ -VERSION = (0, 5, 1) +VERSION = (0, 5, 2) __version__ = '.'.join(map(str, VERSION)) __author__ = "Derek Gulbranson" __author_email__ = 'derek73@gmail.com' diff --git a/nameparser/config/titles.py b/nameparser/config/titles.py index 28c6d01..70cf295 100644 --- a/nameparser/config/titles.py +++ b/nameparser/config/titles.py @@ -82,6 +82,7 @@ 'asst', 'attache', 'attorney', + 'author', 'ayatollah', 'baba', 'bailiff', @@ -128,6 +129,8 @@ 'commander', 'commander-in-chief', 'commodore', + 'composer', + 'compositeur', 'comptroller', 'controller', 'corporal', From 43bc225f1ead123b44e403596866f17e85ba44b3 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Tue, 14 Feb 2017 20:51:41 -0800 Subject: [PATCH 024/163] add some respectful forms of address to titles #57 --- nameparser/config/titles.py | 6 ++++++ tests.py | 12 ++++++++++++ 2 files changed, 18 insertions(+) diff --git a/nameparser/config/titles.py b/nameparser/config/titles.py index 70cf295..628d9d9 100644 --- a/nameparser/config/titles.py +++ b/nameparser/config/titles.py @@ -178,10 +178,13 @@ 'ekegbian', 'elder', 'elerunwon', + 'eminence', 'emperor', 'empress', 'ens', 'envoy', + 'excellency', + 'excellent', 'exec', 'executive', 'fadm', @@ -213,7 +216,9 @@ 'her', 'hereditary', 'high', + 'highness', 'his', + 'holiness', 'hon', # sorry Hon Solo, but judges seem more common. 
'honorable', 'honourable', @@ -275,6 +280,7 @@ 'misses', 'mister', 'monsignor', + 'most', 'mpco-cg', 'mr', 'mrs', diff --git a/tests.py b/tests.py index 32e0b74..113c725 100644 --- a/tests.py +++ b/tests.py @@ -1231,6 +1231,18 @@ def test_le_as_last_name_with_middle_initial(self): self.m(hn.first, "Yin", hn) self.m(hn.middle, "a", hn) self.m(hn.last, "Le", hn) + + def test_conjunction_in_an_address_with_a_title(self): + hn = HumanName("His Excellency Lord Duncan") + self.m(hn.title, "His Excellency Lord", hn) + self.m(hn.last, "Duncan", hn) + + @unittest.expectedFailure + def test_conjunction_in_an_address_with_a_first_name_title(self): + hn = HumanName("Her Majesty Queen Elizabeth") + self.m(hn.title, "Her Majesty Queen", hn) + # if you want to be technical, Queen is in FIRST_NAME_TITLES + self.m(hn.first, "Elizabeth", hn) class ConstantsCustomization(HumanNameTestBase): From 115628c49cb001c178ca642ad5fad748ca4ac306 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Sun, 19 Mar 2017 21:33:45 -0700 Subject: [PATCH 025/163] Add python 3.6 to travis config --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 571244d..84bf5dd 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,6 +6,7 @@ python: - "3.3" - "3.4" - "3.5" + - "3.6" # command to install dependencies install: - if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install --use-mirrors unittest2; fi From 8bca9cea3d7990e5c4462f399310f3364385a3c6 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Sun, 19 Mar 2017 21:36:01 -0700 Subject: [PATCH 026/163] add titles from VIAF (#57) --- nameparser/config/titles.py | 201 ++++++++++++++++++++++++++++++++++++ 1 file changed, 201 insertions(+) diff --git a/nameparser/config/titles.py b/nameparser/config/titles.py index 628d9d9..0f37a19 100644 --- a/nameparser/config/titles.py +++ b/nameparser/config/titles.py @@ -43,22 +43,38 @@ "marquis", "marquise", "queen's", + '10th', '1lt', '1sgt', + '1st', '1stlt', '1stsgt', '2lt', + 
'2nd', '2ndlt', + '3rd', + '4th', + '5th', + '6th', + '7th', + '8th', + '9th', 'a1c', 'ab', 'abbess', 'abbot', + 'abolitionist', 'academic', 'acolyte', + 'activist', + 'actor ', + 'actress', 'adept', 'adjutant', 'adm', 'admiral', + 'advertising', + 'adviser', 'advocate', 'air', 'akhoond', @@ -67,6 +83,9 @@ 'ambassador', 'amn', 'analytics', + 'anarchist', + 'animator', + 'anthropologist', 'appellate', 'apprentice', 'arbitrator', @@ -75,73 +94,123 @@ 'archdruid', 'archduchess', 'archduke', + 'archeologist', + 'architect', 'arhat', + 'army', + 'arranger', 'assistant', 'assoc', 'associate', 'asst', + 'astronomer', 'attache', 'attorney', 'author', + 'award-winning', 'ayatollah', 'baba', 'bailiff', + 'ballet', + 'bandleader', + 'banker', 'banner', 'bard', 'baron', 'barrister', + 'baseball', 'bearer', + 'behavioral', 'bench', 'bg', 'bgen', + 'biblical', + 'bibliographer', + 'biochemist', + 'biographer', + 'biologist', + 'bishop', 'blessed', + 'blogger', + 'blues', 'bodhisattva', + 'bookseller', + 'botanist', 'brigadier', 'briggen', + 'british', + 'broadcaster', 'buddha', 'burgess', + 'burlesque', 'business', + 'businessman', + 'businesswoman', 'bwana', 'canon', 'capt', 'captain', 'cardinal', + 'cartographer', + 'cartoonist', 'catholicos', 'ccmsgt', 'cdr', + 'celebrity', 'ceo', 'cfo', 'chair', 'chairs', 'chancellor', 'chaplain', + 'chef', + 'chemist', 'chief', 'chieftain', + 'choreographer', 'civil', + 'classical', + 'clergyman', 'clerk', 'cmsaf', 'cmsgt', 'co-chair', 'co-chairs', + 'co-founder', 'coach', 'col', + 'collector', 'colonel', + 'comedian', + 'comedienne', + 'comic', 'commander', 'commander-in-chief', 'commodore', 'composer', 'compositeur', 'comptroller', + 'computer', + 'comtesse', + 'conductor', + 'consultant', + 'contessa', 'controller', 'corporal', 'corporate', + 'correspondent', 'councillor', + 'counselor', + 'count', + 'countess', 'courtier', 'cpl', 'cpo', 'cpt', 'credit', 'criminal', + 'criminologist', + 'critic', 'csm', 'curator', 'customs', @@ -153,13 
+222,20 @@ 'cwo3', 'cwo4', 'cwo5', + 'cyclist', + 'dancer', 'deacon', 'delegate', 'deputy', 'designated', + 'designer', + 'detective', + 'developer', + 'diplomat', 'dir', 'director', 'discovery', + 'dissident', 'district', 'division', 'do', @@ -169,11 +245,18 @@ 'doyen', 'dpty', 'dr', + 'dramatist', 'druid', + 'drummer', + 'duchesse', 'duke', 'dutchess', + 'ecologist', + 'economist', + 'editor', 'edmi', 'edohen', + 'educator', 'effendi', 'ekegbian', 'elder', @@ -181,23 +264,33 @@ 'eminence', 'emperor', 'empress', + 'engineer', + 'english', 'ens', + 'entertainer', + 'entrepreneur', 'envoy', + 'essayist', + 'evangelist', 'excellency', 'excellent', 'exec', 'executive', + 'expert', 'fadm', 'family', 'federal', 'field', + 'film', 'financial', 'first', 'flag', 'flying', 'foreign', 'forester', + 'founder', 'friar', + 'gaf', 'gen', 'general', 'generalissimo', @@ -206,41 +299,64 @@ 'goodman', 'goodwife', 'governor', + 'graf', 'grand', 'group', + 'guitarist', 'guru', 'gyani', 'gysgt', 'hajji', 'headman', + 'heir', + 'heiress', 'her', 'hereditary', 'high', 'highness', 'his', + 'historian', + 'historicus', + 'historien', 'holiness', 'hon', # sorry Hon Solo, but judges seem more common. 
'honorable', 'honourable', + 'host', + 'illustrator', 'imam', + 'industrialist', 'information', + 'instructor', 'intelligence', 'intendant', + 'inventor', + 'investigator', + 'investor', + 'journalist', 'journeyman', 'jr', 'judge', 'judicial', 'junior', + 'jurist', + 'keyboardist', 'kingdom', 'knowledge', 'lady', 'lama', 'lamido', 'law', + 'lawyer', 'lcdr', 'lcpl', 'leader', + 'lecturer', + 'legal', + 'librarian', 'lieutenant', + 'linguist', + 'literary', 'lord', 'lt', 'ltc', @@ -248,13 +364,16 @@ 'ltg', 'ltgen', 'ltjg', + 'lyricist', 'madam', 'madame', + 'mademoiselle', 'mag', 'mag-judge', 'mag/judge', 'magistrate', 'magistrate-judge', + 'magnate', 'maharajah', 'maharani', 'mahdi', @@ -263,7 +382,11 @@ 'majgen', 'manager', 'marcher', + 'marchess', 'marketing', + 'marquis', + 'mathematician', + 'mathematics', 'matriarch', 'mayor', 'mcpo', @@ -271,16 +394,26 @@ 'mcpon', 'md', 'member', + 'memoirist', + 'merchant', 'metropolitan', 'mg', 'mgr', 'mgysgt', + 'military', 'minister', 'miss', 'misses', + 'missionary', 'mister', + 'mlle', + 'mme', + 'mobster', + 'model', + 'monk', 'monsignor', 'most', + 'mountaineer', 'mpco-cg', 'mr', 'mrs', @@ -291,23 +424,50 @@ 'mullah', 'municipal', 'murshid', + 'musician', + 'musicologist', + 'mystery', 'nanny', + 'narrator', 'national', + 'naturalist', + 'navy', + 'neuroscientist', + 'novelist', 'nurse', + 'obstetritian', 'officer', + 'opera', 'operating', + 'ornithologist', + 'painter', + 'paleontologist', 'pastor', 'patriarch', + 'pediatrician', + 'personality', 'petty', 'pfc', 'pharaoh', + 'phd', + 'philantropist', + 'philosopher', + 'photographer', + 'physician', + 'physicist', + 'pianist', 'pilot', + 'pioneer', 'pir', + 'player', + 'playwright', 'po1', 'po2', 'po3', + 'poet', 'police', 'political', + 'politician', 'prefect', 'prelate', 'premier', @@ -323,16 +483,24 @@ 'prince', 'princess', 'principal', + 'printer', + 'printmaker', 'prior', 'private', 'pro', + 'producer', 'prof', + 'professor', 'provost', 'pslc', + 'psychiatrist', 
+ 'psychologist', + 'publisher', 'pursuivant', 'pv2', 'pvt', 'rabbi', + 'radio', 'radm', 'rangatira', 'ranger', @@ -342,18 +510,27 @@ 'registrar', 'rep', 'representative', + 'researcher', 'resident', 'rev', 'revenue', 'reverend', 'right', 'risk', + 'rock', 'royal', 'rt', 'sa', + 'sailor', 'saint', + 'sainte', 'saoshyant', + 'satirist', + 'scholar', + 'schoolmaster', + 'scientist', 'scpo', + 'screenwriter', 'se', 'secretary', 'security', @@ -372,10 +549,18 @@ 'sheikh', 'sheriff', 'siddha', + 'singer', + 'singer-songwriter', 'sma', 'smsgt', 'sn', + 'soccer', + 'social', + 'sociologist', + 'software', + 'soldier', 'solicitor', + 'soprano', 'spc', 'speaker', 'special', @@ -389,17 +574,29 @@ 'strategy', 'subaltern', 'subedar', + 'suffragist', 'sultan', 'sultana', 'superior', 'supreme', 'surgeon', + 'swami', 'swordbearer', 'sysselmann', 'tax', + 'teacher', 'technical', + 'technologist', + 'television ', + 'tenor', + 'theater', + 'theatre', + 'theologian', + 'theorist', 'timi', 'tirthankar', + 'translator', + 'travel', 'treasurer', 'tsar', 'tsarina', @@ -416,6 +613,8 @@ 'vice', 'viscount', 'vizier', + 'vocalist', + 'voice', 'warden', 'warrant', 'wing', @@ -426,4 +625,6 @@ 'wo4', 'wo5', 'woodman', + 'writer', + 'zoologist', ]) From 43fa4798ce2613423e51ede19789c89816a0156c Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Tue, 27 Jun 2017 20:24:05 -0700 Subject: [PATCH 027/163] remove emoji from initial string by default, fix #58 disable by setting 'emoji' regex to False add example in docs --- docs/customize.rst | 26 ++++++++++++++++++++++---- nameparser/__init__.py | 2 +- nameparser/config/regexes.py | 17 +++++++++++++++++ nameparser/parser.py | 11 ++++++++++- tests.py | 22 ++++++++++++++++++++++ 5 files changed, 72 insertions(+), 6 deletions(-) diff --git a/docs/customize.rst b/docs/customize.rst index 4e8ba48..2da1980 100644 --- a/docs/customize.rst +++ b/docs/customize.rst @@ -116,6 +116,8 @@ If you don't want to detect any titles at all, you can remove all of 
them: Adding a Title ~~~~~~~~~~~~~~~~ +You can also pass a ``Constants`` instance to ``HumanName``on instantiation. + "Dean" is a common first name so it is not included in the default titles constant. But in some contexts it is more common as a title. If you would like "Dean" to be parsed as a title, simply add it to the titles constant. @@ -130,10 +132,11 @@ making them lower case and removing periods. :options: +ELLIPSIS, +NORMALIZE_WHITESPACE >>> from nameparser import HumanName - >>> from nameparser.config import CONSTANTS - >>> CONSTANTS.titles.add('dean', 'Chemistry') + >>> from nameparser.config import Constants + >>> constants = Constants() + >>> constants.titles.add('dean', 'Chemistry') SetManager({'right', ..., 'tax'}) - >>> hn = HumanName("Assoc Dean of Chemistry Robert Johns") + >>> hn = HumanName("Assoc Dean of Chemistry Robert Johns", constants=constants) >>> hn -If you'd prefer new instances to have their own config values, you can pass +If you'd prefer new instances to have their own config values, one shortcut is to pass ``None`` as the second argument (or ``constant`` keyword argument) when instantiating ``HumanName``. Each instance always has a ``C`` attribute, but if you didn't pass something falsey to the ``constants`` argument then it's a @@ -200,6 +203,21 @@ reference to the module-level config values with the behavior described above. >>> other_instance.has_own_config True +Don't Remove Emojis +~~~~~~~~~~~~~~~~~~~ + +By default, all emojis are removed from the input string before the name is parsed. +You can turn this off by setting the ``emoji`` regex to ``False``. + +.. 
doctest:: + + >>> from nameparser import HumanName + >>> from nameparser.config import Constants + >>> constants = Constants() + >>> constants.regexes.emoji = False + >>> hn = HumanName("Sam 😊 Smith", constants=constants) + >>> hn + "Sam 😊 Smith" Config Changes May Need Parse Refresh ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/nameparser/__init__.py b/nameparser/__init__.py index 7661fab..ea1125d 100644 --- a/nameparser/__init__.py +++ b/nameparser/__init__.py @@ -1,4 +1,4 @@ -VERSION = (0, 5, 2) +VERSION = (0, 5, 3) __version__ = '.'.join(map(str, VERSION)) __author__ = "Derek Gulbranson" __author_email__ = 'derek73@gmail.com' diff --git a/nameparser/config/regexes.py b/nameparser/config/regexes.py index 01ca86d..12d3efe 100644 --- a/nameparser/config/regexes.py +++ b/nameparser/config/regexes.py @@ -2,6 +2,22 @@ from __future__ import unicode_literals import re +# emoji regex from https://stackoverflow.com/questions/26568722/remove-unicode-emoji-using-re-in-python +try: + # Wide UCS-4 build + re_emoji = re.compile(u'[' + u'\U0001F300-\U0001F64F' + u'\U0001F680-\U0001F6FF' + u'\u2600-\u26FF\u2700-\u27BF]+', + re.UNICODE) +except re.error: + # Narrow UCS-2 build + re_emoji = re.compile(u'(' + u'\ud83c[\udf00-\udfff]|' + u'\ud83d[\udc00-\ude4f\ude80-\udeff]|' + u'[\u2600-\u26FF\u2700-\u27BF])+', + re.UNICODE) + REGEXES = set([ ("spaces", re.compile(r"\s+", re.U)), ("word", re.compile(r"(\w|\.)+", re.U)), @@ -11,6 +27,7 @@ ("roman_numeral", re.compile(r'^(X|IX|IV|V?I{0,3})$', re.I | re.U)), ("no_vowels",re.compile(r'^[^aeyiuo]+$', re.I | re.U)), ("period_not_at_end",re.compile(r'.*\..+$', re.I | re.U)), + ("emoji",re_emoji), ]) """ All regular expressions used by the parser are precompiled and stored in the config. 
diff --git a/nameparser/parser.py b/nameparser/parser.py index 55f574a..55d85df 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -77,6 +77,7 @@ def __init__(self, full_name="", constants=CONSTANTS, encoding=ENCODING, self.ENCODING = encoding self.string_format = string_format or self.C.string_format + # full_name setter triggers the parse self.full_name = full_name def __iter__(self): @@ -371,7 +372,7 @@ def pre_process(self): """ self.parse_nicknames() - + self.squash_emoji() def post_process(self): """ @@ -392,6 +393,14 @@ def parse_nicknames(self): self.nickname_list = re_nickname.findall(self._full_name) self._full_name = re_nickname.sub('', self._full_name) + def squash_emoji(self): + """ + Remove emoji from the input string. + """ + re_emoji = self.C.regexes.emoji + if re_emoji and re_emoji.search(self._full_name): + self._full_name = re_emoji.sub('', self._full_name) + def handle_firstnames(self): """ If there are only two parts and one is a title, assume it's a last name diff --git a/tests.py b/tests.py index 113c725..983eed8 100644 --- a/tests.py +++ b/tests.py @@ -1936,6 +1936,28 @@ def test_formating_of_nicknames_in_middle(self): self.assertEqual(u(hn), "Rev John (Kenny) A. Kenneth Doe III") hn.nickname='' self.assertEqual(u(hn), "Rev John A. 
Kenneth Doe III") + + def test_remove_emojis(self): + hn = HumanName("Sam Smith 😊") + self.m(hn.first,"Sam", hn) + self.m(hn.last,"Smith", hn) + self.assertEqual(u(hn), "Sam Smith") + + def test_keep_non_emojis(self): + hn = HumanName("∫≜⩕ Smith 😊") + self.m(hn.first,"∫≜⩕", hn) + self.m(hn.last,"Smith", hn) + self.assertEqual(u(hn), "∫≜⩕ Smith") + + def test_keep_emojis(self): + from nameparser.config import Constants + constants = Constants() + constants.regexes.emoji = False + hn = HumanName("∫≜⩕ Smith😊", constants) + self.m(hn.first,"∫≜⩕", hn) + self.m(hn.last,"Smith😊", hn) + self.assertEqual(u(hn), "∫≜⩕ Smith😊") + # test cleanup TEST_NAMES = ( "John Doe", From 0bc5929b4cb0999d620fd3794be9ce060e8ec3ef Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Tue, 27 Jun 2017 20:24:46 -0700 Subject: [PATCH 028/163] update dev requirements --- dev-requirements.txt | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index 0d116e5..8aab0b6 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,7 +1,6 @@ -ipdb==0.9.0 -nose==1.3.7 -Sphinx==1.3.6 -coverage==4.0.3 -ipython==4.1.2 -Pygments==2.1.3 -dill==0.2.5 +ipdb +nose>=1.3.7 +coverage>=4.0.3 +dill>=0.2.5 +twine +Sphinx From acac4ba1487d77fa12421dabadb6e51cc053019f Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Tue, 27 Jun 2017 20:29:31 -0700 Subject: [PATCH 029/163] fix emoji regex on python 3.2 --- nameparser/config/regexes.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/nameparser/config/regexes.py b/nameparser/config/regexes.py index 12d3efe..ceb66a2 100644 --- a/nameparser/config/regexes.py +++ b/nameparser/config/regexes.py @@ -5,16 +5,16 @@ # emoji regex from https://stackoverflow.com/questions/26568722/remove-unicode-emoji-using-re-in-python try: # Wide UCS-4 build - re_emoji = re.compile(u'[' - u'\U0001F300-\U0001F64F' - u'\U0001F680-\U0001F6FF' + re_emoji = re.compile(u'[' + + 
u'\U0001F300-\U0001F64F' + + u'\U0001F680-\U0001F6FF' + u'\u2600-\u26FF\u2700-\u27BF]+', re.UNICODE) except re.error: # Narrow UCS-2 build - re_emoji = re.compile(u'(' - u'\ud83c[\udf00-\udfff]|' - u'\ud83d[\udc00-\ude4f\ude80-\udeff]|' + re_emoji = re.compile(u'(' + + u'\ud83c[\udf00-\udfff]|' + + u'\ud83d[\udc00-\ude4f\ude80-\udeff]|' + u'[\u2600-\u26FF\u2700-\u27BF])+', re.UNICODE) From 891945d5b4858b76b3ac4ccfc1612366a41e1b7d Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Tue, 27 Jun 2017 20:34:45 -0700 Subject: [PATCH 030/163] really fix emoji regex on python 3.2 --- nameparser/config/regexes.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/nameparser/config/regexes.py b/nameparser/config/regexes.py index ceb66a2..2024103 100644 --- a/nameparser/config/regexes.py +++ b/nameparser/config/regexes.py @@ -5,17 +5,17 @@ # emoji regex from https://stackoverflow.com/questions/26568722/remove-unicode-emoji-using-re-in-python try: # Wide UCS-4 build - re_emoji = re.compile(u'[' + - u'\U0001F300-\U0001F64F' + - u'\U0001F680-\U0001F6FF' + - u'\u2600-\u26FF\u2700-\u27BF]+', + re_emoji = re.compile('[' + '\U0001F300-\U0001F64F' + '\U0001F680-\U0001F6FF' + '\u2600-\u26FF\u2700-\u27BF]+', re.UNICODE) except re.error: # Narrow UCS-2 build - re_emoji = re.compile(u'(' + - u'\ud83c[\udf00-\udfff]|' + - u'\ud83d[\udc00-\ude4f\ude80-\udeff]|' + - u'[\u2600-\u26FF\u2700-\u27BF])+', + re_emoji = re.compile(u'(' + '\ud83c[\udf00-\udfff]|' + '\ud83d[\udc00-\ude4f\ude80-\udeff]|' + '[\u2600-\u26FF\u2700-\u27BF])+', re.UNICODE) REGEXES = set([ From 3be2094039f32b9116fcda61946d75e18be8270c Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Tue, 27 Jun 2017 20:36:48 -0700 Subject: [PATCH 031/163] really really fix emoji regex on python 3.2 --- nameparser/config/regexes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nameparser/config/regexes.py b/nameparser/config/regexes.py index 2024103..51a6ed2 100644 --- 
a/nameparser/config/regexes.py +++ b/nameparser/config/regexes.py @@ -12,7 +12,7 @@ re.UNICODE) except re.error: # Narrow UCS-2 build - re_emoji = re.compile(u'(' + re_emoji = re.compile('(' '\ud83c[\udf00-\udfff]|' '\ud83d[\udc00-\ude4f\ude80-\udeff]|' '[\u2600-\u26FF\u2700-\u27BF])+', From bdd9118a33b0a7c3d7c65985f37722bf5340d45a Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Tue, 27 Jun 2017 20:46:59 -0700 Subject: [PATCH 032/163] update release log --- docs/release_log.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/release_log.rst b/docs/release_log.rst index 28f1f85..dbd644a 100644 --- a/docs/release_log.rst +++ b/docs/release_log.rst @@ -1,5 +1,7 @@ Release Log =========== +* 0.5.2 - June 27, 2017 + - Remove emojis from initial string by default with option to include emojis (#58) * 0.5.1 - August 12, 2016 - Fix error for names that end with conjunction (#54) * 0.5.0 - August 4, 2016 From 84ac117d1e360ae3e320389f6d22ed285862f6af Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Tue, 27 Jun 2017 20:49:54 -0700 Subject: [PATCH 033/163] typo --- docs/customize.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/customize.rst b/docs/customize.rst index 2da1980..0809c89 100644 --- a/docs/customize.rst +++ b/docs/customize.rst @@ -116,7 +116,7 @@ If you don't want to detect any titles at all, you can remove all of them: Adding a Title ~~~~~~~~~~~~~~~~ -You can also pass a ``Constants`` instance to ``HumanName``on instantiation. +You can also pass a ``Constants`` instance to ``HumanName`` on instantiation. "Dean" is a common first name so it is not included in the default titles constant. But in some contexts it is more common as a title. 
If you would From d0c8be602a8d5aeb5b5839bfb4f9890ad9f99ec8 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Tue, 27 Jun 2017 20:54:23 -0700 Subject: [PATCH 034/163] update release log --- docs/release_log.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/release_log.rst b/docs/release_log.rst index dbd644a..982d5dd 100644 --- a/docs/release_log.rst +++ b/docs/release_log.rst @@ -1,7 +1,9 @@ Release Log =========== -* 0.5.2 - June 27, 2017 +* 0.5.3 - June 27, 2017 - Remove emojis from initial string by default with option to include emojis (#58) +* 0.5.2 - March 19, 2017 + - Added names scrapped from VIAF data, thanks daryanypl (#57) * 0.5.1 - August 12, 2016 - Fix error for names that end with conjunction (#54) * 0.5.0 - August 4, 2016 From 2b0d339fe2e739739a33358ead0f7b1b4038af46 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Thu, 10 Aug 2017 21:07:43 -0700 Subject: [PATCH 035/163] add the full set of Italian derivatives from "di" (preposizioni articolate), fix #59 sort and de-dup --- nameparser/config/prefixes.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/nameparser/config/prefixes.py b/nameparser/config/prefixes.py index 64731e8..21c82fa 100644 --- a/nameparser/config/prefixes.py +++ b/nameparser/config/prefixes.py @@ -4,18 +4,23 @@ #: Name pieces that appear before a last name. They join to the piece that follows them to make one new piece. 
PREFIXES = set([ 'abu', - 'bon', 'bin', + 'bon', 'da', 'dal', 'de', + 'degli', + 'dei', 'del', 'dela', + 'della', + 'delle', + 'delli', + 'dello', 'der', - 'de', 'di', - 'dí', 'du', + 'dí', 'ibn', 'la', 'le', From 6ab3baa37d513be5a2ae0cad2a6d5727eb706c72 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Thu, 10 Aug 2017 21:11:05 -0700 Subject: [PATCH 036/163] add "dr" to suffixes, fixes #62 --- nameparser/config/suffixes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nameparser/config/suffixes.py b/nameparser/config/suffixes.py index b8c4f4b..9cb3345 100644 --- a/nameparser/config/suffixes.py +++ b/nameparser/config/suffixes.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals SUFFIX_NOT_ACRONYMS = set([ + 'dr', 'esq', 'esquire', 'jr', From e7ffbca1b6ebaadf364d48c273167cd2d306d359 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Thu, 10 Aug 2017 21:16:56 -0700 Subject: [PATCH 037/163] bump to v1.0.0, update release log --- docs/release_log.rst | 4 ++++ docs/usage.rst | 5 ++++- nameparser/__init__.py | 2 +- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/docs/release_log.rst b/docs/release_log.rst index 982d5dd..62c17d3 100644 --- a/docs/release_log.rst +++ b/docs/release_log.rst @@ -1,5 +1,9 @@ Release Log =========== +* 1.0.0 - August 10, 2017 + - Refactor tests into fixtures by MilesCranmer (#61) + - Add Dr to suffixes (#62) + - Add the full set of Italian derivatives from "di" (#59) * 0.5.3 - June 27, 2017 - Remove emojis from initial string by default with option to include emojis (#58) * 0.5.2 - March 19, 2017 diff --git a/docs/usage.rst b/docs/usage.rst index 9711331..aa21951 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -116,7 +116,10 @@ available from the nickname attribute. Change the output string with string formatting ----------------------------------------------- -The string representation of a `HumanName` instance is controlled by its `string_format` attribute. 
The default value, "{title} {first} {middle} {last} {suffix} ({nickname})", includes parenthesis around nicknames. Trailing commas and empty quotes and parenthesis are automatically removed if the name has no nickname pieces. +The string representation of a `HumanName` instance is controlled by its `string_format` attribute. +The default value, `"{title} {first} {middle} {last} {suffix} ({nickname})"`, includes parenthesis +around nicknames. Trailing commas and empty quotes and parenthesis are automatically removed if the +name has no nickname pieces. You can change the default formatting for all `HumanName` instances by setting a new :py:attr:`~nameparser.config.Constants.string_format` value on the shared diff --git a/nameparser/__init__.py b/nameparser/__init__.py index ea1125d..e161e63 100644 --- a/nameparser/__init__.py +++ b/nameparser/__init__.py @@ -1,4 +1,4 @@ -VERSION = (0, 5, 3) +VERSION = (1, 0, 0) __version__ = '.'.join(map(str, VERSION)) __author__ = "Derek Gulbranson" __author_email__ = 'derek73@gmail.com' From 34d6a8660dd0a29c12190f3657fd2245c7093b15 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Thu, 10 Aug 2017 22:17:28 -0700 Subject: [PATCH 038/163] tweak capitalization of mac regex to fix "mack", fixes #56 --- nameparser/config/regexes.py | 2 +- tests.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/nameparser/config/regexes.py b/nameparser/config/regexes.py index 51a6ed2..42da85d 100644 --- a/nameparser/config/regexes.py +++ b/nameparser/config/regexes.py @@ -21,7 +21,7 @@ REGEXES = set([ ("spaces", re.compile(r"\s+", re.U)), ("word", re.compile(r"(\w|\.)+", re.U)), - ("mac", re.compile(r'^(ma?c)(\w+)', re.I | re.U)), + ("mac", re.compile(r'^(ma?c)(\w{2,})', re.I | re.U)), ("initial", re.compile(r'^(\w\.|[A-Z])?$', re.U)), ("nickname", re.compile(r'\s*?[\("](.+?)[\)"]', re.U)), ("roman_numeral", re.compile(r'^(X|IX|IV|V?I{0,3})$', re.I | re.U)), diff --git a/tests.py b/tests.py index 983eed8..58dd05d 100644 --- 
a/tests.py +++ b/tests.py @@ -1858,6 +1858,10 @@ def test_downcasing_mc(self): hn.capitalize() self.m(str(hn), 'Ronald McDonald', hn) + def test_short_names_with_mac(self): + hn = HumanName('mack johnson') + hn.capitalize() + self.m(str(hn), 'Mack Johnson', hn) class HumanNameOutputFormatTests(HumanNameTestBase): From 4515e5bc3503ed18b4ca7cdff4f9062cc8e8978e Mon Sep 17 00:00:00 2001 From: Simeon Visser Date: Tue, 3 Oct 2017 11:44:27 +0100 Subject: [PATCH 039/163] Avoid use of sys.version for checking Python version --- nameparser/parser.py | 4 ++-- nameparser/util.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/nameparser/parser.py b/nameparser/parser.py index 55d85df..3cc3949 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -133,7 +133,7 @@ def __unicode__(self): return " ".join(self) def __str__(self): - if sys.version >= '3': + if sys.version_info[0] >= 3: return self.__unicode__() return self.__unicode__().encode(self.ENCODING) @@ -150,7 +150,7 @@ def __repr__(self): 'suffix': self.suffix or '', 'nickname': self.nickname or '', } - if sys.version >= '3': + if sys.version_info[0] >= 3: return _string return _string.encode(self.ENCODING) diff --git a/nameparser/util.py b/nameparser/util.py index 899bcb0..4ef7458 100644 --- a/nameparser/util.py +++ b/nameparser/util.py @@ -13,7 +13,7 @@ def emit(self, record): import sys -if sys.version < '3': +if sys.version_info[0] < 3: text_type = unicode binary_type = str From dead673e1958f27f60c7382b15236fa9bf9e771a Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Thu, 5 Oct 2017 10:16:00 +0100 Subject: [PATCH 040/163] correct spelling mistakes --- docs/customize.rst | 4 ++-- docs/usage.rst | 2 +- tests.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/customize.rst b/docs/customize.rst index 0809c89..7442300 100644 --- a/docs/customize.rst +++ b/docs/customize.rst @@ -43,8 +43,8 @@ Editable attributes of nameparser.config.CONSTANTS * 
:py:obj:`~nameparser.config.Constants.first_name_titles` - Titles that, when followed by a single name, that name is a first name, e.g. "King David" * :py:obj:`~nameparser.config.Constants.suffix_acronyms` - Pieces that come at the end of the name that may or may not have periods separating the letters, e.g. "m.d." * :py:obj:`~nameparser.config.Constants.suffix_not_acronyms` - Pieces that come at the end of the name that never have periods separating the letters, e.g. "Jr." -* :py:obj:`~nameparser.config.Constants.conjunctions` - Connectors like "and" that join the preceeding piece to the following piece. -* :py:obj:`~nameparser.config.Constants.prefixes` - Connectors like "del" and "bin" that join to the following piece but not the preceeding +* :py:obj:`~nameparser.config.Constants.conjunctions` - Connectors like "and" that join the preceding piece to the following piece. +* :py:obj:`~nameparser.config.Constants.prefixes` - Connectors like "del" and "bin" that join to the following piece but not the preceding * :py:obj:`~nameparser.config.Constants.capitalization_exceptions` - Dictionary of pieces that do not capitalize the first letter, e.g. "Ph.D" * :py:obj:`~nameparser.config.Constants.regexes` - Regular expressions used to find words, initials, nicknames, etc. diff --git a/docs/usage.rst b/docs/usage.rst index aa21951..f3ab41b 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -136,7 +136,7 @@ You can change the default formatting for all `HumanName` instances by setting a >>> str(name) 'Robert (Rob) Johnson' -You can control the order and presense of any name fields by changing the +You can control the order and presence of any name fields by changing the :py:attr:`~nameparser.config.Constants.string_format` attribute of the shared CONSTANTS instance. Don't want to include nicknames in your output? No problem. Just omit that keyword from the `string_format` attribute. 
diff --git a/tests.py b/tests.py index 58dd05d..a1b31bd 100644 --- a/tests.py +++ b/tests.py @@ -1465,7 +1465,7 @@ def test_two_suffixes(self): hn = HumanName("Kenneth Clarke QC MP") self.m(hn.first, "Kenneth", hn) self.m(hn.last, "Clarke", hn) - # NOTE: this adds a comma when the orginal format did not have one. + # NOTE: this adds a comma when the original format did not have one. # not ideal but at least its in the right bucket self.m(hn.suffix, "QC, MP", hn) @@ -1473,7 +1473,7 @@ def test_two_suffixes_lastname_comma_format(self): hn = HumanName("Washington Jr. MD, Franklin") self.m(hn.first, "Franklin", hn) self.m(hn.last, "Washington", hn) - # NOTE: this adds a comma when the orginal format did not have one. + # NOTE: this adds a comma when the original format did not have one. self.m(hn.suffix, "Jr., MD", hn) def test_two_suffixes_suffix_comma_format(self): From b7aded4796e8f2d46647b3023bfe6d5cecc5456b Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Wed, 6 Dec 2017 19:56:03 -0800 Subject: [PATCH 041/163] set v0.5.4 (1.0 will have to wait for another day) --- .gitignore | 1 + docs/release_log.rst | 4 ++-- nameparser/__init__.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index c16e878..c586728 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ build *.egg .coverage dist +.idea # docs docs/_* diff --git a/docs/release_log.rst b/docs/release_log.rst index 62c17d3..725f9a2 100644 --- a/docs/release_log.rst +++ b/docs/release_log.rst @@ -1,9 +1,9 @@ Release Log =========== -* 1.0.0 - August 10, 2017 - - Refactor tests into fixtures by MilesCranmer (#61) +* 0.5.4 - December 10, 2017 - Add Dr to suffixes (#62) - Add the full set of Italian derivatives from "di" (#59) + - Add parameter to specify the encoding of strings added to constants, use 'UTF-8' as fallback (#67) * 0.5.3 - June 27, 2017 - Remove emojis from initial string by default with option to include emojis (#58) * 0.5.2 - March 19, 2017 diff 
--git a/nameparser/__init__.py b/nameparser/__init__.py index e161e63..074bf5e 100644 --- a/nameparser/__init__.py +++ b/nameparser/__init__.py @@ -1,4 +1,4 @@ -VERSION = (1, 0, 0) +VERSION = (0, 5, 4) __version__ = '.'.join(map(str, VERSION)) __author__ = "Derek Gulbranson" __author_email__ = 'derek73@gmail.com' From 6962b51995c43da5f6a6fddac750d128437870cd Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Wed, 6 Dec 2017 19:59:19 -0800 Subject: [PATCH 042/163] move default encoding to constants file, use it as fallback for add(), fix #67 also add ability to specify encoding of values added to the constants using add_with_encoding() --- nameparser/config/__init__.py | 22 ++++++++++++++++------ nameparser/parser.py | 13 +++++++------ tests.py | 9 ++++++++- 3 files changed, 31 insertions(+), 13 deletions(-) diff --git a/nameparser/config/__init__.py b/nameparser/config/__init__.py index 7bddf90..3b11e88 100644 --- a/nameparser/config/__init__.py +++ b/nameparser/config/__init__.py @@ -43,6 +43,8 @@ from nameparser.config.titles import FIRST_NAME_TITLES from nameparser.config.regexes import REGEXES +DEFAULT_ENCODING = 'UTF-8' + class SetManager(collections.Set): ''' Easily add and remove config variables per module or instance. Subclass of @@ -84,15 +86,23 @@ def __next__(self): self.count = c + 1 return getattr(self, self.elements[c]) or next(self) + def add_with_encoding(self, s, encoding=None): + """ + Add the lower case and no-period version of the string to the set. Pass an + explicit `encoding` parameter to specify the encoding of binary strings that + are not DEFAULT_ENCODING (UTF-8). + """ + encoding = encoding or sys.stdin.encoding or DEFAULT_ENCODING + if type(s) == binary_type: + s = s.decode(encoding) + self.elements.add(lc(s)) + def add(self, *strings): """ Add the lower case and no-period version of the string arguments to the set. - Returns ``self`` for chaining. + Can pass a list of strings. Returns ``self`` for chaining. 
""" - for s in strings: - if type(s) == binary_type: - s = s.decode(sys.stdin.encoding) - self.elements.add(lc(s)) + [self.add_with_encoding(s) for s in strings] return self def remove(self, *strings): @@ -193,7 +203,7 @@ def suffixes_prefixes_titles(self): if not self._pst: self._pst = self.prefixes | self.suffix_acronyms | self.suffix_not_acronyms | self.titles return self._pst - + def __repr__(self): return "" diff --git a/nameparser/parser.py b/nameparser/parser.py index 55d85df..f6748e5 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -11,6 +11,7 @@ from nameparser.util import log from nameparser.config import CONSTANTS from nameparser.config import Constants +from nameparser.config import DEFAULT_ENCODING ENCODING = 'utf-8' @@ -69,13 +70,13 @@ class HumanName(object): unparsable = True _full_name = '' - def __init__(self, full_name="", constants=CONSTANTS, encoding=ENCODING, + def __init__(self, full_name="", constants=CONSTANTS, encoding=DEFAULT_ENCODING, string_format=None): self.C = constants if type(self.C) is not type(CONSTANTS): self.C = Constants() - self.ENCODING = encoding + self.encoding = encoding self.string_format = string_format or self.C.string_format # full_name setter triggers the parse self.full_name = full_name @@ -127,7 +128,7 @@ def __unicode__(self): if self.string_format: # string_format = "{title} {first} {middle} {last} {suffix} ({nickname})" _s = self.string_format.format(**self.as_dict()) - # remove trailing punctation from missing nicknames + # remove trailing punctuation from missing nicknames _s = _s.replace(str(self.C.empty_attribute_default),'').replace(" ()","").replace(" ''","").replace(' ""',"") return self.collapse_whitespace(_s).strip(', ') return " ".join(self) @@ -135,7 +136,7 @@ def __unicode__(self): def __str__(self): if sys.version >= '3': return self.__unicode__() - return self.__unicode__().encode(self.ENCODING) + return self.__unicode__().encode(self.encoding) def __repr__(self): if self.unparsable: 
@@ -152,7 +153,7 @@ def __repr__(self): } if sys.version >= '3': return _string - return _string.encode(self.ENCODING) + return _string.encode(self.encoding) def as_dict(self, include_empty=True): """ @@ -355,7 +356,7 @@ def full_name(self, value): self.original = value self._full_name = value if isinstance(value, binary_type): - self._full_name = value.decode(self.ENCODING) + self._full_name = value.decode(self.encoding) self.parse_full_name() def collapse_whitespace(self, string): diff --git a/tests.py b/tests.py index 58dd05d..8210e94 100644 --- a/tests.py +++ b/tests.py @@ -189,6 +189,7 @@ def test_blank_name(self): self.m(hn.first, "", hn) self.m(hn.last, "", hn) + class FirstNameHandlingTests(HumanNameTestBase): def test_first_name(self): hn = HumanName("Andrew") @@ -1058,7 +1059,6 @@ def test119(self): self.m(hn.last, "Almighty", hn) - class HumanNameConjunctionTestCase(HumanNameTestBase): # Last name with conjunction def test_last_name_with_conjunction(self): @@ -1244,6 +1244,7 @@ def test_conjunction_in_an_address_with_a_first_name_title(self): # if you want to be technical, Queen is in FIRST_NAME_TITLES self.m(hn.first, "Elizabeth", hn) + class ConstantsCustomization(HumanNameTestBase): def test_add_title(self): @@ -1335,6 +1336,12 @@ def test_none_empty_attribute_string_formatting(self): hn.C.empty_attribute_default = None self.assertEqual('', str(hn), hn) + def test_add_constant_with_explicit_encoding(self): + c = Constants() + c.titles.add_with_encoding(b'b\351ck', encoding='latin_1') + self.assertIn('béck', c.titles) + + class HumanNameNicknameTestCase(HumanNameTestBase): # https://code.google.com/p/python-nameparser/issues/detail?id=33 def test_nickname_in_parenthesis(self): From e42699a6d07788b5cef73ba1d6e579814af37457 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Wed, 6 Dec 2017 20:34:50 -0800 Subject: [PATCH 043/163] Fix #66 handling of names composed entirely of conjunctions --- docs/release_log.rst | 1 + nameparser/parser.py | 10 
+++++++--- tests.py | 4 ++++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/docs/release_log.rst b/docs/release_log.rst index 725f9a2..f626358 100644 --- a/docs/release_log.rst +++ b/docs/release_log.rst @@ -4,6 +4,7 @@ Release Log - Add Dr to suffixes (#62) - Add the full set of Italian derivatives from "di" (#59) - Add parameter to specify the encoding of strings added to constants, use 'UTF-8' as fallback (#67) + - Fix handling of names composed entirely of conjunctions (#66) * 0.5.3 - June 27, 2017 - Remove emojis from initial string by default with option to include emojis (#58) * 0.5.2 - March 19, 2017 diff --git a/nameparser/parser.py b/nameparser/parser.py index f6748e5..1370cb2 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -641,7 +641,7 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): """ length = len(pieces) + additional_parts_count - # don't join on conjuctions if there's only 2 parts + # don't join on conjunctions if there's only 2 parts if length < 3: return pieces @@ -658,7 +658,7 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): for i, val in enumerate(conj_index): try: if conj_index[i+1] == val+1: - contiguous_conj_i += [val] + contiguous_conj_i += [val] except IndexError: pass @@ -680,7 +680,11 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): for i in reversed(delete_i): # delete pieces in reverse order or the index changes on each delete del pieces[i] - + + if len(pieces) == 1: + # if there's only one piece left, nothing left to do + return pieces + # refresh conjunction index locations conj_index = [i for i, piece in enumerate(pieces) if self.is_conjunction(piece)] diff --git a/tests.py b/tests.py index 8210e94..3123a6f 100644 --- a/tests.py +++ b/tests.py @@ -1244,6 +1244,10 @@ def test_conjunction_in_an_address_with_a_first_name_title(self): # if you want to be technical, Queen is in FIRST_NAME_TITLES self.m(hn.first, "Elizabeth", hn) + def 
test_name_is_conjunctions(self): + hn = HumanName("e and e") + self.m(hn.first, "e and e", hn) + class ConstantsCustomization(HumanNameTestBase): From a22705de594563e2064bda127bb966e38112c827 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Wed, 6 Dec 2017 20:43:10 -0800 Subject: [PATCH 044/163] --use-mirrors was deprecated in 2015 i guess --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 84bf5dd..dc37c42 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,7 +9,7 @@ python: - "3.6" # command to install dependencies install: - - if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install --use-mirrors unittest2; fi + - if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install unittest2; fi - "pip install dill" - "python setup.py install" # command to run tests From 0f3aa875a181b475afca1540a1235201cfde4c72 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Wed, 10 Jan 2018 18:12:24 -0800 Subject: [PATCH 045/163] v0.5.5, Support J.D. as suffix and Wm. as title, fix #68 --- docs/release_log.rst | 2 ++ nameparser/__init__.py | 2 +- nameparser/config/suffixes.py | 1 + nameparser/config/titles.py | 1 + 4 files changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/release_log.rst b/docs/release_log.rst index f626358..1e12826 100644 --- a/docs/release_log.rst +++ b/docs/release_log.rst @@ -1,5 +1,7 @@ Release Log =========== +* 0.5.5 - January 10, 2018 + - Support J.D. as suffix and Wm. 
as title * 0.5.4 - December 10, 2017 - Add Dr to suffixes (#62) - Add the full set of Italian derivatives from "di" (#59) diff --git a/nameparser/__init__.py b/nameparser/__init__.py index 074bf5e..75e716c 100644 --- a/nameparser/__init__.py +++ b/nameparser/__init__.py @@ -1,4 +1,4 @@ -VERSION = (0, 5, 4) +VERSION = (0, 5, 5) __version__ = '.'.join(map(str, VERSION)) __author__ = "Derek Gulbranson" __author_email__ = 'derek73@gmail.com' diff --git a/nameparser/config/suffixes.py b/nameparser/config/suffixes.py index 9cb3345..c058ce6 100644 --- a/nameparser/config/suffixes.py +++ b/nameparser/config/suffixes.py @@ -73,6 +73,7 @@ 'idsm', 'iom', 'iso', + 'jd', 'kbe', 'kcb', 'kcie', diff --git a/nameparser/config/titles.py b/nameparser/config/titles.py index 0f37a19..131738f 100644 --- a/nameparser/config/titles.py +++ b/nameparser/config/titles.py @@ -618,6 +618,7 @@ 'warden', 'warrant', 'wing', + 'wm', 'wo-1', 'wo1', 'wo2', From fb1475af3b939fcbdc679daf352d940376dac89e Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 15 Jan 2018 15:28:23 -0800 Subject: [PATCH 046/163] v0.5.6 --- docs/release_log.rst | 2 ++ nameparser/__init__.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/release_log.rst b/docs/release_log.rst index 1e12826..dbae5db 100644 --- a/docs/release_log.rst +++ b/docs/release_log.rst @@ -1,5 +1,7 @@ Release Log =========== +* 0.5.6 - January 15, 2018 + - Fix python version check (#64) * 0.5.5 - January 10, 2018 - Support J.D. as suffix and Wm. as title * 0.5.4 - December 10, 2017 diff --git a/nameparser/__init__.py b/nameparser/__init__.py index 75e716c..d3efaaf 100644 --- a/nameparser/__init__.py +++ b/nameparser/__init__.py @@ -1,4 +1,4 @@ -VERSION = (0, 5, 5) +VERSION = (0, 5, 6) __version__ = '.'.join(map(str, VERSION)) __author__ = "Derek Gulbranson" __author_email__ = 'derek73@gmail.com' From 7892f83756ddafddf316dac9d91caccf7f0bc3a1 Mon Sep 17 00:00:00 2001 From: "Kelvin S. 
do Prado" Date: Mon, 7 May 2018 07:48:21 -0300 Subject: [PATCH 047/163] Fix 'Publishing to Pypi Guide' link --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2e1b8ae..329716d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -67,7 +67,7 @@ don't blow up, so it can be a helpful regression indicator. New Releases ------------ -[https://hynek.me/articles/sharing-your-labor-of-love-pypi-quick-and-dirty/](Publishing to Pypi Guide) +[Publishing to Pypi Guide](https://hynek.me/articles/sharing-your-labor-of-love-pypi-quick-and-dirty/) $ python setup.py sdist bdist_wheel $ twine upload dist/* From d329f38f8b69eed5fd62dd8a92ce5544bbfd1064 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Wed, 13 Jun 2018 17:42:56 -0700 Subject: [PATCH 048/163] note python 3.6 support --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 4986703..4b71aca 100755 --- a/setup.py +++ b/setup.py @@ -33,6 +33,7 @@ def read(fname): 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', 'Development Status :: 5 - Production/Stable', 'Natural Language :: English', "Topic :: Software Development :: Libraries :: Python Modules", From 811ed9d6df3e85f386831f69c062dd8bda98935a Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Wed, 13 Jun 2018 17:54:27 -0700 Subject: [PATCH 049/163] handle parsing some Portuguese prefixes (#72 & 71) --- nameparser/config/prefixes.py | 4 +++- tests.py | 13 +++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/nameparser/config/prefixes.py b/nameparser/config/prefixes.py index 21c82fa..b2a9386 100644 --- a/nameparser/config/prefixes.py +++ b/nameparser/config/prefixes.py @@ -19,8 +19,10 @@ 'dello', 'der', 'di', - 'du', 'dí', + 'do', + 'dos', + 'du', 'ibn', 'la', 'le', diff --git a/tests.py b/tests.py index 31a7c19..7b4a740 
100644 --- a/tests.py +++ b/tests.py @@ -1248,6 +1248,19 @@ def test_name_is_conjunctions(self): hn = HumanName("e and e") self.m(hn.first, "e and e", hn) + def test_portuguese_dos(self): + hn = HumanName("Rafael Sousa dos Anjos") + self.m(hn.first, "Rafael", hn) + self.m(hn.middle, "Sousa", hn) + self.m(hn.last, "dos Anjos", hn) + + def test_portuguese_prefixes(self): + hn = HumanName("Joao da Silva do Amaral de Souza") + self.m(hn.first, "Joao", hn) + self.m(hn.middle, "da Silva do Amaral de", hn) + self.m(hn.last, "Souza", hn) + + class ConstantsCustomization(HumanNameTestBase): From 61fc1b4e798c9ba0a25ccf6f853b9401dea06f68 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Fri, 15 Jun 2018 18:20:43 -0700 Subject: [PATCH 050/163] v0.5.7 --- docs/release_log.rst | 3 +++ nameparser/__init__.py | 2 +- tests.py | 5 +++++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/docs/release_log.rst b/docs/release_log.rst index dbae5db..d27a576 100644 --- a/docs/release_log.rst +++ b/docs/release_log.rst @@ -1,5 +1,8 @@ Release Log =========== +* 0.5.7 - June 16, 2018 + - Fix doc link (#73) + - Fix handling of "do" and "dos" Portuguese prefixes (#71, #72) * 0.5.6 - January 15, 2018 - Fix python version check (#64) * 0.5.5 - January 10, 2018 diff --git a/nameparser/__init__.py b/nameparser/__init__.py index d3efaaf..5e4b808 100644 --- a/nameparser/__init__.py +++ b/nameparser/__init__.py @@ -1,4 +1,4 @@ -VERSION = (0, 5, 6) +VERSION = (0, 5, 7) __version__ = '.'.join(map(str, VERSION)) __author__ = "Derek Gulbranson" __author_email__ = 'derek73@gmail.com' diff --git a/tests.py b/tests.py index 7b4a740..dc5b2e0 100644 --- a/tests.py +++ b/tests.py @@ -1887,6 +1887,11 @@ def test_short_names_with_mac(self): hn.capitalize() self.m(str(hn), 'Mack Johnson', hn) + def test_portuguese_prefixes(self): + hn = HumanName("joao da silva do amaral de souza") + hn.capitalize() + self.m(str(hn), 'Joao da Silva do Amaral de Souza', hn) + class 
HumanNameOutputFormatTests(HumanNameTestBase): def test_formatting_init_argument(self): From 93f9ec288f4a12bcf67b734900f3be35c59b3022 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Fri, 15 Jun 2018 18:52:02 -0700 Subject: [PATCH 051/163] pep8 tests and fix some test name clashes --- tests.py | 153 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 77 insertions(+), 76 deletions(-) diff --git a/tests.py b/tests.py index dc5b2e0..1d34415 100644 --- a/tests.py +++ b/tests.py @@ -53,7 +53,6 @@ def m(self, actual, expected, hn): self.assertEquals(actual, expected) - class HumanNamePythonTests(HumanNameTestBase): def test_utf8(self): @@ -77,12 +76,12 @@ def test_len(self): hn = HumanName("John Doe") self.m(len(hn), 2, hn) - @unittest.skipUnless(dill,"requires python-dill module to test pickling") + @unittest.skipUnless(dill, "requires python-dill module to test pickling") def test_config_pickle(self): - C = Constants() - self.assertTrue(dill.pickles(C)) + constants = Constants() + self.assertTrue(dill.pickles(constants)) - @unittest.skipUnless(dill,"requires python-dill module to test pickling") + @unittest.skipUnless(dill, "requires python-dill module to test pickling") def test_name_instance_pickle(self): hn = HumanName("Title First Middle Middle Last, Jr.") self.assertTrue(dill.pickles(hn)) @@ -91,7 +90,7 @@ def test_comparison(self): hn1 = HumanName("Doe-Ray, Dr. John P., CLU, CFP, LUTC") hn2 = HumanName("Dr. John P. Doe-Ray, CLU, CFP, LUTC") self.assertTrue(hn1 == hn2) - self.assertTrue(not hn1 is hn2) + self.assertTrue(hn1 is not hn2) self.assertTrue(hn1 == "Dr. John P. Doe-Ray CLU, CFP, LUTC") hn1 = HumanName("Doe, Dr. John P., CLU, CFP, LUTC") hn2 = HumanName("Dr. John P. 
Doe-Ray, CLU, CFP, LUTC") @@ -127,17 +126,17 @@ def test_assignment_to_attribute(self): with self.assertRaises(TypeError): hn.suffix = [['test']] with self.assertRaises(TypeError): - hn.suffix = {"test":"test"} + hn.suffix = {"test": "test"} def test_assign_list_to_attribute(self): hn = HumanName("John A. Kenneth Doe, Jr.") - hn.title = ["test1","test2"] + hn.title = ["test1", "test2"] self.m(hn.title, "test1 test2", hn) - hn.first = ["test3","test4"] + hn.first = ["test3", "test4"] self.m(hn.first, "test3 test4", hn) - hn.middle = ["test5","test6","test7"] + hn.middle = ["test5", "test6", "test7"] self.m(hn.middle, "test5 test6 test7", hn) - hn.last = ["test8","test9","test10"] + hn.last = ["test8", "test9", "test10"] self.m(hn.last, "test8 test9 test10", hn) hn.suffix = ['test'] self.m(hn.suffix, "test", hn) @@ -146,13 +145,13 @@ def test_comparison_case_insensitive(self): hn1 = HumanName("Doe-Ray, Dr. John P., CLU, CFP, LUTC") hn2 = HumanName("dr. john p. doe-Ray, CLU, CFP, LUTC") self.assertTrue(hn1 == hn2) - self.assertTrue(not hn1 is hn2) + self.assertTrue(hn1 is not hn2) self.assertTrue(hn1 == "Dr. John P. Doe-ray clu, CFP, LUTC") def test_slice(self): hn = HumanName("Doe-Ray, Dr. John P., CLU, CFP, LUTC") self.m(list(hn), ['Dr.', 'John', 'P.', 'Doe-Ray', 'CLU, CFP, LUTC'], hn) - self.m(hn[1:], ['John', 'P.', 'Doe-Ray', 'CLU, CFP, LUTC',hn.C.empty_attribute_default], hn) + self.m(hn[1:], ['John', 'P.', 'Doe-Ray', 'CLU, CFP, LUTC', hn.C.empty_attribute_default], hn) self.m(hn[1:-2], ['John', 'P.', 'Doe-Ray'], hn) def test_getitem(self): @@ -167,12 +166,12 @@ def test_setitem(self): hn = HumanName("Dr. John A. 
Kenneth Doe, Jr.") hn['title'] = 'test' self.m(hn['title'], "test", hn) - hn['last'] = ['test','test2'] + hn['last'] = ['test', 'test2'] self.m(hn['last'], "test test2", hn) with self.assertRaises(TypeError): hn["suffix"] = [['test']] with self.assertRaises(TypeError): - hn["suffix"] = {"test":"test"} + hn["suffix"] = {"test": "test"} def test_conjunction_names(self): hn = HumanName("johnny y") @@ -1261,31 +1260,30 @@ def test_portuguese_prefixes(self): self.m(hn.last, "Souza", hn) - class ConstantsCustomization(HumanNameTestBase): def test_add_title(self): hn = HumanName("Te Awanui-a-Rangi Black", constants=None) hn.C.titles.add('te') hn.parse_full_name() - self.m(hn.title,"Te", hn) - self.m(hn.first,"Awanui-a-Rangi", hn) - self.m(hn.last,"Black", hn) + self.m(hn.title, "Te", hn) + self.m(hn.first, "Awanui-a-Rangi", hn) + self.m(hn.last, "Black", hn) def test_remove_title(self): hn = HumanName("Hon Solo", constants=None) hn.C.titles.remove('hon') hn.parse_full_name() - self.m(hn.first,"Hon", hn) - self.m(hn.last,"Solo", hn) + self.m(hn.first, "Hon", hn) + self.m(hn.last, "Solo", hn) def test_add_multiple_arguments(self): hn = HumanName("Assoc Dean of Chemistry Robert Johns", constants=None) hn.C.titles.add('dean', 'Chemistry') hn.parse_full_name() - self.m(hn.title,"Assoc Dean of Chemistry", hn) - self.m(hn.first,"Robert", hn) - self.m(hn.last,"Johns", hn) + self.m(hn.title, "Assoc Dean of Chemistry", hn) + self.m(hn.first, "Robert", hn) + self.m(hn.last, "Johns", hn) def test_instances_can_have_own_constants(self): hn = HumanName("", None) @@ -1295,8 +1293,7 @@ def test_instances_can_have_own_constants(self): self.assertEqual(hn.has_own_config, True) self.assertEqual('hon' in hn2.C.titles, True) self.assertEqual(hn2.has_own_config, False) - - + def test_can_change_global_constants(self): hn = HumanName("") hn2 = HumanName("") @@ -1312,18 +1309,18 @@ def test_remove_multiple_arguments(self): hn = HumanName("Ms Hon Solo", constants=None) hn.C.titles.remove('hon', 
'ms') hn.parse_full_name() - self.m(hn.first,"Ms", hn) - self.m(hn.middle,"Hon", hn) - self.m(hn.last,"Solo", hn) + self.m(hn.first, "Ms", hn) + self.m(hn.middle, "Hon", hn) + self.m(hn.last, "Solo", hn) def test_chain_multiple_arguments(self): hn = HumanName("Dean Ms Hon Solo", constants=None) hn.C.titles.remove('hon', 'ms').add('dean') hn.parse_full_name() - self.m(hn.title,"Dean", hn) - self.m(hn.first,"Ms", hn) - self.m(hn.middle,"Hon", hn) - self.m(hn.last,"Solo", hn) + self.m(hn.title, "Dean", hn) + self.m(hn.first, "Ms", hn) + self.m(hn.middle, "Hon", hn) + self.m(hn.last, "Solo", hn) def test_empty_attribute_default(self): from nameparser.config import CONSTANTS @@ -1421,7 +1418,7 @@ def test_single_quotes_on_end_of_last_name_not_treated_as_nickname(self): self.m(hn.last, "Aube'", hn) self.m(hn.nickname, "", hn) - #http://code.google.com/p/python-nameparser/issues/detail?id=17 + # http://code.google.com/p/python-nameparser/issues/detail?id=17 def test_parenthesis_are_removed(self): hn = HumanName("John Jones (Google Docs)") self.m(hn.first, "John", hn) @@ -1435,6 +1432,7 @@ def test_parenthesis_are_removed2(self): self.m(hn.last, "Jones", hn) self.m(hn.suffix, "Jr.", hn) + class PrefixesTestCase(HumanNameTestBase): def test_prefix(self): @@ -1543,7 +1541,7 @@ def test_phd_with_erroneous_space(self): self.m(hn.last, "Smith", hn) self.m(hn.suffix, "Ph. 
D.", hn) - #http://en.wikipedia.org/wiki/Ma_(surname) + # http://en.wikipedia.org/wiki/Ma_(surname) def test_potential_suffix_that_is_also_last_name(self): hn = HumanName("Jack Ma") self.m(hn.first, "Jack", hn) @@ -1586,23 +1584,23 @@ def test_king(self): self.m(hn.last, "King", hn) self.m(hn.suffix, "Jr", hn) - def test_suffix_with_periods(self): + def test_multiple_letter_suffix_with_periods(self): hn = HumanName("John Doe Msc.Ed.") - self.m(hn.first,"John", hn) - self.m(hn.last,"Doe", hn) - self.m(hn.suffix,"Msc.Ed.", hn) + self.m(hn.first, "John", hn) + self.m(hn.last, "Doe", hn) + self.m(hn.suffix, "Msc.Ed.", hn) def test_suffix_with_periods_with_comma(self): hn = HumanName("John Doe, Msc.Ed.") - self.m(hn.first,"John", hn) - self.m(hn.last,"Doe", hn) - self.m(hn.suffix,"Msc.Ed.", hn) + self.m(hn.first, "John", hn) + self.m(hn.last, "Doe", hn) + self.m(hn.suffix, "Msc.Ed.", hn) def test_suffix_with_periods_with_lastname_comma(self): hn = HumanName("Doe, John Msc.Ed.") - self.m(hn.first,"John", hn) - self.m(hn.last,"Doe", hn) - self.m(hn.suffix,"Msc.Ed.", hn) + self.m(hn.first, "John", hn) + self.m(hn.last, "Doe", hn) + self.m(hn.suffix, "Msc.Ed.", hn) class TitleTestCase(HumanNameTestBase): @@ -1652,7 +1650,7 @@ def test_title_is_title(self): # TODO: fix handling of U.S. @unittest.expectedFailure - def test_chained_title_first_name_initial(self): + def test_chained_title_first_name_title_is_initials(self): hn = HumanName("U.S. District Judge Marc Thomas Treadwell") self.m(hn.title, "U.S. 
District Judge", hn) self.m(hn.first, "Marc", hn) @@ -1665,7 +1663,7 @@ def test_conflict_with_chained_title_first_name_initial(self): self.m(hn.middle, "S.", hn) self.m(hn.last, "Grant", hn) - def test_chained_title_first_name_initial(self): + def test_chained_title_first_name_initial_with_no_period(self): hn = HumanName("US Magistrate Judge T Michael Putnam") self.m(hn.title, "US Magistrate Judge", hn) self.m(hn.first, "T", hn) @@ -1781,8 +1779,8 @@ def test_possible_conflict_with_suffix_that_could_be_initial(self): @unittest.expectedFailure def test_ben_as_conjunction(self): hn = HumanName("Ahmad ben Husain") - self.m(hn.first,"Ahmad", hn) - self.m(hn.last,"ben Husain", hn) + self.m(hn.first, "Ahmad", hn) + self.m(hn.last, "ben Husain", hn) def test_ben_as_first_name(self): hn = HumanName("Ben Johnson") @@ -1809,15 +1807,16 @@ def test_last_name_also_prefix(self): def test_title_with_periods(self): hn = HumanName("Lt.Gov. John Doe") - self.m(hn.title,"Lt.Gov.", hn) - self.m(hn.first,"John", hn) - self.m(hn.last,"Doe", hn) + self.m(hn.title, "Lt.Gov.", hn) + self.m(hn.first, "John", hn) + self.m(hn.last, "Doe", hn) def test_title_with_periods_lastname_comma(self): hn = HumanName("Doe, Lt.Gov. John") - self.m(hn.title,"Lt.Gov.", hn) - self.m(hn.first,"John", hn) - self.m(hn.last,"Doe", hn) + self.m(hn.title, "Lt.Gov.", hn) + self.m(hn.first, "John", hn) + self.m(hn.last, "Doe", hn) + class HumanNameCapitalizationTestCase(HumanNameTestBase): def test_capitalization_exception_for_III(self): @@ -1892,11 +1891,12 @@ def test_portuguese_prefixes(self): hn.capitalize() self.m(str(hn), 'Joao da Silva do Amaral de Souza', hn) + class HumanNameOutputFormatTests(HumanNameTestBase): def test_formatting_init_argument(self): - hn = HumanName("Rev John A. Kenneth Doe III (Kenny)", - string_format = "TEST1") + hn = HumanName("Rev John A. 
Kenneth Doe III (Kenny)", + string_format="TEST1") self.assertEqual(u(hn), "TEST1") def test_formatting_constants_attribute(self): @@ -1935,51 +1935,51 @@ def test_formating_removing_pieces_from_name_buckets(self): self.assertEqual(u(hn), "Rev John A. Kenneth Doe III 'Kenny'") hn.string_format = "{title} {first} {middle} {last} {suffix}" self.assertEqual(u(hn), "Rev John A. Kenneth Doe III") - hn.middle='' + hn.middle = '' self.assertEqual(u(hn), "Rev John Doe III") - hn.suffix='' + hn.suffix = '' self.assertEqual(u(hn), "Rev John Doe") - hn.title='' + hn.title = '' self.assertEqual(u(hn), "John Doe") def test_formating_of_nicknames_with_parenthesis(self): hn = HumanName("Rev John A. Kenneth Doe III (Kenny)") hn.string_format = "{title} {first} {middle} {last} {suffix} ({nickname})" self.assertEqual(u(hn), "Rev John A. Kenneth Doe III (Kenny)") - hn.nickname='' + hn.nickname = '' self.assertEqual(u(hn), "Rev John A. Kenneth Doe III") def test_formating_of_nicknames_with_single_quotes(self): hn = HumanName("Rev John A. Kenneth Doe III (Kenny)") hn.string_format = "{title} {first} {middle} {last} {suffix} '{nickname}'" self.assertEqual(u(hn), "Rev John A. Kenneth Doe III 'Kenny'") - hn.nickname='' + hn.nickname = '' self.assertEqual(u(hn), "Rev John A. Kenneth Doe III") def test_formating_of_nicknames_with_double_quotes(self): hn = HumanName("Rev John A. Kenneth Doe III (Kenny)") hn.string_format = "{title} {first} {middle} {last} {suffix} \"{nickname}\"" self.assertEqual(u(hn), "Rev John A. Kenneth Doe III \"Kenny\"") - hn.nickname='' + hn.nickname = '' self.assertEqual(u(hn), "Rev John A. Kenneth Doe III") def test_formating_of_nicknames_in_middle(self): hn = HumanName("Rev John A. Kenneth Doe III (Kenny)") hn.string_format = "{title} {first} ({nickname}) {middle} {last} {suffix}" self.assertEqual(u(hn), "Rev John (Kenny) A. Kenneth Doe III") - hn.nickname='' + hn.nickname = '' self.assertEqual(u(hn), "Rev John A. 
Kenneth Doe III") def test_remove_emojis(self): hn = HumanName("Sam Smith 😊") - self.m(hn.first,"Sam", hn) - self.m(hn.last,"Smith", hn) + self.m(hn.first, "Sam", hn) + self.m(hn.last, "Smith", hn) self.assertEqual(u(hn), "Sam Smith") def test_keep_non_emojis(self): hn = HumanName("∫≜⩕ Smith 😊") - self.m(hn.first,"∫≜⩕", hn) - self.m(hn.last,"Smith", hn) + self.m(hn.first, "∫≜⩕", hn) + self.m(hn.last, "Smith", hn) self.assertEqual(u(hn), "∫≜⩕ Smith") def test_keep_emojis(self): @@ -1987,11 +1987,12 @@ def test_keep_emojis(self): constants = Constants() constants.regexes.emoji = False hn = HumanName("∫≜⩕ Smith😊", constants) - self.m(hn.first,"∫≜⩕", hn) - self.m(hn.last,"Smith😊", hn) + self.m(hn.first, "∫≜⩕", hn) + self.m(hn.last, "Smith😊", hn) self.assertEqual(u(hn), "∫≜⩕ Smith😊") # test cleanup + TEST_NAMES = ( "John Doe", "John Doe, Jr.", @@ -2180,7 +2181,7 @@ def test_variations_of_TEST_NAMES(self): hn = HumanName(name) if len(hn.suffix_list) > 1: hn = HumanName("{title} {first} {middle} {last} {suffix}".format(**hn.as_dict()).split(',')[0]) - hn.C.empty_attribute_default = '' # format strings below require empty string + hn.C.empty_attribute_default = '' # format strings below require empty string hn_dict = hn.as_dict() attrs = [ 'title', @@ -2212,11 +2213,11 @@ def test_variations_of_TEST_NAMES(self): if len(sys.argv) > 1: log.setLevel(logging.ERROR) log.addHandler(logging.StreamHandler()) - name = sys.argv[1] - hn = HumanName(name, encoding=sys.stdout.encoding) - print((repr(hn))) - hn.capitalize() - print((repr(hn))) + name_string = sys.argv[1] + hn_instance = HumanName(name_string, encoding=sys.stdout.encoding) + print((repr(hn_instance))) + hn_instance.capitalize() + print((repr(hn_instance))) else: print("-"*80) print("Running tests") From cbc660218ad8bb3224f6b261c70e93232fab6c5c Mon Sep 17 00:00:00 2001 From: abnerjacobsen Date: Sun, 19 Aug 2018 19:29:29 -0300 Subject: [PATCH 052/163] Added title for female doctor in portuguese --- 
nameparser/config/titles.py | 1 + tests.py | 1 + 2 files changed, 2 insertions(+) diff --git a/nameparser/config/titles.py b/nameparser/config/titles.py index 131738f..c2af521 100644 --- a/nameparser/config/titles.py +++ b/nameparser/config/titles.py @@ -245,6 +245,7 @@ 'doyen', 'dpty', 'dr', + 'dra', 'dramatist', 'druid', 'drummer', diff --git a/tests.py b/tests.py index 1d34415..d741023 100644 --- a/tests.py +++ b/tests.py @@ -2166,6 +2166,7 @@ def test_keep_emojis(self): "Designated Judge David A. Ezra", "Sr US District Judge Richard G Kopf", "U.S. District Judge Marc Thomas Treadwell", + "Dra. Andréia da Silva", ) From e447e68fc364f41643b12d3446c59d7cea0ffbef Mon Sep 17 00:00:00 2001 From: abnerjacobsen Date: Sun, 19 Aug 2018 19:35:59 -0300 Subject: [PATCH 053/163] Added title for title for unmarried woman in Portuguese --- nameparser/config/titles.py | 1 + tests.py | 1 + 2 files changed, 2 insertions(+) diff --git a/nameparser/config/titles.py b/nameparser/config/titles.py index c2af521..c675736 100644 --- a/nameparser/config/titles.py +++ b/nameparser/config/titles.py @@ -567,6 +567,7 @@ 'special', 'sr', 'sra', + 'srta', 'ssg', 'ssgt', 'staff', diff --git a/tests.py b/tests.py index d741023..0fe6e7a 100644 --- a/tests.py +++ b/tests.py @@ -2167,6 +2167,7 @@ def test_keep_emojis(self): "Sr US District Judge Richard G Kopf", "U.S. District Judge Marc Thomas Treadwell", "Dra. Andréia da Silva", + "Srta. 
Andréia da Silva", ) From 8de648f4e18eebd9cf8242328a6332b2ff1b9d1e Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Sun, 19 Aug 2018 16:11:14 -0700 Subject: [PATCH 054/163] add "junior" to suffixes, fix #76 --- nameparser/config/suffixes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nameparser/config/suffixes.py b/nameparser/config/suffixes.py index c058ce6..7f01581 100644 --- a/nameparser/config/suffixes.py +++ b/nameparser/config/suffixes.py @@ -7,6 +7,7 @@ 'esquire', 'jr', 'jnr', + 'junior', 'sr', 'snr', '2', From c6fa80c11542bb5bd3017c1d0749f41544e59b0c Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Sun, 19 Aug 2018 16:30:54 -0700 Subject: [PATCH 055/163] v0.5.8 --- README.rst | 6 +++--- docs/release_log.rst | 3 +++ nameparser/__init__.py | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/README.rst b/README.rst index da2265c..c5419fe 100644 --- a/README.rst +++ b/README.rst @@ -52,9 +52,9 @@ install with pip using the command below. ``pip install -e git+git://github.com/derek73/python-nameparser.git#egg=nameparser`` -If you're looking for a web service, check out -`eyeseast's nameparse service `_, a -simple Heroku-friendly Flask wrapper for this module. +If you need to handle lists of names, check out +`namesparser `_, a + compliment to this module that handles multiple names in a string. 
Quick Start Example diff --git a/docs/release_log.rst b/docs/release_log.rst index d27a576..869a2c9 100644 --- a/docs/release_log.rst +++ b/docs/release_log.rst @@ -1,5 +1,8 @@ Release Log =========== +* 0.5.8 - August 19, 2018 + - Add "Junior" to suffixes (#76) + - Add "dra" and "srta" to titles (#77) * 0.5.7 - June 16, 2018 - Fix doc link (#73) - Fix handling of "do" and "dos" Portuguese prefixes (#71, #72) diff --git a/nameparser/__init__.py b/nameparser/__init__.py index 5e4b808..a3bea61 100644 --- a/nameparser/__init__.py +++ b/nameparser/__init__.py @@ -1,4 +1,4 @@ -VERSION = (0, 5, 7) +VERSION = (0, 5, 8) __version__ = '.'.join(map(str, VERSION)) __author__ = "Derek Gulbranson" __author_email__ = 'derek73@gmail.com' From 60876af986e2814ace30c8f7b0d53d0714e14486 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Sun, 19 Aug 2018 16:31:25 -0700 Subject: [PATCH 056/163] add 3.7-dev and 3.8-dev --- .travis.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index dc37c42..82e3782 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,6 +7,8 @@ python: - "3.4" - "3.5" - "3.6" + - "3.7-dev" + - "3.8-dev" # command to install dependencies install: - if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install unittest2; fi From 37ac3301bb2ab77eb477fd314adadc7b2fb9177f Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Sun, 19 Aug 2018 16:36:15 -0700 Subject: [PATCH 057/163] python v3.7 --- .travis.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 82e3782..4da19f2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,8 +7,7 @@ python: - "3.4" - "3.5" - "3.6" - - "3.7-dev" - - "3.8-dev" + - "3.7" # command to install dependencies install: - if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install unittest2; fi From 7e26a6a56def141f7d2876622e26d481527cdd73 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Sun, 19 Aug 2018 16:38:26 -0700 Subject: [PATCH 058/163] no python v3.7 yet --- 
.travis.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 4da19f2..dc37c42 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,7 +7,6 @@ python: - "3.4" - "3.5" - "3.6" - - "3.7" # command to install dependencies install: - if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install unittest2; fi From 03e580c05c1de4c5e58dac9219b551f9b9abd93c Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Thu, 30 Aug 2018 12:49:02 -0700 Subject: [PATCH 059/163] v1.0, fix support for nicknames in single quotes (#74) --- docs/release_log.rst | 3 +++ nameparser/__init__.py | 2 +- nameparser/config/regexes.py | 4 +++- nameparser/parser.py | 28 +++++++++++++++++--------- tests.py | 38 ++++++++++++++++++++++++++---------- 5 files changed, 54 insertions(+), 21 deletions(-) diff --git a/docs/release_log.rst b/docs/release_log.rst index 869a2c9..a819e90 100644 --- a/docs/release_log.rst +++ b/docs/release_log.rst @@ -1,5 +1,8 @@ Release Log =========== +* 1.0.0 - August 30, 2018 + - Fix support for nicknames in single quotes (#74) + - No other big changes, just bumping to v1 to indicate approprite project maturity * 0.5.8 - August 19, 2018 - Add "Junior" to suffixes (#76) - Add "dra" and "srta" to titles (#77) diff --git a/nameparser/__init__.py b/nameparser/__init__.py index a3bea61..e161e63 100644 --- a/nameparser/__init__.py +++ b/nameparser/__init__.py @@ -1,4 +1,4 @@ -VERSION = (0, 5, 8) +VERSION = (1, 0, 0) __version__ = '.'.join(map(str, VERSION)) __author__ = "Derek Gulbranson" __author_email__ = 'derek73@gmail.com' diff --git a/nameparser/config/regexes.py b/nameparser/config/regexes.py index 42da85d..a333ea0 100644 --- a/nameparser/config/regexes.py +++ b/nameparser/config/regexes.py @@ -23,7 +23,9 @@ ("word", re.compile(r"(\w|\.)+", re.U)), ("mac", re.compile(r'^(ma?c)(\w{2,})', re.I | re.U)), ("initial", re.compile(r'^(\w\.|[A-Z])?$', re.U)), - ("nickname", re.compile(r'\s*?[\("](.+?)[\)"]', re.U)), + ("quoted_word", 
re.compile(r'\'([^\s]*?)\'', re.U)), + ("double_quotes", re.compile(r'\"(.*?)\"', re.U)), + ("parenthesis", re.compile(r'\((.*?)\)', re.U)), ("roman_numeral", re.compile(r'^(X|IX|IV|V?I{0,3})$', re.I | re.U)), ("no_vowels",re.compile(r'^[^aeyiuo]+$', re.I | re.U)), ("period_not_at_end",re.compile(r'.*\..+$', re.I | re.U)), diff --git a/nameparser/parser.py b/nameparser/parser.py index 1ce2812..5d6bb2c 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -384,15 +384,25 @@ def post_process(self): def parse_nicknames(self): """ - The content of parenthesis or double quotes in the name will - be treated as nicknames. This happens before any other - processing of the name. - """ - # https://code.google.com/p/python-nameparser/issues/detail?id=33 - re_nickname = self.C.regexes.nickname - if re_nickname.search(self._full_name): - self.nickname_list = re_nickname.findall(self._full_name) - self._full_name = re_nickname.sub('', self._full_name) + The content of parenthesis or quotes in the name will be added to the + nicknames list. This happens before any other processing of the name. + + Single quotes cannot span white space characters to allow for single + quotes in names like O'Connor. Double quotes and parenthesis can span + white space. + + Loops through 3 :py:data:`~nameparser.config.regexes.REGEXES`; + `quoted_word`, `double_quotes` and `parenthesis`. 
+ """ + + re_quoted_word = self.C.regexes.quoted_word + re_double_quotes = self.C.regexes.double_quotes + re_parenthesis = self.C.regexes.parenthesis + + for _re in (re_quoted_word, re_double_quotes, re_parenthesis): + if _re.search(self._full_name): + self.nickname_list += [x for x in _re.findall(self._full_name)] + self._full_name = _re.sub('', self._full_name) def squash_emoji(self): """ diff --git a/tests.py b/tests.py index 0fe6e7a..fab25f9 100644 --- a/tests.py +++ b/tests.py @@ -1365,6 +1365,20 @@ def test_nickname_in_parenthesis(self): self.m(hn.last, "Franklin", hn) self.m(hn.nickname, "Ben", hn) + def test_two_word_nickname_in_parenthesis(self): + hn = HumanName("Benjamin (Big Ben) Franklin") + self.m(hn.first, "Benjamin", hn) + self.m(hn.middle, "", hn) + self.m(hn.last, "Franklin", hn) + self.m(hn.nickname, "Big Ben", hn) + + def test_two_words_in_quotes(self): + hn = HumanName('Benjamin "Big Ben" Franklin') + self.m(hn.first, "Benjamin", hn) + self.m(hn.middle, "", hn) + self.m(hn.last, "Franklin", hn) + self.m(hn.nickname, "Big Ben", hn) + def test_nickname_in_parenthesis_with_comma(self): hn = HumanName("Franklin, Benjamin (Ben)") self.m(hn.first, "Benjamin", hn) @@ -1380,9 +1394,6 @@ def test_nickname_in_parenthesis_with_comma_and_suffix(self): self.m(hn.suffix, "Jr.", hn) self.m(hn.nickname, "Ben", hn) - # it would be hard to support this without breaking some of the - # other examples with single quotes in the names. 
- @unittest.expectedFailure def test_nickname_in_single_quotes(self): hn = HumanName("Benjamin 'Ben' Franklin") self.m(hn.first, "Benjamin", hn) @@ -1398,9 +1409,9 @@ def test_nickname_in_double_quotes(self): self.m(hn.nickname, "Ben", hn) def test_single_quotes_on_first_name_not_treated_as_nickname(self): - hn = HumanName("Brian O'connor") + hn = HumanName("Brian Andrew O'connor") self.m(hn.first, "Brian", hn) - self.m(hn.middle, "", hn) + self.m(hn.middle, "Andrew", hn) self.m(hn.last, "O'connor", hn) self.m(hn.nickname, "", hn) @@ -1419,19 +1430,26 @@ def test_single_quotes_on_end_of_last_name_not_treated_as_nickname(self): self.m(hn.nickname, "", hn) # http://code.google.com/p/python-nameparser/issues/detail?id=17 - def test_parenthesis_are_removed(self): - hn = HumanName("John Jones (Google Docs)") + def test_parenthesis_are_removed_from_name(self): + hn = HumanName("John Jones (Unknown)") self.m(hn.first, "John", hn) self.m(hn.last, "Jones", hn) # not testing the nicknames because we don't actually care - # about Google Docs. - - def test_parenthesis_are_removed2(self): + # about Google Docs here + + def test_duplicate_parenthesis_are_removed_from_name(self): hn = HumanName("John Jones (Google Docs), Jr. 
(Unknown)") self.m(hn.first, "John", hn) self.m(hn.last, "Jones", hn) self.m(hn.suffix, "Jr.", hn) + def test_parenthesis_and_quotes_together(self): + hn = HumanName("Jennifer 'Jen' Jones (Duff)") + self.m(hn.first, "Jennifer", hn) + self.m(hn.last, "Jones", hn) + self.m(hn.nickname, "Jen Duff", hn) + + class PrefixesTestCase(HumanNameTestBase): From ff6d888b62df623f73691dd860432e06145d099b Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Thu, 30 Aug 2018 15:36:25 -0700 Subject: [PATCH 060/163] Change prefix handling to support prefixes on first names, fix #60 --- docs/customize.rst | 16 ++++++++-------- docs/release_log.rst | 1 + docs/usage.rst | 2 +- nameparser/parser.py | 43 ++++++++++++++++++++++++++++++++----------- tests.py | 36 ++++++++++++++++++++++++++++++++++-- 5 files changed, 76 insertions(+), 22 deletions(-) diff --git a/docs/customize.rst b/docs/customize.rst index 7442300..b4c45ca 100644 --- a/docs/customize.rst +++ b/docs/customize.rst @@ -39,14 +39,14 @@ instantiate the :py:class:`~nameparser.parser.HumanName` class (see below). Editable attributes of nameparser.config.CONSTANTS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -* :py:obj:`~nameparser.config.Constants.titles` - Pieces that come before the name. Cannot include things that may be first names -* :py:obj:`~nameparser.config.Constants.first_name_titles` - Titles that, when followed by a single name, that name is a first name, e.g. "King David" -* :py:obj:`~nameparser.config.Constants.suffix_acronyms` - Pieces that come at the end of the name that may or may not have periods separating the letters, e.g. "m.d." -* :py:obj:`~nameparser.config.Constants.suffix_not_acronyms` - Pieces that come at the end of the name that never have periods separating the letters, e.g. "Jr." -* :py:obj:`~nameparser.config.Constants.conjunctions` - Connectors like "and" that join the preceding piece to the following piece. 
-* :py:obj:`~nameparser.config.Constants.prefixes` - Connectors like "del" and "bin" that join to the following piece but not the preceding -* :py:obj:`~nameparser.config.Constants.capitalization_exceptions` - Dictionary of pieces that do not capitalize the first letter, e.g. "Ph.D" -* :py:obj:`~nameparser.config.Constants.regexes` - Regular expressions used to find words, initials, nicknames, etc. +* :py:class:`~nameparser.config.CONSTANTS.titles` - Pieces that come before the name. Includes all `first_name_titles`. Cannot include things that may be first names. +* :py:class:`~nameparser.config.CONSTANTS.first_name_titles` - Titles that, when followed by a single name, that name is a first name, e.g. "King David". +* :py:class:`~nameparser.config.CONSTANTS.suffix_acronyms` - Pieces that come at the end of the name that may or may not have periods separating the letters, e.g. "m.d.". +* :py:class:`~nameparser.config.CONSTANTS.suffix_not_acronyms` - Pieces that come at the end of the name that never have periods separating the letters, e.g. "Jr.". +* :py:class:`~nameparser.config.CONSTANTS.conjunctions` - Connectors like "and" that join the preceding piece to the following piece. +* :py:class:`~nameparser.config.CONSTANTS.prefixes` - Connectors like "del" and "bin" that join to the following piece but not the preceding, similar to titles but can appear anywhere in the name. +* :py:class:`~nameparser.config.CONSTANTS.capitalization_exceptions` - Dictionary of pieces that do not capitalize the first letter, e.g. "Ph.D". +* :py:class:`~nameparser.config.CONSTANTS.regexes` - Regular expressions used to find words, initials, nicknames, etc. Each set of constants comes with :py:func:`~nameparser.config.SetManager.add` and :py:func:`~nameparser.config.SetManager.remove` methods for tuning the constants for your project. 
These methods automatically lower case and diff --git a/docs/release_log.rst b/docs/release_log.rst index a819e90..7f81fdf 100644 --- a/docs/release_log.rst +++ b/docs/release_log.rst @@ -2,6 +2,7 @@ Release Log =========== * 1.0.0 - August 30, 2018 - Fix support for nicknames in single quotes (#74) + - Change prefix handling to support prefixes on first names (#60) - No other big changes, just bumping to v1 to indicate approprite project maturity * 0.5.8 - August 19, 2018 - Add "Junior" to suffixes (#76) diff --git a/docs/usage.rst b/docs/usage.rst index f3ab41b..dd313d2 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -96,7 +96,7 @@ pass the parameter `force=True`. Nickname Handling ------------------ -The content of parenthesis or double quotes in the name will be +The content of parenthesis or quotes in the name will be available from the nickname attribute. .. doctest:: nicknames diff --git a/nameparser/parser.py b/nameparser/parser.py index 5d6bb2c..dbfa147 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -319,7 +319,7 @@ def is_suffix(self, piece): return ((lc(piece).replace('.','') in self.C.suffix_acronyms) \ or (lc(piece) in self.C.suffix_not_acronyms)) \ and not self.is_an_initial(piece) - + def are_suffixes(self, pieces): """Return True if all pieces are suffixes.""" for piece in pieces: @@ -444,6 +444,7 @@ def parse_full_name(self): self.last_list = [] self.suffix_list = [] self.nickname_list = [] + self.prefix_joins = [] self.unparsable = True @@ -489,6 +490,14 @@ def parse_full_name(self): self.last_list.append(piece) self.suffix_list += pieces[i+1:] break + if piece in self.prefix_joins: + last_piece = pieces[-1:][0] + if self.is_suffix(last_piece): + self.last_list += pieces[i:-1] + self.suffix = last_piece + else: + self.last_list += pieces[i:] + break if not nxt: self.last_list.append(piece) continue @@ -528,6 +537,14 @@ def parse_full_name(self): self.last_list.append(piece) self.suffix_list = pieces[i+1:] + self.suffix_list 
break + if piece in self.prefix_joins: + last_piece = pieces[-1:][0] + if self.is_suffix(last_piece): + self.last_list += pieces[i:-1] + self.suffix_list.insert(0, last_piece) + else: + self.last_list += pieces[i:] + break if not nxt: self.last_list.append(piece) continue @@ -544,7 +561,7 @@ def parse_full_name(self): # lastname part may have suffixes in it lastname_pieces = self.parse_pieces(parts[0].split(' '), 1) for piece in lastname_pieces: - # the first one is always a last name, even if it look like + # the first one is always a last name, even if it looks like # a suffix if self.is_suffix(piece) and len(self.last_list) > 0: self.suffix_list.append(piece) @@ -568,6 +585,9 @@ def parse_full_name(self): if self.is_suffix(piece): self.suffix_list.append(piece) continue + if piece in self.prefix_joins: + self.last_list += pieces[i:] + break self.middle_list.append(piece) try: if parts[2]: @@ -742,15 +762,16 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): prefixes = list(filter(self.is_prefix, pieces)) if prefixes: i = pieces.index(prefixes[0]) - # join everything after the prefix until the next suffix - next_suffix = list(filter(self.is_suffix, pieces[i:])) - if next_suffix: - j = pieces.index(next_suffix[0]) - new_piece = ' '.join(pieces[i:j]) - pieces = pieces[:i] + [new_piece] + pieces[j:] - else: - new_piece = ' '.join(pieces[i:]) - pieces = pieces[:i] + [new_piece] + # join everything after the prefix until the next non prefix + # store joined pieces in prefix_joins. When a prefix occurs in a last name, + # I think it means the rest of the name is part of the last name, so prefix_joins + # lets us do that in the parser flow. 
+ non_suffixes = list(filter(lambda x: not self.is_prefix(x), pieces[i:])) + if non_suffixes: + j = pieces.index(non_suffixes[0]) + new_piece = ' '.join(pieces[i:j + 1]) + self.prefix_joins += [new_piece] + pieces = pieces[:i] + [new_piece] + pieces[j + 1:] log.debug("pieces: {0}".format(pieces)) return pieces diff --git a/tests.py b/tests.py index fab25f9..2b9f18b 100644 --- a/tests.py +++ b/tests.py @@ -1256,8 +1256,8 @@ def test_portuguese_dos(self): def test_portuguese_prefixes(self): hn = HumanName("Joao da Silva do Amaral de Souza") self.m(hn.first, "Joao", hn) - self.m(hn.middle, "da Silva do Amaral de", hn) - self.m(hn.last, "Souza", hn) + self.m(hn.middle, "", hn) + self.m(hn.last, "da Silva do Amaral de Souza", hn) class ConstantsCustomization(HumanNameTestBase): @@ -1474,6 +1474,12 @@ def test_prefix_before_two_part_last_name_with_suffix(self): self.m(hn.last, "von bergen wessels", hn) self.m(hn.suffix, "III", hn) + def test_prefix_before_two_part_last_name_with_acronym_suffix(self): + hn = HumanName("pennie von bergen wessels M.D.") + self.m(hn.first, "pennie", hn) + self.m(hn.last, "von bergen wessels", hn) + self.m(hn.suffix, "M.D.", hn) + def test_two_part_last_name_with_suffix_comma(self): hn = HumanName("pennie von bergen wessels, III") self.m(hn.first, "pennie", hn) @@ -1486,6 +1492,32 @@ def test_two_part_last_name_with_suffix(self): self.m(hn.last, "von bergen wessels", hn) self.m(hn.suffix, "III", hn) + def test_last_name_two_part_last_name_with_two_suffixes(self): + hn = HumanName("von bergen wessels MD, pennie III") + self.m(hn.first, "pennie", hn) + self.m(hn.last, "von bergen wessels", hn) + self.m(hn.suffix, "MD, III", hn) + + def test_comma_two_part_last_name_with_acronym_suffix(self): + hn = HumanName("von bergen wessels, pennie MD") + self.m(hn.first, "pennie", hn) + self.m(hn.last, "von bergen wessels", hn) + self.m(hn.suffix, "MD", hn) + + def test_comma_two_part_last_name_with_suffix_in_first_part(self): + # I'm kinda surprised this 
works, not really sure if this is a + # realistic place for a suffix to be. + hn = HumanName("von bergen wessels MD, pennie") + self.m(hn.first, "pennie", hn) + self.m(hn.last, "von bergen wessels", hn) + self.m(hn.suffix, "MD", hn) + + def test_title_two_part_last_name_with_suffix_in_first_part(self): + hn = HumanName("pennie von bergen wessels MD, III") + self.m(hn.first, "pennie", hn) + self.m(hn.last, "von bergen wessels", hn) + self.m(hn.suffix, "MD, III", hn) + class SuffixesTestCase(HumanNameTestBase): From da7f67ccf2f70bf3f6f092efbbd53c035ef6eb5f Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Thu, 30 Aug 2018 16:44:10 -0700 Subject: [PATCH 061/163] fix capitalization of prefixes when they are not part of last name #70 --- docs/release_log.rst | 1 + nameparser/parser.py | 18 +++++++++--------- tests.py | 5 +++++ 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/docs/release_log.rst b/docs/release_log.rst index 7f81fdf..7ad032f 100644 --- a/docs/release_log.rst +++ b/docs/release_log.rst @@ -3,6 +3,7 @@ Release Log * 1.0.0 - August 30, 2018 - Fix support for nicknames in single quotes (#74) - Change prefix handling to support prefixes on first names (#60) + - Fix prefix capitalization when not part of last name (#70) - No other big changes, just bumping to v1 to indicate approprite project maturity * 0.5.8 - August 19, 2018 - Add "Junior" to suffixes (#76) diff --git a/nameparser/parser.py b/nameparser/parser.py index dbfa147..7cd1dfc 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -779,8 +779,8 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): ### Capitalization Support - def cap_word(self, word): - if self.is_prefix(word) or self.is_conjunction(word): + def cap_word(self, word, attribute): + if (self.is_prefix(word) and attribute=='last') or self.is_conjunction(word): return word.lower() exceptions = self.C.capitalization_exceptions if lc(word) in exceptions: @@ -793,10 +793,10 @@ def cap_after_mac(m): 
else: return word.capitalize() - def cap_piece(self, piece): + def cap_piece(self, piece, attribute): if not piece: return "" - replacement = lambda m: self.cap_word(m.group(0)) + replacement = lambda m: self.cap_word(m.group(0), attribute) return self.C.regexes.word.sub(replacement, piece) def capitalize(self, force=False): @@ -829,8 +829,8 @@ def capitalize(self, force=False): name = u(self) if not force and not (name == name.upper() or name == name.lower()): return - self.title_list = self.cap_piece(self.title ).split(' ') - self.first_list = self.cap_piece(self.first ).split(' ') - self.middle_list = self.cap_piece(self.middle).split(' ') - self.last_list = self.cap_piece(self.last ).split(' ') - self.suffix_list = self.cap_piece(self.suffix).split(', ') + self.title_list = self.cap_piece(self.title , 'title').split(' ') + self.first_list = self.cap_piece(self.first , 'first').split(' ') + self.middle_list = self.cap_piece(self.middle, 'middle').split(' ') + self.last_list = self.cap_piece(self.last , 'last').split(' ') + self.suffix_list = self.cap_piece(self.suffix, 'suffix').split(', ') diff --git a/tests.py b/tests.py index 2b9f18b..db9e0b0 100644 --- a/tests.py +++ b/tests.py @@ -1941,6 +1941,11 @@ def test_portuguese_prefixes(self): hn.capitalize() self.m(str(hn), 'Joao da Silva do Amaral de Souza', hn) + def test_capitalize_prefix_clash_on_first_name(self): + hn = HumanName("van nguyen") + hn.capitalize() + self.m(str(hn), 'Van Nguyen', hn) + class HumanNameOutputFormatTests(HumanNameTestBase): From 76a2b9e74a6bac09e4950f0b7ce1867d4e2623ec Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Thu, 30 Aug 2018 17:59:53 -0700 Subject: [PATCH 062/163] Handle erroneous space in "Ph. 
D.", fix #43 --- docs/release_log.rst | 4 ++-- nameparser/config/regexes.py | 1 + nameparser/parser.py | 20 ++++++++++++++++---- tests.py | 1 - 4 files changed, 19 insertions(+), 7 deletions(-) diff --git a/docs/release_log.rst b/docs/release_log.rst index 7ad032f..2878ccf 100644 --- a/docs/release_log.rst +++ b/docs/release_log.rst @@ -3,8 +3,8 @@ Release Log * 1.0.0 - August 30, 2018 - Fix support for nicknames in single quotes (#74) - Change prefix handling to support prefixes on first names (#60) - - Fix prefix capitalization when not part of last name (#70) - - No other big changes, just bumping to v1 to indicate approprite project maturity + - Fix prefix capitalization when not part of lastname (#70) + - Handle erroneous space in "Ph. D." (#43) * 0.5.8 - August 19, 2018 - Add "Junior" to suffixes (#76) - Add "dra" and "srta" to titles (#77) diff --git a/nameparser/config/regexes.py b/nameparser/config/regexes.py index a333ea0..b5c49c3 100644 --- a/nameparser/config/regexes.py +++ b/nameparser/config/regexes.py @@ -30,6 +30,7 @@ ("no_vowels",re.compile(r'^[^aeyiuo]+$', re.I | re.U)), ("period_not_at_end",re.compile(r'.*\..+$', re.I | re.U)), ("emoji",re_emoji), + ("phd", re.compile(r'ph\.?\s+d\.?', re.I | re.U)), ]) """ All regular expressions used by the parser are precompiled and stored in the config. 
diff --git a/nameparser/parser.py b/nameparser/parser.py index 7cd1dfc..0b45efe 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -361,17 +361,21 @@ def full_name(self, value): def collapse_whitespace(self, string): # collapse multiple spaces into single space - return self.C.regexes.spaces.sub(" ", string.strip()) - + string = self.C.regexes.spaces.sub(" ", string.strip()) + if string.endswith(","): + string = string[:-1] + return string + def pre_process(self): """ This method happens at the beginning of the :py:func:`parse_full_name` before any other processing of the string aside from unicode normalization, so it's a good place to do any custom handling in a - subclass. Runs :py:func:`parse_nicknames`. + subclass. Runs :py:func:`parse_nicknames` and py:func:`squash_emoji`. """ + self.fix_phd() self.parse_nicknames() self.squash_emoji() @@ -382,6 +386,13 @@ def post_process(self): """ self.handle_firstnames() + def fix_phd(self): + _re = self.C.regexes.phd + match = _re.search(self._full_name) + if match: + self.suffix_list.append(match.group(0)) + self._full_name = _re.sub('', self._full_name) + def parse_nicknames(self): """ The content of parenthesis or quotes in the name will be added to the @@ -780,7 +791,8 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): ### Capitalization Support def cap_word(self, word, attribute): - if (self.is_prefix(word) and attribute=='last') or self.is_conjunction(word): + if (self.is_prefix(word) and attribute in ('last','middle')) \ + or self.is_conjunction(word): return word.lower() exceptions = self.C.capitalization_exceptions if lc(word) in exceptions: diff --git a/tests.py b/tests.py index db9e0b0..f2979d8 100644 --- a/tests.py +++ b/tests.py @@ -1584,7 +1584,6 @@ def test_suffix_with_double_comma_format(self): self.m(hn.last, "Doe", hn) self.m(hn.suffix, "jr., MD", hn) - @unittest.expectedFailure def test_phd_with_erroneous_space(self): hn = HumanName("John Smith, Ph. 
D.") self.m(hn.first, "John", hn) From e9fd11e1d54b3b4543a5fea0f359a4d2a64519b8 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Thu, 30 Aug 2018 18:02:10 -0700 Subject: [PATCH 063/163] I'm betting we deal with more people named Contessa than the title #75 --- nameparser/config/titles.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nameparser/config/titles.py b/nameparser/config/titles.py index c675736..dfcbd07 100644 --- a/nameparser/config/titles.py +++ b/nameparser/config/titles.py @@ -194,7 +194,6 @@ 'comtesse', 'conductor', 'consultant', - 'contessa', 'controller', 'corporal', 'corporate', From 10f34e450d643fcea018d2994fbce89fc1c7ac0f Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Fri, 31 Aug 2018 13:50:25 -0700 Subject: [PATCH 064/163] refactor prefix handling to correctly parse Portuguese prefixes #72 while continuing to support multiple names after a prefix #23 --- docs/customize.rst | 16 +++--- nameparser/config/prefixes.py | 10 +++- nameparser/parser.py | 101 ++++++++++++++++++---------------- tests.py | 48 ++++++++++++---- 4 files changed, 108 insertions(+), 67 deletions(-) diff --git a/docs/customize.rst b/docs/customize.rst index b4c45ca..46a60c9 100644 --- a/docs/customize.rst +++ b/docs/customize.rst @@ -39,14 +39,14 @@ instantiate the :py:class:`~nameparser.parser.HumanName` class (see below). Editable attributes of nameparser.config.CONSTANTS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -* :py:class:`~nameparser.config.CONSTANTS.titles` - Pieces that come before the name. Includes all `first_name_titles`. Cannot include things that may be first names. -* :py:class:`~nameparser.config.CONSTANTS.first_name_titles` - Titles that, when followed by a single name, that name is a first name, e.g. "King David". -* :py:class:`~nameparser.config.CONSTANTS.suffix_acronyms` - Pieces that come at the end of the name that may or may not have periods separating the letters, e.g. "m.d.". 
-* :py:class:`~nameparser.config.CONSTANTS.suffix_not_acronyms` - Pieces that come at the end of the name that never have periods separating the letters, e.g. "Jr.". -* :py:class:`~nameparser.config.CONSTANTS.conjunctions` - Connectors like "and" that join the preceding piece to the following piece. -* :py:class:`~nameparser.config.CONSTANTS.prefixes` - Connectors like "del" and "bin" that join to the following piece but not the preceding, similar to titles but can appear anywhere in the name. -* :py:class:`~nameparser.config.CONSTANTS.capitalization_exceptions` - Dictionary of pieces that do not capitalize the first letter, e.g. "Ph.D". -* :py:class:`~nameparser.config.CONSTANTS.regexes` - Regular expressions used to find words, initials, nicknames, etc. +* :py:obj:`~nameparser.config.CONSTANTS.titles` - Pieces that come before the name. Includes all `first_name_titles`. Cannot include things that may be first names. +* :py:obj:`~nameparser.config.CONSTANTS.first_name_titles` - Titles that, when followed by a single name, that name is a first name, e.g. "King David". +* :py:obj:`~nameparser.config.CONSTANTS.suffix_acronyms` - Pieces that come at the end of the name that may or may not have periods separating the letters, e.g. "m.d.". +* :py:obj:`~nameparser.config.CONSTANTS.suffix_not_acronyms` - Pieces that come at the end of the name that never have periods separating the letters, e.g. "Jr.". +* :py:obj:`~nameparser.config.CONSTANTS.conjunctions` - Connectors like "and" that join the preceding piece to the following piece. +* :py:obj:`~nameparser.config.CONSTANTS.prefixes` - Connectors like "del" and "bin" that join to the following piece but not the preceding, similar to titles but can appear anywhere in the name. +* :py:obj:`~nameparser.config.CONSTANTS.capitalization_exceptions` - Dictionary of pieces that do not capitalize the first letter, e.g. "Ph.D". 
+* :py:obj:`~nameparser.config.CONSTANTS.regexes` - Regular expressions used to find words, initials, nicknames, etc. Each set of constants comes with :py:func:`~nameparser.config.SetManager.add` and :py:func:`~nameparser.config.SetManager.remove` methods for tuning the constants for your project. These methods automatically lower case and diff --git a/nameparser/config/prefixes.py b/nameparser/config/prefixes.py index b2a9386..fbcc3f2 100644 --- a/nameparser/config/prefixes.py +++ b/nameparser/config/prefixes.py @@ -1,7 +1,15 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals -#: Name pieces that appear before a last name. They join to the piece that follows them to make one new piece. +#: Name pieces that appear before a last name. Prefixes join to the piece +# that follows them to make one new piece. They can be chained together, e.g +# "von der" and "de la". Because they only appear in middle or last names, +# they also signifiy that all following name pieces should be in the same name +# part, for example, "von" will be joined to all following pieces that are not +# prefixes or suffixes, allowing recognition of double last names when they +# appear after a prefixes. So in "pennie von bergen wessels MD", "von" will +# join with all following name pieces until the suffix "MD", resulting in the +# correct parsing of the last name "von bergen wessels". 
PREFIXES = set([ 'abu', 'bin', diff --git a/nameparser/parser.py b/nameparser/parser.py index 0b45efe..1b20018 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -501,14 +501,6 @@ def parse_full_name(self): self.last_list.append(piece) self.suffix_list += pieces[i+1:] break - if piece in self.prefix_joins: - last_piece = pieces[-1:][0] - if self.is_suffix(last_piece): - self.last_list += pieces[i:-1] - self.suffix = last_piece - else: - self.last_list += pieces[i:] - break if not nxt: self.last_list.append(piece) continue @@ -548,14 +540,6 @@ def parse_full_name(self): self.last_list.append(piece) self.suffix_list = pieces[i+1:] + self.suffix_list break - if piece in self.prefix_joins: - last_piece = pieces[-1:][0] - if self.is_suffix(last_piece): - self.last_list += pieces[i:-1] - self.suffix_list.insert(0, last_piece) - else: - self.last_list += pieces[i:] - break if not nxt: self.last_list.append(piece) continue @@ -596,9 +580,6 @@ def parse_full_name(self): if self.is_suffix(piece): self.suffix_list.append(piece) continue - if piece in self.prefix_joins: - self.last_list += pieces[i:] - break self.middle_list.append(piece) try: if parts[2]: @@ -685,16 +666,16 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): # don't join on conjunctions if there's only 2 parts if length < 3: return pieces - + rootname_pieces = [p for p in pieces if self.is_rootname(p)] total_length = len(rootname_pieces) + additional_parts_count - + # find all the conjunctions, join any conjunctions that are next to each # other, then join those newly joined conjunctions and any single # conjunctions to the piece before and after it - conj_index = [i for i, piece in enumerate(pieces) + conj_index = [i for i, piece in enumerate(pieces) if self.is_conjunction(piece)] - + contiguous_conj_i = [] for i, val in enumerate(conj_index): try: @@ -702,10 +683,10 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): contiguous_conj_i += [val] except 
IndexError: pass - + contiguous_conj_i = group_contiguous_integers(conj_index) - - delete_i = [] + + delete_i = [] for i in contiguous_conj_i: if type(i) == tuple: new_piece = " ".join(pieces[ i[0] : i[1]+1] ) @@ -717,7 +698,7 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): pieces[i] = new_piece #add newly joined conjunctions to constants to be found later self.C.conjunctions.add(new_piece) - + for i in reversed(delete_i): # delete pieces in reverse order or the index changes on each delete del pieces[i] @@ -728,7 +709,7 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): # refresh conjunction index locations conj_index = [i for i, piece in enumerate(pieces) if self.is_conjunction(piece)] - + for i in conj_index: if len(pieces[i]) == 1 and total_length < 4: # if there are only 3 total parts (minus known titles, suffixes @@ -736,7 +717,7 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): # treating it as an initial rather than a conjunction. # http://code.google.com/p/python-nameparser/issues/detail?id=11 continue - + if i is 0: new_piece = " ".join(pieces[i:i+2]) if self.is_title(pieces[i+1]): @@ -748,8 +729,8 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): for j,val in enumerate(conj_index): if val > i: conj_index[j]=val-1 - - else: + + else: new_piece = " ".join(pieces[i-1:i+2]) if self.is_title(pieces[i-1]): # when joining to a title, make new_piece a title too @@ -767,23 +748,51 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): for j,val in enumerate(conj_index): if val > i: conj_index[j] = val - rm_count - - - # join prefixes to following lastnames: ['de la Vega'], ['van Buren'] + + + # join prefixes to following lastnames: ['de la Vega'], ['van Buren III'] prefixes = list(filter(self.is_prefix, pieces)) if prefixes: - i = pieces.index(prefixes[0]) - # join everything after the prefix until the next non prefix - # store joined pieces in prefix_joins. 
When a prefix occurs in a last name, - # I think it means the rest of the name is part of the last name, so prefix_joins - # lets us do that in the parser flow. - non_suffixes = list(filter(lambda x: not self.is_prefix(x), pieces[i:])) - if non_suffixes: - j = pieces.index(non_suffixes[0]) - new_piece = ' '.join(pieces[i:j + 1]) - self.prefix_joins += [new_piece] - pieces = pieces[:i] + [new_piece] + pieces[j + 1:] - + for prefix in prefixes: + try: + i = pieces.index(prefix) + except ValueError: + # If the prefix is no longer in pieces, it's because it has been + # combined with the prefix that appears right before (or before that when + # chained together) in the last loop, so the index of that newly created + # piece is the same as in the last loop, i==i still, and we want to join + # it to the next piece. + pass + + new_piece = '' + + # join everything after the prefix until the next non prefix + # store joined pieces in prefix_joins. When a prefix occurs in a last name, + # I think it means the rest of the name is part of the last name, so prefix_joins + # lets us do that in the parser flow. 
+ # for prefix in prefixes: + + try: + next_prefix = next(iter(filter(self.is_prefix, pieces[i + 1:]))) + j = pieces.index(next_prefix) + if j == i + 1: + # if there are two prefixes in sequence, join to the following piece + j += 1 + new_piece = ' '.join(pieces[i:j]) + pieces = pieces[:i] + [new_piece] + pieces[j:] + except StopIteration: + try: + # if there are no more prefixes, look for a suffix to stop at + stop_at = next(iter(filter(self.is_suffix, pieces[i + 1:]))) + j = pieces.index(stop_at) + new_piece = ' '.join(pieces[i:j]) + pieces = pieces[:i] + [new_piece] + pieces[j:] + except StopIteration: + # if there were no suffixes, nothing to stop at so join all + # remaining pieces + new_piece = ' '.join(pieces[i:]) + pieces = pieces[:i] + [new_piece] + log.debug("pieces: {0}".format(pieces)) return pieces diff --git a/tests.py b/tests.py index f2979d8..fb15674 100644 --- a/tests.py +++ b/tests.py @@ -1247,18 +1247,6 @@ def test_name_is_conjunctions(self): hn = HumanName("e and e") self.m(hn.first, "e and e", hn) - def test_portuguese_dos(self): - hn = HumanName("Rafael Sousa dos Anjos") - self.m(hn.first, "Rafael", hn) - self.m(hn.middle, "Sousa", hn) - self.m(hn.last, "dos Anjos", hn) - - def test_portuguese_prefixes(self): - hn = HumanName("Joao da Silva do Amaral de Souza") - self.m(hn.first, "Joao", hn) - self.m(hn.middle, "", hn) - self.m(hn.last, "da Silva do Amaral de Souza", hn) - class ConstantsCustomization(HumanNameTestBase): @@ -1518,6 +1506,42 @@ def test_title_two_part_last_name_with_suffix_in_first_part(self): self.m(hn.last, "von bergen wessels", hn) self.m(hn.suffix, "MD, III", hn) + def test_portuguese_dos(self): + hn = HumanName("Rafael Sousa dos Anjos") + self.m(hn.first, "Rafael", hn) + self.m(hn.middle, "Sousa", hn) + self.m(hn.last, "dos Anjos", hn) + + def test_portuguese_prefixes(self): + hn = HumanName("Joao da Silva do Amaral de Souza") + self.m(hn.first, "Joao", hn) + self.m(hn.middle, "da Silva do Amaral", hn) + self.m(hn.last, 
"de Souza", hn) + + def test_three_conjunctions(self): + hn = HumanName("Dr. Juan Q. Xavier de la dos Vega III") + self.m(hn.first, "Juan", hn) + self.m(hn.last, "de la dos Vega", hn) + self.m(hn.title, "Dr.", hn) + self.m(hn.middle, "Q. Xavier", hn) + self.m(hn.suffix, "III", hn) + + def test_lastname_three_conjunctions(self): + hn = HumanName("de la dos Vega, Dr. Juan Q. Xavier III") + self.m(hn.first, "Juan", hn) + self.m(hn.last, "de la dos Vega", hn) + self.m(hn.title, "Dr.", hn) + self.m(hn.middle, "Q. Xavier", hn) + self.m(hn.suffix, "III", hn) + + def test_comma_three_conjunctions(self): + hn = HumanName("Dr. Juan Q. Xavier de la dos Vega, III") + self.m(hn.first, "Juan", hn) + self.m(hn.last, "de la dos Vega", hn) + self.m(hn.title, "Dr.", hn) + self.m(hn.middle, "Q. Xavier", hn) + self.m(hn.suffix, "III", hn) + class SuffixesTestCase(HumanNameTestBase): From b74141257443f217393bce2abf82ccd1d425b10d Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Fri, 31 Aug 2018 15:59:44 -0700 Subject: [PATCH 065/163] documentation fixes --- .gitignore | 1 - CONTRIBUTING.md | 2 +- docs/customize.rst | 16 ++++++++-------- nameparser/config/prefixes.py | 16 ++++++++-------- nameparser/parser.py | 2 +- 5 files changed, 18 insertions(+), 19 deletions(-) diff --git a/.gitignore b/.gitignore index c586728..c3fe42b 100644 --- a/.gitignore +++ b/.gitignore @@ -5,7 +5,6 @@ __pycache__/ .python2/ MANIFEST nameparser.egg-info/ -dummycert.pem build *.egg .coverage diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 329716d..bb9fc4a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -4,7 +4,7 @@ Contributing Development Environment Setup -------------------------------- -There are some exernal dependencies required in order to run the +There are some external dependencies required in order to run the tests, located in the dev-requirements.txt file. 
pip install -r dev-requirements.txt diff --git a/docs/customize.rst b/docs/customize.rst index 46a60c9..6097050 100644 --- a/docs/customize.rst +++ b/docs/customize.rst @@ -39,14 +39,14 @@ instantiate the :py:class:`~nameparser.parser.HumanName` class (see below). Editable attributes of nameparser.config.CONSTANTS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -* :py:obj:`~nameparser.config.CONSTANTS.titles` - Pieces that come before the name. Includes all `first_name_titles`. Cannot include things that may be first names. -* :py:obj:`~nameparser.config.CONSTANTS.first_name_titles` - Titles that, when followed by a single name, that name is a first name, e.g. "King David". -* :py:obj:`~nameparser.config.CONSTANTS.suffix_acronyms` - Pieces that come at the end of the name that may or may not have periods separating the letters, e.g. "m.d.". -* :py:obj:`~nameparser.config.CONSTANTS.suffix_not_acronyms` - Pieces that come at the end of the name that never have periods separating the letters, e.g. "Jr.". -* :py:obj:`~nameparser.config.CONSTANTS.conjunctions` - Connectors like "and" that join the preceding piece to the following piece. -* :py:obj:`~nameparser.config.CONSTANTS.prefixes` - Connectors like "del" and "bin" that join to the following piece but not the preceding, similar to titles but can appear anywhere in the name. -* :py:obj:`~nameparser.config.CONSTANTS.capitalization_exceptions` - Dictionary of pieces that do not capitalize the first letter, e.g. "Ph.D". -* :py:obj:`~nameparser.config.CONSTANTS.regexes` - Regular expressions used to find words, initials, nicknames, etc. +* :py:data:`~nameparser.config.titles.TITLES` - Pieces that come before the name. Includes all `first_name_titles`. Cannot include things that may be first names. +* :py:data:`~nameparser.config.FIRST_NAME_TITLES` - Titles that, when followed by a single name, that name is a first name, e.g. "King David". 
+* :py:data:`~nameparser.config.SUFFIX_ACRONYMS` - Pieces that come at the end of the name that may or may not have periods separating the letters, e.g. "m.d.". +* :py:data:`~nameparser.config.SUFFIX_NOT_ACRONYMS` - Pieces that come at the end of the name that never have periods separating the letters, e.g. "Jr.". +* :py:data:`~nameparser.config.conjunctions.CONJUNCTIONS` - Connectors like "and" that join the preceding piece to the following piece. +* :py:data:`~nameparser.config.prefixes.PREFIXES` - Connectors like "del" and "bin" that join to the following piece but not the preceding, similar to titles but can appear anywhere in the name. +* :py:data:`~nameparser.config.CAPITALIZATION_EXCEPTIONS` - Dictionary of pieces that do not capitalize the first letter, e.g. "Ph.D". +* :py:data:`~nameparser.config.regexes.REGEXES` - Regular expressions used to find words, initials, nicknames, etc. Each set of constants comes with :py:func:`~nameparser.config.SetManager.add` and :py:func:`~nameparser.config.SetManager.remove` methods for tuning the constants for your project. These methods automatically lower case and diff --git a/nameparser/config/prefixes.py b/nameparser/config/prefixes.py index fbcc3f2..542ea03 100644 --- a/nameparser/config/prefixes.py +++ b/nameparser/config/prefixes.py @@ -2,14 +2,14 @@ from __future__ import unicode_literals #: Name pieces that appear before a last name. Prefixes join to the piece -# that follows them to make one new piece. They can be chained together, e.g -# "von der" and "de la". Because they only appear in middle or last names, -# they also signifiy that all following name pieces should be in the same name -# part, for example, "von" will be joined to all following pieces that are not -# prefixes or suffixes, allowing recognition of double last names when they -# appear after a prefixes. 
So in "pennie von bergen wessels MD", "von" will -# join with all following name pieces until the suffix "MD", resulting in the -# correct parsing of the last name "von bergen wessels". +#: that follows them to make one new piece. They can be chained together, e.g +#: "von der" and "de la". Because they only appear in middle or last names, +#: they also signifiy that all following name pieces should be in the same name +#: part, for example, "von" will be joined to all following pieces that are not +#: prefixes or suffixes, allowing recognition of double last names when they +#: appear after a prefixes. So in "pennie von bergen wessels MD", "von" will +#: join with all following name pieces until the suffix "MD", resulting in the +#: correct parsing of the last name "von bergen wessels". PREFIXES = set([ 'abu', 'bin', diff --git a/nameparser/parser.py b/nameparser/parser.py index 1b20018..d8e5498 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -296,7 +296,7 @@ def is_conjunction(self, piece): def is_prefix(self, piece): """ Lowercase and no periods version of piece is in the - `~nameparser.config.titles.PREFIXES` set. + :py:data:`~nameparser.config.prefixes.PREFIXES` set. """ return lc(piece) in self.C.prefixes From 8c55eb9f0a58c5bfa371bdd565251acd44206e11 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Fri, 31 Aug 2018 16:51:41 -0700 Subject: [PATCH 066/163] Fix overzealous regex for "Ph. D." (#43) --- docs/release_log.rst | 2 ++ nameparser/config/regexes.py | 2 +- nameparser/parser.py | 2 +- tests.py | 6 ++++++ 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/docs/release_log.rst b/docs/release_log.rst index 2878ccf..95867e3 100644 --- a/docs/release_log.rst +++ b/docs/release_log.rst @@ -1,5 +1,7 @@ Release Log =========== +* 1.0.1 - August 30, 2018 + - Fix overzealous regex for "Ph. D." 
(#43) * 1.0.0 - August 30, 2018 - Fix support for nicknames in single quotes (#74) - Change prefix handling to support prefixes on first names (#60) diff --git a/nameparser/config/regexes.py b/nameparser/config/regexes.py index b5c49c3..beac95f 100644 --- a/nameparser/config/regexes.py +++ b/nameparser/config/regexes.py @@ -30,7 +30,7 @@ ("no_vowels",re.compile(r'^[^aeyiuo]+$', re.I | re.U)), ("period_not_at_end",re.compile(r'.*\..+$', re.I | re.U)), ("emoji",re_emoji), - ("phd", re.compile(r'ph\.?\s+d\.?', re.I | re.U)), + ("phd", re.compile(r'\s(ph\.?\s+d\.?)', re.I | re.U)), ]) """ All regular expressions used by the parser are precompiled and stored in the config. diff --git a/nameparser/parser.py b/nameparser/parser.py index d8e5498..34282de 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -390,7 +390,7 @@ def fix_phd(self): _re = self.C.regexes.phd match = _re.search(self._full_name) if match: - self.suffix_list.append(match.group(0)) + self.suffix_list.append(match.group(1)) self._full_name = _re.sub('', self._full_name) def parse_nicknames(self): diff --git a/tests.py b/tests.py index fb15674..87dcb94 100644 --- a/tests.py +++ b/tests.py @@ -1614,6 +1614,12 @@ def test_phd_with_erroneous_space(self): self.m(hn.last, "Smith", hn) self.m(hn.suffix, "Ph. 
D.", hn) + def test_phd_conflict(self): + hn = HumanName("Adolph D") + self.m(hn.first, "Adolph", hn) + self.m(hn.last, "D", hn) + + # http://en.wikipedia.org/wiki/Ma_(surname) def test_potential_suffix_that_is_also_last_name(self): hn = HumanName("Jack Ma") From 9a3d187c7ef5cc9c590729ebcd6b3bac3f1da298 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Fri, 31 Aug 2018 17:04:03 -0700 Subject: [PATCH 067/163] Add `surnames` attribute as aggregate of middle and last names --- docs/release_log.rst | 1 + nameparser/parser.py | 16 +++++++++++++++- tests.py | 9 ++++++++- 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/docs/release_log.rst b/docs/release_log.rst index 95867e3..70e2529 100644 --- a/docs/release_log.rst +++ b/docs/release_log.rst @@ -2,6 +2,7 @@ Release Log =========== * 1.0.1 - August 30, 2018 - Fix overzealous regex for "Ph. D." (#43) + - Add `surnames` attribute as aggregate of middle and last names * 1.0.0 - August 30, 2018 - Fix support for nicknames in single quotes (#74) - Change prefix handling to support prefixes on first names (#60) diff --git a/nameparser/parser.py b/nameparser/parser.py index 34282de..e13bdfb 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -243,7 +243,21 @@ def nickname(self): parenthesis (``()``) """ return " ".join(self.nickname_list) or self.C.empty_attribute_default - + + @property + def surnames_list(self): + """ + List of middle names followed by last name. + """ + return self.middle_list + self.last_list + + @property + def surnames(self): + """ + A string of all middle names followed by the last name. 
+ """ + return " ".join(self.surnames_list) or self.C.empty_attribute_default + ### setter methods def _set_list(self, attr, value): diff --git a/tests.py b/tests.py index 87dcb94..11bb3e8 100644 --- a/tests.py +++ b/tests.py @@ -188,6 +188,14 @@ def test_blank_name(self): self.m(hn.first, "", hn) self.m(hn.last, "", hn) + def test_surnames_list_attribute(self): + hn = HumanName("John Edgar Casey Williams III") + self.m(hn.surnames_list, ["Edgar", "Casey", "Williams"], hn) + + def test_surnames_attribute(self): + hn = HumanName("John Edgar Casey Williams III") + self.m(hn.surnames, "Edgar Casey Williams", hn) + class FirstNameHandlingTests(HumanNameTestBase): def test_first_name(self): @@ -1438,7 +1446,6 @@ def test_parenthesis_and_quotes_together(self): self.m(hn.nickname, "Jen Duff", hn) - class PrefixesTestCase(HumanNameTestBase): def test_prefix(self): From 438bc7d5b74de82388745a99ead56264b30c120c Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Fri, 31 Aug 2018 17:12:47 -0700 Subject: [PATCH 068/163] v1.0.1, document surnames attribute --- docs/usage.rst | 2 ++ nameparser/__init__.py | 2 +- tests.py | 6 ++++++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/docs/usage.rst b/docs/usage.rst index dd313d2..45f67a4 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -23,6 +23,8 @@ The examples use Python 3, but Python 2.6+ is supported. 'de la Vega' >>> name.suffix 'III' + >>> name.surnames + 'Q. Xavier de la Vega' >>> name.full_name = "Juan Q. Xavier Velasquez y Garcia, Jr." 
>>> name 0) hn.C.titles.add('te') + self.assertEqual(start_len + 1, len(hn.C.titles)) hn.parse_full_name() self.m(hn.title, "Te", hn) self.m(hn.first, "Awanui-a-Rangi", hn) @@ -1268,7 +1271,10 @@ def test_add_title(self): def test_remove_title(self): hn = HumanName("Hon Solo", constants=None) + start_len = len(hn.C.titles) + self.assert_(start_len > 0) hn.C.titles.remove('hon') + self.assertEqual(start_len - 1, len(hn.C.titles)) hn.parse_full_name() self.m(hn.first, "Hon", hn) self.m(hn.last, "Solo", hn) From efe255834e2bff86c160b7c1d063ab24fa78a62e Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Fri, 31 Aug 2018 17:18:15 -0700 Subject: [PATCH 069/163] update surname docs --- nameparser/parser.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nameparser/parser.py b/nameparser/parser.py index e13bdfb..09726d4 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -43,7 +43,8 @@ class HumanName(object): * :py:attr:`last` * :py:attr:`suffix` * :py:attr:`nickname` - + * :py:attr:`surnames` + :param str full_name: The name string to be parsed. :param constants constants: a :py:class:`~nameparser.config.Constants` instance. 
Pass ``None`` for From 9e3e06d229347b03e79c3d26e848f03bcc15dc8d Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Fri, 14 Sep 2018 12:37:45 -0700 Subject: [PATCH 070/163] comments cleanup --- nameparser/parser.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/nameparser/parser.py b/nameparser/parser.py index 09726d4..e390438 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -470,7 +470,6 @@ def parse_full_name(self): self.last_list = [] self.suffix_list = [] self.nickname_list = [] - self.prefix_joins = [] self.unparsable = True @@ -765,7 +764,7 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): conj_index[j] = val - rm_count - # join prefixes to following lastnames: ['de la Vega'], ['van Buren III'] + # join prefixes to following lastnames: ['de la Vega'], ['van Buren'] prefixes = list(filter(self.is_prefix, pieces)) if prefixes: for prefix in prefixes: @@ -781,11 +780,7 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): new_piece = '' - # join everything after the prefix until the next non prefix - # store joined pieces in prefix_joins. When a prefix occurs in a last name, - # I think it means the rest of the name is part of the last name, so prefix_joins - # lets us do that in the parser flow. - # for prefix in prefixes: + # join everything after the prefix until the next prefix or suffix try: next_prefix = next(iter(filter(self.is_prefix, pieces[i + 1:]))) From 878c3f914251ca3d7a5685498befb5f8b441ad42 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Sat, 22 Sep 2018 13:37:34 -0700 Subject: [PATCH 071/163] fix RST error, update badges --- README.rst | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/README.rst b/README.rst index c5419fe..b347593 100644 --- a/README.rst +++ b/README.rst @@ -1,10 +1,7 @@ Name Parser =========== -.. 
image:: https://travis-ci.org/derek73/python-nameparser.svg?branch=master - :target: https://travis-ci.org/derek73/python-nameparser -.. image:: https://badge.fury.io/py/nameparser.svg - :target: http://badge.fury.io/py/nameparser +|Build Status| |PyPI| |PyPI version| |Documentation| A simple Python (3.2+ & 2.6+) module for parsing human names into their individual components. @@ -15,6 +12,7 @@ individual components. * hn.last * hn.suffix * hn.nickname +* hn.surnames *(middle + last)* Supported Name Structures ~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -54,7 +52,7 @@ install with pip using the command below. If you need to handle lists of names, check out `namesparser `_, a - compliment to this module that handles multiple names in a string. +compliment to this module that handles multiple names in a string. Quick Start Example @@ -135,4 +133,14 @@ https://github.com/derek73/python-nameparser .. _CONTRIBUTING.md: https://github.com/derek73/python-nameparser/tree/master/CONTRIBUTING.md .. _Start a New Issue: https://github.com/derek73/python-nameparser/issues -.. _click here to propose changes to the titles: https://github.com/derek73/python-nameparser/edit/master/nameparser/config/titles.py \ No newline at end of file +.. _click here to propose changes to the titles: https://github.com/derek73/python-nameparser/edit/master/nameparser/config/titles.py + + +.. |Build Status| image:: https://travis-ci.org/derek73/python-nameparser.svg?branch=master + :target: https://travis-ci.org/derek73/python-nameparser +.. |PyPI| image:: https://img.shields.io/pypi/v/nameparser.svg + :target: https://pypi.org/project/nameparser/ +.. |Documentation| image:: https://readthedocs.org/projects/nameparser/badge/?version=latest + :target: http://nameparser.readthedocs.io/en/latest/?badge=latest +.. 
|PyPI version| image:: https://img.shields.io/pypi/pyversions/nameparser.svg + :target: https://pypi.org/project/nameparser/ From 0b1001852fcc176774d2d68a2ade062b05d8dfc3 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Sat, 22 Sep 2018 13:39:09 -0700 Subject: [PATCH 072/163] remove minor python versions since we always support all of them --- setup.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/setup.py b/setup.py index 4b71aca..ba0cc5a 100755 --- a/setup.py +++ b/setup.py @@ -26,14 +26,8 @@ def read(fname): 'Operating System :: OS Independent', "License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)", 'Programming Language :: Python', - 'Programming Language :: Python :: 2.6', - 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 2', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.2', - 'Programming Language :: Python :: 3.3', - 'Programming Language :: Python :: 3.4', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', 'Development Status :: 5 - Production/Stable', 'Natural Language :: English', "Topic :: Software Development :: Libraries :: Python Modules", From e2c90432f1affc203b08013a3786f3e6285db3a3 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Fri, 26 Oct 2018 14:43:44 -0700 Subject: [PATCH 073/163] v1.0.2, narrow fix to handle only nickname and last name (#78) --- docs/release_log.rst | 2 ++ nameparser/__init__.py | 2 +- nameparser/parser.py | 3 ++ tests.py | 66 +++++++++++++++++++++++++++++++++++++----- 4 files changed, 65 insertions(+), 8 deletions(-) diff --git a/docs/release_log.rst b/docs/release_log.rst index 70e2529..7364ad0 100644 --- a/docs/release_log.rst +++ b/docs/release_log.rst @@ -1,5 +1,7 @@ Release Log =========== +* 1.0.2 - Oct 26, 2018 + - Fix handling of only nickname and last name (#78) * 1.0.1 - August 30, 2018 - Fix overzealous regex for "Ph. D." 
(#43) - Add `surnames` attribute as aggregate of middle and last names diff --git a/nameparser/__init__.py b/nameparser/__init__.py index febcb0b..0e8663a 100644 --- a/nameparser/__init__.py +++ b/nameparser/__init__.py @@ -1,4 +1,4 @@ -VERSION = (1, 0, 1) +VERSION = (1, 0, 2) __version__ = '.'.join(map(str, VERSION)) __author__ = "Derek Gulbranson" __author_email__ = 'derek73@gmail.com' diff --git a/nameparser/parser.py b/nameparser/parser.py index e390438..bf438f7 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -503,6 +503,9 @@ def parse_full_name(self): self.title_list.append(piece) continue if not self.first: + if p_len == 1 and self.nickname: + self.last_list.append(piece) + continue self.first_list.append(piece) continue if self.are_suffixes(pieces[i+1:]) or \ diff --git a/tests.py b/tests.py index 290e6af..0ee7d75 100644 --- a/tests.py +++ b/tests.py @@ -46,7 +46,7 @@ def m(self, actual, expected, hn): self.assertEqual(actual, expected, "'%s' != '%s' for '%s'\n%r" % ( actual, expected, - hn.full_name, + hn.original, hn )) except UnicodeDecodeError: @@ -1358,7 +1358,7 @@ def test_add_constant_with_explicit_encoding(self): self.assertIn('béck', c.titles) -class HumanNameNicknameTestCase(HumanNameTestBase): +class NicknameTestCase(HumanNameTestBase): # https://code.google.com/p/python-nameparser/issues/detail?id=33 def test_nickname_in_parenthesis(self): hn = HumanName("Benjamin (Ben) Franklin") @@ -1445,12 +1445,64 @@ def test_duplicate_parenthesis_are_removed_from_name(self): self.m(hn.last, "Jones", hn) self.m(hn.suffix, "Jr.", hn) - def test_parenthesis_and_quotes_together(self): - hn = HumanName("Jennifer 'Jen' Jones (Duff)") - self.m(hn.first, "Jennifer", hn) - self.m(hn.last, "Jones", hn) - self.m(hn.nickname, "Jen Duff", hn) + def test_nickname_and_last_name(self): + hn = HumanName('"Rick" Edmonds') + self.m(hn.first, "", hn) + self.m(hn.last, "Edmonds", hn) + self.m(hn.nickname, "Rick", hn) + @unittest.expectedFailure + def 
test_nickname_and_last_name_with_title(self): + hn = HumanName('Senator "Rick" Edmonds') + self.m(hn.title, "Senator", hn) + self.m(hn.first, "", hn) + self.m(hn.last, "Edmonds", hn) + self.m(hn.nickname, "Rick", hn) + + + +# class MaidenNameTestCase(HumanNameTestBase): +# +# def test_parenthesis_and_quotes_together(self): +# hn = HumanName("Jennifer 'Jen' Jones (Duff)") +# self.m(hn.first, "Jennifer", hn) +# self.m(hn.last, "Jones", hn) +# self.m(hn.nickname, "Jen", hn) +# self.m(hn.maiden, "Duff", hn) +# +# def test_maiden_name_with_nee(self): +# # https://en.wiktionary.org/wiki/née +# hn = HumanName("Mary Toogood nee Johnson") +# self.m(hn.first, "Mary", hn) +# self.m(hn.last, "Toogood", hn) +# self.m(hn.maiden, "Johnson", hn) +# +# def test_maiden_name_with_accented_nee(self): +# # https://en.wiktionary.org/wiki/née +# hn = HumanName("Mary Toogood née Johnson") +# self.m(hn.first, "Mary", hn) +# self.m(hn.last, "Toogood", hn) +# self.m(hn.maiden, "Johnson", hn) +# +# def test_maiden_name_with_nee_and_comma(self): +# # https://en.wiktionary.org/wiki/née +# hn = HumanName("Mary Toogood, née Johnson") +# self.m(hn.first, "Mary", hn) +# self.m(hn.last, "Toogood", hn) +# self.m(hn.maiden, "Johnson", hn) +# +# def test_maiden_name_with_nee_with_parenthesis(self): +# hn = HumanName("Mary Toogood (nee Johnson)") +# self.m(hn.first, "Mary", hn) +# self.m(hn.last, "Toogood", hn) +# self.m(hn.maiden, "Johnson", hn) +# +# def test_maiden_name_with_parenthesis(self): +# hn = HumanName("Mary Toogood (Johnson)") +# self.m(hn.first, "Mary", hn) +# self.m(hn.last, "Toogood", hn) +# self.m(hn.maiden, "Johnson", hn) +# class PrefixesTestCase(HumanNameTestBase): From 285057e6900782aca26ae8a9e4895d5437a7e322 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Fri, 26 Oct 2018 14:48:32 -0700 Subject: [PATCH 074/163] add python 3.7 to travis --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index dc37c42..4da19f2 100644 --- a/.travis.yml 
+++ b/.travis.yml @@ -7,6 +7,7 @@ python: - "3.4" - "3.5" - "3.6" + - "3.7" # command to install dependencies install: - if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install unittest2; fi From b9b29cbbbfc138d4958b63e00a047610f30ff6de Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Fri, 26 Oct 2018 14:50:59 -0700 Subject: [PATCH 075/163] 3.7 still not available on ubuntu --- .travis.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 4da19f2..dc37c42 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,7 +7,6 @@ python: - "3.4" - "3.5" - "3.6" - - "3.7" # command to install dependencies install: - if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install unittest2; fi From b066aa7e9f1b79426b3e75467a4cfdb9ae7f3bd5 Mon Sep 17 00:00:00 2001 From: TyVik Date: Thu, 18 Apr 2019 21:07:41 +0300 Subject: [PATCH 076/163] Implemented support for escaping log entry arguments. --- nameparser/parser.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/nameparser/parser.py b/nameparser/parser.py index bf438f7..8c10c8f 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -480,8 +480,8 @@ def parse_full_name(self): # break up full_name by commas parts = [x.strip() for x in self._full_name.split(",")] - log.debug("full_name: {0}".format(self._full_name)) - log.debug("parts: {0}".format(parts)) + log.debug("full_name: %s", self._full_name) + log.debug("parts: %s", parts) if len(parts) == 1: @@ -538,7 +538,7 @@ def parse_full_name(self): self.suffix_list += parts[1:] pieces = self.parse_pieces(parts[0].split(' ')) - log.debug("pieces: {0}".format(u(pieces))) + log.debug("pieces: %s", u(pieces)) for i, piece in enumerate(pieces): try: nxt = pieces[i + 1] @@ -568,7 +568,7 @@ def parse_full_name(self): # parts[0], parts[1], parts[2:...] 
pieces = self.parse_pieces(parts[1].split(' '), 1) - log.debug("pieces: {0}".format(u(pieces))) + log.debug("pieces: %s", u(pieces)) # lastname part may have suffixes in it lastname_pieces = self.parse_pieces(parts[0].split(' '), 1) @@ -605,7 +605,7 @@ def parse_full_name(self): pass if len(self) < 0: - log.info("Unparsable: \"{}\" ".format(self.original)) + log.info("Unparsable: \"%s\" ", self.original) else: self.unparsable = False self.post_process() @@ -806,7 +806,7 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): new_piece = ' '.join(pieces[i:]) pieces = pieces[:i] + [new_piece] - log.debug("pieces: {0}".format(pieces)) + log.debug("pieces: %s", pieces) return pieces From b948e2c647ff114c29058732948f745b9beed8a4 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Thu, 18 Apr 2019 21:54:05 -0700 Subject: [PATCH 077/163] fix #82, test for sys.stdin --- nameparser/config/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/nameparser/config/__init__.py b/nameparser/config/__init__.py index 3b11e88..b9d6e1e 100644 --- a/nameparser/config/__init__.py +++ b/nameparser/config/__init__.py @@ -92,7 +92,10 @@ def add_with_encoding(self, s, encoding=None): explicit `encoding` parameter to specify the encoding of binary strings that are not DEFAULT_ENCODING (UTF-8). 
""" - encoding = encoding or sys.stdin.encoding or DEFAULT_ENCODING + stdin_encoding = None + if sys.stdin: + stdin_encoding = sys.stdin.encoding + encoding = encoding or stdin_encoding or DEFAULT_ENCODING if type(s) == binary_type: s = s.decode(encoding) self.elements.add(lc(s)) From 9b8c30b4c95e0830ff78bcba1d1a171aa417ccb1 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Thu, 18 Apr 2019 21:56:39 -0700 Subject: [PATCH 078/163] v1.0.3 --- docs/release_log.rst | 3 +++ nameparser/__init__.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/release_log.rst b/docs/release_log.rst index 7364ad0..59e7cbe 100644 --- a/docs/release_log.rst +++ b/docs/release_log.rst @@ -1,5 +1,8 @@ Release Log =========== +* 1.0.3 - April 18, 2018 + - fix sys.stdin usage when stdin doesn't exist (#82) + - support for escaping log entry arguments (#84) * 1.0.2 - Oct 26, 2018 - Fix handling of only nickname and last name (#78) * 1.0.1 - August 30, 2018 diff --git a/nameparser/__init__.py b/nameparser/__init__.py index 0e8663a..bdcad4c 100644 --- a/nameparser/__init__.py +++ b/nameparser/__init__.py @@ -1,4 +1,4 @@ -VERSION = (1, 0, 2) +VERSION = (1, 0, 3) __version__ = '.'.join(map(str, VERSION)) __author__ = "Derek Gulbranson" __author_email__ = 'derek73@gmail.com' From 8d6ef523d98b736b1229dda91bcd95e6c499ca7b Mon Sep 17 00:00:00 2001 From: Matt VanEseltine Date: Thu, 20 Jun 2019 16:26:16 -0400 Subject: [PATCH 079/163] Prevent false nicknames due to multiple quotes Certain Anglicized names such as those from some Hawaiian, Samoan, and Kenyan traditions, include multiple single quotation marks. This adjusts the quoted_word regex to only capture single quote marks that are not inside words. Without this fix, false nicknames are extracted from inside names like Ng'ang'a and Kawai'ae'a. Tests are included to cover; existing Benjamin 'Ben' Franklin test assures that the typical nickname case is unchanged. 
--- nameparser/config/regexes.py | 2 +- nameparser/parser.py | 6 +++--- tests.py | 28 ++++++++++++++++++++++++++++ 3 files changed, 32 insertions(+), 4 deletions(-) diff --git a/nameparser/config/regexes.py b/nameparser/config/regexes.py index beac95f..bd4b320 100644 --- a/nameparser/config/regexes.py +++ b/nameparser/config/regexes.py @@ -23,7 +23,7 @@ ("word", re.compile(r"(\w|\.)+", re.U)), ("mac", re.compile(r'^(ma?c)(\w{2,})', re.I | re.U)), ("initial", re.compile(r'^(\w\.|[A-Z])?$', re.U)), - ("quoted_word", re.compile(r'\'([^\s]*?)\'', re.U)), + ("quoted_word", re.compile(r'(? Date: Wed, 26 Jun 2019 18:24:04 -0700 Subject: [PATCH 080/163] get full_name should return current string output, not original name --- nameparser/parser.py | 4 ++-- tests.py | 6 ++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/nameparser/parser.py b/nameparser/parser.py index 5aa754d..a2dfa8a 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -363,8 +363,8 @@ def is_an_initial(self, value): @property def full_name(self): - """The name string to be parsed.""" - return self._full_name + """The string output of the HumanName instance.""" + return self.__str__() @full_name.setter def full_name(self, value): diff --git a/tests.py b/tests.py index b764955..5e2ddab 100644 --- a/tests.py +++ b/tests.py @@ -111,6 +111,12 @@ def test_assignment_to_full_name(self): self.m(hn.last, "Velasquez y Garcia", hn) self.m(hn.suffix, "III", hn) + def test_get_full_name_attribute_references_internal_lists(self): + hn = HumanName("John Williams") + hn.first_list = ["Larry"] + self.m(hn.full_name, "Larry Williams", hn) + + def test_assignment_to_attribute(self): hn = HumanName("John A. 
Kenneth Doe, Jr.") hn.last = "de la Vega" From ce92f379e96a324178e259a378aab628cbc12a6e Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Wed, 26 Jun 2019 18:24:25 -0700 Subject: [PATCH 081/163] v1.0.4 --- docs/release_log.rst | 5 ++++- nameparser/__init__.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/release_log.rst b/docs/release_log.rst index 59e7cbe..0700af2 100644 --- a/docs/release_log.rst +++ b/docs/release_log.rst @@ -1,6 +1,9 @@ Release Log =========== -* 1.0.3 - April 18, 2018 +* 1.0.4 - June 26, 2019 + - Better nickname handling of multiple single quotes (#86) + - full_name attribute now returns formatted string output instead of original string (#87) +* 1.0.3 - April 18, 2019 - fix sys.stdin usage when stdin doesn't exist (#82) - support for escaping log entry arguments (#84) * 1.0.2 - Oct 26, 2018 diff --git a/nameparser/__init__.py b/nameparser/__init__.py index bdcad4c..650cf32 100644 --- a/nameparser/__init__.py +++ b/nameparser/__init__.py @@ -1,4 +1,4 @@ -VERSION = (1, 0, 3) +VERSION = (1, 0, 4) __version__ = '.'.join(map(str, VERSION)) __author__ = "Derek Gulbranson" __author_email__ = 'derek73@gmail.com' From afcf74f4f0a1e2692e47ec2a4288cda0879a07eb Mon Sep 17 00:00:00 2001 From: "James C. 
Palmer" Date: Sat, 20 Jul 2019 10:20:53 -0400 Subject: [PATCH 082/163] Remove deprecated test case aliases and add editorconfig This also fixes a few minor spelling and spacing errors and ignores Pipenv files --- .editorconfig | 20 ++++++++++++++++++++ .gitignore | 2 ++ nameparser/config/capitalization.py | 10 +++++----- nameparser/config/prefixes.py | 2 +- tests.py | 10 ++++------ 5 files changed, 32 insertions(+), 12 deletions(-) create mode 100644 .editorconfig diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..55170d0 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,20 @@ +# http://editorconfig.org + +root = true + +[*] +charset = utf-8 +end_of_line = lf +insert_final_newline = true +trim_trailing_whitespace = true + +[*.{py,rst,ini}] +indent_style = space +indent_size = 4 + +[*.{html,json,yml}] +indent_style = space +indent_size = 2 + +[*.md] +trim_trailing_whitespace = false diff --git a/.gitignore b/.gitignore index c3fe42b..3874fab 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,8 @@ build .coverage dist .idea +Pipfile +Pipfile.lock # docs docs/_* diff --git a/nameparser/config/capitalization.py b/nameparser/config/capitalization.py index 4aa3214..84dfbef 100644 --- a/nameparser/config/capitalization.py +++ b/nameparser/config/capitalization.py @@ -2,11 +2,11 @@ from __future__ import unicode_literals CAPITALIZATION_EXCEPTIONS = ( - ('ii' ,'II'), - ('iii','III'), - ('iv' ,'IV'), - ('md' ,'M.D.'), - ('phd','Ph.D.'), + ('ii', 'II'), + ('iii', 'III'), + ('iv', 'IV'), + ('md', 'M.D.'), + ('phd', 'Ph.D.'), ) """ Any pieces that are not capitalized by capitalizing the first letter. diff --git a/nameparser/config/prefixes.py b/nameparser/config/prefixes.py index 542ea03..2f5eb31 100644 --- a/nameparser/config/prefixes.py +++ b/nameparser/config/prefixes.py @@ -4,7 +4,7 @@ #: Name pieces that appear before a last name. Prefixes join to the piece #: that follows them to make one new piece. 
They can be chained together, e.g #: "von der" and "de la". Because they only appear in middle or last names, -#: they also signifiy that all following name pieces should be in the same name +#: they also signify that all following name pieces should be in the same name #: part, for example, "von" will be joined to all following pieces that are not #: prefixes or suffixes, allowing recognition of double last names when they #: appear after a prefixes. So in "pennie von bergen wessels MD", "von" will diff --git a/tests.py b/tests.py index 5e2ddab..c87ad03 100644 --- a/tests.py +++ b/tests.py @@ -40,7 +40,7 @@ class HumanNameTestBase(unittest.TestCase): def m(self, actual, expected, hn): - """assertEquals with a better message and awareness of hn.C.empty_attribute_default""" + """assertEqual with a better message and awareness of hn.C.empty_attribute_default""" expected = expected or hn.C.empty_attribute_default try: self.assertEqual(actual, expected, "'%s' != '%s' for '%s'\n%r" % ( @@ -50,7 +50,7 @@ def m(self, actual, expected, hn): hn )) except UnicodeDecodeError: - self.assertEquals(actual, expected) + self.assertEqual(actual, expected) class HumanNamePythonTests(HumanNameTestBase): @@ -62,8 +62,6 @@ def test_utf8(self): def test_string_output(self): hn = HumanName("de la Véña, Jüan") - print(hn) - print(repr(hn)) def test_escaped_utf8_bytes(self): hn = HumanName(b'B\xc3\xb6ck, Gerald') @@ -1267,7 +1265,7 @@ class ConstantsCustomization(HumanNameTestBase): def test_add_title(self): hn = HumanName("Te Awanui-a-Rangi Black", constants=None) start_len = len(hn.C.titles) - self.assert_(start_len > 0) + self.assertTrue(start_len > 0) hn.C.titles.add('te') self.assertEqual(start_len + 1, len(hn.C.titles)) hn.parse_full_name() @@ -1278,7 +1276,7 @@ def test_add_title(self): def test_remove_title(self): hn = HumanName("Hon Solo", constants=None) start_len = len(hn.C.titles) - self.assert_(start_len > 0) + self.assertTrue(start_len > 0) hn.C.titles.remove('hon') 
self.assertEqual(start_len - 1, len(hn.C.titles)) hn.parse_full_name() From f4bbf42bd7b43444e15063f0866a4c4ff30a1869 Mon Sep 17 00:00:00 2001 From: "James C. Palmer" Date: Sat, 20 Jul 2019 14:17:27 -0400 Subject: [PATCH 083/163] Added capitalization rules to Constants class These changes let users apply capitalization rules to all `HumanName` instances. --- docs/customize.rst | 2 ++ docs/usage.rst | 30 +++++++++++++++++++++++++++--- nameparser/config/__init__.py | 33 +++++++++++++++++++++++++++++++-- nameparser/parser.py | 27 +++++++++++++++++++++------ tests.py | 22 ++++++++++++++++++++++ 5 files changed, 103 insertions(+), 11 deletions(-) diff --git a/docs/customize.rst b/docs/customize.rst index 6097050..1e4f38d 100644 --- a/docs/customize.rst +++ b/docs/customize.rst @@ -57,6 +57,8 @@ Other editable attributes * :py:obj:`~nameparser.config.Constants.string_format` - controls output from `str()` * :py:obj:`~nameparser.config.Constants.empty_attribute_default` - value returned by empty attributes, defaults to empty string +* :py:obj:`~nameparser.config.Constants.capitalize_name` - If set, applies :py:meth:`~nameparser.parser.HumanName.capitalize` to :py:class:`~nameparser.parser.HumanName` instance. +* :py:obj:`~nameparser.config.Constants.force_mixed_case_capitalization` - If set, forces the capitalization of mixed case strings when :py:meth:`~nameparser.parser.HumanName.capitalize` is called. diff --git a/docs/usage.rst b/docs/usage.rst index 45f67a4..6a65c4e 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -72,9 +72,8 @@ Capitalization Support The HumanName class can try to guess the correct capitalization of name entered in all upper or lower case. By default, it will not adjust -the case of names entered in mixed case. To run capitalization on all names -pass the parameter `force=True`. - +the case of names entered in mixed case. To run capitalization on a +`HumanName` instance, pass the parameter `force=True`. Capitalize the name. 
@@ -94,6 +93,31 @@ pass the parameter `force=True`. >>> str(name) 'Shirley MacLaine' +To apply capitalization to all `HumanName` instances, set +:py:attr:`~nameparser.config.Constants.capitalize_name` to `True`. + +.. doctest:: capitalize_name + :options: +NORMALIZE_WHITESPACE + + >>> from nameparser.config import CONSTANTS + >>> CONSTANTS.capitalize_name = True + >>> name = HumanName("bob v. de la macdole-eisenhower phd") + >>> str(name) + 'Bob V. de la MacDole-Eisenhower Ph.D.' + +To force the capitalization of mixed case strings on all `HumanName` instances, +set :py:attr:`~nameparser.config.Constants.force_mixed_case_capitalization` to `True`. + +.. doctest:: force_mixed_case_capitalization + :options: +NORMALIZE_WHITESPACE + + >>> from nameparser.config import CONSTANTS + >>> CONSTANTS.force_mixed_case_capitalization = True + >>> name = HumanName('Shirley Maclaine') + >>> name.capitalize() + >>> str(name) + 'Shirley MacLaine' + Nickname Handling ------------------ diff --git a/nameparser/config/__init__.py b/nameparser/config/__init__.py index b9d6e1e..602afe8 100644 --- a/nameparser/config/__init__.py +++ b/nameparser/config/__init__.py @@ -179,8 +179,37 @@ class Constants(object): 'John' """ - - + capitalize_name = False + """ + If set, applies :py:meth:`~nameparser.parser.HumanName.capitalize` to + :py:class:`~nameparser.parser.HumanName` instance. + + .. doctest:: + + >>> from nameparser.config import CONSTANTS + >>> CONSTANTS.capitalize_name = True + >>> name = HumanName("bob v. de la macdole-eisenhower phd") + >>> str(name) + 'Bob V. de la MacDole-Eisenhower Ph.D.' + + """ + force_mixed_case_capitalization = False + """ + If set, forces the capitalization of mixed case strings when + :py:meth:`~nameparser.parser.HumanName.capitalize` is called. + + .. 
doctest:: + + >>> from nameparser.config import CONSTANTS + >>> CONSTANTS.force_mixed_case_capitalization = True + >>> name = HumanName('Shirley Maclaine') + >>> name.capitalize() + >>> str(name) + 'Shirley MacLaine' + + """ + + def __init__(self, prefixes=PREFIXES, suffix_acronyms=SUFFIX_ACRONYMS, diff --git a/nameparser/parser.py b/nameparser/parser.py index a2dfa8a..e90c99b 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -387,7 +387,7 @@ def pre_process(self): This method happens at the beginning of the :py:func:`parse_full_name` before any other processing of the string aside from unicode normalization, so it's a good place to do any custom handling in a - subclass. Runs :py:func:`parse_nicknames` and py:func:`squash_emoji`. + subclass. Runs :py:func:`parse_nicknames` and :py:func:`squash_emoji`. """ self.fix_phd() @@ -397,9 +397,11 @@ def pre_process(self): def post_process(self): """ This happens at the end of the :py:func:`parse_full_name` after - all other processing has taken place. Runs :py:func:`handle_firstnames`. + all other processing has taken place. Runs :py:func:`handle_firstnames` + and :py:func:`handle_capitalization`. """ self.handle_firstnames() + self.handle_capitalization() def fix_phd(self): _re = self.C.regexes.phd @@ -675,9 +677,9 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): :param list pieces: name pieces strings after split on spaces :param int additional_parts_count: :return: new list with piece next to conjunctions merged into one piece - with spaces in it. + with spaces in it. 
:rtype: list - + """ length = len(pieces) + additional_parts_count # don't join on conjunctions if there's only 2 parts @@ -833,14 +835,16 @@ def cap_piece(self, piece, attribute): replacement = lambda m: self.cap_word(m.group(0), attribute) return self.C.regexes.word.sub(replacement, piece) - def capitalize(self, force=False): + def capitalize(self, force=None): """ The HumanName class can try to guess the correct capitalization of name entered in all upper or lower case. By default, it will not adjust the case of names entered in mixed case. To run capitalization on all names pass the parameter `force=True`. - :param bool force: force capitalization of strings that include mixed case + :param bool force: Forces capitalization of mixed case strings. This + parameter overrides rules set within + :py:class:`~nameparser.config.CONSTANTS`. **Usage** @@ -861,6 +865,9 @@ def capitalize(self, force=False): """ name = u(self) + force = self.C.force_mixed_case_capitalization \ + if force is None else force + if not force and not (name == name.upper() or name == name.lower()): return self.title_list = self.cap_piece(self.title , 'title').split(' ') @@ -868,3 +875,11 @@ def capitalize(self, force=False): self.middle_list = self.cap_piece(self.middle, 'middle').split(' ') self.last_list = self.cap_piece(self.last , 'last').split(' ') self.suffix_list = self.cap_piece(self.suffix, 'suffix').split(', ') + + def handle_capitalization(self): + """ + Handles capitalization configurations set within + :py:class:`~nameparser.config.CONSTANTS`. + """ + if self.C.capitalize_name: + self.capitalize() diff --git a/tests.py b/tests.py index c87ad03..0e948c7 100644 --- a/tests.py +++ b/tests.py @@ -2088,6 +2088,28 @@ def test_formatting_constants_attribute(self): self.assertEqual(u(hn), "TEST2") CONSTANTS.string_format = _orig + def test_capitalize_name_constants_attribute(self): + from nameparser.config import CONSTANTS + CONSTANTS.capitalize_name = True + hn = HumanName("bob v. 
de la macdole-eisenhower phd") + self.assertEqual(str(hn), "Bob V. de la MacDole-Eisenhower Ph.D.") + CONSTANTS.capitalize_name = False + + def test_force_mixed_case_capitalization_constants_attribute(self): + from nameparser.config import CONSTANTS + CONSTANTS.force_mixed_case_capitalization = True + hn = HumanName('Shirley Maclaine') + hn.capitalize() + self.assertEqual(str(hn), "Shirley MacLaine") + CONSTANTS.force_mixed_case_capitalization = False + + def test_capitalize_name_and_force_mixed_case_capitalization_constants_attributes(self): + from nameparser.config import CONSTANTS + CONSTANTS.capitalize_name = True + CONSTANTS.force_mixed_case_capitalization = True + hn = HumanName('Shirley Maclaine') + self.assertEqual(str(hn), "Shirley MacLaine") + def test_quote_nickname_formating(self): hn = HumanName("Rev John A. Kenneth Doe III (Kenny)") hn.string_format = "{title} {first} {middle} {last} {suffix} '{nickname}'" From ca408b8e1899f1cbe3ad8c5bc30fb037cc199be1 Mon Sep 17 00:00:00 2001 From: Chris Erickson Date: Fri, 2 Aug 2019 14:08:33 -0500 Subject: [PATCH 084/163] Dep warning with importing from collections --- nameparser/config/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nameparser/config/__init__.py b/nameparser/config/__init__.py index b9d6e1e..a6fb152 100644 --- a/nameparser/config/__init__.py +++ b/nameparser/config/__init__.py @@ -29,7 +29,7 @@ unexpected results. See `Customizing the Parser `_. """ from __future__ import unicode_literals -import collections +from collections.abc import Set import sys from nameparser.util import binary_type @@ -45,7 +45,7 @@ DEFAULT_ENCODING = 'UTF-8' -class SetManager(collections.Set): +class SetManager(Set): ''' Easily add and remove config variables per module or instance. Subclass of ``collections.Set``. 
From c60ba11d2526fa39ba2cd43ef1b221c7d4f8e2e0 Mon Sep 17 00:00:00 2001 From: Amrish Parmar Date: Wed, 18 Sep 2019 11:01:58 +0100 Subject: [PATCH 085/163] Fix deprecated use of `collections` module In Python 3.8 imports of abstract base classes directly from the `collections` module will stop working entirely. This change maintains existing behaviour whilst providing backwards compatibility. --- nameparser/config/__init__.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/nameparser/config/__init__.py b/nameparser/config/__init__.py index b9d6e1e..0b31c4e 100644 --- a/nameparser/config/__init__.py +++ b/nameparser/config/__init__.py @@ -29,8 +29,13 @@ unexpected results. See `Customizing the Parser `_. """ from __future__ import unicode_literals -import collections + import sys +try: + # Python 3.3+ + from collections.abc import Set +except ImportError: + from collections import Set from nameparser.util import binary_type from nameparser.util import lc @@ -45,10 +50,10 @@ DEFAULT_ENCODING = 'UTF-8' -class SetManager(collections.Set): +class SetManager(Set): ''' Easily add and remove config variables per module or instance. Subclass of - ``collections.Set``. + ``collections.abc.Set``. Only special functionality beyond that provided by set() is to normalize constants for comparison (lower case, no periods) From e83ee3183296ff5671ad2034a8dbeed6a489a3be Mon Sep 17 00:00:00 2001 From: andrew Date: Mon, 11 Nov 2019 19:08:06 -0800 Subject: [PATCH 086/163] run the parsing of pieces for post-comma parts prior to checking if all post-comma elements are suffixes. 
this ensures that suffixes are properly captured in the constants prior to the all-suffix check --- nameparser/parser.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/nameparser/parser.py b/nameparser/parser.py index a2dfa8a..5a64af3 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -528,6 +528,9 @@ def parse_full_name(self): # in the first part. (Suffixes will never appear after last names # only, and allows potential first names to be in suffixes, e.g. # "Johnson, Bart" + + post_comma_pieces = self.parse_pieces(parts[1].split(' '), 1) + if self.are_suffixes(parts[1].split(' ')) \ and len(parts[0].split(' ')) > 1: @@ -566,9 +569,8 @@ def parse_full_name(self): # lastname comma: # last [suffix], title first middles[,] suffix [,suffix] # parts[0], parts[1], parts[2:...] - pieces = self.parse_pieces(parts[1].split(' '), 1) - log.debug("pieces: %s", u(pieces)) + log.debug("post-comma pieces: %s", u(post_comma_pieces)) # lastname part may have suffixes in it lastname_pieces = self.parse_pieces(parts[0].split(' '), 1) @@ -580,14 +582,14 @@ def parse_full_name(self): else: self.last_list.append(piece) - for i, piece in enumerate(pieces): + for i, piece in enumerate(post_comma_pieces): try: - nxt = pieces[i + 1] + nxt = post_comma_pieces[i + 1] except IndexError: nxt = None if self.is_title(piece) \ - and (nxt or len(pieces) == 1) \ + and (nxt or len(post_comma_pieces) == 1) \ and not self.first: self.title_list.append(piece) continue From 2fb91dcb4adca54cd5dfd333889e15d13d13969c Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Wed, 11 Dec 2019 19:03:32 -0800 Subject: [PATCH 087/163] remove deprecated python versions no longer supported by travis --- .travis.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index dc37c42..62e7e8b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,14 +1,12 @@ language: python python: - - "2.6" - "2.7" - - "3.2" - - "3.3" - "3.4" - "3.5" - 
"3.6" + - "3.7" # command to install dependencies -install: +install: - if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install unittest2; fi - "pip install dill" - "python setup.py install" From be1e85185f2083b708360b1286f508b1afb7b7f4 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Wed, 11 Dec 2019 19:05:32 -0800 Subject: [PATCH 088/163] add python 3.8 to travis --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 62e7e8b..45394bd 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,6 +5,7 @@ python: - "3.5" - "3.6" - "3.7" + - "3.8" # command to install dependencies install: - if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install unittest2; fi From 72d2a13d4cecc83af344109ad562b260de3903ee Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Wed, 11 Dec 2019 19:33:25 -0800 Subject: [PATCH 089/163] fix duplicate import from merge --- nameparser/config/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nameparser/config/__init__.py b/nameparser/config/__init__.py index c1f7bbc..4f1e4f2 100644 --- a/nameparser/config/__init__.py +++ b/nameparser/config/__init__.py @@ -29,7 +29,6 @@ unexpected results. See `Customizing the Parser `_. 
""" from __future__ import unicode_literals -from collections.abc import Set import sys try: # Python 3.3+ From c0d14c5300136bffdbcf993c3a05bb8c813b348b Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Wed, 11 Dec 2019 20:37:28 -0800 Subject: [PATCH 090/163] remove Elder from constants because it is sometimes a first name, fix #96 --- nameparser/config/titles.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nameparser/config/titles.py b/nameparser/config/titles.py index dfcbd07..3d5892f 100644 --- a/nameparser/config/titles.py +++ b/nameparser/config/titles.py @@ -259,7 +259,6 @@ 'educator', 'effendi', 'ekegbian', - 'elder', 'elerunwon', 'eminence', 'emperor', From af5bdabc160fc15054b59e078c658ac80a3cb1ff Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Wed, 11 Dec 2019 21:01:27 -0800 Subject: [PATCH 091/163] add post-nominal list from wikipedia. fix #93 --- nameparser/config/suffixes.py | 533 +++++++++++++++++++++++++++++++++- 1 file changed, 531 insertions(+), 2 deletions(-) diff --git a/nameparser/config/suffixes.py b/nameparser/config/suffixes.py index 7f01581..9765b92 100644 --- a/nameparser/config/suffixes.py +++ b/nameparser/config/suffixes.py @@ -24,104 +24,633 @@ """ SUFFIX_ACRONYMS = set([ + '(ret)', + '(vet)', + '8-vsb', + 'aas', + 'aba', + 'abc', + 'abd', + 'abpp', + 'abr', + 'aca', + 'acas', + 'ace', + 'acha', + 'acp', 'ae', + 'ae', + 'aem', + 'afasma', + 'afc', 'afc', 'afm', + 'afm', + 'agsf', + 'aia', + 'aicp', + 'ala', + 'alc', + 'alp', + 'am', + 'amd', + 'ame', + 'amieee', + 'ams', + 'aphr', + 'apn aprn', + 'apr', + 'apss', + 'aqp', + 'arm', 'arrc', - 'bart', + 'asa', + 'asc', + 'asid', + 'asla', + 'asp', + 'atc', + 'awb', + 'bca', + 'bcl', + 'bcss', + 'bds', + 'bem', 'bem', + 'bls-i', + 'bpe', + 'bpi', + 'bpt', 'bt', + 'btcs', + 'bts', + 'cacts', + 'cae', + 'caha', + 'caia', + 'cams', + 'cap', + 'capa', + 'capm', + 'capp', + 'caps', + 'caro', + 'cas', + 'casp', 'cb', 'cbe', + 'cbm', + 'cbne', + 'cbnt', + 'cbp', + 'cbrte', + 'cbs', + 
'cbsp', + 'cbt', + 'cbte', + 'cbv', + 'cca', + 'ccc', + 'ccca', + 'cccm', + 'cce', + 'cchp', + 'ccie', + 'ccim', + 'cciso', + 'ccm', + 'ccmt', + 'ccna', + 'ccnp', + 'ccp', + 'ccp-c', + 'ccpr', + 'ccs', + 'ccufc', + 'cd', + 'cdal', + 'cdfm', + 'cdmp', + 'cds', + 'cdt', + 'cea', + 'ceas', + 'cebs', + 'ceds', + 'ceh', + 'cela', + 'cem', + 'cep', + 'cera', + 'cet', + 'cfa', + 'cfc', + 'cfcc', + 'cfce', + 'cfcm', + 'cfe', + 'cfeds', + 'cfi', + 'cfm', 'cfp', + 'cfps', + 'cfr', + 'cfre', + 'cga', + 'cgap', + 'cgb', 'cgc', + 'cgfm', + 'cgfo', 'cgm', + 'cgm', + 'cgma', + 'cgp', + 'cgr', + 'cgsp', + 'ch', 'ch', + 'cha', + 'chba', + 'chdm', + 'che', + 'ches', + 'chfc', 'chfc', + 'chi', + 'chmc', + 'chmm', + 'chp', + 'chpa', + 'chpe', + 'chpln', + 'chpse', + 'chrm', + 'chsc', + 'chse', + 'chse-a', + 'chsos', + 'chss', + 'cht', + 'cia', + 'cic', 'cie', + 'cig', + 'cip', + 'cipm', + 'cips', + 'ciro', + 'cisa', + 'cism', + 'cissp', + 'cla', + 'clsd', + 'cltd', 'clu', + 'cm', + 'cma', + 'cmas', + 'cmc', + 'cmfo', 'cmg', + 'cmp', + 'cms', + 'cmsp', + 'cmt', + 'cna', + 'cnm', + 'cnp', + 'cp', + 'cp-c', 'cpa', + 'cpacc', + 'cpbe', + 'cpcm', + 'cpcu', + 'cpe', + 'cpfa', + 'cpfo', + 'cpg', + 'cph', + 'cpht', + 'cpim', + 'cpl', + 'cplp', 'cpm', + 'cpo', + 'cpp', + 'cprc', + 'cpre', + 'cprp', + 'cpsc', + 'cpsi', + 'cpss', + 'cpt', + 'cpwa', + 'crde', + 'crisc', + 'crma', + 'crme', + 'crna', + 'cro', + 'crp', + 'crt', + 'crtt', + 'csa', + 'csbe', + 'csc', + 'cscp', + 'cscu', + 'csep', 'csi', 'csm', + 'csp', + 'cspo', + 'csre', + 'csrte', + 'csslp', + 'cssm', + 'cst', + 'cste', + 'ctbs', + 'ctfa', + 'cto', + 'ctp', + 'cts', + 'cua', + 'cusp', + 'cva', + 'cva', 'cvo', + 'cvp', + 'cvrs', + 'cwap', + 'cwb', + 'cwdp', + 'cwep', + 'cwna', + 'cwne', + 'cwp', + 'cwsp', + 'cxa', + 'cyds', + 'cysa', + 'dabfm', + 'dabvlm', + 'dacvim', 'dbe', + 'dc', 'dcb', 'dcm', 'dcmg', 'dcvo', + 'dd', 'dds', + 'ded', + 'dep', 'dfc', 'dfm', + 'diplac', + 'diplom', + 'djur', + 'dma', 'dmd', + 'dmin', + 'dnp', 
'do', 'dpm', + 'dpt', + 'drb', + 'drmp', + 'drph', 'dsc', 'dsm', 'dso', + 'dss', + 'dtr', + 'dvep', 'dvm', + 'ea', 'ed', + 'edd', + 'ei', + 'eit', + 'els', + 'emd', + 'emt-b', + 'emt-i/85', + 'emt-i/99', + 'emt-p', + 'enp', 'erd', + 'esq', + 'evp', + 'faafp', + 'faan', + 'faap', + 'fac-c', + 'facc', + 'facd', + 'facem', + 'facep', + 'facha', + 'facofp', + 'facog', + 'facp', + 'facph', + 'facs', + 'faia', + 'faicp', + 'fala', + 'fashp', + 'fasid', + 'fasla', + 'fasma', + 'faspen', + 'fca', + 'fcas', + 'fcela', + 'fd', + 'fec', + 'fhames', + 'fic', + 'ficf', + 'fieee', + 'fmp', + 'fmva', + 'fnss', + 'fp&a', + 'fp-c', + 'fpc', + 'frm', + 'fsa', + 'fsdp', + 'fws', + 'gaee', + 'gba', 'gbe', 'gc', 'gcb', + 'gcb', + 'gchs', 'gcie', 'gcmg', + 'gcmg', 'gcsi', 'gcvo', + 'gcvo', + 'gisp', + 'git', 'gm', + 'gmb', + 'gmr', + 'gphr', + 'gri', + 'grp', + 'gsmieee', + 'hccp', + 'hrs', + 'iaccp', + 'iaee', + 'iccm-d', + 'iccm-f', 'idsm', + 'ifgict', 'iom', + 'ipep', + 'ipm', 'iso', + 'issp-csp', + 'issp-sa', + 'itil', 'jd', + 'jp', 'kbe', 'kcb', + 'kchs/dchs', + 'kcie', 'kcie', 'kcmg', 'kcsi', + 'kcsi', 'kcvo', 'kg', + 'khs/dhs', 'kp', 'kt', + 'lac', + 'lcmt', + 'lcpc', + 'lcsw', + 'leed ap', 'lg', + 'litk', + 'litl', + 'litp', + 'llm', + 'lm', + 'lmsw', + 'lmt', + 'lp', + 'lpa', + 'lpc', + 'lpn', + 'lpss', + 'lsi', + 'lsit', 'lt', + 'lvn', 'lvo', + 'lvt', 'ma', + 'maaa', + 'mai', 'mba', 'mbe', + 'mbs', 'mc', + 'mcct', + 'mcdba', + 'mches', + 'mcm', + 'mcp', + 'mcpd', + 'mcsa', + 'mcsd', + 'mcse', + 'mct', 'md', + 'mdiv', + 'mem', + 'mfa', + 'micp', + 'mieee', + 'mirm', + 'mle', + 'mls', + 'mlse', + 'mlt', 'mm', + 'mmad', + 'mmas', + 'mnaa', + 'mnae', 'mp', + 'mpa', + 'mph', + 'mpse', + 'mra', + 'ms', + 'msa', - 'msc' + 'msc', + 'mscmsm', 'msm', + 'mt', + 'mts', 'mvo', + 'nbc-his', + 'nbcch', + 'nbcch-ps', + 'nbcdch', + 'nbcdch-ps', + 'nbcfch', + 'nbcfch-ps', + 'nbct', + 'ncarb', + 'nccp', + 'ncidq', + 'ncps', + 'ncso', + 'ncto', + 'nd', + 'ndtr', + 'nicet i', + 'nicet ii', + 'nicet iii', 
+ 'nicet iv', + 'nmd', + 'np', + 'np', + 'nraemt', + 'nremr', + 'nremt', + 'nrp', 'obe', 'obi', + 'oca', + 'ocm', + 'ocp', + 'od', 'om', + 'oscp', + 'ot', + 'pa-c', + 'pcc', + 'pci', + 'pe', + 'pfmp', + 'pg', + 'pgmp', + 'ph', + 'pharmd', + 'phc', 'phd', 'phr', + 'phrca', + 'pla', + 'pls', + 'pmc', + 'pmi-acp', 'pmp', + 'pp', + 'pps', + 'prm', + 'psm i', + 'psm ii', + 'psm', + 'psp', + 'psyd', + 'pt', + 'pta', 'qam', 'qc', + 'qcsw', 'qfsm', 'qgm', 'qpm', + 'qsd', + 'qsp', + 'ra', + 'rai', + 'rba', + 'rci', + 'rcp', 'rd', + 'rdcs', + 'rdh', + 'rdms', + 'rdn', + 'res', + 'rfp', + 'rhca', + 'rid', + 'rls', + 'rmsks', + 'rn', + 'rp', + 'rpa', + 'rph', + 'rpl', 'rrc', + 'rrt', + 'rrt-accs', + 'rrt-nps', + 'rrt-sds', + 'rtrp', 'rvm', + 'rvt', + 'sa', + 'same', + 'sasm', + 'sccp', + 'scmp', + 'se', + 'secb', + 'sfp', 'sgm', + 'shrm-cp', + 'shrm-scp', + 'si', + 'siie', + 'smieee', + 'sphr', + 'sra', + 'sscp', + 'stmieee', + 'tbr-ct', 'td', + 'thd', + 'thm', 'ud', + 'usa', + 'usaf', + 'usar', + 'uscg', + 'usmc', + 'usn', + 'usnr', + 'uxc', + 'uxmc', + 'vc', 'vc', + 'vcp', 'vd', 'vrd', ]) """ Post-nominal acronyms. Titles, degrees and other things people stick after their name -that may or may not have periods between the letters. The parser removes periods +that may or may not have periods between the letters. The parser removes periods when matching against these pieces. 
""" From ba4d69a66d23f0bbe3b9e6894c9b76258b213221 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Wed, 11 Dec 2019 21:03:22 -0800 Subject: [PATCH 092/163] v1.0.5, update release log --- docs/release_log.rst | 6 ++++++ nameparser/__init__.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/release_log.rst b/docs/release_log.rst index 0700af2..f53cbcc 100644 --- a/docs/release_log.rst +++ b/docs/release_log.rst @@ -1,5 +1,11 @@ Release Log =========== +* 1.0.5 - Dec 12, 2019 + - Fix suffix parsing bug in comma parts (#98) + - Fix deprecation warning on Python 3.7 (#94) + - Improved capitalization support of mixed case names (#90) + - Remove "elder" from titles (#96) + - Add post-nominal list from Wikipedia to suffixes (#93) * 1.0.4 - June 26, 2019 - Better nickname handling of multiple single quotes (#86) - full_name attribute now returns formatted string output instead of original string (#87) diff --git a/nameparser/__init__.py b/nameparser/__init__.py index 650cf32..b2644e7 100644 --- a/nameparser/__init__.py +++ b/nameparser/__init__.py @@ -1,4 +1,4 @@ -VERSION = (1, 0, 4) +VERSION = (1, 0, 5) __version__ = '.'.join(map(str, VERSION)) __author__ = "Derek Gulbranson" __author_email__ = 'derek73@gmail.com' From 6da7fb3d53bb0d25858a5df0f7e0a37d2aab8f2a Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Wed, 11 Dec 2019 21:09:04 -0800 Subject: [PATCH 093/163] remove tests for Bart as suffix, bart no longer in suffixes --- tests.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/tests.py b/tests.py index 0e948c7..5f976b8 100644 --- a/tests.py +++ b/tests.py @@ -1727,19 +1727,7 @@ def test_potential_suffix_that_is_also_last_name_comma(self): hn = HumanName("Ma, Jack") self.m(hn.first, "Jack", hn) self.m(hn.last, "Ma", hn) - - def test_potential_suffix_that_is_also_first_name_comma(self): - hn = HumanName("Johnson, Bart") - self.m(hn.first, "Bart", hn) - self.m(hn.last, "Johnson", hn) - - # TODO: handle 
conjunctions in last names followed by first names clashing with suffixes - @unittest.expectedFailure - def test_potential_suffix_that_is_also_first_name_comma_with_conjunction(self): - hn = HumanName("De la Vina, Bart") - self.m(hn.first, "Bart", hn) - self.m(hn.last, "De la Vina", hn) - + def test_potential_suffix_that_is_also_last_name_with_suffix(self): hn = HumanName("Jack Ma Jr") self.m(hn.first, "Jack", hn) From 859eb38e74e25ceb9ba55b70ec60aecb0462ea1e Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Wed, 11 Dec 2019 22:07:27 -0800 Subject: [PATCH 094/163] Add publish python package workflow --- .github/workflows/pythonpublish.yml | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 .github/workflows/pythonpublish.yml diff --git a/.github/workflows/pythonpublish.yml b/.github/workflows/pythonpublish.yml new file mode 100644 index 0000000..21f2f01 --- /dev/null +++ b/.github/workflows/pythonpublish.yml @@ -0,0 +1,26 @@ +name: Upload Python Package + +on: + release: + types: [created] + +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v1 + - name: Set up Python + uses: actions/setup-python@v1 + with: + python-version: '3.x' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install setuptools wheel twine + - name: Build and publish + env: + TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} + TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + run: | + python setup.py sdist bdist_wheel + twine upload dist/* From ee8b8e047febd7b64fd97cbc7c717df02eab2950 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Tue, 4 Feb 2020 19:09:51 -0800 Subject: [PATCH 095/163] remove publish workflow --- .github/workflows/pythonpublish.yml | 26 -------------------------- 1 file changed, 26 deletions(-) delete mode 100644 .github/workflows/pythonpublish.yml diff --git a/.github/workflows/pythonpublish.yml b/.github/workflows/pythonpublish.yml deleted file mode 100644 index 21f2f01..0000000 
--- a/.github/workflows/pythonpublish.yml +++ /dev/null @@ -1,26 +0,0 @@ -name: Upload Python Package - -on: - release: - types: [created] - -jobs: - deploy: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v1 - - name: Set up Python - uses: actions/setup-python@v1 - with: - python-version: '3.x' - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install setuptools wheel twine - - name: Build and publish - env: - TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} - TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} - run: | - python setup.py sdist bdist_wheel - twine upload dist/* From 845d61a5978957cdc6276991730140ffb76ef3cd Mon Sep 17 00:00:00 2001 From: Karthikeyan Singaravelan Date: Sat, 8 Feb 2020 10:16:33 +0530 Subject: [PATCH 096/163] Fix syntax warnings due to comparison of literals using is. --- nameparser/parser.py | 264 +++++++++++++++++++++---------------------- 1 file changed, 132 insertions(+), 132 deletions(-) diff --git a/nameparser/parser.py b/nameparser/parser.py index caf4ded..bd79057 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -17,7 +17,7 @@ def group_contiguous_integers(data): """ - return list of tuples containing first and last index + return list of tuples containing first and last index position of contiguous numbers in a series """ ranges = [] @@ -30,13 +30,13 @@ def group_contiguous_integers(data): class HumanName(object): """ Parse a person's name into individual components. - + Instantiation assigns to ``full_name``, and assignment to :py:attr:`full_name` triggers :py:func:`parse_full_name`. After parsing the name, these instance attributes are available. - + **HumanName Instance Attributes** - + * :py:attr:`title` * :py:attr:`first` * :py:attr:`middle` @@ -46,61 +46,61 @@ class HumanName(object): * :py:attr:`surnames` :param str full_name: The name string to be parsed. - :param constants constants: - a :py:class:`~nameparser.config.Constants` instance. 
Pass ``None`` for - `per-instance config `_. + :param constants constants: + a :py:class:`~nameparser.config.Constants` instance. Pass ``None`` for + `per-instance config `_. :param str encoding: string representing the encoding of your input - :param str string_format: python string formatting + :param str string_format: python string formatting """ - + C = CONSTANTS """ A reference to the configuration for this instance, which may or may not be - a reference to the shared, module-wide instance at - :py:mod:`~nameparser.config.CONSTANTS`. See `Customizing the Parser + a reference to the shared, module-wide instance at + :py:mod:`~nameparser.config.CONSTANTS`. See `Customizing the Parser `_. """ - + original = '' """ The original string, untouched by the parser. """ - + _count = 0 _members = ['title','first','middle','last','suffix','nickname'] unparsable = True _full_name = '' - + def __init__(self, full_name="", constants=CONSTANTS, encoding=DEFAULT_ENCODING, string_format=None): self.C = constants if type(self.C) is not type(CONSTANTS): self.C = Constants() - + self.encoding = encoding self.string_format = string_format or self.C.string_format # full_name setter triggers the parse self.full_name = full_name - + def __iter__(self): return self - + def __len__(self): l = 0 for x in self: l += 1 return l - + def __eq__(self, other): """ - HumanName instances are equal to other objects whose + HumanName instances are equal to other objects whose lower case unicode representation is the same. 
""" return (u(self)).lower() == (u(other)).lower() - + def __ne__(self, other): return not (u(self)).lower() == (u(other)).lower() - + def __getitem__(self, key): if isinstance(key, slice): return [getattr(self, x) for x in self._members[key]] @@ -133,12 +133,12 @@ def __unicode__(self): _s = _s.replace(str(self.C.empty_attribute_default),'').replace(" ()","").replace(" ''","").replace(' ""',"") return self.collapse_whitespace(_s).strip(', ') return " ".join(self) - + def __str__(self): if sys.version_info[0] >= 3: return self.__unicode__() return self.__unicode__().encode(self.encoding) - + def __repr__(self): if self.unparsable: _string = "<%(class)s : [ Unparsable ] >" % {'class': self.__class__.__name__,} @@ -155,22 +155,22 @@ def __repr__(self): if sys.version_info[0] >= 3: return _string return _string.encode(self.encoding) - + def as_dict(self, include_empty=True): """ Return the parsed name as a dictionary of its attributes. - + :param bool include_empty: Include keys in the dictionary for empty name attributes. :rtype: dict - + .. doctest:: - + >>> name = HumanName("Bob Dole") >>> name.as_dict() {'last': 'Dole', 'suffix': '', 'title': '', 'middle': '', 'nickname': '', 'first': 'Bob'} >>> name.as_dict(False) {'last': 'Dole', 'first': 'Bob'} - + """ d = {} for m in self._members: @@ -181,66 +181,66 @@ def as_dict(self, include_empty=True): if val: d[m] = val return d - + @property def has_own_config(self): """ - True if this instance is not using the shared module-level + True if this instance is not using the shared module-level configuration. """ return self.C is not CONSTANTS - + ### attributes - + @property def title(self): """ - The person's titles. Any string of consecutive pieces in - :py:mod:`~nameparser.config.titles` or + The person's titles. Any string of consecutive pieces in + :py:mod:`~nameparser.config.titles` or :py:mod:`~nameparser.config.conjunctions` at the beginning of :py:attr:`full_name`. 
""" return " ".join(self.title_list) or self.C.empty_attribute_default - + @property def first(self): """ - The person's first name. The first name piece after any known + The person's first name. The first name piece after any known :py:attr:`title` pieces parsed from :py:attr:`full_name`. """ return " ".join(self.first_list) or self.C.empty_attribute_default - + @property def middle(self): """ - The person's middle names. All name pieces after the first name and + The person's middle names. All name pieces after the first name and before the last name parsed from :py:attr:`full_name`. """ return " ".join(self.middle_list) or self.C.empty_attribute_default - + @property def last(self): """ - The person's last name. The last name piece parsed from + The person's last name. The last name piece parsed from :py:attr:`full_name`. """ return " ".join(self.last_list) or self.C.empty_attribute_default - + @property def suffix(self): """ The persons's suffixes. Pieces at the end of the name that are found in :py:mod:`~nameparser.config.suffixes`, or pieces that are at the end - of comma separated formats, e.g. - "Lastname, Title Firstname Middle[,] Suffix [, Suffix]" parsed + of comma separated formats, e.g. + "Lastname, Title Firstname Middle[,] Suffix [, Suffix]" parsed from :py:attr:`full_name`. """ return ", ".join(self.suffix_list) or self.C.empty_attribute_default - + @property def nickname(self): """ - The person's nicknames. Any text found inside of quotes (``""``) or + The person's nicknames. Any text found inside of quotes (``""``) or parenthesis (``()``) """ return " ".join(self.nickname_list) or self.C.empty_attribute_default @@ -260,7 +260,7 @@ def surnames(self): return " ".join(self.surnames_list) or self.C.empty_attribute_default ### setter methods - + def _set_list(self, attr, value): if isinstance(value, list): val = value @@ -273,60 +273,60 @@ def _set_list(self, attr, value): "Can only assign strings, lists or None to name attributes." 
" Got {0}".format(type(value))) setattr(self, attr+"_list", self.parse_pieces(val)) - + @title.setter def title(self, value): self._set_list('title', value) - + @first.setter def first(self, value): self._set_list('first', value) - + @middle.setter def middle(self, value): self._set_list('middle', value) - + @last.setter def last(self, value): self._set_list('last', value) - + @suffix.setter def suffix(self, value): self._set_list('suffix', value) - + @nickname.setter def nickname(self, value): self._set_list('nickname', value) - + ### Parse helpers - + def is_title(self, value): """Is in the :py:data:`~nameparser.config.titles.TITLES` set.""" return lc(value) in self.C.titles - + def is_conjunction(self, piece): """Is in the conjuctions set and not :py:func:`is_an_initial()`.""" return piece.lower() in self.C.conjunctions and not self.is_an_initial(piece) - + def is_prefix(self, piece): """ - Lowercase and no periods version of piece is in the + Lowercase and no periods version of piece is in the :py:data:`~nameparser.config.prefixes.PREFIXES` set. """ return lc(piece) in self.C.prefixes def is_roman_numeral(self, value): """ - Matches the ``roman_numeral`` regular expression in + Matches the ``roman_numeral`` regular expression in :py:data:`~nameparser.config.regexes.REGEXES`. """ return bool(self.C.regexes.roman_numeral.match(value)) - + def is_suffix(self, piece): """ - Is in the suffixes set and not :py:func:`is_an_initial()`. - - Some suffixes may be acronyms (M.B.A) while some are not (Jr.), + Is in the suffixes set and not :py:func:`is_an_initial()`. + + Some suffixes may be acronyms (M.B.A) while some are not (Jr.), so we remove the periods from `piece` when testing against `C.suffix_acronyms`. """ @@ -341,31 +341,31 @@ def are_suffixes(self, pieces): if not self.is_suffix(piece): return False return True - + def is_rootname(self, piece): """ Is not a known title, suffix or prefix. Just first, middle, last names. 
""" return lc(piece) not in self.C.suffixes_prefixes_titles \ - and not self.is_an_initial(piece) - + and not self.is_an_initial(piece) + def is_an_initial(self, value): """ Words with a single period at the end, or a single uppercase letter. - - Matches the ``initial`` regular expression in + + Matches the ``initial`` regular expression in :py:data:`~nameparser.config.regexes.REGEXES`. """ return bool(self.C.regexes.initial.match(value)) - + ### full_name parser - + @property def full_name(self): """The string output of the HumanName instance.""" return self.__str__() - + @full_name.setter def full_name(self, value): self.original = value @@ -373,7 +373,7 @@ def full_name(self, value): if isinstance(value, binary_type): self._full_name = value.decode(self.encoding) self.parse_full_name() - + def collapse_whitespace(self, string): # collapse multiple spaces into single space string = self.C.regexes.spaces.sub(" ", string.strip()) @@ -383,12 +383,12 @@ def collapse_whitespace(self, string): def pre_process(self): """ - + This method happens at the beginning of the :py:func:`parse_full_name` before any other processing of the string aside from unicode normalization, so it's a good place to do any custom handling in a subclass. Runs :py:func:`parse_nicknames` and :py:func:`squash_emoji`. - + """ self.fix_phd() self.parse_nicknames() @@ -412,17 +412,17 @@ def fix_phd(self): def parse_nicknames(self): """ - The content of parenthesis or quotes in the name will be added to the + The content of parenthesis or quotes in the name will be added to the nicknames list. This happens before any other processing of the name. - + Single quotes cannot span white space characters and must border white space to allow for quotes in names like O'Connor and Kawai'ae'a. Double quotes and parenthesis can span white space. 
- - Loops through 3 :py:data:`~nameparser.config.regexes.REGEXES`; + + Loops through 3 :py:data:`~nameparser.config.regexes.REGEXES`; `quoted_word`, `double_quotes` and `parenthesis`. """ - + re_quoted_word = self.C.regexes.quoted_word re_double_quotes = self.C.regexes.double_quotes re_parenthesis = self.C.regexes.parenthesis @@ -445,7 +445,7 @@ def handle_firstnames(self): If there are only two parts and one is a title, assume it's a last name instead of a first name. e.g. Mr. Johnson. Unless it's a special title like "Sir", then when it's followed by a single name that name is always - a first name. + a first name. """ if self.title \ and len(self) == 2 \ @@ -454,18 +454,18 @@ def handle_firstnames(self): def parse_full_name(self): """ - + The main parse method for the parser. This method is run upon assignment to the :py:attr:`full_name` attribute or instantiation. Basic flow is to hand off to :py:func:`pre_process` to handle nicknames. It then splits on commas and chooses a code path depending on the number of commas. - + :py:func:`parse_pieces` then splits those parts on spaces and - :py:func:`join_on_conjunctions` joins any pieces next to conjunctions. + :py:func:`join_on_conjunctions` joins any pieces next to conjunctions. 
""" - + self.title_list = [] self.first_list = [] self.middle_list = [] @@ -473,23 +473,23 @@ def parse_full_name(self): self.suffix_list = [] self.nickname_list = [] self.unparsable = True - - + + self.pre_process() - + self._full_name = self.collapse_whitespace(self._full_name) - + # break up full_name by commas parts = [x.strip() for x in self._full_name.split(",")] - + log.debug("full_name: %s", self._full_name) log.debug("parts: %s", parts) - + if len(parts) == 1: - + # no commas, title first middle middle middle last suffix # part[0] - + pieces = self.parse_pieces(parts) p_len = len(pieces) for i, piece in enumerate(pieces): @@ -497,7 +497,7 @@ def parse_full_name(self): nxt = pieces[i + 1] except IndexError: nxt = None - + # title must have a next piece, unless it's just a title if self.is_title(piece) \ and (nxt or p_len == 1) \ @@ -511,10 +511,10 @@ def parse_full_name(self): self.first_list.append(piece) continue if self.are_suffixes(pieces[i+1:]) or \ - ( + ( # if the next piece is the last piece and a roman # numeral but this piece is not an initial - self.is_roman_numeral(nxt) and i == p_len - 2 + self.is_roman_numeral(nxt) and i == p_len - 2 and not self.is_an_initial(piece) ): self.last_list.append(piece) @@ -523,7 +523,7 @@ def parse_full_name(self): if not nxt: self.last_list.append(piece) continue - + self.middle_list.append(piece) else: # if all the end parts are suffixes and there is more than one piece @@ -535,12 +535,12 @@ def parse_full_name(self): if self.are_suffixes(parts[1].split(' ')) \ and len(parts[0].split(' ')) > 1: - - # suffix comma: + + # suffix comma: # title first middle last [suffix], suffix [suffix] [, suffix] # parts[0], parts[1:...] 
- - + + self.suffix_list += parts[1:] pieces = self.parse_pieces(parts[0].split(' ')) log.debug("pieces: %s", u(pieces)) @@ -567,13 +567,13 @@ def parse_full_name(self): continue self.middle_list.append(piece) else: - - # lastname comma: + + # lastname comma: # last [suffix], title first middles[,] suffix [,suffix] # parts[0], parts[1], parts[2:...] - + log.debug("post-comma pieces: %s", u(post_comma_pieces)) - + # lastname part may have suffixes in it lastname_pieces = self.parse_pieces(parts[0].split(' '), 1) for piece in lastname_pieces: @@ -583,13 +583,13 @@ def parse_full_name(self): self.suffix_list.append(piece) else: self.last_list.append(piece) - + for i, piece in enumerate(post_comma_pieces): try: nxt = post_comma_pieces[i + 1] except IndexError: nxt = None - + if self.is_title(piece) \ and (nxt or len(post_comma_pieces) == 1) \ and not self.first: @@ -607,7 +607,7 @@ def parse_full_name(self): self.suffix_list += parts[2:] except IndexError: pass - + if len(self) < 0: log.info("Unparsable: \"%s\" ", self.original) else: @@ -621,24 +621,24 @@ def parse_pieces(self, parts, additional_parts_count=0): lastname prefixes. If parts have periods in the middle, try splitting on periods and check if the parts are titles or suffixes. If they are add to the constant so they will be found. - + :param list parts: name part strings from the comma split - :param int additional_parts_count: - - if the comma format contains other parts, we need to know - how many there are to decide if things should be considered a + :param int additional_parts_count: + + if the comma format contains other parts, we need to know + how many there are to decide if things should be considered a conjunction. :return: pieces split on spaces and joined on conjunctions :rtype: list """ - + output = [] for part in parts: if not isinstance(part, text_types): raise TypeError("Name parts must be strings. 
" "Got {0}".format(type(part))) output += [x.strip(' ,') for x in part.split(' ')] - + # If part contains periods, check if it's multiple titles or suffixes # together without spaces if so, add the new part with periods to the # constants so they get parsed correctly later @@ -650,7 +650,7 @@ def parse_pieces(self, parts, additional_parts_count=0): period_chunks = part.split(".") titles = list(filter(self.is_title, period_chunks)) suffixes = list(filter(self.is_suffix, period_chunks)) - + # add the part to the constant so it will be found if len(list(titles)): self.C.titles.add(part) @@ -658,27 +658,27 @@ def parse_pieces(self, parts, additional_parts_count=0): if len(list(suffixes)): self.C.suffix_not_acronyms.add(part) continue - + return self.join_on_conjunctions(output, additional_parts_count) - + def join_on_conjunctions(self, pieces, additional_parts_count=0): """ Join conjunctions to surrounding pieces. Title- and prefix-aware. e.g.: - + ['Mr.', 'and'. 'Mrs.', 'John', 'Doe'] ==> ['Mr. and Mrs.', 'John', 'Doe'] - + ['The', 'Secretary', 'of', 'State', 'Hillary', 'Clinton'] ==> ['The Secretary of State', 'Hillary', 'Clinton'] - + When joining titles, saves newly formed piece to the instance's titles constant so they will be parsed correctly later. E.g. after parsing the example names above, 'The Secretary of State' and 'Mr. and Mrs.' would be present in the titles constant set. - + :param list pieces: name pieces strings after split on spaces - :param int additional_parts_count: - :return: new list with piece next to conjunctions merged into one piece + :param int additional_parts_count: + :return: new list with piece next to conjunctions merged into one piece with spaces in it. 
:rtype: list @@ -739,7 +739,7 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): # http://code.google.com/p/python-nameparser/issues/detail?id=11 continue - if i is 0: + if i == 0: new_piece = " ".join(pieces[i:i+2]) if self.is_title(pieces[i+1]): # when joining to a title, make new_piece a title too @@ -812,10 +812,10 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): log.debug("pieces: %s", pieces) return pieces - - + + ### Capitalization Support - + def cap_word(self, word, attribute): if (self.is_prefix(word) and attribute in ('last','middle')) \ or self.is_conjunction(word): @@ -843,15 +843,15 @@ def capitalize(self, force=None): entered in all upper or lower case. By default, it will not adjust the case of names entered in mixed case. To run capitalization on all names pass the parameter `force=True`. - + :param bool force: Forces capitalization of mixed case strings. This parameter overrides rules set within :py:class:`~nameparser.config.CONSTANTS`. **Usage** - + .. doctest:: capitalize - + >>> name = HumanName('bob v. 
de la macdole-eisenhower phd') >>> name.capitalize() >>> str(name) @@ -859,12 +859,12 @@ def capitalize(self, force=None): >>> # Don't touch good names >>> name = HumanName('Shirley Maclaine') >>> name.capitalize() - >>> str(name) + >>> str(name) 'Shirley Maclaine' >>> name.capitalize(force=True) - >>> str(name) + >>> str(name) 'Shirley MacLaine' - + """ name = u(self) force = self.C.force_mixed_case_capitalization \ From 1d0b8f85bdb64e6e0826760409bd1f759295a8a5 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Sat, 8 Feb 2020 13:35:56 -0800 Subject: [PATCH 097/163] v1.0.6 --- docs/release_log.rst | 12 +++++++----- nameparser/__init__.py | 2 +- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/docs/release_log.rst b/docs/release_log.rst index f53cbcc..38e76e4 100644 --- a/docs/release_log.rst +++ b/docs/release_log.rst @@ -1,5 +1,7 @@ Release Log =========== +* 1.0.6 - February 8, 2020 + - Fix Python 3.8 syntax error (#104) * 1.0.5 - Dec 12, 2019 - Fix suffix parsing bug in comma parts (#98) - Fix deprecation warning on Python 3.7 (#94) @@ -110,7 +112,7 @@ Release Log - Generate documentation using sphinx and host on readthedocs. * 0.2.10 - May 6, 2014 - If name is only a title and one part, assume it's a last name instead of a first name, with exceptions for some titles like 'Sir'. (`#7 `_). - - Add some judicial and other common titles. (#9) + - Add some judicial and other common titles. (#9) * 0.2.9 - Apr 1, 2014 - Add a new nickname attribute containing anything in parenthesis or double quotes (`Issue 33 `_). * 0.2.8 - Oct 25, 2013 @@ -123,7 +125,7 @@ Release Log * 0.2.5 - Feb 11, 2013 - Set logging handler to NullHandler - Remove 'ben' from PREFIXES because it's more common as a name than a prefix. - - Deprecate BlankHumanNameError. Do not raise exceptions if full_name is empty string. + - Deprecate BlankHumanNameError. Do not raise exceptions if full_name is empty string. * 0.2.4 - Feb 10, 2013 - Adjust logging, don't set basicConfig. 
Fix `Issue 10 `_ and `Issue 26 `_. - Fix handling of single lower case initials that are also conjunctions, e.g. "john e smith". Re `Issue 11 `_. @@ -134,12 +136,12 @@ Release Log - tests/test.py can now take an optional name argument that will return repr() for that name. * 0.2.3 - Fix overzealous "Mac" regex * 0.2.2 - Fix parsing error -* 0.2.0 +* 0.2.0 - Significant refactor of parsing logic. Handle conjunctions and prefixes before parsing into attribute buckets. - Support attribute overriding by assignment. - - Support multiple titles. - - Lowercase titles constants to fix bug with comparison. + - Support multiple titles. + - Lowercase titles constants to fix bug with comparison. - Move documentation to README.rst, add release log. * 0.1.4 - Use set() in constants for improved speed. setuptools compatibility - sketerpot * 0.1.3 - Add capitalization feature - twotwo diff --git a/nameparser/__init__.py b/nameparser/__init__.py index b2644e7..6c898ba 100644 --- a/nameparser/__init__.py +++ b/nameparser/__init__.py @@ -1,4 +1,4 @@ -VERSION = (1, 0, 5) +VERSION = (1, 0, 6) __version__ = '.'.join(map(str, VERSION)) __author__ = "Derek Gulbranson" __author_email__ = 'derek73@gmail.com' From 6e43c995e0f7c0f4e40270e36fdba1d2fbacab54 Mon Sep 17 00:00:00 2001 From: Tim Gates Date: Thu, 27 Feb 2020 20:08:15 +1100 Subject: [PATCH 098/163] Fix simple typo: conjuctions -> conjunctions Closes #106 --- nameparser/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nameparser/parser.py b/nameparser/parser.py index bd79057..7ee06c6 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -305,7 +305,7 @@ def is_title(self, value): return lc(value) in self.C.titles def is_conjunction(self, piece): - """Is in the conjuctions set and not :py:func:`is_an_initial()`.""" + """Is in the conjunctions set and not :py:func:`is_an_initial()`.""" return piece.lower() in self.C.conjunctions and not self.is_an_initial(piece) def is_prefix(self, piece): From 
c87d42e3879af9041becd2c9c5ad093499f54d0e Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Sun, 9 Aug 2020 14:00:09 -0700 Subject: [PATCH 099/163] add baby names reference to resources --- docs/resources.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/resources.rst b/docs/resources.rst index 0c70695..6cc28e8 100644 --- a/docs/resources.rst +++ b/docs/resources.rst @@ -2,12 +2,14 @@ Naming Practices and Resources ============================== * US_Census_Surname_Data_2000_ + * US_Social_Security_Administration_Baby_Names_Index_ * Naming_practice_guide_UK_2006_ * Wikipedia_Anthroponymy_ * Wikipedia_Naming_conventions_ * Wikipedia_List_Of_Titles_ -.. _US_Census_Surname_Data_2000: http://www.census.gov/genealogy/www/data/2000surnames/index.html +.. _US_Census_Surname_Data_2000: https://www.census.gov/data/developers/data-sets/surnames/2000.html +.. _US_Social_Security_Administration_Baby_Names_Index: https://www.ssa.gov/oact/babynames/limits.html .. _Naming_practice_guide_UK_2006: https://www.fbiic.gov/public/2008/nov/Naming_practice_guide_UK_2006.pdf .. _Wikipedia_Anthroponymy: https://en.wikipedia.org/wiki/Anthroponymy .. 
_Wikipedia_Naming_conventions: http://en.wikipedia.org/wiki/Wikipedia:Naming_conventions_(people) From d498968e850577ffc4dfa01c27610500a9ef3a80 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Sun, 9 Aug 2020 14:14:33 -0700 Subject: [PATCH 100/163] dill library no longer supports python 3.4 --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 45394bd..42fadab 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,7 +9,7 @@ python: # command to install dependencies install: - if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install unittest2; fi - - "pip install dill" + - if [[ $TRAVIS_PYTHON_VERSION -ne '3.4' ]]; then pip install dill; fi - "python setup.py install" # command to run tests script: python tests.py From e53f878bd49ef5705d4615ecba30f4bfb5c03895 Mon Sep 17 00:00:00 2001 From: zahna Date: Thu, 18 Feb 2021 16:29:49 -0500 Subject: [PATCH 101/163] Added a few additional prefixes for last names. Added a few additional prefixes for last names. --- nameparser/config/prefixes.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/nameparser/config/prefixes.py b/nameparser/config/prefixes.py index 2f5eb31..cce7805 100644 --- a/nameparser/config/prefixes.py +++ b/nameparser/config/prefixes.py @@ -34,11 +34,15 @@ 'ibn', 'la', 'le', + 'mac', + 'mc', 'san', 'santa', 'st', 'ste', 'van', + 'vander', + 'van der', 'vel', 'von', ]) From e553657ba1cdcb0b69c78a65dd7ee6eab24154cd Mon Sep 17 00:00:00 2001 From: zahna Date: Thu, 18 Feb 2021 16:47:22 -0500 Subject: [PATCH 102/163] Adding additional religious titles. Adding additional religious titles. 
--- nameparser/config/titles.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/nameparser/config/titles.py b/nameparser/config/titles.py index 3d5892f..91a9ac3 100644 --- a/nameparser/config/titles.py +++ b/nameparser/config/titles.py @@ -136,6 +136,7 @@ 'bodhisattva', 'bookseller', 'botanist', + 'bp', 'brigadier', 'briggen', 'british', @@ -223,6 +224,7 @@ 'cwo5', 'cyclist', 'dancer', + 'dcn', 'deacon', 'delegate', 'deputy', @@ -278,6 +280,7 @@ 'expert', 'fadm', 'family', + 'father', 'federal', 'field', 'film', @@ -288,6 +291,7 @@ 'foreign', 'forester', 'founder', + 'fr', 'friar', 'gaf', 'gen', @@ -314,6 +318,8 @@ 'high', 'highness', 'his', + 'his eminence', + 'his eminence metropolitan', 'historian', 'historicus', 'historien', @@ -395,6 +401,7 @@ 'member', 'memoirist', 'merchant', + 'met', 'metropolitan', 'mg', 'mgr', @@ -568,6 +575,7 @@ 'srta', 'ssg', 'ssgt', + 'st', 'staff', 'state', 'states', From 77cd216ddb4e5ec2153ebbe69f45906daa7a3d5f Mon Sep 17 00:00:00 2001 From: geritwagner Date: Tue, 22 Jun 2021 16:15:32 +0200 Subject: [PATCH 103/163] Update prefixes.py --- nameparser/config/prefixes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nameparser/config/prefixes.py b/nameparser/config/prefixes.py index 2f5eb31..48f5ac2 100644 --- a/nameparser/config/prefixes.py +++ b/nameparser/config/prefixes.py @@ -41,4 +41,5 @@ 'van', 'vel', 'von', + 'vom', ]) From 32c7613e17f60384a6e833c46e5a604e3ef2f760 Mon Sep 17 00:00:00 2001 From: Rink Stiekema Date: Wed, 20 Oct 2021 14:48:19 +0200 Subject: [PATCH 104/163] Parse initials from first and middle names --- nameparser/parser.py | 106 ++++++++++++++++++++++++++----------------- tests.py | 86 +++++++++++++++++------------------ 2 files changed, 108 insertions(+), 84 deletions(-) diff --git a/nameparser/parser.py b/nameparser/parser.py index bd79057..e58a428 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -15,6 +15,7 @@ ENCODING = 'utf-8' + def group_contiguous_integers(data): """ return list 
of tuples containing first and last index @@ -27,6 +28,7 @@ def group_contiguous_integers(data): ranges.append((group[0], group[-1])) return ranges + class HumanName(object): """ Parse a person's name into individual components. @@ -67,12 +69,12 @@ class HumanName(object): """ _count = 0 - _members = ['title','first','middle','last','suffix','nickname'] + _members = ['title', 'initials', 'first', 'middle', 'last', 'suffix', 'nickname'] unparsable = True _full_name = '' def __init__(self, full_name="", constants=CONSTANTS, encoding=DEFAULT_ENCODING, - string_format=None): + string_format=None): self.C = constants if type(self.C) is not type(CONSTANTS): self.C = Constants() @@ -130,7 +132,7 @@ def __unicode__(self): # string_format = "{title} {first} {middle} {last} {suffix} ({nickname})" _s = self.string_format.format(**self.as_dict()) # remove trailing punctuation from missing nicknames - _s = _s.replace(str(self.C.empty_attribute_default),'').replace(" ()","").replace(" ''","").replace(' ""',"") + _s = _s.replace(str(self.C.empty_attribute_default), '').replace(" ()", "").replace(" ''", "").replace(' ""', "") return self.collapse_whitespace(_s).strip(', ') return " ".join(self) @@ -141,9 +143,9 @@ def __str__(self): def __repr__(self): if self.unparsable: - _string = "<%(class)s : [ Unparsable ] >" % {'class': self.__class__.__name__,} + _string = "<%(class)s : [ Unparsable ] >" % {'class': self.__class__.__name__, } else: - _string = "<%(class)s : [\n\ttitle: '%(title)s' \n\tfirst: '%(first)s' \n\tmiddle: '%(middle)s' \n\tlast: '%(last)s' \n\tsuffix: '%(suffix)s'\n\tnickname: '%(nickname)s'\n]>" % { + _string = "<%(class)s : [\n\ttitle: '%(title)s' \n\tinitials: '%(initials)s' \n\tfirst: '%(first)s' \n\tmiddle: '%(middle)s' \n\tlast: '%(last)s' \n\tsuffix: '%(suffix)s'\n\tnickname: '%(nickname)s'\n]>" % { 'class': self.__class__.__name__, 'title': self.title or '', 'first': self.first or '', @@ -151,6 +153,7 @@ def __repr__(self): 'last': self.last or '', 
'suffix': self.suffix or '', 'nickname': self.nickname or '', + 'initials': self.initials or '', } if sys.version_info[0] >= 3: return _string @@ -190,7 +193,7 @@ def has_own_config(self): """ return self.C is not CONSTANTS - ### attributes + # attributes @property def title(self): @@ -259,7 +262,14 @@ def surnames(self): """ return " ".join(self.surnames_list) or self.C.empty_attribute_default - ### setter methods + @property + def initials(self): + """" + A string of all initials + """ + return " ".join([initial + "." for initial in self.initials_list]) + + # setter methods def _set_list(self, attr, value): if isinstance(value, list): @@ -270,8 +280,8 @@ def _set_list(self, attr, value): val = [] else: raise TypeError( - "Can only assign strings, lists or None to name attributes." - " Got {0}".format(type(value))) + "Can only assign strings, lists or None to name attributes." + " Got {0}".format(type(value))) setattr(self, attr+"_list", self.parse_pieces(val)) @title.setter @@ -298,7 +308,11 @@ def suffix(self, value): def nickname(self, value): self._set_list('nickname', value) - ### Parse helpers + @initials.setter + def initials(self, value): + self._set_list('initials', value) + + # Parse helpers def is_title(self, value): """Is in the :py:data:`~nameparser.config.titles.TITLES` set.""" @@ -331,8 +345,8 @@ def is_suffix(self, piece): `C.suffix_acronyms`. """ # suffixes may have periods inside them like "M.D." 
- return ((lc(piece).replace('.','') in self.C.suffix_acronyms) \ - or (lc(piece) in self.C.suffix_not_acronyms)) \ + return ((lc(piece).replace('.', '') in self.C.suffix_acronyms) + or (lc(piece) in self.C.suffix_not_acronyms)) \ and not self.is_an_initial(piece) def are_suffixes(self, pieces): @@ -358,8 +372,7 @@ def is_an_initial(self, value): """ return bool(self.C.regexes.initial.match(value)) - - ### full_name parser + # full_name parser @property def full_name(self): @@ -376,7 +389,7 @@ def full_name(self, value): def collapse_whitespace(self, string): # collapse multiple spaces into single space - string = self.C.regexes.spaces.sub(" ", string.strip()) + string = self.C.regexes.spaces.sub(" ", string.strip()) if string.endswith(","): string = string[:-1] return string @@ -404,7 +417,7 @@ def post_process(self): self.handle_capitalization() def fix_phd(self): - _re = self.C.regexes.phd + _re = self.C.regexes.phd match = _re.search(self._full_name) if match: self.suffix_list.append(match.group(1)) @@ -442,15 +455,16 @@ def squash_emoji(self): def handle_firstnames(self): """ - If there are only two parts and one is a title, assume it's a last name + If there are only three parts and one is a title, assume it's a last name instead of a first name. e.g. Mr. Johnson. Unless it's a special title like "Sir", then when it's followed by a single name that name is always a first name. 
""" if self.title \ - and len(self) == 2 \ + and len(self) == 3 \ and not lc(self.title) in self.C.first_name_titles: self.last, self.first = self.first, self.last + self.initials_list = [] def parse_full_name(self): """ @@ -472,9 +486,9 @@ def parse_full_name(self): self.last_list = [] self.suffix_list = [] self.nickname_list = [] + self.initials_list = [] self.unparsable = True - self.pre_process() self._full_name = self.collapse_whitespace(self._full_name) @@ -486,7 +500,6 @@ def parse_full_name(self): log.debug("parts: %s", parts) if len(parts) == 1: - # no commas, title first middle middle middle last suffix # part[0] @@ -509,6 +522,8 @@ def parse_full_name(self): self.last_list.append(piece) continue self.first_list.append(piece) + if len(piece) > 0: + self.initials_list.append(piece[0]) continue if self.are_suffixes(pieces[i+1:]) or \ ( @@ -516,7 +531,7 @@ def parse_full_name(self): # numeral but this piece is not an initial self.is_roman_numeral(nxt) and i == p_len - 2 and not self.is_an_initial(piece) - ): + ): self.last_list.append(piece) self.suffix_list += pieces[i+1:] break @@ -525,6 +540,8 @@ def parse_full_name(self): continue self.middle_list.append(piece) + if len(piece) > 0: + self.initials_list.append(piece[0]) else: # if all the end parts are suffixes and there is more than one piece # in the first part. (Suffixes will never appear after last names @@ -540,7 +557,6 @@ def parse_full_name(self): # title first middle last [suffix], suffix [suffix] [, suffix] # parts[0], parts[1:...] 
- self.suffix_list += parts[1:] pieces = self.parse_pieces(parts[0].split(' ')) log.debug("pieces: %s", u(pieces)) @@ -557,6 +573,8 @@ def parse_full_name(self): continue if not self.first: self.first_list.append(piece) + if len(piece) > 0: + self.initials_list.append(piece[0]) continue if self.are_suffixes(pieces[i+1:]): self.last_list.append(piece) @@ -566,6 +584,8 @@ def parse_full_name(self): self.last_list.append(piece) continue self.middle_list.append(piece) + if len(piece) > 0: + self.initials_list.append(piece[0]) else: # lastname comma: @@ -597,11 +617,15 @@ def parse_full_name(self): continue if not self.first: self.first_list.append(piece) + if len(piece) > 0: + self.initials_list.append(piece[0]) continue if self.is_suffix(piece): self.suffix_list.append(piece) continue self.middle_list.append(piece) + if len(piece) > 0: + self.initials_list.append(piece[0]) try: if parts[2]: self.suffix_list += parts[2:] @@ -614,7 +638,6 @@ def parse_full_name(self): self.unparsable = False self.post_process() - def parse_pieces(self, parts, additional_parts_count=0): """ Split parts on spaces and remove commas, join on conjunctions and @@ -648,7 +671,7 @@ def parse_pieces(self, parts, additional_parts_count=0): # split on periods, any of the split pieces titles or suffixes? 
# ("Lt.Gov.") period_chunks = part.split(".") - titles = list(filter(self.is_title, period_chunks)) + titles = list(filter(self.is_title, period_chunks)) suffixes = list(filter(self.is_suffix, period_chunks)) # add the part to the constant so it will be found @@ -695,7 +718,7 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): # other, then join those newly joined conjunctions and any single # conjunctions to the piece before and after it conj_index = [i for i, piece in enumerate(pieces) - if self.is_conjunction(piece)] + if self.is_conjunction(piece)] contiguous_conj_i = [] for i, val in enumerate(conj_index): @@ -710,14 +733,14 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): delete_i = [] for i in contiguous_conj_i: if type(i) == tuple: - new_piece = " ".join(pieces[ i[0] : i[1]+1] ) - delete_i += list(range( i[0]+1, i[1]+1 )) + new_piece = " ".join(pieces[i[0]: i[1]+1]) + delete_i += list(range(i[0]+1, i[1]+1)) pieces[i[0]] = new_piece else: - new_piece = " ".join(pieces[ i : i+2 ]) + new_piece = " ".join(pieces[i: i+2]) delete_i += [i+1] pieces[i] = new_piece - #add newly joined conjunctions to constants to be found later + # add newly joined conjunctions to constants to be found later self.C.conjunctions.add(new_piece) for i in reversed(delete_i): @@ -747,9 +770,9 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): pieces[i] = new_piece pieces.pop(i+1) # subtract 1 from the index of all the remaining conjunctions - for j,val in enumerate(conj_index): + for j, val in enumerate(conj_index): if val > i: - conj_index[j]=val-1 + conj_index[j] = val-1 else: new_piece = " ".join(pieces[i-1:i+2]) @@ -766,11 +789,10 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): # subtract the number of removed pieces from the index # of all the remaining conjunctions - for j,val in enumerate(conj_index): + for j, val in enumerate(conj_index): if val > i: conj_index[j] = val - rm_count - # join prefixes to 
following lastnames: ['de la Vega'], ['van Buren'] prefixes = list(filter(self.is_prefix, pieces)) if prefixes: @@ -813,12 +835,11 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): log.debug("pieces: %s", pieces) return pieces - - ### Capitalization Support + # Capitalization Support def cap_word(self, word, attribute): - if (self.is_prefix(word) and attribute in ('last','middle')) \ - or self.is_conjunction(word): + if (self.is_prefix(word) and attribute in ('last', 'middle')) \ + or self.is_conjunction(word): return word.lower() exceptions = self.C.capitalization_exceptions if lc(word) in exceptions: @@ -834,7 +855,8 @@ def cap_after_mac(m): def cap_piece(self, piece, attribute): if not piece: return "" - replacement = lambda m: self.cap_word(m.group(0), attribute) + + def replacement(m): return self.cap_word(m.group(0), attribute) return self.C.regexes.word.sub(replacement, piece) def capitalize(self, force=None): @@ -872,11 +894,13 @@ def capitalize(self, force=None): if not force and not (name == name.upper() or name == name.lower()): return - self.title_list = self.cap_piece(self.title , 'title').split(' ') - self.first_list = self.cap_piece(self.first , 'first').split(' ') + + self.title_list = self.cap_piece(self.title, 'title').split(' ') + self.first_list = self.cap_piece(self.first, 'first').split(' ') self.middle_list = self.cap_piece(self.middle, 'middle').split(' ') - self.last_list = self.cap_piece(self.last , 'last').split(' ') + self.last_list = self.cap_piece(self.last, 'last').split(' ') self.suffix_list = self.cap_piece(self.suffix, 'suffix').split(', ') + self.initials_list = self.cap_piece(self.initials, 'initials').replace('.', '').split(' ') def handle_capitalization(self): """ diff --git a/tests.py b/tests.py index 5f976b8..e176634 100644 --- a/tests.py +++ b/tests.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals +import unittest """ Run this file to run the tests. 
@@ -30,7 +31,6 @@ log = logging.getLogger('HumanName') -import unittest try: unittest.expectedFailure except AttributeError: @@ -70,9 +70,9 @@ def test_escaped_utf8_bytes(self): def test_len(self): hn = HumanName("Doe-Ray, Dr. John P., CLU, CFP, LUTC") - self.m(len(hn), 5, hn) + self.m(len(hn), 6, hn) hn = HumanName("John Doe") - self.m(len(hn), 2, hn) + self.m(len(hn), 3, hn) @unittest.skipUnless(dill, "requires python-dill module to test pickling") def test_config_pickle(self): @@ -104,17 +104,18 @@ def test_assignment_to_full_name(self): self.m(hn.last, "Doe", hn) self.m(hn.middle, "A. Kenneth", hn) self.m(hn.suffix, "Jr.", hn) + self.m(hn.initials, "J. A. K.", hn) hn.full_name = "Juan Velasquez y Garcia III" self.m(hn.first, "Juan", hn) self.m(hn.last, "Velasquez y Garcia", hn) self.m(hn.suffix, "III", hn) + self.m(hn.initials, "J.", hn) def test_get_full_name_attribute_references_internal_lists(self): hn = HumanName("John Williams") hn.first_list = ["Larry"] self.m(hn.full_name, "Larry Williams", hn) - def test_assignment_to_attribute(self): hn = HumanName("John A. Kenneth Doe, Jr.") hn.last = "de la Vega" @@ -154,9 +155,9 @@ def test_comparison_case_insensitive(self): def test_slice(self): hn = HumanName("Doe-Ray, Dr. John P., CLU, CFP, LUTC") - self.m(list(hn), ['Dr.', 'John', 'P.', 'Doe-Ray', 'CLU, CFP, LUTC'], hn) - self.m(hn[1:], ['John', 'P.', 'Doe-Ray', 'CLU, CFP, LUTC', hn.C.empty_attribute_default], hn) - self.m(hn[1:-2], ['John', 'P.', 'Doe-Ray'], hn) + self.m(list(hn), ['Dr.', 'J. P.', 'John', 'P.', 'Doe-Ray', 'CLU, CFP, LUTC'], hn) + self.m(hn[1:], ['J. P.', 'John', 'P.', 'Doe-Ray', 'CLU, CFP, LUTC', hn.C.empty_attribute_default], hn) + self.m(hn[1:-2], ['J. P.', 'John', 'P.', 'Doe-Ray'], hn) def test_getitem(self): hn = HumanName("Dr. John A. 
Kenneth Doe, Jr.") @@ -210,16 +211,16 @@ def test_assume_title_and_one_other_name_is_last_name(self): hn = HumanName("Rev Andrews") self.m(hn.title, "Rev", hn) self.m(hn.last, "Andrews", hn) - + # TODO: Seems "Andrews, M.D.", Andrews should be treated as a last name - # but other suffixes like "George Jr." should be first names. Might be + # but other suffixes like "George Jr." should be first names. Might be # related to https://github.com/derek73/python-nameparser/issues/2 @unittest.expectedFailure def test_assume_suffix_title_and_one_other_name_is_last_name(self): hn = HumanName("Andrews, M.D.") self.m(hn.suffix, "M.D.", hn) self.m(hn.last, "Andrews", hn) - + def test_suffix_in_lastname_part_of_lastname_comma_format(self): hn = HumanName("Smith Jr., John") self.m(hn.last, "Smith", hn) @@ -230,22 +231,22 @@ def test_sir_exception_to_first_name_rule(self): hn = HumanName("Sir Gerald") self.m(hn.title, "Sir", hn) self.m(hn.first, "Gerald", hn) - + def test_king_exception_to_first_name_rule(self): hn = HumanName("King Henry") self.m(hn.title, "King", hn) self.m(hn.first, "Henry", hn) - + def test_queen_exception_to_first_name_rule(self): hn = HumanName("Queen Elizabeth") self.m(hn.title, "Queen", hn) self.m(hn.first, "Elizabeth", hn) - + def test_dame_exception_to_first_name_rule(self): hn = HumanName("Dame Mary") self.m(hn.title, "Dame", hn) self.m(hn.first, "Mary", hn) - + def test_first_name_is_not_prefix_if_only_two_parts(self): """When there are only two parts, don't join prefixes or conjunctions""" hn = HumanName("Van Nguyen") @@ -263,7 +264,7 @@ def test_first_name_is_prefix_if_three_parts(self): hn = HumanName("Mr. 
Van Nguyen") self.m(hn.first, "Van", hn) self.m(hn.last, "Nguyen", hn) - + class HumanNameBruteForceTests(HumanNameTestBase): @@ -1084,7 +1085,7 @@ def test_multiple_conjunctions(self): def test_multiple_conjunctions2(self): hn = HumanName("part1 of and The part2 of the part3 And part4") self.m(hn.first, "part1 of and The part2 of the part3 And part4", hn) - + def test_ends_with_conjunction(self): hn = HumanName("Jon Dough and") self.m(hn.first, "Jon", hn) @@ -1242,12 +1243,12 @@ def test_le_as_last_name_with_middle_initial(self): self.m(hn.first, "Yin", hn) self.m(hn.middle, "a", hn) self.m(hn.last, "Le", hn) - + def test_conjunction_in_an_address_with_a_title(self): hn = HumanName("His Excellency Lord Duncan") self.m(hn.title, "His Excellency Lord", hn) self.m(hn.last, "Duncan", hn) - + @unittest.expectedFailure def test_conjunction_in_an_address_with_a_first_name_title(self): hn = HumanName("Her Majesty Queen Elizabeth") @@ -1272,7 +1273,7 @@ def test_add_title(self): self.m(hn.title, "Te", hn) self.m(hn.first, "Awanui-a-Rangi", hn) self.m(hn.last, "Black", hn) - + def test_remove_title(self): hn = HumanName("Hon Solo", constants=None) start_len = len(hn.C.titles) @@ -1282,7 +1283,7 @@ def test_remove_title(self): hn.parse_full_name() self.m(hn.first, "Hon", hn) self.m(hn.last, "Solo", hn) - + def test_add_multiple_arguments(self): hn = HumanName("Assoc Dean of Chemistry Robert Johns", constants=None) hn.C.titles.add('dean', 'Chemistry') @@ -1310,7 +1311,7 @@ def test_can_change_global_constants(self): self.assertEqual(hn2.has_own_config, False) # clean up so we don't mess up other tests hn.C.titles.add('hon') - + def test_remove_multiple_arguments(self): hn = HumanName("Ms Hon Solo", constants=None) hn.C.titles.remove('hon', 'ms') @@ -1370,7 +1371,7 @@ def test_nickname_in_parenthesis(self): self.m(hn.middle, "", hn) self.m(hn.last, "Franklin", hn) self.m(hn.nickname, "Ben", hn) - + def test_two_word_nickname_in_parenthesis(self): hn = HumanName("Benjamin (Big 
Ben) Franklin") self.m(hn.first, "Benjamin", hn) @@ -1391,7 +1392,7 @@ def test_nickname_in_parenthesis_with_comma(self): self.m(hn.middle, "", hn) self.m(hn.last, "Franklin", hn) self.m(hn.nickname, "Ben", hn) - + def test_nickname_in_parenthesis_with_comma_and_suffix(self): hn = HumanName("Franklin, Benjamin (Ben), Jr.") self.m(hn.first, "Benjamin", hn) @@ -1399,7 +1400,7 @@ def test_nickname_in_parenthesis_with_comma_and_suffix(self): self.m(hn.last, "Franklin", hn) self.m(hn.suffix, "Jr.", hn) self.m(hn.nickname, "Ben", hn) - + def test_nickname_in_single_quotes(self): hn = HumanName("Benjamin 'Ben' Franklin") self.m(hn.first, "Benjamin", hn) @@ -1413,28 +1414,28 @@ def test_nickname_in_double_quotes(self): self.m(hn.middle, "", hn) self.m(hn.last, "Franklin", hn) self.m(hn.nickname, "Ben", hn) - + def test_single_quotes_on_first_name_not_treated_as_nickname(self): hn = HumanName("Brian Andrew O'connor") self.m(hn.first, "Brian", hn) self.m(hn.middle, "Andrew", hn) self.m(hn.last, "O'connor", hn) self.m(hn.nickname, "", hn) - + def test_single_quotes_on_both_name_not_treated_as_nickname(self): hn = HumanName("La'tanya O'connor") self.m(hn.first, "La'tanya", hn) self.m(hn.middle, "", hn) self.m(hn.last, "O'connor", hn) self.m(hn.nickname, "", hn) - + def test_single_quotes_on_end_of_last_name_not_treated_as_nickname(self): hn = HumanName("Mari' Aube'") self.m(hn.first, "Mari'", hn) self.m(hn.middle, "", hn) self.m(hn.last, "Aube'", hn) self.m(hn.nickname, "", hn) - + def test_okina_inside_name_not_treated_as_nickname(self): hn = HumanName("Harrieta Keōpūolani Nāhiʻenaʻena") self.m(hn.first, "Harrieta", hn) @@ -1492,7 +1493,6 @@ def test_nickname_and_last_name_with_title(self): self.m(hn.nickname, "Rick", hn) - # class MaidenNameTestCase(HumanNameTestBase): # # def test_parenthesis_and_quotes_together(self): @@ -1542,12 +1542,12 @@ def test_prefix(self): hn = HumanName("Juan del Sur") self.m(hn.first, "Juan", hn) self.m(hn.last, "del Sur", hn) - + def 
test_prefix_with_period(self): hn = HumanName("Jill St. John") self.m(hn.first, "Jill", hn) self.m(hn.last, "St. John", hn) - + def test_prefix_before_two_part_last_name(self): hn = HumanName("pennie von bergen wessels") self.m(hn.first, "pennie", hn) @@ -1641,7 +1641,7 @@ def test_comma_three_conjunctions(self): class SuffixesTestCase(HumanNameTestBase): - + def test_suffix(self): hn = HumanName("Joe Franklin Jr") self.m(hn.first, "Joe", hn) @@ -1716,13 +1716,13 @@ def test_phd_conflict(self): self.m(hn.first, "Adolph", hn) self.m(hn.last, "D", hn) - # http://en.wikipedia.org/wiki/Ma_(surname) + def test_potential_suffix_that_is_also_last_name(self): hn = HumanName("Jack Ma") self.m(hn.first, "Jack", hn) self.m(hn.last, "Ma", hn) - + def test_potential_suffix_that_is_also_last_name_comma(self): hn = HumanName("Ma, Jack") self.m(hn.first, "Jack", hn) @@ -1820,27 +1820,27 @@ def test_chained_title_first_name_title_is_initials(self): self.m(hn.first, "Marc", hn) self.m(hn.middle, "Thomas", hn) self.m(hn.last, "Treadwell", hn) - + def test_conflict_with_chained_title_first_name_initial(self): hn = HumanName("U. S. 
Grant") self.m(hn.first, "U.", hn) self.m(hn.middle, "S.", hn) self.m(hn.last, "Grant", hn) - + def test_chained_title_first_name_initial_with_no_period(self): hn = HumanName("US Magistrate Judge T Michael Putnam") self.m(hn.title, "US Magistrate Judge", hn) self.m(hn.first, "T", hn) self.m(hn.middle, "Michael", hn) self.m(hn.last, "Putnam", hn) - + def test_chained_hyphenated_title(self): hn = HumanName("US Magistrate-Judge Elizabeth E Campbell") self.m(hn.title, "US Magistrate-Judge", hn) self.m(hn.first, "Elizabeth", hn) self.m(hn.middle, "E", hn) self.m(hn.last, "Campbell", hn) - + def test_chained_hyphenated_title_with_comma_suffix(self): hn = HumanName("Mag-Judge Harwell G Davis, III") self.m(hn.title, "Mag-Judge", hn) @@ -1883,7 +1883,7 @@ def test_title_with_last_initial_is_suffix(self): self.m(hn.title, "King", hn) self.m(hn.first, "John", hn) self.m(hn.last, "V.", hn) - + def test_initials_also_suffix(self): hn = HumanName("Smith, J.R.") self.m(hn.first, "J.R.", hn) @@ -2062,10 +2062,10 @@ def test_capitalize_prefix_clash_on_first_name(self): class HumanNameOutputFormatTests(HumanNameTestBase): - + def test_formatting_init_argument(self): hn = HumanName("Rev John A. Kenneth Doe III (Kenny)", - string_format="TEST1") + string_format="TEST1") self.assertEqual(u(hn), "TEST1") def test_formatting_constants_attribute(self): @@ -2160,7 +2160,7 @@ def test_formating_of_nicknames_in_middle(self): self.assertEqual(u(hn), "Rev John (Kenny) A. Kenneth Doe III") hn.nickname = '' self.assertEqual(u(hn), "Rev John A. Kenneth Doe III") - + def test_remove_emojis(self): hn = HumanName("Sam Smith 😊") self.m(hn.first, "Sam", hn) @@ -2359,7 +2359,7 @@ def test_keep_emojis(self): "U.S. District Judge Marc Thomas Treadwell", "Dra. Andréia da Silva", "Srta. 
Andréia da Silva", - + ) From 155d608a38d0b6f768a9be7e38d83632f81f0769 Mon Sep 17 00:00:00 2001 From: Rink Stiekema Date: Wed, 20 Oct 2021 15:54:00 +0200 Subject: [PATCH 105/163] Remove initials from members and process initials upon post processing and setting first/middle names --- nameparser/parser.py | 37 ++++++++++------- tests.py | 99 +++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 115 insertions(+), 21 deletions(-) diff --git a/nameparser/parser.py b/nameparser/parser.py index e58a428..38b420d 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -69,7 +69,7 @@ class HumanName(object): """ _count = 0 - _members = ['title', 'initials', 'first', 'middle', 'last', 'suffix', 'nickname'] + _members = ['title', 'first', 'middle', 'last', 'suffix', 'nickname'] unparsable = True _full_name = '' @@ -267,7 +267,7 @@ def initials(self): """" A string of all initials """ - return " ".join([initial + "." for initial in self.initials_list]) + return " ".join([initial + "." for initial in self.initials_list]) or self.C.empty_attribute_default # setter methods @@ -291,10 +291,12 @@ def title(self, value): @first.setter def first(self, value): self._set_list('first', value) + self.handle_initials() @middle.setter def middle(self, value): self._set_list('middle', value) + self.handle_initials() @last.setter def last(self, value): @@ -414,6 +416,7 @@ def post_process(self): and :py:func:`handle_capitalization`. """ self.handle_firstnames() + self.handle_initials() self.handle_capitalization() def fix_phd(self): @@ -461,11 +464,26 @@ def handle_firstnames(self): a first name. 
""" if self.title \ - and len(self) == 3 \ + and len(self) == 2 \ and not lc(self.title) in self.C.first_name_titles: self.last, self.first = self.first, self.last self.initials_list = [] + def handle_initials(self): + """ + Initials are the concatination of the first letter of the first name and the first character of each middle name + """ + initials_list = [] + if self.first and len(self.first): + initials_list += [self.first[0]] + + if self.middle_list and len(self.middle_list): + for middle in self.middle_list: + if len(middle): + initials_list += [middle[0]] + + self.initials_list = initials_list + def parse_full_name(self): """ @@ -486,7 +504,6 @@ def parse_full_name(self): self.last_list = [] self.suffix_list = [] self.nickname_list = [] - self.initials_list = [] self.unparsable = True self.pre_process() @@ -522,8 +539,6 @@ def parse_full_name(self): self.last_list.append(piece) continue self.first_list.append(piece) - if len(piece) > 0: - self.initials_list.append(piece[0]) continue if self.are_suffixes(pieces[i+1:]) or \ ( @@ -540,8 +555,6 @@ def parse_full_name(self): continue self.middle_list.append(piece) - if len(piece) > 0: - self.initials_list.append(piece[0]) else: # if all the end parts are suffixes and there is more than one piece # in the first part. 
(Suffixes will never appear after last names @@ -573,8 +586,6 @@ def parse_full_name(self): continue if not self.first: self.first_list.append(piece) - if len(piece) > 0: - self.initials_list.append(piece[0]) continue if self.are_suffixes(pieces[i+1:]): self.last_list.append(piece) @@ -584,8 +595,6 @@ def parse_full_name(self): self.last_list.append(piece) continue self.middle_list.append(piece) - if len(piece) > 0: - self.initials_list.append(piece[0]) else: # lastname comma: @@ -617,15 +626,11 @@ def parse_full_name(self): continue if not self.first: self.first_list.append(piece) - if len(piece) > 0: - self.initials_list.append(piece[0]) continue if self.is_suffix(piece): self.suffix_list.append(piece) continue self.middle_list.append(piece) - if len(piece) > 0: - self.initials_list.append(piece[0]) try: if parts[2]: self.suffix_list += parts[2:] diff --git a/tests.py b/tests.py index e176634..89b302d 100644 --- a/tests.py +++ b/tests.py @@ -59,6 +59,7 @@ def test_utf8(self): hn = HumanName("de la Véña, Jüan") self.m(hn.first, "Jüan", hn) self.m(hn.last, "de la Véña", hn) + self.m(hn.initials, "J.", hn) def test_string_output(self): hn = HumanName("de la Véña, Jüan") @@ -67,12 +68,13 @@ def test_escaped_utf8_bytes(self): hn = HumanName(b'B\xc3\xb6ck, Gerald') self.m(hn.first, "Gerald", hn) self.m(hn.last, "Böck", hn) + self.m(hn.initials, "G.", hn) def test_len(self): hn = HumanName("Doe-Ray, Dr. John P., CLU, CFP, LUTC") - self.m(len(hn), 6, hn) + self.m(len(hn), 5, hn) hn = HumanName("John Doe") - self.m(len(hn), 3, hn) + self.m(len(hn), 2, hn) @unittest.skipUnless(dill, "requires python-dill module to test pickling") def test_config_pickle(self): @@ -120,12 +122,15 @@ def test_assignment_to_attribute(self): hn = HumanName("John A. Kenneth Doe, Jr.") hn.last = "de la Vega" self.m(hn.last, "de la Vega", hn) + self.m(hn.initials, "J. A. K.", hn) hn.title = "test" self.m(hn.title, "test", hn) hn.first = "test" self.m(hn.first, "test", hn) + self.m(hn.initials, "t. 
A. K.", hn) hn.middle = "test" self.m(hn.middle, "test", hn) + self.m(hn.initials, "t. t.", hn) hn.suffix = "test" self.m(hn.suffix, "test", hn) with self.assertRaises(TypeError): @@ -155,9 +160,9 @@ def test_comparison_case_insensitive(self): def test_slice(self): hn = HumanName("Doe-Ray, Dr. John P., CLU, CFP, LUTC") - self.m(list(hn), ['Dr.', 'J. P.', 'John', 'P.', 'Doe-Ray', 'CLU, CFP, LUTC'], hn) - self.m(hn[1:], ['J. P.', 'John', 'P.', 'Doe-Ray', 'CLU, CFP, LUTC', hn.C.empty_attribute_default], hn) - self.m(hn[1:-2], ['J. P.', 'John', 'P.', 'Doe-Ray'], hn) + self.m(list(hn), ['Dr.', 'John', 'P.', 'Doe-Ray', 'CLU, CFP, LUTC'], hn) + self.m(hn[1:], ['John', 'P.', 'Doe-Ray', 'CLU, CFP, LUTC', hn.C.empty_attribute_default], hn) + self.m(hn[1:-2], ['John', 'P.', 'Doe-Ray'], hn) def test_getitem(self): hn = HumanName("Dr. John A. Kenneth Doe, Jr.") @@ -166,6 +171,7 @@ def test_getitem(self): self.m(hn['last'], "Doe", hn) self.m(hn['middle'], "A. Kenneth", hn) self.m(hn['suffix'], "Jr.", hn) + self.m(hn.initials, "J. A. K.", hn) def test_setitem(self): hn = HumanName("Dr. John A. 
Kenneth Doe, Jr.") @@ -182,16 +188,19 @@ def test_conjunction_names(self): hn = HumanName("johnny y") self.m(hn.first, "johnny", hn) self.m(hn.last, "y", hn) + self.m(hn.initials, "j.", hn) def test_prefix_names(self): hn = HumanName("vai la") self.m(hn.first, "vai", hn) self.m(hn.last, "la", hn) + self.m(hn.initials, "v.", hn) def test_blank_name(self): hn = HumanName() self.m(hn.first, "", hn) self.m(hn.last, "", hn) + self.m(hn.initials, "", hn) def test_surnames_list_attribute(self): hn = HumanName("John Edgar Casey Williams III") @@ -201,6 +210,10 @@ def test_surnames_attribute(self): hn = HumanName("John Edgar Casey Williams III") self.m(hn.surnames, "Edgar Casey Williams", hn) + def test_initials_list_attribute(self): + hn = HumanName("John Edgar Casey Williams III") + self.m(hn.initials_list, ["J", "E", "C"], hn) + class FirstNameHandlingTests(HumanNameTestBase): def test_first_name(self): @@ -2184,6 +2197,82 @@ def test_keep_emojis(self): # test cleanup +class InitialsTestCase(HumanNameTestBase): + def test_initials(self): + hn = HumanName("Andrew Boris Petersen") + self.m(hn.initials, "A. B.", hn) + self.m(hn.initials_list, ["A", "B"], hn) + + def test_title_and_last_name(self): + hn = HumanName("Dr. Andrews") + self.m(hn.initials, "", hn) + self.m(hn.initials, [], hn) + + def test_reassignment_first_name(self): + hn = HumanName("Andrew Boris Petersen") + hn.first = "John" + self.m(hn.initials, "J. B.", hn) + self.m(hn.initials_list, ["J", "B"], hn) + + def test_reassignment_middle_names(self): + hn = HumanName("Andrew Boris Petersen") + hn.middle = "John" + self.m(hn.initials, "A. J.", hn) + self.m(hn.initials_list, ["A", "J"], hn) + + def test_reassignment_middle_names_list(self): + hn = HumanName("Andrew Boris Petersen") + hn.middle = ["John", "Peter"] + self.m(hn.initials, "A. J. P.", hn) + self.m(hn.initials_list, ["A", "J", "P"], hn) + + def test_capitalization(self): + hn = HumanName("andrew boris Petersen") + self.m(hn.initials, "a. 
b.", hn) + self.m(hn.initials_list, ["a", "b"], hn) + hn.capitalize(force=True) + self.m(hn.initials, "A. B.", hn) + self.m(hn.initials_list, ["A", "B"], hn) + + def test_parse_initial(self): + hn = HumanName("A. Petersen") + self.m(hn.initials, "A.", hn) + self.m(hn.initials_list, ["A"], hn) + + def test_parse_multiple_initials(self): + hn = HumanName("A. B. Petersen") + self.m(hn.initials, "A. B.", hn) + self.m(hn.initials_list, ["A", "B"], hn) + + def test_parse_mixed_initials(self): + hn1 = HumanName("Andrew B. Petersen") + self.m(hn1.initials, "A. B.", hn1) + self.m(hn1.initials_list, ["A", "B"], hn1) + + hn2 = HumanName("A. Boris Petersen") + self.m(hn2.initials, "A. B.", hn2) + self.m(hn2.initials_list, ["A", "B"], hn2) + + def test_parse_commas(self): + hn = HumanName("Petersen, Andrew Boris") + self.m(hn.initials, "A. B.", hn) + self.m(hn.initials_list, ["A", "B"], hn) + + def test_parse_commas_initials(self): + hn = HumanName("Petersen, A. B.") + self.m(hn.initials, "A. B.", hn) + self.m(hn.initials_list, ["A", "B"], hn) + + def test_parse_commas_mixed_initials(self): + hn1 = HumanName("Petersen, Andrew B.") + self.m(hn1.initials, "A. B.", hn1) + self.m(hn1.initials_list, ["A", "B"], hn1) + + hn2 = HumanName("Petersen, A. Boris") + self.m(hn2.initials, "A. B.", hn2) + self.m(hn2.initials_list, ["A", "B"], hn2) + + TEST_NAMES = ( "John Doe", "John Doe, Jr.", From 9893a538810984980403f69931294d32e313dfd4 Mon Sep 17 00:00:00 2001 From: Rink Stiekema Date: Wed, 20 Oct 2021 16:11:16 +0200 Subject: [PATCH 106/163] Added initials to members but remove from string --- README.rst | 4 +++- nameparser/parser.py | 12 +++++++----- tests.py | 10 +++++----- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/README.rst b/README.rst index b347593..e47af60 100644 --- a/README.rst +++ b/README.rst @@ -13,6 +13,7 @@ individual components. 
* hn.suffix * hn.nickname * hn.surnames *(middle + last)* +* hn.initials Supported Name Structures ~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -65,6 +66,7 @@ Quick Start Example >>> name >> name.last 'de la Vega' >>> name.as_dict() - {'last': 'de la Vega', 'suffix': 'III', 'title': 'Dr.', 'middle': 'Q. Xavier', 'nickname': 'Doc Vega', 'first': 'Juan'} + {'title': 'Dr.', 'first': 'Juan', 'middle': 'Q. Xavier', 'last': 'de la Vega', 'suffix': 'III', 'nickname': 'Doc Vega', 'initials': 'J. Q. X.'} >>> str(name) 'Dr. Juan Q. Xavier de la Vega III (Doc Vega)' >>> name.string_format = "{first} {last}" diff --git a/nameparser/parser.py b/nameparser/parser.py index 38b420d..a0a7fe3 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -69,7 +69,7 @@ class HumanName(object): """ _count = 0 - _members = ['title', 'first', 'middle', 'last', 'suffix', 'nickname'] + _members = ['title', 'first', 'middle', 'last', 'suffix', 'nickname', 'initials'] unparsable = True _full_name = '' @@ -130,7 +130,9 @@ def __next__(self): def __unicode__(self): if self.string_format: # string_format = "{title} {first} {middle} {last} {suffix} ({nickname})" - _s = self.string_format.format(**self.as_dict()) + dict_representation = self.as_dict() + dict_representation.pop('initials', None) + _s = self.string_format.format(**dict_representation) # remove trailing punctuation from missing nicknames _s = _s.replace(str(self.C.empty_attribute_default), '').replace(" ()", "").replace(" ''", "").replace(' ""', "") return self.collapse_whitespace(_s).strip(', ') @@ -170,10 +172,9 @@ def as_dict(self, include_empty=True): >>> name = HumanName("Bob Dole") >>> name.as_dict() - {'last': 'Dole', 'suffix': '', 'title': '', 'middle': '', 'nickname': '', 'first': 'Bob'} + {'last': 'Dole', 'suffix': '', 'title': '', 'middle': '', 'nickname': '', 'first': 'Bob', 'initials': 'B.'} >>> name.as_dict(False) - {'last': 'Dole', 'first': 'Bob'} - + {'last': 'Dole', 'first': 'Bob', 'initials': 'B.'} """ d = {} for m in 
self._members: @@ -504,6 +505,7 @@ def parse_full_name(self): self.last_list = [] self.suffix_list = [] self.nickname_list = [] + self.initials_list = [] self.unparsable = True self.pre_process() diff --git a/tests.py b/tests.py index 89b302d..4efefbf 100644 --- a/tests.py +++ b/tests.py @@ -72,9 +72,9 @@ def test_escaped_utf8_bytes(self): def test_len(self): hn = HumanName("Doe-Ray, Dr. John P., CLU, CFP, LUTC") - self.m(len(hn), 5, hn) + self.m(len(hn), 6, hn) hn = HumanName("John Doe") - self.m(len(hn), 2, hn) + self.m(len(hn), 3, hn) @unittest.skipUnless(dill, "requires python-dill module to test pickling") def test_config_pickle(self): @@ -160,9 +160,9 @@ def test_comparison_case_insensitive(self): def test_slice(self): hn = HumanName("Doe-Ray, Dr. John P., CLU, CFP, LUTC") - self.m(list(hn), ['Dr.', 'John', 'P.', 'Doe-Ray', 'CLU, CFP, LUTC'], hn) - self.m(hn[1:], ['John', 'P.', 'Doe-Ray', 'CLU, CFP, LUTC', hn.C.empty_attribute_default], hn) - self.m(hn[1:-2], ['John', 'P.', 'Doe-Ray'], hn) + self.m(list(hn), ['Dr.', 'John', 'P.', 'Doe-Ray', 'CLU, CFP, LUTC', 'J. P.'], hn) + self.m(hn[1:], ['John', 'P.', 'Doe-Ray', 'CLU, CFP, LUTC', hn.C.empty_attribute_default, 'J. P.'], hn) + self.m(hn[1:-3], ['John', 'P.', 'Doe-Ray'], hn) def test_getitem(self): hn = HumanName("Dr. John A. Kenneth Doe, Jr.") From 29b3e4fbab0eca67c450a09135e100231e6d7ae5 Mon Sep 17 00:00:00 2001 From: Rink Stiekema Date: Wed, 20 Oct 2021 16:31:51 +0200 Subject: [PATCH 107/163] Fix docstring of handle_firstnames() --- nameparser/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nameparser/parser.py b/nameparser/parser.py index a0a7fe3..b1275f3 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -459,7 +459,7 @@ def squash_emoji(self): def handle_firstnames(self): """ - If there are only three parts and one is a title, assume it's a last name + If there are only two parts and one is a title, assume it's a last name instead of a first name. e.g. Mr. 
Johnson. Unless it's a special title like "Sir", then when it's followed by a single name that name is always a first name. From 26829053830178ce00758ad54a5db67ebab3d7c3 Mon Sep 17 00:00:00 2001 From: Rink Stiekema Date: Wed, 20 Oct 2021 16:53:45 +0200 Subject: [PATCH 108/163] Change length function to exclude derived property initials from count --- nameparser/parser.py | 5 ++++- tests.py | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/nameparser/parser.py b/nameparser/parser.py index b1275f3..64e5a25 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -88,7 +88,10 @@ def __iter__(self): return self def __len__(self): - l = 0 + """ + Initials are a derived value, so should not be considered for the length + """ + l = 0 if not self.initials else -1 for x in self: l += 1 return l diff --git a/tests.py b/tests.py index 4efefbf..168654e 100644 --- a/tests.py +++ b/tests.py @@ -72,9 +72,9 @@ def test_escaped_utf8_bytes(self): def test_len(self): hn = HumanName("Doe-Ray, Dr. John P., CLU, CFP, LUTC") - self.m(len(hn), 6, hn) + self.m(len(hn), 5, hn) hn = HumanName("John Doe") - self.m(len(hn), 3, hn) + self.m(len(hn), 2, hn) @unittest.skipUnless(dill, "requires python-dill module to test pickling") def test_config_pickle(self): From 7d7ac9ce92c2bd9908769d04517aa0433bf7ff8c Mon Sep 17 00:00:00 2001 From: Rink Stiekema Date: Thu, 21 Oct 2021 12:07:43 +0200 Subject: [PATCH 109/163] Added functions and which return the initials of the name --- docs/usage.rst | 26 ++++++ nameparser/config/__init__.py | 144 +++++++++++++++++++++++-------- nameparser/parser.py | 119 +++++++++++++++++-------- tests.py | 158 ++++++++++++++++++++++++++-------- 4 files changed, 337 insertions(+), 110 deletions(-) diff --git a/docs/usage.rst b/docs/usage.rst index 6a65c4e..01beb48 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -176,3 +176,29 @@ Don't want to include nicknames in your output? No problem. Just omit that keywo 'Dr. 
Juan de la Vega' +Initials Support +---------------- + +The HumanName class can try to get the correct representation of initials. +Initials can be tricky as different format usages exist. +If you want to exclude on of the name parts from the initials, you can use one of the following boolean parameters: +`exclude_last_name`, `exclude_middle_name` or `exclude_first_name` + +You can also force the behavior using the CONSTANTS: +:py:attr:`~nameparser.config.Constants.force_exclude_last_name` +:py:attr:`~nameparser.config.Constants.force_exclude_middle_name` +:py:attr:`~nameparser.config.Constants.force_exclude_first_name` + +Furthermore, the delimiter for the string output can be set through: +:py:attr:`~nameparser.config.Constants.initials_delimiter` + +.. doctest:: initials + + >>> name = HumanName("Doe, John A. Kenneth, Jr.") + >>> name.initials() + 'J. A. K. D.' + >>> name.initials(exclude_last_name) + 'J. A. K.' + >>> name.initials_list(exclude_middle_name): + ['J', 'D'] + diff --git a/nameparser/config/__init__.py b/nameparser/config/__init__.py index 4f1e4f2..488c899 100644 --- a/nameparser/config/__init__.py +++ b/nameparser/config/__init__.py @@ -49,35 +49,37 @@ DEFAULT_ENCODING = 'UTF-8' + class SetManager(Set): ''' Easily add and remove config variables per module or instance. Subclass of ``collections.abc.Set``. - + Only special functionality beyond that provided by set() is to normalize constants for comparison (lower case, no periods) when they are add()ed and remove()d and allow passing multiple string arguments to the :py:func:`add()` and :py:func:`remove()` methods. 
- + ''' + def __init__(self, elements): self.elements = set(elements) - + def __call__(self): return self.elements - + def __repr__(self): - return "SetManager({})".format(self.elements) # used for docs - + return "SetManager({})".format(self.elements) # used for docs + def __iter__(self): return iter(self.elements) - + def __contains__(self, value): return value in self.elements - + def __len__(self): return len(self.elements) - + def next(self): return self.__next__() @@ -89,7 +91,7 @@ def __next__(self): c = self.count self.count = c + 1 return getattr(self, self.elements[c]) or next(self) - + def add_with_encoding(self, s, encoding=None): """ Add the lower case and no-period version of the string to the set. Pass an @@ -111,7 +113,7 @@ def add(self, *strings): """ [self.add_with_encoding(s) for s in strings] return self - + def remove(self, *strings): """ Remove the lower case and no-period version of the string arguments from the set. @@ -126,10 +128,11 @@ class TupleManager(dict): A dictionary with dot.notation access. Subclass of ``dict``. Makes the tuple constants more friendly. ''' + def __getattr__(self, attr): return self.get(attr) - __setattr__= dict.__setitem__ - __delattr__= dict.__delitem__ + __setattr__ = dict.__setitem__ + __delattr__ = dict.__delitem__ def __getstate__(self): return dict(self) @@ -140,6 +143,7 @@ def __setstate__(self, state): def __reduce__(self): return (TupleManager, (), self.__getstate__()) + class Constants(object): """ An instance of this class hold all of the configuration constants for the parser. @@ -163,7 +167,7 @@ class Constants(object): :param regexes: :py:attr:`regexes` wrapped with :py:class:`TupleManager`. """ - + string_format = "{title} {first} {middle} {last} {suffix} ({nickname})" """ The default string format use for all new `HumanName` instances. 
@@ -183,6 +187,7 @@ class Constants(object): 'John' """ + capitalize_name = False """ If set, applies :py:meth:`~nameparser.parser.HumanName.capitalize` to @@ -197,6 +202,24 @@ class Constants(object): 'Bob V. de la MacDole-Eisenhower Ph.D.' """ + + initials_delimiter = '.' + """" + Determines how the initials from :py:meth:`~nameparser.parser.HumanName.initials` are seperated. + + .. doctest:: + + >>> from nameparser.config import CONSTANTS + >>> HumanName('Shirley Maclaine').initials() + 'S. M.' + >>> CONSTANTS.initials_delimiter = '' + >>> HumanName('Shirley Maclaine').initials() + 'S M' + >>> CONSTANTS.initials_delimiter = '-' + >>> HumanName('Shirley Maclaine').initials() + 'S- M-' + """ + force_mixed_case_capitalization = False """ If set, forces the capitalization of mixed case strings when @@ -213,27 +236,77 @@ class Constants(object): """ + force_exclude_last_name_initial = False + """ + If True, forces the last name to be excluded in the initials when + :py:meth:`~nameparser.parser.HumanName.initials` or + :py:meth:`~nameparser.parser.HumanName.initials_list` is called. + + .. doctest:: + + >>> from nameparser.config import CONSTANTS + >>> CONSTANTS.force_exclude_last_name_initial = True + >>> name = HumanName('Shirley Ashley Maclaine') + >>> name.initials() + 'S. A.' + >>> name.initials_list() + ['S', 'A'] + """ + + force_exclude_middle_name_initial = False + """ + If True, forces the middle name to be included in the initials when + :py:meth:`~nameparser.parser.HumanName.initials` or + :py:meth:`~nameparser.parser.HumanName.initials_list` is called. + + .. doctest:: + + >>> from nameparser.config import CONSTANTS + >>> CONSTANTS.force_exclude_middle_name_initial = True + >>> name = HumanName('Shirley Ashley Maclaine') + >>> name.initials() + 'S. M.' 
+ >>> name.initials_list() + ['S', 'M'] + """ + + force_exclude_first_name_initial = False + """ + If True, forces the first name to be included in the initials when + :py:meth:`~nameparser.parser.HumanName.initials` or + :py:meth:`~nameparser.parser.HumanName.initials_list` is called. + + .. doctest:: + + >>> from nameparser.config import CONSTANTS + >>> CONSTANTS.force_exclude_first_name_initial = True + >>> name = HumanName('Shirley Ashley Maclaine') + >>> name.initials() + 'A. M.' + >>> name.initials_list() + ['A', 'M'] + """ - def __init__(self, - prefixes=PREFIXES, - suffix_acronyms=SUFFIX_ACRONYMS, - suffix_not_acronyms=SUFFIX_NOT_ACRONYMS, - titles=TITLES, - first_name_titles=FIRST_NAME_TITLES, - conjunctions=CONJUNCTIONS, - capitalization_exceptions=CAPITALIZATION_EXCEPTIONS, - regexes=REGEXES - ): - self.prefixes = SetManager(prefixes) - self.suffix_acronyms = SetManager(suffix_acronyms) + def __init__(self, + prefixes=PREFIXES, + suffix_acronyms=SUFFIX_ACRONYMS, + suffix_not_acronyms=SUFFIX_NOT_ACRONYMS, + titles=TITLES, + first_name_titles=FIRST_NAME_TITLES, + conjunctions=CONJUNCTIONS, + capitalization_exceptions=CAPITALIZATION_EXCEPTIONS, + regexes=REGEXES + ): + self.prefixes = SetManager(prefixes) + self.suffix_acronyms = SetManager(suffix_acronyms) self.suffix_not_acronyms = SetManager(suffix_not_acronyms) - self.titles = SetManager(titles) - self.first_name_titles = SetManager(first_name_titles) - self.conjunctions = SetManager(conjunctions) + self.titles = SetManager(titles) + self.first_name_titles = SetManager(first_name_titles) + self.conjunctions = SetManager(conjunctions) self.capitalization_exceptions = TupleManager(capitalization_exceptions) - self.regexes = TupleManager(regexes) + self.regexes = TupleManager(regexes) self._pst = None - + @property def suffixes_prefixes_titles(self): if not self._pst: @@ -242,15 +315,16 @@ def suffixes_prefixes_titles(self): def __repr__(self): return "" - + def __setstate__(self, state): 
self.__init__(state) - + def __getstate__(self): attrs = [x for x in dir(self) if not x.startswith('_')] - return dict([(a,getattr(self, a)) for a in attrs]) + return dict([(a, getattr(self, a)) for a in attrs]) + -#: A module-level instance of the :py:class:`Constants()` class. +#: A module-level instance of the :py:class:`Constants()` class. #: Provides a common instance for the module to share #: to easily adjust configuration for the entire module. #: See `Customizing the Parser with Your Own Configuration `_. diff --git a/nameparser/parser.py b/nameparser/parser.py index bd79057..6455629 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -15,6 +15,7 @@ ENCODING = 'utf-8' + def group_contiguous_integers(data): """ return list of tuples containing first and last index @@ -27,6 +28,7 @@ def group_contiguous_integers(data): ranges.append((group[0], group[-1])) return ranges + class HumanName(object): """ Parse a person's name into individual components. @@ -67,12 +69,12 @@ class HumanName(object): """ _count = 0 - _members = ['title','first','middle','last','suffix','nickname'] + _members = ['title', 'first', 'middle', 'last', 'suffix', 'nickname'] unparsable = True _full_name = '' def __init__(self, full_name="", constants=CONSTANTS, encoding=DEFAULT_ENCODING, - string_format=None): + string_format=None): self.C = constants if type(self.C) is not type(CONSTANTS): self.C = Constants() @@ -130,7 +132,7 @@ def __unicode__(self): # string_format = "{title} {first} {middle} {last} {suffix} ({nickname})" _s = self.string_format.format(**self.as_dict()) # remove trailing punctuation from missing nicknames - _s = _s.replace(str(self.C.empty_attribute_default),'').replace(" ()","").replace(" ''","").replace(' ""',"") + _s = _s.replace(str(self.C.empty_attribute_default), '').replace(" ()", "").replace(" ''", "").replace(' ""', "") return self.collapse_whitespace(_s).strip(', ') return " ".join(self) @@ -141,7 +143,7 @@ def __str__(self): def __repr__(self): 
if self.unparsable: - _string = "<%(class)s : [ Unparsable ] >" % {'class': self.__class__.__name__,} + _string = "<%(class)s : [ Unparsable ] >" % {'class': self.__class__.__name__, } else: _string = "<%(class)s : [\n\ttitle: '%(title)s' \n\tfirst: '%(first)s' \n\tmiddle: '%(middle)s' \n\tlast: '%(last)s' \n\tsuffix: '%(suffix)s'\n\tnickname: '%(nickname)s'\n]>" % { 'class': self.__class__.__name__, @@ -182,6 +184,54 @@ def as_dict(self, include_empty=True): d[m] = val return d + def initials_list(self, exclude_last_name=False, exclude_middle_name=False, exclude_first_name=False): + """ + Return period-delimited initials of the first, middle and optionally last name. + + :param bool exclude_last_name: Exclude the last name as part of the initials + :param bool exclude_middle_name: Exclude the middle name as part of the initials + :param bool exclude_first_name: Exclude the first name as part of the initials + :rtype: str + + .. doctest:: + + >>> name = HumanName("Sir Bob Andrew Dole") + >>> name.initials() + ["B", "A", "D"] + >>> name.initials(False) + ["B", "A"] + """ + initials_list = [] + if not self.C.force_exclude_first_name_initial and not exclude_first_name: + initials_list = [name[0] for name in self.first_list if len(name)] + + if not self.C.force_exclude_middle_name_initial and not exclude_middle_name: + initials_list += [name[0] for name in self.middle_list if len(name)] + + if not self.C.force_exclude_last_name_initial and not exclude_last_name: + initials_list += [name[0] for name in self.last_list if len(name)] + + return initials_list + + def initials(self, exclude_last_name=False, exclude_middle_name=False, exclude_first_name=False, ): + """ + Return period-delimited initials of the first, middle and optionally last name. + + :param bool include_last_name: Include the last name as part of the initials + :rtype: str + + .. doctest:: + + >>> name = HumanName("Sir Bob Andrew Dole") + >>> name.initials() + "B. A. D." + >>> name.initials(False) + "B. 
A." + """ + initials_list = self.initials_list(exclude_last_name, exclude_middle_name, exclude_first_name) + + return " ".join([initial + self.C.initials_delimiter for initial in initials_list]) or self.C.empty_attribute_default + @property def has_own_config(self): """ @@ -190,7 +240,7 @@ def has_own_config(self): """ return self.C is not CONSTANTS - ### attributes + # attributes @property def title(self): @@ -259,7 +309,7 @@ def surnames(self): """ return " ".join(self.surnames_list) or self.C.empty_attribute_default - ### setter methods + # setter methods def _set_list(self, attr, value): if isinstance(value, list): @@ -270,8 +320,8 @@ def _set_list(self, attr, value): val = [] else: raise TypeError( - "Can only assign strings, lists or None to name attributes." - " Got {0}".format(type(value))) + "Can only assign strings, lists or None to name attributes." + " Got {0}".format(type(value))) setattr(self, attr+"_list", self.parse_pieces(val)) @title.setter @@ -298,7 +348,7 @@ def suffix(self, value): def nickname(self, value): self._set_list('nickname', value) - ### Parse helpers + # Parse helpers def is_title(self, value): """Is in the :py:data:`~nameparser.config.titles.TITLES` set.""" @@ -331,8 +381,8 @@ def is_suffix(self, piece): `C.suffix_acronyms`. """ # suffixes may have periods inside them like "M.D." 
- return ((lc(piece).replace('.','') in self.C.suffix_acronyms) \ - or (lc(piece) in self.C.suffix_not_acronyms)) \ + return ((lc(piece).replace('.', '') in self.C.suffix_acronyms) + or (lc(piece) in self.C.suffix_not_acronyms)) \ and not self.is_an_initial(piece) def are_suffixes(self, pieces): @@ -358,8 +408,7 @@ def is_an_initial(self, value): """ return bool(self.C.regexes.initial.match(value)) - - ### full_name parser + # full_name parser @property def full_name(self): @@ -376,7 +425,7 @@ def full_name(self, value): def collapse_whitespace(self, string): # collapse multiple spaces into single space - string = self.C.regexes.spaces.sub(" ", string.strip()) + string = self.C.regexes.spaces.sub(" ", string.strip()) if string.endswith(","): string = string[:-1] return string @@ -404,7 +453,7 @@ def post_process(self): self.handle_capitalization() def fix_phd(self): - _re = self.C.regexes.phd + _re = self.C.regexes.phd match = _re.search(self._full_name) if match: self.suffix_list.append(match.group(1)) @@ -474,7 +523,6 @@ def parse_full_name(self): self.nickname_list = [] self.unparsable = True - self.pre_process() self._full_name = self.collapse_whitespace(self._full_name) @@ -516,7 +564,7 @@ def parse_full_name(self): # numeral but this piece is not an initial self.is_roman_numeral(nxt) and i == p_len - 2 and not self.is_an_initial(piece) - ): + ): self.last_list.append(piece) self.suffix_list += pieces[i+1:] break @@ -540,7 +588,6 @@ def parse_full_name(self): # title first middle last [suffix], suffix [suffix] [, suffix] # parts[0], parts[1:...] 
- self.suffix_list += parts[1:] pieces = self.parse_pieces(parts[0].split(' ')) log.debug("pieces: %s", u(pieces)) @@ -614,7 +661,6 @@ def parse_full_name(self): self.unparsable = False self.post_process() - def parse_pieces(self, parts, additional_parts_count=0): """ Split parts on spaces and remove commas, join on conjunctions and @@ -648,7 +694,7 @@ def parse_pieces(self, parts, additional_parts_count=0): # split on periods, any of the split pieces titles or suffixes? # ("Lt.Gov.") period_chunks = part.split(".") - titles = list(filter(self.is_title, period_chunks)) + titles = list(filter(self.is_title, period_chunks)) suffixes = list(filter(self.is_suffix, period_chunks)) # add the part to the constant so it will be found @@ -695,7 +741,7 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): # other, then join those newly joined conjunctions and any single # conjunctions to the piece before and after it conj_index = [i for i, piece in enumerate(pieces) - if self.is_conjunction(piece)] + if self.is_conjunction(piece)] contiguous_conj_i = [] for i, val in enumerate(conj_index): @@ -710,14 +756,14 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): delete_i = [] for i in contiguous_conj_i: if type(i) == tuple: - new_piece = " ".join(pieces[ i[0] : i[1]+1] ) - delete_i += list(range( i[0]+1, i[1]+1 )) + new_piece = " ".join(pieces[i[0]: i[1]+1]) + delete_i += list(range(i[0]+1, i[1]+1)) pieces[i[0]] = new_piece else: - new_piece = " ".join(pieces[ i : i+2 ]) + new_piece = " ".join(pieces[i: i+2]) delete_i += [i+1] pieces[i] = new_piece - #add newly joined conjunctions to constants to be found later + # add newly joined conjunctions to constants to be found later self.C.conjunctions.add(new_piece) for i in reversed(delete_i): @@ -747,9 +793,9 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): pieces[i] = new_piece pieces.pop(i+1) # subtract 1 from the index of all the remaining conjunctions - for j,val in 
enumerate(conj_index): + for j, val in enumerate(conj_index): if val > i: - conj_index[j]=val-1 + conj_index[j] = val-1 else: new_piece = " ".join(pieces[i-1:i+2]) @@ -766,11 +812,10 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): # subtract the number of removed pieces from the index # of all the remaining conjunctions - for j,val in enumerate(conj_index): + for j, val in enumerate(conj_index): if val > i: conj_index[j] = val - rm_count - # join prefixes to following lastnames: ['de la Vega'], ['van Buren'] prefixes = list(filter(self.is_prefix, pieces)) if prefixes: @@ -813,12 +858,11 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): log.debug("pieces: %s", pieces) return pieces - - ### Capitalization Support + # Capitalization Support def cap_word(self, word, attribute): - if (self.is_prefix(word) and attribute in ('last','middle')) \ - or self.is_conjunction(word): + if (self.is_prefix(word) and attribute in ('last', 'middle')) \ + or self.is_conjunction(word): return word.lower() exceptions = self.C.capitalization_exceptions if lc(word) in exceptions: @@ -834,7 +878,8 @@ def cap_after_mac(m): def cap_piece(self, piece, attribute): if not piece: return "" - replacement = lambda m: self.cap_word(m.group(0), attribute) + + def replacement(m): return self.cap_word(m.group(0), attribute) return self.C.regexes.word.sub(replacement, piece) def capitalize(self, force=None): @@ -872,10 +917,10 @@ def capitalize(self, force=None): if not force and not (name == name.upper() or name == name.lower()): return - self.title_list = self.cap_piece(self.title , 'title').split(' ') - self.first_list = self.cap_piece(self.first , 'first').split(' ') + self.title_list = self.cap_piece(self.title, 'title').split(' ') + self.first_list = self.cap_piece(self.first, 'first').split(' ') self.middle_list = self.cap_piece(self.middle, 'middle').split(' ') - self.last_list = self.cap_piece(self.last , 'last').split(' ') + self.last_list = 
self.cap_piece(self.last, 'last').split(' ') self.suffix_list = self.cap_piece(self.suffix, 'suffix').split(', ') def handle_capitalization(self): diff --git a/tests.py b/tests.py index 5f976b8..6bcb99d 100644 --- a/tests.py +++ b/tests.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals +import unittest """ Run this file to run the tests. @@ -30,7 +31,6 @@ log = logging.getLogger('HumanName') -import unittest try: unittest.expectedFailure except AttributeError: @@ -114,7 +114,6 @@ def test_get_full_name_attribute_references_internal_lists(self): hn.first_list = ["Larry"] self.m(hn.full_name, "Larry Williams", hn) - def test_assignment_to_attribute(self): hn = HumanName("John A. Kenneth Doe, Jr.") hn.last = "de la Vega" @@ -210,16 +209,16 @@ def test_assume_title_and_one_other_name_is_last_name(self): hn = HumanName("Rev Andrews") self.m(hn.title, "Rev", hn) self.m(hn.last, "Andrews", hn) - + # TODO: Seems "Andrews, M.D.", Andrews should be treated as a last name - # but other suffixes like "George Jr." should be first names. Might be + # but other suffixes like "George Jr." should be first names. 
Might be # related to https://github.com/derek73/python-nameparser/issues/2 @unittest.expectedFailure def test_assume_suffix_title_and_one_other_name_is_last_name(self): hn = HumanName("Andrews, M.D.") self.m(hn.suffix, "M.D.", hn) self.m(hn.last, "Andrews", hn) - + def test_suffix_in_lastname_part_of_lastname_comma_format(self): hn = HumanName("Smith Jr., John") self.m(hn.last, "Smith", hn) @@ -230,22 +229,22 @@ def test_sir_exception_to_first_name_rule(self): hn = HumanName("Sir Gerald") self.m(hn.title, "Sir", hn) self.m(hn.first, "Gerald", hn) - + def test_king_exception_to_first_name_rule(self): hn = HumanName("King Henry") self.m(hn.title, "King", hn) self.m(hn.first, "Henry", hn) - + def test_queen_exception_to_first_name_rule(self): hn = HumanName("Queen Elizabeth") self.m(hn.title, "Queen", hn) self.m(hn.first, "Elizabeth", hn) - + def test_dame_exception_to_first_name_rule(self): hn = HumanName("Dame Mary") self.m(hn.title, "Dame", hn) self.m(hn.first, "Mary", hn) - + def test_first_name_is_not_prefix_if_only_two_parts(self): """When there are only two parts, don't join prefixes or conjunctions""" hn = HumanName("Van Nguyen") @@ -263,7 +262,7 @@ def test_first_name_is_prefix_if_three_parts(self): hn = HumanName("Mr. 
Van Nguyen") self.m(hn.first, "Van", hn) self.m(hn.last, "Nguyen", hn) - + class HumanNameBruteForceTests(HumanNameTestBase): @@ -1084,7 +1083,7 @@ def test_multiple_conjunctions(self): def test_multiple_conjunctions2(self): hn = HumanName("part1 of and The part2 of the part3 And part4") self.m(hn.first, "part1 of and The part2 of the part3 And part4", hn) - + def test_ends_with_conjunction(self): hn = HumanName("Jon Dough and") self.m(hn.first, "Jon", hn) @@ -1242,12 +1241,12 @@ def test_le_as_last_name_with_middle_initial(self): self.m(hn.first, "Yin", hn) self.m(hn.middle, "a", hn) self.m(hn.last, "Le", hn) - + def test_conjunction_in_an_address_with_a_title(self): hn = HumanName("His Excellency Lord Duncan") self.m(hn.title, "His Excellency Lord", hn) self.m(hn.last, "Duncan", hn) - + @unittest.expectedFailure def test_conjunction_in_an_address_with_a_first_name_title(self): hn = HumanName("Her Majesty Queen Elizabeth") @@ -1272,7 +1271,7 @@ def test_add_title(self): self.m(hn.title, "Te", hn) self.m(hn.first, "Awanui-a-Rangi", hn) self.m(hn.last, "Black", hn) - + def test_remove_title(self): hn = HumanName("Hon Solo", constants=None) start_len = len(hn.C.titles) @@ -1282,7 +1281,7 @@ def test_remove_title(self): hn.parse_full_name() self.m(hn.first, "Hon", hn) self.m(hn.last, "Solo", hn) - + def test_add_multiple_arguments(self): hn = HumanName("Assoc Dean of Chemistry Robert Johns", constants=None) hn.C.titles.add('dean', 'Chemistry') @@ -1310,7 +1309,7 @@ def test_can_change_global_constants(self): self.assertEqual(hn2.has_own_config, False) # clean up so we don't mess up other tests hn.C.titles.add('hon') - + def test_remove_multiple_arguments(self): hn = HumanName("Ms Hon Solo", constants=None) hn.C.titles.remove('hon', 'ms') @@ -1370,7 +1369,7 @@ def test_nickname_in_parenthesis(self): self.m(hn.middle, "", hn) self.m(hn.last, "Franklin", hn) self.m(hn.nickname, "Ben", hn) - + def test_two_word_nickname_in_parenthesis(self): hn = HumanName("Benjamin (Big 
Ben) Franklin") self.m(hn.first, "Benjamin", hn) @@ -1391,7 +1390,7 @@ def test_nickname_in_parenthesis_with_comma(self): self.m(hn.middle, "", hn) self.m(hn.last, "Franklin", hn) self.m(hn.nickname, "Ben", hn) - + def test_nickname_in_parenthesis_with_comma_and_suffix(self): hn = HumanName("Franklin, Benjamin (Ben), Jr.") self.m(hn.first, "Benjamin", hn) @@ -1399,7 +1398,7 @@ def test_nickname_in_parenthesis_with_comma_and_suffix(self): self.m(hn.last, "Franklin", hn) self.m(hn.suffix, "Jr.", hn) self.m(hn.nickname, "Ben", hn) - + def test_nickname_in_single_quotes(self): hn = HumanName("Benjamin 'Ben' Franklin") self.m(hn.first, "Benjamin", hn) @@ -1413,28 +1412,28 @@ def test_nickname_in_double_quotes(self): self.m(hn.middle, "", hn) self.m(hn.last, "Franklin", hn) self.m(hn.nickname, "Ben", hn) - + def test_single_quotes_on_first_name_not_treated_as_nickname(self): hn = HumanName("Brian Andrew O'connor") self.m(hn.first, "Brian", hn) self.m(hn.middle, "Andrew", hn) self.m(hn.last, "O'connor", hn) self.m(hn.nickname, "", hn) - + def test_single_quotes_on_both_name_not_treated_as_nickname(self): hn = HumanName("La'tanya O'connor") self.m(hn.first, "La'tanya", hn) self.m(hn.middle, "", hn) self.m(hn.last, "O'connor", hn) self.m(hn.nickname, "", hn) - + def test_single_quotes_on_end_of_last_name_not_treated_as_nickname(self): hn = HumanName("Mari' Aube'") self.m(hn.first, "Mari'", hn) self.m(hn.middle, "", hn) self.m(hn.last, "Aube'", hn) self.m(hn.nickname, "", hn) - + def test_okina_inside_name_not_treated_as_nickname(self): hn = HumanName("Harrieta Keōpūolani Nāhiʻenaʻena") self.m(hn.first, "Harrieta", hn) @@ -1492,7 +1491,6 @@ def test_nickname_and_last_name_with_title(self): self.m(hn.nickname, "Rick", hn) - # class MaidenNameTestCase(HumanNameTestBase): # # def test_parenthesis_and_quotes_together(self): @@ -1542,12 +1540,12 @@ def test_prefix(self): hn = HumanName("Juan del Sur") self.m(hn.first, "Juan", hn) self.m(hn.last, "del Sur", hn) - + def 
test_prefix_with_period(self): hn = HumanName("Jill St. John") self.m(hn.first, "Jill", hn) self.m(hn.last, "St. John", hn) - + def test_prefix_before_two_part_last_name(self): hn = HumanName("pennie von bergen wessels") self.m(hn.first, "pennie", hn) @@ -1641,7 +1639,7 @@ def test_comma_three_conjunctions(self): class SuffixesTestCase(HumanNameTestBase): - + def test_suffix(self): hn = HumanName("Joe Franklin Jr") self.m(hn.first, "Joe", hn) @@ -1716,13 +1714,13 @@ def test_phd_conflict(self): self.m(hn.first, "Adolph", hn) self.m(hn.last, "D", hn) - # http://en.wikipedia.org/wiki/Ma_(surname) + def test_potential_suffix_that_is_also_last_name(self): hn = HumanName("Jack Ma") self.m(hn.first, "Jack", hn) self.m(hn.last, "Ma", hn) - + def test_potential_suffix_that_is_also_last_name_comma(self): hn = HumanName("Ma, Jack") self.m(hn.first, "Jack", hn) @@ -1820,27 +1818,27 @@ def test_chained_title_first_name_title_is_initials(self): self.m(hn.first, "Marc", hn) self.m(hn.middle, "Thomas", hn) self.m(hn.last, "Treadwell", hn) - + def test_conflict_with_chained_title_first_name_initial(self): hn = HumanName("U. S. 
Grant") self.m(hn.first, "U.", hn) self.m(hn.middle, "S.", hn) self.m(hn.last, "Grant", hn) - + def test_chained_title_first_name_initial_with_no_period(self): hn = HumanName("US Magistrate Judge T Michael Putnam") self.m(hn.title, "US Magistrate Judge", hn) self.m(hn.first, "T", hn) self.m(hn.middle, "Michael", hn) self.m(hn.last, "Putnam", hn) - + def test_chained_hyphenated_title(self): hn = HumanName("US Magistrate-Judge Elizabeth E Campbell") self.m(hn.title, "US Magistrate-Judge", hn) self.m(hn.first, "Elizabeth", hn) self.m(hn.middle, "E", hn) self.m(hn.last, "Campbell", hn) - + def test_chained_hyphenated_title_with_comma_suffix(self): hn = HumanName("Mag-Judge Harwell G Davis, III") self.m(hn.title, "Mag-Judge", hn) @@ -1883,7 +1881,7 @@ def test_title_with_last_initial_is_suffix(self): self.m(hn.title, "King", hn) self.m(hn.first, "John", hn) self.m(hn.last, "V.", hn) - + def test_initials_also_suffix(self): hn = HumanName("Smith, J.R.") self.m(hn.first, "J.R.", hn) @@ -2062,10 +2060,10 @@ def test_capitalize_prefix_clash_on_first_name(self): class HumanNameOutputFormatTests(HumanNameTestBase): - + def test_formatting_init_argument(self): hn = HumanName("Rev John A. Kenneth Doe III (Kenny)", - string_format="TEST1") + string_format="TEST1") self.assertEqual(u(hn), "TEST1") def test_formatting_constants_attribute(self): @@ -2160,7 +2158,7 @@ def test_formating_of_nicknames_in_middle(self): self.assertEqual(u(hn), "Rev John (Kenny) A. Kenneth Doe III") hn.nickname = '' self.assertEqual(u(hn), "Rev John A. Kenneth Doe III") - + def test_remove_emojis(self): hn = HumanName("Sam Smith 😊") self.m(hn.first, "Sam", hn) @@ -2184,6 +2182,90 @@ def test_keep_emojis(self): # test cleanup +class InitialsTestCase(HumanNameTestBase): + def test_initials(self): + hn = HumanName("Andrew Boris Petersen") + self.m(hn.initials(), "A. B. P.", hn) + self.m(hn.initials(exclude_last_name=True), "A. B.", hn) + self.m(hn.initials(exclude_middle_name=True), "A. 
P.", hn) + self.m(hn.initials(exclude_first_name=True), "B. P.", hn) + + def test_initials_complex_name(self): + hn = HumanName("Doe, John A. Kenneth, Jr.") + self.m(hn.initials(), "J. A. K. D.", hn) + self.m(hn.initials(exclude_last_name=True), "J. A. K.", hn) + self.m(hn.initials(exclude_middle_name=True), "J. D.", hn) + self.m(hn.initials(exclude_first_name=True), "A. K. D.", hn) + + def test_initials_list(self): + hn = HumanName("Andrew Boris Petersen") + self.m(hn.initials_list(), ["A", "B", "P"], hn) + self.m(hn.initials_list(exclude_last_name=True), ["A", "B"], hn) + self.m(hn.initials_list(exclude_middle_name=True), ["A", "P"], hn) + self.m(hn.initials_list(exclude_first_name=True), ["B", "P"], hn) + + def test_initials_list_complex_name(self): + hn = HumanName("Doe, John A. Kenneth, Jr.") + self.m(hn.initials_list(), ["J", "A", "K", "D"], hn) + self.m(hn.initials_list(exclude_last_name=True), ["J", "A", "K"], hn) + self.m(hn.initials_list(exclude_middle_name=True), ["J", "D"], hn) + self.m(hn.initials_list(exclude_first_name=True), ["A", "K", "D"], hn) + + def test_initials_configuration(self): + hn = HumanName("Doe, John A. Kenneth, Jr.") + from nameparser.config import CONSTANTS + + CONSTANTS.force_exclude_last_name_initial = True + self.m(hn.initials(), "J. A. K.", hn) + self.m(hn.initials(exclude_last_name=True), "J. A. K.", hn) + self.m(hn.initials(exclude_middle_name=True), "J.", hn) + self.m(hn.initials(exclude_first_name=True), "A. K.", hn) + CONSTANTS.force_exclude_last_name_initial = False + + CONSTANTS.force_exclude_middle_name_initial = True + self.m(hn.initials(), "J. D.", hn) + self.m(hn.initials(exclude_last_name=True), "J.", hn) + self.m(hn.initials(exclude_middle_name=True), "J. D.", hn) + self.m(hn.initials(exclude_first_name=True), "D.", hn) + CONSTANTS.force_exclude_middle_name_initial = False + + CONSTANTS.force_exclude_first_name_initial = True + self.m(hn.initials(), "A. K. D.", hn) + self.m(hn.initials(exclude_last_name=True), "A. 
K.", hn) + self.m(hn.initials(exclude_middle_name=True), "D.", hn) + self.m(hn.initials(exclude_first_name=True), "A. K. D.", hn) + CONSTANTS.force_exclude_first_name_initial = False + + CONSTANTS.initials_delimiter = '' + self.m(hn.initials(), "J A K D", hn) + CONSTANTS.initials_delimiter = '.' + + def test_initials_configuration_list(self): + hn = HumanName("Doe, John A. Kenneth, Jr.") + from nameparser.config import CONSTANTS + + CONSTANTS.force_exclude_last_name_initial = True + self.m(hn.initials_list(), ["J", "A", "K"], hn) + self.m(hn.initials_list(exclude_last_name=True), ["J", "A", "K"], hn) + self.m(hn.initials_list(exclude_middle_name=True), ["J"], hn) + self.m(hn.initials_list(exclude_first_name=True), ["A", "K"], hn) + CONSTANTS.force_exclude_last_name_initial = False + + CONSTANTS.force_exclude_middle_name_initial = True + self.m(hn.initials_list(), ["J", "D"], hn) + self.m(hn.initials_list(exclude_last_name=True), ["J"], hn) + self.m(hn.initials_list(exclude_middle_name=True), ["J", "D"], hn) + self.m(hn.initials_list(exclude_first_name=True), ["D"], hn) + CONSTANTS.force_exclude_middle_name_initial = False + + CONSTANTS.force_exclude_first_name_initial = True + self.m(hn.initials_list(), ["A", "K", "D"], hn) + self.m(hn.initials_list(exclude_last_name=True), ["A", "K"], hn) + self.m(hn.initials_list(exclude_middle_name=True), ["D"], hn) + self.m(hn.initials_list(exclude_first_name=True), ["A", "K", "D"], hn) + CONSTANTS.force_exclude_first_name_initial = False + + TEST_NAMES = ( "John Doe", "John Doe, Jr.", @@ -2359,7 +2441,7 @@ def test_keep_emojis(self): "U.S. District Judge Marc Thomas Treadwell", "Dra. Andréia da Silva", "Srta. 
Andréia da Silva", - + ) From b9be118b89bdd71feca8e379e7db0a7fd8d23e17 Mon Sep 17 00:00:00 2001 From: Rink Stiekema Date: Thu, 21 Oct 2021 12:16:38 +0200 Subject: [PATCH 110/163] Removed unnecessary comma --- nameparser/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nameparser/parser.py b/nameparser/parser.py index 6455629..7b4c2e9 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -213,7 +213,7 @@ def initials_list(self, exclude_last_name=False, exclude_middle_name=False, excl return initials_list - def initials(self, exclude_last_name=False, exclude_middle_name=False, exclude_first_name=False, ): + def initials(self, exclude_last_name=False, exclude_middle_name=False, exclude_first_name=False): """ Return period-delimited initials of the first, middle and optionally last name. From 19e1893058988c49f302cf01eb946ffb05f6a1a0 Mon Sep 17 00:00:00 2001 From: Rink Stiekema Date: Fri, 22 Oct 2021 16:59:29 +0200 Subject: [PATCH 111/163] Use string formatting for initials --- docs/usage.rst | 40 ++++++++++----- nameparser/config/__init__.py | 80 +++++------------------------ nameparser/parser.py | 42 +++++++-------- tests.py | 97 ++++++++++++----------------------- 4 files changed, 91 insertions(+), 168 deletions(-) diff --git a/docs/usage.rst b/docs/usage.rst index 01beb48..e6e7c40 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -181,24 +181,36 @@ Initials Support The HumanName class can try to get the correct representation of initials. Initials can be tricky as different format usages exist. -If you want to exclude on of the name parts from the initials, you can use one of the following boolean parameters: -`exclude_last_name`, `exclude_middle_name` or `exclude_first_name` +If you want to exclude on of the name parts from the initials, you can use the initials format by chainging +:py:attr:`~nameparser.config.Constants.initials_format` +Three attributes exist for the format, `first`, `middle` and `last`. + +.. 
doctest:: initials format + + >>> from nameparser.config import CONSTANTS + >>> CONSTANTS.initials_format = "{first} {middle}" + >>> HumanName("Doe, John A. Kenneth, Jr.").initials() + 'J. A. K.' + >>> HumanName("Doe, John A. Kenneth, Jr.", initials_format="{last}, {first}).initials() + 'D., J.' -You can also force the behavior using the CONSTANTS: -:py:attr:`~nameparser.config.Constants.force_exclude_last_name` -:py:attr:`~nameparser.config.Constants.force_exclude_middle_name` -:py:attr:`~nameparser.config.Constants.force_exclude_first_name` Furthermore, the delimiter for the string output can be set through: :py:attr:`~nameparser.config.Constants.initials_delimiter` -.. doctest:: initials +.. doctest:: initials delimiter + + >>> HumanName("Doe, John A. Kenneth, Jr.", initials_delimiter=";").initials() + "J; A; K;" + >>> from nameparser.config import CONSTANTS + >>> CONSTANTS.initials_delimiter = "." + >>> HumanName("Doe, John A. Kenneth, Jr.", initials_format="{first}{middle}{last}).initials() + "J.A.K.D." + +If you want to receive a list representation of the initials, yo ucan use :py:meth:`~nameparser.HumanName.initials_list`. +This function is unaffected by :py:attr:`~nameparser.config.Constants.initials_format` - >>> name = HumanName("Doe, John A. Kenneth, Jr.") - >>> name.initials() - 'J. A. K. D.' - >>> name.initials(exclude_last_name) - 'J. A. K.' - >>> name.initials_list(exclude_middle_name): - ['J', 'D'] +.. doctest:: list format + >>> HumanName("Doe, John A. Kenneth, Jr.", initials_delimiter=";").initials_list() + ["J", "A", "K", "D"] diff --git a/nameparser/config/__init__.py b/nameparser/config/__init__.py index 488c899..7b2baef 100644 --- a/nameparser/config/__init__.py +++ b/nameparser/config/__init__.py @@ -172,6 +172,18 @@ class Constants(object): """ The default string format use for all new `HumanName` instances. """ + + initials_format = "{first} {middle} {last}" + """ + The default initials format used for all new `HumanName` instances. 
+ """ + + initials_delimiter = "." + """ + The default initials delimiter used for all new `HumanName` instances. + Will be used to add a delimiter between each initial. + """ + empty_attribute_default = '' """ Default return value for empty attributes. @@ -203,23 +215,6 @@ class Constants(object): """ - initials_delimiter = '.' - """" - Determines how the initials from :py:meth:`~nameparser.parser.HumanName.initials` are seperated. - - .. doctest:: - - >>> from nameparser.config import CONSTANTS - >>> HumanName('Shirley Maclaine').initials() - 'S. M.' - >>> CONSTANTS.initials_delimiter = '' - >>> HumanName('Shirley Maclaine').initials() - 'S M' - >>> CONSTANTS.initials_delimiter = '-' - >>> HumanName('Shirley Maclaine').initials() - 'S- M-' - """ - force_mixed_case_capitalization = False """ If set, forces the capitalization of mixed case strings when @@ -236,57 +231,6 @@ class Constants(object): """ - force_exclude_last_name_initial = False - """ - If True, forces the last name to be excluded in the initials when - :py:meth:`~nameparser.parser.HumanName.initials` or - :py:meth:`~nameparser.parser.HumanName.initials_list` is called. - - .. doctest:: - - >>> from nameparser.config import CONSTANTS - >>> CONSTANTS.force_exclude_last_name_initial = True - >>> name = HumanName('Shirley Ashley Maclaine') - >>> name.initials() - 'S. A.' - >>> name.initials_list() - ['S', 'A'] - """ - - force_exclude_middle_name_initial = False - """ - If True, forces the middle name to be included in the initials when - :py:meth:`~nameparser.parser.HumanName.initials` or - :py:meth:`~nameparser.parser.HumanName.initials_list` is called. - - .. doctest:: - - >>> from nameparser.config import CONSTANTS - >>> CONSTANTS.force_exclude_middle_name_initial = True - >>> name = HumanName('Shirley Ashley Maclaine') - >>> name.initials() - 'S. M.' 
- >>> name.initials_list() - ['S', 'M'] - """ - - force_exclude_first_name_initial = False - """ - If True, forces the first name to be included in the initials when - :py:meth:`~nameparser.parser.HumanName.initials` or - :py:meth:`~nameparser.parser.HumanName.initials_list` is called. - - .. doctest:: - - >>> from nameparser.config import CONSTANTS - >>> CONSTANTS.force_exclude_first_name_initial = True - >>> name = HumanName('Shirley Ashley Maclaine') - >>> name.initials() - 'A. M.' - >>> name.initials_list() - ['A', 'M'] - """ - def __init__(self, prefixes=PREFIXES, suffix_acronyms=SUFFIX_ACRONYMS, diff --git a/nameparser/parser.py b/nameparser/parser.py index 7b4c2e9..52ad6bd 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -53,6 +53,8 @@ class HumanName(object): `per-instance config `_. :param str encoding: string representing the encoding of your input :param str string_format: python string formatting + :param str initials_format: python initials string formatting + :param str initials_delimter: string delimiter for initials """ C = CONSTANTS @@ -74,13 +76,15 @@ class HumanName(object): _full_name = '' def __init__(self, full_name="", constants=CONSTANTS, encoding=DEFAULT_ENCODING, - string_format=None): + string_format=None, initials_format=None, initials_delimiter=None): self.C = constants if type(self.C) is not type(CONSTANTS): self.C = Constants() self.encoding = encoding self.string_format = string_format or self.C.string_format + self.initials_format = initials_format or self.C.initials_format + self.initials_delimiter = initials_delimiter or self.C.initials_delimiter # full_name setter triggers the parse self.full_name = full_name @@ -184,36 +188,27 @@ def as_dict(self, include_empty=True): d[m] = val return d - def initials_list(self, exclude_last_name=False, exclude_middle_name=False, exclude_first_name=False): + def initials_list(self): """ - Return period-delimited initials of the first, middle and optionally last name. 
- - :param bool exclude_last_name: Exclude the last name as part of the initials - :param bool exclude_middle_name: Exclude the middle name as part of the initials - :param bool exclude_first_name: Exclude the first name as part of the initials - :rtype: str + Returns the initials as a list .. doctest:: >>> name = HumanName("Sir Bob Andrew Dole") >>> name.initials() ["B", "A", "D"] - >>> name.initials(False) - ["B", "A"] + >>> name = HumanName("J. Doe") + >>> name.initials() + ["J", "D"] """ initials_list = [] - if not self.C.force_exclude_first_name_initial and not exclude_first_name: - initials_list = [name[0] for name in self.first_list if len(name)] - - if not self.C.force_exclude_middle_name_initial and not exclude_middle_name: - initials_list += [name[0] for name in self.middle_list if len(name)] - - if not self.C.force_exclude_last_name_initial and not exclude_last_name: - initials_list += [name[0] for name in self.last_list if len(name)] + initials_list = [name[0] for name in self.first_list if len(name)] + initials_list += [name[0] for name in self.middle_list if len(name)] + initials_list += [name[0] for name in self.last_list if len(name)] return initials_list - def initials(self, exclude_last_name=False, exclude_middle_name=False, exclude_first_name=False): + def initials(self): """ Return period-delimited initials of the first, middle and optionally last name. @@ -228,9 +223,14 @@ def initials(self, exclude_last_name=False, exclude_middle_name=False, exclude_f >>> name.initials(False) "B. A." 
""" - initials_list = self.initials_list(exclude_last_name, exclude_middle_name, exclude_first_name) - return " ".join([initial + self.C.initials_delimiter for initial in initials_list]) or self.C.empty_attribute_default + initials_dict = { + "first": (self.initials_delimiter + " ").join([name[0] for name in self.first_list if len(name)]) + self.initials_delimiter, + "middle": (self.initials_delimiter + " ").join([name[0] for name in self.middle_list if len(name)]) + self.initials_delimiter, + "last": (self.initials_delimiter + " ").join([name[0] for name in self.last_list if len(name)]) + self.initials_delimiter + } + + return self.initials_format.format(**initials_dict) @property def has_own_config(self): diff --git a/tests.py b/tests.py index 6bcb99d..0c884e5 100644 --- a/tests.py +++ b/tests.py @@ -2186,84 +2186,51 @@ class InitialsTestCase(HumanNameTestBase): def test_initials(self): hn = HumanName("Andrew Boris Petersen") self.m(hn.initials(), "A. B. P.", hn) - self.m(hn.initials(exclude_last_name=True), "A. B.", hn) - self.m(hn.initials(exclude_middle_name=True), "A. P.", hn) - self.m(hn.initials(exclude_first_name=True), "B. P.", hn) def test_initials_complex_name(self): hn = HumanName("Doe, John A. Kenneth, Jr.") self.m(hn.initials(), "J. A. K. D.", hn) - self.m(hn.initials(exclude_last_name=True), "J. A. K.", hn) - self.m(hn.initials(exclude_middle_name=True), "J. D.", hn) - self.m(hn.initials(exclude_first_name=True), "A. K. D.", hn) - def test_initials_list(self): - hn = HumanName("Andrew Boris Petersen") - self.m(hn.initials_list(), ["A", "B", "P"], hn) - self.m(hn.initials_list(exclude_last_name=True), ["A", "B"], hn) - self.m(hn.initials_list(exclude_middle_name=True), ["A", "P"], hn) - self.m(hn.initials_list(exclude_first_name=True), ["B", "P"], hn) + def test_initials_format(self): + hn = HumanName("Doe, John A. Kenneth, Jr.", initials_format="{first} {middle}") + self.m(hn.initials(), "J. A. K.", hn) + hn = HumanName("Doe, John A. 
Kenneth, Jr.", initials_format="{first} {last}") + self.m(hn.initials(), "J. D.", hn) + hn = HumanName("Doe, John A. Kenneth, Jr.", initials_format="{middle} {last}") + self.m(hn.initials(), "A. K. D.", hn) + hn = HumanName("Doe, John A. Kenneth, Jr.", initials_format="{first}, {last}") + self.m(hn.initials(), "J., D.", hn) - def test_initials_list_complex_name(self): + def test_initials_format_constants(self): + from nameparser.config import CONSTANTS + orig_format = CONSTANTS.initials_format + CONSTANTS.initials_format = "{first} {last}" hn = HumanName("Doe, John A. Kenneth, Jr.") - self.m(hn.initials_list(), ["J", "A", "K", "D"], hn) - self.m(hn.initials_list(exclude_last_name=True), ["J", "A", "K"], hn) - self.m(hn.initials_list(exclude_middle_name=True), ["J", "D"], hn) - self.m(hn.initials_list(exclude_first_name=True), ["A", "K", "D"], hn) - - def test_initials_configuration(self): + self.m(hn.initials(), "J. D.", hn) + CONSTANTS.initials_format = "{first} {last}" hn = HumanName("Doe, John A. Kenneth, Jr.") - from nameparser.config import CONSTANTS + self.m(hn.initials(), "J. D.", hn) + CONSTANTS.initials_format = orig_format - CONSTANTS.force_exclude_last_name_initial = True - self.m(hn.initials(), "J. A. K.", hn) - self.m(hn.initials(exclude_last_name=True), "J. A. K.", hn) - self.m(hn.initials(exclude_middle_name=True), "J.", hn) - self.m(hn.initials(exclude_first_name=True), "A. K.", hn) - CONSTANTS.force_exclude_last_name_initial = False + def test_initials_delimiter(self): + hn = HumanName("Doe, John A. Kenneth, Jr.", initials_delimiter=";") + self.m(hn.initials(), "J; A; K; D;", hn) - CONSTANTS.force_exclude_middle_name_initial = True - self.m(hn.initials(), "J. D.", hn) - self.m(hn.initials(exclude_last_name=True), "J.", hn) - self.m(hn.initials(exclude_middle_name=True), "J. 
D.", hn) - self.m(hn.initials(exclude_first_name=True), "D.", hn) - CONSTANTS.force_exclude_middle_name_initial = False - - CONSTANTS.force_exclude_first_name_initial = True - self.m(hn.initials(), "A. K. D.", hn) - self.m(hn.initials(exclude_last_name=True), "A. K.", hn) - self.m(hn.initials(exclude_middle_name=True), "D.", hn) - self.m(hn.initials(exclude_first_name=True), "A. K. D.", hn) - CONSTANTS.force_exclude_first_name_initial = False + def test_initials_delimiter_constants(self): + from nameparser.config import CONSTANTS + orig_delimiter = CONSTANTS.initials_delimiter + CONSTANTS.initials_delimiter = ";" + hn = HumanName("Doe, John A. Kenneth, Jr.") + self.m(hn.initials(), "J; A; K; D;", hn) + CONSTANTS.initials_delimiter = orig_delimiter - CONSTANTS.initials_delimiter = '' - self.m(hn.initials(), "J A K D", hn) - CONSTANTS.initials_delimiter = '.' + def test_initials_list(self): + hn = HumanName("Andrew Boris Petersen") + self.m(hn.initials_list(), ["A", "B", "P"], hn) - def test_initials_configuration_list(self): + def test_initials_list_complex_name(self): hn = HumanName("Doe, John A. 
Kenneth, Jr.") - from nameparser.config import CONSTANTS - - CONSTANTS.force_exclude_last_name_initial = True - self.m(hn.initials_list(), ["J", "A", "K"], hn) - self.m(hn.initials_list(exclude_last_name=True), ["J", "A", "K"], hn) - self.m(hn.initials_list(exclude_middle_name=True), ["J"], hn) - self.m(hn.initials_list(exclude_first_name=True), ["A", "K"], hn) - CONSTANTS.force_exclude_last_name_initial = False - - CONSTANTS.force_exclude_middle_name_initial = True - self.m(hn.initials_list(), ["J", "D"], hn) - self.m(hn.initials_list(exclude_last_name=True), ["J"], hn) - self.m(hn.initials_list(exclude_middle_name=True), ["J", "D"], hn) - self.m(hn.initials_list(exclude_first_name=True), ["D"], hn) - CONSTANTS.force_exclude_middle_name_initial = False - - CONSTANTS.force_exclude_first_name_initial = True - self.m(hn.initials_list(), ["A", "K", "D"], hn) - self.m(hn.initials_list(exclude_last_name=True), ["A", "K"], hn) - self.m(hn.initials_list(exclude_middle_name=True), ["D"], hn) - self.m(hn.initials_list(exclude_first_name=True), ["A", "K", "D"], hn) - CONSTANTS.force_exclude_first_name_initial = False + self.m(hn.initials_list(), ["J", "A", "K", "D"], hn) TEST_NAMES = ( From 22f4d0cff7984c135881d5f56a0a882472140fc7 Mon Sep 17 00:00:00 2001 From: Rink Stiekema Date: Tue, 26 Oct 2021 14:19:37 +0200 Subject: [PATCH 112/163] Handle missing middlename case for initials and collapse whitespace --- nameparser/parser.py | 16 ++++++++++++---- tests.py | 22 +++++++++++++++++----- 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/nameparser/parser.py b/nameparser/parser.py index 52ad6bd..77c8217 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -224,13 +224,21 @@ def initials(self): "B. A." 
""" + first_initials_list = [name[0] for name in self.first_list] + middle_initials_list = [name[0] for name in self.middle_list] + last_initials_list = [name[0] for name in self.last_list] + initials_dict = { - "first": (self.initials_delimiter + " ").join([name[0] for name in self.first_list if len(name)]) + self.initials_delimiter, - "middle": (self.initials_delimiter + " ").join([name[0] for name in self.middle_list if len(name)]) + self.initials_delimiter, - "last": (self.initials_delimiter + " ").join([name[0] for name in self.last_list if len(name)]) + self.initials_delimiter + "first": (self.initials_delimiter + " ").join(first_initials_list) + self.initials_delimiter + if len(first_initials_list) else self.C.empty_attribute_default, + "middle": (self.initials_delimiter + " ").join(middle_initials_list) + self.initials_delimiter + if len(middle_initials_list) else self.C.empty_attribute_default, + "last": (self.initials_delimiter + " ").join(last_initials_list) + self.initials_delimiter + if len(last_initials_list) else self.C.empty_attribute_default } - return self.initials_format.format(**initials_dict) + _s = self.initials_format.format(**initials_dict) + return self.collapse_whitespace(_s) @property def has_own_config(self): diff --git a/tests.py b/tests.py index 0c884e5..f075cb6 100644 --- a/tests.py +++ b/tests.py @@ -2187,6 +2187,18 @@ def test_initials(self): hn = HumanName("Andrew Boris Petersen") self.m(hn.initials(), "A. B. P.", hn) + def test_initials_simple_name(self): + hn = HumanName("John Doe") + self.m(hn.initials(), "J. D.", hn) + hn = HumanName("John Doe", initials_format="{first} {last}") + self.m(hn.initials(), "J. 
D.", hn) + hn = HumanName("John Doe", initials_format="{last}") + self.m(hn.initials(), "D.", hn) + hn = HumanName("John Doe", initials_format="{first}") + self.m(hn.initials(), "J.", hn) + hn = HumanName("John Doe", initials_format="{middle}") + self.m(hn.initials(), "", hn) + def test_initials_complex_name(self): hn = HumanName("Doe, John A. Kenneth, Jr.") self.m(hn.initials(), "J. A. K. D.", hn) @@ -2203,14 +2215,14 @@ def test_initials_format(self): def test_initials_format_constants(self): from nameparser.config import CONSTANTS - orig_format = CONSTANTS.initials_format + _orig = CONSTANTS.initials_format CONSTANTS.initials_format = "{first} {last}" hn = HumanName("Doe, John A. Kenneth, Jr.") self.m(hn.initials(), "J. D.", hn) CONSTANTS.initials_format = "{first} {last}" hn = HumanName("Doe, John A. Kenneth, Jr.") - self.m(hn.initials(), "J. D.", hn) - CONSTANTS.initials_format = orig_format + self.m(hn.initials(), "J. D.", hn) + CONSTANTS.initials_format = _orig def test_initials_delimiter(self): hn = HumanName("Doe, John A. Kenneth, Jr.", initials_delimiter=";") @@ -2218,11 +2230,11 @@ def test_initials_delimiter(self): def test_initials_delimiter_constants(self): from nameparser.config import CONSTANTS - orig_delimiter = CONSTANTS.initials_delimiter + _orig = CONSTANTS.initials_delimiter CONSTANTS.initials_delimiter = ";" hn = HumanName("Doe, John A. 
Kenneth, Jr.") self.m(hn.initials(), "J; A; K; D;", hn) - CONSTANTS.initials_delimiter = orig_delimiter + CONSTANTS.initials_delimiter = _orig def test_initials_list(self): hn = HumanName("Andrew Boris Petersen") From 606f77c1b435db8a5f56530b47db25aad34f8308 Mon Sep 17 00:00:00 2001 From: Rink Stiekema Date: Wed, 27 Oct 2021 13:04:01 +0200 Subject: [PATCH 113/163] Filter conjunctions and prefixes from initials --- nameparser/parser.py | 29 +++++++++++++++++------------ tests.py | 1 + 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/nameparser/parser.py b/nameparser/parser.py index 77c8217..f068f5d 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -188,6 +188,12 @@ def as_dict(self, include_empty=True): d[m] = val return d + def process_initial(self, name_part): + """ + Name parts may include prefixes or conjuctions. This function filters these from the name. + """ + return " ".join([split for split in name_part.split(" ") if len(split) and not (self.is_prefix(split) or self.is_conjunction(split))])[0] + def initials_list(self): """ Returns the initials as a list @@ -195,18 +201,16 @@ def initials_list(self): .. doctest:: >>> name = HumanName("Sir Bob Andrew Dole") - >>> name.initials() + >>> name.initials_list() ["B", "A", "D"] >>> name = HumanName("J. 
Doe") - >>> name.initials() + >>> name.initials_list() ["J", "D"] """ - initials_list = [] - initials_list = [name[0] for name in self.first_list if len(name)] - initials_list += [name[0] for name in self.middle_list if len(name)] - initials_list += [name[0] for name in self.last_list if len(name)] - - return initials_list + first_initials_list = [self.__process_initial__(name) for name in self.first_list if name] + middle_initials_list = [self.__process_initial__(name) for name in self.middle_list if name] + last_initials_list = [self.__process_initial__(name) for name in self.last_list if name] + return first_initials_list + middle_initials_list + last_initials_list def initials(self): """ @@ -220,13 +224,14 @@ def initials(self): >>> name = HumanName("Sir Bob Andrew Dole") >>> name.initials() "B. A. D." - >>> name.initials(False) + >>> name = HumanName("Sir Bob Andrew Dole", initials_format="{first} {middle}") + >>> name.initials() "B. A." """ - first_initials_list = [name[0] for name in self.first_list] - middle_initials_list = [name[0] for name in self.middle_list] - last_initials_list = [name[0] for name in self.last_list] + first_initials_list = [self.__process_initial__(name) for name in self.first_list if name] + middle_initials_list = [self.__process_initial__(name) for name in self.middle_list if name] + last_initials_list = [self.__process_initial__(name) for name in self.last_list if name] initials_dict = { "first": (self.initials_delimiter + " ").join(first_initials_list) + self.initials_delimiter diff --git a/tests.py b/tests.py index f075cb6..4e14b1d 100644 --- a/tests.py +++ b/tests.py @@ -2472,6 +2472,7 @@ def test_variations_of_TEST_NAMES(self): print((repr(hn_instance))) hn_instance.capitalize() print((repr(hn_instance))) + print("Initials: " + hn_instance.initials()) else: print("-"*80) print("Running tests") From fd3e8470477bd38fa76e3d6dcecb23e7a78faa71 Mon Sep 17 00:00:00 2001 From: Rink Stiekema Date: Mon, 8 Nov 2021 16:27:48 +0100 Subject: 
[PATCH 114/163] Rename parse_initial to __parse_initial__ --- nameparser/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nameparser/parser.py b/nameparser/parser.py index f068f5d..bbde276 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -188,7 +188,7 @@ def as_dict(self, include_empty=True): d[m] = val return d - def process_initial(self, name_part): + def __process_initial__(self, name_part): """ Name parts may include prefixes or conjuctions. This function filters these from the name. """ From 377bea2a1525fc9b22055fd5976f440185b08a00 Mon Sep 17 00:00:00 2001 From: Rink Stiekema Date: Thu, 11 Nov 2021 16:30:05 +0100 Subject: [PATCH 115/163] Check if name_part exists --- nameparser/parser.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nameparser/parser.py b/nameparser/parser.py index bbde276..bd93e81 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -192,7 +192,8 @@ def __process_initial__(self, name_part): """ Name parts may include prefixes or conjuctions. This function filters these from the name. """ - return " ".join([split for split in name_part.split(" ") if len(split) and not (self.is_prefix(split) or self.is_conjunction(split))])[0] + parsed = " ".join([split for split in name_part.split(" ") if len(split) and not (self.is_prefix(split) or self.is_conjunction(split))]) + return parsed[0] if len(parsed) else "" def initials_list(self): """ From b4fca52465902ad4b1134f8d612f6a287b84717f Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 3 Jan 2022 12:06:44 -0800 Subject: [PATCH 116/163] Update usage.rst update documentation --- docs/usage.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/usage.rst b/docs/usage.rst index e6e7c40..7fbe274 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -181,7 +181,7 @@ Initials Support The HumanName class can try to get the correct representation of initials. 
Initials can be tricky as different format usages exist. -If you want to exclude on of the name parts from the initials, you can use the initials format by chainging +To exclude any of the name parts from the initials, change the initials format string: :py:attr:`~nameparser.config.Constants.initials_format` Three attributes exist for the format, `first`, `middle` and `last`. @@ -207,7 +207,7 @@ Furthermore, the delimiter for the string output can be set through: >>> HumanName("Doe, John A. Kenneth, Jr.", initials_format="{first}{middle}{last}).initials() "J.A.K.D." -If you want to receive a list representation of the initials, yo ucan use :py:meth:`~nameparser.HumanName.initials_list`. +To get a list representation of the initials, use :py:meth:`~nameparser.HumanName.initials_list`. This function is unaffected by :py:attr:`~nameparser.config.Constants.initials_format` .. doctest:: list format From 3623394debbda9deb6d93aee9e2bb429db9d2912 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 3 Jan 2022 12:07:02 -0800 Subject: [PATCH 117/163] Update dev-requirements.txt prune old dev requirements --- dev-requirements.txt | 4 ---- 1 file changed, 4 deletions(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index 8aab0b6..edd07b3 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,6 +1,2 @@ -ipdb -nose>=1.3.7 -coverage>=4.0.3 dill>=0.2.5 -twine Sphinx From 0e9d7e64fb9231c5572bddcc7c3c12a3681398b3 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 3 Jan 2022 12:08:14 -0800 Subject: [PATCH 118/163] Create python-package.yml create workflow for testing package on different versions of python --- .github/workflows/python-package.yml | 40 ++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 .github/workflows/python-package.yml diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml new file mode 100644 index 0000000..5c7731e --- /dev/null +++ 
b/.github/workflows/python-package.yml @@ -0,0 +1,40 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions + +name: Python package + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.8", "3.9", "3.10"] + + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install flake8 pytest + if [ -f dev-requirements.txt ]; then pip install -r dev-requirements.txt; fi + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + run: | + pytest From fd78ec6e20380454424730f15aa51683f9da1e45 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 3 Jan 2022 12:27:06 -0800 Subject: [PATCH 119/163] bump to version 1.1.0 --- docs/release_log.rst | 2 ++ nameparser/__init__.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/release_log.rst b/docs/release_log.rst index 38e76e4..91d4068 100644 --- a/docs/release_log.rst +++ b/docs/release_log.rst @@ -1,5 +1,7 @@ Release Log =========== +* 1.1.0 - January 3, 2022 + - Add initials support (#128) * 1.0.6 - February 8, 2020 - Fix Python 3.8 syntax error (#104) * 1.0.5 - Dec 12, 2019 diff --git a/nameparser/__init__.py b/nameparser/__init__.py index 6c898ba..a9ee753 100644 --- a/nameparser/__init__.py +++ b/nameparser/__init__.py @@ -1,4 +1,4 @@ -VERSION = (1, 0, 6) +VERSION = (1, 1, 0) __version__ = '.'.join(map(str, VERSION)) __author__ = "Derek Gulbranson" __author_email__ = 'derek73@gmail.com' From 3f4c623ea934a0c5d6dd9cfdc1e79329c7bea20d Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 3 Jan 2022 12:29:13 -0800 Subject: [PATCH 120/163] don't lint --- .github/workflows/python-package.yml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 5c7731e..11aaa6b 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -29,12 +29,6 @@ jobs: python -m pip install --upgrade pip python -m pip install flake8 pytest if [ -f dev-requirements.txt ]; then pip install -r dev-requirements.txt; fi - - name: Lint with flake8 - run: | - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - name: Test with pytest run: | pytest From f401d9ce40a8d5dd81879e7eaa718701820a482e Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 3 Jan 2022 12:31:12 -0800 Subject: [PATCH 121/163] run tests --- .github/workflows/python-package.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 11aaa6b..e6f2be7 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -27,8 +27,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install flake8 pytest if [ -f dev-requirements.txt ]; then pip install -r dev-requirements.txt; fi - - name: Test with pytest + - name: Run Tests run: | - pytest + python tests.py From 95a20cdca787211de7dbf50348afa813e7789223 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 3 Jan 2022 12:36:21 -0800 Subject: [PATCH 122/163] run tests back to python 3.5 --- .github/workflows/python-package.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index e6f2be7..721acaa 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -1,7 +1,7 @@ # This workflow will install Python dependencies, run tests and lint with a variety of Python versions # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions -name: Python package +name: Test the Python package on: push: @@ -16,7 +16,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.8", "3.9", "3.10"] + python-version: ["3.5", "3.6", "3.7", "3.8", "3.9", "3.10"] steps: - uses: actions/checkout@v2 From 2a54e19e6b90aa8b8b49a907d3cff3f44e6918e4 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 3 Jan 2022 12:48:08 -0800 
Subject: [PATCH 123/163] add publish workflow --- .github/workflows/python-publish.yml | 36 ++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 .github/workflows/python-publish.yml diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml new file mode 100644 index 0000000..b6b3cb1 --- /dev/null +++ b/.github/workflows/python-publish.yml @@ -0,0 +1,36 @@ +# This workflow will upload a Python Package using Twine when a release is created +# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries + +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. + +name: Upload Published Python Package + +on: + release: + types: [published] + +jobs: + deploy: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.x' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install build + - name: Build package + run: python -m build + - name: Publish package + uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 + with: + user: __token__ + password: ${{ secrets.PYPI_API_TOKEN }} From 4421e7b506606ee60e24d97116c095071ca3d826 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 3 Jan 2022 13:14:33 -0800 Subject: [PATCH 124/163] remove duplicate titles these titles are already present --- nameparser/config/titles.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/nameparser/config/titles.py b/nameparser/config/titles.py index 91a9ac3..aa709b0 100644 --- a/nameparser/config/titles.py +++ b/nameparser/config/titles.py @@ -280,7 +280,6 @@ 'expert', 'fadm', 'family', - 'father', 'federal', 'field', 'film', @@ -318,8 +317,6 @@ 
'high', 'highness', 'his', - 'his eminence', - 'his eminence metropolitan', 'historian', 'historicus', 'historien', From 1fce4857b8d0288d9a5b64ab8ec42d2ec0a4d35f Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 3 Jan 2022 13:26:07 -0800 Subject: [PATCH 125/163] remove duplicates, add tests --- nameparser/config/prefixes.py | 1 - tests.py | 15 +++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/nameparser/config/prefixes.py b/nameparser/config/prefixes.py index 99bc9f4..d4356ce 100644 --- a/nameparser/config/prefixes.py +++ b/nameparser/config/prefixes.py @@ -42,7 +42,6 @@ 'ste', 'van', 'vander', - 'van der', 'vel', 'von', 'vom', diff --git a/tests.py b/tests.py index 4e14b1d..d2d433e 100644 --- a/tests.py +++ b/tests.py @@ -1979,6 +1979,21 @@ def test_title_with_periods_lastname_comma(self): self.m(hn.first, "John", hn) self.m(hn.last, "Doe", hn) + def test_mac_with_spaces(self): + hn = HumanName("Jane Mac Beth") + self.m(hn.first, "Jane", hn) + self.m(hn.last, "Mac Beth", hn) + + def test_mac_as_first_name(self): + hn = HumanName("Mac Miller") + self.m(hn.first, "Mac", hn) + self.m(hn.last, "Miller", hn) + + def test_multiple_prefixes(self): + hn = HumanName("Mike van der Velt") + self.m(hn.first, "Mike", hn) + self.m(hn.last, "van der Velt", hn) + class HumanNameCapitalizationTestCase(HumanNameTestBase): def test_capitalization_exception_for_III(self): From 0f8fcaf735e71e4f4ac2e2ae83d3827b60cbed35 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 3 Jan 2022 14:14:19 -0800 Subject: [PATCH 126/163] don't crash if regexes not defined test can override all of the constants --- nameparser/parser.py | 26 ++++++++++++-------- tests.py | 56 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 71 insertions(+), 11 deletions(-) diff --git a/nameparser/parser.py b/nameparser/parser.py index b6afd09..b4eb677 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -2,6 +2,7 @@ from __future__ import 
unicode_literals import sys +import re from operator import itemgetter from itertools import groupby @@ -467,11 +468,14 @@ def post_process(self): self.handle_capitalization() def fix_phd(self): - _re = self.C.regexes.phd - match = _re.search(self._full_name) - if match: - self.suffix_list.append(match.group(1)) - self._full_name = _re.sub('', self._full_name) + try: + _re = self.C.regexes.phd + match = _re.search(self._full_name) + if match: + self.suffix_list.append(match.group(1)) + self._full_name = _re.sub('', self._full_name) + except AttributeError: + pass def parse_nicknames(self): """ @@ -485,10 +489,12 @@ def parse_nicknames(self): Loops through 3 :py:data:`~nameparser.config.regexes.REGEXES`; `quoted_word`, `double_quotes` and `parenthesis`. """ - - re_quoted_word = self.C.regexes.quoted_word - re_double_quotes = self.C.regexes.double_quotes - re_parenthesis = self.C.regexes.parenthesis + + empty_re = re.compile("") + + re_quoted_word = self.C.regexes.quoted_word or empty_re + re_double_quotes = self.C.regexes.double_quotes or empty_re + re_parenthesis = self.C.regexes.parenthesis or empty_re for _re in (re_quoted_word, re_double_quotes, re_parenthesis): if _re.search(self._full_name): @@ -704,7 +710,7 @@ def parse_pieces(self, parts, additional_parts_count=0): # constants so they get parsed correctly later for part in output: # if this part has a period not at the beginning or end - if self.C.regexes.period_not_at_end.match(part): + if self.C.regexes.period_not_at_end and self.C.regexes.period_not_at_end.match(part): # split on periods, any of the split pieces titles or suffixes? 
# ("Lt.Gov.") period_chunks = part.split(".") diff --git a/tests.py b/tests.py index d2d433e..039918f 100644 --- a/tests.py +++ b/tests.py @@ -20,6 +20,7 @@ """ import logging +import re try: import dill except ImportError: @@ -27,7 +28,7 @@ from nameparser import HumanName from nameparser.util import u -from nameparser.config import Constants +from nameparser.config import Constants, TupleManager log = logging.getLogger('HumanName') @@ -199,6 +200,59 @@ def test_surnames_attribute(self): hn = HumanName("John Edgar Casey Williams III") self.m(hn.surnames, "Edgar Casey Williams", hn) + def test_override_constants(self): + C = Constants() + hn = HumanName(constants=C) + self.assertTrue(hn.C is C) + + def test_override_regex(self): + var = TupleManager([("spaces", re.compile(r"\s+", re.U)),]) + C = Constants(regexes=var) + hn = HumanName(constants=C) + self.assertTrue(hn.C.regexes == var) + + def test_override_titles(self): + var = ["abc","def"] + C = Constants(titles=var) + hn = HumanName(constants=C) + self.assertTrue(sorted(hn.C.titles) == sorted(var)) + + def test_override_first_name_titles(self): + var = ["abc","def"] + C = Constants(first_name_titles=var) + hn = HumanName(constants=C) + self.assertTrue(sorted(hn.C.first_name_titles) == sorted(var)) + + def test_override_prefixes(self): + var = ["abc","def"] + C = Constants(prefixes=var) + hn = HumanName(constants=C) + self.assertTrue(sorted(hn.C.prefixes) == sorted(var)) + + def test_override_suffix_acronyms(self): + var = ["abc","def"] + C = Constants(suffix_acronyms=var) + hn = HumanName(constants=C) + self.assertTrue(sorted(hn.C.suffix_acronyms) == sorted(var)) + + def test_override_suffix_not_acronyms(self): + var = ["abc","def"] + C = Constants(suffix_not_acronyms=var) + hn = HumanName(constants=C) + self.assertTrue(sorted(hn.C.suffix_not_acronyms) == sorted(var)) + + def test_override_conjunctions(self): + var = ["abc","def"] + C = Constants(conjunctions=var) + hn = HumanName(constants=C) + 
self.assertTrue(sorted(hn.C.conjunctions) == sorted(var)) + + def test_override_capitalization_exceptions(self): + var = TupleManager([("spaces", re.compile(r"\s+", re.U)),]) + C = Constants(capitalization_exceptions=var) + hn = HumanName(constants=C) + self.assertTrue(hn.C.capitalization_exceptions == var) + class FirstNameHandlingTests(HumanNameTestBase): def test_first_name(self): From 322d5e6523fedfa8fdaf815d785dd52dd8ef81b0 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 3 Jan 2022 14:14:30 -0800 Subject: [PATCH 127/163] Update release_log.rst --- docs/release_log.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/release_log.rst b/docs/release_log.rst index 91d4068..57b7bf5 100644 --- a/docs/release_log.rst +++ b/docs/release_log.rst @@ -2,6 +2,7 @@ Release Log =========== * 1.1.0 - January 3, 2022 - Add initials support (#128) + - Add more titles and prefixes (#120, #127, #128, #119) * 1.0.6 - February 8, 2020 - Fix Python 3.8 syntax error (#104) * 1.0.5 - Dec 12, 2019 From 2a3edaef456e96cacf96f9d8041dd951773e86b1 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 3 Jan 2022 15:01:12 -0800 Subject: [PATCH 128/163] fix #126, don't count as prefix when first name update initials processing to only exclude conjunctions and prefixes when it is not a first name. --- nameparser/parser.py | 18 ++++++++++++++---- tests.py | 15 +++++++++++++++ 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/nameparser/parser.py b/nameparser/parser.py index b4eb677..bbb5e39 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -191,10 +191,17 @@ def as_dict(self, include_empty=True): def __process_initial__(self, name_part): """ - Name parts may include prefixes or conjuctions. This function filters these from the name. 
- """ - parsed = " ".join([split for split in name_part.split(" ") if len(split) and not (self.is_prefix(split) or self.is_conjunction(split))]) - return parsed[0] if len(parsed) else "" + Name parts may include prefixes or conjuctions. This function filters these from the name unless it is + a first name, since first names cannot be conjunctions or prefixes. + """ + parts = name_part.split(" ") + parsed = "" + if len(parts) and not (name_part == 'first' and (self.is_prefix(parts) or self.is_conjunction(parts))): + parsed = " ".join(parts) + if len(parsed) > 0: + return parsed[0] + else: + return self.C.empty_attribute_default def initials_list(self): """ @@ -855,6 +862,9 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): # join everything after the prefix until the next prefix or suffix try: + if i == 0 and total_length >= 1: + # If it's the first piece and there are more than 1 rootnames, assume it's a first name + continue next_prefix = next(iter(filter(self.is_prefix, pieces[i + 1:]))) j = pieces.index(next_prefix) if j == i + 1: diff --git a/tests.py b/tests.py index 039918f..7b04e54 100644 --- a/tests.py +++ b/tests.py @@ -1605,6 +1605,17 @@ def test_prefix_before_two_part_last_name(self): self.m(hn.first, "pennie", hn) self.m(hn.last, "von bergen wessels", hn) + def test_prefix_is_first_name(self): + hn = HumanName("Van Johnson") + self.m(hn.first, "Van", hn) + self.m(hn.last, "Johnson", hn) + + def test_prefix_is_first_name_with_middle_name(self): + hn = HumanName("Van Jeremy Johnson") + self.m(hn.first, "Van", hn) + self.m(hn.middle, "Jeremy", hn) + self.m(hn.last, "Johnson", hn) + def test_prefix_before_two_part_last_name_with_suffix(self): hn = HumanName("pennie von bergen wessels III") self.m(hn.first, "pennie", hn) @@ -2313,6 +2324,10 @@ def test_initials_list_complex_name(self): hn = HumanName("Doe, John A. 
Kenneth, Jr.") self.m(hn.initials_list(), ["J", "A", "K", "D"], hn) + def test_initials_with_prefix_firstname(self): + hn = HumanName("Van Jeremy Johnson") + self.m(hn.initials_list(), ["V", "J", "J"], hn) + TEST_NAMES = ( "John Doe", From 7cae3e0962bb6b9f1bb6b75d5a234200456c5eb7 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 3 Jan 2022 15:03:10 -0800 Subject: [PATCH 129/163] fix #117, add baroness title --- nameparser/config/titles.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nameparser/config/titles.py b/nameparser/config/titles.py index aa709b0..28f14ba 100644 --- a/nameparser/config/titles.py +++ b/nameparser/config/titles.py @@ -117,6 +117,7 @@ 'banner', 'bard', 'baron', + 'baroness', 'barrister', 'baseball', 'bearer', From 0c037cb3d9b05b09d8deb40d069a68a03853d301 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 3 Jan 2022 15:09:11 -0800 Subject: [PATCH 130/163] fix #114 add mx title --- nameparser/config/titles.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nameparser/config/titles.py b/nameparser/config/titles.py index 28f14ba..e082bb8 100644 --- a/nameparser/config/titles.py +++ b/nameparser/config/titles.py @@ -430,6 +430,7 @@ 'murshid', 'musician', 'musicologist', + 'mx', 'mystery', 'nanny', 'narrator', From b3aee01edee5c4449e722e04a9ba4a06b05cb2ff Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 3 Jan 2022 15:10:29 -0800 Subject: [PATCH 131/163] fix #116, add cppm suffix --- nameparser/config/suffixes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nameparser/config/suffixes.py b/nameparser/config/suffixes.py index 9765b92..804f2b5 100644 --- a/nameparser/config/suffixes.py +++ b/nameparser/config/suffixes.py @@ -239,6 +239,7 @@ 'cpm', 'cpo', 'cpp', + 'cppm', 'cprc', 'cpre', 'cprp', From 5c5b3b6a2bb37ed15f4ae2fd07e0a767f8a74d54 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 3 Jan 2022 15:13:53 -0800 Subject: [PATCH 132/163] fix #102, add de' to prefixes --- nameparser/config/prefixes.py | 1 + 
1 file changed, 1 insertion(+) diff --git a/nameparser/config/prefixes.py b/nameparser/config/prefixes.py index d4356ce..1573baf 100644 --- a/nameparser/config/prefixes.py +++ b/nameparser/config/prefixes.py @@ -17,6 +17,7 @@ 'da', 'dal', 'de', + 'de\'', 'degli', 'dei', 'del', From 1d41f78ddcb8f02983138129e54a06df3dccdbbb Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 3 Jan 2022 15:30:16 -0800 Subject: [PATCH 133/163] fix #125, remove duke from titles --- nameparser/config/titles.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nameparser/config/titles.py b/nameparser/config/titles.py index e082bb8..04746bc 100644 --- a/nameparser/config/titles.py +++ b/nameparser/config/titles.py @@ -252,7 +252,7 @@ 'druid', 'drummer', 'duchesse', - 'duke', + # 'duke', # a common first name 'dutchess', 'ecologist', 'economist', From 108298fbc2f2e87b05bd1b25d0ef276e11b9c854 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 3 Jan 2022 15:32:42 -0800 Subject: [PATCH 134/163] duke is no longer a title --- tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests.py b/tests.py index 7b04e54..4d5cb33 100644 --- a/tests.py +++ b/tests.py @@ -1847,8 +1847,8 @@ def test_last_name_is_also_title_no_comma(self): self.m(hn.suffix, "Jr.", hn) def test_last_name_is_also_title_with_comma(self): - hn = HumanName("Duke Martin Luther King, Jr.") - self.m(hn.title, "Duke", hn) + hn = HumanName("Dr Martin Luther King, Jr.") + self.m(hn.title, "Dr", hn) self.m(hn.first, "Martin", hn) self.m(hn.middle, "Luther", hn) self.m(hn.last, "King", hn) From c240265194f2713ece710a80d60e00eeca3a2e48 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 3 Jan 2022 20:26:26 -0800 Subject: [PATCH 135/163] fix #123, "al" prefix doesn't seem to conflict with the first name "Al" --- nameparser/config/prefixes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nameparser/config/prefixes.py b/nameparser/config/prefixes.py index 1573baf..0334f83 
100644 --- a/nameparser/config/prefixes.py +++ b/nameparser/config/prefixes.py @@ -12,6 +12,7 @@ #: correct parsing of the last name "von bergen wessels". PREFIXES = set([ 'abu', + 'al', 'bin', 'bon', 'da', From 9101de032294a5d1f23dad3fe08bc6f6460afe22 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 3 Jan 2022 20:27:51 -0800 Subject: [PATCH 136/163] fix initials for prefixes on first names Also make is_suffix, is_prefix and is_conjunction support lists --- nameparser/parser.py | 43 ++++++++++++++++++++++++++++++------------- tests.py | 16 ++++++++++++++++ 2 files changed, 46 insertions(+), 13 deletions(-) diff --git a/nameparser/parser.py b/nameparser/parser.py index bbb5e39..35f4135 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -189,17 +189,19 @@ def as_dict(self, include_empty=True): d[m] = val return d - def __process_initial__(self, name_part): + def __process_initial__(self, name_part, firstname=False): """ Name parts may include prefixes or conjuctions. This function filters these from the name unless it is a first name, since first names cannot be conjunctions or prefixes. 
""" parts = name_part.split(" ") - parsed = "" - if len(parts) and not (name_part == 'first' and (self.is_prefix(parts) or self.is_conjunction(parts))): - parsed = " ".join(parts) - if len(parsed) > 0: - return parsed[0] + initials = [] + if len(parts) and isinstance(parts, list): + for part in parts: + if not (self.is_prefix(part) or self.is_conjunction(part)) or firstname == True: + initials.append(part[0]) + if len(initials) > 0: + return " ".join(initials) else: return self.C.empty_attribute_default @@ -216,7 +218,7 @@ def initials_list(self): >>> name.initials_list() ["J", "D"] """ - first_initials_list = [self.__process_initial__(name) for name in self.first_list if name] + first_initials_list = [self.__process_initial__(name, True) for name in self.first_list if name] middle_initials_list = [self.__process_initial__(name) for name in self.middle_list if name] last_initials_list = [self.__process_initial__(name) for name in self.last_list if name] return first_initials_list + middle_initials_list + last_initials_list @@ -238,7 +240,7 @@ def initials(self): "B. A." 
""" - first_initials_list = [self.__process_initial__(name) for name in self.first_list if name] + first_initials_list = [self.__process_initial__(name, True) for name in self.first_list if name] middle_initials_list = [self.__process_initial__(name) for name in self.middle_list if name] last_initials_list = [self.__process_initial__(name) for name in self.last_list if name] @@ -378,14 +380,24 @@ def is_title(self, value): def is_conjunction(self, piece): """Is in the conjunctions set and not :py:func:`is_an_initial()`.""" - return piece.lower() in self.C.conjunctions and not self.is_an_initial(piece) + if isinstance(piece, list): + for item in piece: + if self.is_conjunction(item): + return True + else: + return piece.lower() in self.C.conjunctions and not self.is_an_initial(piece) def is_prefix(self, piece): """ Lowercase and no periods version of piece is in the :py:data:`~nameparser.config.prefixes.PREFIXES` set. """ - return lc(piece) in self.C.prefixes + if isinstance(piece, list): + for item in piece: + if self.is_prefix(item): + return True + else: + return lc(piece) in self.C.prefixes def is_roman_numeral(self, value): """ @@ -403,9 +415,14 @@ def is_suffix(self, piece): `C.suffix_acronyms`. """ # suffixes may have periods inside them like "M.D." 
- return ((lc(piece).replace('.', '') in self.C.suffix_acronyms) - or (lc(piece) in self.C.suffix_not_acronyms)) \ - and not self.is_an_initial(piece) + if isinstance(piece, list): + for piece in pieces: + if self.is_suffix(piece): + return True + else: + return ((lc(piece).replace('.', '') in self.C.suffix_acronyms) + or (lc(piece) in self.C.suffix_not_acronyms)) \ + and not self.is_an_initial(piece) def are_suffixes(self, pieces): """Return True if all pieces are suffixes.""" diff --git a/tests.py b/tests.py index 4d5cb33..91917a4 100644 --- a/tests.py +++ b/tests.py @@ -200,6 +200,18 @@ def test_surnames_attribute(self): hn = HumanName("John Edgar Casey Williams III") self.m(hn.surnames, "Edgar Casey Williams", hn) + def test_is_prefix_with_list(self): + hn = HumanName() + items = ['firstname', 'lastname', 'del'] + self.assertTrue(hn.is_prefix(items)) + self.assertTrue(hn.is_prefix(items[1:])) + + def test_is_conjunction_with_list(self): + hn = HumanName() + items = ['firstname', 'lastname', 'and'] + self.assertTrue(hn.is_conjunction(items)) + self.assertTrue(hn.is_conjunction(items[1:])) + def test_override_constants(self): C = Constants() hn = HumanName(constants=C) @@ -2328,6 +2340,10 @@ def test_initials_with_prefix_firstname(self): hn = HumanName("Van Jeremy Johnson") self.m(hn.initials_list(), ["V", "J", "J"], hn) + def test_initials_with_prefix(self): + hn = HumanName("Alex van Johnson") + self.m(hn.initials_list(), ["A", "J"], hn) + TEST_NAMES = ( "John Doe", From c6f94a1be9a9c8a9122b453a2ac5bc1ccb6ef5b5 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 3 Jan 2022 20:50:17 -0800 Subject: [PATCH 137/163] update badge --- README.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.rst b/README.rst index b347593..ee0c152 100644 --- a/README.rst +++ b/README.rst @@ -136,8 +136,7 @@ https://github.com/derek73/python-nameparser .. 
_click here to propose changes to the titles: https://github.com/derek73/python-nameparser/edit/master/nameparser/config/titles.py -.. |Build Status| image:: https://travis-ci.org/derek73/python-nameparser.svg?branch=master - :target: https://travis-ci.org/derek73/python-nameparser +.. [![Tests](https://github.com/derek73/python-nameparser/actions/workflows/python-package.yml/badge.svg)](https://github.com/derek73/python-nameparser/actions/workflows/python-package.yml) .. |PyPI| image:: https://img.shields.io/pypi/v/nameparser.svg :target: https://pypi.org/project/nameparser/ .. |Documentation| image:: https://readthedocs.org/projects/nameparser/badge/?version=latest From 3700dec4d0a95533a7e9cbc9f814b542ad5255d3 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 3 Jan 2022 20:52:38 -0800 Subject: [PATCH 138/163] update badge --- README.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index ee0c152..effc51b 100644 --- a/README.rst +++ b/README.rst @@ -135,8 +135,8 @@ https://github.com/derek73/python-nameparser .. _Start a New Issue: https://github.com/derek73/python-nameparser/issues .. _click here to propose changes to the titles: https://github.com/derek73/python-nameparser/edit/master/nameparser/config/titles.py - -.. [![Tests](https://github.com/derek73/python-nameparser/actions/workflows/python-package.yml/badge.svg)](https://github.com/derek73/python-nameparser/actions/workflows/python-package.yml) +.. |Build Status| image:: https://github.com/derek73/python-nameparser/actions/workflows/python-package.yml/badge.svg + :target: https://github.com/derek73/python-nameparser/actions/workflows/python-package.yml .. |PyPI| image:: https://img.shields.io/pypi/v/nameparser.svg :target: https://pypi.org/project/nameparser/ .. 
|Documentation| image:: https://readthedocs.org/projects/nameparser/badge/?version=latest From e9aef6a7ae050f63efadf447e88b5964420392ad Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 3 Jan 2022 20:59:09 -0800 Subject: [PATCH 139/163] Update README.rst --- README.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/README.rst b/README.rst index effc51b..11db547 100644 --- a/README.rst +++ b/README.rst @@ -13,6 +13,7 @@ individual components. * hn.suffix * hn.nickname * hn.surnames *(middle + last)* +* hn.initials *first initial of each name part+ Supported Name Structures ~~~~~~~~~~~~~~~~~~~~~~~~~ From da1bbc8ee1b1d82e0ff23c13c6165c3e75f9f381 Mon Sep 17 00:00:00 2001 From: huangwf0119 <73567665+huangwf0119@users.noreply.github.com> Date: Fri, 28 Jan 2022 15:59:41 +0800 Subject: [PATCH 140/163] Fix one bug in HumanName.is_suffix --- nameparser/parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nameparser/parser.py b/nameparser/parser.py index 35f4135..bff64e2 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -416,8 +416,8 @@ def is_suffix(self, piece): """ # suffixes may have periods inside them like "M.D." 
if isinstance(piece, list): - for piece in pieces: - if self.is_suffix(piece): + for item in piece: + if self.is_suffix(item): return True else: return ((lc(piece).replace('.', '') in self.C.suffix_acronyms) From 993f7aa3c89d65165dc4b3095a2d32a2ffccb182 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Fri, 28 Jan 2022 01:24:04 -0800 Subject: [PATCH 141/163] version 1.1.1 --- docs/release_log.rst | 2 ++ nameparser/__init__.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/release_log.rst b/docs/release_log.rst index 57b7bf5..f89b8a7 100644 --- a/docs/release_log.rst +++ b/docs/release_log.rst @@ -1,5 +1,7 @@ Release Log =========== +* 1.1.1 - January 28, 2022 + - Fix bug in is_suffix handling of lists (#128) * 1.1.0 - January 3, 2022 - Add initials support (#128) - Add more titles and prefixes (#120, #127, #128, #119) diff --git a/nameparser/__init__.py b/nameparser/__init__.py index a9ee753..6439529 100644 --- a/nameparser/__init__.py +++ b/nameparser/__init__.py @@ -1,4 +1,4 @@ -VERSION = (1, 1, 0) +VERSION = (1, 1, 1) __version__ = '.'.join(map(str, VERSION)) __author__ = "Derek Gulbranson" __author_email__ = 'derek73@gmail.com' From c44a281e671fec49cf66d8ccc3bea28099f0653f Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Fri, 28 Jan 2022 01:25:34 -0800 Subject: [PATCH 142/163] typo --- docs/release_log.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/release_log.rst b/docs/release_log.rst index f89b8a7..954d992 100644 --- a/docs/release_log.rst +++ b/docs/release_log.rst @@ -1,7 +1,7 @@ Release Log =========== * 1.1.1 - January 28, 2022 - - Fix bug in is_suffix handling of lists (#128) + - Fix bug in is_suffix handling of lists (#129) * 1.1.0 - January 3, 2022 - Add initials support (#128) - Add more titles and prefixes (#120, #127, #128, #119) From 265d2b3b9611b41060ac550c85911be95ed7ca5b Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Fri, 28 Jan 2022 18:26:30 -0800 Subject: [PATCH 143/163] 
fix rst formatting error --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 11db547..eebde5b 100644 --- a/README.rst +++ b/README.rst @@ -13,7 +13,7 @@ individual components. * hn.suffix * hn.nickname * hn.surnames *(middle + last)* -* hn.initials *first initial of each name part+ +* hn.initials *(first initial of each name part)* Supported Name Structures ~~~~~~~~~~~~~~~~~~~~~~~~~ From 95dc27b36c84e1d76b1ee5f7d74c815c0134fb86 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Fri, 28 Jan 2022 18:45:07 -0800 Subject: [PATCH 144/163] add content type for readme --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index ba0cc5a..2067716 100755 --- a/setup.py +++ b/setup.py @@ -15,6 +15,7 @@ def read(fname): packages = ['nameparser','nameparser.config'], description = 'A simple Python module for parsing human names into their individual components.', long_description = README, + long_description_content_type = "text/x-rst", version = nameparser.__version__, url = nameparser.__url__, author = nameparser.__author__, From 75fba9b3eada82151fccc8ed1f9f957b28d7a90e Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Fri, 28 Jan 2022 18:48:12 -0800 Subject: [PATCH 145/163] check package can build too --- .github/workflows/python-package.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 721acaa..d47e569 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -27,7 +27,10 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip + pip install twine if [ -f dev-requirements.txt ]; then pip install -r dev-requirements.txt; fi - name: Run Tests run: | python tests.py + python setup.py sdist bdist_wheel + twine check dist/* From 3077ad5efdc696649cf5cc6371cbdc6608feebe8 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Fri, 28 Jan 2022 
18:52:59 -0800 Subject: [PATCH 146/163] only build source dist --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index d47e569..ea0bfa7 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -32,5 +32,5 @@ jobs: - name: Run Tests run: | python tests.py - python setup.py sdist bdist_wheel + python setup.py sdist twine check dist/* From f1dff67cbf13697dad6cfd07c27da251b9030f03 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Fri, 28 Jan 2022 19:31:45 -0800 Subject: [PATCH 147/163] silence error about duplicate index --- docs/modules.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/modules.rst b/docs/modules.rst index eaf3240..2056330 100644 --- a/docs/modules.rst +++ b/docs/modules.rst @@ -7,6 +7,7 @@ HumanName.parser .. py:module:: nameparser.parser .. py:class:: HumanName + :noindex: .. autoclass:: HumanName :members: From dc53c0e64276540d87a930420a4eb123cdb5c961 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Fri, 28 Jan 2022 19:35:31 -0800 Subject: [PATCH 148/163] test that the docs build with sphinx --- .github/workflows/python-package.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index ea0bfa7..5ad4ce0 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -28,9 +28,11 @@ jobs: run: | python -m pip install --upgrade pip pip install twine + pip install sphinx if [ -f dev-requirements.txt ]; then pip install -r dev-requirements.txt; fi - name: Run Tests run: | python tests.py python setup.py sdist twine check dist/* + sphinx-build -b html docs dist/docs From 8b73ff9e0aed23285f451cfa7091e47e9835a608 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Fri, 28 Jan 2022 19:40:34 -0800 Subject: [PATCH 149/163] use pip from python binary --- 
.github/workflows/python-package.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 5ad4ce0..cf60638 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -27,8 +27,8 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install twine - pip install sphinx + python -m pip install twine + python -m pip install sphinx if [ -f dev-requirements.txt ]; then pip install -r dev-requirements.txt; fi - name: Run Tests run: | From 8144083eac5a7871101bafcd6ba289d402ca30f8 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Tue, 1 Feb 2022 15:27:33 +0000 Subject: [PATCH 150/163] Correct a spelling mistake --- nameparser/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nameparser/parser.py b/nameparser/parser.py index bff64e2..5e3f32f 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -191,7 +191,7 @@ def as_dict(self, include_empty=True): def __process_initial__(self, name_part, firstname=False): """ - Name parts may include prefixes or conjuctions. This function filters these from the name unless it is + Name parts may include prefixes or conjunctions. This function filters these from the name unless it is a first name, since first names cannot be conjunctions or prefixes. 
""" parts = name_part.split(" ") From 42292eb4b9a026afbb81f47b647a03a079b71f39 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 7 Feb 2022 19:26:13 -0800 Subject: [PATCH 151/163] don't test is_title() if there's a first name --- nameparser/parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nameparser/parser.py b/nameparser/parser.py index bff64e2..edeedc5 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -641,9 +641,9 @@ def parse_full_name(self): except IndexError: nxt = None - if self.is_title(piece) \ + if not self.first \ and (nxt or len(pieces) == 1) \ - and not self.first: + and self.is_title(piece): self.title_list.append(piece) continue if not self.first: From 5b1b88da5e6962d1adb9ab56a09928a7250bf6e4 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 7 Feb 2022 19:27:39 -0800 Subject: [PATCH 152/163] don't test is_title() if there's a first name --- nameparser/parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nameparser/parser.py b/nameparser/parser.py index d34af4e..162b044 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -681,9 +681,9 @@ def parse_full_name(self): except IndexError: nxt = None - if self.is_title(piece) \ + if not self.first \ and (nxt or len(post_comma_pieces) == 1) \ - and not self.first: + and self.is_title(piece): self.title_list.append(piece) continue if not self.first: From ab8e5b51c9da92dabdab47fbf12b9487818464d1 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Mon, 7 Feb 2022 19:31:50 -0800 Subject: [PATCH 153/163] don't test is_title() if there's a first name --- nameparser/parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nameparser/parser.py b/nameparser/parser.py index 162b044..2ee146c 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -591,9 +591,9 @@ def parse_full_name(self): nxt = None # title must have a next piece, unless it's just a title - if self.is_title(piece) \ + 
if not self.first \ and (nxt or p_len == 1) \ - and not self.first: + and self.is_title(piece): self.title_list.append(piece) continue if not self.first: From caec4ab12cb1d55205d3eb82d2b35ad7174ac901 Mon Sep 17 00:00:00 2001 From: Pavel T Date: Thu, 29 Sep 2022 11:47:39 -0400 Subject: [PATCH 154/163] fix HumanName repr for names with single quotes eg `HumanName("O'NEILL")` --- nameparser/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nameparser/parser.py b/nameparser/parser.py index 2ee146c..5d15fe7 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -150,7 +150,7 @@ def __repr__(self): if self.unparsable: _string = "<%(class)s : [ Unparsable ] >" % {'class': self.__class__.__name__, } else: - _string = "<%(class)s : [\n\ttitle: '%(title)s' \n\tfirst: '%(first)s' \n\tmiddle: '%(middle)s' \n\tlast: '%(last)s' \n\tsuffix: '%(suffix)s'\n\tnickname: '%(nickname)s'\n]>" % { + _string = "<%(class)s : [\n\ttitle: %(title)r \n\tfirst: %(first)r \n\tmiddle: %(middle)r \n\tlast: %(last)r \n\tsuffix: %(suffix)r\n\tnickname: %(nickname)r\n]>" % { 'class': self.__class__.__name__, 'title': self.title or '', 'first': self.first or '', From 593d2c44d36d09e24f32ee8a3ce4492320b05092 Mon Sep 17 00:00:00 2001 From: moomoohk Date: Wed, 5 Oct 2022 22:45:34 +0300 Subject: [PATCH 155/163] Make HumanName objects hashable --- nameparser/parser.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/nameparser/parser.py b/nameparser/parser.py index 2ee146c..bb19fb6 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -141,6 +141,9 @@ def __unicode__(self): return self.collapse_whitespace(_s).strip(', ') return " ".join(self) + def __hash__(self): + return hash(str(self)) + def __str__(self): if sys.version_info[0] >= 3: return self.__unicode__() From 3ae1f2a7100ed4f7c29b06b682665ff3a9494688 Mon Sep 17 00:00:00 2001 From: moomoohk Date: Wed, 5 Oct 2022 22:51:40 +0300 Subject: [PATCH 156/163] Update python-package.yml --- 
.github/workflows/python-package.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index cf60638..98c83f8 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -4,6 +4,7 @@ name: Test the Python package on: + workflow_dispatch: push: branches: [ master ] pull_request: From 0033f390be4d7779aaef5acfbf519dd2d8fe1d71 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Sun, 13 Nov 2022 18:45:39 -0800 Subject: [PATCH 157/163] add support for name parts in the constructor, fix #140 --- docs/release_log.rst | 2 ++ nameparser/__init__.py | 2 +- nameparser/parser.py | 28 ++++++++++++++++++++++++---- tests.py | 39 ++++++++++++++++++++++++++++++++++++++- 4 files changed, 65 insertions(+), 6 deletions(-) diff --git a/docs/release_log.rst b/docs/release_log.rst index 954d992..6430d05 100644 --- a/docs/release_log.rst +++ b/docs/release_log.rst @@ -1,5 +1,7 @@ Release Log =========== +* 1.1.2 - November 13, 2022 + - Add support for attributes in constructor (#140) * 1.1.1 - January 28, 2022 - Fix bug in is_suffix handling of lists (#129) * 1.1.0 - January 3, 2022 diff --git a/nameparser/__init__.py b/nameparser/__init__.py index 6439529..eb595d6 100644 --- a/nameparser/__init__.py +++ b/nameparser/__init__.py @@ -1,4 +1,4 @@ -VERSION = (1, 1, 1) +VERSION = (1, 1, 2) __version__ = '.'.join(map(str, VERSION)) __author__ = "Derek Gulbranson" __author_email__ = 'derek73@gmail.com' diff --git a/nameparser/parser.py b/nameparser/parser.py index a9874b3..c35f55e 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -36,7 +36,10 @@ class HumanName(object): Instantiation assigns to ``full_name``, and assignment to :py:attr:`full_name` triggers :py:func:`parse_full_name`. After parsing the - name, these instance attributes are available. + name, these instance attributes are available. 
Alternatively, you can pass + any of the instance attributes to the constructor method and skip the parsing + process. If any of the the instance attributes are passed to the constructor + as keywords, :py:func:`parse_full_name` will not be performed. **HumanName Instance Attributes** @@ -56,6 +59,12 @@ class HumanName(object): :param str string_format: python string formatting :param str initials_format: python initials string formatting :param str initials_delimter: string delimiter for initials + :param str first: first name + :param str middle: middle name + :param str last: last name + :param str title: The title or prenominal + :param str suffix: The suffix or postnominal + :param str nickname: Nicknames """ C = CONSTANTS @@ -77,7 +86,9 @@ class HumanName(object): _full_name = '' def __init__(self, full_name="", constants=CONSTANTS, encoding=DEFAULT_ENCODING, - string_format=None, initials_format=None, initials_delimiter=None): + string_format=None, initials_format=None, initials_delimiter=None, + first=None, middle=None, last=None, title=None, suffix=None, + nickname=None): self.C = constants if type(self.C) is not type(CONSTANTS): self.C = Constants() @@ -86,8 +97,17 @@ def __init__(self, full_name="", constants=CONSTANTS, encoding=DEFAULT_ENCODING, self.string_format = string_format or self.C.string_format self.initials_format = initials_format or self.C.initials_format self.initials_delimiter = initials_delimiter or self.C.initials_delimiter - # full_name setter triggers the parse - self.full_name = full_name + if (first or middle or last or title or suffix or nickname): + self.first = first + self.middle = middle + self.last = last + self.title = title + self.suffix = suffix + self.nickname = nickname + self.unparsable = False + else: + # full_name setter triggers the parse + self.full_name = full_name def __iter__(self): return self diff --git a/tests.py b/tests.py index 91917a4..5eb1c72 100644 --- a/tests.py +++ b/tests.py @@ -2343,7 +2343,44 @@ def 
test_initials_with_prefix_firstname(self): def test_initials_with_prefix(self): hn = HumanName("Alex van Johnson") self.m(hn.initials_list(), ["A", "J"], hn) - + + def test_constructor_first(self): + hn = HumanName(first="TheName") + self.assertFalse(hn.unparsable) + self.m(hn.first, "TheName", hn) + + def test_constructor_middle(self): + hn = HumanName(middle="TheName") + self.assertFalse(hn.unparsable) + self.m(hn.middle, "TheName", hn) + + def test_constructor_last(self): + hn = HumanName(last="TheName") + self.assertFalse(hn.unparsable) + self.m(hn.last, "TheName", hn) + + def test_constructor_title(self): + hn = HumanName(title="TheName") + self.assertFalse(hn.unparsable) + self.m(hn.title, "TheName", hn) + + def test_constructor_suffix(self): + hn = HumanName(suffix="TheName") + self.assertFalse(hn.unparsable) + self.m(hn.suffix, "TheName", hn) + + def test_constructor_nickname(self): + hn = HumanName(nickname="TheName") + self.assertFalse(hn.unparsable) + self.m(hn.nickname, "TheName", hn) + + def test_constructor_multiple(self): + hn = HumanName(first="TheName", last="lastname", title="mytitle", full_name="donotparse") + self.assertFalse(hn.unparsable) + self.m(hn.first, "TheName", hn) + self.m(hn.last, "lastname", hn) + self.m(hn.title, "mytitle", hn) + TEST_NAMES = ( "John Doe", From c2d07184489ae733ff7dcb6c3e395cf682e7bb5b Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Sun, 13 Nov 2022 18:59:30 -0800 Subject: [PATCH 158/163] update release notes and resources --- docs/release_log.rst | 2 ++ docs/resources.rst | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/docs/release_log.rst b/docs/release_log.rst index 6430d05..a6d6aa4 100644 --- a/docs/release_log.rst +++ b/docs/release_log.rst @@ -2,6 +2,8 @@ Release Log =========== * 1.1.2 - November 13, 2022 - Add support for attributes in constructor (#140) + - Make HumanName instances hashable (#138) + - Update repr for names with single quotes (#137) * 1.1.1 - January 28, 2022 - Fix bug in 
is_suffix handling of lists (#129) * 1.1.0 - January 3, 2022 diff --git a/docs/resources.rst b/docs/resources.rst index 6cc28e8..8934aae 100644 --- a/docs/resources.rst +++ b/docs/resources.rst @@ -7,6 +7,8 @@ Naming Practices and Resources * Wikipedia_Anthroponymy_ * Wikipedia_Naming_conventions_ * Wikipedia_List_Of_Titles_ + * Tussenvoegsel_ + * Family_Name_Affixes_ .. _US_Census_Surname_Data_2000: https://www.census.gov/data/developers/data-sets/surnames/2000.html .. _US_Social_Security_Administration_Baby_Names_Index: https://www.ssa.gov/oact/babynames/limits.html @@ -14,3 +16,5 @@ Naming Practices and Resources .. _Wikipedia_Anthroponymy: https://en.wikipedia.org/wiki/Anthroponymy .. _Wikipedia_Naming_conventions: http://en.wikipedia.org/wiki/Wikipedia:Naming_conventions_(people) .. _Wikipedia_List_Of_Titles: https://en.wikipedia.org/wiki/Title +.. _Tussenvoegsel: https://en.wikipedia.org/wiki/Tussenvoegsel +.. _Family_Name_Affixes : https://en.wikipedia.org/wiki/List_of_family_name_affixes From 89851f43ef9c09a1459fa87d42a3a2e47816a31e Mon Sep 17 00:00:00 2001 From: Evgeny Liskovets Date: Thu, 14 Sep 2023 12:36:50 -0400 Subject: [PATCH 159/163] Fix case when we have two same prefixes in the name --- nameparser/parser.py | 12 ++++++------ tests.py | 10 +++++++--- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/nameparser/parser.py b/nameparser/parser.py index c35f55e..50607f6 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -36,10 +36,10 @@ class HumanName(object): Instantiation assigns to ``full_name``, and assignment to :py:attr:`full_name` triggers :py:func:`parse_full_name`. After parsing the - name, these instance attributes are available. Alternatively, you can pass + name, these instance attributes are available. Alternatively, you can pass any of the instance attributes to the constructor method and skip the parsing - process. 
If any of the the instance attributes are passed to the constructor - as keywords, :py:func:`parse_full_name` will not be performed. + process. If any of the the instance attributes are passed to the constructor + as keywords, :py:func:`parse_full_name` will not be performed. **HumanName Instance Attributes** @@ -536,9 +536,9 @@ def parse_nicknames(self): Loops through 3 :py:data:`~nameparser.config.regexes.REGEXES`; `quoted_word`, `double_quotes` and `parenthesis`. """ - + empty_re = re.compile("") - + re_quoted_word = self.C.regexes.quoted_word or empty_re re_double_quotes = self.C.regexes.double_quotes or empty_re re_parenthesis = self.C.regexes.parenthesis or empty_re @@ -906,7 +906,7 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): # If it's the first piece and there are more than 1 rootnames, assume it's a first name continue next_prefix = next(iter(filter(self.is_prefix, pieces[i + 1:]))) - j = pieces.index(next_prefix) + j = pieces.index(next_prefix, i+1) if j == i + 1: # if there are two prefixes in sequence, join to the following piece j += 1 diff --git a/tests.py b/tests.py index 5eb1c72..2760991 100644 --- a/tests.py +++ b/tests.py @@ -2071,6 +2071,10 @@ def test_multiple_prefixes(self): self.m(hn.first, "Mike", hn) self.m(hn.last, "van der Velt", hn) + def test_prefix_as_fist_name(self): + hh = HumanName("Van Ma Van") + self.m(hh.first, "Van Ma", hh) + self.m(hh.last, "Van", hh) class HumanNameCapitalizationTestCase(HumanNameTestBase): def test_capitalization_exception_for_III(self): @@ -2343,12 +2347,12 @@ def test_initials_with_prefix_firstname(self): def test_initials_with_prefix(self): hn = HumanName("Alex van Johnson") self.m(hn.initials_list(), ["A", "J"], hn) - + def test_constructor_first(self): hn = HumanName(first="TheName") self.assertFalse(hn.unparsable) self.m(hn.first, "TheName", hn) - + def test_constructor_middle(self): hn = HumanName(middle="TheName") self.assertFalse(hn.unparsable) @@ -2380,7 +2384,7 @@ def 
test_constructor_multiple(self): self.m(hn.first, "TheName", hn) self.m(hn.last, "lastname", hn) self.m(hn.title, "mytitle", hn) - + TEST_NAMES = ( "John Doe", From 609be71e72205b6c0719d8eb55386df2218daee0 Mon Sep 17 00:00:00 2001 From: Evgeny Liskovets Date: Thu, 14 Sep 2023 12:45:54 -0400 Subject: [PATCH 160/163] Rename test with better description --- nameparser/parser.py | 2 +- tests.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nameparser/parser.py b/nameparser/parser.py index 50607f6..a5eb352 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -906,7 +906,7 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): # If it's the first piece and there are more than 1 rootnames, assume it's a first name continue next_prefix = next(iter(filter(self.is_prefix, pieces[i + 1:]))) - j = pieces.index(next_prefix, i+1) + j = pieces.index(next_prefix, i + 1) if j == i + 1: # if there are two prefixes in sequence, join to the following piece j += 1 diff --git a/tests.py b/tests.py index 2760991..be407cc 100644 --- a/tests.py +++ b/tests.py @@ -2071,7 +2071,7 @@ def test_multiple_prefixes(self): self.m(hn.first, "Mike", hn) self.m(hn.last, "van der Velt", hn) - def test_prefix_as_fist_name(self): + def test_2_same_prefixes_in_the_name(self): hh = HumanName("Van Ma Van") self.m(hh.first, "Van Ma", hh) self.m(hh.last, "Van", hh) From ed322da1eb6cfb571118453e0398a9db3b03484f Mon Sep 17 00:00:00 2001 From: Evgeny Liskovets Date: Thu, 14 Sep 2023 18:29:11 -0400 Subject: [PATCH 161/163] Change test --- tests.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests.py b/tests.py index be407cc..2cdd526 100644 --- a/tests.py +++ b/tests.py @@ -2072,9 +2072,10 @@ def test_multiple_prefixes(self): self.m(hn.last, "van der Velt", hn) def test_2_same_prefixes_in_the_name(self): - hh = HumanName("Van Ma Van") - self.m(hh.first, "Van Ma", hh) - self.m(hh.last, "Van", hh) + hh = HumanName("Vincent van Gogh van 
Beethoven") + self.m(hh.first, "Vincent", hh) + self.m(hh.middle, "van Gogh", hh) + self.m(hh.last, "van Beethoven", hh) class HumanNameCapitalizationTestCase(HumanNameTestBase): def test_capitalization_exception_for_III(self): From 42a3b7b8ceba9d5d84329970dbbaa3dcb4ce28f2 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Thu, 14 Sep 2023 21:28:33 -0700 Subject: [PATCH 162/163] Update to current python versions --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 98c83f8..0cc23c2 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -17,7 +17,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.5", "3.6", "3.7", "3.8", "3.9", "3.10"] + python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] steps: - uses: actions/checkout@v2 From 759a1316f2fda4395714f36d777fd014dcdd51b0 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Wed, 20 Sep 2023 17:05:34 -0700 Subject: [PATCH 163/163] v1.1.3 update version and release notes --- docs/release_log.rst | 2 ++ nameparser/__init__.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/release_log.rst b/docs/release_log.rst index a6d6aa4..a0ab7ee 100644 --- a/docs/release_log.rst +++ b/docs/release_log.rst @@ -1,5 +1,7 @@ Release Log =========== +* 1.1.3 - September 20, 2023 + - Fix case when we have two same prefixes in the name (#147) * 1.1.2 - November 13, 2022 - Add support for attributes in constructor (#140) - Make HumanName instances hashable (#138) diff --git a/nameparser/__init__.py b/nameparser/__init__.py index eb595d6..ab914e9 100644 --- a/nameparser/__init__.py +++ b/nameparser/__init__.py @@ -1,4 +1,4 @@ -VERSION = (1, 1, 2) +VERSION = (1, 1, 3) __version__ = '.'.join(map(str, VERSION)) __author__ = "Derek Gulbranson" __author_email__ = 'derek73@gmail.com'