diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..55170d0 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,20 @@ +# http://editorconfig.org + +root = true + +[*] +charset = utf-8 +end_of_line = lf +insert_final_newline = true +trim_trailing_whitespace = true + +[*.{py,rst,ini}] +indent_style = space +indent_size = 4 + +[*.{html,json,yml}] +indent_style = space +indent_size = 2 + +[*.md] +trim_trailing_whitespace = false diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml new file mode 100644 index 0000000..0cc23c2 --- /dev/null +++ b/.github/workflows/python-package.yml @@ -0,0 +1,39 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions + +name: Test the Python package + +on: + workflow_dispatch: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] + + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install twine + python -m pip install sphinx + if [ -f dev-requirements.txt ]; then pip install -r dev-requirements.txt; fi + - name: Run Tests + run: | + python tests.py + python setup.py sdist + twine check dist/* + sphinx-build -b html docs dist/docs diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml new file mode 100644 index 0000000..b6b3cb1 --- /dev/null +++ b/.github/workflows/python-publish.yml @@ -0,0 +1,36 @@ +# This workflow will upload a Python Package using Twine when a release is created +# For more 
information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries + +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. + +name: Upload Published Python Package + +on: + release: + types: [published] + +jobs: + deploy: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.x' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install build + - name: Build package + run: python -m build + - name: Publish package + uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 + with: + user: __token__ + password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/.gitignore b/.gitignore index c586728..3874fab 100644 --- a/.gitignore +++ b/.gitignore @@ -5,12 +5,13 @@ __pycache__/ .python2/ MANIFEST nameparser.egg-info/ -dummycert.pem build *.egg .coverage dist .idea +Pipfile +Pipfile.lock # docs docs/_* diff --git a/.travis.yml b/.travis.yml index dc37c42..42fadab 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,16 +1,15 @@ language: python python: - - "2.6" - "2.7" - - "3.2" - - "3.3" - "3.4" - "3.5" - "3.6" + - "3.7" + - "3.8" # command to install dependencies -install: +install: - if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install unittest2; fi - - "pip install dill" + - if [[ $TRAVIS_PYTHON_VERSION -ne '3.4' ]]; then pip install dill; fi - "python setup.py install" # command to run tests script: python tests.py diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2e1b8ae..bb9fc4a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -4,7 +4,7 @@ Contributing Development Environment Setup -------------------------------- -There are some exernal dependencies required in order to run the +There are 
some external dependencies required in order to run the tests, located in the dev-requirements.txt file. pip install -r dev-requirements.txt @@ -67,7 +67,7 @@ don't blow up, so it can be a helpful regression indicator. New Releases ------------ -[https://hynek.me/articles/sharing-your-labor-of-love-pypi-quick-and-dirty/](Publishing to Pypi Guide) +[Publishing to Pypi Guide](https://hynek.me/articles/sharing-your-labor-of-love-pypi-quick-and-dirty/) $ python setup.py sdist bdist_wheel $ twine upload dist/* diff --git a/README.rst b/README.rst index da2265c..eebde5b 100644 --- a/README.rst +++ b/README.rst @@ -1,10 +1,7 @@ Name Parser =========== -.. image:: https://travis-ci.org/derek73/python-nameparser.svg?branch=master - :target: https://travis-ci.org/derek73/python-nameparser -.. image:: https://badge.fury.io/py/nameparser.svg - :target: http://badge.fury.io/py/nameparser +|Build Status| |PyPI| |PyPI version| |Documentation| A simple Python (3.2+ & 2.6+) module for parsing human names into their individual components. @@ -15,6 +12,8 @@ individual components. * hn.last * hn.suffix * hn.nickname +* hn.surnames *(middle + last)* +* hn.initials *(first initial of each name part)* Supported Name Structures ~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -52,9 +51,9 @@ install with pip using the command below. ``pip install -e git+git://github.com/derek73/python-nameparser.git#egg=nameparser`` -If you're looking for a web service, check out -`eyeseast's nameparse service `_, a -simple Heroku-friendly Flask wrapper for this module. +If you need to handle lists of names, check out +`namesparser `_, a +complement to this module that handles multiple names in a string. Quick Start Example @@ -135,4 +134,13 @@ https://github.com/derek73/python-nameparser .. _CONTRIBUTING.md: https://github.com/derek73/python-nameparser/tree/master/CONTRIBUTING.md .. _Start a New Issue: https://github.com/derek73/python-nameparser/issues -..
_click here to propose changes to the titles: https://github.com/derek73/python-nameparser/edit/master/nameparser/config/titles.py \ No newline at end of file +.. _click here to propose changes to the titles: https://github.com/derek73/python-nameparser/edit/master/nameparser/config/titles.py + +.. |Build Status| image:: https://github.com/derek73/python-nameparser/actions/workflows/python-package.yml/badge.svg + :target: https://github.com/derek73/python-nameparser/actions/workflows/python-package.yml +.. |PyPI| image:: https://img.shields.io/pypi/v/nameparser.svg + :target: https://pypi.org/project/nameparser/ +.. |Documentation| image:: https://readthedocs.org/projects/nameparser/badge/?version=latest + :target: http://nameparser.readthedocs.io/en/latest/?badge=latest +.. |PyPI version| image:: https://img.shields.io/pypi/pyversions/nameparser.svg + :target: https://pypi.org/project/nameparser/ diff --git a/dev-requirements.txt b/dev-requirements.txt index 8aab0b6..edd07b3 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,6 +1,2 @@ -ipdb -nose>=1.3.7 -coverage>=4.0.3 dill>=0.2.5 -twine Sphinx diff --git a/docs/customize.rst b/docs/customize.rst index 0809c89..1e4f38d 100644 --- a/docs/customize.rst +++ b/docs/customize.rst @@ -39,14 +39,14 @@ instantiate the :py:class:`~nameparser.parser.HumanName` class (see below). Editable attributes of nameparser.config.CONSTANTS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -* :py:obj:`~nameparser.config.Constants.titles` - Pieces that come before the name. Cannot include things that may be first names -* :py:obj:`~nameparser.config.Constants.first_name_titles` - Titles that, when followed by a single name, that name is a first name, e.g. "King David" -* :py:obj:`~nameparser.config.Constants.suffix_acronyms` - Pieces that come at the end of the name that may or may not have periods separating the letters, e.g. "m.d." 
-* :py:obj:`~nameparser.config.Constants.suffix_not_acronyms` - Pieces that come at the end of the name that never have periods separating the letters, e.g. "Jr." -* :py:obj:`~nameparser.config.Constants.conjunctions` - Connectors like "and" that join the preceeding piece to the following piece. -* :py:obj:`~nameparser.config.Constants.prefixes` - Connectors like "del" and "bin" that join to the following piece but not the preceeding -* :py:obj:`~nameparser.config.Constants.capitalization_exceptions` - Dictionary of pieces that do not capitalize the first letter, e.g. "Ph.D" -* :py:obj:`~nameparser.config.Constants.regexes` - Regular expressions used to find words, initials, nicknames, etc. +* :py:data:`~nameparser.config.titles.TITLES` - Pieces that come before the name. Includes all `first_name_titles`. Cannot include things that may be first names. +* :py:data:`~nameparser.config.FIRST_NAME_TITLES` - Titles that, when followed by a single name, that name is a first name, e.g. "King David". +* :py:data:`~nameparser.config.SUFFIX_ACRONYMS` - Pieces that come at the end of the name that may or may not have periods separating the letters, e.g. "m.d.". +* :py:data:`~nameparser.config.SUFFIX_NOT_ACRONYMS` - Pieces that come at the end of the name that never have periods separating the letters, e.g. "Jr.". +* :py:data:`~nameparser.config.conjunctions.CONJUNCTIONS` - Connectors like "and" that join the preceding piece to the following piece. +* :py:data:`~nameparser.config.prefixes.PREFIXES` - Connectors like "del" and "bin" that join to the following piece but not the preceding, similar to titles but can appear anywhere in the name. +* :py:data:`~nameparser.config.CAPITALIZATION_EXCEPTIONS` - Dictionary of pieces that do not capitalize the first letter, e.g. "Ph.D". +* :py:data:`~nameparser.config.regexes.REGEXES` - Regular expressions used to find words, initials, nicknames, etc. 
Each set of constants comes with :py:func:`~nameparser.config.SetManager.add` and :py:func:`~nameparser.config.SetManager.remove` methods for tuning the constants for your project. These methods automatically lower case and @@ -57,6 +57,8 @@ Other editable attributes * :py:obj:`~nameparser.config.Constants.string_format` - controls output from `str()` * :py:obj:`~nameparser.config.Constants.empty_attribute_default` - value returned by empty attributes, defaults to empty string +* :py:obj:`~nameparser.config.Constants.capitalize_name` - If set, applies :py:meth:`~nameparser.parser.HumanName.capitalize` to :py:class:`~nameparser.parser.HumanName` instance. +* :py:obj:`~nameparser.config.Constants.force_mixed_case_capitalization` - If set, forces the capitalization of mixed case strings when :py:meth:`~nameparser.parser.HumanName.capitalize` is called. diff --git a/docs/modules.rst b/docs/modules.rst index eaf3240..2056330 100644 --- a/docs/modules.rst +++ b/docs/modules.rst @@ -7,6 +7,7 @@ HumanName.parser .. py:module:: nameparser.parser .. py:class:: HumanName + :noindex: .. 
autoclass:: HumanName :members: diff --git a/docs/release_log.rst b/docs/release_log.rst index f626358..a0ab7ee 100644 --- a/docs/release_log.rst +++ b/docs/release_log.rst @@ -1,5 +1,50 @@ Release Log =========== +* 1.1.3 - September 20, 2023 + - Fix case when we have two same prefixes in the name (#147) +* 1.1.2 - November 13, 2022 + - Add support for attributes in constructor (#140) + - Make HumanName instances hashable (#138) + - Update repr for names with single quotes (#137) +* 1.1.1 - January 28, 2022 + - Fix bug in is_suffix handling of lists (#129) +* 1.1.0 - January 3, 2022 + - Add initials support (#128) + - Add more titles and prefixes (#120, #127, #128, #119) +* 1.0.6 - February 8, 2020 + - Fix Python 3.8 syntax error (#104) +* 1.0.5 - Dec 12, 2019 + - Fix suffix parsing bug in comma parts (#98) + - Fix deprecation warning on Python 3.7 (#94) + - Improved capitalization support of mixed case names (#90) + - Remove "elder" from titles (#96) + - Add post-nominal list from Wikipedia to suffixes (#93) +* 1.0.4 - June 26, 2019 + - Better nickname handling of multiple single quotes (#86) + - full_name attribute now returns formatted string output instead of original string (#87) +* 1.0.3 - April 18, 2019 + - fix sys.stdin usage when stdin doesn't exist (#82) + - support for escaping log entry arguments (#84) +* 1.0.2 - Oct 26, 2018 + - Fix handling of only nickname and last name (#78) +* 1.0.1 - August 30, 2018 + - Fix overzealous regex for "Ph. D." (#43) + - Add `surnames` attribute as aggregate of middle and last names +* 1.0.0 - August 30, 2018 + - Fix support for nicknames in single quotes (#74) + - Change prefix handling to support prefixes on first names (#60) + - Fix prefix capitalization when not part of lastname (#70) + - Handle erroneous space in "Ph. D."
(#43) +* 0.5.8 - August 19, 2018 + - Add "Junior" to suffixes (#76) + - Add "dra" and "srta" to titles (#77) +* 0.5.7 - June 16, 2018 + - Fix doc link (#73) + - Fix handling of "do" and "dos" Portuguese prefixes (#71, #72) +* 0.5.6 - January 15, 2018 + - Fix python version check (#64) +* 0.5.5 - January 10, 2018 + - Support J.D. as suffix and Wm. as title * 0.5.4 - December 10, 2017 - Add Dr to suffixes (#62) - Add the full set of Italian derivatives from "di" (#59) @@ -78,7 +123,7 @@ Release Log - Generate documentation using sphinx and host on readthedocs. * 0.2.10 - May 6, 2014 - If name is only a title and one part, assume it's a last name instead of a first name, with exceptions for some titles like 'Sir'. (`#7 `_). - - Add some judicial and other common titles. (#9) + - Add some judicial and other common titles. (#9) * 0.2.9 - Apr 1, 2014 - Add a new nickname attribute containing anything in parenthesis or double quotes (`Issue 33 `_). * 0.2.8 - Oct 25, 2013 @@ -91,7 +136,7 @@ Release Log * 0.2.5 - Feb 11, 2013 - Set logging handler to NullHandler - Remove 'ben' from PREFIXES because it's more common as a name than a prefix. - - Deprecate BlankHumanNameError. Do not raise exceptions if full_name is empty string. + - Deprecate BlankHumanNameError. Do not raise exceptions if full_name is empty string. * 0.2.4 - Feb 10, 2013 - Adjust logging, don't set basicConfig. Fix `Issue 10 `_ and `Issue 26 `_. - Fix handling of single lower case initials that are also conjunctions, e.g. "john e smith". Re `Issue 11 `_. @@ -102,12 +147,12 @@ Release Log - tests/test.py can now take an optional name argument that will return repr() for that name. * 0.2.3 - Fix overzealous "Mac" regex * 0.2.2 - Fix parsing error -* 0.2.0 +* 0.2.0 - Significant refactor of parsing logic. Handle conjunctions and prefixes before parsing into attribute buckets. - Support attribute overriding by assignment. - - Support multiple titles. - - Lowercase titles constants to fix bug with comparison. 
+ - Support multiple titles. + - Lowercase titles constants to fix bug with comparison. - Move documentation to README.rst, add release log. * 0.1.4 - Use set() in constants for improved speed. setuptools compatibility - sketerpot * 0.1.3 - Add capitalization feature - twotwo diff --git a/docs/resources.rst b/docs/resources.rst index 0c70695..8934aae 100644 --- a/docs/resources.rst +++ b/docs/resources.rst @@ -2,13 +2,19 @@ Naming Practices and Resources ============================== * US_Census_Surname_Data_2000_ + * US_Social_Security_Administration_Baby_Names_Index_ * Naming_practice_guide_UK_2006_ * Wikipedia_Anthroponymy_ * Wikipedia_Naming_conventions_ * Wikipedia_List_Of_Titles_ + * Tussenvoegsel_ + * Family_Name_Affixes_ -.. _US_Census_Surname_Data_2000: http://www.census.gov/genealogy/www/data/2000surnames/index.html +.. _US_Census_Surname_Data_2000: https://www.census.gov/data/developers/data-sets/surnames/2000.html +.. _US_Social_Security_Administration_Baby_Names_Index: https://www.ssa.gov/oact/babynames/limits.html .. _Naming_practice_guide_UK_2006: https://www.fbiic.gov/public/2008/nov/Naming_practice_guide_UK_2006.pdf .. _Wikipedia_Anthroponymy: https://en.wikipedia.org/wiki/Anthroponymy .. _Wikipedia_Naming_conventions: http://en.wikipedia.org/wiki/Wikipedia:Naming_conventions_(people) .. _Wikipedia_List_Of_Titles: https://en.wikipedia.org/wiki/Title +.. _Tussenvoegsel: https://en.wikipedia.org/wiki/Tussenvoegsel +.. _Family_Name_Affixes : https://en.wikipedia.org/wiki/List_of_family_name_affixes diff --git a/docs/usage.rst b/docs/usage.rst index aa21951..7fbe274 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -23,6 +23,8 @@ The examples use Python 3, but Python 2.6+ is supported. 'de la Vega' >>> name.suffix 'III' + >>> name.surnames + 'Q. Xavier de la Vega' >>> name.full_name = "Juan Q. Xavier Velasquez y Garcia, Jr." 
>>> name >> str(name) 'Shirley MacLaine' +To apply capitalization to all `HumanName` instances, set +:py:attr:`~nameparser.config.Constants.capitalize_name` to `True`. + +.. doctest:: capitalize_name + :options: +NORMALIZE_WHITESPACE + + >>> from nameparser.config import CONSTANTS + >>> CONSTANTS.capitalize_name = True + >>> name = HumanName("bob v. de la macdole-eisenhower phd") + >>> str(name) + 'Bob V. de la MacDole-Eisenhower Ph.D.' + +To force the capitalization of mixed case strings on all `HumanName` instances, +set :py:attr:`~nameparser.config.Constants.force_mixed_case_capitalization` to `True`. + +.. doctest:: force_mixed_case_capitalization + :options: +NORMALIZE_WHITESPACE + + >>> from nameparser.config import CONSTANTS + >>> CONSTANTS.force_mixed_case_capitalization = True + >>> name = HumanName('Shirley Maclaine') + >>> name.capitalize() + >>> str(name) + 'Shirley MacLaine' + Nickname Handling ------------------ -The content of parenthesis or double quotes in the name will be +The content of parenthesis or quotes in the name will be available from the nickname attribute. .. doctest:: nicknames @@ -136,7 +162,7 @@ You can change the default formatting for all `HumanName` instances by setting a >>> str(name) 'Robert (Rob) Johnson' -You can control the order and presense of any name fields by changing the +You can control the order and presence of any name fields by changing the :py:attr:`~nameparser.config.Constants.string_format` attribute of the shared CONSTANTS instance. Don't want to include nicknames in your output? No problem. Just omit that keyword from the `string_format` attribute. @@ -150,3 +176,41 @@ Don't want to include nicknames in your output? No problem. Just omit that keywo 'Dr. Juan de la Vega' +Initials Support +---------------- + +The HumanName class can try to get the correct representation of initials. +Initials can be tricky as different format usages exist. 
+To exclude any of the name parts from the initials, change the initials format string: +:py:attr:`~nameparser.config.Constants.initials_format` +Three attributes exist for the format, `first`, `middle` and `last`. + +.. doctest:: initials format + + >>> from nameparser.config import CONSTANTS + >>> CONSTANTS.initials_format = "{first} {middle}" + >>> HumanName("Doe, John A. Kenneth, Jr.").initials() + 'J. A. K.' + >>> HumanName("Doe, John A. Kenneth, Jr.", initials_format="{last}, {first}").initials() + 'D., J.' + + +Furthermore, the delimiter for the string output can be set through: +:py:attr:`~nameparser.config.Constants.initials_delimiter` + +.. doctest:: initials delimiter + + >>> HumanName("Doe, John A. Kenneth, Jr.", initials_delimiter=";").initials() + "J; A; K;" + >>> from nameparser.config import CONSTANTS + >>> CONSTANTS.initials_delimiter = "." + >>> HumanName("Doe, John A. Kenneth, Jr.", initials_format="{first}{middle}{last}").initials() + "J.A.K.D." + +To get a list representation of the initials, use :py:meth:`~nameparser.HumanName.initials_list`. +This function is unaffected by :py:attr:`~nameparser.config.Constants.initials_format` + +.. doctest:: list format + >>> HumanName("Doe, John A. Kenneth, Jr.", initials_delimiter=";").initials_list() + ["J", "A", "K", "D"] + diff --git a/nameparser/__init__.py b/nameparser/__init__.py index 074bf5e..ab914e9 100644 --- a/nameparser/__init__.py +++ b/nameparser/__init__.py @@ -1,4 +1,4 @@ -VERSION = (0, 5, 4) +VERSION = (1, 1, 3) __version__ = '.'.join(map(str, VERSION)) __author__ = "Derek Gulbranson" __author_email__ = 'derek73@gmail.com' diff --git a/nameparser/config/__init__.py b/nameparser/config/__init__.py index 3b11e88..7b2baef 100644 --- a/nameparser/config/__init__.py +++ b/nameparser/config/__init__.py @@ -29,8 +29,12 @@ unexpected results. See `Customizing the Parser `_.
""" from __future__ import unicode_literals -import collections import sys +try: + # Python 3.3+ + from collections.abc import Set +except ImportError: + from collections import Set from nameparser.util import binary_type from nameparser.util import lc @@ -45,35 +49,37 @@ DEFAULT_ENCODING = 'UTF-8' -class SetManager(collections.Set): + +class SetManager(Set): ''' Easily add and remove config variables per module or instance. Subclass of - ``collections.Set``. - + ``collections.abc.Set``. + Only special functionality beyond that provided by set() is to normalize constants for comparison (lower case, no periods) when they are add()ed and remove()d and allow passing multiple string arguments to the :py:func:`add()` and :py:func:`remove()` methods. - + ''' + def __init__(self, elements): self.elements = set(elements) - + def __call__(self): return self.elements - + def __repr__(self): - return "SetManager({})".format(self.elements) # used for docs - + return "SetManager({})".format(self.elements) # used for docs + def __iter__(self): return iter(self.elements) - + def __contains__(self, value): return value in self.elements - + def __len__(self): return len(self.elements) - + def next(self): return self.__next__() @@ -85,14 +91,17 @@ def __next__(self): c = self.count self.count = c + 1 return getattr(self, self.elements[c]) or next(self) - + def add_with_encoding(self, s, encoding=None): """ Add the lower case and no-period version of the string to the set. Pass an explicit `encoding` parameter to specify the encoding of binary strings that are not DEFAULT_ENCODING (UTF-8). 
""" - encoding = encoding or sys.stdin.encoding or DEFAULT_ENCODING + stdin_encoding = None + if sys.stdin: + stdin_encoding = sys.stdin.encoding + encoding = encoding or stdin_encoding or DEFAULT_ENCODING if type(s) == binary_type: s = s.decode(encoding) self.elements.add(lc(s)) @@ -104,7 +113,7 @@ def add(self, *strings): """ [self.add_with_encoding(s) for s in strings] return self - + def remove(self, *strings): """ Remove the lower case and no-period version of the string arguments from the set. @@ -119,10 +128,11 @@ class TupleManager(dict): A dictionary with dot.notation access. Subclass of ``dict``. Makes the tuple constants more friendly. ''' + def __getattr__(self, attr): return self.get(attr) - __setattr__= dict.__setitem__ - __delattr__= dict.__delitem__ + __setattr__ = dict.__setitem__ + __delattr__ = dict.__delitem__ def __getstate__(self): return dict(self) @@ -133,6 +143,7 @@ def __setstate__(self, state): def __reduce__(self): return (TupleManager, (), self.__getstate__()) + class Constants(object): """ An instance of this class hold all of the configuration constants for the parser. @@ -156,11 +167,23 @@ class Constants(object): :param regexes: :py:attr:`regexes` wrapped with :py:class:`TupleManager`. """ - + string_format = "{title} {first} {middle} {last} {suffix} ({nickname})" """ The default string format use for all new `HumanName` instances. """ + + initials_format = "{first} {middle} {last}" + """ + The default initials format used for all new `HumanName` instances. + """ + + initials_delimiter = "." + """ + The default initials delimiter used for all new `HumanName` instances. + Will be used to add a delimiter between each initial. + """ + empty_attribute_default = '' """ Default return value for empty attributes. 
@@ -176,28 +199,58 @@ class Constants(object): 'John' """ - - - def __init__(self, - prefixes=PREFIXES, - suffix_acronyms=SUFFIX_ACRONYMS, - suffix_not_acronyms=SUFFIX_NOT_ACRONYMS, - titles=TITLES, - first_name_titles=FIRST_NAME_TITLES, - conjunctions=CONJUNCTIONS, - capitalization_exceptions=CAPITALIZATION_EXCEPTIONS, - regexes=REGEXES - ): - self.prefixes = SetManager(prefixes) - self.suffix_acronyms = SetManager(suffix_acronyms) + + capitalize_name = False + """ + If set, applies :py:meth:`~nameparser.parser.HumanName.capitalize` to + :py:class:`~nameparser.parser.HumanName` instance. + + .. doctest:: + + >>> from nameparser.config import CONSTANTS + >>> CONSTANTS.capitalize_name = True + >>> name = HumanName("bob v. de la macdole-eisenhower phd") + >>> str(name) + 'Bob V. de la MacDole-Eisenhower Ph.D.' + + """ + + force_mixed_case_capitalization = False + """ + If set, forces the capitalization of mixed case strings when + :py:meth:`~nameparser.parser.HumanName.capitalize` is called. + + .. 
doctest:: + + >>> from nameparser.config import CONSTANTS + >>> CONSTANTS.force_mixed_case_capitalization = True + >>> name = HumanName('Shirley Maclaine') + >>> name.capitalize() + >>> str(name) + 'Shirley MacLaine' + + """ + + def __init__(self, + prefixes=PREFIXES, + suffix_acronyms=SUFFIX_ACRONYMS, + suffix_not_acronyms=SUFFIX_NOT_ACRONYMS, + titles=TITLES, + first_name_titles=FIRST_NAME_TITLES, + conjunctions=CONJUNCTIONS, + capitalization_exceptions=CAPITALIZATION_EXCEPTIONS, + regexes=REGEXES + ): + self.prefixes = SetManager(prefixes) + self.suffix_acronyms = SetManager(suffix_acronyms) self.suffix_not_acronyms = SetManager(suffix_not_acronyms) - self.titles = SetManager(titles) - self.first_name_titles = SetManager(first_name_titles) - self.conjunctions = SetManager(conjunctions) + self.titles = SetManager(titles) + self.first_name_titles = SetManager(first_name_titles) + self.conjunctions = SetManager(conjunctions) self.capitalization_exceptions = TupleManager(capitalization_exceptions) - self.regexes = TupleManager(regexes) + self.regexes = TupleManager(regexes) self._pst = None - + @property def suffixes_prefixes_titles(self): if not self._pst: @@ -206,15 +259,16 @@ def suffixes_prefixes_titles(self): def __repr__(self): return "" - + def __setstate__(self, state): self.__init__(state) - + def __getstate__(self): attrs = [x for x in dir(self) if not x.startswith('_')] - return dict([(a,getattr(self, a)) for a in attrs]) + return dict([(a, getattr(self, a)) for a in attrs]) + -#: A module-level instance of the :py:class:`Constants()` class. +#: A module-level instance of the :py:class:`Constants()` class. #: Provides a common instance for the module to share #: to easily adjust configuration for the entire module. #: See `Customizing the Parser with Your Own Configuration `_. 
diff --git a/nameparser/config/capitalization.py b/nameparser/config/capitalization.py index 4aa3214..84dfbef 100644 --- a/nameparser/config/capitalization.py +++ b/nameparser/config/capitalization.py @@ -2,11 +2,11 @@ from __future__ import unicode_literals CAPITALIZATION_EXCEPTIONS = ( - ('ii' ,'II'), - ('iii','III'), - ('iv' ,'IV'), - ('md' ,'M.D.'), - ('phd','Ph.D.'), + ('ii', 'II'), + ('iii', 'III'), + ('iv', 'IV'), + ('md', 'M.D.'), + ('phd', 'Ph.D.'), ) """ Any pieces that are not capitalized by capitalizing the first letter. diff --git a/nameparser/config/prefixes.py b/nameparser/config/prefixes.py index 21c82fa..0334f83 100644 --- a/nameparser/config/prefixes.py +++ b/nameparser/config/prefixes.py @@ -1,14 +1,24 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals -#: Name pieces that appear before a last name. They join to the piece that follows them to make one new piece. +#: Name pieces that appear before a last name. Prefixes join to the piece +#: that follows them to make one new piece. They can be chained together, e.g +#: "von der" and "de la". Because they only appear in middle or last names, +#: they also signify that all following name pieces should be in the same name +#: part, for example, "von" will be joined to all following pieces that are not +#: prefixes or suffixes, allowing recognition of double last names when they +#: appear after a prefixes. So in "pennie von bergen wessels MD", "von" will +#: join with all following name pieces until the suffix "MD", resulting in the +#: correct parsing of the last name "von bergen wessels". 
PREFIXES = set([ 'abu', + 'al', 'bin', 'bon', 'da', 'dal', 'de', + 'de\'', 'degli', 'dei', 'del', @@ -19,16 +29,22 @@ 'dello', 'der', 'di', - 'du', 'dí', + 'do', + 'dos', + 'du', 'ibn', 'la', 'le', + 'mac', + 'mc', 'san', 'santa', 'st', 'ste', 'van', + 'vander', 'vel', 'von', + 'vom', ]) diff --git a/nameparser/config/regexes.py b/nameparser/config/regexes.py index 42da85d..bd4b320 100644 --- a/nameparser/config/regexes.py +++ b/nameparser/config/regexes.py @@ -23,11 +23,14 @@ ("word", re.compile(r"(\w|\.)+", re.U)), ("mac", re.compile(r'^(ma?c)(\w{2,})', re.I | re.U)), ("initial", re.compile(r'^(\w\.|[A-Z])?$', re.U)), - ("nickname", re.compile(r'\s*?[\("](.+?)[\)"]', re.U)), + ("quoted_word", re.compile(r'(?`_. + :param constants constants: + a :py:class:`~nameparser.config.Constants` instance. Pass ``None`` for + `per-instance config `_. :param str encoding: string representing the encoding of your input - :param str string_format: python string formatting + :param str string_format: python string formatting + :param str initials_format: python initials string formatting + :param str initials_delimter: string delimiter for initials + :param str first: first name + :param str middle: middle name + :param str last: last name + :param str title: The title or prenominal + :param str suffix: The suffix or postnominal + :param str nickname: Nicknames """ - + C = CONSTANTS """ A reference to the configuration for this instance, which may or may not be - a reference to the shared, module-wide instance at - :py:mod:`~nameparser.config.CONSTANTS`. See `Customizing the Parser + a reference to the shared, module-wide instance at + :py:mod:`~nameparser.config.CONSTANTS`. See `Customizing the Parser `_. """ - + original = '' """ The original string, untouched by the parser. 
""" - + _count = 0 - _members = ['title','first','middle','last','suffix','nickname'] + _members = ['title', 'first', 'middle', 'last', 'suffix', 'nickname'] unparsable = True _full_name = '' - + def __init__(self, full_name="", constants=CONSTANTS, encoding=DEFAULT_ENCODING, - string_format=None): + string_format=None, initials_format=None, initials_delimiter=None, + first=None, middle=None, last=None, title=None, suffix=None, + nickname=None): self.C = constants if type(self.C) is not type(CONSTANTS): self.C = Constants() - + self.encoding = encoding self.string_format = string_format or self.C.string_format - # full_name setter triggers the parse - self.full_name = full_name - + self.initials_format = initials_format or self.C.initials_format + self.initials_delimiter = initials_delimiter or self.C.initials_delimiter + if (first or middle or last or title or suffix or nickname): + self.first = first + self.middle = middle + self.last = last + self.title = title + self.suffix = suffix + self.nickname = nickname + self.unparsable = False + else: + # full_name setter triggers the parse + self.full_name = full_name + def __iter__(self): return self - + def __len__(self): l = 0 for x in self: l += 1 return l - + def __eq__(self, other): """ - HumanName instances are equal to other objects whose + HumanName instances are equal to other objects whose lower case unicode representation is the same. 
""" return (u(self)).lower() == (u(other)).lower() - + def __ne__(self, other): return not (u(self)).lower() == (u(other)).lower() - + def __getitem__(self, key): if isinstance(key, slice): return [getattr(self, x) for x in self._members[key]] @@ -129,20 +157,23 @@ def __unicode__(self): # string_format = "{title} {first} {middle} {last} {suffix} ({nickname})" _s = self.string_format.format(**self.as_dict()) # remove trailing punctuation from missing nicknames - _s = _s.replace(str(self.C.empty_attribute_default),'').replace(" ()","").replace(" ''","").replace(' ""',"") + _s = _s.replace(str(self.C.empty_attribute_default), '').replace(" ()", "").replace(" ''", "").replace(' ""', "") return self.collapse_whitespace(_s).strip(', ') return " ".join(self) - + + def __hash__(self): + return hash(str(self)) + def __str__(self): - if sys.version >= '3': + if sys.version_info[0] >= 3: return self.__unicode__() return self.__unicode__().encode(self.encoding) - + def __repr__(self): if self.unparsable: - _string = "<%(class)s : [ Unparsable ] >" % {'class': self.__class__.__name__,} + _string = "<%(class)s : [ Unparsable ] >" % {'class': self.__class__.__name__, } else: - _string = "<%(class)s : [\n\ttitle: '%(title)s' \n\tfirst: '%(first)s' \n\tmiddle: '%(middle)s' \n\tlast: '%(last)s' \n\tsuffix: '%(suffix)s'\n\tnickname: '%(nickname)s'\n]>" % { + _string = "<%(class)s : [\n\ttitle: %(title)r \n\tfirst: %(first)r \n\tmiddle: %(middle)r \n\tlast: %(last)r \n\tsuffix: %(suffix)r\n\tnickname: %(nickname)r\n]>" % { 'class': self.__class__.__name__, 'title': self.title or '', 'first': self.first or '', @@ -151,25 +182,25 @@ def __repr__(self): 'suffix': self.suffix or '', 'nickname': self.nickname or '', } - if sys.version >= '3': + if sys.version_info[0] >= 3: return _string return _string.encode(self.encoding) - + def as_dict(self, include_empty=True): """ Return the parsed name as a dictionary of its attributes. 
- + :param bool include_empty: Include keys in the dictionary for empty name attributes. :rtype: dict - + .. doctest:: - + >>> name = HumanName("Bob Dole") >>> name.as_dict() {'last': 'Dole', 'suffix': '', 'title': '', 'middle': '', 'nickname': '', 'first': 'Bob'} >>> name.as_dict(False) {'last': 'Dole', 'first': 'Bob'} - + """ d = {} for m in self._members: @@ -180,72 +211,153 @@ def as_dict(self, include_empty=True): if val: d[m] = val return d - + + def __process_initial__(self, name_part, firstname=False): + """ + Name parts may include prefixes or conjunctions. This function filters these from the name unless it is + a first name, since first names cannot be conjunctions or prefixes. + """ + parts = name_part.split(" ") + initials = [] + if len(parts) and isinstance(parts, list): + for part in parts: + if not (self.is_prefix(part) or self.is_conjunction(part)) or firstname == True: + initials.append(part[0]) + if len(initials) > 0: + return " ".join(initials) + else: + return self.C.empty_attribute_default + + def initials_list(self): + """ + Returns the initials as a list + + .. doctest:: + + >>> name = HumanName("Sir Bob Andrew Dole") + >>> name.initials_list() + ["B", "A", "D"] + >>> name = HumanName("J. Doe") + >>> name.initials_list() + ["J", "D"] + """ + first_initials_list = [self.__process_initial__(name, True) for name in self.first_list if name] + middle_initials_list = [self.__process_initial__(name) for name in self.middle_list if name] + last_initials_list = [self.__process_initial__(name) for name in self.last_list if name] + return first_initials_list + middle_initials_list + last_initials_list + + def initials(self): + """ + Return period-delimited initials of the first, middle and optionally last name. + + :param bool include_last_name: Include the last name as part of the initials + :rtype: str + + .. doctest:: + + >>> name = HumanName("Sir Bob Andrew Dole") + >>> name.initials() + "B. A. D." 
+ >>> name = HumanName("Sir Bob Andrew Dole", initials_format="{first} {middle}") + >>> name.initials() + "B. A." + """ + + first_initials_list = [self.__process_initial__(name, True) for name in self.first_list if name] + middle_initials_list = [self.__process_initial__(name) for name in self.middle_list if name] + last_initials_list = [self.__process_initial__(name) for name in self.last_list if name] + + initials_dict = { + "first": (self.initials_delimiter + " ").join(first_initials_list) + self.initials_delimiter + if len(first_initials_list) else self.C.empty_attribute_default, + "middle": (self.initials_delimiter + " ").join(middle_initials_list) + self.initials_delimiter + if len(middle_initials_list) else self.C.empty_attribute_default, + "last": (self.initials_delimiter + " ").join(last_initials_list) + self.initials_delimiter + if len(last_initials_list) else self.C.empty_attribute_default + } + + _s = self.initials_format.format(**initials_dict) + return self.collapse_whitespace(_s) + @property def has_own_config(self): """ - True if this instance is not using the shared module-level + True if this instance is not using the shared module-level configuration. """ return self.C is not CONSTANTS - - ### attributes - + + # attributes + @property def title(self): """ - The person's titles. Any string of consecutive pieces in - :py:mod:`~nameparser.config.titles` or + The person's titles. Any string of consecutive pieces in + :py:mod:`~nameparser.config.titles` or :py:mod:`~nameparser.config.conjunctions` at the beginning of :py:attr:`full_name`. """ return " ".join(self.title_list) or self.C.empty_attribute_default - + @property def first(self): """ - The person's first name. The first name piece after any known + The person's first name. The first name piece after any known :py:attr:`title` pieces parsed from :py:attr:`full_name`. 
""" return " ".join(self.first_list) or self.C.empty_attribute_default - + @property def middle(self): """ - The person's middle names. All name pieces after the first name and + The person's middle names. All name pieces after the first name and before the last name parsed from :py:attr:`full_name`. """ return " ".join(self.middle_list) or self.C.empty_attribute_default - + @property def last(self): """ - The person's last name. The last name piece parsed from + The person's last name. The last name piece parsed from :py:attr:`full_name`. """ return " ".join(self.last_list) or self.C.empty_attribute_default - + @property def suffix(self): """ The persons's suffixes. Pieces at the end of the name that are found in :py:mod:`~nameparser.config.suffixes`, or pieces that are at the end - of comma separated formats, e.g. - "Lastname, Title Firstname Middle[,] Suffix [, Suffix]" parsed + of comma separated formats, e.g. + "Lastname, Title Firstname Middle[,] Suffix [, Suffix]" parsed from :py:attr:`full_name`. """ return ", ".join(self.suffix_list) or self.C.empty_attribute_default - + @property def nickname(self): """ - The person's nicknames. Any text found inside of quotes (``""``) or + The person's nicknames. Any text found inside of quotes (``""``) or parenthesis (``()``) """ return " ".join(self.nickname_list) or self.C.empty_attribute_default - - ### setter methods - + + @property + def surnames_list(self): + """ + List of middle names followed by last name. + """ + return self.middle_list + self.last_list + + @property + def surnames(self): + """ + A string of all middle names followed by the last name. + """ + return " ".join(self.surnames_list) or self.C.empty_attribute_default + + # setter methods + def _set_list(self, attr, value): if isinstance(value, list): val = value @@ -255,102 +367,116 @@ def _set_list(self, attr, value): val = [] else: raise TypeError( - "Can only assign strings, lists or None to name attributes." 
- " Got {0}".format(type(value))) + "Can only assign strings, lists or None to name attributes." + " Got {0}".format(type(value))) setattr(self, attr+"_list", self.parse_pieces(val)) - + @title.setter def title(self, value): self._set_list('title', value) - + @first.setter def first(self, value): self._set_list('first', value) - + @middle.setter def middle(self, value): self._set_list('middle', value) - + @last.setter def last(self, value): self._set_list('last', value) - + @suffix.setter def suffix(self, value): self._set_list('suffix', value) - + @nickname.setter def nickname(self, value): self._set_list('nickname', value) - - ### Parse helpers - + + # Parse helpers + def is_title(self, value): """Is in the :py:data:`~nameparser.config.titles.TITLES` set.""" return lc(value) in self.C.titles - + def is_conjunction(self, piece): - """Is in the conjuctions set and not :py:func:`is_an_initial()`.""" - return piece.lower() in self.C.conjunctions and not self.is_an_initial(piece) - + """Is in the conjunctions set and not :py:func:`is_an_initial()`.""" + if isinstance(piece, list): + for item in piece: + if self.is_conjunction(item): + return True + else: + return piece.lower() in self.C.conjunctions and not self.is_an_initial(piece) + def is_prefix(self, piece): """ - Lowercase and no periods version of piece is in the - `~nameparser.config.titles.PREFIXES` set. + Lowercase and no periods version of piece is in the + :py:data:`~nameparser.config.prefixes.PREFIXES` set. """ - return lc(piece) in self.C.prefixes + if isinstance(piece, list): + for item in piece: + if self.is_prefix(item): + return True + else: + return lc(piece) in self.C.prefixes def is_roman_numeral(self, value): """ - Matches the ``roman_numeral`` regular expression in + Matches the ``roman_numeral`` regular expression in :py:data:`~nameparser.config.regexes.REGEXES`. 
""" return bool(self.C.regexes.roman_numeral.match(value)) - + def is_suffix(self, piece): """ - Is in the suffixes set and not :py:func:`is_an_initial()`. - - Some suffixes may be acronyms (M.B.A) while some are not (Jr.), + Is in the suffixes set and not :py:func:`is_an_initial()`. + + Some suffixes may be acronyms (M.B.A) while some are not (Jr.), so we remove the periods from `piece` when testing against `C.suffix_acronyms`. """ # suffixes may have periods inside them like "M.D." - return ((lc(piece).replace('.','') in self.C.suffix_acronyms) \ - or (lc(piece) in self.C.suffix_not_acronyms)) \ - and not self.is_an_initial(piece) - + if isinstance(piece, list): + for item in piece: + if self.is_suffix(item): + return True + else: + return ((lc(piece).replace('.', '') in self.C.suffix_acronyms) + or (lc(piece) in self.C.suffix_not_acronyms)) \ + and not self.is_an_initial(piece) + def are_suffixes(self, pieces): """Return True if all pieces are suffixes.""" for piece in pieces: if not self.is_suffix(piece): return False return True - + def is_rootname(self, piece): """ Is not a known title, suffix or prefix. Just first, middle, last names. """ return lc(piece) not in self.C.suffixes_prefixes_titles \ - and not self.is_an_initial(piece) - + and not self.is_an_initial(piece) + def is_an_initial(self, value): """ Words with a single period at the end, or a single uppercase letter. - - Matches the ``initial`` regular expression in + + Matches the ``initial`` regular expression in :py:data:`~nameparser.config.regexes.REGEXES`. 
""" return bool(self.C.regexes.initial.match(value)) - - ### full_name parser - + # full_name parser + @property def full_name(self): - """The name string to be parsed.""" - return self._full_name - + """The string output of the HumanName instance.""" + return self.__str__() + @full_name.setter def full_name(self, value): self.original = value @@ -358,41 +484,69 @@ def full_name(self, value): if isinstance(value, binary_type): self._full_name = value.decode(self.encoding) self.parse_full_name() - + def collapse_whitespace(self, string): # collapse multiple spaces into single space - return self.C.regexes.spaces.sub(" ", string.strip()) - + string = self.C.regexes.spaces.sub(" ", string.strip()) + if string.endswith(","): + string = string[:-1] + return string + def pre_process(self): """ - + This method happens at the beginning of the :py:func:`parse_full_name` before any other processing of the string aside from unicode normalization, so it's a good place to do any custom handling in a - subclass. Runs :py:func:`parse_nicknames`. - + subclass. Runs :py:func:`parse_nicknames` and :py:func:`squash_emoji`. + """ + self.fix_phd() self.parse_nicknames() self.squash_emoji() def post_process(self): """ This happens at the end of the :py:func:`parse_full_name` after - all other processing has taken place. Runs :py:func:`handle_firstnames`. + all other processing has taken place. Runs :py:func:`handle_firstnames` + and :py:func:`handle_capitalization`. """ self.handle_firstnames() + self.handle_capitalization() + + def fix_phd(self): + try: + _re = self.C.regexes.phd + match = _re.search(self._full_name) + if match: + self.suffix_list.append(match.group(1)) + self._full_name = _re.sub('', self._full_name) + except AttributeError: + pass def parse_nicknames(self): """ - The content of parenthesis or double quotes in the name will - be treated as nicknames. This happens before any other - processing of the name. 
+ The content of parenthesis or quotes in the name will be added to the + nicknames list. This happens before any other processing of the name. + + Single quotes cannot span white space characters and must border + white space to allow for quotes in names like O'Connor and Kawai'ae'a. + Double quotes and parenthesis can span white space. + + Loops through 3 :py:data:`~nameparser.config.regexes.REGEXES`; + `quoted_word`, `double_quotes` and `parenthesis`. """ - # https://code.google.com/p/python-nameparser/issues/detail?id=33 - re_nickname = self.C.regexes.nickname - if re_nickname.search(self._full_name): - self.nickname_list = re_nickname.findall(self._full_name) - self._full_name = re_nickname.sub('', self._full_name) + + empty_re = re.compile("") + + re_quoted_word = self.C.regexes.quoted_word or empty_re + re_double_quotes = self.C.regexes.double_quotes or empty_re + re_parenthesis = self.C.regexes.parenthesis or empty_re + + for _re in (re_quoted_word, re_double_quotes, re_parenthesis): + if _re.search(self._full_name): + self.nickname_list += [x for x in _re.findall(self._full_name)] + self._full_name = _re.sub('', self._full_name) def squash_emoji(self): """ @@ -407,7 +561,7 @@ def handle_firstnames(self): If there are only two parts and one is a title, assume it's a last name instead of a first name. e.g. Mr. Johnson. Unless it's a special title like "Sir", then when it's followed by a single name that name is always - a first name. + a first name. """ if self.title \ and len(self) == 2 \ @@ -416,18 +570,18 @@ def handle_firstnames(self): def parse_full_name(self): """ - + The main parse method for the parser. This method is run upon assignment to the :py:attr:`full_name` attribute or instantiation. Basic flow is to hand off to :py:func:`pre_process` to handle nicknames. It then splits on commas and chooses a code path depending on the number of commas. 
- + :py:func:`parse_pieces` then splits those parts on spaces and - :py:func:`join_on_conjunctions` joins any pieces next to conjunctions. + :py:func:`join_on_conjunctions` joins any pieces next to conjunctions. """ - + self.title_list = [] self.first_list = [] self.middle_list = [] @@ -435,23 +589,22 @@ def parse_full_name(self): self.suffix_list = [] self.nickname_list = [] self.unparsable = True - - + self.pre_process() - + self._full_name = self.collapse_whitespace(self._full_name) - + # break up full_name by commas parts = [x.strip() for x in self._full_name.split(",")] - - log.debug("full_name: {0}".format(self._full_name)) - log.debug("parts: {0}".format(parts)) - + + log.debug("full_name: %s", self._full_name) + log.debug("parts: %s", parts) + if len(parts) == 1: - + # no commas, title first middle middle middle last suffix # part[0] - + pieces = self.parse_pieces(parts) p_len = len(pieces) for i, piece in enumerate(pieces): @@ -459,56 +612,61 @@ def parse_full_name(self): nxt = pieces[i + 1] except IndexError: nxt = None - + # title must have a next piece, unless it's just a title - if self.is_title(piece) \ + if not self.first \ and (nxt or p_len == 1) \ - and not self.first: + and self.is_title(piece): self.title_list.append(piece) continue if not self.first: + if p_len == 1 and self.nickname: + self.last_list.append(piece) + continue self.first_list.append(piece) continue if self.are_suffixes(pieces[i+1:]) or \ - ( + ( # if the next piece is the last piece and a roman # numeral but this piece is not an initial - self.is_roman_numeral(nxt) and i == p_len - 2 + self.is_roman_numeral(nxt) and i == p_len - 2 and not self.is_an_initial(piece) - ): + ): self.last_list.append(piece) self.suffix_list += pieces[i+1:] break if not nxt: self.last_list.append(piece) continue - + self.middle_list.append(piece) else: # if all the end parts are suffixes and there is more than one piece # in the first part. 
(Suffixes will never appear after last names # only, and allows potential first names to be in suffixes, e.g. # "Johnson, Bart" + + post_comma_pieces = self.parse_pieces(parts[1].split(' '), 1) + if self.are_suffixes(parts[1].split(' ')) \ and len(parts[0].split(' ')) > 1: - - # suffix comma: + + # suffix comma: # title first middle last [suffix], suffix [suffix] [, suffix] # parts[0], parts[1:...] - - + self.suffix_list += parts[1:] pieces = self.parse_pieces(parts[0].split(' ')) - log.debug("pieces: {0}".format(u(pieces))) + log.debug("pieces: %s", u(pieces)) for i, piece in enumerate(pieces): try: nxt = pieces[i + 1] except IndexError: nxt = None - if self.is_title(piece) \ + if not self.first \ and (nxt or len(pieces) == 1) \ - and not self.first: + and self.is_title(piece): self.title_list.append(piece) continue if not self.first: @@ -523,33 +681,32 @@ def parse_full_name(self): continue self.middle_list.append(piece) else: - - # lastname comma: + + # lastname comma: # last [suffix], title first middles[,] suffix [,suffix] # parts[0], parts[1], parts[2:...] 
- pieces = self.parse_pieces(parts[1].split(' '), 1) - - log.debug("pieces: {0}".format(u(pieces))) - + + log.debug("post-comma pieces: %s", u(post_comma_pieces)) + # lastname part may have suffixes in it lastname_pieces = self.parse_pieces(parts[0].split(' '), 1) for piece in lastname_pieces: - # the first one is always a last name, even if it look like + # the first one is always a last name, even if it looks like # a suffix if self.is_suffix(piece) and len(self.last_list) > 0: self.suffix_list.append(piece) else: self.last_list.append(piece) - - for i, piece in enumerate(pieces): + + for i, piece in enumerate(post_comma_pieces): try: - nxt = pieces[i + 1] + nxt = post_comma_pieces[i + 1] except IndexError: nxt = None - - if self.is_title(piece) \ - and (nxt or len(pieces) == 1) \ - and not self.first: + + if not self.first \ + and (nxt or len(post_comma_pieces) == 1) \ + and self.is_title(piece): self.title_list.append(piece) continue if not self.first: @@ -564,50 +721,49 @@ def parse_full_name(self): self.suffix_list += parts[2:] except IndexError: pass - + if len(self) < 0: - log.info("Unparsable: \"{}\" ".format(self.original)) + log.info("Unparsable: \"%s\" ", self.original) else: self.unparsable = False self.post_process() - def parse_pieces(self, parts, additional_parts_count=0): """ Split parts on spaces and remove commas, join on conjunctions and lastname prefixes. If parts have periods in the middle, try splitting on periods and check if the parts are titles or suffixes. If they are add to the constant so they will be found. - + :param list parts: name part strings from the comma split - :param int additional_parts_count: - - if the comma format contains other parts, we need to know - how many there are to decide if things should be considered a + :param int additional_parts_count: + + if the comma format contains other parts, we need to know + how many there are to decide if things should be considered a conjunction. 
:return: pieces split on spaces and joined on conjunctions :rtype: list """ - + output = [] for part in parts: if not isinstance(part, text_types): raise TypeError("Name parts must be strings. " "Got {0}".format(type(part))) output += [x.strip(' ,') for x in part.split(' ')] - + # If part contains periods, check if it's multiple titles or suffixes # together without spaces if so, add the new part with periods to the # constants so they get parsed correctly later for part in output: # if this part has a period not at the beginning or end - if self.C.regexes.period_not_at_end.match(part): + if self.C.regexes.period_not_at_end and self.C.regexes.period_not_at_end.match(part): # split on periods, any of the split pieces titles or suffixes? # ("Lt.Gov.") period_chunks = part.split(".") - titles = list(filter(self.is_title, period_chunks)) + titles = list(filter(self.is_title, period_chunks)) suffixes = list(filter(self.is_suffix, period_chunks)) - + # add the part to the constant so it will be found if len(list(titles)): self.C.titles.add(part) @@ -615,45 +771,45 @@ def parse_pieces(self, parts, additional_parts_count=0): if len(list(suffixes)): self.C.suffix_not_acronyms.add(part) continue - + return self.join_on_conjunctions(output, additional_parts_count) - + def join_on_conjunctions(self, pieces, additional_parts_count=0): """ Join conjunctions to surrounding pieces. Title- and prefix-aware. e.g.: - + ['Mr.', 'and'. 'Mrs.', 'John', 'Doe'] ==> ['Mr. and Mrs.', 'John', 'Doe'] - + ['The', 'Secretary', 'of', 'State', 'Hillary', 'Clinton'] ==> ['The Secretary of State', 'Hillary', 'Clinton'] - + When joining titles, saves newly formed piece to the instance's titles constant so they will be parsed correctly later. E.g. after parsing the example names above, 'The Secretary of State' and 'Mr. and Mrs.' would be present in the titles constant set. 
- + :param list pieces: name pieces strings after split on spaces - :param int additional_parts_count: - :return: new list with piece next to conjunctions merged into one piece - with spaces in it. + :param int additional_parts_count: + :return: new list with piece next to conjunctions merged into one piece + with spaces in it. :rtype: list - + """ length = len(pieces) + additional_parts_count # don't join on conjunctions if there's only 2 parts if length < 3: return pieces - + rootname_pieces = [p for p in pieces if self.is_rootname(p)] total_length = len(rootname_pieces) + additional_parts_count - + # find all the conjunctions, join any conjunctions that are next to each # other, then join those newly joined conjunctions and any single # conjunctions to the piece before and after it - conj_index = [i for i, piece in enumerate(pieces) - if self.is_conjunction(piece)] - + conj_index = [i for i, piece in enumerate(pieces) + if self.is_conjunction(piece)] + contiguous_conj_i = [] for i, val in enumerate(conj_index): try: @@ -661,22 +817,22 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): contiguous_conj_i += [val] except IndexError: pass - + contiguous_conj_i = group_contiguous_integers(conj_index) - - delete_i = [] + + delete_i = [] for i in contiguous_conj_i: if type(i) == tuple: - new_piece = " ".join(pieces[ i[0] : i[1]+1] ) - delete_i += list(range( i[0]+1, i[1]+1 )) + new_piece = " ".join(pieces[i[0]: i[1]+1]) + delete_i += list(range(i[0]+1, i[1]+1)) pieces[i[0]] = new_piece else: - new_piece = " ".join(pieces[ i : i+2 ]) + new_piece = " ".join(pieces[i: i+2]) delete_i += [i+1] pieces[i] = new_piece - #add newly joined conjunctions to constants to be found later + # add newly joined conjunctions to constants to be found later self.C.conjunctions.add(new_piece) - + for i in reversed(delete_i): # delete pieces in reverse order or the index changes on each delete del pieces[i] @@ -687,7 +843,7 @@ def join_on_conjunctions(self, pieces, 
additional_parts_count=0): # refresh conjunction index locations conj_index = [i for i, piece in enumerate(pieces) if self.is_conjunction(piece)] - + for i in conj_index: if len(pieces[i]) == 1 and total_length < 4: # if there are only 3 total parts (minus known titles, suffixes @@ -695,8 +851,8 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): # treating it as an initial rather than a conjunction. # http://code.google.com/p/python-nameparser/issues/detail?id=11 continue - - if i is 0: + + if i == 0: new_piece = " ".join(pieces[i:i+2]) if self.is_title(pieces[i+1]): # when joining to a title, make new_piece a title too @@ -704,11 +860,11 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): pieces[i] = new_piece pieces.pop(i+1) # subtract 1 from the index of all the remaining conjunctions - for j,val in enumerate(conj_index): + for j, val in enumerate(conj_index): if val > i: - conj_index[j]=val-1 - - else: + conj_index[j] = val-1 + + else: new_piece = " ".join(pieces[i-1:i+2]) if self.is_title(pieces[i-1]): # when joining to a title, make new_piece a title too @@ -723,33 +879,60 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): # subtract the number of removed pieces from the index # of all the remaining conjunctions - for j,val in enumerate(conj_index): + for j, val in enumerate(conj_index): if val > i: conj_index[j] = val - rm_count - - + # join prefixes to following lastnames: ['de la Vega'], ['van Buren'] prefixes = list(filter(self.is_prefix, pieces)) if prefixes: - i = pieces.index(prefixes[0]) - # join everything after the prefix until the next suffix - next_suffix = list(filter(self.is_suffix, pieces[i:])) - if next_suffix: - j = pieces.index(next_suffix[0]) - new_piece = ' '.join(pieces[i:j]) - pieces = pieces[:i] + [new_piece] + pieces[j:] - else: - new_piece = ' '.join(pieces[i:]) - pieces = pieces[:i] + [new_piece] - - log.debug("pieces: {0}".format(pieces)) + for prefix in prefixes: + try: + i = 
pieces.index(prefix) + except ValueError: + # If the prefix is no longer in pieces, it's because it has been + # combined with the prefix that appears right before (or before that when + # chained together) in the last loop, so the index of that newly created + # piece is the same as in the last loop, i==i still, and we want to join + # it to the next piece. + pass + + new_piece = '' + + # join everything after the prefix until the next prefix or suffix + + try: + if i == 0 and total_length >= 1: + # If it's the first piece and there are more than 1 rootnames, assume it's a first name + continue + next_prefix = next(iter(filter(self.is_prefix, pieces[i + 1:]))) + j = pieces.index(next_prefix, i + 1) + if j == i + 1: + # if there are two prefixes in sequence, join to the following piece + j += 1 + new_piece = ' '.join(pieces[i:j]) + pieces = pieces[:i] + [new_piece] + pieces[j:] + except StopIteration: + try: + # if there are no more prefixes, look for a suffix to stop at + stop_at = next(iter(filter(self.is_suffix, pieces[i + 1:]))) + j = pieces.index(stop_at) + new_piece = ' '.join(pieces[i:j]) + pieces = pieces[:i] + [new_piece] + pieces[j:] + except StopIteration: + # if there were no suffixes, nothing to stop at so join all + # remaining pieces + new_piece = ' '.join(pieces[i:]) + pieces = pieces[:i] + [new_piece] + + log.debug("pieces: %s", pieces) return pieces - - - ### Capitalization Support - - def cap_word(self, word): - if self.is_prefix(word) or self.is_conjunction(word): + + # Capitalization Support + + def cap_word(self, word, attribute): + if (self.is_prefix(word) and attribute in ('last', 'middle')) \ + or self.is_conjunction(word): return word.lower() exceptions = self.C.capitalization_exceptions if lc(word) in exceptions: @@ -762,25 +945,28 @@ def cap_after_mac(m): else: return word.capitalize() - def cap_piece(self, piece): + def cap_piece(self, piece, attribute): if not piece: return "" - replacement = lambda m: self.cap_word(m.group(0)) + + def 
replacement(m): return self.cap_word(m.group(0), attribute) return self.C.regexes.word.sub(replacement, piece) - def capitalize(self, force=False): + def capitalize(self, force=None): """ The HumanName class can try to guess the correct capitalization of name entered in all upper or lower case. By default, it will not adjust the case of names entered in mixed case. To run capitalization on all names pass the parameter `force=True`. - - :param bool force: force capitalization of strings that include mixed case + + :param bool force: Forces capitalization of mixed case strings. This + parameter overrides rules set within + :py:class:`~nameparser.config.CONSTANTS`. **Usage** - + .. doctest:: capitalize - + >>> name = HumanName('bob v. de la macdole-eisenhower phd') >>> name.capitalize() >>> str(name) @@ -788,18 +974,29 @@ def capitalize(self, force=False): >>> # Don't touch good names >>> name = HumanName('Shirley Maclaine') >>> name.capitalize() - >>> str(name) + >>> str(name) 'Shirley Maclaine' >>> name.capitalize(force=True) - >>> str(name) + >>> str(name) 'Shirley MacLaine' - + """ name = u(self) + force = self.C.force_mixed_case_capitalization \ + if force is None else force + if not force and not (name == name.upper() or name == name.lower()): return - self.title_list = self.cap_piece(self.title ).split(' ') - self.first_list = self.cap_piece(self.first ).split(' ') - self.middle_list = self.cap_piece(self.middle).split(' ') - self.last_list = self.cap_piece(self.last ).split(' ') - self.suffix_list = self.cap_piece(self.suffix).split(', ') + self.title_list = self.cap_piece(self.title, 'title').split(' ') + self.first_list = self.cap_piece(self.first, 'first').split(' ') + self.middle_list = self.cap_piece(self.middle, 'middle').split(' ') + self.last_list = self.cap_piece(self.last, 'last').split(' ') + self.suffix_list = self.cap_piece(self.suffix, 'suffix').split(', ') + + def handle_capitalization(self): + """ + Handles capitalization configurations set 
within + :py:class:`~nameparser.config.CONSTANTS`. + """ + if self.C.capitalize_name: + self.capitalize() diff --git a/nameparser/util.py b/nameparser/util.py index 899bcb0..4ef7458 100644 --- a/nameparser/util.py +++ b/nameparser/util.py @@ -13,7 +13,7 @@ def emit(self, record): import sys -if sys.version < '3': +if sys.version_info[0] < 3: text_type = unicode binary_type = str diff --git a/setup.py b/setup.py index 4986703..2067716 100755 --- a/setup.py +++ b/setup.py @@ -15,6 +15,7 @@ def read(fname): packages = ['nameparser','nameparser.config'], description = 'A simple Python module for parsing human names into their individual components.', long_description = README, + long_description_content_type = "text/x-rst", version = nameparser.__version__, url = nameparser.__url__, author = nameparser.__author__, @@ -26,13 +27,8 @@ def read(fname): 'Operating System :: OS Independent', "License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)", 'Programming Language :: Python', - 'Programming Language :: Python :: 2.6', - 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 2', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.2', - 'Programming Language :: Python :: 3.3', - 'Programming Language :: Python :: 3.4', - 'Programming Language :: Python :: 3.5', 'Development Status :: 5 - Production/Stable', 'Natural Language :: English', "Topic :: Software Development :: Libraries :: Python Modules", diff --git a/tests.py b/tests.py index 3123a6f..2cdd526 100644 --- a/tests.py +++ b/tests.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals +import unittest """ Run this file to run the tests. 
@@ -19,6 +20,7 @@ """ import logging +import re try: import dill except ImportError: @@ -26,11 +28,10 @@ from nameparser import HumanName from nameparser.util import u -from nameparser.config import Constants +from nameparser.config import Constants, TupleManager log = logging.getLogger('HumanName') -import unittest try: unittest.expectedFailure except AttributeError: @@ -40,18 +41,17 @@ class HumanNameTestBase(unittest.TestCase): def m(self, actual, expected, hn): - """assertEquals with a better message and awareness of hn.C.empty_attribute_default""" + """assertEqual with a better message and awareness of hn.C.empty_attribute_default""" expected = expected or hn.C.empty_attribute_default try: self.assertEqual(actual, expected, "'%s' != '%s' for '%s'\n%r" % ( actual, expected, - hn.full_name, + hn.original, hn )) except UnicodeDecodeError: - self.assertEquals(actual, expected) - + self.assertEqual(actual, expected) class HumanNamePythonTests(HumanNameTestBase): @@ -63,8 +63,6 @@ def test_utf8(self): def test_string_output(self): hn = HumanName("de la Véña, Jüan") - print(hn) - print(repr(hn)) def test_escaped_utf8_bytes(self): hn = HumanName(b'B\xc3\xb6ck, Gerald') @@ -77,12 +75,12 @@ def test_len(self): hn = HumanName("John Doe") self.m(len(hn), 2, hn) - @unittest.skipUnless(dill,"requires python-dill module to test pickling") + @unittest.skipUnless(dill, "requires python-dill module to test pickling") def test_config_pickle(self): - C = Constants() - self.assertTrue(dill.pickles(C)) + constants = Constants() + self.assertTrue(dill.pickles(constants)) - @unittest.skipUnless(dill,"requires python-dill module to test pickling") + @unittest.skipUnless(dill, "requires python-dill module to test pickling") def test_name_instance_pickle(self): hn = HumanName("Title First Middle Middle Last, Jr.") self.assertTrue(dill.pickles(hn)) @@ -91,7 +89,7 @@ def test_comparison(self): hn1 = HumanName("Doe-Ray, Dr. John P., CLU, CFP, LUTC") hn2 = HumanName("Dr. John P. 
Doe-Ray, CLU, CFP, LUTC") self.assertTrue(hn1 == hn2) - self.assertTrue(not hn1 is hn2) + self.assertTrue(hn1 is not hn2) self.assertTrue(hn1 == "Dr. John P. Doe-Ray CLU, CFP, LUTC") hn1 = HumanName("Doe, Dr. John P., CLU, CFP, LUTC") hn2 = HumanName("Dr. John P. Doe-Ray, CLU, CFP, LUTC") @@ -112,6 +110,11 @@ def test_assignment_to_full_name(self): self.m(hn.last, "Velasquez y Garcia", hn) self.m(hn.suffix, "III", hn) + def test_get_full_name_attribute_references_internal_lists(self): + hn = HumanName("John Williams") + hn.first_list = ["Larry"] + self.m(hn.full_name, "Larry Williams", hn) + def test_assignment_to_attribute(self): hn = HumanName("John A. Kenneth Doe, Jr.") hn.last = "de la Vega" @@ -127,17 +130,17 @@ def test_assignment_to_attribute(self): with self.assertRaises(TypeError): hn.suffix = [['test']] with self.assertRaises(TypeError): - hn.suffix = {"test":"test"} + hn.suffix = {"test": "test"} def test_assign_list_to_attribute(self): hn = HumanName("John A. Kenneth Doe, Jr.") - hn.title = ["test1","test2"] + hn.title = ["test1", "test2"] self.m(hn.title, "test1 test2", hn) - hn.first = ["test3","test4"] + hn.first = ["test3", "test4"] self.m(hn.first, "test3 test4", hn) - hn.middle = ["test5","test6","test7"] + hn.middle = ["test5", "test6", "test7"] self.m(hn.middle, "test5 test6 test7", hn) - hn.last = ["test8","test9","test10"] + hn.last = ["test8", "test9", "test10"] self.m(hn.last, "test8 test9 test10", hn) hn.suffix = ['test'] self.m(hn.suffix, "test", hn) @@ -146,13 +149,13 @@ def test_comparison_case_insensitive(self): hn1 = HumanName("Doe-Ray, Dr. John P., CLU, CFP, LUTC") hn2 = HumanName("dr. john p. doe-Ray, CLU, CFP, LUTC") self.assertTrue(hn1 == hn2) - self.assertTrue(not hn1 is hn2) + self.assertTrue(hn1 is not hn2) self.assertTrue(hn1 == "Dr. John P. Doe-ray clu, CFP, LUTC") def test_slice(self): hn = HumanName("Doe-Ray, Dr. 
John P., CLU, CFP, LUTC") self.m(list(hn), ['Dr.', 'John', 'P.', 'Doe-Ray', 'CLU, CFP, LUTC'], hn) - self.m(hn[1:], ['John', 'P.', 'Doe-Ray', 'CLU, CFP, LUTC',hn.C.empty_attribute_default], hn) + self.m(hn[1:], ['John', 'P.', 'Doe-Ray', 'CLU, CFP, LUTC', hn.C.empty_attribute_default], hn) self.m(hn[1:-2], ['John', 'P.', 'Doe-Ray'], hn) def test_getitem(self): @@ -167,12 +170,12 @@ def test_setitem(self): hn = HumanName("Dr. John A. Kenneth Doe, Jr.") hn['title'] = 'test' self.m(hn['title'], "test", hn) - hn['last'] = ['test','test2'] + hn['last'] = ['test', 'test2'] self.m(hn['last'], "test test2", hn) with self.assertRaises(TypeError): hn["suffix"] = [['test']] with self.assertRaises(TypeError): - hn["suffix"] = {"test":"test"} + hn["suffix"] = {"test": "test"} def test_conjunction_names(self): hn = HumanName("johnny y") @@ -189,6 +192,79 @@ def test_blank_name(self): self.m(hn.first, "", hn) self.m(hn.last, "", hn) + def test_surnames_list_attribute(self): + hn = HumanName("John Edgar Casey Williams III") + self.m(hn.surnames_list, ["Edgar", "Casey", "Williams"], hn) + + def test_surnames_attribute(self): + hn = HumanName("John Edgar Casey Williams III") + self.m(hn.surnames, "Edgar Casey Williams", hn) + + def test_is_prefix_with_list(self): + hn = HumanName() + items = ['firstname', 'lastname', 'del'] + self.assertTrue(hn.is_prefix(items)) + self.assertTrue(hn.is_prefix(items[1:])) + + def test_is_conjunction_with_list(self): + hn = HumanName() + items = ['firstname', 'lastname', 'and'] + self.assertTrue(hn.is_conjunction(items)) + self.assertTrue(hn.is_conjunction(items[1:])) + + def test_override_constants(self): + C = Constants() + hn = HumanName(constants=C) + self.assertTrue(hn.C is C) + + def test_override_regex(self): + var = TupleManager([("spaces", re.compile(r"\s+", re.U)),]) + C = Constants(regexes=var) + hn = HumanName(constants=C) + self.assertTrue(hn.C.regexes == var) + + def test_override_titles(self): + var = ["abc","def"] + C = 
Constants(titles=var) + hn = HumanName(constants=C) + self.assertTrue(sorted(hn.C.titles) == sorted(var)) + + def test_override_first_name_titles(self): + var = ["abc","def"] + C = Constants(first_name_titles=var) + hn = HumanName(constants=C) + self.assertTrue(sorted(hn.C.first_name_titles) == sorted(var)) + + def test_override_prefixes(self): + var = ["abc","def"] + C = Constants(prefixes=var) + hn = HumanName(constants=C) + self.assertTrue(sorted(hn.C.prefixes) == sorted(var)) + + def test_override_suffix_acronyms(self): + var = ["abc","def"] + C = Constants(suffix_acronyms=var) + hn = HumanName(constants=C) + self.assertTrue(sorted(hn.C.suffix_acronyms) == sorted(var)) + + def test_override_suffix_not_acronyms(self): + var = ["abc","def"] + C = Constants(suffix_not_acronyms=var) + hn = HumanName(constants=C) + self.assertTrue(sorted(hn.C.suffix_not_acronyms) == sorted(var)) + + def test_override_conjunctions(self): + var = ["abc","def"] + C = Constants(conjunctions=var) + hn = HumanName(constants=C) + self.assertTrue(sorted(hn.C.conjunctions) == sorted(var)) + + def test_override_capitalization_exceptions(self): + var = TupleManager([("spaces", re.compile(r"\s+", re.U)),]) + C = Constants(capitalization_exceptions=var) + hn = HumanName(constants=C) + self.assertTrue(hn.C.capitalization_exceptions == var) + class FirstNameHandlingTests(HumanNameTestBase): def test_first_name(self): @@ -199,16 +275,16 @@ def test_assume_title_and_one_other_name_is_last_name(self): hn = HumanName("Rev Andrews") self.m(hn.title, "Rev", hn) self.m(hn.last, "Andrews", hn) - + # TODO: Seems "Andrews, M.D.", Andrews should be treated as a last name - # but other suffixes like "George Jr." should be first names. Might be + # but other suffixes like "George Jr." should be first names. 
Might be # related to https://github.com/derek73/python-nameparser/issues/2 @unittest.expectedFailure def test_assume_suffix_title_and_one_other_name_is_last_name(self): hn = HumanName("Andrews, M.D.") self.m(hn.suffix, "M.D.", hn) self.m(hn.last, "Andrews", hn) - + def test_suffix_in_lastname_part_of_lastname_comma_format(self): hn = HumanName("Smith Jr., John") self.m(hn.last, "Smith", hn) @@ -219,22 +295,22 @@ def test_sir_exception_to_first_name_rule(self): hn = HumanName("Sir Gerald") self.m(hn.title, "Sir", hn) self.m(hn.first, "Gerald", hn) - + def test_king_exception_to_first_name_rule(self): hn = HumanName("King Henry") self.m(hn.title, "King", hn) self.m(hn.first, "Henry", hn) - + def test_queen_exception_to_first_name_rule(self): hn = HumanName("Queen Elizabeth") self.m(hn.title, "Queen", hn) self.m(hn.first, "Elizabeth", hn) - + def test_dame_exception_to_first_name_rule(self): hn = HumanName("Dame Mary") self.m(hn.title, "Dame", hn) self.m(hn.first, "Mary", hn) - + def test_first_name_is_not_prefix_if_only_two_parts(self): """When there are only two parts, don't join prefixes or conjunctions""" hn = HumanName("Van Nguyen") @@ -252,7 +328,7 @@ def test_first_name_is_prefix_if_three_parts(self): hn = HumanName("Mr. 
Van Nguyen") self.m(hn.first, "Van", hn) self.m(hn.last, "Nguyen", hn) - + class HumanNameBruteForceTests(HumanNameTestBase): @@ -1073,7 +1149,7 @@ def test_multiple_conjunctions(self): def test_multiple_conjunctions2(self): hn = HumanName("part1 of and The part2 of the part3 And part4") self.m(hn.first, "part1 of and The part2 of the part3 And part4", hn) - + def test_ends_with_conjunction(self): hn = HumanName("Jon Dough and") self.m(hn.first, "Jon", hn) @@ -1231,12 +1307,12 @@ def test_le_as_last_name_with_middle_initial(self): self.m(hn.first, "Yin", hn) self.m(hn.middle, "a", hn) self.m(hn.last, "Le", hn) - + def test_conjunction_in_an_address_with_a_title(self): hn = HumanName("His Excellency Lord Duncan") self.m(hn.title, "His Excellency Lord", hn) self.m(hn.last, "Duncan", hn) - + @unittest.expectedFailure def test_conjunction_in_an_address_with_a_first_name_title(self): hn = HumanName("Her Majesty Queen Elizabeth") @@ -1253,26 +1329,32 @@ class ConstantsCustomization(HumanNameTestBase): def test_add_title(self): hn = HumanName("Te Awanui-a-Rangi Black", constants=None) + start_len = len(hn.C.titles) + self.assertTrue(start_len > 0) hn.C.titles.add('te') + self.assertEqual(start_len + 1, len(hn.C.titles)) hn.parse_full_name() - self.m(hn.title,"Te", hn) - self.m(hn.first,"Awanui-a-Rangi", hn) - self.m(hn.last,"Black", hn) - + self.m(hn.title, "Te", hn) + self.m(hn.first, "Awanui-a-Rangi", hn) + self.m(hn.last, "Black", hn) + def test_remove_title(self): hn = HumanName("Hon Solo", constants=None) + start_len = len(hn.C.titles) + self.assertTrue(start_len > 0) hn.C.titles.remove('hon') + self.assertEqual(start_len - 1, len(hn.C.titles)) hn.parse_full_name() - self.m(hn.first,"Hon", hn) - self.m(hn.last,"Solo", hn) - + self.m(hn.first, "Hon", hn) + self.m(hn.last, "Solo", hn) + def test_add_multiple_arguments(self): hn = HumanName("Assoc Dean of Chemistry Robert Johns", constants=None) hn.C.titles.add('dean', 'Chemistry') hn.parse_full_name() - 
self.m(hn.title,"Assoc Dean of Chemistry", hn) - self.m(hn.first,"Robert", hn) - self.m(hn.last,"Johns", hn) + self.m(hn.title, "Assoc Dean of Chemistry", hn) + self.m(hn.first, "Robert", hn) + self.m(hn.last, "Johns", hn) def test_instances_can_have_own_constants(self): hn = HumanName("", None) @@ -1282,8 +1364,7 @@ def test_instances_can_have_own_constants(self): self.assertEqual(hn.has_own_config, True) self.assertEqual('hon' in hn2.C.titles, True) self.assertEqual(hn2.has_own_config, False) - - + def test_can_change_global_constants(self): hn = HumanName("") hn2 = HumanName("") @@ -1294,23 +1375,23 @@ def test_can_change_global_constants(self): self.assertEqual(hn2.has_own_config, False) # clean up so we don't mess up other tests hn.C.titles.add('hon') - + def test_remove_multiple_arguments(self): hn = HumanName("Ms Hon Solo", constants=None) hn.C.titles.remove('hon', 'ms') hn.parse_full_name() - self.m(hn.first,"Ms", hn) - self.m(hn.middle,"Hon", hn) - self.m(hn.last,"Solo", hn) + self.m(hn.first, "Ms", hn) + self.m(hn.middle, "Hon", hn) + self.m(hn.last, "Solo", hn) def test_chain_multiple_arguments(self): hn = HumanName("Dean Ms Hon Solo", constants=None) hn.C.titles.remove('hon', 'ms').add('dean') hn.parse_full_name() - self.m(hn.title,"Dean", hn) - self.m(hn.first,"Ms", hn) - self.m(hn.middle,"Hon", hn) - self.m(hn.last,"Solo", hn) + self.m(hn.title, "Dean", hn) + self.m(hn.first, "Ms", hn) + self.m(hn.middle, "Hon", hn) + self.m(hn.last, "Solo", hn) def test_empty_attribute_default(self): from nameparser.config import CONSTANTS @@ -1346,7 +1427,7 @@ def test_add_constant_with_explicit_encoding(self): self.assertIn('béck', c.titles) -class HumanNameNicknameTestCase(HumanNameTestBase): +class NicknameTestCase(HumanNameTestBase): # https://code.google.com/p/python-nameparser/issues/detail?id=33 def test_nickname_in_parenthesis(self): hn = HumanName("Benjamin (Ben) Franklin") @@ -1354,14 +1435,28 @@ def test_nickname_in_parenthesis(self): self.m(hn.middle, 
"", hn) self.m(hn.last, "Franklin", hn) self.m(hn.nickname, "Ben", hn) - + + def test_two_word_nickname_in_parenthesis(self): + hn = HumanName("Benjamin (Big Ben) Franklin") + self.m(hn.first, "Benjamin", hn) + self.m(hn.middle, "", hn) + self.m(hn.last, "Franklin", hn) + self.m(hn.nickname, "Big Ben", hn) + + def test_two_words_in_quotes(self): + hn = HumanName('Benjamin "Big Ben" Franklin') + self.m(hn.first, "Benjamin", hn) + self.m(hn.middle, "", hn) + self.m(hn.last, "Franklin", hn) + self.m(hn.nickname, "Big Ben", hn) + def test_nickname_in_parenthesis_with_comma(self): hn = HumanName("Franklin, Benjamin (Ben)") self.m(hn.first, "Benjamin", hn) self.m(hn.middle, "", hn) self.m(hn.last, "Franklin", hn) self.m(hn.nickname, "Ben", hn) - + def test_nickname_in_parenthesis_with_comma_and_suffix(self): hn = HumanName("Franklin, Benjamin (Ben), Jr.") self.m(hn.first, "Benjamin", hn) @@ -1369,10 +1464,7 @@ def test_nickname_in_parenthesis_with_comma_and_suffix(self): self.m(hn.last, "Franklin", hn) self.m(hn.suffix, "Jr.", hn) self.m(hn.nickname, "Ben", hn) - - # it would be hard to support this without breaking some of the - # other examples with single quotes in the names. 
- @unittest.expectedFailure + def test_nickname_in_single_quotes(self): hn = HumanName("Benjamin 'Ben' Franklin") self.m(hn.first, "Benjamin", hn) @@ -1386,65 +1478,168 @@ def test_nickname_in_double_quotes(self): self.m(hn.middle, "", hn) self.m(hn.last, "Franklin", hn) self.m(hn.nickname, "Ben", hn) - + def test_single_quotes_on_first_name_not_treated_as_nickname(self): - hn = HumanName("Brian O'connor") + hn = HumanName("Brian Andrew O'connor") self.m(hn.first, "Brian", hn) - self.m(hn.middle, "", hn) + self.m(hn.middle, "Andrew", hn) self.m(hn.last, "O'connor", hn) self.m(hn.nickname, "", hn) - + def test_single_quotes_on_both_name_not_treated_as_nickname(self): hn = HumanName("La'tanya O'connor") self.m(hn.first, "La'tanya", hn) self.m(hn.middle, "", hn) self.m(hn.last, "O'connor", hn) self.m(hn.nickname, "", hn) - + def test_single_quotes_on_end_of_last_name_not_treated_as_nickname(self): hn = HumanName("Mari' Aube'") self.m(hn.first, "Mari'", hn) self.m(hn.middle, "", hn) self.m(hn.last, "Aube'", hn) self.m(hn.nickname, "", hn) - - #http://code.google.com/p/python-nameparser/issues/detail?id=17 - def test_parenthesis_are_removed(self): - hn = HumanName("John Jones (Google Docs)") + + def test_okina_inside_name_not_treated_as_nickname(self): + hn = HumanName("Harrieta Keōpūolani Nāhiʻenaʻena") + self.m(hn.first, "Harrieta", hn) + self.m(hn.middle, "Keōpūolani", hn) + self.m(hn.last, "Nāhiʻenaʻena", hn) + self.m(hn.nickname, "", hn) + + def test_single_quotes_not_treated_as_nickname_Hawaiian_example(self): + hn = HumanName("Harietta Keopuolani Nahi'ena'ena") + self.m(hn.first, "Harietta", hn) + self.m(hn.middle, "Keopuolani", hn) + self.m(hn.last, "Nahi'ena'ena", hn) + self.m(hn.nickname, "", hn) + + def test_single_quotes_not_treated_as_nickname_Kenyan_example(self): + hn = HumanName("Naomi Wambui Ng'ang'a") + self.m(hn.first, "Naomi", hn) + self.m(hn.middle, "Wambui", hn) + self.m(hn.last, "Ng'ang'a", hn) + self.m(hn.nickname, "", hn) + + def 
test_single_quotes_not_treated_as_nickname_Samoan_example(self): + hn = HumanName("Va'apu'u Vitale") + self.m(hn.first, "Va'apu'u", hn) + self.m(hn.middle, "", hn) + self.m(hn.last, "Vitale", hn) + self.m(hn.nickname, "", hn) + + # http://code.google.com/p/python-nameparser/issues/detail?id=17 + def test_parenthesis_are_removed_from_name(self): + hn = HumanName("John Jones (Unknown)") self.m(hn.first, "John", hn) self.m(hn.last, "Jones", hn) # not testing the nicknames because we don't actually care - # about Google Docs. - - def test_parenthesis_are_removed2(self): + # about Google Docs here + + def test_duplicate_parenthesis_are_removed_from_name(self): hn = HumanName("John Jones (Google Docs), Jr. (Unknown)") self.m(hn.first, "John", hn) self.m(hn.last, "Jones", hn) self.m(hn.suffix, "Jr.", hn) + def test_nickname_and_last_name(self): + hn = HumanName('"Rick" Edmonds') + self.m(hn.first, "", hn) + self.m(hn.last, "Edmonds", hn) + self.m(hn.nickname, "Rick", hn) + + @unittest.expectedFailure + def test_nickname_and_last_name_with_title(self): + hn = HumanName('Senator "Rick" Edmonds') + self.m(hn.title, "Senator", hn) + self.m(hn.first, "", hn) + self.m(hn.last, "Edmonds", hn) + self.m(hn.nickname, "Rick", hn) + + +# class MaidenNameTestCase(HumanNameTestBase): +# +# def test_parenthesis_and_quotes_together(self): +# hn = HumanName("Jennifer 'Jen' Jones (Duff)") +# self.m(hn.first, "Jennifer", hn) +# self.m(hn.last, "Jones", hn) +# self.m(hn.nickname, "Jen", hn) +# self.m(hn.maiden, "Duff", hn) +# +# def test_maiden_name_with_nee(self): +# # https://en.wiktionary.org/wiki/née +# hn = HumanName("Mary Toogood nee Johnson") +# self.m(hn.first, "Mary", hn) +# self.m(hn.last, "Toogood", hn) +# self.m(hn.maiden, "Johnson", hn) +# +# def test_maiden_name_with_accented_nee(self): +# # https://en.wiktionary.org/wiki/née +# hn = HumanName("Mary Toogood née Johnson") +# self.m(hn.first, "Mary", hn) +# self.m(hn.last, "Toogood", hn) +# self.m(hn.maiden, "Johnson", hn) +# +# 
def test_maiden_name_with_nee_and_comma(self): +# # https://en.wiktionary.org/wiki/née +# hn = HumanName("Mary Toogood, née Johnson") +# self.m(hn.first, "Mary", hn) +# self.m(hn.last, "Toogood", hn) +# self.m(hn.maiden, "Johnson", hn) +# +# def test_maiden_name_with_nee_with_parenthesis(self): +# hn = HumanName("Mary Toogood (nee Johnson)") +# self.m(hn.first, "Mary", hn) +# self.m(hn.last, "Toogood", hn) +# self.m(hn.maiden, "Johnson", hn) +# +# def test_maiden_name_with_parenthesis(self): +# hn = HumanName("Mary Toogood (Johnson)") +# self.m(hn.first, "Mary", hn) +# self.m(hn.last, "Toogood", hn) +# self.m(hn.maiden, "Johnson", hn) +# + class PrefixesTestCase(HumanNameTestBase): def test_prefix(self): hn = HumanName("Juan del Sur") self.m(hn.first, "Juan", hn) self.m(hn.last, "del Sur", hn) - + def test_prefix_with_period(self): hn = HumanName("Jill St. John") self.m(hn.first, "Jill", hn) self.m(hn.last, "St. John", hn) - + def test_prefix_before_two_part_last_name(self): hn = HumanName("pennie von bergen wessels") self.m(hn.first, "pennie", hn) self.m(hn.last, "von bergen wessels", hn) + def test_prefix_is_first_name(self): + hn = HumanName("Van Johnson") + self.m(hn.first, "Van", hn) + self.m(hn.last, "Johnson", hn) + + def test_prefix_is_first_name_with_middle_name(self): + hn = HumanName("Van Jeremy Johnson") + self.m(hn.first, "Van", hn) + self.m(hn.middle, "Jeremy", hn) + self.m(hn.last, "Johnson", hn) + def test_prefix_before_two_part_last_name_with_suffix(self): hn = HumanName("pennie von bergen wessels III") self.m(hn.first, "pennie", hn) self.m(hn.last, "von bergen wessels", hn) self.m(hn.suffix, "III", hn) + def test_prefix_before_two_part_last_name_with_acronym_suffix(self): + hn = HumanName("pennie von bergen wessels M.D.") + self.m(hn.first, "pennie", hn) + self.m(hn.last, "von bergen wessels", hn) + self.m(hn.suffix, "M.D.", hn) + def test_two_part_last_name_with_suffix_comma(self): hn = HumanName("pennie von bergen wessels, III") self.m(hn.first, 
"pennie", hn) @@ -1457,9 +1652,71 @@ def test_two_part_last_name_with_suffix(self): self.m(hn.last, "von bergen wessels", hn) self.m(hn.suffix, "III", hn) + def test_last_name_two_part_last_name_with_two_suffixes(self): + hn = HumanName("von bergen wessels MD, pennie III") + self.m(hn.first, "pennie", hn) + self.m(hn.last, "von bergen wessels", hn) + self.m(hn.suffix, "MD, III", hn) + + def test_comma_two_part_last_name_with_acronym_suffix(self): + hn = HumanName("von bergen wessels, pennie MD") + self.m(hn.first, "pennie", hn) + self.m(hn.last, "von bergen wessels", hn) + self.m(hn.suffix, "MD", hn) + + def test_comma_two_part_last_name_with_suffix_in_first_part(self): + # I'm kinda surprised this works, not really sure if this is a + # realistic place for a suffix to be. + hn = HumanName("von bergen wessels MD, pennie") + self.m(hn.first, "pennie", hn) + self.m(hn.last, "von bergen wessels", hn) + self.m(hn.suffix, "MD", hn) + + def test_title_two_part_last_name_with_suffix_in_first_part(self): + hn = HumanName("pennie von bergen wessels MD, III") + self.m(hn.first, "pennie", hn) + self.m(hn.last, "von bergen wessels", hn) + self.m(hn.suffix, "MD, III", hn) + + def test_portuguese_dos(self): + hn = HumanName("Rafael Sousa dos Anjos") + self.m(hn.first, "Rafael", hn) + self.m(hn.middle, "Sousa", hn) + self.m(hn.last, "dos Anjos", hn) + + def test_portuguese_prefixes(self): + hn = HumanName("Joao da Silva do Amaral de Souza") + self.m(hn.first, "Joao", hn) + self.m(hn.middle, "da Silva do Amaral", hn) + self.m(hn.last, "de Souza", hn) + + def test_three_conjunctions(self): + hn = HumanName("Dr. Juan Q. Xavier de la dos Vega III") + self.m(hn.first, "Juan", hn) + self.m(hn.last, "de la dos Vega", hn) + self.m(hn.title, "Dr.", hn) + self.m(hn.middle, "Q. Xavier", hn) + self.m(hn.suffix, "III", hn) + + def test_lastname_three_conjunctions(self): + hn = HumanName("de la dos Vega, Dr. Juan Q. 
Xavier III") + self.m(hn.first, "Juan", hn) + self.m(hn.last, "de la dos Vega", hn) + self.m(hn.title, "Dr.", hn) + self.m(hn.middle, "Q. Xavier", hn) + self.m(hn.suffix, "III", hn) + + def test_comma_three_conjunctions(self): + hn = HumanName("Dr. Juan Q. Xavier de la dos Vega, III") + self.m(hn.first, "Juan", hn) + self.m(hn.last, "de la dos Vega", hn) + self.m(hn.title, "Dr.", hn) + self.m(hn.middle, "Q. Xavier", hn) + self.m(hn.suffix, "III", hn) + class SuffixesTestCase(HumanNameTestBase): - + def test_suffix(self): hn = HumanName("Joe Franklin Jr") self.m(hn.first, "Joe", hn) @@ -1476,7 +1733,7 @@ def test_two_suffixes(self): hn = HumanName("Kenneth Clarke QC MP") self.m(hn.first, "Kenneth", hn) self.m(hn.last, "Clarke", hn) - # NOTE: this adds a comma when the orginal format did not have one. + # NOTE: this adds a comma when the original format did not have one. # not ideal but at least its in the right bucket self.m(hn.suffix, "QC, MP", hn) @@ -1484,7 +1741,7 @@ def test_two_suffixes_lastname_comma_format(self): hn = HumanName("Washington Jr. MD, Franklin") self.m(hn.first, "Franklin", hn) self.m(hn.last, "Washington", hn) - # NOTE: this adds a comma when the orginal format did not have one. + # NOTE: this adds a comma when the original format did not have one. self.m(hn.suffix, "Jr., MD", hn) def test_two_suffixes_suffix_comma_format(self): @@ -1523,36 +1780,29 @@ def test_suffix_with_double_comma_format(self): self.m(hn.last, "Doe", hn) self.m(hn.suffix, "jr., MD", hn) - @unittest.expectedFailure def test_phd_with_erroneous_space(self): hn = HumanName("John Smith, Ph. D.") self.m(hn.first, "John", hn) self.m(hn.last, "Smith", hn) self.m(hn.suffix, "Ph. 
D.", hn) - #http://en.wikipedia.org/wiki/Ma_(surname) + def test_phd_conflict(self): + hn = HumanName("Adolph D") + self.m(hn.first, "Adolph", hn) + self.m(hn.last, "D", hn) + + # http://en.wikipedia.org/wiki/Ma_(surname) + def test_potential_suffix_that_is_also_last_name(self): hn = HumanName("Jack Ma") self.m(hn.first, "Jack", hn) self.m(hn.last, "Ma", hn) - + def test_potential_suffix_that_is_also_last_name_comma(self): hn = HumanName("Ma, Jack") self.m(hn.first, "Jack", hn) self.m(hn.last, "Ma", hn) - - def test_potential_suffix_that_is_also_first_name_comma(self): - hn = HumanName("Johnson, Bart") - self.m(hn.first, "Bart", hn) - self.m(hn.last, "Johnson", hn) - - # TODO: handle conjunctions in last names followed by first names clashing with suffixes - @unittest.expectedFailure - def test_potential_suffix_that_is_also_first_name_comma_with_conjunction(self): - hn = HumanName("De la Vina, Bart") - self.m(hn.first, "Bart", hn) - self.m(hn.last, "De la Vina", hn) - + def test_potential_suffix_that_is_also_last_name_with_suffix(self): hn = HumanName("Jack Ma Jr") self.m(hn.first, "Jack", hn) @@ -1573,23 +1823,23 @@ def test_king(self): self.m(hn.last, "King", hn) self.m(hn.suffix, "Jr", hn) - def test_suffix_with_periods(self): + def test_multiple_letter_suffix_with_periods(self): hn = HumanName("John Doe Msc.Ed.") - self.m(hn.first,"John", hn) - self.m(hn.last,"Doe", hn) - self.m(hn.suffix,"Msc.Ed.", hn) + self.m(hn.first, "John", hn) + self.m(hn.last, "Doe", hn) + self.m(hn.suffix, "Msc.Ed.", hn) def test_suffix_with_periods_with_comma(self): hn = HumanName("John Doe, Msc.Ed.") - self.m(hn.first,"John", hn) - self.m(hn.last,"Doe", hn) - self.m(hn.suffix,"Msc.Ed.", hn) + self.m(hn.first, "John", hn) + self.m(hn.last, "Doe", hn) + self.m(hn.suffix, "Msc.Ed.", hn) def test_suffix_with_periods_with_lastname_comma(self): hn = HumanName("Doe, John Msc.Ed.") - self.m(hn.first,"John", hn) - self.m(hn.last,"Doe", hn) - self.m(hn.suffix,"Msc.Ed.", hn) + self.m(hn.first, 
"John", hn) + self.m(hn.last, "Doe", hn) + self.m(hn.suffix, "Msc.Ed.", hn) class TitleTestCase(HumanNameTestBase): @@ -1609,8 +1859,8 @@ def test_last_name_is_also_title_no_comma(self): self.m(hn.suffix, "Jr.", hn) def test_last_name_is_also_title_with_comma(self): - hn = HumanName("Duke Martin Luther King, Jr.") - self.m(hn.title, "Duke", hn) + hn = HumanName("Dr Martin Luther King, Jr.") + self.m(hn.title, "Dr", hn) self.m(hn.first, "Martin", hn) self.m(hn.middle, "Luther", hn) self.m(hn.last, "King", hn) @@ -1639,33 +1889,33 @@ def test_title_is_title(self): # TODO: fix handling of U.S. @unittest.expectedFailure - def test_chained_title_first_name_initial(self): + def test_chained_title_first_name_title_is_initials(self): hn = HumanName("U.S. District Judge Marc Thomas Treadwell") self.m(hn.title, "U.S. District Judge", hn) self.m(hn.first, "Marc", hn) self.m(hn.middle, "Thomas", hn) self.m(hn.last, "Treadwell", hn) - + def test_conflict_with_chained_title_first_name_initial(self): hn = HumanName("U. S. 
Grant") self.m(hn.first, "U.", hn) self.m(hn.middle, "S.", hn) self.m(hn.last, "Grant", hn) - - def test_chained_title_first_name_initial(self): + + def test_chained_title_first_name_initial_with_no_period(self): hn = HumanName("US Magistrate Judge T Michael Putnam") self.m(hn.title, "US Magistrate Judge", hn) self.m(hn.first, "T", hn) self.m(hn.middle, "Michael", hn) self.m(hn.last, "Putnam", hn) - + def test_chained_hyphenated_title(self): hn = HumanName("US Magistrate-Judge Elizabeth E Campbell") self.m(hn.title, "US Magistrate-Judge", hn) self.m(hn.first, "Elizabeth", hn) self.m(hn.middle, "E", hn) self.m(hn.last, "Campbell", hn) - + def test_chained_hyphenated_title_with_comma_suffix(self): hn = HumanName("Mag-Judge Harwell G Davis, III") self.m(hn.title, "Mag-Judge", hn) @@ -1708,7 +1958,7 @@ def test_title_with_last_initial_is_suffix(self): self.m(hn.title, "King", hn) self.m(hn.first, "John", hn) self.m(hn.last, "V.", hn) - + def test_initials_also_suffix(self): hn = HumanName("Smith, J.R.") self.m(hn.first, "J.R.", hn) @@ -1768,8 +2018,8 @@ def test_possible_conflict_with_suffix_that_could_be_initial(self): @unittest.expectedFailure def test_ben_as_conjunction(self): hn = HumanName("Ahmad ben Husain") - self.m(hn.first,"Ahmad", hn) - self.m(hn.last,"ben Husain", hn) + self.m(hn.first, "Ahmad", hn) + self.m(hn.last, "ben Husain", hn) def test_ben_as_first_name(self): hn = HumanName("Ben Johnson") @@ -1796,15 +2046,36 @@ def test_last_name_also_prefix(self): def test_title_with_periods(self): hn = HumanName("Lt.Gov. John Doe") - self.m(hn.title,"Lt.Gov.", hn) - self.m(hn.first,"John", hn) - self.m(hn.last,"Doe", hn) + self.m(hn.title, "Lt.Gov.", hn) + self.m(hn.first, "John", hn) + self.m(hn.last, "Doe", hn) def test_title_with_periods_lastname_comma(self): hn = HumanName("Doe, Lt.Gov. 
John") - self.m(hn.title,"Lt.Gov.", hn) - self.m(hn.first,"John", hn) - self.m(hn.last,"Doe", hn) + self.m(hn.title, "Lt.Gov.", hn) + self.m(hn.first, "John", hn) + self.m(hn.last, "Doe", hn) + + def test_mac_with_spaces(self): + hn = HumanName("Jane Mac Beth") + self.m(hn.first, "Jane", hn) + self.m(hn.last, "Mac Beth", hn) + + def test_mac_as_first_name(self): + hn = HumanName("Mac Miller") + self.m(hn.first, "Mac", hn) + self.m(hn.last, "Miller", hn) + + def test_multiple_prefixes(self): + hn = HumanName("Mike van der Velt") + self.m(hn.first, "Mike", hn) + self.m(hn.last, "van der Velt", hn) + + def test_2_same_prefixes_in_the_name(self): + hh = HumanName("Vincent van Gogh van Beethoven") + self.m(hh.first, "Vincent", hh) + self.m(hh.middle, "van Gogh", hh) + self.m(hh.last, "van Beethoven", hh) class HumanNameCapitalizationTestCase(HumanNameTestBase): def test_capitalization_exception_for_III(self): @@ -1874,11 +2145,22 @@ def test_short_names_with_mac(self): hn.capitalize() self.m(str(hn), 'Mack Johnson', hn) + def test_portuguese_prefixes(self): + hn = HumanName("joao da silva do amaral de souza") + hn.capitalize() + self.m(str(hn), 'Joao da Silva do Amaral de Souza', hn) + + def test_capitalize_prefix_clash_on_first_name(self): + hn = HumanName("van nguyen") + hn.capitalize() + self.m(str(hn), 'Van Nguyen', hn) + + class HumanNameOutputFormatTests(HumanNameTestBase): - + def test_formatting_init_argument(self): - hn = HumanName("Rev John A. Kenneth Doe III (Kenny)", - string_format = "TEST1") + hn = HumanName("Rev John A. Kenneth Doe III (Kenny)", + string_format="TEST1") self.assertEqual(u(hn), "TEST1") def test_formatting_constants_attribute(self): @@ -1889,6 +2171,28 @@ def test_formatting_constants_attribute(self): self.assertEqual(u(hn), "TEST2") CONSTANTS.string_format = _orig + def test_capitalize_name_constants_attribute(self): + from nameparser.config import CONSTANTS + CONSTANTS.capitalize_name = True + hn = HumanName("bob v. 
de la macdole-eisenhower phd") + self.assertEqual(str(hn), "Bob V. de la MacDole-Eisenhower Ph.D.") + CONSTANTS.capitalize_name = False + + def test_force_mixed_case_capitalization_constants_attribute(self): + from nameparser.config import CONSTANTS + CONSTANTS.force_mixed_case_capitalization = True + hn = HumanName('Shirley Maclaine') + hn.capitalize() + self.assertEqual(str(hn), "Shirley MacLaine") + CONSTANTS.force_mixed_case_capitalization = False + + def test_capitalize_name_and_force_mixed_case_capitalization_constants_attributes(self): + from nameparser.config import CONSTANTS + CONSTANTS.capitalize_name = True + CONSTANTS.force_mixed_case_capitalization = True + hn = HumanName('Shirley Maclaine') + self.assertEqual(str(hn), "Shirley MacLaine") + def test_quote_nickname_formating(self): hn = HumanName("Rev John A. Kenneth Doe III (Kenny)") hn.string_format = "{title} {first} {middle} {last} {suffix} '{nickname}'" @@ -1917,51 +2221,51 @@ def test_formating_removing_pieces_from_name_buckets(self): self.assertEqual(u(hn), "Rev John A. Kenneth Doe III 'Kenny'") hn.string_format = "{title} {first} {middle} {last} {suffix}" self.assertEqual(u(hn), "Rev John A. Kenneth Doe III") - hn.middle='' + hn.middle = '' self.assertEqual(u(hn), "Rev John Doe III") - hn.suffix='' + hn.suffix = '' self.assertEqual(u(hn), "Rev John Doe") - hn.title='' + hn.title = '' self.assertEqual(u(hn), "John Doe") def test_formating_of_nicknames_with_parenthesis(self): hn = HumanName("Rev John A. Kenneth Doe III (Kenny)") hn.string_format = "{title} {first} {middle} {last} {suffix} ({nickname})" self.assertEqual(u(hn), "Rev John A. Kenneth Doe III (Kenny)") - hn.nickname='' + hn.nickname = '' self.assertEqual(u(hn), "Rev John A. Kenneth Doe III") def test_formating_of_nicknames_with_single_quotes(self): hn = HumanName("Rev John A. Kenneth Doe III (Kenny)") hn.string_format = "{title} {first} {middle} {last} {suffix} '{nickname}'" self.assertEqual(u(hn), "Rev John A. 
Kenneth Doe III 'Kenny'") - hn.nickname='' + hn.nickname = '' self.assertEqual(u(hn), "Rev John A. Kenneth Doe III") def test_formating_of_nicknames_with_double_quotes(self): hn = HumanName("Rev John A. Kenneth Doe III (Kenny)") hn.string_format = "{title} {first} {middle} {last} {suffix} \"{nickname}\"" self.assertEqual(u(hn), "Rev John A. Kenneth Doe III \"Kenny\"") - hn.nickname='' + hn.nickname = '' self.assertEqual(u(hn), "Rev John A. Kenneth Doe III") def test_formating_of_nicknames_in_middle(self): hn = HumanName("Rev John A. Kenneth Doe III (Kenny)") hn.string_format = "{title} {first} ({nickname}) {middle} {last} {suffix}" self.assertEqual(u(hn), "Rev John (Kenny) A. Kenneth Doe III") - hn.nickname='' + hn.nickname = '' self.assertEqual(u(hn), "Rev John A. Kenneth Doe III") - + def test_remove_emojis(self): hn = HumanName("Sam Smith 😊") - self.m(hn.first,"Sam", hn) - self.m(hn.last,"Smith", hn) + self.m(hn.first, "Sam", hn) + self.m(hn.last, "Smith", hn) self.assertEqual(u(hn), "Sam Smith") def test_keep_non_emojis(self): hn = HumanName("∫≜⩕ Smith 😊") - self.m(hn.first,"∫≜⩕", hn) - self.m(hn.last,"Smith", hn) + self.m(hn.first, "∫≜⩕", hn) + self.m(hn.last, "Smith", hn) self.assertEqual(u(hn), "∫≜⩕ Smith") def test_keep_emojis(self): @@ -1969,11 +2273,120 @@ def test_keep_emojis(self): constants = Constants() constants.regexes.emoji = False hn = HumanName("∫≜⩕ Smith😊", constants) - self.m(hn.first,"∫≜⩕", hn) - self.m(hn.last,"Smith😊", hn) + self.m(hn.first, "∫≜⩕", hn) + self.m(hn.last, "Smith😊", hn) self.assertEqual(u(hn), "∫≜⩕ Smith😊") # test cleanup + +class InitialsTestCase(HumanNameTestBase): + def test_initials(self): + hn = HumanName("Andrew Boris Petersen") + self.m(hn.initials(), "A. B. P.", hn) + + def test_initials_simple_name(self): + hn = HumanName("John Doe") + self.m(hn.initials(), "J. D.", hn) + hn = HumanName("John Doe", initials_format="{first} {last}") + self.m(hn.initials(), "J. 
D.", hn) + hn = HumanName("John Doe", initials_format="{last}") + self.m(hn.initials(), "D.", hn) + hn = HumanName("John Doe", initials_format="{first}") + self.m(hn.initials(), "J.", hn) + hn = HumanName("John Doe", initials_format="{middle}") + self.m(hn.initials(), "", hn) + + def test_initials_complex_name(self): + hn = HumanName("Doe, John A. Kenneth, Jr.") + self.m(hn.initials(), "J. A. K. D.", hn) + + def test_initials_format(self): + hn = HumanName("Doe, John A. Kenneth, Jr.", initials_format="{first} {middle}") + self.m(hn.initials(), "J. A. K.", hn) + hn = HumanName("Doe, John A. Kenneth, Jr.", initials_format="{first} {last}") + self.m(hn.initials(), "J. D.", hn) + hn = HumanName("Doe, John A. Kenneth, Jr.", initials_format="{middle} {last}") + self.m(hn.initials(), "A. K. D.", hn) + hn = HumanName("Doe, John A. Kenneth, Jr.", initials_format="{first}, {last}") + self.m(hn.initials(), "J., D.", hn) + + def test_initials_format_constants(self): + from nameparser.config import CONSTANTS + _orig = CONSTANTS.initials_format + CONSTANTS.initials_format = "{first} {last}" + hn = HumanName("Doe, John A. Kenneth, Jr.") + self.m(hn.initials(), "J. D.", hn) + CONSTANTS.initials_format = "{first} {last}" + hn = HumanName("Doe, John A. Kenneth, Jr.") + self.m(hn.initials(), "J. D.", hn) + CONSTANTS.initials_format = _orig + + def test_initials_delimiter(self): + hn = HumanName("Doe, John A. Kenneth, Jr.", initials_delimiter=";") + self.m(hn.initials(), "J; A; K; D;", hn) + + def test_initials_delimiter_constants(self): + from nameparser.config import CONSTANTS + _orig = CONSTANTS.initials_delimiter + CONSTANTS.initials_delimiter = ";" + hn = HumanName("Doe, John A. Kenneth, Jr.") + self.m(hn.initials(), "J; A; K; D;", hn) + CONSTANTS.initials_delimiter = _orig + + def test_initials_list(self): + hn = HumanName("Andrew Boris Petersen") + self.m(hn.initials_list(), ["A", "B", "P"], hn) + + def test_initials_list_complex_name(self): + hn = HumanName("Doe, John A. 
Kenneth, Jr.") + self.m(hn.initials_list(), ["J", "A", "K", "D"], hn) + + def test_initials_with_prefix_firstname(self): + hn = HumanName("Van Jeremy Johnson") + self.m(hn.initials_list(), ["V", "J", "J"], hn) + + def test_initials_with_prefix(self): + hn = HumanName("Alex van Johnson") + self.m(hn.initials_list(), ["A", "J"], hn) + + def test_constructor_first(self): + hn = HumanName(first="TheName") + self.assertFalse(hn.unparsable) + self.m(hn.first, "TheName", hn) + + def test_constructor_middle(self): + hn = HumanName(middle="TheName") + self.assertFalse(hn.unparsable) + self.m(hn.middle, "TheName", hn) + + def test_constructor_last(self): + hn = HumanName(last="TheName") + self.assertFalse(hn.unparsable) + self.m(hn.last, "TheName", hn) + + def test_constructor_title(self): + hn = HumanName(title="TheName") + self.assertFalse(hn.unparsable) + self.m(hn.title, "TheName", hn) + + def test_constructor_suffix(self): + hn = HumanName(suffix="TheName") + self.assertFalse(hn.unparsable) + self.m(hn.suffix, "TheName", hn) + + def test_constructor_nickname(self): + hn = HumanName(nickname="TheName") + self.assertFalse(hn.unparsable) + self.m(hn.nickname, "TheName", hn) + + def test_constructor_multiple(self): + hn = HumanName(first="TheName", last="lastname", title="mytitle", full_name="donotparse") + self.assertFalse(hn.unparsable) + self.m(hn.first, "TheName", hn) + self.m(hn.last, "lastname", hn) + self.m(hn.title, "mytitle", hn) + + TEST_NAMES = ( "John Doe", "John Doe, Jr.", @@ -2147,7 +2560,9 @@ def test_keep_emojis(self): "Designated Judge David A. Ezra", "Sr US District Judge Richard G Kopf", "U.S. District Judge Marc Thomas Treadwell", - + "Dra. Andréia da Silva", + "Srta. 
Andréia da Silva", + ) @@ -2162,7 +2577,7 @@ def test_variations_of_TEST_NAMES(self): hn = HumanName(name) if len(hn.suffix_list) > 1: hn = HumanName("{title} {first} {middle} {last} {suffix}".format(**hn.as_dict()).split(',')[0]) - hn.C.empty_attribute_default = '' # format strings below require empty string + hn.C.empty_attribute_default = '' # format strings below require empty string hn_dict = hn.as_dict() attrs = [ 'title', @@ -2194,11 +2609,12 @@ def test_variations_of_TEST_NAMES(self): if len(sys.argv) > 1: log.setLevel(logging.ERROR) log.addHandler(logging.StreamHandler()) - name = sys.argv[1] - hn = HumanName(name, encoding=sys.stdout.encoding) - print((repr(hn))) - hn.capitalize() - print((repr(hn))) + name_string = sys.argv[1] + hn_instance = HumanName(name_string, encoding=sys.stdout.encoding) + print((repr(hn_instance))) + hn_instance.capitalize() + print((repr(hn_instance))) + print("Initials: " + hn_instance.initials()) else: print("-"*80) print("Running tests")