├── AUTHORS
├── dev-requirements.txt
├── setup.cfg
├── MANIFEST.in
├── .tm_properties
├── .gitignore
├── nameparser
    ├── __init__.py
    ├── config
    │   ├── capitalization.py
    │   ├── conjunctions.py
    │   ├── prefixes.py
    │   ├── regexes.py
    │   ├── __init__.py
    │   ├── suffixes.py
    │   └── titles.py
    ├── util.py
    └── parser.py
├── docs
    ├── contributing.rst
    ├── modules.rst
    ├── resources.rst
    ├── index.rst
    ├── Makefile
    ├── usage.rst
    ├── release_log.rst
    ├── conf.py
    └── customize.rst
├── .editorconfig
├── .travis.yml
├── LICENSE
├── .github
    └── workflows
    │   ├── python-publish.yml
    │   └── python-package.yml
├── setup.py
├── CONTRIBUTING.md
└── README.rst


/AUTHORS:
--------------------------------------------------------------------------------
1 | Derek Gulbranson <derek73@gmail.com>
2 | 


--------------------------------------------------------------------------------
/dev-requirements.txt:
--------------------------------------------------------------------------------
1 | dill>=0.2.5
2 | Sphinx
3 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [bdist_wheel]
2 | universal = 1
3 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include AUTHORS
2 | include LICENSE
3 | include README.rst
4 | include tests.py
5 | 


--------------------------------------------------------------------------------
/.tm_properties:
--------------------------------------------------------------------------------
1 | excludeDirectories = "{$excludeDirectories,dist,*.egg-info,build,docs/_*}"
2 | include = "{$include,.gitignore,.hgignore,.travis.yml}"
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.hgrc
 2 | *.DS_Store
 3 | __pycache__/
 4 | *.py[cod]
 5 | .python2/
 6 | MANIFEST
 7 | nameparser.egg-info/
 8 | build
 9 | *.egg
10 | .coverage
11 | dist
12 | .idea
13 | Pipfile
14 | Pipfile.lock
15 | 
16 | # docs
17 | docs/_*
18 | 


--------------------------------------------------------------------------------
/nameparser/__init__.py:
--------------------------------------------------------------------------------
 1 | VERSION = (1, 1, 3)
 2 | __version__ = '.'.join(map(str, VERSION))
 3 | __author__ = "Derek Gulbranson"
 4 | __author_email__ = 'derek73@gmail.com'
 5 | __license__ = "LGPL"
 6 | __url__ = "https://github.com/derek73/python-nameparser"
 7 | 
 8 | 
 9 | from nameparser.parser import HumanName
10 | 


--------------------------------------------------------------------------------
/docs/contributing.rst:
--------------------------------------------------------------------------------
 1 | Contributing
 2 | ============
 3 | 
 4 | The project is hosted on GitHub:
 5 | 
 6 | https://github.com/derek73/python-nameparser
 7 | 
 8 | Find more information about running tests and contributing to the project in the project's contribution guide:
 9 | 
10 | https://github.com/derek73/python-nameparser/blob/master/CONTRIBUTING.md
11 | 
12 | 


--------------------------------------------------------------------------------
/nameparser/config/capitalization.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from __future__ import unicode_literals
 3 | 
 4 | CAPITALIZATION_EXCEPTIONS = (
 5 |     ('ii', 'II'),
 6 |     ('iii', 'III'),
 7 |     ('iv', 'IV'),
 8 |     ('md', 'M.D.'),
 9 |     ('phd', 'Ph.D.'),
10 | )
11 | """
12 | Any pieces that are not capitalized by capitalizing the first letter.
13 | """


--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
 1 | # http://editorconfig.org
 2 | 
 3 | root = true
 4 | 
 5 | [*]
 6 | charset = utf-8
 7 | end_of_line = lf
 8 | insert_final_newline = true
 9 | trim_trailing_whitespace = true
10 | 
11 | [*.{py,rst,ini}]
12 | indent_style = space
13 | indent_size = 4
14 | 
15 | [*.{html,json,yml}]
16 | indent_style = space
17 | indent_size = 2
18 | 
19 | [*.md]
20 | trim_trailing_whitespace = false
21 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | python:
 3 |   - "2.7"
 4 |   - "3.4"
 5 |   - "3.5"
 6 |   - "3.6"
 7 |   - "3.7"
 8 |   - "3.8"
 9 | # command to install dependencies
10 | install:
11 |   - if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install unittest2; fi
12 |   - if [[ $TRAVIS_PYTHON_VERSION -ne '3.4' ]]; then pip install dill; fi
13 |   - "python setup.py install"
14 | # command to run tests
15 | script: python tests.py
16 | sudo: false
17 | 


--------------------------------------------------------------------------------
/nameparser/config/conjunctions.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from __future__ import unicode_literals
 3 | 
 4 | CONJUNCTIONS = set([
 5 |     '&',
 6 |     'and',
 7 |     'et',
 8 |     'e',
 9 |     'of',
10 |     'the',
11 |     'und',
12 |     'y',
13 | ])
14 | """
15 | Pieces that should join to their neighboring pieces, e.g. "and", "y" and "&".
16 | "of" and "the" are also included to facilitate joining multiple titles,
17 | e.g. "President of the United States".
18 | """


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright Derek Gulbranson <derek73 at gmail>.
 2 | http://derekgulbranson.com/
 3 | 
 4 | -----
 5 | 
 6 | LGPL-2.1+
 7 | http://www.opensource.org/licenses/lgpl-license.html
 8 | 
 9 | This library is free software; you can redistribute it and/or modify it under the
10 | terms of the GNU Lesser General Public License as published by the Free Software
11 | Foundation; either version 2.1 of the License, or (at your option) any later
12 | version.
13 | 
14 | This library is distributed in the hope that it will be useful, but WITHOUT ANY
15 | WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
16 | PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.
17 | 


--------------------------------------------------------------------------------
/docs/modules.rst:
--------------------------------------------------------------------------------
 1 | HumanName Class Documentation
 2 | ==============================
 3 | 
 4 | HumanName.parser
 5 | ----------------
 6 | 
 7 | .. py:module:: nameparser.parser
 8 | 
 9 | .. py:class:: HumanName
10 |     :noindex:
11 | 
12 | .. autoclass:: HumanName
13 |     :members:
14 |     :special-members: __eq__, __init__
15 | 
16 | HumanName.config
17 | ----------------
18 | 
19 | .. automodule:: nameparser.config
20 |     :members:
21 | 
22 | HumanName.config Defaults
23 | -------------------------
24 | 
25 | 
26 | .. automodule:: nameparser.config.titles
27 |     :members:
28 | .. automodule:: nameparser.config.suffixes
29 |     :members:
30 | .. automodule:: nameparser.config.prefixes
31 |     :members:
32 | .. automodule:: nameparser.config.conjunctions
33 |     :members:
34 | .. automodule:: nameparser.config.capitalization
35 |     :members:
36 | .. automodule:: nameparser.config.regexes
37 |     :members: 
38 | 


--------------------------------------------------------------------------------
/nameparser/util.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | 
 3 | # http://code.google.com/p/python-nameparser/issues/detail?id=10
 4 | log = logging.getLogger('HumanName')
 5 | try:
 6 |     log.addHandler(logging.NullHandler())
 7 | except AttributeError:
 8 |     class NullHandler(logging.Handler):
 9 |         def emit(self, record):
10 |             pass
11 |     log.addHandler(NullHandler())
12 | log.setLevel(logging.ERROR)
13 | 
14 | 
15 | import sys
16 | if sys.version_info[0] < 3:
17 | 
18 |     text_type = unicode
19 |     binary_type = str
20 | 
21 |     def u(x, encoding=None):
22 |         if encoding:
23 |             return unicode(x, encoding)
24 |         else:
25 |             return unicode(x)
26 | 
27 | else:
28 |     text_type = str
29 |     binary_type = bytes
30 | 
31 |     def u(x, encoding=None):
32 |         return text_type(x)
33 | 
34 | text_types = (text_type, binary_type)
35 | def lc(value):
36 |     """Lower case and strip leading/trailing periods to normalize for comparison."""
37 |     if not value:
38 |         return ''
39 |     return value.lower().strip('.')
40 | 


--------------------------------------------------------------------------------
/docs/resources.rst:
--------------------------------------------------------------------------------
 1 | Naming Practices and Resources
 2 | ==============================
 3 | 
 4 |     * US_Census_Surname_Data_2000_
 5 |     * US_Social_Security_Administration_Baby_Names_Index_
 6 |     * Naming_practice_guide_UK_2006_
 7 |     * Wikipedia_Anthroponymy_
 8 |     * Wikipedia_Naming_conventions_
 9 |     * Wikipedia_List_Of_Titles_
10 |     * Tussenvoegsel_
11 |     * Family_Name_Affixes_
12 | 
13 | .. _US_Census_Surname_Data_2000: https://www.census.gov/data/developers/data-sets/surnames/2000.html
14 | .. _US_Social_Security_Administration_Baby_Names_Index: https://www.ssa.gov/oact/babynames/limits.html
15 | .. _Naming_practice_guide_UK_2006: https://www.fbiic.gov/public/2008/nov/Naming_practice_guide_UK_2006.pdf
16 | .. _Wikipedia_Anthroponymy: https://en.wikipedia.org/wiki/Anthroponymy
17 | .. _Wikipedia_Naming_conventions: http://en.wikipedia.org/wiki/Wikipedia:Naming_conventions_(people)
18 | .. _Wikipedia_List_Of_Titles: https://en.wikipedia.org/wiki/Title
19 | .. _Tussenvoegsel: https://en.wikipedia.org/wiki/Tussenvoegsel
20 | .. _Family_Name_Affixes: https://en.wikipedia.org/wiki/List_of_family_name_affixes
21 | 


--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will upload a Python Package using Twine when a release is created
 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
 3 | 
 4 | # This workflow uses actions that are not certified by GitHub.
 5 | # They are provided by a third-party and are governed by
 6 | # separate terms of service, privacy policy, and support
 7 | # documentation.
 8 | 
 9 | name: Upload Published Python Package
10 | 
11 | on:
12 |   release:
13 |     types: [published]
14 | 
15 | jobs:
16 |   deploy:
17 | 
18 |     runs-on: ubuntu-latest
19 | 
20 |     steps:
21 |     - uses: actions/checkout@v2
22 |     - name: Set up Python
23 |       uses: actions/setup-python@v2
24 |       with:
25 |         python-version: '3.x'
26 |     - name: Install dependencies
27 |       run: |
28 |         python -m pip install --upgrade pip
29 |         pip install build
30 |     - name: Build package
31 |       run: python -m build
32 |     - name: Publish package
33 |       uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
34 |       with:
35 |         user: __token__
36 |         password: ${{ secrets.PYPI_API_TOKEN }}
37 | 


--------------------------------------------------------------------------------
/.github/workflows/python-package.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
 3 | 
 4 | name: Test the Python package
 5 | 
 6 | on:
 7 |   workflow_dispatch:
 8 |   push:
 9 |     branches: [ master ]
10 |   pull_request:
11 |     branches: [ master ]
12 | 
13 | jobs:
14 |   build:
15 | 
16 |     runs-on: ubuntu-latest
17 |     strategy:
18 |       fail-fast: false
19 |       matrix:
20 |         python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
21 | 
22 |     steps:
23 |     - uses: actions/checkout@v2
24 |     - name: Set up Python ${{ matrix.python-version }}
25 |       uses: actions/setup-python@v2
26 |       with:
27 |         python-version: ${{ matrix.python-version }}
28 |     - name: Install dependencies
29 |       run: |
30 |         python -m pip install --upgrade pip
31 |         python -m pip install twine
32 |         python -m pip install sphinx
33 |         if [ -f dev-requirements.txt ]; then pip install -r dev-requirements.txt; fi
34 |     - name: Run Tests
35 |       run: |
36 |         python tests.py
37 |         python setup.py sdist
38 |         twine check dist/*
39 |         sphinx-build -b html docs dist/docs
40 | 


--------------------------------------------------------------------------------
/nameparser/config/prefixes.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from __future__ import unicode_literals
 3 | 
 4 | #: Name pieces that appear before a last name. Prefixes join to the piece
 5 | #: that follows them to make one new piece. They can be chained together, e.g.
 6 | #: "von der" and "de la". Because they only appear in middle or last names,
 7 | #: they also signify that all following name pieces should be in the same name
 8 | #: part. For example, "von" will be joined to all following pieces that are not
 9 | #: prefixes or suffixes, allowing recognition of double last names when they
10 | #: appear after a prefix. So in "pennie von bergen wessels MD", "von" will
11 | #: join with all following name pieces until the suffix "MD", resulting in the
12 | #: correct parsing of the last name "von bergen wessels".
13 | PREFIXES = set([
14 |     'abu',
15 |     'al',
16 |     'bin',
17 |     'bon',
18 |     'da',
19 |     'dal',
20 |     'de',
21 |     'de\'',
22 |     'degli',
23 |     'dei',
24 |     'del',
25 |     'dela',
26 |     'della',
27 |     'delle',
28 |     'delli',
29 |     'dello',
30 |     'der',
31 |     'di',
32 |     'dí',
33 |     'do',
34 |     'dos',
35 |     'du',
36 |     'ibn',
37 |     'la',
38 |     'le',
39 |     'mac',
40 |     'mc',
41 |     'san',
42 |     'santa',
43 |     'st',
44 |     'ste',
45 |     'van',
46 |     'vander',
47 |     'vel',
48 |     'von',
49 |     'vom',
50 | ])
51 | 


--------------------------------------------------------------------------------
/nameparser/config/regexes.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from __future__ import unicode_literals
 3 | import re
 4 | 
 5 | # emoji regex from https://stackoverflow.com/questions/26568722/remove-unicode-emoji-using-re-in-python
 6 | try:
 7 |     # Wide UCS-4 build
 8 |     re_emoji = re.compile('['
 9 |         '\U0001F300-\U0001F64F'
10 |         '\U0001F680-\U0001F6FF'
11 |         '\u2600-\u26FF\u2700-\u27BF]+', 
12 |         re.UNICODE)
13 | except re.error:
14 |     # Narrow UCS-2 build
15 |     re_emoji = re.compile('('
16 |         '\ud83c[\udf00-\udfff]|'
17 |         '\ud83d[\udc00-\ude4f\ude80-\udeff]|'
18 |         '[\u2600-\u26FF\u2700-\u27BF])+', 
19 |         re.UNICODE)
20 | 
21 | REGEXES = set([
22 |     ("spaces", re.compile(r"\s+", re.U)),
23 |     ("word", re.compile(r"(\w|\.)+", re.U)),
24 |     ("mac", re.compile(r'^(ma?c)(\w{2,})', re.I | re.U)),
25 |     ("initial", re.compile(r'^(\w\.|[A-Z])?$', re.U)),
26 |     ("quoted_word", re.compile(r'(?<!\w)\'([^\s]*?)\'(?!\w)', re.U)),
27 |     ("double_quotes", re.compile(r'\"(.*?)\"', re.U)),
28 |     ("parenthesis", re.compile(r'\((.*?)\)', re.U)),
29 |     ("roman_numeral", re.compile(r'^(X|IX|IV|V?I{0,3})$', re.I | re.U)),
30 |     ("no_vowels",re.compile(r'^[^aeyiuo]+$', re.I | re.U)),
31 |     ("period_not_at_end",re.compile(r'.*\..+$', re.I | re.U)),
32 |     ("emoji",re_emoji),
33 |     ("phd", re.compile(r'\s(ph\.?\s+d\.?)', re.I | re.U)),
34 | ])
35 | """
36 | All regular expressions used by the parser are precompiled and stored in the config.
37 | """
38 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | try:
 3 |   from setuptools import setup
 4 | except ImportError:
 5 |   from distutils.core import setup
 6 | import nameparser
 7 | import os
 8 | 
 9 | def read(fname):
10 |     return open(os.path.join(os.path.dirname(__file__), fname)).read()
11 | 
12 | README = read('README.rst')
13 | 
14 | setup(name='nameparser',
15 |       packages      = ['nameparser','nameparser.config'],
16 |       description   = 'A simple Python module for parsing human names into their individual components.',
17 |       long_description = README,
18 |       long_description_content_type = "text/x-rst",
19 |       version       = nameparser.__version__,
20 |       url           = nameparser.__url__,
21 |       author        = nameparser.__author__,
22 |       author_email  = nameparser.__author_email__,
23 |       license       = nameparser.__license__,
24 |       keywords      = ['names','parser'],
25 |       classifiers = [
26 |           'Intended Audience :: Developers',
27 |           'Operating System :: OS Independent',
28 |           "License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)",
29 |           'Programming Language :: Python',
30 |           'Programming Language :: Python :: 2',
31 |           'Programming Language :: Python :: 3',
32 |           'Development Status :: 5 - Production/Stable',
33 |           'Natural Language :: English',
34 |           "Topic :: Software Development :: Libraries :: Python Modules",
35 |           'Topic :: Text Processing :: Linguistic',
36 |       ]
37 |       )
38 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | Contributing
 2 | ==============
 3 | 
 4 | Development Environment Setup
 5 | --------------------------------
 6 | 
 7 | There are some external dependencies required in order to run the
 8 | tests, located in the dev-requirements.txt file.
 9 | 
10 |     pip install -r dev-requirements.txt
11 | 
12 | If you are running Python 2.6 you will also need to `pip install unittest2`
13 | in order to run the tests.
14 | 
15 | Travis CI
16 | ---------
17 | 
18 | [![Build Status](https://travis-ci.org/derek73/python-nameparser.svg?branch=master)](https://travis-ci.org/derek73/python-nameparser)
19 | 
20 | The GitHub project is set up with Travis CI. Tests are run
21 | automatically against new code pushes to any branch in the main
22 | repository. Test results may be viewed here:
23 | 
24 | https://travis-ci.org/derek73/python-nameparser
25 | 
26 | Running Tests
27 | ---------------
28 | 
29 | To run the tests locally, run `python tests.py`.
30 | 
31 | 
32 |     python tests.py
33 | 
34 | 
35 | You can also pass a name string to `tests.py` to see how it will be parsed.
36 | 
37 |     $ python tests.py "Secretary of State Hillary Rodham-Clinton"
38 |     <HumanName : [
39 |     	Title: 'Secretary of State' 
40 |     	First: 'Hillary' 
41 |     	Middle: '' 
42 |     	Last: 'Rodham-Clinton' 
43 |     	Suffix: ''
44 |     ]>
45 | 
46 | 
47 | Writing Tests
48 | ----------------
49 | 
50 | If you make changes, please make sure you include tests with example
51 | names that you want to be parsed correctly.
52 | 
53 | It's a good idea to include tests of alternate comma placement formats
54 | of the name to ensure that the 3 code paths for the 3 formats work in
55 | the same way.
56 | 
57 | The tests could be MUCH better. If the spirit moves you to design or
58 | implement a much more intelligent test strategy, please know that your
59 | efforts will be welcome and appreciated.
60 | 
61 | Unless you add better coverage someplace else, add a few examples of
62 | your names to `TEST_NAMES`. A test attempts to try the 3 different
63 | comma variations of these names automatically and make sure things
64 | don't blow up, so it can be a helpful regression indicator.
65 | 
66 | 
67 | New Releases
68 | ------------
69 | 
70 | [Publishing to Pypi Guide](https://hynek.me/articles/sharing-your-labor-of-love-pypi-quick-and-dirty/)
71 | 
72 |     $ python setup.py sdist bdist_wheel
73 |     $ twine upload dist/*
74 | 
75 | 


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
 1 | .. Nameparser documentation master file, created by
 2 |    sphinx-quickstart on Fri May 16 01:29:58 2014.
 3 |    You can adapt this file completely to your liking, but it should at least
 4 |    contain the root `toctree` directive.
 5 | 
 6 | Python Human Name Parser
 7 | ========================
 8 | 
 9 | Version |release| 
10 | 
11 | A simple Python module for parsing human names into their individual 
12 | components. 
13 | 
14 | * hn.title
15 | * hn.first
16 | * hn.middle
17 | * hn.last
18 | * hn.suffix
19 | * hn.nickname
20 | 
21 | Supports 3 different comma placement variations in the input string.
22 | 
23 | 1. Title Firstname "Nickname" Middle Middle Lastname Suffix
24 | 2. Lastname [Suffix], Title Firstname (Nickname) Middle Middle[,] Suffix [, Suffix]
25 | 3. Title Firstname M Lastname [Suffix], Suffix [Suffix] [, Suffix]
26 | 
27 | 
28 | It attempts the best guess that can be made with a simple, rule-based 
29 | approach. It's not perfect, but it gets you pretty far. 
30 | 
31 | Its main use case is English. It may also be useful for other Latin-based languages, especially 
32 | if you are willing to `customize it`_, but it is not likely to be useful for languages 
33 | that do not share the same structure as English names.
34 | 
35 | .. _customize it: customize.html
36 | 
37 | Instantiating the `HumanName` class with a string splits on commas and then spaces, 
38 | classifying name parts based on placement in the string and matches against known name 
39 | pieces like titles. It joins name pieces on conjunctions and special prefixes to last names like 
40 | "del". Titles can be chained together and include conjunctions to handle 
41 | titles like "Asst Secretary of State". It can also try to correct 
42 | capitalization.
43 | 
44 | It does not attempt to correct input mistakes. When there is ambiguity that cannot be resolved by a rule-based approach, 
45 | HumanName prefers to handle the most common cases correctly. For example, 
46 | "Dean" is not parsed as a title because it is more common as a first name 
47 | (You can customize this behavior though, see `Parser Customization Examples`_).
48 | 
49 | .. _Parser Customization Examples: customize.html#parser-customization-examples
50 | 
51 | 
52 | Parsing Names
53 | -------------
54 | 
55 | .. toctree::
56 |    :maxdepth: 2
57 |    
58 |    usage
59 |    customize
60 | 
61 | **Developer Documentation**
62 | 
63 | .. toctree::
64 |    :maxdepth: 2
65 |    
66 |    modules
67 |    resources
68 |    release_log
69 |    contributing
70 | 
71 | 
72 | 
73 | Indices and tables
74 | ==================
75 | 
76 | * :ref:`genindex`
77 | * :ref:`modindex`
78 | * :ref:`search`
79 | 
80 | 
81 | **GitHub Project**: https://github.com/derek73/python-nameparser
82 | 
83 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
  1 | Name Parser
  2 | ===========
  3 | 
  4 | |Build Status| |PyPI| |PyPI version| |Documentation|
  5 | 
  6 | A simple Python (3.2+ & 2.6+) module for parsing human names into their
  7 | individual components. 
  8 | 
  9 | * hn.title
 10 | * hn.first
 11 | * hn.middle
 12 | * hn.last
 13 | * hn.suffix
 14 | * hn.nickname
 15 | * hn.surnames *(middle + last)*
 16 | * hn.initials *(first initial of each name part)*
 17 | 
 18 | Supported Name Structures
 19 | ~~~~~~~~~~~~~~~~~~~~~~~~~
 20 | 
 21 | The supported name structure is generally "Title First Middle Last Suffix", where all pieces 
 22 | are optional. Comma-separated format like "Last, First" is also supported.
 23 | 
 24 | 1. Title Firstname "Nickname" Middle Middle Lastname Suffix
 25 | 2. Lastname [Suffix], Title Firstname (Nickname) Middle Middle[,] Suffix [, Suffix]
 26 | 3. Title Firstname M Lastname [Suffix], Suffix [Suffix] [, Suffix]
 27 | 
 28 | Instantiating the `HumanName` class with a string splits on commas and then spaces, 
 29 | classifying name parts based on placement in the string and matches against known name 
 30 | pieces like titles and suffixes. 
 31 | 
 32 | It correctly handles some common conjunctions and special prefixes to last names
 33 | like "del". Titles and conjunctions can be chained together to handle complex
 34 | titles like "Asst Secretary of State". It can also try to correct capitalization
 35 | of names that are entirely upper- or lowercase.
 36 | 
 37 | It attempts the best guess that can be made with a simple, rule-based approach. 
 38 | Its main use case is English and it is not likely to be useful for languages 
 39 | that do not conform to the supported name structure. It's not perfect, but it 
 40 | gets you pretty far.
 41 | 
 42 | Installation
 43 | ------------
 44 | 
 45 | ::
 46 | 
 47 |   pip install nameparser
 48 | 
 49 | If you want to try out the latest code from GitHub you can
 50 | install with pip using the command below.
 51 | 
 52 | ``pip install -e git+https://github.com/derek73/python-nameparser.git#egg=nameparser``
 53 | 
 54 | If you need to handle lists of names, check out
 55 | `namesparser <https://github.com/gwu-libraries/namesparser>`_, a
 56 | complement to this module that handles multiple names in a string.
 57 | 
 58 | 
 59 | Quick Start Example
 60 | -------------------
 61 | 
 62 | ::
 63 | 
 64 |     >>> from nameparser import HumanName
 65 |     >>> name = HumanName("Dr. Juan Q. Xavier de la Vega III (Doc Vega)")
 66 |     >>> name 
 67 |     <HumanName : [
 68 |     	title: 'Dr.' 
 69 |     	first: 'Juan' 
 70 |     	middle: 'Q. Xavier' 
 71 |     	last: 'de la Vega' 
 72 |     	suffix: 'III'
 73 |     	nickname: 'Doc Vega'
 74 |     ]>
 75 |     >>> name.last
 76 |     'de la Vega'
 77 |     >>> name.as_dict()
 78 |     {'last': 'de la Vega', 'suffix': 'III', 'title': 'Dr.', 'middle': 'Q. Xavier', 'nickname': 'Doc Vega', 'first': 'Juan'}
 79 |     >>> str(name)
 80 |     'Dr. Juan Q. Xavier de la Vega III (Doc Vega)'
 81 |     >>> name.string_format = "{first} {last}"
 82 |     >>> str(name)
 83 |     'Juan de la Vega'
 84 | 
 85 | 
 86 | The parser does not attempt to correct mistakes in the input. It mostly just splits on white
 87 | space and puts things in buckets based on their position in the string. This also means
 88 | the difference between 'title' and 'suffix' is positional, not semantic. "Dr" is a title
 89 | when it comes before the name and a suffix when it comes after. ("Pre-nominal"
 90 | and "post-nominal" would probably be better names.)
 91 | 
 92 | ::
 93 | 
 94 |     >>> name = HumanName("1 & 2, 3 4 5, Mr.")
 95 |     >>> name 
 96 |     <HumanName : [
 97 |     	title: '' 
 98 |     	first: '3' 
 99 |     	middle: '4 5' 
100 |     	last: '1 & 2' 
101 |     	suffix: 'Mr.'
102 |     	nickname: ''
103 |     ]>
104 | 
105 | Customization
106 | -------------
107 | 
108 | Your project may need some adjustment for your dataset. You can
109 | do this in your own pre- or post-processing, by `customizing the configured pre-defined 
110 | sets`_ of titles, prefixes, etc., or by subclassing the `HumanName` class. See the 
111 | `full documentation`_ for more information.
112 | 
113 | 
114 | `Full documentation`_
115 | ~~~~~~~~~~~~~~~~~~~~~
116 | 
117 | .. _customizing the configured pre-defined sets: http://nameparser.readthedocs.org/en/latest/customize.html
118 | .. _Full documentation: http://nameparser.readthedocs.org/en/latest/
119 | 
120 | 
121 | Contributing
122 | ------------
123 | 
124 | If you come across a name piece that you think should be in the default config, you're
125 | probably right. `Start a New Issue`_ and we can get them added. 
126 | 
127 | Please let me know if there are ways this library could be structured to make
128 | it easier for you to use in your projects. Read CONTRIBUTING.md_ for more info
129 | on running the tests and contributing to the project.
130 | 
131 | **GitHub Project**
132 | 
133 | https://github.com/derek73/python-nameparser
134 | 
135 | .. _CONTRIBUTING.md: https://github.com/derek73/python-nameparser/tree/master/CONTRIBUTING.md
136 | .. _Start a New Issue: https://github.com/derek73/python-nameparser/issues
137 | .. _click here to propose changes to the titles: https://github.com/derek73/python-nameparser/edit/master/nameparser/config/titles.py
138 | 
139 | .. |Build Status| image:: https://github.com/derek73/python-nameparser/actions/workflows/python-package.yml/badge.svg
140 |    :target: https://github.com/derek73/python-nameparser/actions/workflows/python-package.yml
141 | .. |PyPI| image:: https://img.shields.io/pypi/v/nameparser.svg
142 |    :target: https://pypi.org/project/nameparser/
143 | .. |Documentation| image:: https://readthedocs.org/projects/nameparser/badge/?version=latest
144 |    :target: http://nameparser.readthedocs.io/en/latest/?badge=latest
145 | .. |PyPI version| image:: https://img.shields.io/pypi/pyversions/nameparser.svg
146 |    :target: https://pypi.org/project/nameparser/
147 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
  1 | # Makefile for Sphinx documentation
  2 | #
  3 | 
  4 | # You can set these variables from the command line.
  5 | SPHINXOPTS    =
  6 | SPHINXBUILD   = sphinx-build
  7 | PAPER         =
  8 | BUILDDIR      = _build
  9 | 
 10 | # User-friendly check for sphinx-build
 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
 13 | endif
 14 | 
 15 | # Internal variables.
 16 | PAPEROPT_a4     = -D latex_paper_size=a4
 17 | PAPEROPT_letter = -D latex_paper_size=letter
 18 | ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
 19 | # the i18n builder cannot share the environment and doctrees with the others
 20 | I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
 21 | 
 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
 23 | 
 24 | help:
 25 | 	@echo "Please use \`make <target>' where <target> is one of"
 26 | 	@echo "  html       to make standalone HTML files"
 27 | 	@echo "  dirhtml    to make HTML files named index.html in directories"
 28 | 	@echo "  singlehtml to make a single large HTML file"
 29 | 	@echo "  pickle     to make pickle files"
 30 | 	@echo "  json       to make JSON files"
 31 | 	@echo "  htmlhelp   to make HTML files and a HTML help project"
 32 | 	@echo "  qthelp     to make HTML files and a qthelp project"
 33 | 	@echo "  devhelp    to make HTML files and a Devhelp project"
 34 | 	@echo "  epub       to make an epub"
 35 | 	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
 36 | 	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
 37 | 	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
 38 | 	@echo "  text       to make text files"
 39 | 	@echo "  man        to make manual pages"
 40 | 	@echo "  texinfo    to make Texinfo files"
 41 | 	@echo "  info       to make Texinfo files and run them through makeinfo"
 42 | 	@echo "  gettext    to make PO message catalogs"
 43 | 	@echo "  changes    to make an overview of all changed/added/deprecated items"
 44 | 	@echo "  xml        to make Docutils-native XML files"
 45 | 	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
 46 | 	@echo "  linkcheck  to check all external links for integrity"
 47 | 	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
 48 | 
 49 | clean:
 50 | 	rm -rf $(BUILDDIR)/*
 51 | 
 52 | html:
 53 | 	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
 54 | 	@echo
 55 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
 56 | 
 57 | dirhtml:
 58 | 	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
 59 | 	@echo
 60 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
 61 | 
 62 | singlehtml:
 63 | 	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
 64 | 	@echo
 65 | 	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
 66 | 
 67 | pickle:
 68 | 	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
 69 | 	@echo
 70 | 	@echo "Build finished; now you can process the pickle files."
 71 | 
 72 | json:
 73 | 	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
 74 | 	@echo
 75 | 	@echo "Build finished; now you can process the JSON files."
 76 | 
 77 | htmlhelp:
 78 | 	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
 79 | 	@echo
 80 | 	@echo "Build finished; now you can run HTML Help Workshop with the" \
 81 | 	      ".hhp project file in $(BUILDDIR)/htmlhelp."
 82 | 
 83 | qthelp:
 84 | 	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
 85 | 	@echo
 86 | 	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
 87 | 	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
 88 | 	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/Nameparser.qhcp"
 89 | 	@echo "To view the help file:"
 90 | 	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/Nameparser.qhc"
 91 | 
 92 | devhelp:
 93 | 	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
 94 | 	@echo
 95 | 	@echo "Build finished."
 96 | 	@echo "To view the help file:"
 97 | 	@echo "# mkdir -p $$HOME/.local/share/devhelp/Nameparser"
 98 | 	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/Nameparser"
 99 | 	@echo "# devhelp"
100 | 
101 | epub:
102 | 	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
103 | 	@echo
104 | 	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
105 | 
106 | latex:
107 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
108 | 	@echo
109 | 	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
110 | 	@echo "Run \`make' in that directory to run these through (pdf)latex" \
111 | 	      "(use \`make latexpdf' here to do that automatically)."
112 | 
113 | latexpdf:
114 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
115 | 	@echo "Running LaTeX files through pdflatex..."
116 | 	$(MAKE) -C $(BUILDDIR)/latex all-pdf
117 | 	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
118 | 
119 | latexpdfja:
120 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
121 | 	@echo "Running LaTeX files through platex and dvipdfmx..."
122 | 	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
123 | 	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
124 | 
125 | text:
126 | 	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
127 | 	@echo
128 | 	@echo "Build finished. The text files are in $(BUILDDIR)/text."
129 | 
130 | man:
131 | 	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
132 | 	@echo
133 | 	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
134 | 
135 | texinfo:
136 | 	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
137 | 	@echo
138 | 	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
139 | 	@echo "Run \`make' in that directory to run these through makeinfo" \
140 | 	      "(use \`make info' here to do that automatically)."
141 | 
142 | info:
143 | 	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
144 | 	@echo "Running Texinfo files through makeinfo..."
145 | 	$(MAKE) -C $(BUILDDIR)/texinfo info
146 | 	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
147 | 
148 | gettext:
149 | 	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
150 | 	@echo
151 | 	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
152 | 
153 | changes:
154 | 	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
155 | 	@echo
156 | 	@echo "The overview file is in $(BUILDDIR)/changes."
157 | 
158 | linkcheck:
159 | 	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
160 | 	@echo
161 | 	@echo "Link check complete; look for any errors in the above output " \
162 | 	      "or in $(BUILDDIR)/linkcheck/output.txt."
163 | 
164 | doctest:
165 | 	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
166 | 	@echo "Testing of doctests in the sources finished, look at the " \
167 | 	      "results in $(BUILDDIR)/doctest/output.txt."
168 | 
169 | xml:
170 | 	$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
171 | 	@echo
172 | 	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
173 | 
174 | pseudoxml:
175 | 	$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
176 | 	@echo
177 | 	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
178 | 


--------------------------------------------------------------------------------
/docs/usage.rst:
--------------------------------------------------------------------------------
  1 | Using the HumanName Parser
  2 | ==========================
  3 | 
  4 | Example Usage
  5 | -------------
  6 | 
  7 | The examples use Python 3, but Python 2.6+ is supported.
  8 | 
  9 | .. doctest::
 10 |     :options: +NORMALIZE_WHITESPACE
 11 | 
 12 |     >>> from nameparser import HumanName
 13 |     >>> name = HumanName("Dr. Juan Q. Xavier de la Vega III")
 14 |     >>> name.title
 15 |     'Dr.'
 16 |     >>> name["title"]
 17 |     'Dr.'
 18 |     >>> name.first
 19 |     'Juan'
 20 |     >>> name.middle
 21 |     'Q. Xavier'
 22 |     >>> name.last
 23 |     'de la Vega'
 24 |     >>> name.suffix
 25 |     'III'
 26 |     >>> name.surnames
 27 |     'Q. Xavier de la Vega'
 28 |     >>> name.full_name = "Juan Q. Xavier Velasquez y Garcia, Jr."
 29 |     >>> name
 30 |     <HumanName : [
 31 |     	title: '' 
 32 |     	first: 'Juan' 
 33 |     	middle: 'Q. Xavier' 
 34 |     	last: 'Velasquez y Garcia' 
 35 |     	suffix: 'Jr.'
 36 |     	nickname: ''
 37 |     ]>
 38 |     >>> name.middle = "Jason Alexander"
 39 |     >>> name.middle
 40 |     'Jason Alexander'
 41 |     >>> name
 42 |     <HumanName : [
 43 |         title: '' 
 44 |         first: 'Juan' 
 45 |         middle: 'Jason Alexander' 
 46 |         last: 'Velasquez y Garcia' 
 47 |         suffix: 'Jr.'
 48 |         nickname: ''
 49 |     ]>
 50 |     >>> name.middle = ["custom","values"]
 51 |     >>> name.middle
 52 |     'custom values'
 53 |     >>> name.full_name = 'Doe-Ray, Jonathan "John" A. Harris'
 54 |     >>> name.as_dict()
 55 |     {'last': 'Doe-Ray', 'suffix': '', 'title': '', 'middle': 'A. Harris', 'nickname': 'John', 'first': 'Jonathan'}
 56 |     >>> name.as_dict(False) # add False to hide keys with empty values
 57 |     {'middle': 'A. Harris', 'nickname': 'John', 'last': 'Doe-Ray', 'first': 'Jonathan'}
 58 |     >>> name = HumanName("Dr. Juan Q. Xavier de la Vega III")
 59 |     >>> name2 = HumanName("de la vega, dr. juan Q. xavier III")
 60 |     >>> name == name2
 61 |     True
 62 |     >>> len(name)
 63 |     5
 64 |     >>> list(name)
 65 |     ['Dr.', 'Juan', 'Q. Xavier', 'de la Vega', 'III']
 66 |     >>> name[1:-2]
 67 |     ['Juan', 'Q. Xavier', 'de la Vega']
 68 | 
 69 | 
 70 | Capitalization Support
 71 | ----------------------
 72 | 
 73 | The HumanName class can try to guess the correct capitalization of names
 74 | entered in all upper or lower case. By default, it will not adjust
 75 | the case of names entered in mixed case. To capitalize a mixed case name
 76 | anyway, call `capitalize()` with the parameter `force=True`.
 77 | 
 78 |     Capitalize the name.
 79 | 
 80 |     * bob v. de la macdole-eisenhower phd -> Bob V. de la MacDole-Eisenhower Ph.D.
 81 | 
 82 | .. doctest:: capitalize
 83 | 
 84 |     >>> name = HumanName("bob v. de la macdole-eisenhower phd")
 85 |     >>> name.capitalize()
 86 |     >>> str(name)
 87 |     'Bob V. de la MacDole-Eisenhower Ph.D.'
 88 |     >>> name = HumanName('Shirley Maclaine') # Don't change mixed case names
 89 |     >>> name.capitalize()
 90 |     >>> str(name)
 91 |     'Shirley Maclaine'
 92 |     >>> name.capitalize(force=True)
 93 |     >>> str(name) 
 94 |     'Shirley MacLaine'
 95 | 
 96 | To apply capitalization to all `HumanName` instances, set
 97 | :py:attr:`~nameparser.config.Constants.capitalize_name` to `True`.
 98 | 
 99 | .. doctest:: capitalize_name
100 |     :options: +NORMALIZE_WHITESPACE
101 | 
102 |     >>> from nameparser.config import CONSTANTS
103 |     >>> CONSTANTS.capitalize_name = True
104 |     >>> name = HumanName("bob v. de la macdole-eisenhower phd")
105 |     >>> str(name)
106 |     'Bob V. de la MacDole-Eisenhower Ph.D.'
107 | 
108 | To force the capitalization of mixed case strings on all `HumanName` instances,
109 | set :py:attr:`~nameparser.config.Constants.force_mixed_case_capitalization` to `True`. 
110 | 
111 | .. doctest:: force_mixed_case_capitalization
112 |     :options: +NORMALIZE_WHITESPACE
113 | 
114 |     >>> from nameparser.config import CONSTANTS
115 |     >>> CONSTANTS.force_mixed_case_capitalization = True
116 |     >>> name = HumanName('Shirley Maclaine')
117 |     >>> name.capitalize()
118 |     >>> str(name)
119 |     'Shirley MacLaine'
120 | 
121 | 
122 | Nickname Handling
123 | ------------------
124 | 
125 | The content of parentheses or quotes in the name will be
126 | available from the nickname attribute.
127 | 
128 | .. doctest:: nicknames
129 |     :options: +NORMALIZE_WHITESPACE
130 | 
131 |     >>> name = HumanName('Jonathan "John" A. Smith')
132 |     >>> name
133 |     <HumanName : [
134 |       title: ''
135 |       first: 'Jonathan'
136 |       middle: 'A.'
137 |       last: 'Smith'
138 |       suffix: ''
139 |       nickname: 'John'
140 |     ]>
141 | 
142 | Change the output string with string formatting
143 | -----------------------------------------------
144 | 
145 | The string representation of a `HumanName` instance is controlled by its `string_format` attribute.
146 | The default value, `"{title} {first} {middle} {last} {suffix} ({nickname})"`, includes parentheses
147 | around nicknames. Trailing commas and empty quotes and parentheses are automatically removed if the
148 | name has no nickname pieces.
149 | 
150 | You can change the default formatting for all `HumanName` instances by setting a new
151 | :py:attr:`~nameparser.config.Constants.string_format` value on the shared
152 | :py:class:`~nameparser.config.CONSTANTS` configuration instance.
153 | 
154 | .. doctest:: string format
155 | 
156 |   >>> from nameparser.config import CONSTANTS
157 |   >>> CONSTANTS.string_format = "{title} {first} ({nickname}) {middle} {last} {suffix}"
158 |   >>> name = HumanName('Robert Johnson')
159 |   >>> str(name)
160 |   'Robert Johnson'
161 |   >>> name = HumanName('Robert "Rob" Johnson')
162 |   >>> str(name)
163 |   'Robert (Rob) Johnson'
164 | 
165 | You can control the order and presence of any name fields by changing the
166 | :py:attr:`~nameparser.config.Constants.string_format` attribute of the shared CONSTANTS instance.
167 | Don't want to include nicknames in your output? No problem. Just omit that keyword from the 
168 | `string_format` attribute.
169 | 
170 | .. doctest:: string format
171 | 
172 |   >>> from nameparser.config import CONSTANTS
173 |   >>> CONSTANTS.string_format = "{title} {first} {last}"
174 |   >>> name = HumanName("Dr. Juan Ruiz de la Vega III (Doc Vega)")
175 |   >>> str(name)
176 |   'Dr. Juan de la Vega'
177 | 
178 | 
179 | Initials Support
180 | ----------------
181 | 
182 | The HumanName class can try to get the correct representation of initials.
183 | Initials can be tricky as different format usages exist. 
184 | To exclude any of the name parts from the initials, change the initials format string,
185 | :py:attr:`~nameparser.config.Constants.initials_format`.
186 | Three fields are available in the format: `first`, `middle` and `last`.
187 | 
188 | .. doctest:: initials format
189 | 
190 |   >>> from nameparser.config import CONSTANTS
191 |   >>> CONSTANTS.initials_format = "{first} {middle}"
192 |   >>> HumanName("Doe, John A. Kenneth, Jr.").initials()
193 |   'J. A. K.'
194 |   >>> HumanName("Doe, John A. Kenneth, Jr.", initials_format="{last}, {first}").initials()
195 |   'D., J.'
196 | 
197 | 
198 | Furthermore, the delimiter for the string output can be set through:
199 | :py:attr:`~nameparser.config.Constants.initials_delimiter`
200 | 
201 | .. doctest:: initials delimiter
202 | 
203 |   >>> HumanName("Doe, John A. Kenneth, Jr.", initials_delimiter=";").initials()
204 |   'J; A; K;'
205 |   >>> from nameparser.config import CONSTANTS
206 |   >>> CONSTANTS.initials_delimiter = "."
207 |   >>> HumanName("Doe, John A. Kenneth, Jr.", initials_format="{first}{middle}{last}").initials()
208 |   'J.A.K.D.'
209 | 
210 | To get a list representation of the initials, use :py:meth:`~nameparser.HumanName.initials_list`.
211 | This function is unaffected by :py:attr:`~nameparser.config.Constants.initials_format`
212 | 
213 | .. doctest:: list format
214 | 
215 |   >>> HumanName("Doe, John A. Kenneth, Jr.", initials_delimiter=";").initials_list()
216 |   ['J', 'A', 'K', 'D']
217 | 


--------------------------------------------------------------------------------
/docs/release_log.rst:
--------------------------------------------------------------------------------
  1 | Release Log
  2 | ===========
  3 | * 1.1.3 - September 20, 2023
  4 |     - Fix case when the same prefix appears twice in the name (#147)
  5 | * 1.1.2 - November 13, 2022
  6 |     - Add support for attributes in constructor (#140)
  7 |     - Make HumanName instances hashable (#138)
  8 |     - Update repr for names with single quotes (#137)
  9 | * 1.1.1 - January 28, 2022
 10 |     - Fix bug in is_suffix handling of lists (#129)
 11 | * 1.1.0 - January 3, 2022
 12 |     - Add initials support (#128)
 13 |     - Add more titles and prefixes (#120, #127, #128, #119)
 14 | * 1.0.6 - February 8, 2020
 15 |     - Fix Python 3.8 syntax error (#104)
 16 | * 1.0.5 - Dec 12, 2019
 17 |     - Fix suffix parsing bug in comma parts (#98)
 18 |     - Fix deprecation warning on Python 3.7 (#94)
 19 |     - Improved capitalization support of mixed case names (#90)
 20 |     - Remove "elder" from titles (#96)
 21 |     - Add post-nominal list from Wikipedia to suffixes (#93)
 22 | * 1.0.4 - June 26, 2019
 23 |     - Better nickname handling of multiple single quotes (#86)
 24 |     - full_name attribute now returns formatted string output instead of original string (#87)
 25 | * 1.0.3 - April 18, 2019
 26 |     - fix sys.stdin usage when stdin doesn't exist (#82)
 27 |     - support for escaping log entry arguments (#84)
 28 | * 1.0.2 - Oct 26, 2018
 29 |     - Fix handling of only nickname and last name (#78)
 30 | * 1.0.1 - August 30, 2018
 31 |     - Fix overzealous regex for "Ph. D." (#43)
 32 |     - Add `surnames` attribute as aggregate of middle and last names
 33 | * 1.0.0 - August 30, 2018
 34 |     - Fix support for nicknames in single quotes (#74)
 35 |     - Change prefix handling to support prefixes on first names (#60)
 36 |     - Fix prefix capitalization when not part of lastname (#70)
 37 |     - Handle erroneous space in "Ph. D." (#43)
 38 | * 0.5.8 - August 19, 2018
 39 |     - Add "Junior" to suffixes (#76)
 40 |     - Add "dra" and "srta" to titles (#77)
 41 | * 0.5.7 - June 16, 2018
 42 |     - Fix doc link (#73)
 43 |     - Fix handling of "do" and "dos" Portuguese prefixes (#71, #72)
 44 | * 0.5.6 - January 15, 2018
 45 |     - Fix python version check (#64)
 46 | * 0.5.5 - January 10, 2018
 47 |     - Support J.D. as suffix and Wm. as title
 48 | * 0.5.4 - December 10, 2017
 49 |     - Add Dr to suffixes (#62)
 50 |     - Add the full set of Italian derivatives from "di" (#59)
 51 |     - Add parameter to specify the encoding of strings added to constants, use 'UTF-8' as fallback (#67)
 52 |     - Fix handling of names composed entirely of conjunctions (#66)
 53 | * 0.5.3 - June 27, 2017
 54 |     - Remove emojis from initial string by default with option to include emojis (#58)
 55 | * 0.5.2 - March 19, 2017
 56 |     - Added names scraped from VIAF data, thanks daryanypl (#57)
 57 | * 0.5.1 - August 12, 2016
 58 |     - Fix error for names that end with conjunction (#54)
 59 | * 0.5.0 - August 4, 2016
 60 |     - Refactor join_on_conjunctions(), fix #53
 61 | * 0.4.1 - July 25, 2016
 62 |     - Remove "bishop" from titles because it also could be a first name
 63 |     - Fix handling of lastname prefixes with periods, e.g. "Jane St. John" (#50)
 64 | * 0.4.0 - June 2, 2016
 65 |     - Remove "CONSTANTS.suffixes", replaced by "suffix_acronyms" and "suffix_not_acronyms" (#49)
 66 |     - Add "du" to prefixes
 67 |     - Add "sheikh" variations to titles
 68 |     - Add parameter to force capitalization of mixed case strings
 69 | * 0.3.16 - March 24, 2016
 70 |     - Clarify LGPL licence version (#47)
 71 |     - Skip pickle tests if pickle not installed (#48)
 72 | * 0.3.15 - March 21, 2016
 73 |     - Fix string format when `empty_attribute_default = None` (#45)
 74 |     - Include tests in release source tarball (#46)
 75 | * 0.3.14 - March 18, 2016
 76 |     - Add `CONSTANTS.empty_attribute_default` to customize value returned for empty attributes (#44)
 77 | * 0.3.13 - March 14, 2016
 78 |     - Improve string format handling (#41)
 79 | * 0.3.12 - March 13, 2016
 80 |     - Fix first name clash with suffixes (#42)
 81 |     - Fix encoding of constants added via the python shell
 82 |     - Add "MSC" to suffixes, fix #41
 83 | * 0.3.11 - October 17, 2015
 84 |     - Fix bug capitalization exceptions (#39)
 85 | * 0.3.10 - September 19, 2015
 86 |     - Fix encoding of byte strings on python 2.x (#37)
 87 | * 0.3.9 - September 5, 2015
 88 |     - Separate suffixes that are acronyms to handle periods differently, fixes #29, #21
 89 |     - Don't find titles after first name is filled, fixes (#27)
 90 |     - Add "chair" titles (#37)
 91 | * 0.3.8 - September 2, 2015
 92 |     - Use regex to check for roman numerals at end of name (#36)
 93 |     - Add DVM to suffixes
 94 | * 0.3.7 - August 30, 2015
 95 |     - Speed improvement, 3x faster
 96 |     - Make HumanName instances pickleable
 97 | * 0.3.6 - August 6, 2015
 98 |     - Fix strings that start with conjunctions (#20)
 99 |     - handle assigning lists of names to a name attribute
100 |     - support dictionary-like assignment of name attributes
101 | * 0.3.5 - August 4, 2015
102 |     - Fix handling of string encoding in python 2.x (#34)
103 |     - Add support for dictionary key access, e.g. name['first']
104 |     - add 'santa' to prefixes, add 'cpa', 'csm', 'phr', 'pmp' to suffixes (#35)
105 |     - Fix prefixes before multi-part last names (#23)
106 |     - Fix capitalization bug (#30)
107 | * 0.3.4 - March 1, 2015
108 |     - Fix #24, handle first name also a prefix
109 |     - Fix #26, last name comma format when lastname is also a title
110 | * 0.3.3 - Aug 4, 2014
111 |     - Allow suffixes to be chained (#8)
112 |     - Handle trailing suffix in last name comma format (#3). Removes support for titles
113 |       with periods but no spaces in them, e.g. "Lt.Gen.". (#21)
114 | * 0.3.2 - July 16, 2014
115 |     - Retain original string in "original" attribute.
116 |     - Collapse white space when using custom string format.
117 |     - Fix #19, single comma name format may have trailing suffix
118 | * 0.3.1 - July 5, 2014
119 |     - Fix Pypi package, include new config module.
120 | * 0.3.0 - July 4, 2014
121 |     - Refactor configuration to simplify modifications to constants (backwards incompatible)
122 |     - use unicode_literals to simplify Python 2 & 3 support.
123 |     - Generate documentation using sphinx and host on readthedocs.
124 | * 0.2.10 - May 6, 2014
125 |     - If name is only a title and one part, assume it's a last name instead of a first name, with exceptions for some titles like 'Sir'. (`#7 <https://github.com/derek73/python-nameparser/issues/7>`_).
126 |     - Add some judicial and other common titles. (#9)
127 | * 0.2.9 - Apr 1, 2014
128 |     - Add a new nickname attribute containing anything in parenthesis or double quotes (`Issue 33 <https://code.google.com/p/python-nameparser/issues/detail?id=33>`_).
129 | * 0.2.8 - Oct 25, 2013
130 |     - Add support for Python 3.3+. Thanks to @corbinbs.
131 | * 0.2.7 - Feb 13, 2013
132 |     - Fix bug with multiple conjunctions in title
133 |     - add legal and crown titles
134 | * 0.2.6 - Feb 12, 2013
135 |     - Fix python 2.6 import error on logging.NullHandler
136 | * 0.2.5 - Feb 11, 2013
137 |     - Set logging handler to NullHandler
138 |     - Remove 'ben' from PREFIXES because it's more common as a name than a prefix.
139 |     - Deprecate BlankHumanNameError. Do not raise exceptions if full_name is empty string.
140 | * 0.2.4 - Feb 10, 2013
141 |     - Adjust logging, don't set basicConfig. Fix `Issue 10 <https://code.google.com/p/python-nameparser/issues/detail?id=10>`_ and `Issue 26 <https://code.google.com/p/python-nameparser/issues/detail?id=26>`_.
142 |     - Fix handling of single lower case initials that are also conjunctions, e.g. "john e smith". Re `Issue 11 <https://code.google.com/p/python-nameparser/issues/detail?id=11>`_.
143 |     - Fix handling of initials with no space separation, e.g. "E.T. Jones". Fix #11.
144 |     - Do not remove period from first name, when present.
145 |     - Remove 'e' from PREFIXES because it is handled as a conjunction.
146 |     - Python 2.7+ required to run the tests. Mark known failures.
147 |     - tests/test.py can now take an optional name argument that will return repr() for that name.
148 | * 0.2.3 - Fix overzealous "Mac" regex
149 | * 0.2.2 - Fix parsing error
150 | * 0.2.0
151 |     - Significant refactor of parsing logic. Handle conjunctions and prefixes before
152 |       parsing into attribute buckets.
153 |     - Support attribute overriding by assignment.
154 |     - Support multiple titles.
155 |     - Lowercase titles constants to fix bug with comparison.
156 |     - Move documentation to README.rst, add release log.
157 | * 0.1.4 - Use set() in constants for improved speed. setuptools compatibility - sketerpot
158 | * 0.1.3 - Add capitalization feature - twotwo
159 | * 0.1.2 - Add slice support
160 | 
161 | 


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # Nameparser documentation build configuration file, created by
  4 | # sphinx-quickstart on Fri May 16 01:29:58 2014.
  5 | #
  6 | # This file is execfile()d with the current directory set to its
  7 | # containing dir.
  8 | #
  9 | # Note that not all possible configuration values are present in this
 10 | # autogenerated file.
 11 | #
 12 | # All configuration values have a default; values that are commented out
 13 | # serve to show the default.
 14 | 
 15 | import sys
 16 | import os
 17 | from datetime import date
 18 | 
 19 | # If extensions (or modules to document with autodoc) are in another directory,
 20 | # add these directories to sys.path here. If the directory is relative to the
 21 | # documentation root, use os.path.abspath to make it absolute, like shown here.
 22 | sys.path.insert(0, os.path.abspath('..'))
 23 | import nameparser
 24 | 
 25 | # -- General configuration ------------------------------------------------
 26 | 
 27 | # If your documentation needs a minimal Sphinx version, state it here.
 28 | #needs_sphinx = '1.0'
 29 | 
 30 | # Add any Sphinx extension module names here, as strings. They can be
 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 32 | # ones.
 33 | extensions = [
 34 |     'sphinx.ext.autodoc',
 35 |     'sphinx.ext.doctest',
 36 |     'sphinx.ext.viewcode',
 37 | ]
 38 | 
 39 | # Add any paths that contain templates here, relative to this directory.
 40 | templates_path = ['_templates']
 41 | 
 42 | # The suffix of source filenames.
 43 | source_suffix = '.rst'
 44 | 
 45 | # The encoding of source files.
 46 | #source_encoding = 'utf-8-sig'
 47 | 
 48 | # The master toctree document.
 49 | master_doc = 'index'
 50 | 
 51 | # General information about the project.
 52 | project = u'Nameparser'
 53 | copyright = u'{:%Y}, Derek Gulbranson'.format(date.today())
 54 | 
 55 | # The version info for the project you're documenting, acts as replacement for
 56 | # |version| and |release|, also used in various other places throughout the
 57 | # built documents.
 58 | #
 59 | # The short X.Y version.
 60 | version = nameparser.__version__
 61 | # The full version, including alpha/beta/rc tags.
 62 | release = version
 63 | 
 64 | # The language for content autogenerated by Sphinx. Refer to documentation
 65 | # for a list of supported languages.
 66 | #language = None
 67 | 
 68 | # There are two options for replacing |today|: either, you set today to some
 69 | # non-false value, then it is used:
 70 | #today = ''
 71 | # Else, today_fmt is used as the format for a strftime call.
 72 | #today_fmt = '%B %d, %Y'
 73 | 
 74 | # List of patterns, relative to source directory, that match files and
 75 | # directories to ignore when looking for source files.
 76 | exclude_patterns = ['_build']
 77 | 
 78 | # The reST default role (used for this markup: `text`) to use for all
 79 | # documents.
 80 | #default_role = None
 81 | 
 82 | # If true, '()' will be appended to :func: etc. cross-reference text.
 83 | #add_function_parentheses = True
 84 | 
 85 | # If true, the current module name will be prepended to all description
 86 | # unit titles (such as .. function::).
 87 | #add_module_names = True
 88 | 
 89 | # If true, sectionauthor and moduleauthor directives will be shown in the
 90 | # output. They are ignored by default.
 91 | #show_authors = False
 92 | 
 93 | # The name of the Pygments (syntax highlighting) style to use.
 94 | pygments_style = 'sphinx'
 95 | 
 96 | # A list of ignored prefixes for module index sorting.
 97 | #modindex_common_prefix = []
 98 | 
 99 | # If true, keep warnings as "system message" paragraphs in the built documents.
100 | #keep_warnings = False
101 | 
102 | 
103 | # -- Options for HTML output ----------------------------------------------
104 | 
105 | # The theme to use for HTML and HTML Help pages.  See the documentation for
106 | # a list of builtin themes.
107 | html_theme = 'alabaster'
108 | 
109 | import alabaster
110 | 
111 | html_theme_path = [alabaster.get_path()]
112 | extensions += ['alabaster']
113 | html_theme = 'alabaster'
114 | html_sidebars = {
115 |     '**': [
116 |         'about.html',
117 |         'navigation.html',
118 |         'relations.html',
119 |         'searchbox.html',
120 |         'donate.html',
121 |     ]
122 | }
123 | html_theme_options = {
124 |     'github_user': 'derek73',
125 |     'github_repo': 'python-nameparser',
126 |     'travis_button': True,
127 |     'analytics_id': 'UA-339019-11',
128 | }
129 | 
130 | # Theme options are theme-specific and customize the look and feel of a theme
131 | # further.  For a list of options available for each theme, see the
132 | # documentation.
133 | #html_theme_options = {}
134 | 
135 | # Add any paths that contain custom themes here, relative to this directory.
136 | #html_theme_path = []
137 | 
138 | # The name for this set of Sphinx documents.  If None, it defaults to
139 | # "<project> v<release> documentation".
140 | #html_title = None
141 | 
142 | # A shorter title for the navigation bar.  Default is the same as html_title.
143 | #html_short_title = None
144 | 
145 | # The name of an image file (relative to this directory) to place at the top
146 | # of the sidebar.
147 | #html_logo = None
148 | 
149 | # The name of an image file (within the static path) to use as favicon of the
150 | # docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
151 | # pixels large.
152 | #html_favicon = None
153 | 
154 | # Add any paths that contain custom static files (such as style sheets) here,
155 | # relative to this directory. They are copied after the builtin static files,
156 | # so a file named "default.css" will overwrite the builtin "default.css".
157 | html_static_path = ['_static']
158 | 
159 | # Add any extra paths that contain custom files (such as robots.txt or
160 | # .htaccess) here, relative to this directory. These files are copied
161 | # directly to the root of the documentation.
162 | #html_extra_path = []
163 | 
164 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
165 | # using the given strftime format.
166 | #html_last_updated_fmt = '%b %d, %Y'
167 | 
168 | # If true, SmartyPants will be used to convert quotes and dashes to
169 | # typographically correct entities.
170 | #html_use_smartypants = True
171 | 
172 | # Custom sidebar templates, maps document names to template names.
173 | #html_sidebars = {}
174 | 
175 | # Additional templates that should be rendered to pages, maps page names to
176 | # template names.
177 | #html_additional_pages = {}
178 | 
179 | # If false, no module index is generated.
180 | #html_domain_indices = True
181 | 
182 | # If false, no index is generated.
183 | #html_use_index = True
184 | 
185 | # If true, the index is split into individual pages for each letter.
186 | #html_split_index = False
187 | 
188 | # If true, links to the reST sources are added to the pages.
189 | #html_show_sourcelink = True
190 | 
191 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
192 | #html_show_sphinx = True
193 | 
194 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
195 | #html_show_copyright = True
196 | 
197 | # If true, an OpenSearch description file will be output, and all pages will
198 | # contain a <link> tag referring to it.  The value of this option must be the
199 | # base URL from which the finished HTML is served.
200 | #html_use_opensearch = ''
201 | 
202 | # This is the file name suffix for HTML files (e.g. ".xhtml").
203 | #html_file_suffix = None
204 | 
205 | # Output file base name for HTML help builder.
206 | htmlhelp_basename = 'Nameparserdoc'
207 | 
208 | 
209 | # -- Options for LaTeX output ---------------------------------------------
210 | 
211 | latex_elements = {
212 | # The paper size ('letterpaper' or 'a4paper').
213 | #'papersize': 'letterpaper',
214 | 
215 | # The font size ('10pt', '11pt' or '12pt').
216 | #'pointsize': '10pt',
217 | 
218 | # Additional stuff for the LaTeX preamble.
219 | #'preamble': '',
220 | }
221 | 
222 | # Grouping the document tree into LaTeX files. List of tuples
223 | # (source start file, target name, title,
224 | #  author, documentclass [howto, manual, or own class]).
225 | latex_documents = [
226 |   ('index', 'Nameparser.tex', u'Nameparser Documentation',
227 |    u'Derek Gulbranson', 'manual'),
228 | ]
229 | 
230 | # The name of an image file (relative to this directory) to place at the top of
231 | # the title page.
232 | #latex_logo = None
233 | 
234 | # For "manual" documents, if this is true, then toplevel headings are parts,
235 | # not chapters.
236 | #latex_use_parts = False
237 | 
238 | # If true, show page references after internal links.
239 | #latex_show_pagerefs = False
240 | 
241 | # If true, show URL addresses after external links.
242 | #latex_show_urls = False
243 | 
244 | # Documents to append as an appendix to all manuals.
245 | #latex_appendices = []
246 | 
247 | # If false, no module index is generated.
248 | #latex_domain_indices = True
249 | 
250 | 
251 | # -- Options for manual page output ---------------------------------------
252 | 
253 | # One entry per manual page. List of tuples
254 | # (source start file, name, description, authors, manual section).
255 | man_pages = [
256 |     ('index', 'nameparser', u'Nameparser Documentation',
257 |      [u'Derek Gulbranson'], 1)
258 | ]
259 | 
260 | # If true, show URL addresses after external links.
261 | #man_show_urls = False
262 | 
263 | 
264 | # -- Options for Texinfo output -------------------------------------------
265 | 
# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
#  dir menu entry, description, category)
texinfo_documents = [
    ('index', 'Nameparser', u'Nameparser Documentation',
     u'Derek Gulbranson', 'Nameparser',
     # One-line description shown in the Texinfo dir entry.
     'A simple Python module for parsing human names into components.',
     'Miscellaneous'),
]
274 | 
275 | # Documents to append as an appendix to all manuals.
276 | #texinfo_appendices = []
277 | 
278 | # If false, no module index is generated.
279 | #texinfo_domain_indices = True
280 | 
281 | # How to display URL addresses: 'footnote', 'no', or 'inline'.
282 | #texinfo_show_urls = 'footnote'
283 | 
284 | # If true, do not generate a @detailmenu in the "Top" node's menu.
285 | #texinfo_no_detailmenu = False
286 | 
287 | doctest_global_setup = """from nameparser import HumanName
288 | from nameparser.config import CONSTANTS, Constants
289 | CONSTANTS = Constants()
290 | """
291 | 


--------------------------------------------------------------------------------
/nameparser/config/__init__.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | The :py:mod:`nameparser.config` module manages the configuration of the
  4 | nameparser. 
  5 | 
  6 | A module-level instance of :py:class:`~nameparser.config.Constants` is created
  7 | and used by default for all HumanName instances. You can adjust the entire module's
  8 | configuration by importing this instance and changing it.
  9 | 
 10 | ::
 11 | 
 12 |     >>> from nameparser.config import CONSTANTS
 13 |     >>> CONSTANTS.titles.remove('hon').add('chemistry','dean') # doctest: +ELLIPSIS
 14 |     SetManager(set([u'msgt', ..., u'adjutant']))
 15 | 
 16 | You can also adjust the configuration of individual instances by passing
 17 | ``None`` as the second argument upon instantiation.
 18 | 
 19 | ::
 20 | 
 21 |     >>> from nameparser import HumanName
 22 |     >>> hn = HumanName("Dean Robert Johns", None)
 23 |     >>> hn.C.titles.add('dean') # doctest: +ELLIPSIS
 24 |     SetManager(set([u'msgt', ..., u'adjutant']))
 25 |     >>> hn.parse_full_name() # need to run this again after config changes
 26 | 
 27 | **Potential Gotcha**: If you do not pass ``None`` as the second argument,
 28 | ``hn.C`` will be a reference to the module config, possibly yielding 
 29 | unexpected results. See `Customizing the Parser <customize.html>`_.
 30 | """
 31 | from __future__ import unicode_literals
 32 | import sys
 33 | try:
 34 |     # Python 3.3+
 35 |     from collections.abc import Set
 36 | except ImportError:
 37 |     from collections import Set
 38 | 
 39 | from nameparser.util import binary_type
 40 | from nameparser.util import lc
 41 | from nameparser.config.prefixes import PREFIXES
 42 | from nameparser.config.capitalization import CAPITALIZATION_EXCEPTIONS
 43 | from nameparser.config.conjunctions import CONJUNCTIONS
 44 | from nameparser.config.suffixes import SUFFIX_ACRONYMS
 45 | from nameparser.config.suffixes import SUFFIX_NOT_ACRONYMS
 46 | from nameparser.config.titles import TITLES
 47 | from nameparser.config.titles import FIRST_NAME_TITLES
 48 | from nameparser.config.regexes import REGEXES
 49 | 
# Fallback codec used by SetManager.add_with_encoding() to decode byte strings
# when no explicit encoding is passed and sys.stdin provides none.
DEFAULT_ENCODING = 'UTF-8'
 51 | 
 52 | 
 53 | class SetManager(Set):
 54 |     '''
 55 |     Easily add and remove config variables per module or instance. Subclass of
 56 |     ``collections.abc.Set``.
 57 | 
 58 |     Only special functionality beyond that provided by set() is
 59 |     to normalize constants for comparison (lower case, no periods)
 60 |     when they are add()ed and remove()d and allow passing multiple 
 61 |     string arguments to the :py:func:`add()` and :py:func:`remove()` methods.
 62 | 
 63 |     '''
 64 | 
 65 |     def __init__(self, elements):
 66 |         self.elements = set(elements)
 67 | 
 68 |     def __call__(self):
 69 |         return self.elements
 70 | 
 71 |     def __repr__(self):
 72 |         return "SetManager({})".format(self.elements)  # used for docs
 73 | 
 74 |     def __iter__(self):
 75 |         return iter(self.elements)
 76 | 
 77 |     def __contains__(self, value):
 78 |         return value in self.elements
 79 | 
 80 |     def __len__(self):
 81 |         return len(self.elements)
 82 | 
 83 |     def next(self):
 84 |         return self.__next__()
 85 | 
 86 |     def __next__(self):
 87 |         if self.count >= len(self.elements):
 88 |             self.count = 0
 89 |             raise StopIteration
 90 |         else:
 91 |             c = self.count
 92 |             self.count = c + 1
 93 |             return getattr(self, self.elements[c]) or next(self)
 94 | 
 95 |     def add_with_encoding(self, s, encoding=None):
 96 |         """
 97 |         Add the lower case and no-period version of the string to the set. Pass an
 98 |         explicit `encoding` parameter to specify the encoding of binary strings that
 99 |         are not DEFAULT_ENCODING (UTF-8).
100 |         """
101 |         stdin_encoding = None
102 |         if sys.stdin:
103 |             stdin_encoding = sys.stdin.encoding
104 |         encoding = encoding or stdin_encoding or DEFAULT_ENCODING
105 |         if type(s) == binary_type:
106 |             s = s.decode(encoding)
107 |         self.elements.add(lc(s))
108 | 
109 |     def add(self, *strings):
110 |         """
111 |         Add the lower case and no-period version of the string arguments to the set.
112 |         Can pass a list of strings. Returns ``self`` for chaining.
113 |         """
114 |         [self.add_with_encoding(s) for s in strings]
115 |         return self
116 | 
117 |     def remove(self, *strings):
118 |         """
119 |         Remove the lower case and no-period version of the string arguments from the set.
120 |         Returns ``self`` for chaining.
121 |         """
122 |         [self.elements.remove(lc(s)) for s in strings if lc(s) in self.elements]
123 |         return self
124 | 
125 | 
class TupleManager(dict):
    '''
    A dictionary with dot.notation access. Subclass of ``dict``. Makes the tuple constants 
    more friendly.

    Reading a missing attribute returns ``None`` instead of raising
    ``AttributeError``; deleting a missing attribute raises ``KeyError``.
    '''

    def __getattr__(self, attr):
        # Only reached when normal attribute lookup fails, so real dict
        # methods still win over keys with the same name.
        value = self.get(attr)
        return value

    def __setattr__(self, name, value):
        # Attribute assignment writes straight into the mapping.
        dict.__setitem__(self, name, value)

    def __delattr__(self, name):
        # Attribute deletion removes the key from the mapping.
        dict.__delitem__(self, name)

    # -- pickle / copy support ---------------------------------------------
    # Spelled out explicitly because __getattr__ answers every unknown
    # attribute lookup with None.

    def __reduce__(self):
        # Rebuild as an empty TupleManager, then feed the items back through
        # __setstate__.
        snapshot = self.__getstate__()
        return (TupleManager, (), snapshot)

    def __getstate__(self):
        # A plain dict copy of the current items.
        return dict(self)

    def __setstate__(self, state):
        # Re-running dict's initialiser merges ``state`` into this mapping.
        self.__init__(state)
145 | 
146 | 
class Constants(object):
    """
    An instance of this class holds all of the configuration constants for the parser.

    :param set prefixes: 
        :py:attr:`prefixes` wrapped with :py:class:`SetManager`.
    :param set titles: 
        :py:attr:`titles` wrapped with :py:class:`SetManager`.
    :param set first_name_titles: 
        :py:attr:`~titles.FIRST_NAME_TITLES` wrapped with :py:class:`SetManager`.
    :param set suffix_acronyms: 
        :py:attr:`~suffixes.SUFFIX_ACRONYMS`  wrapped with :py:class:`SetManager`.
    :param set suffix_not_acronyms: 
        :py:attr:`~suffixes.SUFFIX_NOT_ACRONYMS`  wrapped with :py:class:`SetManager`.
    :param set conjunctions: 
        :py:attr:`conjunctions`  wrapped with :py:class:`SetManager`.
    :type capitalization_exceptions: tuple or dict
    :param capitalization_exceptions: 
        :py:attr:`~capitalization.CAPITALIZATION_EXCEPTIONS` wrapped with :py:class:`TupleManager`.
    :type regexes: tuple or dict
    :param regexes: 
        :py:attr:`regexes`  wrapped with :py:class:`TupleManager`.
    """

    string_format = "{title} {first} {middle} {last} {suffix} ({nickname})"
    """
    The default string format used for all new `HumanName` instances.
    """

    initials_format = "{first} {middle} {last}"
    """
    The default initials format used for all new `HumanName` instances.
    """

    initials_delimiter = "."
    """
    The default initials delimiter used for all new `HumanName` instances.
    Will be used to add a delimiter between each initial.
    """

    empty_attribute_default = ''
    """
    Default return value for empty attributes.

    .. doctest::

        >>> from nameparser.config import CONSTANTS
        >>> CONSTANTS.empty_attribute_default = None
        >>> name = HumanName("John Doe")
        >>> name.title is None
        True
        >>> name.first
        'John'

    """

    capitalize_name = False
    """
    If set, applies :py:meth:`~nameparser.parser.HumanName.capitalize` to
    :py:class:`~nameparser.parser.HumanName` instance.

    .. doctest::

        >>> from nameparser.config import CONSTANTS
        >>> CONSTANTS.capitalize_name = True
        >>> name = HumanName("bob v. de la macdole-eisenhower phd")
        >>> str(name)
        'Bob V. de la MacDole-Eisenhower Ph.D.'

    """

    force_mixed_case_capitalization = False
    """
    If set, forces the capitalization of mixed case strings when
    :py:meth:`~nameparser.parser.HumanName.capitalize` is called.

    .. doctest::

        >>> from nameparser.config import CONSTANTS
        >>> CONSTANTS.force_mixed_case_capitalization = True
        >>> name = HumanName('Shirley Maclaine')
        >>> name.capitalize()
        >>> str(name)
        'Shirley MacLaine'

    """

    def __init__(self,
                 prefixes=PREFIXES,
                 suffix_acronyms=SUFFIX_ACRONYMS,
                 suffix_not_acronyms=SUFFIX_NOT_ACRONYMS,
                 titles=TITLES,
                 first_name_titles=FIRST_NAME_TITLES,
                 conjunctions=CONJUNCTIONS,
                 capitalization_exceptions=CAPITALIZATION_EXCEPTIONS,
                 regexes=REGEXES
                 ):
        # Each argument is wrapped in a fresh manager object, so editing one
        # Constants instance does not touch the objects passed in.
        self.prefixes = SetManager(prefixes)
        self.suffix_acronyms = SetManager(suffix_acronyms)
        self.suffix_not_acronyms = SetManager(suffix_not_acronyms)
        self.titles = SetManager(titles)
        self.first_name_titles = SetManager(first_name_titles)
        self.conjunctions = SetManager(conjunctions)
        self.capitalization_exceptions = TupleManager(capitalization_exceptions)
        self.regexes = TupleManager(regexes)
        # Cache for the suffixes_prefixes_titles property.
        self._pst = None

    @property
    def suffixes_prefixes_titles(self):
        """
        Union of :py:attr:`prefixes`, :py:attr:`suffix_acronyms`,
        :py:attr:`suffix_not_acronyms` and :py:attr:`titles`.

        The union is computed on first access and cached; the cache is not
        refreshed when the individual sets are changed afterwards.
        """
        if not self._pst:
            self._pst = self.prefixes | self.suffix_acronyms | self.suffix_not_acronyms | self.titles
        return self._pst

    def __repr__(self):
        return "<Constants() instance>"

    def __setstate__(self, state):
        """
        Restore an instance from the mapping produced by :py:meth:`__getstate__`.

        Every saved value is assigned back to the attribute of the same name.
        """
        # Start with an empty cache; the union is rebuilt on first access.
        self._pst = None
        cls = type(self)
        for name, value in state.items():
            # __getstate__ also records read-only computed properties
            # (suffixes_prefixes_titles); they cannot and need not be set.
            if isinstance(getattr(cls, name, None), property):
                continue
            setattr(self, name, value)

    def __getstate__(self):
        """Return a dict of every public attribute name mapped to its current value."""
        public = [name for name in dir(self) if not name.startswith('_')]
        return dict((name, getattr(self, name)) for name in public)
270 | 
#: A module-level instance of the :py:class:`Constants()` class.
#: Provides a common instance for the module to share
#: to easily adjust configuration for the entire module.
#: Built with the default arguments of :py:class:`Constants`.
#: See `Customizing the Parser with Your Own Configuration <customize.html>`_.
CONSTANTS = Constants()
276 | 


--------------------------------------------------------------------------------
/docs/customize.rst:
--------------------------------------------------------------------------------
  1 | Customizing the Parser with Your Own Configuration
  2 | ==================================================
  3 | 
  4 | Recognition of titles, prefixes, suffixes and conjunctions is handled by
  5 | matching the lower case characters of a name piece with pre-defined sets
  6 | of strings located in :py:mod:`nameparser.config`. You can adjust
  7 | these predefined sets to help fine tune the parser for your dataset.
  8 | 
  9 | Changing the Parser Constants
 10 | -----------------------------
 11 | 
 12 | There are a few ways to adjust the parser configuration depending on your
 13 | needs. The config is available in two places.
 14 | 
 15 | The first is via ``from nameparser.config import CONSTANTS``.
 16 | 
 17 | .. doctest::
 18 | 
 19 |     >>> from nameparser.config import CONSTANTS
 20 |     >>> CONSTANTS
 21 |     <Constants() instance>
 22 | 
 23 | The other is the ``C`` attribute of a ``HumanName`` instance, e.g.
 24 | ``hn.C``.
 25 | 
 26 | .. doctest::
 27 | 
 28 |     >>> from nameparser import HumanName
 29 |     >>> hn = HumanName("Dean Robert Johns")
 30 |     >>> hn.C
 31 |     <Constants() instance>
 32 | 
 33 | Both places are usually a reference to the same shared module-level
 34 | :py:class:`~nameparser.config.CONSTANTS` instance, depending on how you
 35 | instantiate the :py:class:`~nameparser.parser.HumanName` class (see below).
 36 | 
 37 | 
 38 | 
 39 | Editable attributes of nameparser.config.CONSTANTS
 40 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 41 | 
 42 | * :py:data:`~nameparser.config.titles.TITLES` - Pieces that come before the name. Includes all `first_name_titles`. Cannot include things that may be first names.
 43 | * :py:data:`~nameparser.config.FIRST_NAME_TITLES` - Titles that, when followed by a single name, mark that name as a first name, e.g. "King David".
 44 | * :py:data:`~nameparser.config.SUFFIX_ACRONYMS` - Pieces that come at the end of the name that may or may not have periods separating the letters, e.g. "m.d.".
 45 | * :py:data:`~nameparser.config.SUFFIX_NOT_ACRONYMS` - Pieces that come at the end of the name that never have periods separating the letters, e.g. "Jr.".
 46 | * :py:data:`~nameparser.config.conjunctions.CONJUNCTIONS` - Connectors like "and" that join the preceding piece to the following piece.
 47 | * :py:data:`~nameparser.config.prefixes.PREFIXES` - Connectors like "del" and "bin" that join to the following piece but not the preceding, similar to titles but can appear anywhere in the name.
 48 | * :py:data:`~nameparser.config.CAPITALIZATION_EXCEPTIONS` - Dictionary of pieces that do not capitalize the first letter, e.g. "Ph.D".
 49 | * :py:data:`~nameparser.config.regexes.REGEXES` - Regular expressions used to find words, initials, nicknames, etc.
 50 | 
 51 | Each set of constants comes with :py:func:`~nameparser.config.SetManager.add` and :py:func:`~nameparser.config.SetManager.remove` methods for tuning
 52 | the constants for your project. These methods automatically lower case and
 53 | remove punctuation to normalize them for comparison.
 54 | 
 55 | Other editable attributes
 56 | ~~~~~~~~~~~~~~~~~~~~~~~~~~
 57 | 
 58 | * :py:obj:`~nameparser.config.Constants.string_format` - controls output from `str()`
 59 | * :py:obj:`~nameparser.config.Constants.empty_attribute_default` - value returned by empty attributes, defaults to empty string
 60 | * :py:obj:`~nameparser.config.Constants.capitalize_name` - If set, applies :py:meth:`~nameparser.parser.HumanName.capitalize` to :py:class:`~nameparser.parser.HumanName` instance.
 61 | * :py:obj:`~nameparser.config.Constants.force_mixed_case_capitalization` - If set, forces the capitalization of mixed case strings when :py:meth:`~nameparser.parser.HumanName.capitalize` is called.
 62 | 
 63 | 
 64 | 
 65 | Parser Customization Examples
 66 | -----------------------------
 67 | 
 68 | Removing a Title
 69 | ~~~~~~~~~~~~~~~~
 70 | 
 71 | Take a look at the :py:mod:`nameparser.config` documentation to see what's
 72 | in the constants. Here's a quick walk through of some examples where you
 73 | might want to adjust them.
 74 | 
 75 | "Hon" is a common abbreviation for "Honorable", a title used when
 76 | addressing judges, and is included in the default titles constants. This
 77 | means it will never be considered a first name, because titles are the
 78 | pieces before first names.
 79 | 
 80 | But "Hon" is also sometimes a first name. If your dataset contains more
 81 | "Hon"s than "Honorable"s, you may wish to remove it from the titles
 82 | constant so that "Hon" can be parsed as a first name.
 83 | 
 84 | .. doctest::
 85 |     :options: +ELLIPSIS, +NORMALIZE_WHITESPACE
 86 | 
 87 |     >>> from nameparser import HumanName
 88 |     >>> hn = HumanName("Hon Solo")
 89 |     >>> hn
 90 |     <HumanName : [
 91 |       title: 'Hon'
 92 |       first: ''
 93 |       middle: ''
 94 |       last: 'Solo'
 95 |       suffix: ''
 96 |       nickname: ''
 97 |     ]>
 98 |     >>> from nameparser.config import CONSTANTS
 99 |     >>> CONSTANTS.titles.remove('hon')
100 |     SetManager({'right', ..., 'tax'})
101 |     >>> hn = HumanName("Hon Solo")
102 |     >>> hn
103 |     <HumanName : [
104 |       title: ''
105 |       first: 'Hon'
106 |       middle: ''
107 |       last: 'Solo'
108 |       suffix: ''
109 |       nickname: ''
110 |     ]>
111 | 
112 | 
113 | If you don't want to detect any titles at all, you can remove all of them:
114 | 
115 |     >>> CONSTANTS.titles.remove(*CONSTANTS.titles)
116 | 
117 | 
118 | Adding a Title
119 | ~~~~~~~~~~~~~~~~
120 | 
121 | You can also pass a ``Constants`` instance to ``HumanName`` on instantiation.
122 | 
123 | "Dean" is a common first name so it is not included in the default titles
124 | constant. But in some contexts it is more common as a title. If you would
125 | like "Dean" to be parsed as a title, simply add it to the titles constant.
126 | 
127 | You can pass multiple strings to both the :py:func:`~nameparser.config.SetManager.add`
128 | and :py:func:`~nameparser.config.SetManager.remove`
129 | methods and each string will be added or removed. Both functions
130 | automatically normalize the strings for the parser's comparison method by
131 | making them lower case and removing periods.
132 | 
133 | .. doctest::
134 |     :options: +ELLIPSIS, +NORMALIZE_WHITESPACE
135 | 
136 |     >>> from nameparser import HumanName
137 |     >>> from nameparser.config import Constants
138 |     >>> constants = Constants()
139 |     >>> constants.titles.add('dean', 'Chemistry')
140 |     SetManager({'right', ..., 'tax'})
141 |     >>> hn = HumanName("Assoc Dean of Chemistry Robert Johns", constants=constants)
142 |     >>> hn
143 |     <HumanName : [
144 |       title: 'Assoc Dean of Chemistry'
145 |       first: 'Robert'
146 |       middle: ''
147 |       last: 'Johns'
148 |       suffix: ''
149 |       nickname: ''
150 |     ]>
151 | 
152 | 
153 | Module-level Shared Configuration Instance
154 | ------------------------------------------
155 | 
156 | When you modify the configuration, by default this will modify the behavior of all
157 | HumanName instances. This could be a handy way to set it up for your entire
158 | project, but it could also lead to some unexpected behavior because changing
159 | the config on one instance could modify the behavior of another instance.
160 | 
161 | .. doctest:: module config
162 |     :options: +ELLIPSIS, +NORMALIZE_WHITESPACE
163 | 
164 |     >>> from nameparser import HumanName
165 |     >>> instance = HumanName("")
166 |     >>> instance.C.titles.add('dean')
167 |     SetManager({'right', ..., 'tax'})
168 |     >>> other_instance = HumanName("Dean Robert Johns")
169 |     >>> other_instance # Dean parses as title
170 |     <HumanName : [
171 |       title: 'Dean'
172 |       first: 'Robert'
173 |       middle: ''
174 |       last: 'Johns'
175 |       suffix: ''
176 |       nickname: ''
177 |     ]>
178 | 
179 | 
180 | If you'd prefer new instances to have their own config values, one shortcut is to pass
181 | ``None`` as the second argument (or ``constants`` keyword argument) when
182 | instantiating ``HumanName``. Each instance always has a ``C`` attribute, but if
183 | you didn't pass something falsey to the ``constants`` argument then it's a
184 | reference to the module-level config values with the behavior described above.
185 | 
186 | .. doctest:: module config
187 |     :options: +ELLIPSIS, +NORMALIZE_WHITESPACE
188 | 
189 |     >>> from nameparser import HumanName
190 |     >>> instance = HumanName("Dean Robert Johns")
191 |     >>> instance.has_own_config
192 |     False
193 |     >>> instance.C.titles.add('dean')
194 |     SetManager({'right', ..., 'tax'})
195 |     >>> other_instance = HumanName("Dean Robert Johns", None) # <-- pass None for per-instance config
196 |     >>> other_instance
197 |     <HumanName : [
198 |       title: ''
199 |       first: 'Dean'
200 |       middle: 'Robert'
201 |       last: 'Johns'
202 |       suffix: ''
203 |       nickname: ''
204 |     ]>
205 |     >>> other_instance.has_own_config
206 |     True
207 | 
208 | Don't Remove Emojis
209 | ~~~~~~~~~~~~~~~~~~~
210 | 
211 | By default, all emojis are removed from the input string before the name is parsed.
212 | You can turn this off by setting the ``emoji`` regex to ``False``.
213 | 
214 | .. doctest::
215 | 
216 |     >>> from nameparser import HumanName
217 |     >>> from nameparser.config import Constants
218 |     >>> constants = Constants()
219 |     >>> constants.regexes.emoji = False
220 |     >>> hn = HumanName("Sam 😊 Smith", constants=constants)
221 |     >>> str(hn)
222 |     'Sam 😊 Smith'
223 | 
224 | Config Changes May Need Parse Refresh
225 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
226 | 
227 | The full name is parsed upon assignment to the ``full_name`` attribute or
228 | instantiation. Sometimes after making changes to configuration or other inner
229 | data after assigning the full name, the name will need to be re-parsed with the
230 | :py:func:`~nameparser.parser.HumanName.parse_full_name()` method before you see
231 | those changes with ``repr()``.
232 | 
233 | 
234 | Adjusting names after parsing them
235 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
236 | 
237 | Each attribute has a corresponding ordered list of name pieces. If you're doing
238 | pre- or post-processing you may wish to manipulate these lists directly.
239 | The strings returned by the attribute names just join these lists with spaces.
240 | 
241 | 
242 | * o.title_list
243 | * o.first_list
244 | * o.middle_list
245 | * o.last_list
246 | * o.suffix_list
247 | * o.nickname_list
248 | 
249 | ::
250 | 
251 |   >>> hn = HumanName("Juan Q. Xavier Velasquez y Garcia, Jr.")
252 |   >>> hn.middle_list
253 |   ['Q.', 'Xavier']
254 |   >>> hn.middle_list += ["Ricardo"]
255 |   >>> hn.middle_list
256 |   ['Q.', 'Xavier', 'Ricardo']
257 | 
258 | 
259 | You can also replace any name bucket's contents by assigning a string or a list
260 | directly to the attribute.
261 | 
262 | ::
263 | 
264 |   >>> hn = HumanName("Dr. John A. Kenneth Doe")
265 |   >>> hn.title = ["Associate","Professor"]
266 |   >>> hn.suffix = "Md."
267 |   >>> hn
268 |   <HumanName : [
269 |     title: 'Associate Professor'
270 |     first: 'John'
271 |     middle: 'A. Kenneth'
272 |     last: 'Doe'
273 |     suffix: 'Md.'
274 |     nickname: ''
275 |   ]>
276 | 
277 | 
278 | 
279 | 


--------------------------------------------------------------------------------
/nameparser/config/suffixes.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | from __future__ import unicode_literals
  3 | 
  4 | SUFFIX_NOT_ACRONYMS = set([
  5 |     'dr',
  6 |     'esq',
  7 |     'esquire',
  8 |     'jr',
  9 |     'jnr',
 10 |     'junior',
 11 |     'sr',
 12 |     'snr',
 13 |     '2',
 14 |     'i',
 15 |     'ii',
 16 |     'iii',
 17 |     'iv',
 18 |     'v',
 19 | ])
 20 | """
 21 | 
 22 | Post-nominal pieces that are not acronyms. The parser does not remove periods
 23 | when matching against these pieces.
 24 | 
 25 | """
 26 | SUFFIX_ACRONYMS = set([
 27 |     '(ret)',
 28 |     '(vet)',
 29 |     '8-vsb',
 30 |     'aas',
 31 |     'aba',
 32 |     'abc',
 33 |     'abd',
 34 |     'abpp',
 35 |     'abr',
 36 |     'aca',
 37 |     'acas',
 38 |     'ace',
 39 |     'acha',
 40 |     'acp',
 41 |     'ae',
 42 |     'ae',
 43 |     'aem',
 44 |     'afasma',
 45 |     'afc',
 46 |     'afc',
 47 |     'afm',
 48 |     'afm',
 49 |     'agsf',
 50 |     'aia',
 51 |     'aicp',
 52 |     'ala',
 53 |     'alc',
 54 |     'alp',
 55 |     'am',
 56 |     'amd',
 57 |     'ame',
 58 |     'amieee',
 59 |     'ams',
 60 |     'aphr',
 61 |     'apn aprn',
 62 |     'apr',
 63 |     'apss',
 64 |     'aqp',
 65 |     'arm',
 66 |     'arrc',
 67 |     'asa',
 68 |     'asc',
 69 |     'asid',
 70 |     'asla',
 71 |     'asp',
 72 |     'atc',
 73 |     'awb',
 74 |     'bca',
 75 |     'bcl',
 76 |     'bcss',
 77 |     'bds',
 78 |     'bem',
 79 |     'bem',
 80 |     'bls-i',
 81 |     'bpe',
 82 |     'bpi',
 83 |     'bpt',
 84 |     'bt',
 85 |     'btcs',
 86 |     'bts',
 87 |     'cacts',
 88 |     'cae',
 89 |     'caha',
 90 |     'caia',
 91 |     'cams',
 92 |     'cap',
 93 |     'capa',
 94 |     'capm',
 95 |     'capp',
 96 |     'caps',
 97 |     'caro',
 98 |     'cas',
 99 |     'casp',
100 |     'cb',
101 |     'cbe',
102 |     'cbm',
103 |     'cbne',
104 |     'cbnt',
105 |     'cbp',
106 |     'cbrte',
107 |     'cbs',
108 |     'cbsp',
109 |     'cbt',
110 |     'cbte',
111 |     'cbv',
112 |     'cca',
113 |     'ccc',
114 |     'ccca',
115 |     'cccm',
116 |     'cce',
117 |     'cchp',
118 |     'ccie',
119 |     'ccim',
120 |     'cciso',
121 |     'ccm',
122 |     'ccmt',
123 |     'ccna',
124 |     'ccnp',
125 |     'ccp',
126 |     'ccp-c',
127 |     'ccpr',
128 |     'ccs',
129 |     'ccufc',
130 |     'cd',
131 |     'cdal',
132 |     'cdfm',
133 |     'cdmp',
134 |     'cds',
135 |     'cdt',
136 |     'cea',
137 |     'ceas',
138 |     'cebs',
139 |     'ceds',
140 |     'ceh',
141 |     'cela',
142 |     'cem',
143 |     'cep',
144 |     'cera',
145 |     'cet',
146 |     'cfa',
147 |     'cfc',
148 |     'cfcc',
149 |     'cfce',
150 |     'cfcm',
151 |     'cfe',
152 |     'cfeds',
153 |     'cfi',
154 |     'cfm',
155 |     'cfp',
156 |     'cfps',
157 |     'cfr',
158 |     'cfre',
159 |     'cga',
160 |     'cgap',
161 |     'cgb',
162 |     'cgc',
163 |     'cgfm',
164 |     'cgfo',
165 |     'cgm',
166 |     'cgm',
167 |     'cgma',
168 |     'cgp',
169 |     'cgr',
170 |     'cgsp',
171 |     'ch',
172 |     'ch',
173 |     'cha',
174 |     'chba',
175 |     'chdm',
176 |     'che',
177 |     'ches',
178 |     'chfc',
179 |     'chfc',
180 |     'chi',
181 |     'chmc',
182 |     'chmm',
183 |     'chp',
184 |     'chpa',
185 |     'chpe',
186 |     'chpln',
187 |     'chpse',
188 |     'chrm',
189 |     'chsc',
190 |     'chse',
191 |     'chse-a',
192 |     'chsos',
193 |     'chss',
194 |     'cht',
195 |     'cia',
196 |     'cic',
197 |     'cie',
198 |     'cig',
199 |     'cip',
200 |     'cipm',
201 |     'cips',
202 |     'ciro',
203 |     'cisa',
204 |     'cism',
205 |     'cissp',
206 |     'cla',
207 |     'clsd',
208 |     'cltd',
209 |     'clu',
210 |     'cm',
211 |     'cma',
212 |     'cmas',
213 |     'cmc',
214 |     'cmfo',
215 |     'cmg',
216 |     'cmp',
217 |     'cms',
218 |     'cmsp',
219 |     'cmt',
220 |     'cna',
221 |     'cnm',
222 |     'cnp',
223 |     'cp',
224 |     'cp-c',
225 |     'cpa',
226 |     'cpacc',
227 |     'cpbe',
228 |     'cpcm',
229 |     'cpcu',
230 |     'cpe',
231 |     'cpfa',
232 |     'cpfo',
233 |     'cpg',
234 |     'cph',
235 |     'cpht',
236 |     'cpim',
237 |     'cpl',
238 |     'cplp',
239 |     'cpm',
240 |     'cpo',
241 |     'cpp',
242 |     'cppm',
243 |     'cprc',
244 |     'cpre',
245 |     'cprp',
246 |     'cpsc',
247 |     'cpsi',
248 |     'cpss',
249 |     'cpt',
250 |     'cpwa',
251 |     'crde',
252 |     'crisc',
253 |     'crma',
254 |     'crme',
255 |     'crna',
256 |     'cro',
257 |     'crp',
258 |     'crt',
259 |     'crtt',
260 |     'csa',
261 |     'csbe',
262 |     'csc',
263 |     'cscp',
264 |     'cscu',
265 |     'csep',
266 |     'csi',
267 |     'csm',
268 |     'csp',
269 |     'cspo',
270 |     'csre',
271 |     'csrte',
272 |     'csslp',
273 |     'cssm',
274 |     'cst',
275 |     'cste',
276 |     'ctbs',
277 |     'ctfa',
278 |     'cto',
279 |     'ctp',
280 |     'cts',
281 |     'cua',
282 |     'cusp',
283 |     'cva',
284 |     'cva[22]',
285 |     'cvo',
286 |     'cvp',
287 |     'cvrs',
288 |     'cwap',
289 |     'cwb',
290 |     'cwdp',
291 |     'cwep',
292 |     'cwna',
293 |     'cwne',
294 |     'cwp',
295 |     'cwsp',
296 |     'cxa',
297 |     'cyds',
298 |     'cysa',
299 |     'dabfm',
300 |     'dabvlm',
301 |     'dacvim',
302 |     'dbe',
303 |     'dc',
304 |     'dcb',
305 |     'dcm',
306 |     'dcmg',
307 |     'dcvo',
308 |     'dd',
309 |     'dds',
310 |     'ded',
311 |     'dep',
312 |     'dfc',
313 |     'dfm',
314 |     'diplac',
315 |     'diplom',
316 |     'djur',
317 |     'dma',
318 |     'dmd',
319 |     'dmin',
320 |     'dnp',
321 |     'do',
322 |     'dpm',
323 |     'dpt',
324 |     'drb',
325 |     'drmp',
326 |     'drph',
327 |     'dsc',
328 |     'dsm',
329 |     'dso',
330 |     'dss',
331 |     'dtr',
332 |     'dvep',
333 |     'dvm',
334 |     'ea',
335 |     'ed',
336 |     'edd',
337 |     'ei',
338 |     'eit',
339 |     'els',
340 |     'emd',
341 |     'emt-b',
342 |     'emt-i/85',
343 |     'emt-i/99',
344 |     'emt-p',
345 |     'enp',
346 |     'erd',
347 |     'esq',
348 |     'evp',
349 |     'faafp',
350 |     'faan',
351 |     'faap',
352 |     'fac-c',
353 |     'facc',
354 |     'facd',
355 |     'facem',
356 |     'facep',
357 |     'facha',
358 |     'facofp',
359 |     'facog',
360 |     'facp',
361 |     'facph',
362 |     'facs',
363 |     'faia',
364 |     'faicp',
365 |     'fala',
366 |     'fashp',
367 |     'fasid',
368 |     'fasla',
369 |     'fasma',
370 |     'faspen',
371 |     'fca',
372 |     'fcas',
373 |     'fcela',
374 |     'fd',
375 |     'fec',
376 |     'fhames',
377 |     'fic',
378 |     'ficf',
379 |     'fieee',
380 |     'fmp',
381 |     'fmva',
382 |     'fnss',
383 |     'fp&a',
384 |     'fp-c',
385 |     'fpc',
386 |     'frm',
387 |     'fsa',
388 |     'fsdp',
389 |     'fws',
390 |     'gaee[14]',
391 |     'gba',
392 |     'gbe',
393 |     'gc',
394 |     'gcb',
395 |     'gcb',
396 |     'gchs',
397 |     'gcie',
398 |     'gcmg',
399 |     'gcmg',
400 |     'gcsi',
401 |     'gcvo',
402 |     'gcvo',
403 |     'gisp',
404 |     'git',
405 |     'gm',
406 |     'gmb',
407 |     'gmr',
408 |     'gphr',
409 |     'gri',
410 |     'grp',
411 |     'gsmieee',
412 |     'hccp',
413 |     'hrs',
414 |     'iaccp',
415 |     'iaee',
416 |     'iccm-d',
417 |     'iccm-f',
418 |     'idsm',
419 |     'ifgict',
420 |     'iom',
421 |     'ipep',
422 |     'ipm',
423 |     'iso',
424 |     'issp-csp',
425 |     'issp-sa',
426 |     'itil',
427 |     'jd',
428 |     'jp',
429 |     'kbe',
430 |     'kcb',
431 |     'kchs/dchs',
432 |     'kcie',
433 |     'kcie',
434 |     'kcmg',
435 |     'kcsi',
436 |     'kcsi',
437 |     'kcvo',
438 |     'kg',
439 |     'khs/dhs',
440 |     'kp',
441 |     'kt',
442 |     'lac',
443 |     'lcmt',
444 |     'lcpc',
445 |     'lcsw',
446 |     'leed ap',
447 |     'lg',
448 |     'litk',
449 |     'litl',
450 |     'litp',
451 |     'llm',
452 |     'lm',
453 |     'lmsw',
454 |     'lmt',
455 |     'lp',
456 |     'lpa',
457 |     'lpc',
458 |     'lpn',
459 |     'lpss',
460 |     'lsi',
461 |     'lsit',
462 |     'lt',
463 |     'lvn',
464 |     'lvo',
465 |     'lvt',
466 |     'ma',
467 |     'maaa',
468 |     'mai',
469 |     'mba',
470 |     'mbe',
471 |     'mbs',
472 |     'mc',
473 |     'mcct',
474 |     'mcdba',
475 |     'mches',
476 |     'mcm',
477 |     'mcp',
478 |     'mcpd',
479 |     'mcsa',
480 |     'mcsd',
481 |     'mcse',
482 |     'mct',
483 |     'md',
484 |     'mdiv',
485 |     'mem',
486 |     'mfa',
487 |     'micp',
488 |     'mieee',
489 |     'mirm',
490 |     'mle',
491 |     'mls',
492 |     'mlse',
493 |     'mlt',
494 |     'mm',
495 |     'mmad',
496 |     'mmas',
497 |     'mnaa',
498 |     'mnae',
499 |     'mp',
500 |     'mpa',
501 |     'mph',
502 |     'mpse',
503 |     'mra',
504 |     'ms',
505 |     'msa',
506 |     'msc'
507 |     'mscmsm',
508 |     'msm',
509 |     'mt',
510 |     'mts',
511 |     'mvo',
512 |     'nbc-his',
513 |     'nbcch',
514 |     'nbcch-ps',
515 |     'nbcdch',
516 |     'nbcdch-ps',
517 |     'nbcfch',
518 |     'nbcfch-ps',
519 |     'nbct',
520 |     'ncarb',
521 |     'nccp',
522 |     'ncidq',
523 |     'ncps',
524 |     'ncso',
525 |     'ncto',
526 |     'nd',
527 |     'ndtr',
528 |     'nicet i',
529 |     'nicet ii',
530 |     'nicet iii',
531 |     'nicet iv',
532 |     'nmd',
533 |     'np',
534 |     'np[18]',
535 |     'nraemt',
536 |     'nremr',
537 |     'nremt',
538 |     'nrp',
539 |     'obe',
540 |     'obi',
541 |     'oca',
542 |     'ocm',
543 |     'ocp',
544 |     'od',
545 |     'om',
546 |     'oscp',
547 |     'ot',
548 |     'pa-c',
549 |     'pcc',
550 |     'pci',
551 |     'pe',
552 |     'pfmp',
553 |     'pg',
554 |     'pgmp',
555 |     'ph',
556 |     'pharmd',
557 |     'phc',
558 |     'phd',
559 |     'phr',
560 |     'phrca',
561 |     'pla',
562 |     'pls',
563 |     'pmc',
564 |     'pmi-acp',
565 |     'pmp',
566 |     'pp',
567 |     'pps',
568 |     'prm',
569 |     'psm i',
570 |     'psm ii',
571 |     'psm',
572 |     'psp',
573 |     'psyd',
574 |     'pt',
575 |     'pta',
576 |     'qam',
577 |     'qc',
578 |     'qcsw',
579 |     'qfsm',
580 |     'qgm',
581 |     'qpm',
582 |     'qsd',
583 |     'qsp',
584 |     'ra',
585 |     'rai',
586 |     'rba',
587 |     'rci',
588 |     'rcp',
589 |     'rd',
590 |     'rdcs',
591 |     'rdh',
592 |     'rdms',
593 |     'rdn',
594 |     'res',
595 |     'rfp',
596 |     'rhca',
597 |     'rid',
598 |     'rls',
599 |     'rmsks',
600 |     'rn',
601 |     'rp',
602 |     'rpa',
603 |     'rph',
604 |     'rpl',
605 |     'rrc',
606 |     'rrt',
607 |     'rrt-accs',
608 |     'rrt-nps',
609 |     'rrt-sds',
610 |     'rtrp',
611 |     'rvm',
612 |     'rvt',
613 |     'sa',
614 |     'same',
615 |     'sasm',
616 |     'sccp',
617 |     'scmp',
618 |     'se',
619 |     'secb',
620 |     'sfp',
621 |     'sgm',
622 |     'shrm-cp',
623 |     'shrm-scp',
624 |     'si',
625 |     'siie',
626 |     'smieee',
627 |     'sphr',
628 |     'sra',
629 |     'sscp',
630 |     'stmieee',
631 |     'tbr-ct',
632 |     'td',
633 |     'thd',
634 |     'thm',
635 |     'ud',
636 |     'usa',
637 |     'usaf',
638 |     'usar',
639 |     'uscg',
640 |     'usmc',
641 |     'usn',
642 |     'usnr',
643 |     'uxc',
644 |     'uxmc',
645 |     'vc',
646 |     'vc',
647 |     'vcp',
648 |     'vd',
649 |     'vrd',
650 | ])
651 | """
652 | 
653 | Post-nominal acronyms. Titles, degrees and other things people stick after their name
654 | that may or may not have periods between the letters. The parser removes periods
655 | when matching against these pieces.
656 | 
657 | """
658 | 


--------------------------------------------------------------------------------
/nameparser/config/titles.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | from __future__ import unicode_literals
  3 | 
  4 | FIRST_NAME_TITLES = set([
  5 |     'aunt',
  6 |     'auntie',
  7 |     'brother',
  8 |     'dame',
  9 |     'father',
 10 |     'king',
 11 |     'maid',
 12 |     'master',
 13 |     'mother',
 14 |     'pope',
 15 |     'queen',
 16 |     'sir',
 17 |     'sister',
 18 |     'uncle',
 19 |     'sheikh',
 20 |     'sheik',
 21 |     'shaik',
 22 |     'shayk',
 23 |     'shaykh',
 24 |     'shaikh',
 25 |     'cheikh',
 26 |     'shekh',
 27 | ])
 28 | """
 29 | When these titles appear with a single other name, that name is a first name, e.g.
 30 | "Sir John", "Sister Mary", "Queen Elizabeth".
 31 | """
 32 | 
 33 | #: **Cannot include things that could also be first names**, e.g. "dean".
 34 | #: Many of these from wikipedia: https://en.wikipedia.org/wiki/Title.
 35 | #: The parser recognizes chains of these including conjunctions allowing 
 36 | #: recognition titles like "Deputy Secretary of State".
 37 | TITLES = FIRST_NAME_TITLES | set([
 38 |     "attaché",
 39 |     "chargé d'affaires",
 40 |     "king's",
 41 |     "marchioness",
 42 |     "marquess",
 43 |     "marquis",
 44 |     "marquise",
 45 |     "queen's",
 46 |     '10th',
 47 |     '1lt',
 48 |     '1sgt',
 49 |     '1st',
 50 |     '1stlt',
 51 |     '1stsgt',
 52 |     '2lt',
 53 |     '2nd',
 54 |     '2ndlt',
 55 |     '3rd',
 56 |     '4th',
 57 |     '5th',
 58 |     '6th',
 59 |     '7th',
 60 |     '8th',
 61 |     '9th',
 62 |     'a1c',
 63 |     'ab',
 64 |     'abbess',
 65 |     'abbot',
 66 |     'abolitionist',
 67 |     'academic',
 68 |     'acolyte',
 69 |     'activist',
 70 |     'actor ',
 71 |     'actress',
 72 |     'adept',
 73 |     'adjutant',
 74 |     'adm',
 75 |     'admiral',
 76 |     'advertising',
 77 |     'adviser',
 78 |     'advocate',
 79 |     'air',
 80 |     'akhoond',
 81 |     'alderman',
 82 |     'almoner',
 83 |     'ambassador',
 84 |     'amn',
 85 |     'analytics',
 86 |     'anarchist',
 87 |     'animator',
 88 |     'anthropologist',
 89 |     'appellate',
 90 |     'apprentice',
 91 |     'arbitrator',
 92 |     'archbishop',
 93 |     'archdeacon',
 94 |     'archdruid',
 95 |     'archduchess',
 96 |     'archduke',
 97 |     'archeologist',
 98 |     'architect',
 99 |     'arhat',
100 |     'army',
101 |     'arranger',
102 |     'assistant',
103 |     'assoc',
104 |     'associate',
105 |     'asst',
106 |     'astronomer',
107 |     'attache',
108 |     'attorney',
109 |     'author',
110 |     'award-winning',
111 |     'ayatollah',
112 |     'baba',
113 |     'bailiff',
114 |     'ballet',
115 |     'bandleader',
116 |     'banker',
117 |     'banner',
118 |     'bard',
119 |     'baron',
120 |     'baroness',
121 |     'barrister',
122 |     'baseball',
123 |     'bearer',
124 |     'behavioral',
125 |     'bench',
126 |     'bg',
127 |     'bgen',
128 |     'biblical',
129 |     'bibliographer',
130 |     'biochemist',
131 |     'biographer',
132 |     'biologist',
133 |     'bishop',
134 |     'blessed',
135 |     'blogger',
136 |     'blues',
137 |     'bodhisattva',
138 |     'bookseller',
139 |     'botanist',
140 |     'bp',
141 |     'brigadier',
142 |     'briggen',
143 |     'british',
144 |     'broadcaster',
145 |     'buddha',
146 |     'burgess',
147 |     'burlesque',
148 |     'business',
149 |     'businessman',
150 |     'businesswoman',
151 |     'bwana',
152 |     'canon',
153 |     'capt',
154 |     'captain',
155 |     'cardinal',
156 |     'cartographer',
157 |     'cartoonist',
158 |     'catholicos',
159 |     'ccmsgt',
160 |     'cdr',
161 |     'celebrity',
162 |     'ceo',
163 |     'cfo',
164 |     'chair',
165 |     'chairs',
166 |     'chancellor',
167 |     'chaplain',
168 |     'chef',
169 |     'chemist',
170 |     'chief',
171 |     'chieftain',
172 |     'choreographer',
173 |     'civil',
174 |     'classical',
175 |     'clergyman',
176 |     'clerk',
177 |     'cmsaf',
178 |     'cmsgt',
179 |     'co-chair',
180 |     'co-chairs',
181 |     'co-founder',
182 |     'coach',
183 |     'col',
184 |     'collector',
185 |     'colonel',
186 |     'comedian',
187 |     'comedienne',
188 |     'comic',
189 |     'commander',
190 |     'commander-in-chief',
191 |     'commodore',
192 |     'composer',
193 |     'compositeur',
194 |     'comptroller',
195 |     'computer',
196 |     'comtesse',
197 |     'conductor',
198 |     'consultant',
199 |     'controller',
200 |     'corporal',
201 |     'corporate',
202 |     'correspondent',
203 |     'councillor',
204 |     'counselor',
205 |     'count',
206 |     'countess',
207 |     'courtier',
208 |     'cpl',
209 |     'cpo',
210 |     'cpt',
211 |     'credit',
212 |     'criminal',
213 |     'criminologist',
214 |     'critic',
215 |     'csm',
216 |     'curator',
217 |     'customs',
218 |     'cwo-2',
219 |     'cwo-3',
220 |     'cwo-4',
221 |     'cwo-5',
222 |     'cwo2',
223 |     'cwo3',
224 |     'cwo4',
225 |     'cwo5',
226 |     'cyclist',
227 |     'dancer',
228 |     'dcn',
229 |     'deacon',
230 |     'delegate',
231 |     'deputy',
232 |     'designated',
233 |     'designer',
234 |     'detective',
235 |     'developer',
236 |     'diplomat',
237 |     'dir',
238 |     'director',
239 |     'discovery',
240 |     'dissident',
241 |     'district',
242 |     'division',
243 |     'do',
244 |     'docent',
245 |     'docket',
246 |     'doctor',
247 |     'doyen',
248 |     'dpty',
249 |     'dr',
250 |     'dra',
251 |     'dramatist',
252 |     'druid',
253 |     'drummer',
254 |     'duchesse',
255 |     # 'duke', # a common first name
256 |     'dutchess',
257 |     'ecologist',
258 |     'economist',
259 |     'editor',
260 |     'edmi',
261 |     'edohen',
262 |     'educator',
263 |     'effendi',
264 |     'ekegbian',
265 |     'elerunwon',
266 |     'eminence',
267 |     'emperor',
268 |     'empress',
269 |     'engineer',
270 |     'english',
271 |     'ens',
272 |     'entertainer',
273 |     'entrepreneur',
274 |     'envoy',
275 |     'essayist',
276 |     'evangelist',
277 |     'excellency',
278 |     'excellent',
279 |     'exec',
280 |     'executive',
281 |     'expert',
282 |     'fadm',
283 |     'family',
284 |     'federal',
285 |     'field',
286 |     'film',
287 |     'financial',
288 |     'first',
289 |     'flag',
290 |     'flying',
291 |     'foreign',
292 |     'forester',
293 |     'founder',
294 |     'fr',
295 |     'friar',
296 |     'gaf',
297 |     'gen',
298 |     'general',
299 |     'generalissimo',
300 |     'gentiluomo',
301 |     'giani',
302 |     'goodman',
303 |     'goodwife',
304 |     'governor',
305 |     'graf',
306 |     'grand',
307 |     'group',
308 |     'guitarist',
309 |     'guru',
310 |     'gyani',
311 |     'gysgt',
312 |     'hajji',
313 |     'headman',
314 |     'heir',
315 |     'heiress',
316 |     'her',
317 |     'hereditary',
318 |     'high',
319 |     'highness',
320 |     'his',
321 |     'historian',
322 |     'historicus',
323 |     'historien',
324 |     'holiness',
325 |     'hon', # sorry Hon Solo, but judges seem more common.
326 |     'honorable',
327 |     'honourable',
328 |     'host',
329 |     'illustrator',
330 |     'imam',
331 |     'industrialist',
332 |     'information',
333 |     'instructor',
334 |     'intelligence',
335 |     'intendant',
336 |     'inventor',
337 |     'investigator',
338 |     'investor',
339 |     'journalist',
340 |     'journeyman',
341 |     'jr',
342 |     'judge',
343 |     'judicial',
344 |     'junior',
345 |     'jurist',
346 |     'keyboardist',
347 |     'kingdom',
348 |     'knowledge',
349 |     'lady',
350 |     'lama',
351 |     'lamido',
352 |     'law',
353 |     'lawyer',
354 |     'lcdr',
355 |     'lcpl',
356 |     'leader',
357 |     'lecturer',
358 |     'legal',
359 |     'librarian',
360 |     'lieutenant',
361 |     'linguist',
362 |     'literary',
363 |     'lord',
364 |     'lt',
365 |     'ltc',
366 |     'ltcol',
367 |     'ltg',
368 |     'ltgen',
369 |     'ltjg',
370 |     'lyricist',
371 |     'madam',
372 |     'madame',
373 |     'mademoiselle',
374 |     'mag',
375 |     'mag-judge',
376 |     'mag/judge',
377 |     'magistrate',
378 |     'magistrate-judge',
379 |     'magnate',
380 |     'maharajah',
381 |     'maharani',
382 |     'mahdi',
383 |     'maj',
384 |     'majesty',
385 |     'majgen',
386 |     'manager',
387 |     'marcher',
388 |     'marchess',
389 |     'marketing',
390 |     'marquis',
391 |     'mathematician',
392 |     'mathematics',
393 |     'matriarch',
394 |     'mayor',
395 |     'mcpo',
396 |     'mcpoc',
397 |     'mcpon',
398 |     'md',
399 |     'member',
400 |     'memoirist',
401 |     'merchant',
402 |     'met',
403 |     'metropolitan',
404 |     'mg',
405 |     'mgr',
406 |     'mgysgt',
407 |     'military',
408 |     'minister',
409 |     'miss',
410 |     'misses',
411 |     'missionary',
412 |     'mister',
413 |     'mlle',
414 |     'mme',
415 |     'mobster',
416 |     'model',
417 |     'monk',
418 |     'monsignor',
419 |     'most',
420 |     'mountaineer',
421 |     'mpco-cg',
422 |     'mr',
423 |     'mrs',
424 |     'ms',
425 |     'msg',
426 |     'msgt',
427 |     'mufti',
428 |     'mullah',
429 |     'municipal',
430 |     'murshid',
431 |     'musician',
432 |     'musicologist',
433 |     'mx',
434 |     'mystery',
435 |     'nanny',
436 |     'narrator',
437 |     'national',
438 |     'naturalist',
439 |     'navy',
440 |     'neuroscientist',
441 |     'novelist',
442 |     'nurse',
443 |     'obstetritian',
444 |     'officer',
445 |     'opera',
446 |     'operating',
447 |     'ornithologist',
448 |     'painter',
449 |     'paleontologist',
450 |     'pastor',
451 |     'patriarch',
452 |     'pediatrician',
453 |     'personality',
454 |     'petty',
455 |     'pfc',
456 |     'pharaoh',
457 |     'phd',
458 |     'philantropist',
459 |     'philosopher',
460 |     'photographer',
461 |     'physician',
462 |     'physicist',
463 |     'pianist',
464 |     'pilot',
465 |     'pioneer',
466 |     'pir',
467 |     'player',
468 |     'playwright',
469 |     'po1',
470 |     'po2',
471 |     'po3',
472 |     'poet',
473 |     'police',
474 |     'political',
475 |     'politician',
476 |     'prefect',
477 |     'prelate',
478 |     'premier',
479 |     'pres',
480 |     'presbyter',
481 |     'president',
482 |     'presiding',
483 |     'priest',
484 |     'priestess',
485 |     'primate',
486 |     'prime',
487 |     'prin',
488 |     'prince',
489 |     'princess',
490 |     'principal',
491 |     'printer',
492 |     'printmaker',
493 |     'prior',
494 |     'private',
495 |     'pro',
496 |     'producer',
497 |     'prof',
498 |     'professor',
499 |     'provost',
500 |     'pslc',
501 |     'psychiatrist',
502 |     'psychologist',
503 |     'publisher',
504 |     'pursuivant',
505 |     'pv2',
506 |     'pvt',
507 |     'rabbi',
508 |     'radio',
509 |     'radm',
510 |     'rangatira',
511 |     'ranger',
512 |     'rdml',
513 |     'rear',
514 |     'rebbe',
515 |     'registrar',
516 |     'rep',
517 |     'representative',
518 |     'researcher',
519 |     'resident',
520 |     'rev',
521 |     'revenue',
522 |     'reverend',
523 |     'right',
524 |     'risk',
525 |     'rock',
526 |     'royal',
527 |     'rt',
528 |     'sa',
529 |     'sailor',
530 |     'saint',
531 |     'sainte',
532 |     'saoshyant',
533 |     'satirist',
534 |     'scholar',
535 |     'schoolmaster',
536 |     'scientist',
537 |     'scpo',
538 |     'screenwriter',
539 |     'se',
540 |     'secretary',
541 |     'security',
542 |     'seigneur',
543 |     'senator',
544 |     'senior',
545 |     'senior-judge',
546 |     'sergeant',
547 |     'servant',
548 |     'sfc',
549 |     'sgm',
550 |     'sgt',
551 |     'sgtmaj',
552 |     'sgtmajmc',
553 |     'shehu',
554 |     'sheikh',
555 |     'sheriff',
556 |     'siddha',
557 |     'singer',
558 |     'singer-songwriter',
559 |     'sma',
560 |     'smsgt',
561 |     'sn',
562 |     'soccer',
563 |     'social',
564 |     'sociologist',
565 |     'software',
566 |     'soldier',
567 |     'solicitor',
568 |     'soprano',
569 |     'spc',
570 |     'speaker',
571 |     'special',
572 |     'sr',
573 |     'sra',
574 |     'srta',
575 |     'ssg',
576 |     'ssgt',
577 |     'st',
578 |     'staff',
579 |     'state',
580 |     'states',
581 |     'strategy',
582 |     'subaltern',
583 |     'subedar',
584 |     'suffragist',
585 |     'sultan',
586 |     'sultana',
587 |     'superior',
588 |     'supreme',
589 |     'surgeon',
590 |     'swami',
591 |     'swordbearer',
592 |     'sysselmann',
593 |     'tax',
594 |     'teacher',
595 |     'technical',
596 |     'technologist',
597 |     'television ',
598 |     'tenor',
599 |     'theater',
600 |     'theatre',
601 |     'theologian',
602 |     'theorist',
603 |     'timi',
604 |     'tirthankar',
605 |     'translator',
606 |     'travel',
607 |     'treasurer',
608 |     'tsar',
609 |     'tsarina',
610 |     'tsgt',
611 |     'uk',
612 |     'united',
613 |     'us',
614 |     'vadm',
615 |     'vardapet',
616 |     'vc',
617 |     'venerable',
618 |     'verderer',
619 |     'vicar',
620 |     'vice',
621 |     'viscount',
622 |     'vizier',
623 |     'vocalist',
624 |     'voice',
625 |     'warden',
626 |     'warrant',
627 |     'wing',
628 |     'wm',
629 |     'wo-1',
630 |     'wo1',
631 |     'wo2',
632 |     'wo3',
633 |     'wo4',
634 |     'wo5',
635 |     'woodman',
636 |     'writer',
637 |     'zoologist',
638 | ])
639 | 


--------------------------------------------------------------------------------
/nameparser/parser.py:
--------------------------------------------------------------------------------
   1 | # -*- coding: utf-8 -*-
   2 | from __future__ import unicode_literals
   3 | 
   4 | import sys
   5 | import re
   6 | from operator import itemgetter
   7 | from itertools import groupby
   8 | 
   9 | from nameparser.util import u
  10 | from nameparser.util import text_types, binary_type
  11 | from nameparser.util import lc
  12 | from nameparser.util import log
  13 | from nameparser.config import CONSTANTS
  14 | from nameparser.config import Constants
  15 | from nameparser.config import DEFAULT_ENCODING
  16 | 
  17 | ENCODING = 'utf-8'
  18 | 
  19 | 
  20 | def group_contiguous_integers(data):
  21 |     """
  22 |     return list of tuples containing first and last index
  23 |     position of contiguous numbers in a series
  24 |     """
  25 |     ranges = []
  26 |     for key, group in groupby(enumerate(data), lambda i: i[0] - i[1]):
  27 |         group = list(map(itemgetter(1), group))
  28 |         if len(group) > 1:
  29 |             ranges.append((group[0], group[-1]))
  30 |     return ranges
  31 | 
  32 | 
  33 | class HumanName(object):
  34 |     """
  35 |     Parse a person's name into individual components.
  36 | 
  37 |     Instantiation assigns to ``full_name``, and assignment to
  38 |     :py:attr:`full_name` triggers :py:func:`parse_full_name`. After parsing the
  39 |     name, these instance attributes are available. Alternatively, you can pass
  40 |     any of the instance attributes to the constructor method and skip the parsing
  41 |     process. If any of the instance attributes are passed to the constructor
  42 |     as keywords, :py:func:`parse_full_name` will not be performed.
  43 | 
  44 |     **HumanName Instance Attributes**
  45 | 
  46 |     * :py:attr:`title`
  47 |     * :py:attr:`first`
  48 |     * :py:attr:`middle`
  49 |     * :py:attr:`last`
  50 |     * :py:attr:`suffix`
  51 |     * :py:attr:`nickname`
  52 |     * :py:attr:`surnames`
  53 | 
  54 |     :param str full_name: The name string to be parsed.
  55 |     :param constants constants:
  56 |         a :py:class:`~nameparser.config.Constants` instance. Pass ``None`` for
  57 |         `per-instance config <customize.html>`_.
  58 |     :param str encoding: string representing the encoding of your input
  59 |     :param str string_format: python string formatting
  60 |     :param str initials_format: python initials string formatting
  61 |     :param str initials_delimiter: string delimiter for initials
  62 |     :param str first: first name
  63 |     :param str middle: middle name
  64 |     :param str last: last name
  65 |     :param str title: The title or prenominal
  66 |     :param str suffix: The suffix or postnominal
  67 |     :param str nickname: Nicknames
  68 |     """
  69 | 
  70 |     C = CONSTANTS
  71 |     """
  72 |     A reference to the configuration for this instance, which may or may not be
  73 |     a reference to the shared, module-wide instance at
  74 |     :py:mod:`~nameparser.config.CONSTANTS`. See `Customizing the Parser
  75 |     <customize.html>`_.
  76 |     """
  77 | 
  78 |     original = ''
  79 |     """
  80 |     The original string, untouched by the parser.
  81 |     """
  82 | 
  83 |     _count = 0
  84 |     _members = ['title', 'first', 'middle', 'last', 'suffix', 'nickname']
  85 |     unparsable = True
  86 |     _full_name = ''
  87 | 
  88 |     def __init__(self, full_name="", constants=CONSTANTS, encoding=DEFAULT_ENCODING,
  89 |                  string_format=None, initials_format=None, initials_delimiter=None,
  90 |                  first=None, middle=None, last=None, title=None, suffix=None,
  91 |                  nickname=None):
  92 |         self.C = constants
  93 |         if type(self.C) is not type(CONSTANTS):
  94 |             self.C = Constants()
  95 | 
  96 |         self.encoding = encoding
  97 |         self.string_format = string_format or self.C.string_format
  98 |         self.initials_format = initials_format or self.C.initials_format
  99 |         self.initials_delimiter = initials_delimiter or self.C.initials_delimiter
 100 |         if (first or middle or last or title or suffix or nickname):
 101 |             self.first = first
 102 |             self.middle = middle
 103 |             self.last = last
 104 |             self.title = title
 105 |             self.suffix = suffix
 106 |             self.nickname = nickname
 107 |             self.unparsable = False
 108 |         else:
 109 |             # full_name setter triggers the parse
 110 |             self.full_name = full_name
 111 | 
 112 |     def __iter__(self):
 113 |         return self
 114 | 
 115 |     def __len__(self):
 116 |         l = 0
 117 |         for x in self:
 118 |             l += 1
 119 |         return l
 120 | 
 121 |     def __eq__(self, other):
 122 |         """
 123 |         HumanName instances are equal to other objects whose
 124 |         lower case unicode representation is the same.
 125 |         """
 126 |         return (u(self)).lower() == (u(other)).lower()
 127 | 
 128 |     def __ne__(self, other):
 129 |         return not (u(self)).lower() == (u(other)).lower()
 130 | 
 131 |     def __getitem__(self, key):
 132 |         if isinstance(key, slice):
 133 |             return [getattr(self, x) for x in self._members[key]]
 134 |         else:
 135 |             return getattr(self, key)
 136 | 
 137 |     def __setitem__(self, key, value):
 138 |         if key in self._members:
 139 |             self._set_list(key, value)
 140 |         else:
 141 |             raise KeyError("Not a valid HumanName attribute", key)
 142 | 
 143 |     def next(self):
 144 |         return self.__next__()
 145 | 
 146 |     def __next__(self):
 147 |         if self._count >= len(self._members):
 148 |             self._count = 0
 149 |             raise StopIteration
 150 |         else:
 151 |             c = self._count
 152 |             self._count = c + 1
 153 |             return getattr(self, self._members[c]) or next(self)
 154 | 
 155 |     def __unicode__(self):
 156 |         if self.string_format:
 157 |             # string_format = "{title} {first} {middle} {last} {suffix} ({nickname})"
 158 |             _s = self.string_format.format(**self.as_dict())
 159 |             # remove trailing punctuation from missing nicknames
 160 |             _s = _s.replace(str(self.C.empty_attribute_default), '').replace(" ()", "").replace(" ''", "").replace(' ""', "")
 161 |             return self.collapse_whitespace(_s).strip(', ')
 162 |         return " ".join(self)
 163 | 
 164 |     def __hash__(self):
 165 |         return hash(str(self))
 166 | 
 167 |     def __str__(self):
 168 |         if sys.version_info[0] >= 3:
 169 |             return self.__unicode__()
 170 |         return self.__unicode__().encode(self.encoding)
 171 | 
 172 |     def __repr__(self):
 173 |         if self.unparsable:
 174 |             _string = "<%(class)s : [ Unparsable ] >" % {'class': self.__class__.__name__, }
 175 |         else:
 176 |             _string = "<%(class)s : [\n\ttitle: %(title)r \n\tfirst: %(first)r \n\tmiddle: %(middle)r \n\tlast: %(last)r \n\tsuffix: %(suffix)r\n\tnickname: %(nickname)r\n]>" % {
 177 |                 'class': self.__class__.__name__,
 178 |                 'title': self.title or '',
 179 |                 'first': self.first or '',
 180 |                 'middle': self.middle or '',
 181 |                 'last': self.last or '',
 182 |                 'suffix': self.suffix or '',
 183 |                 'nickname': self.nickname or '',
 184 |             }
 185 |         if sys.version_info[0] >= 3:
 186 |             return _string
 187 |         return _string.encode(self.encoding)
 188 | 
 189 |     def as_dict(self, include_empty=True):
 190 |         """
 191 |         Return the parsed name as a dictionary of its attributes.
 192 | 
 193 |         :param bool include_empty: Include keys in the dictionary for empty name attributes.
 194 |         :rtype: dict
 195 | 
 196 |         .. doctest::
 197 | 
 198 |             >>> name = HumanName("Bob Dole")
 199 |             >>> name.as_dict()
 200 |             {'last': 'Dole', 'suffix': '', 'title': '', 'middle': '', 'nickname': '', 'first': 'Bob'}
 201 |             >>> name.as_dict(False)
 202 |             {'last': 'Dole', 'first': 'Bob'}
 203 | 
 204 |         """
 205 |         d = {}
 206 |         for m in self._members:
 207 |             if include_empty:
 208 |                 d[m] = getattr(self, m)
 209 |             else:
 210 |                 val = getattr(self, m)
 211 |                 if val:
 212 |                     d[m] = val
 213 |         return d
 214 | 
 215 |     def __process_initial__(self, name_part, firstname=False):
 216 |         """
 217 |             Name parts may include prefixes or conjunctions. This function filters these from the name unless it is
 218 |             a first name, since first names cannot be conjunctions or prefixes.
 219 |         """
 220 |         parts = name_part.split(" ")
 221 |         initials = []
 222 |         if len(parts) and isinstance(parts, list):
 223 |             for part in parts:
 224 |                 if not (self.is_prefix(part) or self.is_conjunction(part)) or firstname == True:
 225 |                     initials.append(part[0])
 226 |         if len(initials) > 0:
 227 |             return " ".join(initials)
 228 |         else:
 229 |             return self.C.empty_attribute_default
 230 | 
 231 |     def initials_list(self):
 232 |         """
 233 |             Returns the initials as a list
 234 | 
 235 |             .. doctest::
 236 | 
 237 |                 >>> name = HumanName("Sir Bob Andrew Dole")
 238 |                 >>> name.initials_list()
 239 |                 ["B", "A", "D"]
 240 |                 >>> name = HumanName("J. Doe")
 241 |                 >>> name.initials_list()
 242 |                 ["J", "D"]
 243 |         """
 244 |         first_initials_list = [self.__process_initial__(name, True) for name in self.first_list if name]
 245 |         middle_initials_list = [self.__process_initial__(name) for name in self.middle_list if name]
 246 |         last_initials_list = [self.__process_initial__(name) for name in self.last_list if name]
 247 |         return first_initials_list + middle_initials_list + last_initials_list
 248 | 
 249 |     def initials(self):
 250 |         """
 251 |             Return period-delimited initials of the first, middle and optionally last name.
 252 | 
 253 |             :param bool include_last_name: Include the last name as part of the initials
 254 |             :rtype: str
 255 | 
 256 |             .. doctest::
 257 | 
 258 |                 >>> name = HumanName("Sir Bob Andrew Dole")
 259 |                 >>> name.initials()
 260 |                 "B. A. D."
 261 |                 >>> name = HumanName("Sir Bob Andrew Dole", initials_format="{first} {middle}")
 262 |                 >>> name.initials()
 263 |                 "B. A."
 264 |         """
 265 | 
 266 |         first_initials_list = [self.__process_initial__(name, True) for name in self.first_list if name]
 267 |         middle_initials_list = [self.__process_initial__(name) for name in self.middle_list if name]
 268 |         last_initials_list = [self.__process_initial__(name) for name in self.last_list if name]
 269 | 
 270 |         initials_dict = {
 271 |             "first":  (self.initials_delimiter + " ").join(first_initials_list) + self.initials_delimiter
 272 |             if len(first_initials_list) else self.C.empty_attribute_default,
 273 |             "middle": (self.initials_delimiter + " ").join(middle_initials_list) + self.initials_delimiter
 274 |             if len(middle_initials_list) else self.C.empty_attribute_default,
 275 |             "last": (self.initials_delimiter + " ").join(last_initials_list) + self.initials_delimiter
 276 |             if len(last_initials_list) else self.C.empty_attribute_default
 277 |         }
 278 | 
 279 |         _s = self.initials_format.format(**initials_dict)
 280 |         return self.collapse_whitespace(_s)
 281 | 
 282 |     @property
 283 |     def has_own_config(self):
 284 |         """
 285 |         True if this instance is not using the shared module-level
 286 |         configuration.
 287 |         """
 288 |         return self.C is not CONSTANTS
 289 | 
 290 |     # attributes
 291 | 
 292 |     @property
 293 |     def title(self):
 294 |         """
 295 |         The person's titles. Any string of consecutive pieces in
 296 |         :py:mod:`~nameparser.config.titles` or
 297 |         :py:mod:`~nameparser.config.conjunctions`
 298 |         at the beginning of :py:attr:`full_name`.
 299 |         """
 300 |         return " ".join(self.title_list) or self.C.empty_attribute_default
 301 | 
 302 |     @property
 303 |     def first(self):
 304 |         """
 305 |         The person's first name. The first name piece after any known
 306 |         :py:attr:`title` pieces parsed from :py:attr:`full_name`.
 307 |         """
 308 |         return " ".join(self.first_list) or self.C.empty_attribute_default
 309 | 
 310 |     @property
 311 |     def middle(self):
 312 |         """
 313 |         The person's middle names. All name pieces after the first name and
 314 |         before the last name parsed from :py:attr:`full_name`.
 315 |         """
 316 |         return " ".join(self.middle_list) or self.C.empty_attribute_default
 317 | 
 318 |     @property
 319 |     def last(self):
 320 |         """
 321 |         The person's last name. The last name piece parsed from
 322 |         :py:attr:`full_name`.
 323 |         """
 324 |         return " ".join(self.last_list) or self.C.empty_attribute_default
 325 | 
 326 |     @property
 327 |     def suffix(self):
 328 |         """
 329 |         The persons's suffixes. Pieces at the end of the name that are found in
 330 |         :py:mod:`~nameparser.config.suffixes`, or pieces that are at the end
 331 |         of comma separated formats, e.g.
 332 |         "Lastname, Title Firstname Middle[,] Suffix [, Suffix]" parsed
 333 |         from :py:attr:`full_name`.
 334 |         """
 335 |         return ", ".join(self.suffix_list) or self.C.empty_attribute_default
 336 | 
 337 |     @property
 338 |     def nickname(self):
 339 |         """
 340 |         The person's nicknames. Any text found inside of quotes (``""``) or
 341 |         parenthesis (``()``)
 342 |         """
 343 |         return " ".join(self.nickname_list) or self.C.empty_attribute_default
 344 | 
 345 |     @property
 346 |     def surnames_list(self):
 347 |         """
 348 |         List of middle names followed by last name.
 349 |         """
 350 |         return self.middle_list + self.last_list
 351 | 
 352 |     @property
 353 |     def surnames(self):
 354 |         """
 355 |         A string of all middle names followed by the last name.
 356 |         """
 357 |         return " ".join(self.surnames_list) or self.C.empty_attribute_default
 358 | 
 359 |     # setter methods
 360 | 
 361 |     def _set_list(self, attr, value):
 362 |         if isinstance(value, list):
 363 |             val = value
 364 |         elif isinstance(value, text_types):
 365 |             val = [value]
 366 |         elif value is None:
 367 |             val = []
 368 |         else:
 369 |             raise TypeError(
 370 |                 "Can only assign strings, lists or None to name attributes."
 371 |                 " Got {0}".format(type(value)))
 372 |         setattr(self, attr+"_list", self.parse_pieces(val))
 373 | 
 374 |     @title.setter
 375 |     def title(self, value):
 376 |         self._set_list('title', value)
 377 | 
 378 |     @first.setter
 379 |     def first(self, value):
 380 |         self._set_list('first', value)
 381 | 
 382 |     @middle.setter
 383 |     def middle(self, value):
 384 |         self._set_list('middle', value)
 385 | 
 386 |     @last.setter
 387 |     def last(self, value):
 388 |         self._set_list('last', value)
 389 | 
 390 |     @suffix.setter
 391 |     def suffix(self, value):
 392 |         self._set_list('suffix', value)
 393 | 
 394 |     @nickname.setter
 395 |     def nickname(self, value):
 396 |         self._set_list('nickname', value)
 397 | 
 398 |     # Parse helpers
 399 | 
 400 |     def is_title(self, value):
 401 |         """Is in the :py:data:`~nameparser.config.titles.TITLES` set."""
 402 |         return lc(value) in self.C.titles
 403 | 
 404 |     def is_conjunction(self, piece):
 405 |         """Is in the conjunctions set and not :py:func:`is_an_initial()`."""
 406 |         if isinstance(piece, list):
 407 |             for item in piece:
 408 |                 if self.is_conjunction(item):
 409 |                     return True
 410 |         else:
 411 |             return piece.lower() in self.C.conjunctions and not self.is_an_initial(piece)
 412 | 
 413 |     def is_prefix(self, piece):
 414 |         """
 415 |         Lowercase and no periods version of piece is in the
 416 |         :py:data:`~nameparser.config.prefixes.PREFIXES` set.
 417 |         """
 418 |         if isinstance(piece, list):
 419 |             for item in piece:
 420 |                 if self.is_prefix(item):
 421 |                     return True
 422 |         else:
 423 |             return lc(piece) in self.C.prefixes
 424 | 
 425 |     def is_roman_numeral(self, value):
 426 |         """
 427 |         Matches the ``roman_numeral`` regular expression in
 428 |         :py:data:`~nameparser.config.regexes.REGEXES`.
 429 |         """
 430 |         return bool(self.C.regexes.roman_numeral.match(value))
 431 | 
 432 |     def is_suffix(self, piece):
 433 |         """
 434 |         Is in the suffixes set and not :py:func:`is_an_initial()`.
 435 | 
 436 |         Some suffixes may be acronyms (M.B.A) while some are not (Jr.),
 437 |         so we remove the periods from `piece` when testing against
 438 |         `C.suffix_acronyms`.
 439 |         """
 440 |         # suffixes may have periods inside them like "M.D."
 441 |         if isinstance(piece, list):
 442 |             for item in piece:
 443 |                 if self.is_suffix(item):
 444 |                     return True
 445 |         else:
 446 |             return ((lc(piece).replace('.', '') in self.C.suffix_acronyms)
 447 |                     or (lc(piece) in self.C.suffix_not_acronyms)) \
 448 |                 and not self.is_an_initial(piece)
 449 | 
 450 |     def are_suffixes(self, pieces):
 451 |         """Return True if all pieces are suffixes."""
 452 |         for piece in pieces:
 453 |             if not self.is_suffix(piece):
 454 |                 return False
 455 |         return True
 456 | 
 457 |     def is_rootname(self, piece):
 458 |         """
 459 |         Is not a known title, suffix or prefix. Just first, middle, last names.
 460 |         """
 461 |         return lc(piece) not in self.C.suffixes_prefixes_titles \
 462 |             and not self.is_an_initial(piece)
 463 | 
 464 |     def is_an_initial(self, value):
 465 |         """
 466 |         Words with a single period at the end, or a single uppercase letter.
 467 | 
 468 |         Matches the ``initial`` regular expression in
 469 |         :py:data:`~nameparser.config.regexes.REGEXES`.
 470 |         """
 471 |         return bool(self.C.regexes.initial.match(value))
 472 | 
 473 |     # full_name parser
 474 | 
 475 |     @property
 476 |     def full_name(self):
 477 |         """The string output of the HumanName instance."""
 478 |         return self.__str__()
 479 | 
 480 |     @full_name.setter
 481 |     def full_name(self, value):
 482 |         self.original = value
 483 |         self._full_name = value
 484 |         if isinstance(value, binary_type):
 485 |             self._full_name = value.decode(self.encoding)
 486 |         self.parse_full_name()
 487 | 
 488 |     def collapse_whitespace(self, string):
 489 |         # collapse multiple spaces into single space
 490 |         string = self.C.regexes.spaces.sub(" ", string.strip())
 491 |         if string.endswith(","):
 492 |             string = string[:-1]
 493 |         return string
 494 | 
 495 |     def pre_process(self):
 496 |         """
 497 | 
 498 |         This method happens at the beginning of the :py:func:`parse_full_name`
 499 |         before any other processing of the string aside from unicode
 500 |         normalization, so it's a good place to do any custom handling in a
 501 |         subclass. Runs :py:func:`parse_nicknames` and :py:func:`squash_emoji`.
 502 | 
 503 |         """
 504 |         self.fix_phd()
 505 |         self.parse_nicknames()
 506 |         self.squash_emoji()
 507 | 
 508 |     def post_process(self):
 509 |         """
 510 |         This happens at the end of the :py:func:`parse_full_name` after
 511 |         all other processing has taken place. Runs :py:func:`handle_firstnames`
 512 |         and :py:func:`handle_capitalization`.
 513 |         """
 514 |         self.handle_firstnames()
 515 |         self.handle_capitalization()
 516 | 
 517 |     def fix_phd(self):
 518 |         try:
 519 |             _re = self.C.regexes.phd
 520 |             match = _re.search(self._full_name)
 521 |             if match:
 522 |                 self.suffix_list.append(match.group(1))
 523 |                 self._full_name = _re.sub('', self._full_name)
 524 |         except AttributeError:
 525 |             pass
 526 | 
 527 |     def parse_nicknames(self):
 528 |         """
 529 |         The content of parenthesis or quotes in the name will be added to the
 530 |         nicknames list. This happens before any other processing of the name.
 531 | 
 532 |         Single quotes cannot span white space characters and must border
 533 |         white space to allow for quotes in names like O'Connor and Kawai'ae'a.
 534 |         Double quotes and parenthesis can span white space.
 535 | 
 536 |         Loops through 3 :py:data:`~nameparser.config.regexes.REGEXES`;
 537 |         `quoted_word`, `double_quotes` and `parenthesis`.
 538 |         """
 539 | 
 540 |         empty_re = re.compile("")
 541 | 
 542 |         re_quoted_word = self.C.regexes.quoted_word or empty_re
 543 |         re_double_quotes = self.C.regexes.double_quotes or empty_re
 544 |         re_parenthesis = self.C.regexes.parenthesis or empty_re
 545 | 
 546 |         for _re in (re_quoted_word, re_double_quotes, re_parenthesis):
 547 |             if _re.search(self._full_name):
 548 |                 self.nickname_list += [x for x in _re.findall(self._full_name)]
 549 |                 self._full_name = _re.sub('', self._full_name)
 550 | 
 551 |     def squash_emoji(self):
 552 |         """
 553 |         Remove emoji from the input string.
 554 |         """
 555 |         re_emoji = self.C.regexes.emoji
 556 |         if re_emoji and re_emoji.search(self._full_name):
 557 |             self._full_name = re_emoji.sub('', self._full_name)
 558 | 
 559 |     def handle_firstnames(self):
 560 |         """
 561 |         If there are only two parts and one is a title, assume it's a last name
 562 |         instead of a first name. e.g. Mr. Johnson. Unless it's a special title
 563 |         like "Sir", then when it's followed by a single name that name is always
 564 |         a first name.
 565 |         """
 566 |         if self.title \
 567 |                 and len(self) == 2 \
 568 |                 and not lc(self.title) in self.C.first_name_titles:
 569 |             self.last, self.first = self.first, self.last
 570 | 
 571 |     def parse_full_name(self):
 572 |         """
 573 | 
 574 |         The main parse method for the parser. This method is run upon
 575 |         assignment to the :py:attr:`full_name` attribute or instantiation.
 576 | 
 577 |         Basic flow is to hand off to :py:func:`pre_process` to handle
 578 |         nicknames. It then splits on commas and chooses a code path depending
 579 |         on the number of commas.
 580 | 
 581 |         :py:func:`parse_pieces` then splits those parts on spaces and
 582 |         :py:func:`join_on_conjunctions` joins any pieces next to conjunctions.
 583 |         """
 584 | 
 585 |         self.title_list = []
 586 |         self.first_list = []
 587 |         self.middle_list = []
 588 |         self.last_list = []
 589 |         self.suffix_list = []
 590 |         self.nickname_list = []
 591 |         self.unparsable = True
 592 | 
 593 |         self.pre_process()
 594 | 
 595 |         self._full_name = self.collapse_whitespace(self._full_name)
 596 | 
 597 |         # break up full_name by commas
 598 |         parts = [x.strip() for x in self._full_name.split(",")]
 599 | 
 600 |         log.debug("full_name: %s", self._full_name)
 601 |         log.debug("parts: %s", parts)
 602 | 
 603 |         if len(parts) == 1:
 604 | 
 605 |             # no commas, title first middle middle middle last suffix
 606 |             #            part[0]
 607 | 
 608 |             pieces = self.parse_pieces(parts)
 609 |             p_len = len(pieces)
 610 |             for i, piece in enumerate(pieces):
 611 |                 try:
 612 |                     nxt = pieces[i + 1]
 613 |                 except IndexError:
 614 |                     nxt = None
 615 | 
 616 |                 # title must have a next piece, unless it's just a title
 617 |                 if not self.first \
 618 |                         and (nxt or p_len == 1) \
 619 |                         and self.is_title(piece):
 620 |                     self.title_list.append(piece)
 621 |                     continue
 622 |                 if not self.first:
 623 |                     if p_len == 1 and self.nickname:
 624 |                         self.last_list.append(piece)
 625 |                         continue
 626 |                     self.first_list.append(piece)
 627 |                     continue
 628 |                 if self.are_suffixes(pieces[i+1:]) or \
 629 |                         (
 630 |                             # if the next piece is the last piece and a roman
 631 |                             # numeral but this piece is not an initial
 632 |                             self.is_roman_numeral(nxt) and i == p_len - 2
 633 |                             and not self.is_an_initial(piece)
 634 |                 ):
 635 |                     self.last_list.append(piece)
 636 |                     self.suffix_list += pieces[i+1:]
 637 |                     break
 638 |                 if not nxt:
 639 |                     self.last_list.append(piece)
 640 |                     continue
 641 | 
 642 |                 self.middle_list.append(piece)
 643 |         else:
 644 |             # if all the end parts are suffixes and there is more than one piece
 645 |             # in the first part. (Suffixes will never appear after last names
 646 |             # only, and allows potential first names to be in suffixes, e.g.
 647 |             # "Johnson, Bart"
 648 | 
 649 |             post_comma_pieces = self.parse_pieces(parts[1].split(' '), 1)
 650 | 
 651 |             if self.are_suffixes(parts[1].split(' ')) \
 652 |                     and len(parts[0].split(' ')) > 1:
 653 | 
 654 |                 # suffix comma:
 655 |                 # title first middle last [suffix], suffix [suffix] [, suffix]
 656 |                 #               parts[0],          parts[1:...]
 657 | 
 658 |                 self.suffix_list += parts[1:]
 659 |                 pieces = self.parse_pieces(parts[0].split(' '))
 660 |                 log.debug("pieces: %s", u(pieces))
 661 |                 for i, piece in enumerate(pieces):
 662 |                     try:
 663 |                         nxt = pieces[i + 1]
 664 |                     except IndexError:
 665 |                         nxt = None
 666 | 
 667 |                     if not self.first \
 668 |                             and (nxt or len(pieces) == 1) \
 669 |                             and self.is_title(piece):
 670 |                         self.title_list.append(piece)
 671 |                         continue
 672 |                     if not self.first:
 673 |                         self.first_list.append(piece)
 674 |                         continue
 675 |                     if self.are_suffixes(pieces[i+1:]):
 676 |                         self.last_list.append(piece)
 677 |                         self.suffix_list = pieces[i+1:] + self.suffix_list
 678 |                         break
 679 |                     if not nxt:
 680 |                         self.last_list.append(piece)
 681 |                         continue
 682 |                     self.middle_list.append(piece)
 683 |             else:
 684 | 
 685 |                 # lastname comma:
 686 |                 # last [suffix], title first middles[,] suffix [,suffix]
 687 |                 #      parts[0],      parts[1],              parts[2:...]
 688 | 
 689 |                 log.debug("post-comma pieces: %s", u(post_comma_pieces))
 690 | 
 691 |                 # lastname part may have suffixes in it
 692 |                 lastname_pieces = self.parse_pieces(parts[0].split(' '), 1)
 693 |                 for piece in lastname_pieces:
 694 |                     # the first one is always a last name, even if it looks like
 695 |                     # a suffix
 696 |                     if self.is_suffix(piece) and len(self.last_list) > 0:
 697 |                         self.suffix_list.append(piece)
 698 |                     else:
 699 |                         self.last_list.append(piece)
 700 | 
 701 |                 for i, piece in enumerate(post_comma_pieces):
 702 |                     try:
 703 |                         nxt = post_comma_pieces[i + 1]
 704 |                     except IndexError:
 705 |                         nxt = None
 706 | 
 707 |                     if not self.first \
 708 |                             and (nxt or len(post_comma_pieces) == 1) \
 709 |                             and self.is_title(piece):
 710 |                         self.title_list.append(piece)
 711 |                         continue
 712 |                     if not self.first:
 713 |                         self.first_list.append(piece)
 714 |                         continue
 715 |                     if self.is_suffix(piece):
 716 |                         self.suffix_list.append(piece)
 717 |                         continue
 718 |                     self.middle_list.append(piece)
 719 |                 try:
 720 |                     if parts[2]:
 721 |                         self.suffix_list += parts[2:]
 722 |                 except IndexError:
 723 |                     pass
 724 | 
 725 |         if len(self) < 0:
 726 |             log.info("Unparsable: \"%s\" ", self.original)
 727 |         else:
 728 |             self.unparsable = False
 729 |         self.post_process()
 730 | 
 731 |     def parse_pieces(self, parts, additional_parts_count=0):
 732 |         """
 733 |         Split parts on spaces and remove commas, join on conjunctions and
 734 |         lastname prefixes. If parts have periods in the middle, try splitting
 735 |         on periods and check if the parts are titles or suffixes. If they are
 736 |         add to the constant so they will be found.
 737 | 
 738 |         :param list parts: name part strings from the comma split
 739 |         :param int additional_parts_count:
 740 | 
 741 |             if the comma format contains other parts, we need to know
 742 |             how many there are to decide if things should be considered a
 743 |             conjunction.
 744 |         :return: pieces split on spaces and joined on conjunctions
 745 |         :rtype: list
 746 |         """
 747 | 
 748 |         output = []
 749 |         for part in parts:
 750 |             if not isinstance(part, text_types):
 751 |                 raise TypeError("Name parts must be strings. "
 752 |                                 "Got {0}".format(type(part)))
 753 |             output += [x.strip(' ,') for x in part.split(' ')]
 754 | 
 755 |         # If part contains periods, check if it's multiple titles or suffixes
 756 |         # together without spaces if so, add the new part with periods to the
 757 |         # constants so they get parsed correctly later
 758 |         for part in output:
 759 |             # if this part has a period not at the beginning or end
 760 |             if self.C.regexes.period_not_at_end and self.C.regexes.period_not_at_end.match(part):
 761 |                 # split on periods, any of the split pieces titles or suffixes?
 762 |                 # ("Lt.Gov.")
 763 |                 period_chunks = part.split(".")
 764 |                 titles = list(filter(self.is_title,  period_chunks))
 765 |                 suffixes = list(filter(self.is_suffix, period_chunks))
 766 | 
 767 |                 # add the part to the constant so it will be found
 768 |                 if len(list(titles)):
 769 |                     self.C.titles.add(part)
 770 |                     continue
 771 |                 if len(list(suffixes)):
 772 |                     self.C.suffix_not_acronyms.add(part)
 773 |                     continue
 774 | 
 775 |         return self.join_on_conjunctions(output, additional_parts_count)
 776 | 
 777 |     def join_on_conjunctions(self, pieces, additional_parts_count=0):
 778 |         """
 779 |         Join conjunctions to surrounding pieces. Title- and prefix-aware. e.g.:
 780 | 
 781 |             ['Mr.', 'and'. 'Mrs.', 'John', 'Doe'] ==>
 782 |                             ['Mr. and Mrs.', 'John', 'Doe']
 783 | 
 784 |             ['The', 'Secretary', 'of', 'State', 'Hillary', 'Clinton'] ==>
 785 |                             ['The Secretary of State', 'Hillary', 'Clinton']
 786 | 
 787 |         When joining titles, saves newly formed piece to the instance's titles
 788 |         constant so they will be parsed correctly later. E.g. after parsing the
 789 |         example names above, 'The Secretary of State' and 'Mr. and Mrs.' would
 790 |         be present in the titles constant set.
 791 | 
 792 |         :param list pieces: name pieces strings after split on spaces
 793 |         :param int additional_parts_count:
 794 |         :return: new list with piece next to conjunctions merged into one piece
 795 |             with spaces in it.
 796 |         :rtype: list
 797 | 
 798 |         """
 799 |         length = len(pieces) + additional_parts_count
 800 |         # don't join on conjunctions if there's only 2 parts
 801 |         if length < 3:
 802 |             return pieces
 803 | 
 804 |         rootname_pieces = [p for p in pieces if self.is_rootname(p)]
 805 |         total_length = len(rootname_pieces) + additional_parts_count
 806 | 
 807 |         # find all the conjunctions, join any conjunctions that are next to each
 808 |         # other, then join those newly joined conjunctions and any single
 809 |         # conjunctions to the piece before and after it
 810 |         conj_index = [i for i, piece in enumerate(pieces)
 811 |                       if self.is_conjunction(piece)]
 812 | 
 813 |         contiguous_conj_i = []
 814 |         for i, val in enumerate(conj_index):
 815 |             try:
 816 |                 if conj_index[i+1] == val+1:
 817 |                     contiguous_conj_i += [val]
 818 |             except IndexError:
 819 |                 pass
 820 | 
 821 |         contiguous_conj_i = group_contiguous_integers(conj_index)
 822 | 
 823 |         delete_i = []
 824 |         for i in contiguous_conj_i:
 825 |             if type(i) == tuple:
 826 |                 new_piece = " ".join(pieces[i[0]: i[1]+1])
 827 |                 delete_i += list(range(i[0]+1, i[1]+1))
 828 |                 pieces[i[0]] = new_piece
 829 |             else:
 830 |                 new_piece = " ".join(pieces[i: i+2])
 831 |                 delete_i += [i+1]
 832 |                 pieces[i] = new_piece
 833 |             # add newly joined conjunctions to constants to be found later
 834 |             self.C.conjunctions.add(new_piece)
 835 | 
 836 |         for i in reversed(delete_i):
 837 |             # delete pieces in reverse order or the index changes on each delete
 838 |             del pieces[i]
 839 | 
 840 |         if len(pieces) == 1:
 841 |             # if there's only one piece left, nothing left to do
 842 |             return pieces
 843 | 
 844 |         # refresh conjunction index locations
 845 |         conj_index = [i for i, piece in enumerate(pieces) if self.is_conjunction(piece)]
 846 | 
 847 |         for i in conj_index:
 848 |             if len(pieces[i]) == 1 and total_length < 4:
 849 |                 # if there are only 3 total parts (minus known titles, suffixes
 850 |                 # and prefixes) and this conjunction is a single letter, prefer
 851 |                 # treating it as an initial rather than a conjunction.
 852 |                 # http://code.google.com/p/python-nameparser/issues/detail?id=11
 853 |                 continue
 854 | 
 855 |             if i == 0:
 856 |                 new_piece = " ".join(pieces[i:i+2])
 857 |                 if self.is_title(pieces[i+1]):
 858 |                     # when joining to a title, make new_piece a title too
 859 |                     self.C.titles.add(new_piece)
 860 |                 pieces[i] = new_piece
 861 |                 pieces.pop(i+1)
 862 |                 # subtract 1 from the index of all the remaining conjunctions
 863 |                 for j, val in enumerate(conj_index):
 864 |                     if val > i:
 865 |                         conj_index[j] = val-1
 866 | 
 867 |             else:
 868 |                 new_piece = " ".join(pieces[i-1:i+2])
 869 |                 if self.is_title(pieces[i-1]):
 870 |                     # when joining to a title, make new_piece a title too
 871 |                     self.C.titles.add(new_piece)
 872 |                 pieces[i-1] = new_piece
 873 |                 pieces.pop(i)
 874 |                 rm_count = 2
 875 |                 try:
 876 |                     pieces.pop(i)
 877 |                 except IndexError:
 878 |                     rm_count = 1
 879 | 
 880 |                 # subtract the number of removed pieces from the index
 881 |                 # of all the remaining conjunctions
 882 |                 for j, val in enumerate(conj_index):
 883 |                     if val > i:
 884 |                         conj_index[j] = val - rm_count
 885 | 
 886 |         # join prefixes to following lastnames: ['de la Vega'], ['van Buren']
 887 |         prefixes = list(filter(self.is_prefix, pieces))
 888 |         if prefixes:
 889 |             for prefix in prefixes:
 890 |                 try:
 891 |                     i = pieces.index(prefix)
 892 |                 except ValueError:
 893 |                     # If the prefix is no longer in pieces, it's because it has been
 894 |                     # combined with the prefix that appears right before (or before that when
 895 |                     # chained together) in the last loop, so the index of that newly created
 896 |                     # piece is the same as in the last loop, i==i still, and we want to join
 897 |                     # it to the next piece.
 898 |                     pass
 899 | 
 900 |                 new_piece = ''
 901 | 
 902 |                 # join everything after the prefix until the next prefix or suffix
 903 | 
 904 |                 try:
 905 |                     if i == 0 and total_length >= 1:
 906 |                         # If it's the first piece and there are more than 1 rootnames, assume it's a first name
 907 |                         continue
 908 |                     next_prefix = next(iter(filter(self.is_prefix, pieces[i + 1:])))
 909 |                     j = pieces.index(next_prefix, i + 1)
 910 |                     if j == i + 1:
 911 |                         # if there are two prefixes in sequence, join to the following piece
 912 |                         j += 1
 913 |                     new_piece = ' '.join(pieces[i:j])
 914 |                     pieces = pieces[:i] + [new_piece] + pieces[j:]
 915 |                 except StopIteration:
 916 |                     try:
 917 |                         # if there are no more prefixes, look for a suffix to stop at
 918 |                         stop_at = next(iter(filter(self.is_suffix, pieces[i + 1:])))
 919 |                         j = pieces.index(stop_at)
 920 |                         new_piece = ' '.join(pieces[i:j])
 921 |                         pieces = pieces[:i] + [new_piece] + pieces[j:]
 922 |                     except StopIteration:
 923 |                         # if there were no suffixes, nothing to stop at so join all
 924 |                         # remaining pieces
 925 |                         new_piece = ' '.join(pieces[i:])
 926 |                         pieces = pieces[:i] + [new_piece]
 927 | 
 928 |         log.debug("pieces: %s", pieces)
 929 |         return pieces
 930 | 
 931 |     # Capitalization Support
 932 | 
 933 |     def cap_word(self, word, attribute):
 934 |         if (self.is_prefix(word) and attribute in ('last', 'middle')) \
 935 |                 or self.is_conjunction(word):
 936 |             return word.lower()
 937 |         exceptions = self.C.capitalization_exceptions
 938 |         if lc(word) in exceptions:
 939 |             return exceptions[lc(word)]
 940 |         mac_match = self.C.regexes.mac.match(word)
 941 |         if mac_match:
 942 |             def cap_after_mac(m):
 943 |                 return m.group(1).capitalize() + m.group(2).capitalize()
 944 |             return self.C.regexes.mac.sub(cap_after_mac, word)
 945 |         else:
 946 |             return word.capitalize()
 947 | 
 948 |     def cap_piece(self, piece, attribute):
 949 |         if not piece:
 950 |             return ""
 951 | 
 952 |         def replacement(m): return self.cap_word(m.group(0), attribute)
 953 |         return self.C.regexes.word.sub(replacement, piece)
 954 | 
 955 |     def capitalize(self, force=None):
 956 |         """
 957 |         The HumanName class can try to guess the correct capitalization of name
 958 |         entered in all upper or lower case. By default, it will not adjust the
 959 |         case of names entered in mixed case. To run capitalization on all names
 960 |         pass the parameter `force=True`.
 961 | 
 962 |         :param bool force: Forces capitalization of mixed case strings. This
 963 |             parameter overrides rules set within
 964 |             :py:class:`~nameparser.config.CONSTANTS`.
 965 | 
 966 |         **Usage**
 967 | 
 968 |         .. doctest:: capitalize
 969 | 
 970 |             >>> name = HumanName('bob v. de la macdole-eisenhower phd')
 971 |             >>> name.capitalize()
 972 |             >>> str(name)
 973 |             'Bob V. de la MacDole-Eisenhower Ph.D.'
 974 |             >>> # Don't touch good names
 975 |             >>> name = HumanName('Shirley Maclaine')
 976 |             >>> name.capitalize()
 977 |             >>> str(name)
 978 |             'Shirley Maclaine'
 979 |             >>> name.capitalize(force=True)
 980 |             >>> str(name)
 981 |             'Shirley MacLaine'
 982 | 
 983 |         """
 984 |         name = u(self)
 985 |         force = self.C.force_mixed_case_capitalization \
 986 |             if force is None else force
 987 | 
 988 |         if not force and not (name == name.upper() or name == name.lower()):
 989 |             return
 990 |         self.title_list = self.cap_piece(self.title, 'title').split(' ')
 991 |         self.first_list = self.cap_piece(self.first, 'first').split(' ')
 992 |         self.middle_list = self.cap_piece(self.middle, 'middle').split(' ')
 993 |         self.last_list = self.cap_piece(self.last, 'last').split(' ')
 994 |         self.suffix_list = self.cap_piece(self.suffix, 'suffix').split(', ')
 995 | 
 996 |     def handle_capitalization(self):
 997 |         """
 998 |         Handles capitalization configurations set within
 999 |         :py:class:`~nameparser.config.CONSTANTS`.
1000 |         """
1001 |         if self.C.capitalize_name:
1002 |             self.capitalize()
1003 | 


--------------------------------------------------------------------------------