├── AUTHORS ├── dev-requirements.txt ├── setup.cfg ├── MANIFEST.in ├── .tm_properties ├── .gitignore ├── nameparser ├── __init__.py ├── config │ ├── capitalization.py │ ├── conjunctions.py │ ├── prefixes.py │ ├── regexes.py │ ├── __init__.py │ ├── suffixes.py │ └── titles.py ├── util.py └── parser.py ├── docs ├── contributing.rst ├── modules.rst ├── resources.rst ├── index.rst ├── Makefile ├── usage.rst ├── release_log.rst ├── conf.py └── customize.rst ├── .editorconfig ├── .travis.yml ├── LICENSE ├── .github └── workflows │ ├── python-publish.yml │ └── python-package.yml ├── setup.py ├── CONTRIBUTING.md └── README.rst /AUTHORS: -------------------------------------------------------------------------------- 1 | Derek Gulbranson 2 | -------------------------------------------------------------------------------- /dev-requirements.txt: -------------------------------------------------------------------------------- 1 | dill>=0.2.5 2 | Sphinx 3 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal = 1 3 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include AUTHORS 2 | include LICENSE 3 | include README.rst 4 | include tests.py 5 | -------------------------------------------------------------------------------- /.tm_properties: -------------------------------------------------------------------------------- 1 | excludeDirectories = "{$excludeDirectories,dist,*.egg-info,build,docs/_*}" 2 | include = "{$include,.gitignore,.hgignore,.travis.yml}" 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.hgrc 2 | *.DS_Store 3 | __pycache__/ 4 | *.py[cod] 5 | 
.python2/ 6 | MANIFEST 7 | nameparser.egg-info/ 8 | build 9 | *.egg 10 | .coverage 11 | dist 12 | .idea 13 | Pipfile 14 | Pipfile.lock 15 | 16 | # docs 17 | docs/_* 18 | -------------------------------------------------------------------------------- /nameparser/__init__.py: -------------------------------------------------------------------------------- 1 | VERSION = (1, 1, 3) 2 | __version__ = '.'.join(map(str, VERSION)) 3 | __author__ = "Derek Gulbranson" 4 | __author_email__ = 'derek73@gmail.com' 5 | __license__ = "LGPL" 6 | __url__ = "https://github.com/derek73/python-nameparser" 7 | 8 | 9 | from nameparser.parser import HumanName 10 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | Contributing 2 | ============ 3 | 4 | The project is hosted on GitHub: 5 | 6 | https://github.com/derek73/python-nameparser 7 | 8 | Find more information about running tests and contributing to the project in the project's contribution guide. 9 | 10 | https://github.com/derek73/python-nameparser/blob/master/CONTRIBUTING.md 11 | 12 | -------------------------------------------------------------------------------- /nameparser/config/capitalization.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | CAPITALIZATION_EXCEPTIONS = ( 5 | ('ii', 'II'), 6 | ('iii', 'III'), 7 | ('iv', 'IV'), 8 | ('md', 'M.D.'), 9 | ('phd', 'Ph.D.'), 10 | ) 11 | """ 12 | Any pieces that are not capitalized by capitalizing the first letter. 
13 | """ -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # http://editorconfig.org 2 | 3 | root = true 4 | 5 | [*] 6 | charset = utf-8 7 | end_of_line = lf 8 | insert_final_newline = true 9 | trim_trailing_whitespace = true 10 | 11 | [*.{py,rst,ini}] 12 | indent_style = space 13 | indent_size = 4 14 | 15 | [*.{html,json,yml}] 16 | indent_style = space 17 | indent_size = 2 18 | 19 | [*.md] 20 | trim_trailing_whitespace = false 21 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | - "3.4" 5 | - "3.5" 6 | - "3.6" 7 | - "3.7" 8 | - "3.8" 9 | # command to install dependencies 10 | install: 11 | - if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install unittest2; fi 12 | - if [[ $TRAVIS_PYTHON_VERSION -ne '3.4' ]]; then pip install dill; fi 13 | - "python setup.py install" 14 | # command to run tests 15 | script: python tests.py 16 | sudo: false 17 | -------------------------------------------------------------------------------- /nameparser/config/conjunctions.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | CONJUNCTIONS = set([ 5 | '&', 6 | 'and', 7 | 'et', 8 | 'e', 9 | 'of', 10 | 'the', 11 | 'und', 12 | 'y', 13 | ]) 14 | """ 15 | Pieces that should join to their neighboring pieces, e.g. "and", "y" and "&". 16 | "of" and "the" are also included to facilitate joining multiple titles, 17 | e.g. "President of the United States". 18 | """ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Derek Gulbranson . 
2 | http://derekgulbranson.com/ 3 | 4 | ----- 5 | 6 | LGPL-2.1+ 7 | http://www.opensource.org/licenses/lgpl-license.html 8 | 9 | This library is free software; you can redistribute it and/or modify it under the 10 | terms of the GNU Lesser General Public License as published by the Free Software 11 | Foundation; either version 2.1 of the License, or (at your option) any later 12 | version. 13 | 14 | This library is distributed in the hope that it will be useful, but WITHOUT ANY 15 | WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 16 | PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 17 | -------------------------------------------------------------------------------- /docs/modules.rst: -------------------------------------------------------------------------------- 1 | HumanName Class Documentation 2 | ============================== 3 | 4 | HumanName.parser 5 | ---------------- 6 | 7 | .. py:module:: nameparser.parser 8 | 9 | .. py:class:: HumanName 10 | :noindex: 11 | 12 | .. autoclass:: HumanName 13 | :members: 14 | :special-members: __eq__, __init__ 15 | 16 | HumanName.config 17 | ---------------- 18 | 19 | .. automodule:: nameparser.config 20 | :members: 21 | 22 | HumanName.config Defaults 23 | ------------------------- 24 | 25 | 26 | .. automodule:: nameparser.config.titles 27 | :members: 28 | .. automodule:: nameparser.config.suffixes 29 | :members: 30 | .. automodule:: nameparser.config.prefixes 31 | :members: 32 | .. automodule:: nameparser.config.conjunctions 33 | :members: 34 | .. automodule:: nameparser.config.capitalization 35 | :members: 36 | .. 
automodule:: nameparser.config.regexes 37 | :members: 38 | -------------------------------------------------------------------------------- /nameparser/util.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | # http://code.google.com/p/python-nameparser/issues/detail?id=10 4 | log = logging.getLogger('HumanName') 5 | try: 6 | log.addHandler(logging.NullHandler()) 7 | except AttributeError: 8 | class NullHandler(logging.Handler): 9 | def emit(self, record): 10 | pass 11 | log.addHandler(NullHandler()) 12 | log.setLevel(logging.ERROR) 13 | 14 | 15 | import sys 16 | if sys.version_info[0] < 3: 17 | 18 | text_type = unicode 19 | binary_type = str 20 | 21 | def u(x, encoding=None): 22 | if encoding: 23 | return unicode(x, encoding) 24 | else: 25 | return unicode(x) 26 | 27 | else: 28 | text_type = str 29 | binary_type = bytes 30 | 31 | def u(x, encoding=None): 32 | return text_type(x) 33 | 34 | text_types = (text_type, binary_type) 35 | def lc(value): 36 | """Lower case and remove any periods to normalize for comparison.""" 37 | if not value: 38 | return '' 39 | return value.lower().strip('.') 40 | -------------------------------------------------------------------------------- /docs/resources.rst: -------------------------------------------------------------------------------- 1 | Naming Practices and Resources 2 | ============================== 3 | 4 | * US_Census_Surname_Data_2000_ 5 | * US_Social_Security_Administration_Baby_Names_Index_ 6 | * Naming_practice_guide_UK_2006_ 7 | * Wikipedia_Anthroponymy_ 8 | * Wikipedia_Naming_conventions_ 9 | * Wikipedia_List_Of_Titles_ 10 | * Tussenvoegsel_ 11 | * Family_Name_Affixes_ 12 | 13 | .. _US_Census_Surname_Data_2000: https://www.census.gov/data/developers/data-sets/surnames/2000.html 14 | .. _US_Social_Security_Administration_Baby_Names_Index: https://www.ssa.gov/oact/babynames/limits.html 15 | .. 
_Naming_practice_guide_UK_2006: https://www.fbiic.gov/public/2008/nov/Naming_practice_guide_UK_2006.pdf 16 | .. _Wikipedia_Anthroponymy: https://en.wikipedia.org/wiki/Anthroponymy 17 | .. _Wikipedia_Naming_conventions: http://en.wikipedia.org/wiki/Wikipedia:Naming_conventions_(people) 18 | .. _Wikipedia_List_Of_Titles: https://en.wikipedia.org/wiki/Title 19 | .. _Tussenvoegsel: https://en.wikipedia.org/wiki/Tussenvoegsel 20 | .. _Family_Name_Affixes : https://en.wikipedia.org/wiki/List_of_family_name_affixes 21 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 
8 | 9 | name: Upload Published Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | jobs: 16 | deploy: 17 | 18 | runs-on: ubuntu-latest 19 | 20 | steps: 21 | - uses: actions/checkout@v2 22 | - name: Set up Python 23 | uses: actions/setup-python@v2 24 | with: 25 | python-version: '3.x' 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install build 30 | - name: Build package 31 | run: python -m build 32 | - name: Publish package 33 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 34 | with: 35 | user: __token__ 36 | password: ${{ secrets.PYPI_API_TOKEN }} 37 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Test the Python package 5 | 6 | on: 7 | workflow_dispatch: 8 | push: 9 | branches: [ master ] 10 | pull_request: 11 | branches: [ master ] 12 | 13 | jobs: 14 | build: 15 | 16 | runs-on: ubuntu-latest 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] 21 | 22 | steps: 23 | - uses: actions/checkout@v2 24 | - name: Set up Python ${{ matrix.python-version }} 25 | uses: actions/setup-python@v2 26 | with: 27 | python-version: ${{ matrix.python-version }} 28 | - name: Install dependencies 29 | run: | 30 | python -m pip install --upgrade pip 31 | python -m pip install twine 32 | python -m pip install sphinx 33 | if [ -f dev-requirements.txt ]; then pip install -r dev-requirements.txt; fi 34 | - name: Run Tests 35 | run: | 36 | python tests.py 37 | python setup.py sdist 38 | twine check dist/* 39 | sphinx-build -b html docs 
dist/docs 40 | -------------------------------------------------------------------------------- /nameparser/config/prefixes.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | #: Name pieces that appear before a last name. Prefixes join to the piece 5 | #: that follows them to make one new piece. They can be chained together, e.g 6 | #: "von der" and "de la". Because they only appear in middle or last names, 7 | #: they also signify that all following name pieces should be in the same name 8 | #: part, for example, "von" will be joined to all following pieces that are not 9 | #: prefixes or suffixes, allowing recognition of double last names when they 10 | #: appear after a prefixes. So in "pennie von bergen wessels MD", "von" will 11 | #: join with all following name pieces until the suffix "MD", resulting in the 12 | #: correct parsing of the last name "von bergen wessels". 
13 | PREFIXES = set([ 14 | 'abu', 15 | 'al', 16 | 'bin', 17 | 'bon', 18 | 'da', 19 | 'dal', 20 | 'de', 21 | 'de\'', 22 | 'degli', 23 | 'dei', 24 | 'del', 25 | 'dela', 26 | 'della', 27 | 'delle', 28 | 'delli', 29 | 'dello', 30 | 'der', 31 | 'di', 32 | 'dí', 33 | 'do', 34 | 'dos', 35 | 'du', 36 | 'ibn', 37 | 'la', 38 | 'le', 39 | 'mac', 40 | 'mc', 41 | 'san', 42 | 'santa', 43 | 'st', 44 | 'ste', 45 | 'van', 46 | 'vander', 47 | 'vel', 48 | 'von', 49 | 'vom', 50 | ]) 51 | -------------------------------------------------------------------------------- /nameparser/config/regexes.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | import re 4 | 5 | # emoji regex from https://stackoverflow.com/questions/26568722/remove-unicode-emoji-using-re-in-python 6 | try: 7 | # Wide UCS-4 build 8 | re_emoji = re.compile('[' 9 | '\U0001F300-\U0001F64F' 10 | '\U0001F680-\U0001F6FF' 11 | '\u2600-\u26FF\u2700-\u27BF]+', 12 | re.UNICODE) 13 | except re.error: 14 | # Narrow UCS-2 build 15 | re_emoji = re.compile('(' 16 | '\ud83c[\udf00-\udfff]|' 17 | '\ud83d[\udc00-\ude4f\ude80-\udeff]|' 18 | '[\u2600-\u26FF\u2700-\u27BF])+', 19 | re.UNICODE) 20 | 21 | REGEXES = set([ 22 | ("spaces", re.compile(r"\s+", re.U)), 23 | ("word", re.compile(r"(\w|\.)+", re.U)), 24 | ("mac", re.compile(r'^(ma?c)(\w{2,})', re.I | re.U)), 25 | ("initial", re.compile(r'^(\w\.|[A-Z])?$', re.U)), 26 | ("quoted_word", re.compile(r'(? 45 | 46 | 47 | Writing Tests 48 | ---------------- 49 | 50 | If you make changes, please make sure you include tests with example 51 | names that you want to be parsed correctly. 52 | 53 | It's a good idea to include tests of alternate comma placement formats 54 | of the name to ensure that the 3 code paths for the 3 formats work in 55 | the same way. 56 | 57 | The tests could be MUCH better. 
If the spirit moves you to design or 58 | implement a much more intelligent test strategy, please know that your 59 | efforts will be welcome and appreciated. 60 | 61 | Unless you add better coverage someplace else, add a few examples of 62 | your names to `TEST_NAMES`. A test attempts to try the 3 different 63 | comma variations of these names automatically and make sure things 64 | don't blow up, so it can be a helpful regression indicator. 65 | 66 | 67 | New Releases 68 | ------------ 69 | 70 | [Publishing to Pypi Guide](https://hynek.me/articles/sharing-your-labor-of-love-pypi-quick-and-dirty/) 71 | 72 | $ python setup.py sdist bdist_wheel 73 | $ twine upload dist/* 74 | 75 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. Nameparser documentation master file, created by 2 | sphinx-quickstart on Fri May 16 01:29:58 2014. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Python Human Name Parser 7 | ======================== 8 | 9 | Version |release| 10 | 11 | A simple Python module for parsing human names into their individual 12 | components. 13 | 14 | * hn.title 15 | * hn.first 16 | * hn.middle 17 | * hn.last 18 | * hn.suffix 19 | * hn.nickname 20 | 21 | Supports 3 different comma placement variations in the input string. 22 | 23 | 1. Title Firstname "Nickname" Middle Middle Lastname Suffix 24 | 2. Lastname [Suffix], Title Firstname (Nickname) Middle Middle[,] Suffix [, Suffix] 25 | 3. Title Firstname M Lastname [Suffix], Suffix [Suffix] [, Suffix] 26 | 27 | 28 | It attempts the best guess that can be made with a simple, rule-based 29 | approach. It's not perfect, but it gets you pretty far. 
30 | 31 | Its main use case is English, but it may be useful for other Latin-based languages, especially 32 | if you are willing to `customize it`_, but it is not likely to be useful for languages 33 | that do not share the same structure as English names. 34 | 35 | .. _customize it: customize.html 36 | 37 | Instantiating the `HumanName` class with a string splits on commas and then spaces, 38 | classifying name parts based on placement in the string and matches against known name 39 | pieces like titles. It joins name pieces on conjunctions and special prefixes to last names like 40 | "del". Titles can be chained together and include conjunctions to handle 41 | titles like "Asst Secretary of State". It can also try to correct 42 | capitalization. 43 | 44 | It does not attempt to correct input mistakes. When there is ambiguity that cannot be resolved by a rule-based approach, 45 | HumanName prefers to handle the most common cases correctly. For example, 46 | "Dean" is not parsed as a title because it is more common as a first name 47 | (You can customize this behavior though, see `Parser Customization Examples`_). 48 | 49 | .. _Parser Customization Examples: customize.html#parser-customization-examples 50 | 51 | 52 | Parsing Names 53 | ------------- 54 | 55 | .. toctree:: 56 | :maxdepth: 2 57 | 58 | usage 59 | customize 60 | 61 | **Developer Documentation** 62 | 63 | .. 
toctree:: 64 | :maxdepth: 2 65 | 66 | modules 67 | resources 68 | release_log 69 | contributing 70 | 71 | 72 | 73 | Indices and tables 74 | ================== 75 | 76 | * :ref:`genindex` 77 | * :ref:`modindex` 78 | * :ref:`search` 79 | 80 | 81 | **GitHub Project**: https://github.com/derek73/python-nameparser 82 | 83 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Name Parser 2 | =========== 3 | 4 | |Build Status| |PyPI| |PyPI version| |Documentation| 5 | 6 | A simple Python (3.2+ & 2.6+) module for parsing human names into their 7 | individual components. 8 | 9 | * hn.title 10 | * hn.first 11 | * hn.middle 12 | * hn.last 13 | * hn.suffix 14 | * hn.nickname 15 | * hn.surnames *(middle + last)* 16 | * hn.initials *(first initial of each name part)* 17 | 18 | Supported Name Structures 19 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 20 | 21 | The supported name structure is generally "Title First Middle Last Suffix", where all pieces 22 | are optional. Comma-separated format like "Last, First" is also supported. 23 | 24 | 1. Title Firstname "Nickname" Middle Middle Lastname Suffix 25 | 2. Lastname [Suffix], Title Firstname (Nickname) Middle Middle[,] Suffix [, Suffix] 26 | 3. Title Firstname M Lastname [Suffix], Suffix [Suffix] [, Suffix] 27 | 28 | Instantiating the `HumanName` class with a string splits on commas and then spaces, 29 | classifying name parts based on placement in the string and matches against known name 30 | pieces like titles and suffixes. 31 | 32 | It correctly handles some common conjunctions and special prefixes to last names 33 | like "del". Titles and conjunctions can be chained together to handle complex 34 | titles like "Asst Secretary of State". It can also try to correct capitalization 35 | of names that are all upper- or lowercase names. 36 | 37 | It attempts the best guess that can be made with a simple, rule-based approach. 
38 | Its main use case is English and it is not likely to be useful for languages 39 | that do not conform to the supported name structure. It's not perfect, but it 40 | gets you pretty far. 41 | 42 | Installation 43 | ------------ 44 | 45 | :: 46 | 47 | pip install nameparser 48 | 49 | If you want to try out the latest code from GitHub you can 50 | install with pip using the command below. 51 | 52 | ``pip install -e git+https://github.com/derek73/python-nameparser.git#egg=nameparser`` 53 | 54 | If you need to handle lists of names, check out 55 | `namesparser `_, a 56 | complement to this module that handles multiple names in a string. 57 | 58 | 59 | Quick Start Example 60 | ------------------- 61 | 62 | :: 63 | 64 | >>> from nameparser import HumanName 65 | >>> name = HumanName("Dr. Juan Q. Xavier de la Vega III (Doc Vega)") 66 | >>> name 67 | 75 | >>> name.last 76 | 'de la Vega' 77 | >>> name.as_dict() 78 | {'last': 'de la Vega', 'suffix': 'III', 'title': 'Dr.', 'middle': 'Q. Xavier', 'nickname': 'Doc Vega', 'first': 'Juan'} 79 | >>> str(name) 80 | 'Dr. Juan Q. Xavier de la Vega III (Doc Vega)' 81 | >>> name.string_format = "{first} {last}" 82 | >>> str(name) 83 | 'Juan de la Vega' 84 | 85 | 86 | The parser does not attempt to correct mistakes in the input. It mostly just splits on white 87 | space and puts things in buckets based on their position in the string. This also means 88 | the difference between 'title' and 'suffix' is positional, not semantic. "Dr" is a title 89 | when it comes before the name and a suffix when it comes after. ("Pre-nominal" 90 | and "post-nominal" would probably be better names.) 91 | 92 | :: 93 | 94 | >>> name = HumanName("1 & 2, 3 4 5, Mr.") 95 | >>> name 96 | 104 | 105 | Customization 106 | ------------- 107 | 108 | Your project may need some adjustment for your dataset. 
You can 109 | do this in your own pre- or post-processing, by `customizing the configured pre-defined 110 | sets`_ of titles, prefixes, etc., or by subclassing the `HumanName` class. See the 111 | `full documentation`_ for more information. 112 | 113 | 114 | `Full documentation`_ 115 | ~~~~~~~~~~~~~~~~~~~~~ 116 | 117 | .. _customizing the configured pre-defined sets: http://nameparser.readthedocs.org/en/latest/customize.html 118 | .. _Full documentation: http://nameparser.readthedocs.org/en/latest/ 119 | 120 | 121 | Contributing 122 | ------------ 123 | 124 | If you come across name pieces that you think should be in the default config, you're 125 | probably right. `Start a New Issue`_ and we can get them added. 126 | 127 | Please let me know if there are ways this library could be structured to make 128 | it easier for you to use in your projects. Read CONTRIBUTING.md_ for more info 129 | on running the tests and contributing to the project. 130 | 131 | **GitHub Project** 132 | 133 | https://github.com/derek73/python-nameparser 134 | 135 | .. _CONTRIBUTING.md: https://github.com/derek73/python-nameparser/tree/master/CONTRIBUTING.md 136 | .. _Start a New Issue: https://github.com/derek73/python-nameparser/issues 137 | .. _click here to propose changes to the titles: https://github.com/derek73/python-nameparser/edit/master/nameparser/config/titles.py 138 | 139 | .. |Build Status| image:: https://github.com/derek73/python-nameparser/actions/workflows/python-package.yml/badge.svg 140 | :target: https://github.com/derek73/python-nameparser/actions/workflows/python-package.yml 141 | .. |PyPI| image:: https://img.shields.io/pypi/v/nameparser.svg 142 | :target: https://pypi.org/project/nameparser/ 143 | .. |Documentation| image:: https://readthedocs.org/projects/nameparser/badge/?version=latest 144 | :target: http://nameparser.readthedocs.io/en/latest/?badge=latest 145 | .. 
|PyPI version| image:: https://img.shields.io/pypi/pyversions/nameparser.svg 146 | :target: https://pypi.org/project/nameparser/ 147 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | rm -rf $(BUILDDIR)/* 51 | 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | dirhtml: 58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 
61 | 62 | singlehtml: 63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 64 | @echo 65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 66 | 67 | pickle: 68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 69 | @echo 70 | @echo "Build finished; now you can process the pickle files." 71 | 72 | json: 73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 74 | @echo 75 | @echo "Build finished; now you can process the JSON files." 76 | 77 | htmlhelp: 78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 79 | @echo 80 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 81 | ".hhp project file in $(BUILDDIR)/htmlhelp." 82 | 83 | qthelp: 84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 85 | @echo 86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/Nameparser.qhcp" 89 | @echo "To view the help file:" 90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/Nameparser.qhc" 91 | 92 | devhelp: 93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 94 | @echo 95 | @echo "Build finished." 96 | @echo "To view the help file:" 97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/Nameparser" 98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/Nameparser" 99 | @echo "# devhelp" 100 | 101 | epub: 102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 103 | @echo 104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 105 | 106 | latex: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo 109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 111 | "(use \`make latexpdf' here to do that automatically)." 
112 | 113 | latexpdf: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo "Running LaTeX files through pdflatex..." 116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 118 | 119 | latexpdfja: 120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 121 | @echo "Running LaTeX files through platex and dvipdfmx..." 122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 124 | 125 | text: 126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 127 | @echo 128 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 129 | 130 | man: 131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 132 | @echo 133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 134 | 135 | texinfo: 136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 137 | @echo 138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 139 | @echo "Run \`make' in that directory to run these through makeinfo" \ 140 | "(use \`make info' here to do that automatically)." 141 | 142 | info: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo "Running Texinfo files through makeinfo..." 145 | make -C $(BUILDDIR)/texinfo info 146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 147 | 148 | gettext: 149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 150 | @echo 151 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 152 | 153 | changes: 154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 155 | @echo 156 | @echo "The overview file is in $(BUILDDIR)/changes." 157 | 158 | linkcheck: 159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 160 | @echo 161 | @echo "Link check complete; look for any errors in the above output " \ 162 | "or in $(BUILDDIR)/linkcheck/output.txt." 
163 | 164 | doctest: 165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 166 | @echo "Testing of doctests in the sources finished, look at the " \ 167 | "results in $(BUILDDIR)/doctest/output.txt." 168 | 169 | xml: 170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 171 | @echo 172 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 173 | 174 | pseudoxml: 175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 176 | @echo 177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 178 | -------------------------------------------------------------------------------- /docs/usage.rst: -------------------------------------------------------------------------------- 1 | Using the HumanName Parser 2 | ========================== 3 | 4 | Example Usage 5 | ------------- 6 | 7 | The examples use Python 3, but Python 2.6+ is supported. 8 | 9 | .. doctest:: 10 | :options: +NORMALIZE_WHITESPACE 11 | 12 | >>> from nameparser import HumanName 13 | >>> name = HumanName("Dr. Juan Q. Xavier de la Vega III") 14 | >>> name.title 15 | 'Dr.' 16 | >>> name["title"] 17 | 'Dr.' 18 | >>> name.first 19 | 'Juan' 20 | >>> name.middle 21 | 'Q. Xavier' 22 | >>> name.last 23 | 'de la Vega' 24 | >>> name.suffix 25 | 'III' 26 | >>> name.surnames 27 | 'Q. Xavier de la Vega' 28 | >>> name.full_name = "Juan Q. Xavier Velasquez y Garcia, Jr." 29 | >>> name 30 | 38 | >>> name.middle = "Jason Alexander" 39 | >>> name.middle 40 | 'Jason Alexander' 41 | >>> name 42 | 50 | >>> name.middle = ["custom","values"] 51 | >>> name.middle 52 | 'custom values' 53 | >>> name.full_name = 'Doe-Ray, Jonathan "John" A. Harris' 54 | >>> name.as_dict() 55 | {'last': 'Doe-Ray', 'suffix': '', 'title': '', 'middle': 'A. Harris', 'nickname': 'John', 'first': 'Jonathan'} 56 | >>> name.as_dict(False) # add False to hide keys with empty values 57 | {'middle': 'A. 
Harris', 'nickname': 'John', 'last': 'Doe-Ray', 'first': 'Jonathan'} 58 | >>> name = HumanName("Dr. Juan Q. Xavier de la Vega III") 59 | >>> name2 = HumanName("de la vega, dr. juan Q. xavier III") 60 | >>> name == name2 61 | True 62 | >>> len(name) 63 | 5 64 | >>> list(name) 65 | ['Dr.', 'Juan', 'Q. Xavier', 'de la Vega', 'III'] 66 | >>> name[1:-2] 67 | ['Juan', 'Q. Xavier', 'de la Vega'] 68 | 69 | 70 | Capitalization Support 71 | ---------------------- 72 | 73 | The HumanName class can try to guess the correct capitalization of name 74 | entered in all upper or lower case. By default, it will not adjust 75 | the case of names entered in mixed case. To run capitalization on a 76 | `HumanName` instance, pass the parameter `force=True`. 77 | 78 | Capitalize the name. 79 | 80 | * bob v. de la macdole-eisenhower phd -> Bob V. de la MacDole-Eisenhower Ph.D. 81 | 82 | .. doctest:: capitalize 83 | 84 | >>> name = HumanName("bob v. de la macdole-eisenhower phd") 85 | >>> name.capitalize() 86 | >>> str(name) 87 | 'Bob V. de la MacDole-Eisenhower Ph.D.' 88 | >>> name = HumanName('Shirley Maclaine') # Don't change mixed case names 89 | >>> name.capitalize() 90 | >>> str(name) 91 | 'Shirley Maclaine' 92 | >>> name.capitalize(force=True) 93 | >>> str(name) 94 | 'Shirley MacLaine' 95 | 96 | To apply capitalization to all `HumanName` instances, set 97 | :py:attr:`~nameparser.config.Constants.capitalize_name` to `True`. 98 | 99 | .. doctest:: capitalize_name 100 | :options: +NORMALIZE_WHITESPACE 101 | 102 | >>> from nameparser.config import CONSTANTS 103 | >>> CONSTANTS.capitalize_name = True 104 | >>> name = HumanName("bob v. de la macdole-eisenhower phd") 105 | >>> str(name) 106 | 'Bob V. de la MacDole-Eisenhower Ph.D.' 107 | 108 | To force the capitalization of mixed case strings on all `HumanName` instances, 109 | set :py:attr:`~nameparser.config.Constants.force_mixed_case_capitalization` to `True`. 110 | 111 | .. 
doctest:: force_mixed_case_capitalization 112 | :options: +NORMALIZE_WHITESPACE 113 | 114 | >>> from nameparser.config import CONSTANTS 115 | >>> CONSTANTS.force_mixed_case_capitalization = True 116 | >>> name = HumanName('Shirley Maclaine') 117 | >>> name.capitalize() 118 | >>> str(name) 119 | 'Shirley MacLaine' 120 | 121 | 122 | Nickname Handling 123 | ------------------ 124 | 125 | The content of parenthesis or quotes in the name will be 126 | available from the nickname attribute. 127 | 128 | .. doctest:: nicknames 129 | :options: +NORMALIZE_WHITESPACE 130 | 131 | >>> name = HumanName('Jonathan "John" A. Smith') 132 | >>> name 133 | 141 | 142 | Change the output string with string formatting 143 | ----------------------------------------------- 144 | 145 | The string representation of a `HumanName` instance is controlled by its `string_format` attribute. 146 | The default value, `"{title} {first} {middle} {last} {suffix} ({nickname})"`, includes parenthesis 147 | around nicknames. Trailing commas and empty quotes and parenthesis are automatically removed if the 148 | name has no nickname pieces. 149 | 150 | You can change the default formatting for all `HumanName` instances by setting a new 151 | :py:attr:`~nameparser.config.Constants.string_format` value on the shared 152 | :py:class:`~nameparser.config.CONSTANTS` configuration instance. 153 | 154 | .. doctest:: string format 155 | 156 | >>> from nameparser.config import CONSTANTS 157 | >>> CONSTANTS.string_format = "{title} {first} ({nickname}) {middle} {last} {suffix}" 158 | >>> name = HumanName('Robert Johnson') 159 | >>> str(name) 160 | 'Robert Johnson' 161 | >>> name = HumanName('Robert "Rob" Johnson') 162 | >>> str(name) 163 | 'Robert (Rob) Johnson' 164 | 165 | You can control the order and presence of any name fields by changing the 166 | :py:attr:`~nameparser.config.Constants.string_format` attribute of the shared CONSTANTS instance. 167 | Don't want to include nicknames in your output? No problem. 
Just omit that keyword from the 168 | `string_format` attribute. 169 | 170 | .. doctest:: string format 171 | 172 | >>> from nameparser.config import CONSTANTS 173 | >>> CONSTANTS.string_format = "{title} {first} {last}" 174 | >>> name = HumanName("Dr. Juan Ruiz de la Vega III (Doc Vega)") 175 | >>> str(name) 176 | 'Dr. Juan de la Vega' 177 | 178 | 179 | Initials Support 180 | ---------------- 181 | 182 | The HumanName class can try to get the correct representation of initials. 183 | Initials can be tricky as different format usages exist. 184 | To exclude any of the name parts from the initials, change the initials format string: 185 | :py:attr:`~nameparser.config.Constants.initials_format` 186 | Three attributes exist for the format, `first`, `middle` and `last`. 187 | 188 | .. doctest:: initials format 189 | 190 | >>> from nameparser.config import CONSTANTS 191 | >>> CONSTANTS.initials_format = "{first} {middle}" 192 | >>> HumanName("Doe, John A. Kenneth, Jr.").initials() 193 | 'J. A. K.' 194 | >>> HumanName("Doe, John A. Kenneth, Jr.", initials_format="{last}, {first}").initials() 195 | 'D., J.' 196 | 197 | 198 | Furthermore, the delimiter for the string output can be set through: 199 | :py:attr:`~nameparser.config.Constants.initials_delimiter` 200 | 201 | .. doctest:: initials delimiter 202 | 203 | >>> HumanName("Doe, John A. Kenneth, Jr.", initials_delimiter=";").initials() 204 | "J; A; K;" 205 | >>> from nameparser.config import CONSTANTS 206 | >>> CONSTANTS.initials_delimiter = "." 207 | >>> HumanName("Doe, John A. Kenneth, Jr.", initials_format="{first}{middle}{last}").initials() 208 | "J.A.K.D." 209 | 210 | To get a list representation of the initials, use :py:meth:`~nameparser.HumanName.initials_list`. 211 | This function is unaffected by :py:attr:`~nameparser.config.Constants.initials_format` 212 | 213 | .. doctest:: list format 214 | >>> HumanName("Doe, John A. 
Kenneth, Jr.", initials_delimiter=";").initials_list() 215 | ["J", "A", "K", "D"] 216 | 217 | -------------------------------------------------------------------------------- /docs/release_log.rst: -------------------------------------------------------------------------------- 1 | Release Log 2 | =========== 3 | * 1.1.3 - September 20, 2023 4 | - Fix case when we have two same prefixes in the name (#147) 5 | * 1.1.2 - November 13, 2022 6 | - Add support for attributes in constructor (#140) 7 | - Make HumanName instances hashable (#138) 8 | - Update repr for names with single quotes (#137) 9 | * 1.1.1 - January 28, 2022 10 | - Fix bug in is_suffix handling of lists (#129) 11 | * 1.1.0 - January 3, 2022 12 | - Add initials support (#128) 13 | - Add more titles and prefixes (#120, #127, #128, #119) 14 | * 1.0.6 - February 8, 2020 15 | - Fix Python 3.8 syntax error (#104) 16 | * 1.0.5 - Dec 12, 2019 17 | - Fix suffix parsing bug in comma parts (#98) 18 | - Fix deprecation warning on Python 3.7 (#94) 19 | - Improved capitalization support of mixed case names (#90) 20 | - Remove "elder" from titles (#96) 21 | - Add post-nominal list from Wikipedia to suffixes (#93) 22 | * 1.0.4 - June 26, 2019 23 | - Better nickname handling of multiple single quotes (#86) 24 | - full_name attribute now returns formatted string output instead of original string (#87) 25 | * 1.0.3 - April 18, 2019 26 | - fix sys.stdin usage when stdin doesn't exist (#82) 27 | - support for escaping log entry arguments (#84) 28 | * 1.0.2 - Oct 26, 2018 29 | - Fix handling of only nickname and last name (#78) 30 | * 1.0.1 - August 30, 2018 31 | - Fix overzealous regex for "Ph. D." 
(#43) 32 | - Add `surnames` attribute as aggregate of middle and last names 33 | * 1.0.0 - August 30, 2018 34 | - Fix support for nicknames in single quotes (#74) 35 | - Change prefix handling to support prefixes on first names (#60) 36 | - Fix prefix capitalization when not part of lastname (#70) 37 | - Handle erroneous space in "Ph. D." (#43) 38 | * 0.5.8 - August 19, 2018 39 | - Add "Junior" to suffixes (#76) 40 | - Add "dra" and "srta" to titles (#77) 41 | * 0.5.7 - June 16, 2018 42 | - Fix doc link (#73) 43 | - Fix handling of "do" and "dos" Portuguese prefixes (#71, #72) 44 | * 0.5.6 - January 15, 2018 45 | - Fix python version check (#64) 46 | * 0.5.5 - January 10, 2018 47 | - Support J.D. as suffix and Wm. as title 48 | * 0.5.4 - December 10, 2017 49 | - Add Dr to suffixes (#62) 50 | - Add the full set of Italian derivatives from "di" (#59) 51 | - Add parameter to specify the encoding of strings added to constants, use 'UTF-8' as fallback (#67) 52 | - Fix handling of names composed entirely of conjunctions (#66) 53 | * 0.5.3 - June 27, 2017 54 | - Remove emojis from initial string by default with option to include emojis (#58) 55 | * 0.5.2 - March 19, 2017 56 | - Added names scrapped from VIAF data, thanks daryanypl (#57) 57 | * 0.5.1 - August 12, 2016 58 | - Fix error for names that end with conjunction (#54) 59 | * 0.5.0 - August 4, 2016 60 | - Refactor join_on_conjunctions(), fix #53 61 | * 0.4.1 - July 25, 2016 62 | - Remove "bishop" from titles because it also could be a first name 63 | - Fix handling of lastname prefixes with periods, e.g. "Jane St. 
John" (#50) 64 | * 0.4.0 - June 2, 2016 65 | - Remove "CONSTANTS.suffixes", replaced by "suffix_acronyms" and "suffix_not_acronyms" (#49) 66 | - Add "du" to prefixes 67 | - Add "sheikh" variations to titles 68 | - Add parameter to force capitalization of mixed case strings 69 | * 0.3.16 - March 24, 2016 70 | - Clarify LGPL licence version (#47) 71 | - Skip pickle tests if pickle not installed (#48) 72 | * 0.3.15 - March 21, 2016 73 | - Fix string format when `empty_attribute_default = None` (#45) 74 | - Include tests in release source tarball (#46) 75 | * 0.3.14 - March 18, 2016 76 | - Add `CONSTANTS.empty_attribute_default` to customize value returned for empty attributes (#44) 77 | * 0.3.13 - March 14, 2016 78 | - Improve string format handling (#41) 79 | * 0.3.12 - March 13, 2016 80 | - Fix first name clash with suffixes (#42) 81 | - Fix encoding of constants added via the python shell 82 | - Add "MSC" to suffixes, fix #41 83 | * 0.3.11 - October 17, 2015 84 | - Fix bug capitalization exceptions (#39) 85 | * 0.3.10 - September 19, 2015 86 | - Fix encoding of byte strings on python 2.x (#37) 87 | * 0.3.9 - September 5, 2015 88 | - Separate suffixes that are acronyms to handle periods differently, fixes #29, #21 89 | - Don't find titles after first name is filled, fixes (#27) 90 | - Add "chair" titles (#37) 91 | * 0.3.8 - September 2, 2015 92 | - Use regex to check for roman numerals at end of name (#36) 93 | - Add DVM to suffixes 94 | * 0.3.7 - August 30, 2015 95 | - Speed improvement, 3x faster 96 | - Make HumanName instances pickleable 97 | * 0.3.6 - August 6, 2015 98 | - Fix strings that start with conjunctions (#20) 99 | - handle assigning lists of names to a name attribute 100 | - support dictionary-like assignment of name attributes 101 | * 0.3.5 - August 4, 2015 102 | - Fix handling of string encoding in python 2.x (#34) 103 | - Add support for dictionary key access, e.g. 
name['first'] 104 | - add 'santa' to prefixes, add 'cpa', 'csm', 'phr', 'pmp' to suffixes (#35) 105 | - Fix prefixes before multi-part last names (#23) 106 | - Fix capitalization bug (#30) 107 | * 0.3.4 - March 1, 2015 108 | - Fix #24, handle first name also a prefix 109 | - Fix #26, last name comma format when lastname is also a title 110 | * 0.3.3 - Aug 4, 2014 111 | - Allow suffixes to be chained (#8) 112 | - Handle trailing suffix in last name comma format (#3). Removes support for titles 113 | with periods but no spaces in them, e.g. "Lt.Gen.". (#21) 114 | * 0.3.2 - July 16, 2014 115 | - Retain original string in "original" attribute. 116 | - Collapse white space when using custom string format. 117 | - Fix #19, single comma name format may have trailing suffix 118 | * 0.3.1 - July 5, 2014 119 | - Fix Pypi package, include new config module. 120 | * 0.3.0 - July 4, 2014 121 | - Refactor configuration to simplify modifications to constants (backwards incompatible) 122 | - use unicode_literals to simplify Python 2 & 3 support. 123 | - Generate documentation using sphinx and host on readthedocs. 124 | * 0.2.10 - May 6, 2014 125 | - If name is only a title and one part, assume it's a last name instead of a first name, with exceptions for some titles like 'Sir'. (`#7 `_). 126 | - Add some judicial and other common titles. (#9) 127 | * 0.2.9 - Apr 1, 2014 128 | - Add a new nickname attribute containing anything in parenthesis or double quotes (`Issue 33 `_). 129 | * 0.2.8 - Oct 25, 2013 130 | - Add support for Python 3.3+. Thanks to @corbinbs. 131 | * 0.2.7 - Feb 13, 2013 132 | - Fix bug with multiple conjunctions in title 133 | - add legal and crown titles 134 | * 0.2.6 - Feb 12, 2013 135 | - Fix python 2.6 import error on logging.NullHandler 136 | * 0.2.5 - Feb 11, 2013 137 | - Set logging handler to NullHandler 138 | - Remove 'ben' from PREFIXES because it's more common as a name than a prefix. 139 | - Deprecate BlankHumanNameError. 
Do not raise exceptions if full_name is empty string. 140 | * 0.2.4 - Feb 10, 2013 141 | - Adjust logging, don't set basicConfig. Fix `Issue 10 `_ and `Issue 26 `_. 142 | - Fix handling of single lower case initials that are also conjunctions, e.g. "john e smith". Re `Issue 11 `_. 143 | - Fix handling of initials with no space separation, e.g. "E.T. Jones". Fix #11. 144 | - Do not remove period from first name, when present. 145 | - Remove 'e' from PREFIXES because it is handled as a conjunction. 146 | - Python 2.7+ required to run the tests. Mark known failures. 147 | - tests/test.py can now take an optional name argument that will return repr() for that name. 148 | * 0.2.3 - Fix overzealous "Mac" regex 149 | * 0.2.2 - Fix parsing error 150 | * 0.2.0 151 | - Significant refactor of parsing logic. Handle conjunctions and prefixes before 152 | parsing into attribute buckets. 153 | - Support attribute overriding by assignment. 154 | - Support multiple titles. 155 | - Lowercase titles constants to fix bug with comparison. 156 | - Move documentation to README.rst, add release log. 157 | * 0.1.4 - Use set() in constants for improved speed. setuptools compatibility - sketerpot 158 | * 0.1.3 - Add capitalization feature - twotwo 159 | * 0.1.2 - Add slice support 160 | 161 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Nameparser documentation build configuration file, created by 4 | # sphinx-quickstart on Fri May 16 01:29:58 2014. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 
14 | 15 | import sys 16 | import os 17 | from datetime import date 18 | 19 | # If extensions (or modules to document with autodoc) are in another directory, 20 | # add these directories to sys.path here. If the directory is relative to the 21 | # documentation root, use os.path.abspath to make it absolute, like shown here. 22 | sys.path.insert(0, os.path.abspath('..')) 23 | import nameparser 24 | 25 | # -- General configuration ------------------------------------------------ 26 | 27 | # If your documentation needs a minimal Sphinx version, state it here. 28 | #needs_sphinx = '1.0' 29 | 30 | # Add any Sphinx extension module names here, as strings. They can be 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 32 | # ones. 33 | extensions = [ 34 | 'sphinx.ext.autodoc', 35 | 'sphinx.ext.doctest', 36 | 'sphinx.ext.viewcode', 37 | ] 38 | 39 | # Add any paths that contain templates here, relative to this directory. 40 | templates_path = ['_templates'] 41 | 42 | # The suffix of source filenames. 43 | source_suffix = '.rst' 44 | 45 | # The encoding of source files. 46 | #source_encoding = 'utf-8-sig' 47 | 48 | # The master toctree document. 49 | master_doc = 'index' 50 | 51 | # General information about the project. 52 | project = u'Nameparser' 53 | copyright = u'{:%Y}, Derek Gulbranson'.format(date.today()) 54 | 55 | # The version info for the project you're documenting, acts as replacement for 56 | # |version| and |release|, also used in various other places throughout the 57 | # built documents. 58 | # 59 | # The short X.Y version. 60 | version = nameparser.__version__ 61 | # The full version, including alpha/beta/rc tags. 62 | release = version 63 | 64 | # The language for content autogenerated by Sphinx. Refer to documentation 65 | # for a list of supported languages. 
66 | #language = None 67 | 68 | # There are two options for replacing |today|: either, you set today to some 69 | # non-false value, then it is used: 70 | #today = '' 71 | # Else, today_fmt is used as the format for a strftime call. 72 | #today_fmt = '%B %d, %Y' 73 | 74 | # List of patterns, relative to source directory, that match files and 75 | # directories to ignore when looking for source files. 76 | exclude_patterns = ['_build'] 77 | 78 | # The reST default role (used for this markup: `text`) to use for all 79 | # documents. 80 | #default_role = None 81 | 82 | # If true, '()' will be appended to :func: etc. cross-reference text. 83 | #add_function_parentheses = True 84 | 85 | # If true, the current module name will be prepended to all description 86 | # unit titles (such as .. function::). 87 | #add_module_names = True 88 | 89 | # If true, sectionauthor and moduleauthor directives will be shown in the 90 | # output. They are ignored by default. 91 | #show_authors = False 92 | 93 | # The name of the Pygments (syntax highlighting) style to use. 94 | pygments_style = 'sphinx' 95 | 96 | # A list of ignored prefixes for module index sorting. 97 | #modindex_common_prefix = [] 98 | 99 | # If true, keep warnings as "system message" paragraphs in the built documents. 100 | #keep_warnings = False 101 | 102 | 103 | # -- Options for HTML output ---------------------------------------------- 104 | 105 | # The theme to use for HTML and HTML Help pages. See the documentation for 106 | # a list of builtin themes. 
107 | html_theme = 'alabaster' 108 | 109 | import alabaster 110 | 111 | html_theme_path = [alabaster.get_path()] 112 | extensions += ['alabaster'] 113 | html_theme = 'alabaster' 114 | html_sidebars = { 115 | '**': [ 116 | 'about.html', 117 | 'navigation.html', 118 | 'relations.html', 119 | 'searchbox.html', 120 | 'donate.html', 121 | ] 122 | } 123 | html_theme_options = { 124 | 'github_user': 'derek73', 125 | 'github_repo': 'python-nameparser', 126 | 'travis_button': True, 127 | 'analytics_id': 'UA-339019-11', 128 | } 129 | 130 | # Theme options are theme-specific and customize the look and feel of a theme 131 | # further. For a list of options available for each theme, see the 132 | # documentation. 133 | #html_theme_options = {} 134 | 135 | # Add any paths that contain custom themes here, relative to this directory. 136 | #html_theme_path = [] 137 | 138 | # The name for this set of Sphinx documents. If None, it defaults to 139 | # " v documentation". 140 | #html_title = None 141 | 142 | # A shorter title for the navigation bar. Default is the same as html_title. 143 | #html_short_title = None 144 | 145 | # The name of an image file (relative to this directory) to place at the top 146 | # of the sidebar. 147 | #html_logo = None 148 | 149 | # The name of an image file (within the static path) to use as favicon of the 150 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 151 | # pixels large. 152 | #html_favicon = None 153 | 154 | # Add any paths that contain custom static files (such as style sheets) here, 155 | # relative to this directory. They are copied after the builtin static files, 156 | # so a file named "default.css" will overwrite the builtin "default.css". 157 | html_static_path = ['_static'] 158 | 159 | # Add any extra paths that contain custom files (such as robots.txt or 160 | # .htaccess) here, relative to this directory. These files are copied 161 | # directly to the root of the documentation. 
162 | #html_extra_path = [] 163 | 164 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 165 | # using the given strftime format. 166 | #html_last_updated_fmt = '%b %d, %Y' 167 | 168 | # If true, SmartyPants will be used to convert quotes and dashes to 169 | # typographically correct entities. 170 | #html_use_smartypants = True 171 | 172 | # Custom sidebar templates, maps document names to template names. 173 | #html_sidebars = {} 174 | 175 | # Additional templates that should be rendered to pages, maps page names to 176 | # template names. 177 | #html_additional_pages = {} 178 | 179 | # If false, no module index is generated. 180 | #html_domain_indices = True 181 | 182 | # If false, no index is generated. 183 | #html_use_index = True 184 | 185 | # If true, the index is split into individual pages for each letter. 186 | #html_split_index = False 187 | 188 | # If true, links to the reST sources are added to the pages. 189 | #html_show_sourcelink = True 190 | 191 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 192 | #html_show_sphinx = True 193 | 194 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 195 | #html_show_copyright = True 196 | 197 | # If true, an OpenSearch description file will be output, and all pages will 198 | # contain a tag referring to it. The value of this option must be the 199 | # base URL from which the finished HTML is served. 200 | #html_use_opensearch = '' 201 | 202 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 203 | #html_file_suffix = None 204 | 205 | # Output file base name for HTML help builder. 206 | htmlhelp_basename = 'Nameparserdoc' 207 | 208 | 209 | # -- Options for LaTeX output --------------------------------------------- 210 | 211 | latex_elements = { 212 | # The paper size ('letterpaper' or 'a4paper'). 213 | #'papersize': 'letterpaper', 214 | 215 | # The font size ('10pt', '11pt' or '12pt'). 
216 | #'pointsize': '10pt', 217 | 218 | # Additional stuff for the LaTeX preamble. 219 | #'preamble': '', 220 | } 221 | 222 | # Grouping the document tree into LaTeX files. List of tuples 223 | # (source start file, target name, title, 224 | # author, documentclass [howto, manual, or own class]). 225 | latex_documents = [ 226 | ('index', 'Nameparser.tex', u'Nameparser Documentation', 227 | u'Derek Gulbranson', 'manual'), 228 | ] 229 | 230 | # The name of an image file (relative to this directory) to place at the top of 231 | # the title page. 232 | #latex_logo = None 233 | 234 | # For "manual" documents, if this is true, then toplevel headings are parts, 235 | # not chapters. 236 | #latex_use_parts = False 237 | 238 | # If true, show page references after internal links. 239 | #latex_show_pagerefs = False 240 | 241 | # If true, show URL addresses after external links. 242 | #latex_show_urls = False 243 | 244 | # Documents to append as an appendix to all manuals. 245 | #latex_appendices = [] 246 | 247 | # If false, no module index is generated. 248 | #latex_domain_indices = True 249 | 250 | 251 | # -- Options for manual page output --------------------------------------- 252 | 253 | # One entry per manual page. List of tuples 254 | # (source start file, name, description, authors, manual section). 255 | man_pages = [ 256 | ('index', 'nameparser', u'Nameparser Documentation', 257 | [u'Derek Gulbranson'], 1) 258 | ] 259 | 260 | # If true, show URL addresses after external links. 261 | #man_show_urls = False 262 | 263 | 264 | # -- Options for Texinfo output ------------------------------------------- 265 | 266 | # Grouping the document tree into Texinfo files. 
List of tuples 267 | # (source start file, target name, title, author, 268 | # dir menu entry, description, category) 269 | texinfo_documents = [ 270 | ('index', 'Nameparser', u'Nameparser Documentation', 271 | u'Derek Gulbranson', 'Nameparser', 'A simple python modules for parsing human names into components.', 272 | 'Miscellaneous'), 273 | ] 274 | 275 | # Documents to append as an appendix to all manuals. 276 | #texinfo_appendices = [] 277 | 278 | # If false, no module index is generated. 279 | #texinfo_domain_indices = True 280 | 281 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 282 | #texinfo_show_urls = 'footnote' 283 | 284 | # If true, do not generate a @detailmenu in the "Top" node's menu. 285 | #texinfo_no_detailmenu = False 286 | 287 | doctest_global_setup = """from nameparser import HumanName 288 | from nameparser.config import CONSTANTS, Constants 289 | CONSTANTS = Constants() 290 | """ 291 | -------------------------------------------------------------------------------- /nameparser/config/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | The :py:mod:`nameparser.config` module manages the configuration of the 4 | nameparser. 5 | 6 | A module-level instance of :py:class:`~nameparser.config.Constants` is created 7 | and used by default for all HumanName instances. You can adjust the entire module's 8 | configuration by importing this instance and changing it. 9 | 10 | :: 11 | 12 | >>> from nameparser.config import CONSTANTS 13 | >>> CONSTANTS.titles.remove('hon').add('chemistry','dean') # doctest: +ELLIPSIS 14 | SetManager(set([u'msgt', ..., u'adjutant'])) 15 | 16 | You can also adjust the configuration of individual instances by passing 17 | ``None`` as the second argument upon instantiation. 
18 | 19 | :: 20 | 21 | >>> from nameparser import HumanName 22 | >>> hn = HumanName("Dean Robert Johns", None) 23 | >>> hn.C.titles.add('dean') # doctest: +ELLIPSIS 24 | SetManager(set([u'msgt', ..., u'adjutant'])) 25 | >>> hn.parse_full_name() # need to run this again after config changes 26 | 27 | **Potential Gotcha**: If you do not pass ``None`` as the second argument, 28 | ``hn.C`` will be a reference to the module config, possibly yielding 29 | unexpected results. See `Customizing the Parser `_. 30 | """ 31 | from __future__ import unicode_literals 32 | import sys 33 | try: 34 | # Python 3.3+ 35 | from collections.abc import Set 36 | except ImportError: 37 | from collections import Set 38 | 39 | from nameparser.util import binary_type 40 | from nameparser.util import lc 41 | from nameparser.config.prefixes import PREFIXES 42 | from nameparser.config.capitalization import CAPITALIZATION_EXCEPTIONS 43 | from nameparser.config.conjunctions import CONJUNCTIONS 44 | from nameparser.config.suffixes import SUFFIX_ACRONYMS 45 | from nameparser.config.suffixes import SUFFIX_NOT_ACRONYMS 46 | from nameparser.config.titles import TITLES 47 | from nameparser.config.titles import FIRST_NAME_TITLES 48 | from nameparser.config.regexes import REGEXES 49 | 50 | DEFAULT_ENCODING = 'UTF-8' 51 | 52 | 53 | class SetManager(Set): 54 | ''' 55 | Easily add and remove config variables per module or instance. Subclass of 56 | ``collections.abc.Set``. 57 | 58 | Only special functionality beyond that provided by set() is 59 | to normalize constants for comparison (lower case, no periods) 60 | when they are add()ed and remove()d and allow passing multiple 61 | string arguments to the :py:func:`add()` and :py:func:`remove()` methods. 
62 | 63 | ''' 64 | 65 | def __init__(self, elements): 66 | self.elements = set(elements) # copied into a plain set; duplicates collapse 67 | 68 | def __call__(self): 69 | return self.elements # calling the manager exposes the underlying set itself (not a copy) 70 | 71 | def __repr__(self): 72 | return "SetManager({})".format(self.elements) # used for docs 73 | 74 | def __iter__(self): 75 | return iter(self.elements) # iteration is delegated to the set's own iterator 76 | 77 | def __contains__(self, value): 78 | return value in self.elements 79 | 80 | def __len__(self): 81 | return len(self.elements) 82 | 83 | def next(self): 84 | return self.__next__() # old-style iterator spelling; simply delegates to __next__ 85 | 86 | def __next__(self): 87 | if self.count >= len(self.elements): # NOTE(review): ``count`` is not assigned in __init__ above -- confirm it is set somewhere before this can run 88 | self.count = 0 89 | raise StopIteration 90 | else: 91 | c = self.count 92 | self.count = c + 1 93 | return getattr(self, self.elements[c]) or next(self) # NOTE(review): ``elements`` is a set (see __init__) and sets cannot be indexed -- verify this path is ever reached; __iter__ does not use it 94 | 95 | def add_with_encoding(self, s, encoding=None): 96 | """ 97 | Add the lower case and no-period version of the string to the set. Pass an 98 | explicit `encoding` parameter to specify the encoding of binary strings that 99 | are not DEFAULT_ENCODING (UTF-8). 100 | """ 101 | stdin_encoding = None 102 | if sys.stdin: 103 | stdin_encoding = sys.stdin.encoding 104 | encoding = encoding or stdin_encoding or DEFAULT_ENCODING # precedence: explicit argument, then sys.stdin's encoding, then DEFAULT_ENCODING 105 | if type(s) == binary_type: # exact type match only; anything else goes to lc() undecoded 106 | s = s.decode(encoding) 107 | self.elements.add(lc(s)) 108 | 109 | def add(self, *strings): 110 | """ 111 | Add the lower case and no-period version of the string arguments to the set. 112 | Can pass a list of strings. Returns ``self`` for chaining. 113 | """ 114 | [self.add_with_encoding(s) for s in strings] # comprehension is used only for its side effect; the list is discarded 115 | return self 116 | 117 | def remove(self, *strings): 118 | """ 119 | Remove the lower case and no-period version of the string arguments from the set. 120 | Returns ``self`` for chaining. 121 | """ 122 | [self.elements.remove(lc(s)) for s in strings if lc(s) in self.elements] # values that are not present are skipped silently (no KeyError) 123 | return self 124 | 125 | 126 | class TupleManager(dict): 127 | ''' 128 | A dictionary with dot.notation access. Subclass of ``dict``. Makes the tuple constants 129 | more friendly. 
130 | ''' 131 | 132 | def __getattr__(self, attr): 133 | return self.get(attr) # missing keys yield None rather than raising AttributeError 134 | __setattr__ = dict.__setitem__ 135 | __delattr__ = dict.__delitem__ 136 | 137 | def __getstate__(self): 138 | return dict(self) 139 | 140 | def __setstate__(self, state): 141 | self.__init__(state) 142 | 143 | def __reduce__(self): 144 | return (TupleManager, (), self.__getstate__()) 145 | 146 | 147 | class Constants(object): 148 | """ 149 | An instance of this class holds all of the configuration constants for the parser. 150 | 151 | :param set prefixes: 152 | :py:attr:`prefixes` wrapped with :py:class:`SetManager`. 153 | :param set titles: 154 | :py:attr:`titles` wrapped with :py:class:`SetManager`. 155 | :param set first_name_titles: 156 | :py:attr:`~titles.FIRST_NAME_TITLES` wrapped with :py:class:`SetManager`. 157 | :param set suffix_acronyms: 158 | :py:attr:`~suffixes.SUFFIX_ACRONYMS` wrapped with :py:class:`SetManager`. 159 | :param set suffix_not_acronyms: 160 | :py:attr:`~suffixes.SUFFIX_NOT_ACRONYMS` wrapped with :py:class:`SetManager`. 161 | :param set conjunctions: 162 | :py:attr:`conjunctions` wrapped with :py:class:`SetManager`. 163 | :type capitalization_exceptions: tuple or dict 164 | :param capitalization_exceptions: 165 | :py:attr:`~capitalization.CAPITALIZATION_EXCEPTIONS` wrapped with :py:class:`TupleManager`. 166 | :type regexes: tuple or dict 167 | :param regexes: 168 | :py:attr:`regexes` wrapped with :py:class:`TupleManager`. 169 | """ 170 | 171 | string_format = "{title} {first} {middle} {last} {suffix} ({nickname})" 172 | """ 173 | The default string format use for all new `HumanName` instances. 174 | """ 175 | 176 | initials_format = "{first} {middle} {last}" 177 | """ 178 | The default initials format used for all new `HumanName` instances. 179 | """ 180 | 181 | initials_delimiter = "." 182 | """ 183 | The default initials delimiter used for all new `HumanName` instances. 184 | Will be used to add a delimiter between each initial. 
185 | """ 186 | 187 | empty_attribute_default = '' 188 | """ 189 | Default return value for empty attributes. 190 | 191 | .. doctest:: 192 | 193 | >>> from nameparser.config import CONSTANTS 194 | >>> CONSTANTS.empty_attribute_default = None 195 | >>> name = HumanName("John Doe") 196 | >>> name.title 197 | None 198 | >>>name.first 199 | 'John' 200 | 201 | """ 202 | 203 | capitalize_name = False 204 | """ 205 | If set, applies :py:meth:`~nameparser.parser.HumanName.capitalize` to 206 | :py:class:`~nameparser.parser.HumanName` instance. 207 | 208 | .. doctest:: 209 | 210 | >>> from nameparser.config import CONSTANTS 211 | >>> CONSTANTS.capitalize_name = True 212 | >>> name = HumanName("bob v. de la macdole-eisenhower phd") 213 | >>> str(name) 214 | 'Bob V. de la MacDole-Eisenhower Ph.D.' 215 | 216 | """ 217 | 218 | force_mixed_case_capitalization = False 219 | """ 220 | If set, forces the capitalization of mixed case strings when 221 | :py:meth:`~nameparser.parser.HumanName.capitalize` is called. 222 | 223 | .. 
doctest:: 224 | 225 | >>> from nameparser.config import CONSTANTS 226 | >>> CONSTANTS.force_mixed_case_capitalization = True 227 | >>> name = HumanName('Shirley Maclaine') 228 | >>> name.capitalize() 229 | >>> str(name) 230 | 'Shirley MacLaine' 231 | 232 | """ 233 | 234 | def __init__(self, 235 | prefixes=PREFIXES, 236 | suffix_acronyms=SUFFIX_ACRONYMS, 237 | suffix_not_acronyms=SUFFIX_NOT_ACRONYMS, 238 | titles=TITLES, 239 | first_name_titles=FIRST_NAME_TITLES, 240 | conjunctions=CONJUNCTIONS, 241 | capitalization_exceptions=CAPITALIZATION_EXCEPTIONS, 242 | regexes=REGEXES 243 | ): 244 | self.prefixes = SetManager(prefixes) 245 | self.suffix_acronyms = SetManager(suffix_acronyms) 246 | self.suffix_not_acronyms = SetManager(suffix_not_acronyms) 247 | self.titles = SetManager(titles) 248 | self.first_name_titles = SetManager(first_name_titles) 249 | self.conjunctions = SetManager(conjunctions) 250 | self.capitalization_exceptions = TupleManager(capitalization_exceptions) 251 | self.regexes = TupleManager(regexes) 252 | self._pst = None # cache slot for the suffixes_prefixes_titles union below 253 | 254 | @property 255 | def suffixes_prefixes_titles(self): 256 | if not self._pst: # built on first access and then reused; NOTE(review): nothing visible here resets it after the sets are edited -- confirm that is intended 257 | self._pst = self.prefixes | self.suffix_acronyms | self.suffix_not_acronyms | self.titles 258 | return self._pst 259 | 260 | def __repr__(self): 261 | return "" 262 | 263 | def __setstate__(self, state): 264 | self.__init__(state) # NOTE(review): ``state`` is passed positionally, i.e. as ``prefixes``, while __getstate__ returns a dict of attributes -- verify the pickle round-trip 265 | 266 | def __getstate__(self): 267 | attrs = [x for x in dir(self) if not x.startswith('_')] # every public name from dir(), so class-level defaults and the property above are included 268 | return dict([(a, getattr(self, a)) for a in attrs]) 269 | 270 | 271 | #: A module-level instance of the :py:class:`Constants()` class. 272 | #: Provides a common instance for the module to share 273 | #: to easily adjust configuration for the entire module. 274 | #: See `Customizing the Parser with Your Own Configuration `_. 
275 | CONSTANTS = Constants() 276 | -------------------------------------------------------------------------------- /docs/customize.rst: -------------------------------------------------------------------------------- 1 | Customizing the Parser with Your Own Configuration 2 | ================================================== 3 | 4 | Recognition of titles, prefixes, suffixes and conjunctions is handled by 5 | matching the lower case characters of a name piece with pre-defined sets 6 | of strings located in :py:mod:`nameparser.config`. You can adjust 7 | these predefined sets to help fine tune the parser for your dataset. 8 | 9 | Changing the Parser Constants 10 | ----------------------------- 11 | 12 | There are a few ways to adjust the parser configuration depending on your 13 | needs. The config is available in two places. 14 | 15 | The first is via ``from nameparser.config import CONSTANTS``. 16 | 17 | .. doctest:: 18 | 19 | >>> from nameparser.config import CONSTANTS 20 | >>> CONSTANTS 21 | 22 | 23 | The other is the ``C`` attribute of a ``HumanName`` instance, e.g. 24 | ``hn.C``. 25 | 26 | .. doctest:: 27 | 28 | >>> from nameparser import HumanName 29 | >>> hn = HumanName("Dean Robert Johns") 30 | >>> hn.C 31 | 32 | 33 | Both places are usually a reference to the same shared module-level 34 | :py:class:`~nameparser.config.CONSTANTS` instance, depending on how you 35 | instantiate the :py:class:`~nameparser.parser.HumanName` class (see below). 36 | 37 | 38 | 39 | Editable attributes of nameparser.config.CONSTANTS 40 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 41 | 42 | * :py:data:`~nameparser.config.titles.TITLES` - Pieces that come before the name. Includes all `first_name_titles`. Cannot include things that may be first names. 43 | * :py:data:`~nameparser.config.FIRST_NAME_TITLES` - Titles that, when followed by a single name, that name is a first name, e.g. "King David". 
44 | * :py:data:`~nameparser.config.SUFFIX_ACRONYMS` - Pieces that come at the end of the name that may or may not have periods separating the letters, e.g. "m.d.". 45 | * :py:data:`~nameparser.config.SUFFIX_NOT_ACRONYMS` - Pieces that come at the end of the name that never have periods separating the letters, e.g. "Jr.". 46 | * :py:data:`~nameparser.config.conjunctions.CONJUNCTIONS` - Connectors like "and" that join the preceding piece to the following piece. 47 | * :py:data:`~nameparser.config.prefixes.PREFIXES` - Connectors like "del" and "bin" that join to the following piece but not the preceding, similar to titles but can appear anywhere in the name. 48 | * :py:data:`~nameparser.config.CAPITALIZATION_EXCEPTIONS` - Dictionary of pieces that do not capitalize the first letter, e.g. "Ph.D". 49 | * :py:data:`~nameparser.config.regexes.REGEXES` - Regular expressions used to find words, initials, nicknames, etc. 50 | 51 | Each set of constants comes with :py:func:`~nameparser.config.SetManager.add` and :py:func:`~nameparser.config.SetManager.remove` methods for tuning 52 | the constants for your project. These methods automatically lower case and 53 | remove punctuation to normalize them for comparison. 54 | 55 | Other editable attributes 56 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 57 | 58 | * :py:obj:`~nameparser.config.Constants.string_format` - controls output from `str()` 59 | * :py:obj:`~nameparser.config.Constants.empty_attribute_default` - value returned by empty attributes, defaults to empty string 60 | * :py:obj:`~nameparser.config.Constants.capitalize_name` - If set, applies :py:meth:`~nameparser.parser.HumanName.capitalize` to :py:class:`~nameparser.parser.HumanName` instance. 61 | * :py:obj:`~nameparser.config.Constants.force_mixed_case_capitalization` - If set, forces the capitalization of mixed case strings when :py:meth:`~nameparser.parser.HumanName.capitalize` is called. 
62 | 63 | 64 | 65 | Parser Customization Examples 66 | ----------------------------- 67 | 68 | Removing a Title 69 | ~~~~~~~~~~~~~~~~ 70 | 71 | Take a look at the :py:mod:`nameparser.config` documentation to see what's 72 | in the constants. Here's a quick walk through of some examples where you 73 | might want to adjust them. 74 | 75 | "Hon" is a common abbreviation for "Honorable", a title used when 76 | addressing judges, and is included in the default titles constants. This 77 | means it will never be considered a first name, because titles are the 78 | pieces before first names. 79 | 80 | But "Hon" is also sometimes a first name. If your dataset contains more 81 | "Hon"s than "Honorable"s, you may wish to remove it from the titles 82 | constant so that "Hon" can be parsed as a first name. 83 | 84 | .. doctest:: 85 | :options: +ELLIPSIS, +NORMALIZE_WHITESPACE 86 | 87 | >>> from nameparser import HumanName 88 | >>> hn = HumanName("Hon Solo") 89 | >>> hn 90 | 98 | >>> from nameparser.config import CONSTANTS 99 | >>> CONSTANTS.titles.remove('hon') 100 | SetManager({'right', ..., 'tax'}) 101 | >>> hn = HumanName("Hon Solo") 102 | >>> hn 103 | 111 | 112 | 113 | If you don't want to detect any titles at all, you can remove all of them: 114 | 115 | >>> CONSTANTS.titles.remove(*CONSTANTS.titles) 116 | 117 | 118 | Adding a Title 119 | ~~~~~~~~~~~~~~~~ 120 | 121 | You can also pass a ``Constants`` instance to ``HumanName`` on instantiation. 122 | 123 | "Dean" is a common first name so it is not included in the default titles 124 | constant. But in some contexts it is more common as a title. If you would 125 | like "Dean" to be parsed as a title, simply add it to the titles constant. 126 | 127 | You can pass multiple strings to both the :py:func:`~nameparser.config.SetManager.add` 128 | and :py:func:`~nameparser.config.SetManager.remove` 129 | methods and each string will be added or removed.
Both functions 130 | automatically normalize the strings for the parser's comparison method by 131 | making them lower case and removing periods. 132 | 133 | .. doctest:: 134 | :options: +ELLIPSIS, +NORMALIZE_WHITESPACE 135 | 136 | >>> from nameparser import HumanName 137 | >>> from nameparser.config import Constants 138 | >>> constants = Constants() 139 | >>> constants.titles.add('dean', 'Chemistry') 140 | SetManager({'right', ..., 'tax'}) 141 | >>> hn = HumanName("Assoc Dean of Chemistry Robert Johns", constants=constants) 142 | >>> hn 143 | 151 | 152 | 153 | Module-level Shared Configuration Instance 154 | ------------------------------------------ 155 | 156 | When you modify the configuration, by default this will modify the behavior of all 157 | HumanName instances. This could be a handy way to set it up for your entire 158 | project, but it could also lead to some unexpected behavior because changing 159 | the config on one instance could modify the behavior of another instance. 160 | 161 | .. doctest:: module config 162 | :options: +ELLIPSIS, +NORMALIZE_WHITESPACE 163 | 164 | >>> from nameparser import HumanName 165 | >>> instance = HumanName("") 166 | >>> instance.C.titles.add('dean') 167 | SetManager({'right', ..., 'tax'}) 168 | >>> other_instance = HumanName("Dean Robert Johns") 169 | >>> other_instance # Dean parses as title 170 | 178 | 179 | 180 | If you'd prefer new instances to have their own config values, one shortcut is to pass 181 | ``None`` as the second argument (or ``constants`` keyword argument) when 182 | instantiating ``HumanName``. Each instance always has a ``C`` attribute, but if 183 | you didn't pass something falsey to the ``constants`` argument then it's a 184 | reference to the module-level config values with the behavior described above. 185 | 186 | ..
doctest:: module config 187 | :options: +ELLIPSIS, +NORMALIZE_WHITESPACE 188 | 189 | >>> from nameparser import HumanName 190 | >>> instance = HumanName("Dean Robert Johns") 191 | >>> instance.has_own_config 192 | False 193 | >>> instance.C.titles.add('dean') 194 | SetManager({'right', ..., 'tax'}) 195 | >>> other_instance = HumanName("Dean Robert Johns", None) # <-- pass None for per-instance config 196 | >>> other_instance 197 | 205 | >>> other_instance.has_own_config 206 | True 207 | 208 | Don't Remove Emojis 209 | ~~~~~~~~~~~~~~~~~~~ 210 | 211 | By default, all emojis are removed from the input string before the name is parsed. 212 | You can turn this off by setting the ``emoji`` regex to ``False``. 213 | 214 | .. doctest:: 215 | 216 | >>> from nameparser import HumanName 217 | >>> from nameparser.config import Constants 218 | >>> constants = Constants() 219 | >>> constants.regexes.emoji = False 220 | >>> hn = HumanName("Sam 😊 Smith", constants=constants) 221 | >>> hn 222 | "Sam 😊 Smith" 223 | 224 | Config Changes May Need Parse Refresh 225 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 226 | 227 | The full name is parsed upon assignment to the ``full_name`` attribute or 228 | instantiation. Sometimes after making changes to configuration or other inner 229 | data after assigning the full name, the name will need to be re-parsed with the 230 | :py:func:`~nameparser.parser.HumanName.parse_full_name()` method before you see 231 | those changes with ``repr()``. 232 | 233 | 234 | Adjusting names after parsing them 235 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 236 | 237 | Each attribute has a corresponding ordered list of name pieces. If you're doing 238 | pre- or post-processing you may wish to manipulate these lists directly. 239 | The strings returned by the attribute names just join these lists with spaces. 
240 | 241 | 242 | * o.title_list 243 | * o.first_list 244 | * o.middle_list 245 | * o.last_list 246 | * o.suffix_list 247 | * o.nickname_list 248 | 249 | :: 250 | 251 | >>> hn = HumanName("Juan Q. Xavier Velasquez y Garcia, Jr.") 252 | >>> hn.middle_list 253 | ['Q.', 'Xavier'] 254 | >>> hn.middle_list += ["Ricardo"] 255 | >>> hn.middle_list 256 | ['Q.', 'Xavier', 'Ricardo'] 257 | 258 | 259 | You can also replace any name bucket's contents by assigning a string or a list 260 | directly to the attribute. 261 | 262 | :: 263 | 264 | >>> hn = HumanName("Dr. John A. Kenneth Doe") 265 | >>> hn.title = ["Associate","Professor"] 266 | >>> hn.suffix = "Md." 267 | >>> hn.suffix 268 | 276 | 277 | 278 | 279 | -------------------------------------------------------------------------------- /nameparser/config/suffixes.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | SUFFIX_NOT_ACRONYMS = set([ 5 | 'dr', 6 | 'esq', 7 | 'esquire', 8 | 'jr', 9 | 'jnr', 10 | 'junior', 11 | 'sr', 12 | 'snr', 13 | '2', 14 | 'i', 15 | 'ii', 16 | 'iii', 17 | 'iv', 18 | 'v', 19 | ]) 20 | """ 21 | 22 | Post-nominal pieces that are not acronyms. The parser does not remove periods 23 | when matching against these pieces. 
24 | 25 | """ 26 | SUFFIX_ACRONYMS = set([ 27 | '(ret)', 28 | '(vet)', 29 | '8-vsb', 30 | 'aas', 31 | 'aba', 32 | 'abc', 33 | 'abd', 34 | 'abpp', 35 | 'abr', 36 | 'aca', 37 | 'acas', 38 | 'ace', 39 | 'acha', 40 | 'acp', 41 | 'ae', 42 | 'ae', 43 | 'aem', 44 | 'afasma', 45 | 'afc', 46 | 'afc', 47 | 'afm', 48 | 'afm', 49 | 'agsf', 50 | 'aia', 51 | 'aicp', 52 | 'ala', 53 | 'alc', 54 | 'alp', 55 | 'am', 56 | 'amd', 57 | 'ame', 58 | 'amieee', 59 | 'ams', 60 | 'aphr', 61 | 'apn aprn', 62 | 'apr', 63 | 'apss', 64 | 'aqp', 65 | 'arm', 66 | 'arrc', 67 | 'asa', 68 | 'asc', 69 | 'asid', 70 | 'asla', 71 | 'asp', 72 | 'atc', 73 | 'awb', 74 | 'bca', 75 | 'bcl', 76 | 'bcss', 77 | 'bds', 78 | 'bem', 79 | 'bem', 80 | 'bls-i', 81 | 'bpe', 82 | 'bpi', 83 | 'bpt', 84 | 'bt', 85 | 'btcs', 86 | 'bts', 87 | 'cacts', 88 | 'cae', 89 | 'caha', 90 | 'caia', 91 | 'cams', 92 | 'cap', 93 | 'capa', 94 | 'capm', 95 | 'capp', 96 | 'caps', 97 | 'caro', 98 | 'cas', 99 | 'casp', 100 | 'cb', 101 | 'cbe', 102 | 'cbm', 103 | 'cbne', 104 | 'cbnt', 105 | 'cbp', 106 | 'cbrte', 107 | 'cbs', 108 | 'cbsp', 109 | 'cbt', 110 | 'cbte', 111 | 'cbv', 112 | 'cca', 113 | 'ccc', 114 | 'ccca', 115 | 'cccm', 116 | 'cce', 117 | 'cchp', 118 | 'ccie', 119 | 'ccim', 120 | 'cciso', 121 | 'ccm', 122 | 'ccmt', 123 | 'ccna', 124 | 'ccnp', 125 | 'ccp', 126 | 'ccp-c', 127 | 'ccpr', 128 | 'ccs', 129 | 'ccufc', 130 | 'cd', 131 | 'cdal', 132 | 'cdfm', 133 | 'cdmp', 134 | 'cds', 135 | 'cdt', 136 | 'cea', 137 | 'ceas', 138 | 'cebs', 139 | 'ceds', 140 | 'ceh', 141 | 'cela', 142 | 'cem', 143 | 'cep', 144 | 'cera', 145 | 'cet', 146 | 'cfa', 147 | 'cfc', 148 | 'cfcc', 149 | 'cfce', 150 | 'cfcm', 151 | 'cfe', 152 | 'cfeds', 153 | 'cfi', 154 | 'cfm', 155 | 'cfp', 156 | 'cfps', 157 | 'cfr', 158 | 'cfre', 159 | 'cga', 160 | 'cgap', 161 | 'cgb', 162 | 'cgc', 163 | 'cgfm', 164 | 'cgfo', 165 | 'cgm', 166 | 'cgm', 167 | 'cgma', 168 | 'cgp', 169 | 'cgr', 170 | 'cgsp', 171 | 'ch', 172 | 'ch', 173 | 'cha', 174 | 'chba', 175 | 'chdm', 176 | 'che', 
177 | 'ches', 178 | 'chfc', 179 | 'chfc', 180 | 'chi', 181 | 'chmc', 182 | 'chmm', 183 | 'chp', 184 | 'chpa', 185 | 'chpe', 186 | 'chpln', 187 | 'chpse', 188 | 'chrm', 189 | 'chsc', 190 | 'chse', 191 | 'chse-a', 192 | 'chsos', 193 | 'chss', 194 | 'cht', 195 | 'cia', 196 | 'cic', 197 | 'cie', 198 | 'cig', 199 | 'cip', 200 | 'cipm', 201 | 'cips', 202 | 'ciro', 203 | 'cisa', 204 | 'cism', 205 | 'cissp', 206 | 'cla', 207 | 'clsd', 208 | 'cltd', 209 | 'clu', 210 | 'cm', 211 | 'cma', 212 | 'cmas', 213 | 'cmc', 214 | 'cmfo', 215 | 'cmg', 216 | 'cmp', 217 | 'cms', 218 | 'cmsp', 219 | 'cmt', 220 | 'cna', 221 | 'cnm', 222 | 'cnp', 223 | 'cp', 224 | 'cp-c', 225 | 'cpa', 226 | 'cpacc', 227 | 'cpbe', 228 | 'cpcm', 229 | 'cpcu', 230 | 'cpe', 231 | 'cpfa', 232 | 'cpfo', 233 | 'cpg', 234 | 'cph', 235 | 'cpht', 236 | 'cpim', 237 | 'cpl', 238 | 'cplp', 239 | 'cpm', 240 | 'cpo', 241 | 'cpp', 242 | 'cppm', 243 | 'cprc', 244 | 'cpre', 245 | 'cprp', 246 | 'cpsc', 247 | 'cpsi', 248 | 'cpss', 249 | 'cpt', 250 | 'cpwa', 251 | 'crde', 252 | 'crisc', 253 | 'crma', 254 | 'crme', 255 | 'crna', 256 | 'cro', 257 | 'crp', 258 | 'crt', 259 | 'crtt', 260 | 'csa', 261 | 'csbe', 262 | 'csc', 263 | 'cscp', 264 | 'cscu', 265 | 'csep', 266 | 'csi', 267 | 'csm', 268 | 'csp', 269 | 'cspo', 270 | 'csre', 271 | 'csrte', 272 | 'csslp', 273 | 'cssm', 274 | 'cst', 275 | 'cste', 276 | 'ctbs', 277 | 'ctfa', 278 | 'cto', 279 | 'ctp', 280 | 'cts', 281 | 'cua', 282 | 'cusp', 283 | 'cva', 284 | 'cva[22]', 285 | 'cvo', 286 | 'cvp', 287 | 'cvrs', 288 | 'cwap', 289 | 'cwb', 290 | 'cwdp', 291 | 'cwep', 292 | 'cwna', 293 | 'cwne', 294 | 'cwp', 295 | 'cwsp', 296 | 'cxa', 297 | 'cyds', 298 | 'cysa', 299 | 'dabfm', 300 | 'dabvlm', 301 | 'dacvim', 302 | 'dbe', 303 | 'dc', 304 | 'dcb', 305 | 'dcm', 306 | 'dcmg', 307 | 'dcvo', 308 | 'dd', 309 | 'dds', 310 | 'ded', 311 | 'dep', 312 | 'dfc', 313 | 'dfm', 314 | 'diplac', 315 | 'diplom', 316 | 'djur', 317 | 'dma', 318 | 'dmd', 319 | 'dmin', 320 | 'dnp', 321 | 'do', 322 | 'dpm', 
323 | 'dpt', 324 | 'drb', 325 | 'drmp', 326 | 'drph', 327 | 'dsc', 328 | 'dsm', 329 | 'dso', 330 | 'dss', 331 | 'dtr', 332 | 'dvep', 333 | 'dvm', 334 | 'ea', 335 | 'ed', 336 | 'edd', 337 | 'ei', 338 | 'eit', 339 | 'els', 340 | 'emd', 341 | 'emt-b', 342 | 'emt-i/85', 343 | 'emt-i/99', 344 | 'emt-p', 345 | 'enp', 346 | 'erd', 347 | 'esq', 348 | 'evp', 349 | 'faafp', 350 | 'faan', 351 | 'faap', 352 | 'fac-c', 353 | 'facc', 354 | 'facd', 355 | 'facem', 356 | 'facep', 357 | 'facha', 358 | 'facofp', 359 | 'facog', 360 | 'facp', 361 | 'facph', 362 | 'facs', 363 | 'faia', 364 | 'faicp', 365 | 'fala', 366 | 'fashp', 367 | 'fasid', 368 | 'fasla', 369 | 'fasma', 370 | 'faspen', 371 | 'fca', 372 | 'fcas', 373 | 'fcela', 374 | 'fd', 375 | 'fec', 376 | 'fhames', 377 | 'fic', 378 | 'ficf', 379 | 'fieee', 380 | 'fmp', 381 | 'fmva', 382 | 'fnss', 383 | 'fp&a', 384 | 'fp-c', 385 | 'fpc', 386 | 'frm', 387 | 'fsa', 388 | 'fsdp', 389 | 'fws', 390 | 'gaee[14]', 391 | 'gba', 392 | 'gbe', 393 | 'gc', 394 | 'gcb', 395 | 'gcb', 396 | 'gchs', 397 | 'gcie', 398 | 'gcmg', 399 | 'gcmg', 400 | 'gcsi', 401 | 'gcvo', 402 | 'gcvo', 403 | 'gisp', 404 | 'git', 405 | 'gm', 406 | 'gmb', 407 | 'gmr', 408 | 'gphr', 409 | 'gri', 410 | 'grp', 411 | 'gsmieee', 412 | 'hccp', 413 | 'hrs', 414 | 'iaccp', 415 | 'iaee', 416 | 'iccm-d', 417 | 'iccm-f', 418 | 'idsm', 419 | 'ifgict', 420 | 'iom', 421 | 'ipep', 422 | 'ipm', 423 | 'iso', 424 | 'issp-csp', 425 | 'issp-sa', 426 | 'itil', 427 | 'jd', 428 | 'jp', 429 | 'kbe', 430 | 'kcb', 431 | 'kchs/dchs', 432 | 'kcie', 433 | 'kcie', 434 | 'kcmg', 435 | 'kcsi', 436 | 'kcsi', 437 | 'kcvo', 438 | 'kg', 439 | 'khs/dhs', 440 | 'kp', 441 | 'kt', 442 | 'lac', 443 | 'lcmt', 444 | 'lcpc', 445 | 'lcsw', 446 | 'leed ap', 447 | 'lg', 448 | 'litk', 449 | 'litl', 450 | 'litp', 451 | 'llm', 452 | 'lm', 453 | 'lmsw', 454 | 'lmt', 455 | 'lp', 456 | 'lpa', 457 | 'lpc', 458 | 'lpn', 459 | 'lpss', 460 | 'lsi', 461 | 'lsit', 462 | 'lt', 463 | 'lvn', 464 | 'lvo', 465 | 'lvt', 466 | 'ma', 
467 | 'maaa', 468 | 'mai', 469 | 'mba', 470 | 'mbe', 471 | 'mbs', 472 | 'mc', 473 | 'mcct', 474 | 'mcdba', 475 | 'mches', 476 | 'mcm', 477 | 'mcp', 478 | 'mcpd', 479 | 'mcsa', 480 | 'mcsd', 481 | 'mcse', 482 | 'mct', 483 | 'md', 484 | 'mdiv', 485 | 'mem', 486 | 'mfa', 487 | 'micp', 488 | 'mieee', 489 | 'mirm', 490 | 'mle', 491 | 'mls', 492 | 'mlse', 493 | 'mlt', 494 | 'mm', 495 | 'mmad', 496 | 'mmas', 497 | 'mnaa', 498 | 'mnae', 499 | 'mp', 500 | 'mpa', 501 | 'mph', 502 | 'mpse', 503 | 'mra', 504 | 'ms', 505 | 'msa', 506 | 'msc', 507 | 'mscmsm', 508 | 'msm', 509 | 'mt', 510 | 'mts', 511 | 'mvo', 512 | 'nbc-his', 513 | 'nbcch', 514 | 'nbcch-ps', 515 | 'nbcdch', 516 | 'nbcdch-ps', 517 | 'nbcfch', 518 | 'nbcfch-ps', 519 | 'nbct', 520 | 'ncarb', 521 | 'nccp', 522 | 'ncidq', 523 | 'ncps', 524 | 'ncso', 525 | 'ncto', 526 | 'nd', 527 | 'ndtr', 528 | 'nicet i', 529 | 'nicet ii', 530 | 'nicet iii', 531 | 'nicet iv', 532 | 'nmd', 533 | 'np', 534 | 'np[18]', 535 | 'nraemt', 536 | 'nremr', 537 | 'nremt', 538 | 'nrp', 539 | 'obe', 540 | 'obi', 541 | 'oca', 542 | 'ocm', 543 | 'ocp', 544 | 'od', 545 | 'om', 546 | 'oscp', 547 | 'ot', 548 | 'pa-c', 549 | 'pcc', 550 | 'pci', 551 | 'pe', 552 | 'pfmp', 553 | 'pg', 554 | 'pgmp', 555 | 'ph', 556 | 'pharmd', 557 | 'phc', 558 | 'phd', 559 | 'phr', 560 | 'phrca', 561 | 'pla', 562 | 'pls', 563 | 'pmc', 564 | 'pmi-acp', 565 | 'pmp', 566 | 'pp', 567 | 'pps', 568 | 'prm', 569 | 'psm i', 570 | 'psm ii', 571 | 'psm', 572 | 'psp', 573 | 'psyd', 574 | 'pt', 575 | 'pta', 576 | 'qam', 577 | 'qc', 578 | 'qcsw', 579 | 'qfsm', 580 | 'qgm', 581 | 'qpm', 582 | 'qsd', 583 | 'qsp', 584 | 'ra', 585 | 'rai', 586 | 'rba', 587 | 'rci', 588 | 'rcp', 589 | 'rd', 590 | 'rdcs', 591 | 'rdh', 592 | 'rdms', 593 | 'rdn', 594 | 'res', 595 | 'rfp', 596 | 'rhca', 597 | 'rid', 598 | 'rls', 599 | 'rmsks', 600 | 'rn', 601 | 'rp', 602 | 'rpa', 603 | 'rph', 604 | 'rpl', 605 | 'rrc', 606 | 'rrt', 607 | 'rrt-accs', 608 | 'rrt-nps', 609 | 'rrt-sds', 610 | 'rtrp', 611 | 'rvm', 612
| 'rvt', 613 | 'sa', 614 | 'same', 615 | 'sasm', 616 | 'sccp', 617 | 'scmp', 618 | 'se', 619 | 'secb', 620 | 'sfp', 621 | 'sgm', 622 | 'shrm-cp', 623 | 'shrm-scp', 624 | 'si', 625 | 'siie', 626 | 'smieee', 627 | 'sphr', 628 | 'sra', 629 | 'sscp', 630 | 'stmieee', 631 | 'tbr-ct', 632 | 'td', 633 | 'thd', 634 | 'thm', 635 | 'ud', 636 | 'usa', 637 | 'usaf', 638 | 'usar', 639 | 'uscg', 640 | 'usmc', 641 | 'usn', 642 | 'usnr', 643 | 'uxc', 644 | 'uxmc', 645 | 'vc', 646 | 'vc', 647 | 'vcp', 648 | 'vd', 649 | 'vrd', 650 | ]) 651 | """ 652 | 653 | Post-nominal acronyms. Titles, degrees and other things people stick after their name 654 | that may or may not have periods between the letters. The parser removes periods 655 | when matching against these pieces. 656 | 657 | """ 658 | -------------------------------------------------------------------------------- /nameparser/config/titles.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | FIRST_NAME_TITLES = set([ 5 | 'aunt', 6 | 'auntie', 7 | 'brother', 8 | 'dame', 9 | 'father', 10 | 'king', 11 | 'maid', 12 | 'master', 13 | 'mother', 14 | 'pope', 15 | 'queen', 16 | 'sir', 17 | 'sister', 18 | 'uncle', 19 | 'sheikh', 20 | 'sheik', 21 | 'shaik', 22 | 'shayk', 23 | 'shaykh', 24 | 'shaikh', 25 | 'cheikh', 26 | 'shekh', 27 | ]) 28 | """ 29 | When these titles appear with a single other name, that name is a first name, e.g. 30 | "Sir John", "Sister Mary", "Queen Elizabeth". 31 | """ 32 | 33 | #: **Cannot include things that could also be first names**, e.g. "dean". 34 | #: Many of these from wikipedia: https://en.wikipedia.org/wiki/Title. 35 | #: The parser recognizes chains of these including conjunctions allowing 36 | #: recognition titles like "Deputy Secretary of State". 
37 | TITLES = FIRST_NAME_TITLES | set([ 38 | "attaché", 39 | "chargé d'affaires", 40 | "king's", 41 | "marchioness", 42 | "marquess", 43 | "marquis", 44 | "marquise", 45 | "queen's", 46 | '10th', 47 | '1lt', 48 | '1sgt', 49 | '1st', 50 | '1stlt', 51 | '1stsgt', 52 | '2lt', 53 | '2nd', 54 | '2ndlt', 55 | '3rd', 56 | '4th', 57 | '5th', 58 | '6th', 59 | '7th', 60 | '8th', 61 | '9th', 62 | 'a1c', 63 | 'ab', 64 | 'abbess', 65 | 'abbot', 66 | 'abolitionist', 67 | 'academic', 68 | 'acolyte', 69 | 'activist', 70 | 'actor', 71 | 'actress', 72 | 'adept', 73 | 'adjutant', 74 | 'adm', 75 | 'admiral', 76 | 'advertising', 77 | 'adviser', 78 | 'advocate', 79 | 'air', 80 | 'akhoond', 81 | 'alderman', 82 | 'almoner', 83 | 'ambassador', 84 | 'amn', 85 | 'analytics', 86 | 'anarchist', 87 | 'animator', 88 | 'anthropologist', 89 | 'appellate', 90 | 'apprentice', 91 | 'arbitrator', 92 | 'archbishop', 93 | 'archdeacon', 94 | 'archdruid', 95 | 'archduchess', 96 | 'archduke', 97 | 'archeologist', 98 | 'architect', 99 | 'arhat', 100 | 'army', 101 | 'arranger', 102 | 'assistant', 103 | 'assoc', 104 | 'associate', 105 | 'asst', 106 | 'astronomer', 107 | 'attache', 108 | 'attorney', 109 | 'author', 110 | 'award-winning', 111 | 'ayatollah', 112 | 'baba', 113 | 'bailiff', 114 | 'ballet', 115 | 'bandleader', 116 | 'banker', 117 | 'banner', 118 | 'bard', 119 | 'baron', 120 | 'baroness', 121 | 'barrister', 122 | 'baseball', 123 | 'bearer', 124 | 'behavioral', 125 | 'bench', 126 | 'bg', 127 | 'bgen', 128 | 'biblical', 129 | 'bibliographer', 130 | 'biochemist', 131 | 'biographer', 132 | 'biologist', 133 | 'bishop', 134 | 'blessed', 135 | 'blogger', 136 | 'blues', 137 | 'bodhisattva', 138 | 'bookseller', 139 | 'botanist', 140 | 'bp', 141 | 'brigadier', 142 | 'briggen', 143 | 'british', 144 | 'broadcaster', 145 | 'buddha', 146 | 'burgess', 147 | 'burlesque', 148 | 'business', 149 | 'businessman', 150 | 'businesswoman', 151 | 'bwana', 152 | 'canon', 153 | 'capt', 154 | 'captain', 155 | 'cardinal', 156
| 'cartographer', 157 | 'cartoonist', 158 | 'catholicos', 159 | 'ccmsgt', 160 | 'cdr', 161 | 'celebrity', 162 | 'ceo', 163 | 'cfo', 164 | 'chair', 165 | 'chairs', 166 | 'chancellor', 167 | 'chaplain', 168 | 'chef', 169 | 'chemist', 170 | 'chief', 171 | 'chieftain', 172 | 'choreographer', 173 | 'civil', 174 | 'classical', 175 | 'clergyman', 176 | 'clerk', 177 | 'cmsaf', 178 | 'cmsgt', 179 | 'co-chair', 180 | 'co-chairs', 181 | 'co-founder', 182 | 'coach', 183 | 'col', 184 | 'collector', 185 | 'colonel', 186 | 'comedian', 187 | 'comedienne', 188 | 'comic', 189 | 'commander', 190 | 'commander-in-chief', 191 | 'commodore', 192 | 'composer', 193 | 'compositeur', 194 | 'comptroller', 195 | 'computer', 196 | 'comtesse', 197 | 'conductor', 198 | 'consultant', 199 | 'controller', 200 | 'corporal', 201 | 'corporate', 202 | 'correspondent', 203 | 'councillor', 204 | 'counselor', 205 | 'count', 206 | 'countess', 207 | 'courtier', 208 | 'cpl', 209 | 'cpo', 210 | 'cpt', 211 | 'credit', 212 | 'criminal', 213 | 'criminologist', 214 | 'critic', 215 | 'csm', 216 | 'curator', 217 | 'customs', 218 | 'cwo-2', 219 | 'cwo-3', 220 | 'cwo-4', 221 | 'cwo-5', 222 | 'cwo2', 223 | 'cwo3', 224 | 'cwo4', 225 | 'cwo5', 226 | 'cyclist', 227 | 'dancer', 228 | 'dcn', 229 | 'deacon', 230 | 'delegate', 231 | 'deputy', 232 | 'designated', 233 | 'designer', 234 | 'detective', 235 | 'developer', 236 | 'diplomat', 237 | 'dir', 238 | 'director', 239 | 'discovery', 240 | 'dissident', 241 | 'district', 242 | 'division', 243 | 'do', 244 | 'docent', 245 | 'docket', 246 | 'doctor', 247 | 'doyen', 248 | 'dpty', 249 | 'dr', 250 | 'dra', 251 | 'dramatist', 252 | 'druid', 253 | 'drummer', 254 | 'duchesse', 255 | # 'duke', # a common first name 256 | 'dutchess', 257 | 'ecologist', 258 | 'economist', 259 | 'editor', 260 | 'edmi', 261 | 'edohen', 262 | 'educator', 263 | 'effendi', 264 | 'ekegbian', 265 | 'elerunwon', 266 | 'eminence', 267 | 'emperor', 268 | 'empress', 269 | 'engineer', 270 | 'english', 271 | 'ens', 
272 | 'entertainer', 273 | 'entrepreneur', 274 | 'envoy', 275 | 'essayist', 276 | 'evangelist', 277 | 'excellency', 278 | 'excellent', 279 | 'exec', 280 | 'executive', 281 | 'expert', 282 | 'fadm', 283 | 'family', 284 | 'federal', 285 | 'field', 286 | 'film', 287 | 'financial', 288 | 'first', 289 | 'flag', 290 | 'flying', 291 | 'foreign', 292 | 'forester', 293 | 'founder', 294 | 'fr', 295 | 'friar', 296 | 'gaf', 297 | 'gen', 298 | 'general', 299 | 'generalissimo', 300 | 'gentiluomo', 301 | 'giani', 302 | 'goodman', 303 | 'goodwife', 304 | 'governor', 305 | 'graf', 306 | 'grand', 307 | 'group', 308 | 'guitarist', 309 | 'guru', 310 | 'gyani', 311 | 'gysgt', 312 | 'hajji', 313 | 'headman', 314 | 'heir', 315 | 'heiress', 316 | 'her', 317 | 'hereditary', 318 | 'high', 319 | 'highness', 320 | 'his', 321 | 'historian', 322 | 'historicus', 323 | 'historien', 324 | 'holiness', 325 | 'hon', # sorry Hon Solo, but judges seem more common. 326 | 'honorable', 327 | 'honourable', 328 | 'host', 329 | 'illustrator', 330 | 'imam', 331 | 'industrialist', 332 | 'information', 333 | 'instructor', 334 | 'intelligence', 335 | 'intendant', 336 | 'inventor', 337 | 'investigator', 338 | 'investor', 339 | 'journalist', 340 | 'journeyman', 341 | 'jr', 342 | 'judge', 343 | 'judicial', 344 | 'junior', 345 | 'jurist', 346 | 'keyboardist', 347 | 'kingdom', 348 | 'knowledge', 349 | 'lady', 350 | 'lama', 351 | 'lamido', 352 | 'law', 353 | 'lawyer', 354 | 'lcdr', 355 | 'lcpl', 356 | 'leader', 357 | 'lecturer', 358 | 'legal', 359 | 'librarian', 360 | 'lieutenant', 361 | 'linguist', 362 | 'literary', 363 | 'lord', 364 | 'lt', 365 | 'ltc', 366 | 'ltcol', 367 | 'ltg', 368 | 'ltgen', 369 | 'ltjg', 370 | 'lyricist', 371 | 'madam', 372 | 'madame', 373 | 'mademoiselle', 374 | 'mag', 375 | 'mag-judge', 376 | 'mag/judge', 377 | 'magistrate', 378 | 'magistrate-judge', 379 | 'magnate', 380 | 'maharajah', 381 | 'maharani', 382 | 'mahdi', 383 | 'maj', 384 | 'majesty', 385 | 'majgen', 386 | 'manager', 387 | 
'marcher', 388 | 'marchess', 389 | 'marketing', 390 | 'marquis', 391 | 'mathematician', 392 | 'mathematics', 393 | 'matriarch', 394 | 'mayor', 395 | 'mcpo', 396 | 'mcpoc', 397 | 'mcpon', 398 | 'md', 399 | 'member', 400 | 'memoirist', 401 | 'merchant', 402 | 'met', 403 | 'metropolitan', 404 | 'mg', 405 | 'mgr', 406 | 'mgysgt', 407 | 'military', 408 | 'minister', 409 | 'miss', 410 | 'misses', 411 | 'missionary', 412 | 'mister', 413 | 'mlle', 414 | 'mme', 415 | 'mobster', 416 | 'model', 417 | 'monk', 418 | 'monsignor', 419 | 'most', 420 | 'mountaineer', 421 | 'mpco-cg', 422 | 'mr', 423 | 'mrs', 424 | 'ms', 425 | 'msg', 426 | 'msgt', 427 | 'mufti', 428 | 'mullah', 429 | 'municipal', 430 | 'murshid', 431 | 'musician', 432 | 'musicologist', 433 | 'mx', 434 | 'mystery', 435 | 'nanny', 436 | 'narrator', 437 | 'national', 438 | 'naturalist', 439 | 'navy', 440 | 'neuroscientist', 441 | 'novelist', 442 | 'nurse', 443 | 'obstetritian', 444 | 'officer', 445 | 'opera', 446 | 'operating', 447 | 'ornithologist', 448 | 'painter', 449 | 'paleontologist', 450 | 'pastor', 451 | 'patriarch', 452 | 'pediatrician', 453 | 'personality', 454 | 'petty', 455 | 'pfc', 456 | 'pharaoh', 457 | 'phd', 458 | 'philantropist', 459 | 'philosopher', 460 | 'photographer', 461 | 'physician', 462 | 'physicist', 463 | 'pianist', 464 | 'pilot', 465 | 'pioneer', 466 | 'pir', 467 | 'player', 468 | 'playwright', 469 | 'po1', 470 | 'po2', 471 | 'po3', 472 | 'poet', 473 | 'police', 474 | 'political', 475 | 'politician', 476 | 'prefect', 477 | 'prelate', 478 | 'premier', 479 | 'pres', 480 | 'presbyter', 481 | 'president', 482 | 'presiding', 483 | 'priest', 484 | 'priestess', 485 | 'primate', 486 | 'prime', 487 | 'prin', 488 | 'prince', 489 | 'princess', 490 | 'principal', 491 | 'printer', 492 | 'printmaker', 493 | 'prior', 494 | 'private', 495 | 'pro', 496 | 'producer', 497 | 'prof', 498 | 'professor', 499 | 'provost', 500 | 'pslc', 501 | 'psychiatrist', 502 | 'psychologist', 503 | 'publisher', 504 | 
'pursuivant', 505 | 'pv2', 506 | 'pvt', 507 | 'rabbi', 508 | 'radio', 509 | 'radm', 510 | 'rangatira', 511 | 'ranger', 512 | 'rdml', 513 | 'rear', 514 | 'rebbe', 515 | 'registrar', 516 | 'rep', 517 | 'representative', 518 | 'researcher', 519 | 'resident', 520 | 'rev', 521 | 'revenue', 522 | 'reverend', 523 | 'right', 524 | 'risk', 525 | 'rock', 526 | 'royal', 527 | 'rt', 528 | 'sa', 529 | 'sailor', 530 | 'saint', 531 | 'sainte', 532 | 'saoshyant', 533 | 'satirist', 534 | 'scholar', 535 | 'schoolmaster', 536 | 'scientist', 537 | 'scpo', 538 | 'screenwriter', 539 | 'se', 540 | 'secretary', 541 | 'security', 542 | 'seigneur', 543 | 'senator', 544 | 'senior', 545 | 'senior-judge', 546 | 'sergeant', 547 | 'servant', 548 | 'sfc', 549 | 'sgm', 550 | 'sgt', 551 | 'sgtmaj', 552 | 'sgtmajmc', 553 | 'shehu', 554 | 'sheikh', 555 | 'sheriff', 556 | 'siddha', 557 | 'singer', 558 | 'singer-songwriter', 559 | 'sma', 560 | 'smsgt', 561 | 'sn', 562 | 'soccer', 563 | 'social', 564 | 'sociologist', 565 | 'software', 566 | 'soldier', 567 | 'solicitor', 568 | 'soprano', 569 | 'spc', 570 | 'speaker', 571 | 'special', 572 | 'sr', 573 | 'sra', 574 | 'srta', 575 | 'ssg', 576 | 'ssgt', 577 | 'st', 578 | 'staff', 579 | 'state', 580 | 'states', 581 | 'strategy', 582 | 'subaltern', 583 | 'subedar', 584 | 'suffragist', 585 | 'sultan', 586 | 'sultana', 587 | 'superior', 588 | 'supreme', 589 | 'surgeon', 590 | 'swami', 591 | 'swordbearer', 592 | 'sysselmann', 593 | 'tax', 594 | 'teacher', 595 | 'technical', 596 | 'technologist', 597 | 'television', 598 | 'tenor', 599 | 'theater', 600 | 'theatre', 601 | 'theologian', 602 | 'theorist', 603 | 'timi', 604 | 'tirthankar', 605 | 'translator', 606 | 'travel', 607 | 'treasurer', 608 | 'tsar', 609 | 'tsarina', 610 | 'tsgt', 611 | 'uk', 612 | 'united', 613 | 'us', 614 | 'vadm', 615 | 'vardapet', 616 | 'vc', 617 | 'venerable', 618 | 'verderer', 619 | 'vicar', 620 | 'vice', 621 | 'viscount', 622 | 'vizier', 623 | 'vocalist', 624 | 'voice', 625 | 'warden',
626 | 'warrant', 627 | 'wing', 628 | 'wm', 629 | 'wo-1', 630 | 'wo1', 631 | 'wo2', 632 | 'wo3', 633 | 'wo4', 634 | 'wo5', 635 | 'woodman', 636 | 'writer', 637 | 'zoologist', 638 | ]) 639 | -------------------------------------------------------------------------------- /nameparser/parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | import sys 5 | import re 6 | from operator import itemgetter 7 | from itertools import groupby 8 | 9 | from nameparser.util import u 10 | from nameparser.util import text_types, binary_type 11 | from nameparser.util import lc 12 | from nameparser.util import log 13 | from nameparser.config import CONSTANTS 14 | from nameparser.config import Constants 15 | from nameparser.config import DEFAULT_ENCODING 16 | 17 | ENCODING = 'utf-8' 18 | 19 | 20 | def group_contiguous_integers(data): 21 | """ 22 | return list of tuples containing first and last index 23 | position of contiguous numbers in a series 24 | """ 25 | ranges = [] 26 | for key, group in groupby(enumerate(data), lambda i: i[0] - i[1]): 27 | group = list(map(itemgetter(1), group)) 28 | if len(group) > 1: 29 | ranges.append((group[0], group[-1])) 30 | return ranges 31 | 32 | 33 | class HumanName(object): 34 | """ 35 | Parse a person's name into individual components. 36 | 37 | Instantiation assigns to ``full_name``, and assignment to 38 | :py:attr:`full_name` triggers :py:func:`parse_full_name`. After parsing the 39 | name, these instance attributes are available. Alternatively, you can pass 40 | any of the instance attributes to the constructor method and skip the parsing 41 | process. If any of the the instance attributes are passed to the constructor 42 | as keywords, :py:func:`parse_full_name` will not be performed. 
43 | 44 | **HumanName Instance Attributes** 45 | 46 | * :py:attr:`title` 47 | * :py:attr:`first` 48 | * :py:attr:`middle` 49 | * :py:attr:`last` 50 | * :py:attr:`suffix` 51 | * :py:attr:`nickname` 52 | * :py:attr:`surnames` 53 | 54 | :param str full_name: The name string to be parsed. 55 | :param constants constants: 56 | a :py:class:`~nameparser.config.Constants` instance. Pass ``None`` for 57 | `per-instance config `_. 58 | :param str encoding: string representing the encoding of your input 59 | :param str string_format: python string formatting 60 | :param str initials_format: python initials string formatting 61 | :param str initials_delimter: string delimiter for initials 62 | :param str first: first name 63 | :param str middle: middle name 64 | :param str last: last name 65 | :param str title: The title or prenominal 66 | :param str suffix: The suffix or postnominal 67 | :param str nickname: Nicknames 68 | """ 69 | 70 | C = CONSTANTS 71 | """ 72 | A reference to the configuration for this instance, which may or may not be 73 | a reference to the shared, module-wide instance at 74 | :py:mod:`~nameparser.config.CONSTANTS`. See `Customizing the Parser 75 | `_. 76 | """ 77 | 78 | original = '' 79 | """ 80 | The original string, untouched by the parser. 
81 | """ 82 | 83 | _count = 0 84 | _members = ['title', 'first', 'middle', 'last', 'suffix', 'nickname'] 85 | unparsable = True 86 | _full_name = '' 87 | 88 | def __init__(self, full_name="", constants=CONSTANTS, encoding=DEFAULT_ENCODING, 89 | string_format=None, initials_format=None, initials_delimiter=None, 90 | first=None, middle=None, last=None, title=None, suffix=None, 91 | nickname=None): 92 | self.C = constants 93 | if type(self.C) is not type(CONSTANTS): 94 | self.C = Constants() 95 | 96 | self.encoding = encoding 97 | self.string_format = string_format or self.C.string_format 98 | self.initials_format = initials_format or self.C.initials_format 99 | self.initials_delimiter = initials_delimiter or self.C.initials_delimiter 100 | if (first or middle or last or title or suffix or nickname): 101 | self.first = first 102 | self.middle = middle 103 | self.last = last 104 | self.title = title 105 | self.suffix = suffix 106 | self.nickname = nickname 107 | self.unparsable = False 108 | else: 109 | # full_name setter triggers the parse 110 | self.full_name = full_name 111 | 112 | def __iter__(self): 113 | return self 114 | 115 | def __len__(self): 116 | l = 0 117 | for x in self: 118 | l += 1 119 | return l 120 | 121 | def __eq__(self, other): 122 | """ 123 | HumanName instances are equal to other objects whose 124 | lower case unicode representation is the same. 
125 | """ 126 | return (u(self)).lower() == (u(other)).lower() 127 | 128 | def __ne__(self, other): 129 | return not (u(self)).lower() == (u(other)).lower() 130 | 131 | def __getitem__(self, key): 132 | if isinstance(key, slice): 133 | return [getattr(self, x) for x in self._members[key]] 134 | else: 135 | return getattr(self, key) 136 | 137 | def __setitem__(self, key, value): 138 | if key in self._members: 139 | self._set_list(key, value) 140 | else: 141 | raise KeyError("Not a valid HumanName attribute", key) 142 | 143 | def next(self): 144 | return self.__next__() 145 | 146 | def __next__(self): 147 | if self._count >= len(self._members): 148 | self._count = 0 149 | raise StopIteration 150 | else: 151 | c = self._count 152 | self._count = c + 1 153 | return getattr(self, self._members[c]) or next(self) 154 | 155 | def __unicode__(self): 156 | if self.string_format: 157 | # string_format = "{title} {first} {middle} {last} {suffix} ({nickname})" 158 | _s = self.string_format.format(**self.as_dict()) 159 | # remove trailing punctuation from missing nicknames 160 | _s = _s.replace(str(self.C.empty_attribute_default), '').replace(" ()", "").replace(" ''", "").replace(' ""', "") 161 | return self.collapse_whitespace(_s).strip(', ') 162 | return " ".join(self) 163 | 164 | def __hash__(self): 165 | return hash(str(self)) 166 | 167 | def __str__(self): 168 | if sys.version_info[0] >= 3: 169 | return self.__unicode__() 170 | return self.__unicode__().encode(self.encoding) 171 | 172 | def __repr__(self): 173 | if self.unparsable: 174 | _string = "<%(class)s : [ Unparsable ] >" % {'class': self.__class__.__name__, } 175 | else: 176 | _string = "<%(class)s : [\n\ttitle: %(title)r \n\tfirst: %(first)r \n\tmiddle: %(middle)r \n\tlast: %(last)r \n\tsuffix: %(suffix)r\n\tnickname: %(nickname)r\n]>" % { 177 | 'class': self.__class__.__name__, 178 | 'title': self.title or '', 179 | 'first': self.first or '', 180 | 'middle': self.middle or '', 181 | 'last': self.last or '', 182 | 
'suffix': self.suffix or '', 183 | 'nickname': self.nickname or '', 184 | } 185 | if sys.version_info[0] >= 3: 186 | return _string 187 | return _string.encode(self.encoding) 188 | 189 | def as_dict(self, include_empty=True): 190 | """ 191 | Return the parsed name as a dictionary of its attributes. 192 | 193 | :param bool include_empty: Include keys in the dictionary for empty name attributes. 194 | :rtype: dict 195 | 196 | .. doctest:: 197 | 198 | >>> name = HumanName("Bob Dole") 199 | >>> name.as_dict() 200 | {'last': 'Dole', 'suffix': '', 'title': '', 'middle': '', 'nickname': '', 'first': 'Bob'} 201 | >>> name.as_dict(False) 202 | {'last': 'Dole', 'first': 'Bob'} 203 | 204 | """ 205 | d = {} 206 | for m in self._members: 207 | if include_empty: 208 | d[m] = getattr(self, m) 209 | else: 210 | val = getattr(self, m) 211 | if val: 212 | d[m] = val 213 | return d 214 | 215 | def __process_initial__(self, name_part, firstname=False): 216 | """ 217 | Name parts may include prefixes or conjunctions. This function filters these from the name unless it is 218 | a first name, since first names cannot be conjunctions or prefixes. 219 | """ 220 | parts = name_part.split(" ") 221 | initials = [] 222 | if len(parts) and isinstance(parts, list): 223 | for part in parts: 224 | if not (self.is_prefix(part) or self.is_conjunction(part)) or firstname == True: 225 | initials.append(part[0]) 226 | if len(initials) > 0: 227 | return " ".join(initials) 228 | else: 229 | return self.C.empty_attribute_default 230 | 231 | def initials_list(self): 232 | """ 233 | Returns the initials as a list 234 | 235 | .. doctest:: 236 | 237 | >>> name = HumanName("Sir Bob Andrew Dole") 238 | >>> name.initials_list() 239 | ["B", "A", "D"] 240 | >>> name = HumanName("J. 
Doe") 241 | >>> name.initials_list() 242 | ["J", "D"] 243 | """ 244 | first_initials_list = [self.__process_initial__(name, True) for name in self.first_list if name] 245 | middle_initials_list = [self.__process_initial__(name) for name in self.middle_list if name] 246 | last_initials_list = [self.__process_initial__(name) for name in self.last_list if name] 247 | return first_initials_list + middle_initials_list + last_initials_list 248 | 249 | def initials(self): 250 | """ 251 | Return period-delimited initials of the first, middle and optionally last name. 252 | 253 | :param bool include_last_name: Include the last name as part of the initials 254 | :rtype: str 255 | 256 | .. doctest:: 257 | 258 | >>> name = HumanName("Sir Bob Andrew Dole") 259 | >>> name.initials() 260 | "B. A. D." 261 | >>> name = HumanName("Sir Bob Andrew Dole", initials_format="{first} {middle}") 262 | >>> name.initials() 263 | "B. A." 264 | """ 265 | 266 | first_initials_list = [self.__process_initial__(name, True) for name in self.first_list if name] 267 | middle_initials_list = [self.__process_initial__(name) for name in self.middle_list if name] 268 | last_initials_list = [self.__process_initial__(name) for name in self.last_list if name] 269 | 270 | initials_dict = { 271 | "first": (self.initials_delimiter + " ").join(first_initials_list) + self.initials_delimiter 272 | if len(first_initials_list) else self.C.empty_attribute_default, 273 | "middle": (self.initials_delimiter + " ").join(middle_initials_list) + self.initials_delimiter 274 | if len(middle_initials_list) else self.C.empty_attribute_default, 275 | "last": (self.initials_delimiter + " ").join(last_initials_list) + self.initials_delimiter 276 | if len(last_initials_list) else self.C.empty_attribute_default 277 | } 278 | 279 | _s = self.initials_format.format(**initials_dict) 280 | return self.collapse_whitespace(_s) 281 | 282 | @property 283 | def has_own_config(self): 284 | """ 285 | True if this instance is not using the 
shared module-level 286 | configuration. 287 | """ 288 | return self.C is not CONSTANTS 289 | 290 | # attributes 291 | 292 | @property 293 | def title(self): 294 | """ 295 | The person's titles. Any string of consecutive pieces in 296 | :py:mod:`~nameparser.config.titles` or 297 | :py:mod:`~nameparser.config.conjunctions` 298 | at the beginning of :py:attr:`full_name`. 299 | """ 300 | return " ".join(self.title_list) or self.C.empty_attribute_default 301 | 302 | @property 303 | def first(self): 304 | """ 305 | The person's first name. The first name piece after any known 306 | :py:attr:`title` pieces parsed from :py:attr:`full_name`. 307 | """ 308 | return " ".join(self.first_list) or self.C.empty_attribute_default 309 | 310 | @property 311 | def middle(self): 312 | """ 313 | The person's middle names. All name pieces after the first name and 314 | before the last name parsed from :py:attr:`full_name`. 315 | """ 316 | return " ".join(self.middle_list) or self.C.empty_attribute_default 317 | 318 | @property 319 | def last(self): 320 | """ 321 | The person's last name. The last name piece parsed from 322 | :py:attr:`full_name`. 323 | """ 324 | return " ".join(self.last_list) or self.C.empty_attribute_default 325 | 326 | @property 327 | def suffix(self): 328 | """ 329 | The persons's suffixes. Pieces at the end of the name that are found in 330 | :py:mod:`~nameparser.config.suffixes`, or pieces that are at the end 331 | of comma separated formats, e.g. 332 | "Lastname, Title Firstname Middle[,] Suffix [, Suffix]" parsed 333 | from :py:attr:`full_name`. 334 | """ 335 | return ", ".join(self.suffix_list) or self.C.empty_attribute_default 336 | 337 | @property 338 | def nickname(self): 339 | """ 340 | The person's nicknames. 
Any text found inside of quotes (``""``) or 341 | parenthesis (``()``) 342 | """ 343 | return " ".join(self.nickname_list) or self.C.empty_attribute_default 344 | 345 | @property 346 | def surnames_list(self): 347 | """ 348 | List of middle names followed by last name. 349 | """ 350 | return self.middle_list + self.last_list 351 | 352 | @property 353 | def surnames(self): 354 | """ 355 | A string of all middle names followed by the last name. 356 | """ 357 | return " ".join(self.surnames_list) or self.C.empty_attribute_default 358 | 359 | # setter methods 360 | 361 | def _set_list(self, attr, value): 362 | if isinstance(value, list): 363 | val = value 364 | elif isinstance(value, text_types): 365 | val = [value] 366 | elif value is None: 367 | val = [] 368 | else: 369 | raise TypeError( 370 | "Can only assign strings, lists or None to name attributes." 371 | " Got {0}".format(type(value))) 372 | setattr(self, attr+"_list", self.parse_pieces(val)) 373 | 374 | @title.setter 375 | def title(self, value): 376 | self._set_list('title', value) 377 | 378 | @first.setter 379 | def first(self, value): 380 | self._set_list('first', value) 381 | 382 | @middle.setter 383 | def middle(self, value): 384 | self._set_list('middle', value) 385 | 386 | @last.setter 387 | def last(self, value): 388 | self._set_list('last', value) 389 | 390 | @suffix.setter 391 | def suffix(self, value): 392 | self._set_list('suffix', value) 393 | 394 | @nickname.setter 395 | def nickname(self, value): 396 | self._set_list('nickname', value) 397 | 398 | # Parse helpers 399 | 400 | def is_title(self, value): 401 | """Is in the :py:data:`~nameparser.config.titles.TITLES` set.""" 402 | return lc(value) in self.C.titles 403 | 404 | def is_conjunction(self, piece): 405 | """Is in the conjunctions set and not :py:func:`is_an_initial()`.""" 406 | if isinstance(piece, list): 407 | for item in piece: 408 | if self.is_conjunction(item): 409 | return True 410 | else: 411 | return piece.lower() in 
self.C.conjunctions and not self.is_an_initial(piece) 412 | 413 | def is_prefix(self, piece): 414 | """ 415 | Lowercase and no periods version of piece is in the 416 | :py:data:`~nameparser.config.prefixes.PREFIXES` set. 417 | """ 418 | if isinstance(piece, list): 419 | for item in piece: 420 | if self.is_prefix(item): 421 | return True 422 | else: 423 | return lc(piece) in self.C.prefixes 424 | 425 | def is_roman_numeral(self, value): 426 | """ 427 | Matches the ``roman_numeral`` regular expression in 428 | :py:data:`~nameparser.config.regexes.REGEXES`. 429 | """ 430 | return bool(self.C.regexes.roman_numeral.match(value)) 431 | 432 | def is_suffix(self, piece): 433 | """ 434 | Is in the suffixes set and not :py:func:`is_an_initial()`. 435 | 436 | Some suffixes may be acronyms (M.B.A) while some are not (Jr.), 437 | so we remove the periods from `piece` when testing against 438 | `C.suffix_acronyms`. 439 | """ 440 | # suffixes may have periods inside them like "M.D." 441 | if isinstance(piece, list): 442 | for item in piece: 443 | if self.is_suffix(item): 444 | return True 445 | else: 446 | return ((lc(piece).replace('.', '') in self.C.suffix_acronyms) 447 | or (lc(piece) in self.C.suffix_not_acronyms)) \ 448 | and not self.is_an_initial(piece) 449 | 450 | def are_suffixes(self, pieces): 451 | """Return True if all pieces are suffixes.""" 452 | for piece in pieces: 453 | if not self.is_suffix(piece): 454 | return False 455 | return True 456 | 457 | def is_rootname(self, piece): 458 | """ 459 | Is not a known title, suffix or prefix. Just first, middle, last names. 460 | """ 461 | return lc(piece) not in self.C.suffixes_prefixes_titles \ 462 | and not self.is_an_initial(piece) 463 | 464 | def is_an_initial(self, value): 465 | """ 466 | Words with a single period at the end, or a single uppercase letter. 467 | 468 | Matches the ``initial`` regular expression in 469 | :py:data:`~nameparser.config.regexes.REGEXES`. 
470 | """ 471 | return bool(self.C.regexes.initial.match(value)) 472 | 473 | # full_name parser 474 | 475 | @property 476 | def full_name(self): 477 | """The string output of the HumanName instance.""" 478 | return self.__str__() 479 | 480 | @full_name.setter 481 | def full_name(self, value): 482 | self.original = value 483 | self._full_name = value 484 | if isinstance(value, binary_type): 485 | self._full_name = value.decode(self.encoding) 486 | self.parse_full_name() 487 | 488 | def collapse_whitespace(self, string): 489 | # collapse multiple spaces into single space 490 | string = self.C.regexes.spaces.sub(" ", string.strip()) 491 | if string.endswith(","): 492 | string = string[:-1] 493 | return string 494 | 495 | def pre_process(self): 496 | """ 497 | 498 | This method happens at the beginning of the :py:func:`parse_full_name` 499 | before any other processing of the string aside from unicode 500 | normalization, so it's a good place to do any custom handling in a 501 | subclass. Runs :py:func:`parse_nicknames` and :py:func:`squash_emoji`. 502 | 503 | """ 504 | self.fix_phd() 505 | self.parse_nicknames() 506 | self.squash_emoji() 507 | 508 | def post_process(self): 509 | """ 510 | This happens at the end of the :py:func:`parse_full_name` after 511 | all other processing has taken place. Runs :py:func:`handle_firstnames` 512 | and :py:func:`handle_capitalization`. 513 | """ 514 | self.handle_firstnames() 515 | self.handle_capitalization() 516 | 517 | def fix_phd(self): 518 | try: 519 | _re = self.C.regexes.phd 520 | match = _re.search(self._full_name) 521 | if match: 522 | self.suffix_list.append(match.group(1)) 523 | self._full_name = _re.sub('', self._full_name) 524 | except AttributeError: 525 | pass 526 | 527 | def parse_nicknames(self): 528 | """ 529 | The content of parenthesis or quotes in the name will be added to the 530 | nicknames list. This happens before any other processing of the name. 
531 | 532 | Single quotes cannot span white space characters and must border 533 | white space to allow for quotes in names like O'Connor and Kawai'ae'a. 534 | Double quotes and parenthesis can span white space. 535 | 536 | Loops through 3 :py:data:`~nameparser.config.regexes.REGEXES`; 537 | `quoted_word`, `double_quotes` and `parenthesis`. 538 | """ 539 | 540 | empty_re = re.compile("") 541 | 542 | re_quoted_word = self.C.regexes.quoted_word or empty_re 543 | re_double_quotes = self.C.regexes.double_quotes or empty_re 544 | re_parenthesis = self.C.regexes.parenthesis or empty_re 545 | 546 | for _re in (re_quoted_word, re_double_quotes, re_parenthesis): 547 | if _re.search(self._full_name): 548 | self.nickname_list += [x for x in _re.findall(self._full_name)] 549 | self._full_name = _re.sub('', self._full_name) 550 | 551 | def squash_emoji(self): 552 | """ 553 | Remove emoji from the input string. 554 | """ 555 | re_emoji = self.C.regexes.emoji 556 | if re_emoji and re_emoji.search(self._full_name): 557 | self._full_name = re_emoji.sub('', self._full_name) 558 | 559 | def handle_firstnames(self): 560 | """ 561 | If there are only two parts and one is a title, assume it's a last name 562 | instead of a first name. e.g. Mr. Johnson. Unless it's a special title 563 | like "Sir", then when it's followed by a single name that name is always 564 | a first name. 565 | """ 566 | if self.title \ 567 | and len(self) == 2 \ 568 | and not lc(self.title) in self.C.first_name_titles: 569 | self.last, self.first = self.first, self.last 570 | 571 | def parse_full_name(self): 572 | """ 573 | 574 | The main parse method for the parser. This method is run upon 575 | assignment to the :py:attr:`full_name` attribute or instantiation. 576 | 577 | Basic flow is to hand off to :py:func:`pre_process` to handle 578 | nicknames. It then splits on commas and chooses a code path depending 579 | on the number of commas. 
580 | 581 | :py:func:`parse_pieces` then splits those parts on spaces and 582 | :py:func:`join_on_conjunctions` joins any pieces next to conjunctions. 583 | """ 584 | 585 | self.title_list = [] 586 | self.first_list = [] 587 | self.middle_list = [] 588 | self.last_list = [] 589 | self.suffix_list = [] 590 | self.nickname_list = [] 591 | self.unparsable = True 592 | 593 | self.pre_process() 594 | 595 | self._full_name = self.collapse_whitespace(self._full_name) 596 | 597 | # break up full_name by commas 598 | parts = [x.strip() for x in self._full_name.split(",")] 599 | 600 | log.debug("full_name: %s", self._full_name) 601 | log.debug("parts: %s", parts) 602 | 603 | if len(parts) == 1: 604 | 605 | # no commas, title first middle middle middle last suffix 606 | # part[0] 607 | 608 | pieces = self.parse_pieces(parts) 609 | p_len = len(pieces) 610 | for i, piece in enumerate(pieces): 611 | try: 612 | nxt = pieces[i + 1] 613 | except IndexError: 614 | nxt = None 615 | 616 | # title must have a next piece, unless it's just a title 617 | if not self.first \ 618 | and (nxt or p_len == 1) \ 619 | and self.is_title(piece): 620 | self.title_list.append(piece) 621 | continue 622 | if not self.first: 623 | if p_len == 1 and self.nickname: 624 | self.last_list.append(piece) 625 | continue 626 | self.first_list.append(piece) 627 | continue 628 | if self.are_suffixes(pieces[i+1:]) or \ 629 | ( 630 | # if the next piece is the last piece and a roman 631 | # numeral but this piece is not an initial 632 | self.is_roman_numeral(nxt) and i == p_len - 2 633 | and not self.is_an_initial(piece) 634 | ): 635 | self.last_list.append(piece) 636 | self.suffix_list += pieces[i+1:] 637 | break 638 | if not nxt: 639 | self.last_list.append(piece) 640 | continue 641 | 642 | self.middle_list.append(piece) 643 | else: 644 | # if all the end parts are suffixes and there is more than one piece 645 | # in the first part. 
(Suffixes will never appear after last names 646 | # only, and allows potential first names to be in suffixes, e.g. 647 | # "Johnson, Bart" 648 | 649 | post_comma_pieces = self.parse_pieces(parts[1].split(' '), 1) 650 | 651 | if self.are_suffixes(parts[1].split(' ')) \ 652 | and len(parts[0].split(' ')) > 1: 653 | 654 | # suffix comma: 655 | # title first middle last [suffix], suffix [suffix] [, suffix] 656 | # parts[0], parts[1:...] 657 | 658 | self.suffix_list += parts[1:] 659 | pieces = self.parse_pieces(parts[0].split(' ')) 660 | log.debug("pieces: %s", u(pieces)) 661 | for i, piece in enumerate(pieces): 662 | try: 663 | nxt = pieces[i + 1] 664 | except IndexError: 665 | nxt = None 666 | 667 | if not self.first \ 668 | and (nxt or len(pieces) == 1) \ 669 | and self.is_title(piece): 670 | self.title_list.append(piece) 671 | continue 672 | if not self.first: 673 | self.first_list.append(piece) 674 | continue 675 | if self.are_suffixes(pieces[i+1:]): 676 | self.last_list.append(piece) 677 | self.suffix_list = pieces[i+1:] + self.suffix_list 678 | break 679 | if not nxt: 680 | self.last_list.append(piece) 681 | continue 682 | self.middle_list.append(piece) 683 | else: 684 | 685 | # lastname comma: 686 | # last [suffix], title first middles[,] suffix [,suffix] 687 | # parts[0], parts[1], parts[2:...] 
688 | 689 | log.debug("post-comma pieces: %s", u(post_comma_pieces)) 690 | 691 | # lastname part may have suffixes in it 692 | lastname_pieces = self.parse_pieces(parts[0].split(' '), 1) 693 | for piece in lastname_pieces: 694 | # the first one is always a last name, even if it looks like 695 | # a suffix 696 | if self.is_suffix(piece) and len(self.last_list) > 0: 697 | self.suffix_list.append(piece) 698 | else: 699 | self.last_list.append(piece) 700 | 701 | for i, piece in enumerate(post_comma_pieces): 702 | try: 703 | nxt = post_comma_pieces[i + 1] 704 | except IndexError: 705 | nxt = None 706 | 707 | if not self.first \ 708 | and (nxt or len(post_comma_pieces) == 1) \ 709 | and self.is_title(piece): 710 | self.title_list.append(piece) 711 | continue 712 | if not self.first: 713 | self.first_list.append(piece) 714 | continue 715 | if self.is_suffix(piece): 716 | self.suffix_list.append(piece) 717 | continue 718 | self.middle_list.append(piece) 719 | try: 720 | if parts[2]: 721 | self.suffix_list += parts[2:] 722 | except IndexError: 723 | pass 724 | 725 | if len(self) < 0: 726 | log.info("Unparsable: \"%s\" ", self.original) 727 | else: 728 | self.unparsable = False 729 | self.post_process() 730 | 731 | def parse_pieces(self, parts, additional_parts_count=0): 732 | """ 733 | Split parts on spaces and remove commas, join on conjunctions and 734 | lastname prefixes. If parts have periods in the middle, try splitting 735 | on periods and check if the parts are titles or suffixes. If they are 736 | add to the constant so they will be found. 737 | 738 | :param list parts: name part strings from the comma split 739 | :param int additional_parts_count: 740 | 741 | if the comma format contains other parts, we need to know 742 | how many there are to decide if things should be considered a 743 | conjunction. 
744 | :return: pieces split on spaces and joined on conjunctions 745 | :rtype: list 746 | """ 747 | 748 | output = [] 749 | for part in parts: 750 | if not isinstance(part, text_types): 751 | raise TypeError("Name parts must be strings. " 752 | "Got {0}".format(type(part))) 753 | output += [x.strip(' ,') for x in part.split(' ')] 754 | 755 | # If part contains periods, check if it's multiple titles or suffixes 756 | # together without spaces if so, add the new part with periods to the 757 | # constants so they get parsed correctly later 758 | for part in output: 759 | # if this part has a period not at the beginning or end 760 | if self.C.regexes.period_not_at_end and self.C.regexes.period_not_at_end.match(part): 761 | # split on periods, any of the split pieces titles or suffixes? 762 | # ("Lt.Gov.") 763 | period_chunks = part.split(".") 764 | titles = list(filter(self.is_title, period_chunks)) 765 | suffixes = list(filter(self.is_suffix, period_chunks)) 766 | 767 | # add the part to the constant so it will be found 768 | if len(list(titles)): 769 | self.C.titles.add(part) 770 | continue 771 | if len(list(suffixes)): 772 | self.C.suffix_not_acronyms.add(part) 773 | continue 774 | 775 | return self.join_on_conjunctions(output, additional_parts_count) 776 | 777 | def join_on_conjunctions(self, pieces, additional_parts_count=0): 778 | """ 779 | Join conjunctions to surrounding pieces. Title- and prefix-aware. e.g.: 780 | 781 | ['Mr.', 'and'. 'Mrs.', 'John', 'Doe'] ==> 782 | ['Mr. and Mrs.', 'John', 'Doe'] 783 | 784 | ['The', 'Secretary', 'of', 'State', 'Hillary', 'Clinton'] ==> 785 | ['The Secretary of State', 'Hillary', 'Clinton'] 786 | 787 | When joining titles, saves newly formed piece to the instance's titles 788 | constant so they will be parsed correctly later. E.g. after parsing the 789 | example names above, 'The Secretary of State' and 'Mr. and Mrs.' would 790 | be present in the titles constant set. 
791 | 792 | :param list pieces: name pieces strings after split on spaces 793 | :param int additional_parts_count: 794 | :return: new list with piece next to conjunctions merged into one piece 795 | with spaces in it. 796 | :rtype: list 797 | 798 | """ 799 | length = len(pieces) + additional_parts_count 800 | # don't join on conjunctions if there's only 2 parts 801 | if length < 3: 802 | return pieces 803 | 804 | rootname_pieces = [p for p in pieces if self.is_rootname(p)] 805 | total_length = len(rootname_pieces) + additional_parts_count 806 | 807 | # find all the conjunctions, join any conjunctions that are next to each 808 | # other, then join those newly joined conjunctions and any single 809 | # conjunctions to the piece before and after it 810 | conj_index = [i for i, piece in enumerate(pieces) 811 | if self.is_conjunction(piece)] 812 | 813 | contiguous_conj_i = [] 814 | for i, val in enumerate(conj_index): 815 | try: 816 | if conj_index[i+1] == val+1: 817 | contiguous_conj_i += [val] 818 | except IndexError: 819 | pass 820 | 821 | contiguous_conj_i = group_contiguous_integers(conj_index) 822 | 823 | delete_i = [] 824 | for i in contiguous_conj_i: 825 | if type(i) == tuple: 826 | new_piece = " ".join(pieces[i[0]: i[1]+1]) 827 | delete_i += list(range(i[0]+1, i[1]+1)) 828 | pieces[i[0]] = new_piece 829 | else: 830 | new_piece = " ".join(pieces[i: i+2]) 831 | delete_i += [i+1] 832 | pieces[i] = new_piece 833 | # add newly joined conjunctions to constants to be found later 834 | self.C.conjunctions.add(new_piece) 835 | 836 | for i in reversed(delete_i): 837 | # delete pieces in reverse order or the index changes on each delete 838 | del pieces[i] 839 | 840 | if len(pieces) == 1: 841 | # if there's only one piece left, nothing left to do 842 | return pieces 843 | 844 | # refresh conjunction index locations 845 | conj_index = [i for i, piece in enumerate(pieces) if self.is_conjunction(piece)] 846 | 847 | for i in conj_index: 848 | if len(pieces[i]) == 1 and 
total_length < 4: 849 | # if there are only 3 total parts (minus known titles, suffixes 850 | # and prefixes) and this conjunction is a single letter, prefer 851 | # treating it as an initial rather than a conjunction. 852 | # http://code.google.com/p/python-nameparser/issues/detail?id=11 853 | continue 854 | 855 | if i == 0: 856 | new_piece = " ".join(pieces[i:i+2]) 857 | if self.is_title(pieces[i+1]): 858 | # when joining to a title, make new_piece a title too 859 | self.C.titles.add(new_piece) 860 | pieces[i] = new_piece 861 | pieces.pop(i+1) 862 | # subtract 1 from the index of all the remaining conjunctions 863 | for j, val in enumerate(conj_index): 864 | if val > i: 865 | conj_index[j] = val-1 866 | 867 | else: 868 | new_piece = " ".join(pieces[i-1:i+2]) 869 | if self.is_title(pieces[i-1]): 870 | # when joining to a title, make new_piece a title too 871 | self.C.titles.add(new_piece) 872 | pieces[i-1] = new_piece 873 | pieces.pop(i) 874 | rm_count = 2 875 | try: 876 | pieces.pop(i) 877 | except IndexError: 878 | rm_count = 1 879 | 880 | # subtract the number of removed pieces from the index 881 | # of all the remaining conjunctions 882 | for j, val in enumerate(conj_index): 883 | if val > i: 884 | conj_index[j] = val - rm_count 885 | 886 | # join prefixes to following lastnames: ['de la Vega'], ['van Buren'] 887 | prefixes = list(filter(self.is_prefix, pieces)) 888 | if prefixes: 889 | for prefix in prefixes: 890 | try: 891 | i = pieces.index(prefix) 892 | except ValueError: 893 | # If the prefix is no longer in pieces, it's because it has been 894 | # combined with the prefix that appears right before (or before that when 895 | # chained together) in the last loop, so the index of that newly created 896 | # piece is the same as in the last loop, i==i still, and we want to join 897 | # it to the next piece. 
898 | pass 899 | 900 | new_piece = '' 901 | 902 | # join everything after the prefix until the next prefix or suffix 903 | 904 | try: 905 | if i == 0 and total_length >= 1: 906 | # If it's the first piece and there are more than 1 rootnames, assume it's a first name 907 | continue 908 | next_prefix = next(iter(filter(self.is_prefix, pieces[i + 1:]))) 909 | j = pieces.index(next_prefix, i + 1) 910 | if j == i + 1: 911 | # if there are two prefixes in sequence, join to the following piece 912 | j += 1 913 | new_piece = ' '.join(pieces[i:j]) 914 | pieces = pieces[:i] + [new_piece] + pieces[j:] 915 | except StopIteration: 916 | try: 917 | # if there are no more prefixes, look for a suffix to stop at 918 | stop_at = next(iter(filter(self.is_suffix, pieces[i + 1:]))) 919 | j = pieces.index(stop_at) 920 | new_piece = ' '.join(pieces[i:j]) 921 | pieces = pieces[:i] + [new_piece] + pieces[j:] 922 | except StopIteration: 923 | # if there were no suffixes, nothing to stop at so join all 924 | # remaining pieces 925 | new_piece = ' '.join(pieces[i:]) 926 | pieces = pieces[:i] + [new_piece] 927 | 928 | log.debug("pieces: %s", pieces) 929 | return pieces 930 | 931 | # Capitalization Support 932 | 933 | def cap_word(self, word, attribute): 934 | if (self.is_prefix(word) and attribute in ('last', 'middle')) \ 935 | or self.is_conjunction(word): 936 | return word.lower() 937 | exceptions = self.C.capitalization_exceptions 938 | if lc(word) in exceptions: 939 | return exceptions[lc(word)] 940 | mac_match = self.C.regexes.mac.match(word) 941 | if mac_match: 942 | def cap_after_mac(m): 943 | return m.group(1).capitalize() + m.group(2).capitalize() 944 | return self.C.regexes.mac.sub(cap_after_mac, word) 945 | else: 946 | return word.capitalize() 947 | 948 | def cap_piece(self, piece, attribute): 949 | if not piece: 950 | return "" 951 | 952 | def replacement(m): return self.cap_word(m.group(0), attribute) 953 | return self.C.regexes.word.sub(replacement, piece) 954 | 955 | def 
capitalize(self, force=None): 956 | """ 957 | The HumanName class can try to guess the correct capitalization of name 958 | entered in all upper or lower case. By default, it will not adjust the 959 | case of names entered in mixed case. To run capitalization on all names 960 | pass the parameter `force=True`. 961 | 962 | :param bool force: Forces capitalization of mixed case strings. This 963 | parameter overrides rules set within 964 | :py:class:`~nameparser.config.CONSTANTS`. 965 | 966 | **Usage** 967 | 968 | .. doctest:: capitalize 969 | 970 | >>> name = HumanName('bob v. de la macdole-eisenhower phd') 971 | >>> name.capitalize() 972 | >>> str(name) 973 | 'Bob V. de la MacDole-Eisenhower Ph.D.' 974 | >>> # Don't touch good names 975 | >>> name = HumanName('Shirley Maclaine') 976 | >>> name.capitalize() 977 | >>> str(name) 978 | 'Shirley Maclaine' 979 | >>> name.capitalize(force=True) 980 | >>> str(name) 981 | 'Shirley MacLaine' 982 | 983 | """ 984 | name = u(self) 985 | force = self.C.force_mixed_case_capitalization \ 986 | if force is None else force 987 | 988 | if not force and not (name == name.upper() or name == name.lower()): 989 | return 990 | self.title_list = self.cap_piece(self.title, 'title').split(' ') 991 | self.first_list = self.cap_piece(self.first, 'first').split(' ') 992 | self.middle_list = self.cap_piece(self.middle, 'middle').split(' ') 993 | self.last_list = self.cap_piece(self.last, 'last').split(' ') 994 | self.suffix_list = self.cap_piece(self.suffix, 'suffix').split(', ') 995 | 996 | def handle_capitalization(self): 997 | """ 998 | Handles capitalization configurations set within 999 | :py:class:`~nameparser.config.CONSTANTS`. 1000 | """ 1001 | if self.C.capitalize_name: 1002 | self.capitalize() 1003 | --------------------------------------------------------------------------------