├── tox.ini ├── namestand ├── __init__.py ├── patterns.py ├── converters.py └── utils.py ├── .gitignore ├── LICENSE.txt ├── setup.py ├── test └── test_basic.py └── README.md /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py27,py35,py36,py37,py38 3 | 4 | [testenv] 5 | deps=nose 6 | commands=nosetests 7 | -------------------------------------------------------------------------------- /namestand/__init__.py: -------------------------------------------------------------------------------- 1 | from namestand.converters import * 2 | from namestand.utils import * 3 | from namestand import patterns 4 | 5 | VERSION_TUPLE = (0, 1, 1) 6 | VERSION = ".".join(map(str, VERSION_TUPLE)) 7 | -------------------------------------------------------------------------------- /namestand/patterns.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | non_alphanumeric = re.compile(r"[^0-9a-z]+", re.I) 4 | 5 | non_namey = re.compile(r"[^\w\-' ]+", re.UNICODE) 6 | 7 | comma_suffix = re.compile(r", *(JR|SR|I+|IV|VI*)\b") 8 | 9 | last_first = re.compile(r"([^,]*), +([^,]*)") 10 | 11 | starts_with_num = re.compile(r"^(\d)") 12 | 13 | name_cruft = re.compile(r"\b(MR|MS|MRS|ESQ|SIR|HON)\b") 14 | 15 | company_cruft = re.compile(r"\b(LLC|LTD|INC|LLP)\b") 16 | 17 | whitespace = re.compile(r"\s+") 18 | -------------------------------------------------------------------------------- /namestand/converters.py: -------------------------------------------------------------------------------- 1 | import namestand.utils as u 2 | 3 | downscore = u.combine([ 4 | u.lowercase, 5 | u.strip, 6 | u.underscore, 7 | u.stripper("_"), 8 | u.init_num_prefixer("_") 9 | ]) 10 | 11 | person_basic = u.combine([ 12 | u.uppercase, 13 | u.strip, 14 | u.clean_comma_suffix, 15 | u.flip_last_first, 16 | u.remove_non_namey, 17 | u.remove_name_cruft, 18 | u.strip, 19 | u.compress_whitespace 20 | ]) 21 | 22 | 
company_basic = u.combine([ 23 | u.uppercase, 24 | u.remove_non_namey, 25 | u.remove_company_cruft, 26 | u.strip, 27 | u.compress_whitespace 28 | ]) 29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | bin/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # Installer logs 26 | pip-log.txt 27 | pip-delete-this-directory.txt 28 | 29 | # Unit test / coverage reports 30 | htmlcov/ 31 | .tox/ 32 | .coverage 33 | .cache 34 | nosetests.xml 35 | coverage.xml 36 | 37 | # Translations 38 | *.mo 39 | 40 | # Mr Developer 41 | .mr.developer.cfg 42 | .project 43 | .pydevproject 44 | 45 | # Rope 46 | .ropeproject 47 | 48 | # Django stuff: 49 | *.log 50 | *.pot 51 | 52 | # Sphinx documentation 53 | docs/_build/ 54 | .DS_Store 55 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014, Jeremy Singer-Vine 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from setuptools import setup, find_packages 3 | 4 | setup( 5 | name="namestand", 6 | version="0.1.1", 7 | description="Standardize any list of strings, but especially database/CSV column-names.", 8 | long_description="", 9 | classifiers=[ 10 | "Development Status :: 3 - Alpha", 11 | "Intended Audience :: Developers", 12 | "License :: OSI Approved :: MIT License", 13 | "Operating System :: OS Independent", 14 | "Programming Language :: Python :: 2.6", 15 | "Programming Language :: Python :: 2.7", 16 | "Programming Language :: Python :: 3.1", 17 | "Programming Language :: Python :: 3.4" 18 | ], 19 | keywords="rename columns standardize standardizing names", 20 | author="Jeremy Singer-Vine", 21 | author_email="jeremy.singer-vine@buzzfeed.com", 22 | url="http://github.com/buzzfeednews/namestand/", 23 | license="MIT", 24 | packages=find_packages(exclude=["test",]), 25 | namespace_packages=[], 26 | include_package_data=False, 27 | zip_safe=False, 28 | tests_require=[ 29 | "nose", 30 | ], 31 | test_suite="test", 32 | ) 33 | -------------------------------------------------------------------------------- /namestand/utils.py: -------------------------------------------------------------------------------- 1 | import namestand.patterns as p 2 | from functools import reduce 3 | import re 4 | 
# Python 2/3 compatibility: fall back to `str` where `basestring` is undefined.
try:
    basestring
except NameError:
    basestring = str

# Compatibility shim: where the private `re._pattern_type` is missing, alias it
# to the public `re.Pattern` so the isinstance checks below work either way.
# NOTE(review): this sets an attribute on the shared stdlib `re` module. It is
# kept unchanged because it is an import-time side effect other code may rely
# on; a private module-level alias would be cleaner.
try:
    re._pattern_type
except AttributeError:
    re._pattern_type = re.Pattern


def is_seq(x):
    """Return True if `x` has `__iter__` and is not a string, else False."""
    return hasattr(x, "__iter__") and not isinstance(x, basestring)


def combine(converters):
    """Chain `converters` into a single pipeline function.

    The returned function feeds its argument through every converter, in
    order. If the argument is a non-string iterable (see `is_seq`), the
    pipeline is applied to each element, recursively, and a list is returned.
    """
    def applicator(x):
        # Left fold: the output of each converter is the input of the next.
        return reduce(lambda value, conv: conv(value), converters, x)

    def fn(x):
        if is_seq(x):
            return [fn(item) for item in x]
        return applicator(x)

    return fn


def uppercase(x):
    """Return `x.upper()`."""
    return x.upper()


def lowercase(x):
    """Return `x.lower()`."""
    return x.lower()


def stripper(chars):
    """Return a function equivalent to `lambda x: x.strip(chars)`."""
    def fn(x):
        return x.strip(chars)
    return fn


# `str.strip(None)` strips leading and trailing whitespace.
strip = stripper(None)


def translator(pattern, replacement):
    """Return a function that replaces every occurrence of `pattern`.

    `pattern` can be a string (plain `str.replace`) or a compiled regular
    expression (regex substitution; `replacement` may then be a template
    string or a callable, as accepted by `re.sub`).
    """
    # Decide the dispatch once, when the converter is built, not on each call.
    if isinstance(pattern, re._pattern_type):
        def fn(x):
            return pattern.sub(replacement, x)
    else:
        def fn(x):
            return x.replace(pattern, replacement)
    return fn


def swapper(pattern, replacement):
    """Return a function that swaps the whole value for `replacement` on a hit.

    `pattern` can be a string (hit means `pattern in x`) or a compiled
    regular expression (hit means the pattern is found anywhere in `x`).
    Values without a hit are returned unchanged.
    """
    if isinstance(pattern, re._pattern_type):
        def fn(x):
            return replacement if pattern.search(x) else x
    else:
        def fn(x):
            return replacement if pattern in x else x
    return fn


def defaulter(test, default_value):
    """Return a function that replaces values failing `test`.

    `test` can be a callable (the value is kept when `test(x)` is truthy) or
    a container such as a list/tuple (the value is kept when `x in test`).
    Any other value is replaced with `default_value`.
    """
    if callable(test):
        def fn(x):
            return x if test(x) else default_value
    else:
        def fn(x):
            return x if x in test else default_value
    return fn


def falsey_replacer(default_value):
    """Return a function that replaces falsy values with `default_value`."""
    return defaulter(bool, default_value)


def init_num_prefixer(prefix_char):
    """Return a function that prepends `prefix_char` to a leading digit.

    Values matching `p.starts_with_num` get the prefix; others are returned
    unchanged.
    """
    # Build the replacement with a callable rather than a regex template, so
    # a backslash in `prefix_char` is inserted literally and not interpreted
    # as an escape or group reference.
    def prefixed(match):
        return "{0}{1}".format(prefix_char, match.group(1))
    return translator(p.starts_with_num, prefixed)


# Replace each match of `p.non_alphanumeric` with an underscore.
underscore = translator(p.non_alphanumeric, "_")

# Replace each match of `p.non_namey` with a space.
remove_non_namey = translator(p.non_namey, " ")

# Replace a match of `p.comma_suffix` with a space plus the captured suffix.
clean_comma_suffix = translator(p.comma_suffix, r" \1")

remove_name_cruft = translator(p.name_cruft, " ") 78 | 79 | remove_company_cruft = translator(p.company_cruft, " ") 80 | 81 | flip_last_first = translator(p.last_first, r"\2 \1") 82 | 83 | compress_whitespace = translator(p.whitespace, " ") 84 | 85 | -------------------------------------------------------------------------------- /test/test_basic.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import namestand 3 | import re 4 | cols = """ 5 | Id 6 | Id2 7 | Geography 8 | Estimate; EMPLOYMENT STATUS - Population 16 years and over 9 | Margin of Error; EMPLOYMENT STATUS - Population 16 years and over 10 | Percent; EMPLOYMENT STATUS - Population 16 years and over 11 | Percent Margin of Error; EMPLOYMENT STATUS - Population 16 years and over 12 | Estimate; EMPLOYMENT STATUS - In labor force 13 | Margin of Error; EMPLOYMENT STATUS - In labor force 14 | Percent; EMPLOYMENT STATUS - In labor force 15 | Percent Margin of Error; EMPLOYMENT STATUS - In labor force 16 | Estimate; EMPLOYMENT STATUS - In labor force - Civilian labor force 17 | Margin of Error; EMPLOYMENT STATUS - In labor force - Civilian labor force 18 | Percent; EMPLOYMENT STATUS - In labor force - Civilian labor force 19 | Percent Margin of Error; EMPLOYMENT STATUS - In labor force - Civilian labor force 20 | Estimate; EMPLOYMENT STATUS - In labor force - Civilian labor force - Employed 21 | Margin of Error; EMPLOYMENT STATUS - In labor force - Civilian labor force - Employed 22 | Percent; EMPLOYMENT STATUS - In labor force - Civilian labor force - Employed 23 | Percent Margin of Error; EMPLOYMENT STATUS - In labor force - Civilian labor force - Employed 24 | """.strip().split("\n") 25 | 26 | def test_downscore(): 27 | assert(namestand.downscore(cols[0]) == "id") 28 | c = namestand.downscore(cols) 29 | assert(len(c) == len(cols)) 30 | assert(c[0] == "id") 31 | assert(c[1] == "id2") 32 | assert(c[2] == "geography") 33 | assert(c[3] == 
"estimate_employment_status_population_16_years_and_over") 34 | 35 | def test_num_prefixer(): 36 | d = namestand.downscore 37 | assert(d("2013 Happiness") == "_2013_happiness") 38 | assert(d("The 2013 Happiness") == "the_2013_happiness") 39 | assert(namestand.init_num_prefixer("n")("2013") == "n2013") 40 | 41 | def test_translator(): 42 | converter = namestand.combine([ 43 | namestand.downscore, 44 | namestand.translator("estimate_", "est_"), 45 | namestand.translator("percent_", "pct_"), 46 | namestand.translator("margin_of_error_", "moe_"), 47 | namestand.translator("employment_status", "status"), 48 | namestand.translator("population", "pop"), 49 | namestand.translator("_years_and_over", "y"), 50 | namestand.translator("_civilian", "_civ"), 51 | namestand.translator("_labor_force", "_lf"), 52 | ]) 53 | c = converter(cols) 54 | assert(c[2] == "geography") 55 | assert(c[3] == "est_status_pop_16y") 56 | assert(c[6] == "pct_moe_status_pop_16y") 57 | assert(c[-1] == "pct_moe_status_in_lf_civ_lf_employed") 58 | 59 | def test_last_first(): 60 | lf = namestand.utils.flip_last_first 61 | assert(lf("Antony, Mark") == "Mark Antony") 62 | 63 | def test_flip_proper(): 64 | fp = namestand.person_basic 65 | assert(fp("Antony, Mark") == "MARK ANTONY") 66 | assert(fp("Antony, Mark M.") == "MARK M ANTONY") 67 | assert(fp("Mark M. Antony") == "MARK M ANTONY") 68 | assert(fp(u"Mark M. Antoñy") == u"MARK M ANTOÑY") 69 | assert(fp(u"Diego Velázquez-O'Connor") == u"DIEGO VELÁZQUEZ-O'CONNOR") 70 | 71 | def test_complex_names(): 72 | c = namestand.person_basic 73 | assert(c("Nolpmet, John Esq.") == "JOHN NOLPMET") 74 | assert(c("Nolpmet, John Mr.") == "JOHN NOLPMET") 75 | assert(c("Nolpmet, John M. 
Mr.") == "JOHN M NOLPMET") 76 | assert(c("John Nolpmet, Jr.") == "JOHN NOLPMET JR") 77 | assert(c("John Nolpmet, VIII") == "JOHN NOLPMET VIII") 78 | 79 | def test_company(): 80 | c = namestand.company_basic 81 | assert(c("American Banana Stand, Inc.") == "AMERICAN BANANA STAND") 82 | 83 | def test_list_defaulter(): 84 | choices = [ "foo", "bar" ] 85 | x = namestand.combine([ 86 | namestand.defaulter(choices, "other") 87 | ]) 88 | orig = [ "gah", "bar", "foo" ] 89 | assert(x(orig) == [ "other", "bar", "foo" ]) 90 | 91 | def test_fn_defaulter(): 92 | x = namestand.combine([ 93 | namestand.falsey_replacer("NOPE") 94 | ]) 95 | orig = [ None, False, "hi", "there" ] 96 | assert(x(orig) == [ "NOPE", "NOPE", "hi", "there" ]) 97 | 98 | def test_swapper(): 99 | x = namestand.swapper("BUZZFEED", "BuzzFeed") 100 | y = namestand.swapper(re.compile("BUZZFEED", re.I), "BuzzFeed") 101 | assert(x("BUZZFEED INC") == "BuzzFeed") 102 | assert(x("THE BUZZFEED") == "BuzzFeed") 103 | assert(x("BuzzFeed Inc.") == "BuzzFeed Inc.") 104 | assert(y("BUZZFEED INC") == "BuzzFeed") 105 | assert(y("BuzzFeed Inc.") == "BuzzFeed") 106 | assert(y("The BuzzFeed Inc.") == "BuzzFeed") 107 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # namestand 2 | 3 | `namestand` is a Python library for easily transforming/standardizing lists of names (and other strings). No magic here, just a collection of useful tools. 4 | 5 | `namestand` was developed with unwieldy database column–names in mind, but can be applied to any list of strings. Other uses might include: standardizing political donor names, normalizing survey responses, et cetera. 6 | 7 | ## Installation 8 | 9 | ``` 10 | pip install namestand 11 | ``` 12 | 13 | ## Pre-Built Converters 14 | 15 | `namestand` comes with a set* of broadly useful converters. 16 | 17 | *Right now, just three of 'em. 
Contributions and suggestions welcome. 18 | 19 | ### namestand.downscore(string_or_list_of_strings) 20 | 21 | Suggested usage: column names, form-response options, etc. 22 | 23 | Steps: 24 | 25 | 1. Lowercases the string 26 | 2. Strips any leading and trailing whitespace 27 | 3. Converts any run of characters that are not ASCII letters or digits to a single underscore 28 | 4. Removes any leading and trailing underscores 29 | 5. Prefixes the string with "_" if it starts with a digit (which can otherwise cause trouble with `pandas` and other libraries). E.g., "2013 Happiness" becomes "_2013_happiness". 30 | 31 | Example: 32 | 33 | ```python 34 | namestand.downscore("Case Number") == "case_number" 35 | 36 | namestand.downscore([ 37 | "Case Number", 38 | "Case #", 39 | "Is Super-Duper?" 40 | ]) == [ 41 | "case_number", 42 | "case", 43 | "is_super_duper" 44 | ] 45 | ``` 46 | 47 | ### namestand.person_basic(string_or_list_of_strings) [very alpha] 48 | 49 | Suggested usage: Donor names, etc.; note, though, that this converter does not have any special knowledge of the world, e.g., that "Riccchard" is likely a misspelling of "Richard". 50 | 51 | Steps: 52 | 53 | 1. Uppercases the string 54 | 2. Strips any leading and trailing whitespace 55 | 3. Flips the "first" and "last" names if a comma is present 56 | 4. Removes any characters that aren't either (unicode) letters, `'`, `-`, or spaces. 57 | 58 | Along the way, it tries to gracefully handle name prefixes (Mr./Mrs./etc.) and suffixes (Jr./Sr./VII/Esq./etc.). 59 | 60 | Example: 61 | 62 | ```python 63 | namestand.person_basic("Antony, Mark") == "MARK ANTONY" 64 | namestand.person_basic([ 65 | u"Diego Velázquez-O'Connor", 66 | "Antony, Mark" 67 | ]) == [ 68 | u"DIEGO VELÁZQUEZ-O'CONNOR", 69 | "MARK ANTONY" 70 | ] 71 | ``` 72 | ### namestand.company_basic(string_or_list_of_strings) [very alpha] 73 | 74 | Tries to remove common cruft from company names. 75 | 76 | Steps: 77 | 78 | 1. Uppercases the string 79 | 2. 
Strips any leading and trailing whitespace 80 | 3. Removes any characters that aren't either (unicode) letters, `'`, `-`, or spaces. 81 | 4. Removes "LLC", "LTD", "INC", and "LLP" 82 | 83 | Example: 84 | 85 | ```python 86 | namestand.company_basic("American Banana Stand, Inc.") == "AMERICAN BANANA STAND" 87 | ``` 88 | 89 | ## Custom Converters 90 | 91 | You can easily build your own name-standardizing pipelines using the following tools. 92 | 93 | ### namestand.combine(list_of_transformers) 94 | 95 | This function accepts a list of transformers (i.e., functions that accept a string and return a string) and returns a pipeline (i.e., a function that can be used in the same way as the pre-built converters). Converters themselves can be used as parts of pipelines, too. For example, if you wanted to change the `downscore` method to use hyphens, instead: 96 | 97 | ```python 98 | downhyphen = namestand.combine([ 99 | namestand.downscore, 100 | lambda x: x.replace("_", "-") 101 | ]) 102 | ``` 103 | 104 | But `namestand` already comes with a few helpers for doing things like string replacements. So you could also do: 105 | 106 | ```python 107 | downhyphen = namestand.combine([ 108 | namestand.downscore, 109 | namestand.translator("_", "-") 110 | ]) 111 | ``` 112 | 113 | Some helpful transformers: 114 | 115 | - __`namestand.translator(pattern, replacement)`__: `pattern` can be a string or a compiled regex. Equivalent to an argument-aware combination of `lambda x: x.replace(string, replacement)` and `lambda x: re.sub(regex, replacement, x)`. 116 | 117 | - __`namestand.swapper(pattern, replacement)`__: `pattern` can be a string or a compiled regex. If a given name matches the pattern (`re.search` for compiled regexes, `pattern in x` for string-`pattern`s), the entire name is replaced with the replacement. Otherwise, the given name is retained. 
118 | 119 | - __`namestand.stripper(chars_to_strip)`__: Equivalent to `lambda x: x.strip(chars_to_strip)` 120 | 121 | - __`namestand.defaulter(test, default_value)`__: `test` can be either a list of "approved" values, or a function that returns True or False. If `x` doesn't pass the test (or isn't in the list), it is replaced with `default_value`. 122 | 123 | ## Tests 124 | 125 | Additional usage examples can be found in [test/](test/). To test, run `nosetests` or `tox` from this repo's root directory. Currently tested, and passing, on the following Python versions: 126 | 127 | ``` 128 | 2.7.14 129 | 3.5.4 130 | 3.6.4 131 | 3.7.5 132 | 3.8.0 133 | ``` 134 | 135 | ## Feedback? 136 | 137 | Pull requests, suggestions, etc. welcome. 138 | --------------------------------------------------------------------------------