├── tox.ini
├── namestand
    ├── __init__.py
    ├── patterns.py
    ├── converters.py
    └── utils.py
├── .gitignore
├── LICENSE.txt
├── setup.py
├── test
    └── test_basic.py
└── README.md


/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | envlist = py27,py35,py36,py37,py38
3 | 
4 | [testenv]
5 | deps=nose
6 | commands=nosetests
7 | 


--------------------------------------------------------------------------------
/namestand/__init__.py:
--------------------------------------------------------------------------------
1 | from namestand.converters import *
2 | from namestand.utils import *
3 | from namestand import patterns
4 | 
# Package version as a tuple of integers, plus the dotted string derived from it.
VERSION_TUPLE = (0, 1, 1)
VERSION = ".".join(str(part) for part in VERSION_TUPLE)
7 | 


--------------------------------------------------------------------------------
/namestand/patterns.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | non_alphanumeric = re.compile(r"[^0-9a-z]+", re.I)
 4 | 
 5 | non_namey = re.compile(r"[^\w\-' ]+", re.UNICODE)
 6 | 
 7 | comma_suffix = re.compile(r", *(JR|SR|I+|IV|VI*)\b")
 8 | 
 9 | last_first = re.compile(r"([^,]*), +([^,]*)")
10 | 
11 | starts_with_num = re.compile(r"^(\d)")
12 | 
13 | name_cruft = re.compile(r"\b(MR|MS|MRS|ESQ|SIR|HON)\b")
14 | 
15 | company_cruft = re.compile(r"\b(LLC|LTD|INC|LLP)\b")
16 | 
17 | whitespace = re.compile(r"\s+")
18 | 


--------------------------------------------------------------------------------
/namestand/converters.py:
--------------------------------------------------------------------------------
 1 | import namestand.utils as u
 2 | 
 3 | downscore = u.combine([
 4 |     u.lowercase,
 5 |     u.strip,
 6 |     u.underscore,
 7 |     u.stripper("_"),
 8 |     u.init_num_prefixer("_")
 9 | ])
10 | 
# Person-name converter. Uppercasing comes first because the suffix and
# honorific patterns used by later steps only match uppercase text.
person_basic = u.combine([
    u.uppercase,
    u.strip,                # drop surrounding whitespace
    u.clean_comma_suffix,   # "X, JR" becomes "X JR" so the flip below ignores it
    u.flip_last_first,      # "LAST, FIRST" becomes "FIRST LAST"
    u.remove_non_namey,     # punctuation etc. becomes a space
    u.remove_name_cruft,    # MR / MS / MRS / ESQ / SIR / HON become a space
    u.strip,                # earlier steps may have left spaces at the edges
    u.compress_whitespace,  # collapse each whitespace run to one space
])
21 | 
# Company-name converter. Uppercasing comes first because the company-cruft
# pattern only matches uppercase text.
company_basic = u.combine([
    u.uppercase,
    u.remove_non_namey,      # punctuation etc. becomes a space
    u.remove_company_cruft,  # LLC / LTD / INC / LLP become a space
    u.strip,                 # drop surrounding whitespace
    u.compress_whitespace,   # collapse each whitespace run to one space
])
29 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | 
 5 | # C extensions
 6 | *.so
 7 | 
 8 | # Distribution / packaging
 9 | .Python
10 | env/
11 | bin/
12 | build/
13 | develop-eggs/
14 | dist/
15 | eggs/
16 | lib/
17 | lib64/
18 | parts/
19 | sdist/
20 | var/
21 | *.egg-info/
22 | .installed.cfg
23 | *.egg
24 | 
25 | # Installer logs
26 | pip-log.txt
27 | pip-delete-this-directory.txt
28 | 
29 | # Unit test / coverage reports
30 | htmlcov/
31 | .tox/
32 | .coverage
33 | .cache
34 | nosetests.xml
35 | coverage.xml
36 | 
37 | # Translations
38 | *.mo
39 | 
40 | # Mr Developer
41 | .mr.developer.cfg
42 | .project
43 | .pydevproject
44 | 
45 | # Rope
46 | .ropeproject
47 | 
48 | # Django stuff:
49 | *.log
50 | *.pot
51 | 
52 | # Sphinx documentation
53 | docs/_build/
54 | .DS_Store
55 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2014, Jeremy Singer-Vine
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | from setuptools import setup, find_packages
 3 | 
 4 | setup(
 5 |     name="namestand",
 6 |     version="0.1.1",
 7 |     description="Standardize any list of strings, but especially database/CSV column-names.",
 8 |     long_description="",
 9 |     classifiers=[
10 |         "Development Status :: 3 - Alpha",
11 |         "Intended Audience :: Developers",
12 |         "License :: OSI Approved :: MIT License",
13 |         "Operating System :: OS Independent",
14 |         "Programming Language :: Python :: 2.6",
15 |         "Programming Language :: Python :: 2.7",
16 |         "Programming Language :: Python :: 3.1",
17 |         "Programming Language :: Python :: 3.4"
18 |     ],
19 |     keywords="rename columns standardize standardizing names",
20 |     author="Jeremy Singer-Vine",
21 |     author_email="jeremy.singer-vine@buzzfeed.com",
22 |     url="http://github.com/buzzfeednews/namestand/",
23 |     license="MIT",
24 |     packages=find_packages(exclude=["test",]),
25 |     namespace_packages=[],
26 |     include_package_data=False,
27 |     zip_safe=False,
28 |     tests_require=[
29 |         "nose",
30 |     ],
31 |     test_suite="test",
32 | )
33 | 


--------------------------------------------------------------------------------
/namestand/utils.py:
--------------------------------------------------------------------------------
 1 | import namestand.patterns as p
 2 | from functools import reduce
 3 | import re
 4 | 
 5 | try: basestring
 6 | except NameError:
 7 |     basestring = str
 8 | 
 9 | try: re._pattern_type
10 | except AttributeError:
11 |     re._pattern_type = re.Pattern
12 | 
13 | def is_seq(x):
14 |     if hasattr(x, "__iter__") and not isinstance(x, basestring):
15 |         return True
16 |     else: return False
17 | 
def combine(converters):
    """Chain `converters` into a single pipeline function.

    The returned function accepts either a single value or a sequence (as
    decided by `is_seq`). A single value is passed through every converter
    in order, each receiving the previous one's result. A sequence is
    converted item by item, recursively, and returned as a list.
    """
    def run_pipeline(value):
        for convert in converters:
            value = convert(value)
        return value

    def fn(x):
        if not is_seq(x):
            return run_pipeline(x)
        return [fn(item) for item in x]

    return fn
27 | 
def uppercase(x):
    """Return `x` uppercased, using its own `.upper()` method."""
    uppercased = x.upper()
    return uppercased
30 | 
def lowercase(x):
    """Return `x` lowercased, using its own `.lower()` method."""
    lowercased = x.lower()
    return lowercased
33 | 
def stripper(chars):
    """Return a transformer that strips `chars` from both ends of its input.

    `chars` is handed straight to the input's `.strip()` method.
    """
    return lambda x: x.strip(chars)


# Default stripper: passes `None` to `.strip()`, which removes surrounding whitespace.
strip = stripper(None)
40 | 
def translator(pattern, replacement):
    """Return a transformer that replaces `pattern` with `replacement`.

    `pattern` can be a string or a compiled regular expression:

    - compiled regex: every match is substituted, as `pattern.sub` does, so
      `replacement` may contain group references such as ``\\1`` or be a
      callable taking the match object;
    - anything else: treated as a plain substring for `x.replace`.
    """
    # Take the compiled-pattern type from a real pattern object. This works on
    # every Python version and does not depend on the private, version-specific
    # `re._pattern_type`. The check runs once here rather than on every call.
    if isinstance(pattern, type(re.compile(""))):
        return lambda x: pattern.sub(replacement, x)
    return lambda x: x.replace(pattern, replacement)
48 | 
def swapper(pattern, replacement):
    """Return a transformer that swaps a whole value for `replacement` on a match.

    `pattern` can be a string or a compiled regular expression:

    - compiled regex: the value matches if `pattern.search` finds a match
      anywhere in it;
    - anything else: the value matches if `pattern in x`.

    Values that do not match are returned unchanged.
    """
    # Take the compiled-pattern type from a real pattern object. This works on
    # every Python version and does not depend on the private, version-specific
    # `re._pattern_type`. The check runs once here rather than on every call.
    if isinstance(pattern, type(re.compile(""))):
        return lambda x: replacement if pattern.search(x) else x
    return lambda x: replacement if pattern in x else x
56 | 
def defaulter(test, default_value):
    """Return a transformer that keeps approved values and replaces the rest.

    `test` can be a function or a list/tuple:

    - if it has a `__call__` attribute, a value is kept when `test(x)` is truthy;
    - otherwise, a value is kept when `x in test`.

    Any value that is not kept is replaced with `default_value`.
    """
    def fn(x):
        if hasattr(test, '__call__'):
            approved = test(x)
        else:
            approved = x in test
        return x if approved else default_value
    return fn
64 | 
def falsey_replacer(default_value):
    """Return a transformer that replaces falsey values with `default_value`.

    Built on `defaulter` with the value itself as the test, so truthy values
    pass through unchanged.
    """
    def identity(value):
        return value

    return defaulter(identity, default_value)
67 | 
def init_num_prefixer(prefix_char):
    """Return a transformer that prepends `prefix_char` to digit-initial strings.

    Strings that do not start with a digit are returned unchanged.
    E.g. `init_num_prefixer("n")("2013") == "n2013"`.
    """
    # Use a callable replacement so `prefix_char` is inserted literally. Putting
    # it into a replacement template would let a backslash in the prefix be
    # read as a regex escape and corrupt the output.
    def add_prefix(match):
        return prefix_char + match.group(1)

    return translator(p.starts_with_num, add_prefix)
70 | 
# Ready-made transformers, each pairing a pattern from `namestand.patterns`
# with its replacement.

# Runs of non-alphanumeric characters become a single underscore.
underscore = translator(p.non_alphanumeric, "_")

# Runs of characters that don't belong in a name become a space.
remove_non_namey = translator(p.non_namey, " ")

# ", JR" becomes " JR" (same for the other suffixes): the comma is dropped.
clean_comma_suffix = translator(p.comma_suffix, r" \1")

# Honorifics/titles become a space.
remove_name_cruft = translator(p.name_cruft, " ")

# Company-type designations become a space.
remove_company_cruft = translator(p.company_cruft, " ")

# "LAST, FIRST" becomes "FIRST LAST".
flip_last_first = translator(p.last_first, r"\2 \1")

# Each run of whitespace becomes one space.
compress_whitespace = translator(p.whitespace, " ")
84 | 
85 | 


--------------------------------------------------------------------------------
/test/test_basic.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import namestand
  3 | import re
  4 | cols = """
  5 | Id
  6 | Id2
  7 | Geography
  8 | Estimate; EMPLOYMENT STATUS - Population 16 years and over
  9 | Margin of Error; EMPLOYMENT STATUS - Population 16 years and over
 10 | Percent; EMPLOYMENT STATUS - Population 16 years and over
 11 | Percent Margin of Error; EMPLOYMENT STATUS - Population 16 years and over
 12 | Estimate; EMPLOYMENT STATUS - In labor force
 13 | Margin of Error; EMPLOYMENT STATUS - In labor force
 14 | Percent; EMPLOYMENT STATUS - In labor force
 15 | Percent Margin of Error; EMPLOYMENT STATUS - In labor force
 16 | Estimate; EMPLOYMENT STATUS - In labor force - Civilian labor force
 17 | Margin of Error; EMPLOYMENT STATUS - In labor force - Civilian labor force
 18 | Percent; EMPLOYMENT STATUS - In labor force - Civilian labor force
 19 | Percent Margin of Error; EMPLOYMENT STATUS - In labor force - Civilian labor force
 20 | Estimate; EMPLOYMENT STATUS - In labor force - Civilian labor force - Employed
 21 | Margin of Error; EMPLOYMENT STATUS - In labor force - Civilian labor force - Employed
 22 | Percent; EMPLOYMENT STATUS - In labor force - Civilian labor force - Employed
 23 | Percent Margin of Error; EMPLOYMENT STATUS - In labor force - Civilian labor force - Employed
 24 | """.strip().split("\n")
 25 | 
 26 | def test_downscore():
 27 |    assert(namestand.downscore(cols[0]) == "id")
 28 |    c = namestand.downscore(cols) 
 29 |    assert(len(c) == len(cols))
 30 |    assert(c[0] == "id")
 31 |    assert(c[1] == "id2")
 32 |    assert(c[2] == "geography")
 33 |    assert(c[3] == "estimate_employment_status_population_16_years_and_over")
 34 | 
 35 | def test_num_prefixer():
 36 |     d = namestand.downscore
 37 |     assert(d("2013 Happiness") == "_2013_happiness")
 38 |     assert(d("The 2013 Happiness") == "the_2013_happiness")
 39 |     assert(namestand.init_num_prefixer("n")("2013") == "n2013")
 40 | 
 41 | def test_translator():
 42 |     converter = namestand.combine([
 43 |         namestand.downscore,
 44 |         namestand.translator("estimate_", "est_"),
 45 |         namestand.translator("percent_", "pct_"),
 46 |         namestand.translator("margin_of_error_", "moe_"),
 47 |         namestand.translator("employment_status", "status"),
 48 |         namestand.translator("population", "pop"),
 49 |         namestand.translator("_years_and_over", "y"),
 50 |         namestand.translator("_civilian", "_civ"),
 51 |         namestand.translator("_labor_force", "_lf"),
 52 |     ])
 53 |     c = converter(cols)
 54 |     assert(c[2] == "geography")
 55 |     assert(c[3] == "est_status_pop_16y")
 56 |     assert(c[6] == "pct_moe_status_pop_16y")
 57 |     assert(c[-1] == "pct_moe_status_in_lf_civ_lf_employed")
 58 | 
 59 | def test_last_first():
 60 |     lf = namestand.utils.flip_last_first
 61 |     assert(lf("Antony, Mark") == "Mark Antony")
 62 | 
 63 | def test_flip_proper():
 64 |     fp = namestand.person_basic
 65 |     assert(fp("Antony, Mark") == "MARK ANTONY")
 66 |     assert(fp("Antony, Mark M.") == "MARK M ANTONY")
 67 |     assert(fp("Mark M. Antony") == "MARK M ANTONY")
 68 |     assert(fp(u"Mark M. Antoñy") == u"MARK M ANTOÑY")
 69 |     assert(fp(u"Diego Velázquez-O'Connor") == u"DIEGO VELÁZQUEZ-O'CONNOR")
 70 | 
 71 | def test_complex_names():
 72 |     c = namestand.person_basic
 73 |     assert(c("Nolpmet, John Esq.") == "JOHN NOLPMET")
 74 |     assert(c("Nolpmet, John Mr.") == "JOHN NOLPMET")
 75 |     assert(c("Nolpmet, John M. Mr.") == "JOHN M NOLPMET")
 76 |     assert(c("John Nolpmet, Jr.") == "JOHN NOLPMET JR")
 77 |     assert(c("John Nolpmet, VIII") == "JOHN NOLPMET VIII")
 78 | 
 79 | def test_company():
 80 |     c = namestand.company_basic
 81 |     assert(c("American Banana Stand, Inc.") == "AMERICAN BANANA STAND")
 82 | 
 83 | def test_list_defaulter():
 84 |     choices = [ "foo", "bar" ]
 85 |     x = namestand.combine([
 86 |         namestand.defaulter(choices, "other")
 87 |     ])
 88 |     orig = [ "gah", "bar", "foo" ]
 89 |     assert(x(orig) == [ "other", "bar", "foo" ])
 90 | 
 91 | def test_fn_defaulter():
 92 |     x = namestand.combine([
 93 |         namestand.falsey_replacer("NOPE")
 94 |     ])
 95 |     orig = [ None, False, "hi", "there" ]
 96 |     assert(x(orig) == [ "NOPE", "NOPE", "hi", "there" ])
 97 | 
 98 | def test_swapper():
 99 |     x = namestand.swapper("BUZZFEED", "BuzzFeed")
100 |     y = namestand.swapper(re.compile("BUZZFEED", re.I), "BuzzFeed")
101 |     assert(x("BUZZFEED INC") == "BuzzFeed")
102 |     assert(x("THE BUZZFEED") == "BuzzFeed")
103 |     assert(x("BuzzFeed Inc.") == "BuzzFeed Inc.")
104 |     assert(y("BUZZFEED INC") == "BuzzFeed")
105 |     assert(y("BuzzFeed Inc.") == "BuzzFeed")
106 |     assert(y("The BuzzFeed Inc.") == "BuzzFeed")
107 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # namestand
  2 | 
  3 | `namestand` is a Python library for easily transforming/standardizing lists of names (and other strings). No magic here, just a collection of useful tools.
  4 | 
  5 | `namestand` was developed with unwieldy database column names in mind, but can be applied to any list of strings. Other uses might include: standardizing political donor names, normalizing survey responses, et cetera.
  6 | 
  7 | ## Installation
  8 | 
  9 | ```
 10 | pip install namestand
 11 | ```
 12 | 
 13 | ## Pre-Built Converters
 14 | 
 15 | `namestand` comes with a set* of broadly useful converters.
 16 | 
 17 | *Right now, just three of 'em. Contributions and suggestions welcome.
 18 | 
 19 | ### namestand.downscore(string_or_list_of_strings)
 20 | 
 21 | Suggested usage: column names, form-response options, etc.
 22 | 
 23 | Steps:
 24 | 
 25 | 1. Lowercases the string
 26 | 2. Strips any leading and trailing whitespace
 27 | 3. Converts any run of characters other than ASCII letters and digits to a single underscore
 28 | 4. Removes any leading and trailing underscores
 29 | 5. Prefixes the string with "_" if it starts with a digit (which can otherwise cause trouble with `pandas` and other libraries). E.g., "2013 Happiness" becomes "_2013_happiness".
 30 | 
 31 | Example:
 32 | 
 33 | ```python
 34 | namestand.downscore("Case Number") == "case_number"
 35 | 
 36 | namestand.downscore([
 37 |     "Case Number",
 38 |     "Case #",
 39 |     "Is Super-Duper?"
 40 | ]) == [
 41 |     "case_number",
 42 |     "case",
 43 |     "is_super_duper"
 44 | ]
 45 | ```
 46 | 
 47 | ### namestand.person_basic(string_or_list_of_strings) [very alpha]
 48 | 
 49 | Suggested usage: Donor names, etc.; note, though, that this converter does not have any special knowledge of the world, e.g., that "Riccchard" is likely a misspelling of "Richard".
 50 | 
 51 | Steps:
 52 | 
 53 | 1. Uppercases the string
 54 | 2. Strips any leading and trailing whitespace
 55 | 3. Flips the "first" and "last" names if a comma is present
 56 | 4. Removes any characters that aren't (unicode) letters, `'`, `-`, or spaces.
 57 | 
 58 | Along the way, it tries to gracefully handle name prefixes (Mr./Mrs./etc.) and suffixes (Jr./Sr./VII/Esq./etc.).
 59 | 
 60 | Example:
 61 | 
 62 | ```python
 63 | namestand.person_basic("Antony, Mark") == "MARK ANTONY"
 64 | namestand.person_basic([
 65 |     u"Diego Velázquez-O'Connor",
 66 |     "Antony, Mark"
 67 | ]) == [
 68 |     u"DIEGO VELÁZQUEZ-O'CONNOR",
 69 |     "MARK ANTONY"
 70 | ]
 71 | ```
 72 | ### namestand.company_basic(string_or_list_of_strings) [very alpha]
 73 | 
 74 | Tries to remove common cruft from company names.
 75 | 
 76 | Steps:
 77 | 
 78 | 1. Uppercases the string
 79 | 2. Strips any leading and trailing whitespace
 80 | 3. Removes any characters that aren't (unicode) letters, `'`, `-`, or spaces.
 81 | 4. Removes "LLC", "LTD", "INC", and "LLP"
 82 | 
 83 | Example:
 84 | 
 85 | ```python
 86 | namestand.company_basic("American Banana Stand, Inc.") == "AMERICAN BANANA STAND"
 87 | ```
 88 | 
 89 | ## Custom Converters
 90 | 
 91 | You can easily build your own name-standardizing pipelines using the following tools.
 92 | 
 93 | ### namestand.combine(list_of_transformers)
 94 | 
 95 | This function accepts a list of transformers (i.e., functions that accept a string and return a string) and returns a pipeline (i.e., a function that can be used in the same way as the pre-built converters). Converters themselves can be used as parts of pipelines, too. For example, if you wanted to change the `downscore` method to use hyphens, instead:
 96 | 
 97 | ```python
 98 | downhyphen = namestand.combine([
 99 |     namestand.downscore,
100 |     lambda x: x.replace("_", "-")
101 | ])
102 | ```
103 | 
104 | But `namestand` already comes with a few helpers for doing things like string replacements. So you could also do:
105 | 
106 | ```python
107 | downhyphen = namestand.combine([
108 |     namestand.downscore,
109 |     namestand.translator("_", "-")
110 | ])
111 | ```
112 | 
113 | Some helpful transformers:
114 | 
115 | - __`namestand.translator(pattern, replacement)`__: `pattern` can be a string or a compiled regex. Equivalent to an argument-aware combination of `lambda x: x.replace(string, replacement)` and `lambda x: re.sub(regex, replacement, x)`.
116 | 
117 | - __`namestand.swapper(pattern, replacement)`__: `pattern` can be a string or a compiled regex. If a given name matches the pattern (`re.search` for compiled regexes, `pattern in x` for string-`pattern`s), the entire name is replaced with the replacement. Otherwise, the given name is retained.
118 | 
119 | - __`namestand.stripper(chars_to_strip)`__: Equivalent to `lambda x: x.strip(chars_to_strip)`
120 | 
121 | - __`namestand.defaulter(test, default_value)`__: `test` can be either a list of "approved" values, or a function that returns True or False. If `x` doesn't pass the test (or isn't in the list), it is replaced with `default_value`.
122 | 
123 | ## Tests
124 | 
125 | Additional usage examples can be found in [test/](test/). To test, run `nosetests` or `tox` from this repo's root directory. Currently tested, and passing, on the following Python versions:
126 | 
127 | ```
128 | 2.7.14
129 | 3.5.4
130 | 3.6.4
131 | 3.7.5
132 | 3.8.0
133 | ```
134 | 
135 | ## Feedback?
136 | 
137 | Pull requests, suggestions, etc. welcome.
138 | 


--------------------------------------------------------------------------------