├── requirements-dev.txt ├── requirements.txt ├── forcats ├── __init__.py └── functions.py ├── README.md ├── .travis.yml ├── setup.py ├── LICENSE ├── tests ├── test_cat_lump.py ├── test_cat_count.py └── test_cat_anon.py └── .gitignore /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | black 2 | flake8 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pandas 3 | pytest 4 | -------------------------------------------------------------------------------- /forcats/__init__.py: -------------------------------------------------------------------------------- 1 | """ Tools for working with categorical data 2 | """ 3 | 4 | from forcats.functions import cat_lump, cat_count, cat_anon # noqa -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # forcats 2 | 3 | [![Build Status](https://travis-ci.org/MangoTheCat/forcats.svg?branch=master)](https://travis-ci.org/MangoTheCat/forcats) 4 | 5 | > Python tools for working with categorical data 6 | 7 | ## License 8 | 9 | MIT 2019 © Mango Solutions 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.5" 4 | - "3.6" 5 | - "3.7-dev" # 3.7 development branch 6 | # command to install dependencies 7 | install: 8 | - pip install -r requirements.txt 9 | - python setup.py install 10 | # command to run tests 11 | script: 12 | - pytest 13 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | with open("README.md", "r") as f: 4 | long_description = f.read() 5 | 6 | setup( 7 | name="forcats", 8 | version="0.1.0", 9 | packages=["forcats"], 10 | description="Tools for working with categorical data", 11 | url="http://github.com/MangoTheCat/forcats", 12 | author="Mango Solutions", 13 | zip_safe=False, 14 | install_requires=["pandas", "numpy"], 15 | license="MIT", 16 | ) 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Mango Solutions 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /tests/test_cat_lump.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from forcats.functions import cat_lump 4 | 5 | import pytest 6 | 7 | 8 | @pytest.fixture() 9 | def data(): 10 | 11 | # First column is as strings, second is as pandas categorical data type 12 | df = pd.DataFrame( 13 | { 14 | "A": ["a", "a", "a", "a", "b", "b", "b", "c", "c", "d"], 15 | "B": pd.Categorical(["a", "a", "a", "a", "b", "b", "b", "c", "c", "d"]), 16 | } 17 | ) 18 | return df 19 | 20 | 21 | def test_cat_lump_on_string_column(data): 22 | 23 | result = cat_lump(data.A, n=2) 24 | 25 | target = ["a", "a", "a", "a", "b", "b", "b", "Other", "Other", "Other"] 26 | assert result.tolist() == target 27 | 28 | 29 | def test_cat_lump_on_categorical_column(data): 30 | 31 | result = cat_lump(data.B, n=2) 32 | 33 | target = ["a", "a", "a", "a", "b", "b", "b", "Other", "Other", "Other"] 34 | assert result.tolist() == target 35 | 36 | def test_cat_lump_with_prop(data): 37 | 38 | result_hi = cat_lump(data.B, prop=0.3) 39 | 40 | target = ["a", "a", "a", "a", "b", "b", "b", "Other", "Other", "Other"] 41 | assert result_hi.tolist() == target 42 | 43 | result_lo = cat_lump(data.B, prop=0.1) 44 | 45 | target = ["a", "a", "a", "a", "b", "b", "b", "c", "c", "d"] 46 | assert result_lo.tolist() == target 47 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | # C extensions 6 | *.so 7 | # Distribution / packaging 8 | .Python 9 | build/ 10 | develop-eggs/ 11 | dist/ 12 | downloads/ 13 | eggs/ 14 | .eggs/ 15 | lib/ 16 | lib64/ 17 | parts/ 18 | sdist/ 19 | var/ 20 | wheels/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | MANIFEST 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 28 | *.manifest 29 | *.spec 30 | # Installer logs 31 | pip-log.txt 32 | pip-delete-this-directory.txt 33 | # Unit test / coverage reports 34 | htmlcov/ 35 | .tox/ 36 | .coverage 37 | .coverage.* 38 | .cache 39 | nosetests.xml 40 | coverage.xml 41 | *.cover 42 | .hypothesis/ 43 | .pytest_cache/ 44 | # Translations 45 | *.mo 46 | *.pot 47 | # Django stuff: 48 | *.log 49 | local_settings.py 50 | db.sqlite3 51 | # Flask stuff: 52 | instance/ 53 | .webassets-cache 54 | # Scrapy stuff: 55 | .scrapy 56 | # Sphinx documentation 57 | docs/_build/ 58 | # PyBuilder 59 | target/ 60 | # Jupyter Notebook 61 | .ipynb_checkpoints 62 | # pyenv 63 | .python-version 64 | # celery beat schedule file 65 | celerybeat-schedule 66 | # SageMath parsed files 67 | *.sage.py 68 | # Environments 69 | .env 70 | .venv 71 | env/ 72 | venv/ 73 | ENV/ 74 | env.bak/ 75 | venv.bak/ 76 | # Spyder project settings 77 | .spyderproject 78 | .spyproject 79 | # Rope project settings 80 | .ropeproject 81 | # mkdocs documentation 82 | /site 83 | # mypy 84 | .mypy_cache/ -------------------------------------------------------------------------------- /tests/test_cat_count.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from forcats.functions import cat_count 4 | 5 | import pytest 6 | @pytest.fixture() 7 | def data(): 8 | 9 | # First column is as strings, second is as pandas categorical data type 10 | df = pd.DataFrame( 11 | { 12 | "A": ["a", "a", "a", "a", "b", "b", "b", "c", "c", "d"], 13 | "B": pd.Categorical(["a", "a", "a", "a", "b", "b", "b", "c", "c", "d"]), 14 | } 15 | ) 16 | return df 17 | 18 | def test_cat_count_on_string_column(data): 19 | result = (cat_count(data.A, sort=False) 20 | .set_index('f') 21 | .sort_index() 22 | .rename_axis(None) 23 | ) 24 | 25 | target = pd.DataFrame({ 26 | 'n': {'a': 4, 'b': 3, 'c': 2, 'd': 1} 27 | }) 28 | 29 | pd.testing.assert_frame_equal(result, target) 30 | 31 | 32 | def test_cat_count_on_string_column_sort(data): 33 | 34 | result = cat_count(data.A, sort = True) 35 | 36 | target = pd.DataFrame({ 37 | 'f': {0: 'a', 1: 'b', 2: 'c', 3: 'd'}, 38 | 'n': {0: 4, 1: 3, 2: 2, 3: 1}, 39 | }) 40 | pd.testing.assert_frame_equal(result, target) 41 | 42 | 43 | def test_cat_count_on_categorical_column(data): 44 | 45 | result = cat_count(data.B, prop = True) 46 | 47 | target = pd.DataFrame({ 48 | 'f': pd.Categorical(['a', 'b', 'c', 'd']), 49 | 'n': [4, 3, 2, 1], 50 | 'p': [0.4, 0.3, 0.2, 0.1] 51 | }) 52 | 53 | pd.testing.assert_frame_equal(result, target) 54 | 55 | def test_cat_count_on_categorical_column_sort(data): 56 | 57 | result = cat_count(data.B, sort = True) 58 | 59 | target = pd.DataFrame({ 60 | 'f': pd.Categorical(['a', 'b', 'c', 'd']), 61 | 'n': [4, 3, 2, 1] 62 | }) 63 | pd.testing.assert_frame_equal(result, target) 64 | -------------------------------------------------------------------------------- /tests/test_cat_anon.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from forcats.functions import cat_anon 4 | 5 | import pytest 6 | 7 | 8 | @pytest.fixture() 9 | def data(): 10 | 11 | # First column is as strings, second is as pandas categorical data type, 12 | # third is a ten-level string series 13 | df = pd.DataFrame( 14 | { 15 | "A": ["a", "a", "a", "a", "b", "b", "b", "c", "c", "d"], 16 | "B": pd.Categorical(["a", "a", "a", "a", "b", "b", "b", "c", "c", "d"]), 17 | "C": list('abcdefghij') 18 | } 19 | ) 20 | return df 21 | 22 | 23 | def test_cat_anon_on_string_column(data): 24 | 25 | result = cat_anon(data.A) 26 | 27 | target = ['1', '1', '1', '1', '2', '2', '2', '3', '3', '4'] 28 | assert result.tolist() == target 29 | 30 | result_pre = cat_anon(data.A, 'Person_') 31 | 32 | target = ['Person_1', 'Person_1', 'Person_1', 'Person_1', 33 | 'Person_2', 'Person_2', 'Person_2', 'Person_3', 34 | 'Person_3', 'Person_4'] 35 | assert result_pre.tolist() == target 36 | 37 | 38 | def test_cat_anon_on_categorical_column(data): 39 | 40 | result = cat_anon(data.B) 41 | 42 | target = ['1', '1', '1', '1', '2', '2', '2', '3', '3', '4'] 43 | assert result.tolist() == target 44 | 45 | result_pre = cat_anon(data.B, 'Country_') 46 | 47 | target = ['Country_1', 'Country_1', 'Country_1', 'Country_1', 48 | 'Country_2', 'Country_2', 'Country_2', 'Country_3', 49 | 'Country_3', 'Country_4'] 50 | assert result_pre.tolist() == target 51 | 52 | 53 | def test_cat_anon_on_10_level_column(data): 54 | 55 | result = cat_anon(data.C) 56 | 57 | target = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10'] 58 | assert result.tolist() == target 59 | 60 | result_pre = cat_anon(data.C, 'Religion_') 61 | 62 | target = ['Religion_01', 'Religion_02', 'Religion_03', 'Religion_04', 63 | 'Religion_05', 'Religion_06', 'Religion_07', 'Religion_08', 64 | 'Religion_09', 'Religion_10'] 65 | assert result_pre.tolist() == target 66 | -------------------------------------------------------------------------------- /forcats/functions.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | def cat_lump(x, n=5, prop=None, other_level="Other"): 4 | """ 5 | Lump together least common categories into an "Other" category 6 | 7 | Parameters 8 | ---------- 9 | x : pd.Series 10 | series to be modified 11 | n : int 12 | number of levels to preserve 13 | prop : float 14 | optional instead of n. Choose the minimum proportion for a level. 15 | Must be between 0 and 1. Overrides n. 16 | other_level : str 17 | "other" category label 18 | 19 | Returns 20 | ------- 21 | y : pd.Series 22 | modified series (with categorical type) 23 | """ 24 | counts = x.value_counts() 25 | if prop: 26 | assert 0 <= prop <= 1 27 | min_count = int(prop * x.size) 28 | if min_count > counts.min(): 29 | repl = counts.loc[counts < min_count].index 30 | x = x.replace(repl, other_level) 31 | elif len(counts) > n: 32 | repl = counts.iloc[n:].index 33 | x = x.replace(repl, other_level) 34 | return x 35 | 36 | 37 | def cat_count(x, sort=False, prop=False): 38 | """ 39 | Count entries in a factor 40 | 41 | Parameters 42 | ---------- 43 | x : pd.Series 44 | series to be counted 45 | sort : boolean 46 | If `True`, sort the result so that the most common values are displayed at the top. 47 | prop : boolean 48 | If `True`, compute the fraction of marginal table. 49 | 50 | Returns 51 | ------- 52 | y : pd.core.frame.DataFrame 53 | A df with columns `f`, `n` and `p`, if prop is `True`. 54 | """ 55 | 56 | counts = x.value_counts(sort=sort) 57 | df = pd.DataFrame({'f': counts.index, 'n': counts.values}) 58 | 59 | if(prop): 60 | df['p'] = df['n']/sum(df['n']) 61 | 62 | return df 63 | 64 | 65 | def cat_anon(x, prefix = ""): 66 | """ 67 | Anonymise category levels 68 | 69 | Parameters 70 | ---------- 71 | x : pd.Series 72 | series to be modified 73 | prefix : string 74 | string prefix to insert in front of the numeric labels 75 | 76 | Returns 77 | ------- 78 | y : pd.Series 79 | Anonymised pandas series 80 | """ 81 | x_array = pd.factorize(x)[0] + 1 82 | 83 | digits = [len(str(x)) for x in x_array] 84 | digits = max(digits) 85 | 86 | x = [str(x).zfill(digits) for x in x_array] 87 | x = prefix + pd.Series(x) 88 | 89 | return x 90 | --------------------------------------------------------------------------------