├── requirements-dev.txt
├── requirements.txt
├── forcats
    ├── __init__.py
    └── functions.py
├── README.md
├── .travis.yml
├── setup.py
├── LICENSE
├── tests
    ├── test_cat_lump.py
    ├── test_cat_count.py
    └── test_cat_anon.py
└── .gitignore


/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | black
2 | flake8
3 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | pandas
3 | pytest
4 | 


--------------------------------------------------------------------------------
/forcats/__init__.py:
--------------------------------------------------------------------------------
1 | """ Tools for working with categorical data
2 | """
3 | 
4 | from forcats.functions import cat_lump, cat_count, cat_anon  # noqa


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # forcats
 2 | 
 3 | [![Build Status](https://travis-ci.org/MangoTheCat/forcats.svg?branch=master)](https://travis-ci.org/MangoTheCat/forcats)
 4 | 
 5 | > Python tools for working with categorical data
 6 | 
 7 | ## License
 8 | 
 9 | MIT 2019 © Mango Solutions
10 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | python:
 3 |   - "3.5"
 4 |   - "3.6"
 5 |   - "3.7-dev" # 3.7 development branch
 6 | # command to install dependencies
 7 | install:
 8 |   - pip install -r requirements.txt
 9 |   - python setup.py install
10 | # command to run tests
11 | script:
12 |   - pytest
13 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | 
 3 | with open("README.md", "r") as f:
 4 |     long_description = f.read()
 5 | 
 6 | setup(
 7 |     name="forcats",
 8 |     version="0.1.0",
 9 |     packages=["forcats"],
10 |     description="Tools for working with categorical data",
11 |     url="http://github.com/MangoTheCat/forcats",
12 |     author="Mango Solutions",
13 |     zip_safe=False,
14 |     install_requires=["pandas", "numpy"],
15 |     license="MIT",
16 | )
17 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 Mango Solutions
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/tests/test_cat_lump.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | 
 3 | from forcats.functions import cat_lump
 4 | 
 5 | import pytest
 6 | 
 7 | 
 8 | @pytest.fixture()
 9 | def data():
10 | 
11 |     # First column is as strings, second is as pandas categorical data type
12 |     df = pd.DataFrame(
13 |         {
14 |             "A": ["a", "a", "a", "a", "b", "b", "b", "c", "c", "d"],
15 |             "B": pd.Categorical(["a", "a", "a", "a", "b", "b", "b", "c", "c", "d"]),
16 |         }
17 |     )
18 |     return df
19 | 
20 | 
def test_cat_lump_on_string_column(data):
    """Keeping two levels of the string column turns "c" and "d" into "Other"."""
    expected = ["a", "a", "a", "a", "b", "b", "b", "Other", "Other", "Other"]

    lumped = cat_lump(data.A, n=2)

    assert lumped.tolist() == expected
27 | 
28 | 
def test_cat_lump_on_categorical_column(data):
    """Keeping two levels of the categorical column turns "c" and "d" into "Other"."""
    expected = ["a", "a", "a", "a", "b", "b", "b", "Other", "Other", "Other"]

    lumped = cat_lump(data.B, n=2)

    assert lumped.tolist() == expected
35 | 
def test_cat_lump_with_prop(data):
    """Check proportion-based lumping on the categorical column."""
    # With a 0.3 threshold the test expects "c" and "d" to be lumped.
    lumped = ["a", "a", "a", "a", "b", "b", "b", "Other", "Other", "Other"]
    assert cat_lump(data.B, prop=0.3).tolist() == lumped

    # With a 0.1 threshold the test expects the column to come back unchanged.
    untouched = ["a", "a", "a", "a", "b", "b", "b", "c", "c", "d"]
    assert cat_lump(data.B, prop=0.1).tolist() == untouched
47 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | # C extensions
 6 | *.so
 7 | # Distribution / packaging
 8 | .Python
 9 | build/
10 | develop-eggs/
11 | dist/
12 | downloads/
13 | eggs/
14 | .eggs/
15 | lib/
16 | lib64/
17 | parts/
18 | sdist/
19 | var/
20 | wheels/
21 | *.egg-info/
22 | .installed.cfg
23 | *.egg
24 | MANIFEST
25 | # PyInstaller
26 | #  Usually these files are written by a python script from a template
27 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
28 | *.manifest
29 | *.spec
30 | # Installer logs
31 | pip-log.txt
32 | pip-delete-this-directory.txt
33 | # Unit test / coverage reports
34 | htmlcov/
35 | .tox/
36 | .coverage
37 | .coverage.*
38 | .cache
39 | nosetests.xml
40 | coverage.xml
41 | *.cover
42 | .hypothesis/
43 | .pytest_cache/
44 | # Translations
45 | *.mo
46 | *.pot
47 | # Django stuff:
48 | *.log
49 | local_settings.py
50 | db.sqlite3
51 | # Flask stuff:
52 | instance/
53 | .webassets-cache
54 | # Scrapy stuff:
55 | .scrapy
56 | # Sphinx documentation
57 | docs/_build/
58 | # PyBuilder
59 | target/
60 | # Jupyter Notebook
61 | .ipynb_checkpoints
62 | # pyenv
63 | .python-version
64 | # celery beat schedule file
65 | celerybeat-schedule
66 | # SageMath parsed files
67 | *.sage.py
68 | # Environments
69 | .env
70 | .venv
71 | env/
72 | venv/
73 | ENV/
74 | env.bak/
75 | venv.bak/
76 | # Spyder project settings
77 | .spyderproject
78 | .spyproject
79 | # Rope project settings
80 | .ropeproject
81 | # mkdocs documentation
82 | /site
83 | # mypy
84 | .mypy_cache/


--------------------------------------------------------------------------------
/tests/test_cat_count.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | 
 3 | from forcats.functions import cat_count
 4 | 
 5 | import pytest
 6 | @pytest.fixture()
 7 | def data():
 8 | 
 9 |     # First column is as strings, second is as pandas categorical data type
10 |     df = pd.DataFrame(
11 |         {
12 |             "A": ["a", "a", "a", "a", "b", "b", "b", "c", "c", "d"],
13 |             "B": pd.Categorical(["a", "a", "a", "a", "b", "b", "b", "c", "c", "d"]),
14 |         }
15 |     )
16 |     return df
17 | 
def test_cat_count_on_string_column(data):
    """Unsorted counts of the string column, compared after ordering by level."""
    expected = pd.DataFrame({
        'n': {'a': 4, 'b': 3, 'c': 2, 'd': 1}
    })

    counted = cat_count(data.A, sort=False)
    # Row order is not pinned when sort=False, so index by level and sort
    # before comparing.
    by_level = counted.set_index('f').sort_index()
    observed = by_level.rename_axis(None)

    pd.testing.assert_frame_equal(observed, expected)
30 | 
31 | 
def test_cat_count_on_string_column_sort(data):
    """Sorted counts of the string column list the most common level first."""
    expected = pd.DataFrame({
        'f': {0: 'a', 1: 'b', 2: 'c', 3: 'd'},
        'n': {0: 4, 1: 3, 2: 2, 3: 1},
    })

    observed = cat_count(data.A, sort=True)

    pd.testing.assert_frame_equal(observed, expected)
41 | 
42 | 
def test_cat_count_on_categorical_column(data):
    """Counts of the categorical column with the proportion column requested."""
    expected = pd.DataFrame({
        'f': pd.Categorical(['a', 'b', 'c', 'd']),
        'n': [4, 3, 2, 1],
        'p': [0.4, 0.3, 0.2, 0.1],
    })

    observed = cat_count(data.B, prop=True)

    pd.testing.assert_frame_equal(observed, expected)
54 | 
def test_cat_count_on_categorical_column_sort(data):
    """Sorted counts of the categorical column list the most common level first."""
    expected = pd.DataFrame({
        'f': pd.Categorical(['a', 'b', 'c', 'd']),
        'n': [4, 3, 2, 1],
    })

    observed = cat_count(data.B, sort=True)

    pd.testing.assert_frame_equal(observed, expected)
64 | 


--------------------------------------------------------------------------------
/tests/test_cat_anon.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | 
 3 | from forcats.functions import cat_anon
 4 | 
 5 | import pytest
 6 | 
 7 | 
 8 | @pytest.fixture()
 9 | def data():
10 | 
11 |     # First column is as strings, second is as pandas categorical data type,
12 | 	# third is a ten-level string series
13 |     df = pd.DataFrame(
14 |         {
15 |             "A": ["a", "a", "a", "a", "b", "b", "b", "c", "c", "d"],
16 |             "B": pd.Categorical(["a", "a", "a", "a", "b", "b", "b", "c", "c", "d"]),
17 |             "C": list('abcdefghij')
18 |         }
19 |     )
20 |     return df
21 | 
22 | 
def test_cat_anon_on_string_column(data):
    """String levels become "1".."4", optionally prefixed."""
    codes = ['1', '1', '1', '1', '2', '2', '2', '3', '3', '4']

    # Without a prefix the labels are the bare codes.
    assert cat_anon(data.A).tolist() == codes

    # With a prefix every label is the prefix followed by its code.
    prefixed = ['Person_' + code for code in codes]
    assert cat_anon(data.A, 'Person_').tolist() == prefixed
36 | 
37 | 
def test_cat_anon_on_categorical_column(data):
    """Categorical levels become "1".."4", optionally prefixed."""
    codes = ['1', '1', '1', '1', '2', '2', '2', '3', '3', '4']

    # Without a prefix the labels are the bare codes.
    assert cat_anon(data.B).tolist() == codes

    # With a prefix every label is the prefix followed by its code.
    prefixed = ['Country_' + code for code in codes]
    assert cat_anon(data.B, 'Country_').tolist() == prefixed
51 | 
52 | 
def test_cat_anon_on_10_level_column(data):
    """Ten distinct levels are numbered "01".."10", i.e. zero-padded to two digits."""
    codes = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10']

    # Without a prefix the labels are the bare padded codes.
    assert cat_anon(data.C).tolist() == codes

    # With a prefix every label is the prefix followed by its padded code.
    prefixed = ['Religion_' + code for code in codes]
    assert cat_anon(data.C, 'Religion_').tolist() == prefixed
66 | 


--------------------------------------------------------------------------------
/forcats/functions.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | 
 3 | def cat_lump(x, n=5, prop=None, other_level="Other"):
 4 |     """
 5 |     Lump together least common categories into an "Other" category
 6 | 
 7 |     Parameters
 8 |     ----------
 9 |     x : pd.Series
10 |       series to be modified
11 |     n : int
12 |       number of levels to preserve
13 |     prop : float
14 |       optional instead of n. Choose the minimum proportion for a level.
15 |       Must be between 0 and 1. Overrides n.
16 |     other_level : str
17 |       "other" category label
18 | 
19 |     Returns
20 |     -------
21 |     y : pd.Series
22 |       modified series (with categorical type)
23 |     """
24 |     counts = x.value_counts()
25 |     if prop:
26 |         assert 0 <= prop <= 1
27 |         min_count = int(prop * x.size)
28 |         if min_count > counts.min():
29 |             repl = counts.loc[counts < min_count].index
30 |             x = x.replace(repl, other_level)
31 |     elif len(counts) > n:
32 |         repl = counts.iloc[n:].index
33 |         x = x.replace(repl, other_level)
34 |     return x
35 | 
36 | 
def cat_count(x, sort=False, prop=False):
    """
    Tabulate how often each level of a factor occurs

    Parameters
    ----------
    x : pd.Series
      series to be counted
    sort : boolean
      If `True`, order the rows so that the most common values come first.
    prop : boolean
      If `True`, add each level's share of the total count.

    Returns
    -------
    y : pd.core.frame.DataFrame
      A df with columns `f` (the level) and `n` (its count), plus `p`
      (n divided by the sum of n) when prop is `True`.
    """
    tally = x.value_counts(sort=sort)
    table = pd.DataFrame({"f": tally.index, "n": tally.values})

    if not prop:
        return table

    # Share of each level among all counted entries.
    total = table["n"].sum()
    table["p"] = table["n"] / total
    return table
63 | 
64 | 
def cat_anon(x, prefix = ""):
    """
    Anonymise category levels

    Each distinct value is replaced by a number (1, 2, ... in order of
    first appearance), zero-padded so that all labels have the same width.

    Parameters
    ----------
    x : pd.Series
      series to be modified
    prefix : string
      string prefix to insert in front of the numeric labels

    Returns
    -------
    y : pd.Series
      Anonymised pandas series of string labels, carrying the same index
      as `x` so it can be assigned back to the originating frame. An empty
      input gives an empty series.
    """
    # factorize numbers levels from 0; shift so that labels start at 1.
    # NOTE(review): pd.factorize codes missing values as -1 by default, so
    # they would be labelled 0 here - confirm that is the intended label.
    codes = pd.factorize(x)[0] + 1

    # Codes are non-negative, so the largest one has the most digits.
    width = len(str(codes.max())) if codes.size else 1

    labels = [prefix + str(code).zfill(width) for code in codes]

    # Keep the caller's index; fall back to a default one for index-less input.
    index = getattr(x, "index", None)
    if not labels:
        # Pin the dtype so an empty result is still a series of labels.
        return pd.Series(labels, index=index, dtype=object)
    return pd.Series(labels, index=index)
90 | 	


--------------------------------------------------------------------------------