├── .coveragerc
├── .gitignore
├── .landscape.yaml
├── .travis.yml
├── ISSUE_TEMPLATE.md
├── LICENSE
├── MANIFEST.in
├── PULL_REQUEST_TEMPLATE.md
├── README.md
├── adult.csv.gz
├── ci
    ├── .travis_install.sh
    └── .travis_test.sh
├── datacleaner
    ├── __init__.py
    ├── _version.py
    └── datacleaner.py
├── setup.py
└── tests.py


/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | branch = True
3 | source = datacleaner
4 | include = */datacleaner/*
5 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # C extensions
 7 | *.so
 8 | 
 9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | #  Usually these files are written by a python script from a template
29 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | 
55 | # Sphinx documentation
56 | docs/_build/
57 | 
58 | # PyBuilder
59 | target/
60 | 
61 | #Ipython Notebook
62 | .ipynb_checkpoints
63 | 
64 | test_data/adult.csv.gz
65 | 
66 | testing.ipynb
67 | 
68 | tests.ipynb
69 | 


--------------------------------------------------------------------------------
/.landscape.yaml:
--------------------------------------------------------------------------------
1 | doc-warnings: yes
2 | 
3 | ignore-patterns:
4 |   - __init__.py
5 | 
6 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | virtualenv:
 3 |   system_site_packages: true
 4 | env:
 5 |   matrix:
 6 |     # let's start simple:
 7 |     - PYTHON_VERSION="2.7" LATEST="true"
 8 |     - PYTHON_VERSION="3.4" LATEST="true"
 9 |     - PYTHON_VERSION="3.5" COVERAGE="true" LATEST="true"
10 |     - PYTHON_VERSION="3.5" LATEST="true"
11 | install: source ./ci/.travis_install.sh
12 | script: bash ./ci/.travis_test.sh
13 | after_success:
14 |     # Ignore coveralls failures as the coveralls server is not very reliable
15 |     # but we don't want travis to report a failure in the github UI just
16 |     # because the coverage report failed to be published.
17 |     - if [[ "$COVERAGE" == "true" ]]; then coveralls || echo "failed"; fi
18 | cache: apt
19 | sudo: false
20 | 


--------------------------------------------------------------------------------
/ISSUE_TEMPLATE.md:
--------------------------------------------------------------------------------
 1 | [provide general introduction to the issue and why it is relevant to this repository]
 2 | 
 3 | ## Context of the issue
 4 | 
 5 | [provide more detailed introduction to the issue itself and why it is relevant]
 6 | 
 7 | [the remaining entries are only necessary if you are reporting a bug]
 8 | 
 9 | ## Process to reproduce the issue
10 | 
11 | [ordered list the process to finding and recreating the issue, example below]
12 | 
13 | 1. User creates TPOT instance
14 | 2. User calls TPOT `fit()` function with training data
15 | 3. TPOT crashes with a `KeyError` after 5 generations
16 | 
17 | ## Expected result
18 | 
19 | [describe what you would expect to have resulted from this process]
20 | 
21 | ## Current result
22 | 
23 | [describe what you currently experience from this process, and thereby explain the bug]
24 | 
25 | ## Possible fix
26 | 
27 | [not necessary, but suggest fixes or reasons for the bug]
28 | 
29 | ## `name of issue` screenshot
30 | 
31 | [if relevant, include a screenshot]
32 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2016 Randy Olson
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md LICENSE
2 | recursive-include datacleaner *.py
3 | 


--------------------------------------------------------------------------------
/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
 1 | ## What does this PR do?
 2 | 
 3 | 
 4 | 
 5 | ## Where should the reviewer start?
 6 | 
 7 | 
 8 | 
 9 | ## How should this PR be tested?
10 | 
11 | 
12 | 
13 | ## Any background context you want to provide?
14 | 
15 | 
16 | 
17 | ## What are the relevant issues?
18 | 
19 | [you can link directly to issues by entering # then the number of the issue, for example, #3 links to issue 3]
20 | 
21 | ## Screenshots (if appropriate)
22 | 
23 | 
24 | 
25 | ## Questions:
26 | 
27 | - Do the docs need to be updated?
28 | - Does this PR add new (Python) dependencies?
29 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | [![Build Status](https://travis-ci.org/rhiever/datacleaner.svg?branch=master)](https://travis-ci.org/rhiever/datacleaner)
  2 | [![Code Health](https://landscape.io/github/rhiever/datacleaner/master/landscape.svg?style=flat)](https://landscape.io/github/rhiever/datacleaner/master)
  3 | [![Coverage Status](https://coveralls.io/repos/github/rhiever/datacleaner/badge.svg?branch=master)](https://coveralls.io/github/rhiever/datacleaner?branch=master)
  4 | ![Python 2.7](https://img.shields.io/badge/python-2.7-blue.svg)
  5 | ![Python 3.5](https://img.shields.io/badge/python-3.5-blue.svg)
  6 | ![License](https://img.shields.io/badge/license-MIT%20License-blue.svg)
  7 | [![PyPI version](https://badge.fury.io/py/datacleaner.svg)](https://badge.fury.io/py/datacleaner)
  8 | 
  9 | 
 10 | # datacleaner
 11 | 
 12 | [![Join the chat at https://gitter.im/rhiever/datacleaner](https://badges.gitter.im/rhiever/datacleaner.svg)](https://gitter.im/rhiever/datacleaner?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
 13 | 
 14 | A Python tool that automatically cleans data sets and readies them for analysis.
 15 | 
 16 | ## datacleaner is not magic
 17 | 
 18 | datacleaner works with data in [pandas DataFrames](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html).
 19 | 
 20 | datacleaner is not magic, and it won't take an unorganized blob of text and automagically parse it out for you.
 21 | 
 22 | What datacleaner *will* do is save you a ton of time encoding and cleaning your data once it's already in a format that pandas DataFrames can handle.
 23 | 
 24 | Currently, datacleaner does the following:
 25 | 
 26 | * Optionally drops any row with a missing value
 27 | 
 28 | * Replaces missing values with the mode (for categorical variables) or median (for continuous variables) on a column-by-column basis
 29 | 
 30 | * Encodes non-numerical variables (e.g., categorical variables with strings) with numerical equivalents
 31 | 
 32 | We plan to add more cleaning features as the project grows.
 33 | 
 34 | ## License
 35 | 
 36 | Please see the [repository license](https://github.com/rhiever/datacleaner/blob/master/LICENSE) for the licensing and usage information for datacleaner.
 37 | 
 38 | Generally, we have licensed datacleaner to make it as widely usable as possible.
 39 | 
 40 | ## Installation
 41 | 
 42 | datacleaner is built to use pandas DataFrames and some scikit-learn modules for data preprocessing. As such, we recommend installing the [Anaconda Python distribution](https://www.continuum.io/downloads) prior to installing datacleaner.
 43 | 
 44 | Once the prerequisites are installed, datacleaner can be installed with a simple `pip` command:
 45 | 
 46 | ```
 47 | pip install datacleaner
 48 | ```
 49 | 
 50 | ## Usage
 51 | 
 52 | ### datacleaner on the command line
 53 | 
 54 | datacleaner can be used on the command line. Use `--help` to see its usage instructions.
 55 | 
 56 | ```
 57 | usage: datacleaner [-h] [-cv CROSS_VAL_FILENAME] [-o OUTPUT_FILENAME]
 58 |                    [-cvo CV_OUTPUT_FILENAME] [-is INPUT_SEPARATOR]
 59 |                    [-os OUTPUT_SEPARATOR] [--drop-nans]
 60 |                    [--ignore-update-check] [--version]
 61 |                    INPUT_FILENAME
 62 | 
 63 | A Python tool that automatically cleans data sets and readies them for analysis
 64 | 
 65 | positional arguments:
 66 |   INPUT_FILENAME        File name of the data file to clean
 67 | 
 68 | optional arguments:
 69 |   -h, --help            show this help message and exit
 70 |   -cv CROSS_VAL_FILENAME
 71 |                         File name for the validation data set if performing
 72 |                         cross-validation
 73 |   -o OUTPUT_FILENAME    Data file to output the cleaned data set to
 74 |   -cvo CV_OUTPUT_FILENAME
 75 |                         Data file to output the cleaned cross-validation data
 76 |                         set to
 77 |   -is INPUT_SEPARATOR   Column separator for the input file(s) (default: \t)
 78 |   -os OUTPUT_SEPARATOR  Column separator for the output file(s) (default: \t)
 79 |   --drop-nans           Drop all rows that have a NaN in any column (default: False)
 80 |   --ignore-update-check
 81 |                         Do not check for the latest version of datacleaner
 82 |                         (default: False)
 83 |   --version             show program's version number and exit
 84 | ```
 85 | 
 86 | An example command-line call to datacleaner may look like:
 87 | 
 88 | ```
 89 | datacleaner my_data.csv -o my_clean.data.csv -is , -os ,
 90 | ```
 91 | 
 92 | which will read the data from `my_data.csv` (assuming columns are separated by commas), clean the data set, then output the resulting data set to `my_clean.data.csv`.
 93 | 
 94 | ### datacleaner in scripts
 95 | 
 96 | datacleaner can also be used as part of a script. There are two primary functions implemented in datacleaner: `autoclean` and `autoclean_cv`.
 97 | 
 98 | ```
 99 | autoclean(input_dataframe, drop_nans=False, copy=False, ignore_update_check=False)
100 |     Performs a series of automated data cleaning transformations on the provided data set
101 |     
102 |     Parameters
103 |     ----------
104 |     input_dataframe: pandas.DataFrame
105 |         Data set to clean
106 |     drop_nans: bool
107 |         Drop all rows that have a NaN in any column (default: False)
108 |     copy: bool
109 |         Make a copy of the data set (default: False) 
110 |     encoder: category_encoders transformer
111 |         The a valid category_encoders transformer which is passed an inferred cols list. Default (None: LabelEncoder)
112 |     encoder_kwargs: category_encoders
113 |         The a valid sklearn transformer to encode categorical features. Default (None)
114 |     ignore_update_check: bool
115 |         Do not check for the latest version of datacleaner
116 | 
117 |     Returns
118 |     ----------
119 |     output_dataframe: pandas.DataFrame
120 |         Cleaned data set
121 | ```
122 | 
123 | ```
124 | autoclean_cv(training_dataframe, testing_dataframe, drop_nans=False, copy=False, ignore_update_check=False)
125 |     Performs a series of automated data cleaning transformations on the provided training and testing data sets
126 |     
127 |     Unlike `autoclean()`, this function takes cross-validation into account by learning the data transformations
128 |     from only the training set, then applying those transformations to both the training and testing set.
129 |     By doing so, this function will prevent information leak from the training set into the testing set.
130 |     
131 |     Parameters
132 |     ----------
133 |     training_dataframe: pandas.DataFrame
134 |         Training data set
135 |     testing_dataframe: pandas.DataFrame
136 |         Testing data set
137 |     drop_nans: bool
138 |         Drop all rows that have a NaN in any column (default: False)
139 |     copy: bool
140 |         Make a copy of the data set (default: False)  
141 |     encoder: category_encoders transformer
142 |         The a valid category_encoders transformer which is passed an inferred cols list. Default (None: LabelEncoder)
143 |     encoder_kwargs: category_encoders
144 |         The a valid sklearn transformer to encode categorical features. Default (None)
145 |     ignore_update_check: bool
146 |         Do not check for the latest version of datacleaner
147 | 
148 |     Returns
149 |     ----------
150 |     output_training_dataframe: pandas.DataFrame
151 |         Cleaned training data set
152 |     output_testing_dataframe: pandas.DataFrame
153 |         Cleaned testing data set
154 | ```
155 | 
156 | Below is an example of datacleaner performing basic cleaning on a data set.
157 | 
158 | ```python
159 | from datacleaner import autoclean
160 | import pandas as pd
161 | 
162 | my_data = pd.read_csv('my_data.csv', sep=',')
163 | my_clean_data = autoclean(my_data)
164 | my_data.to_csv('my_clean_data.csv', sep=',', index=False)
165 | ```
166 | 
167 | Note that because datacleaner works directly on [pandas DataFrames](http://pandas.pydata.org/pandas-docs/stable/10min.html), all [DataFrame operations](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html) are still available to the resulting data sets.
168 | 
169 | ## Contributing to datacleaner
170 | 
171 | We welcome you to [check the existing issues](https://github.com/rhiever/datacleaner/issues/) for bugs or enhancements to work on. If you have an idea for an extension to datacleaner, please [file a new issue](https://github.com/rhiever/datacleaner/issues/new) so we can discuss it.
172 | 
173 | ## Citing datacleaner
174 | 
175 | If you use datacleaner as part of your workflow in a scientific publication, please consider citing the datacleaner repository with the following DOI:
176 | 
177 | [![DOI](https://zenodo.org/badge/20747/rhiever/datacleaner.svg)](https://zenodo.org/badge/latestdoi/20747/rhiever/datacleaner)
178 | 


--------------------------------------------------------------------------------
/adult.csv.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rhiever/datacleaner/f6f92d763ab385013b72776acf990857d4949e66/adult.csv.gz


--------------------------------------------------------------------------------
/ci/.travis_install.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # modified from https://github.com/trevorstephens/gplearn
 4 | 
 5 | # This script is meant to be called by the "install" step defined in
 6 | # .travis.yml. See http://docs.travis-ci.com/ for more details.
 7 | # The behavior of the script is controlled by environment variabled defined
 8 | # in the .travis.yml in the top level folder of the project.
 9 | 
10 | 
11 | # License: GNU/GPLv3
12 | 
13 | set -e
14 | 
15 | # Fix the compilers to workaround avoid having the Python 3.4 build
16 | # lookup for g++44 unexpectedly.
17 | export CC=gcc
18 | export CXX=g++
19 | 
20 | # Deactivate the travis-provided virtual environment and setup a
21 | # conda-based environment instead
22 | deactivate
23 | 
24 | # Use the miniconda installer for faster download / install of conda
25 | # itself
26 | wget http://repo.continuum.io/miniconda/Miniconda-3.9.1-Linux-x86_64.sh \
27 |     -O miniconda.sh
28 | chmod +x miniconda.sh && ./miniconda.sh -b
29 | export PATH=/home/travis/miniconda/bin:$PATH
30 | conda update --yes conda
31 | 
32 | # Configure the conda environment and put it in the path using the
33 | # provided versions
34 | if [[ "$LATEST" == "true" ]]; then
35 |     conda create -n testenv --yes python=$PYTHON_VERSION pip nose \
36 |         numpy scipy scikit-learn cython pandas
37 | else
38 |     conda create -n testenv --yes python=$PYTHON_VERSION pip nose \
39 |         numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION \
40 |         scikit-learn=$SKLEARN_VERSION \
41 | 	      pandas=$PANDAS_VERSION \
42 |         cython
43 | fi
44 | 
45 | source activate testenv
46 | 
47 | pip install update_checker
48 | 
49 | if [[ "$COVERAGE" == "true" ]]; then
50 |     pip install coverage coveralls
51 | fi
52 | 
53 | # build output in the travis output when it succeeds.
54 | python --version
55 | python -c "import numpy; print('numpy %s' % numpy.__version__)"
56 | python -c "import scipy; print('scipy %s' % scipy.__version__)"
57 | python -c "import sklearn; print('sklearn %s' % sklearn.__version__)"
58 | python -c "import pandas; print('pandas %s' % pandas.__version__)"
59 | python -c "import update_checker; print('update_checker %s ' % update_checker.__version__)"
60 | python setup.py build_ext --inplace
61 | 


--------------------------------------------------------------------------------
/ci/.travis_test.sh:
--------------------------------------------------------------------------------
 1 | # modified from https://github.com/trevorstephens/gplearn
 2 | 
 3 | # This script is meant to be called by the "install" step defined in
 4 | # .travis.yml. See http://docs.travis-ci.com/ for more details.
 5 | # The behavior of the script is controlled by environment variabled defined
 6 | # in the .travis.yml in the top level folder of the project.
 7 | 
 8 | # License: GNU/GPLv3
 9 | 
10 | set -e
11 | 
12 | python --version
13 | python -c "import numpy; print('numpy %s' % numpy.__version__)"
14 | python -c "import scipy; print('scipy %s' % scipy.__version__)"
15 | python -c "import sklearn; print('sklearn %s' % sklearn.__version__)"
16 | python -c "import pandas; print('pandas %s' % pandas.__version__)"
17 | python -c "import update_checker; print('update_checker %s ' % update_checker.__version__)"
18 | 
19 | if [[ "$COVERAGE" == "true" ]]; then
20 |     nosetests -s -v --with-coverage
21 | else
22 |     nosetests -s -v
23 | fi
24 | #make test-doc test-sphinxext
25 | 


--------------------------------------------------------------------------------
/datacleaner/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | """
 4 | Copyright (c) 2016 Randal S. Olson
 5 | 
 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software
 7 | and associated documentation files (the "Software"), to deal in the Software without restriction,
 8 | including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
 9 | and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so,
10 | subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all copies or substantial
13 | portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
16 | LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
17 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
19 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
20 | """
21 | 
22 | from ._version import __version__
23 | from .datacleaner import autoclean, autoclean_cv, main
24 | 


--------------------------------------------------------------------------------
/datacleaner/_version.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | """
 4 | Copyright (c) 2016 Randal S. Olson
 5 | 
 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software
 7 | and associated documentation files (the "Software"), to deal in the Software without restriction,
 8 | including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
 9 | and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so,
10 | subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all copies or substantial
13 | portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
16 | LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
17 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
19 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
20 | """
21 | 
22 | __version__ = '0.1.5'
23 | 


--------------------------------------------------------------------------------
/datacleaner/datacleaner.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | """
  4 | Copyright (c) 2016 Randal S. Olson
  5 | 
  6 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software
  7 | and associated documentation files (the "Software"), to deal in the Software without restriction,
  8 | including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
  9 | and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so,
 10 | subject to the following conditions:
 11 | 
 12 | The above copyright notice and this permission notice shall be included in all copies or substantial
 13 | portions of the Software.
 14 | 
 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
 16 | LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 17 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 18 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 19 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 20 | """
 21 | 
 22 | from __future__ import print_function
 23 | import pandas as pd
 24 | from sklearn.preprocessing import LabelEncoder
 25 | import argparse
 26 | from update_checker import update_check
 27 | 
 28 | from ._version import __version__
 29 | 
 30 | update_checked = False
 31 | 
 32 | def autoclean(input_dataframe, drop_nans=False, copy=False, encoder=None,
 33 |               encoder_kwargs=None, ignore_update_check=False):
 34 |     """Performs a series of automated data cleaning transformations on the provided data set
 35 | 
 36 |     Parameters
 37 |     ----------
 38 |     input_dataframe: pandas.DataFrame
 39 |         Data set to clean
 40 |     drop_nans: bool
 41 |         Drop all rows that have a NaN in any column (default: False)
 42 |     copy: bool
 43 |         Make a copy of the data set (default: False)
 44 |     encoder: category_encoders transformer
 45 |         The a valid category_encoders transformer which is passed an inferred cols list. Default (None: LabelEncoder)
 46 |     encoder_kwargs: category_encoders
 47 |         The a valid sklearn transformer to encode categorical features. Default (None)
 48 |     ignore_update_check: bool
 49 |         Do not check for the latest version of datacleaner
 50 | 
 51 |     Returns
 52 |     ----------
 53 |     output_dataframe: pandas.DataFrame
 54 |         Cleaned data set
 55 | 
 56 |     """
 57 |     global update_checked
 58 |     if ignore_update_check:
 59 |         update_checked = True
 60 | 
 61 |     if not update_checked:
 62 |         update_check('datacleaner', __version__)
 63 |         update_checked = True
 64 | 
 65 |     if copy:
 66 |         input_dataframe = input_dataframe.copy()
 67 | 
 68 |     if drop_nans:
 69 |         input_dataframe.dropna(inplace=True)
 70 | 
 71 |     if encoder_kwargs is None:
 72 |         encoder_kwargs = {}
 73 | 
 74 |     for column in input_dataframe.columns.values:
 75 |         # Replace NaNs with the median or mode of the column depending on the column type
 76 |         try:
 77 |             input_dataframe[column].fillna(input_dataframe[column].median(), inplace=True)
 78 |         except TypeError:
 79 |             most_frequent = input_dataframe[column].mode()
 80 |             # If the mode can't be computed, use the nearest valid value
 81 |             # See https://github.com/rhiever/datacleaner/issues/8
 82 |             if len(most_frequent) > 0:
 83 |                 input_dataframe[column].fillna(input_dataframe[column].mode()[0], inplace=True)
 84 |             else:
 85 |                 input_dataframe[column].fillna(method='bfill', inplace=True)
 86 |                 input_dataframe[column].fillna(method='ffill', inplace=True)
 87 | 
 88 | 
 89 |         # Encode all strings with numerical equivalents
 90 |         if str(input_dataframe[column].values.dtype) == 'object':
 91 |             if encoder is not None:
 92 |                 column_encoder = encoder(**encoder_kwargs).fit(input_dataframe[column].values)
 93 |             else:
 94 |                 column_encoder = LabelEncoder().fit(input_dataframe[column].values)
 95 | 
 96 |             input_dataframe[column] = column_encoder.transform(input_dataframe[column].values)
 97 | 
 98 |     return input_dataframe
 99 | 
100 | def autoclean_cv(training_dataframe, testing_dataframe, drop_nans=False, copy=False,
101 |                  encoder=None, encoder_kwargs=None, ignore_update_check=False):
102 |     """Performs a series of automated data cleaning transformations on the provided training and testing data sets
103 | 
104 |     Unlike `autoclean()`, this function takes cross-validation into account by learning the data transformations
105 |     from only the training set, then applying those transformations to both the training and testing set.
106 |     By doing so, this function will prevent information leak from the training set into the testing set.
107 | 
108 |     Parameters
109 |     ----------
110 |     training_dataframe: pandas.DataFrame
111 |         Training data set
112 |     testing_dataframe: pandas.DataFrame
113 |         Testing data set
114 |     drop_nans: bool
115 |         Drop all rows that have a NaN in any column (default: False)
116 |     copy: bool
117 |         Make a copy of the data set (default: False)
118 |     encoder: category_encoders transformer
119 |         The a valid category_encoders transformer which is passed an inferred cols list. Default (None: LabelEncoder)
120 |     encoder_kwargs: category_encoders
121 |         The a valid sklearn transformer to encode categorical features. Default (None)
122 |     ignore_update_check: bool
123 |         Do not check for the latest version of datacleaner
124 | 
125 |     Returns
126 |     ----------
127 |     output_training_dataframe: pandas.DataFrame
128 |         Cleaned training data set
129 |     output_testing_dataframe: pandas.DataFrame
130 |         Cleaned testing data set
131 | 
132 |     """
133 |     global update_checked
134 |     if ignore_update_check:
135 |         update_checked = True
136 | 
137 |     if not update_checked:
138 |         update_check('datacleaner', __version__)
139 |         update_checked = True
140 | 
141 |     if set(training_dataframe.columns.values) != set(testing_dataframe.columns.values):
142 |         raise ValueError('The training and testing DataFrames do not have the same columns. '
143 |                          'Make sure that you are providing the same columns.')
144 | 
145 |     if copy:
146 |         training_dataframe = training_dataframe.copy()
147 |         testing_dataframe = testing_dataframe.copy()
148 |     
149 |     if drop_nans:
150 |         training_dataframe.dropna(inplace=True)
151 |         testing_dataframe.dropna(inplace=True)
152 | 
153 |     if encoder_kwargs is None:
154 |         encoder_kwargs = {}
155 | 
156 |     for column in training_dataframe.columns.values:
157 |         # Replace NaNs with the median or mode of the column depending on the column type
158 |         try:
159 |             column_median = training_dataframe[column].median()
160 |             training_dataframe[column].fillna(column_median, inplace=True)
161 |             testing_dataframe[column].fillna(column_median, inplace=True)
162 |         except TypeError:
163 |             column_mode = training_dataframe[column].mode()[0]
164 |             training_dataframe[column].fillna(column_mode, inplace=True)
165 |             testing_dataframe[column].fillna(column_mode, inplace=True)
166 | 
167 |         # Encode all strings with numerical equivalents
168 |         if str(training_dataframe[column].values.dtype) == 'object':
169 |             if encoder is not None:
170 |                 column_encoder = encoder(**encoder_kwargs).fit(training_dataframe[column].values)
171 |             else:
172 |                 column_encoder = LabelEncoder().fit(training_dataframe[column].values)
173 | 
174 |             training_dataframe[column] = column_encoder.transform(training_dataframe[column].values)
175 |             testing_dataframe[column] = column_encoder.transform(testing_dataframe[column].values)
176 | 
177 |     return training_dataframe, testing_dataframe
178 | 
179 | 
180 | def main():
181 |     """Main function that is called when datacleaner is run on the command line"""
182 |     parser = argparse.ArgumentParser(description='A Python tool that automatically cleans data sets and readies them for analysis')
183 | 
184 |     parser.add_argument('INPUT_FILENAME', type=str, help='File name of the data file to clean')
185 | 
186 |     parser.add_argument('-cv', action='store', dest='CROSS_VAL_FILENAME', default=None,
187 |                          type=str, help='File name for the validation data set if performing cross-validation')
188 | 
189 |     parser.add_argument('-o', action='store', dest='OUTPUT_FILENAME', default=None,
190 |                         type=str, help='Data file to output the cleaned data set to')
191 | 
192 |     parser.add_argument('-cvo', action='store', dest='CV_OUTPUT_FILENAME', default=None,
193 |                         type=str, help='Data file to output the cleaned cross-validation data set to')
194 | 
195 |     parser.add_argument('-is', action='store', dest='INPUT_SEPARATOR', default='\t',
196 |                         type=str, help='Column separator for the input file(s) (default: \\t)')
197 |                     
198 |     parser.add_argument('-os', action='store', dest='OUTPUT_SEPARATOR', default='\t',
199 |                         type=str, help='Column separator for the output file(s) (default: \\t)')
200 | 
201 |     parser.add_argument('--drop-nans', action='store_true', dest='DROP_NANS', default=False,
202 |                         help='Drop all rows that have a NaN in any column (default: False)')
203 |                         
204 |     parser.add_argument('--ignore-update-check', action='store_true', dest='IGNORE_UPDATE_CHECK', default=False,
205 |                         help='Do not check for the latest version of datacleaner (default: False)')
206 | 
207 |     parser.add_argument('--version', action='version', version='datacleaner v{version}'.format(version=__version__))
208 | 
209 |     args = parser.parse_args()
210 | 
211 |     input_data = pd.read_csv(args.INPUT_FILENAME, sep=args.INPUT_SEPARATOR)
212 |     if args.CROSS_VAL_FILENAME is None:
213 |         clean_data = autoclean(input_data, drop_nans=args.DROP_NANS, ignore_update_check=args.IGNORE_UPDATE_CHECK)
214 |         if args.OUTPUT_FILENAME is None:
215 |             print('Cleaned data set:')
216 |             print(clean_data)
217 |             print('')
218 |             print('If you cannot view the entire data set, output it to a file instead. '
219 |                   'Type datacleaner --help for more information.')
220 |         else:
221 |             clean_data.to_csv(args.OUTPUT_FILENAME, sep=args.OUTPUT_SEPARATOR, index=False)
222 |     else:
223 |         if args.OUTPUT_FILENAME is not None and args.CV_OUTPUT_FILENAME is None:
224 |             print('You must specify both output file names. Type datacleaner --help for more information.')
225 |             return
226 |     
227 |         cross_val_data = pd.read_csv(args.CROSS_VAL_FILENAME, sep=args.INPUT_SEPARATOR)
228 |         clean_training_data, clean_testing_data = autoclean_cv(input_data, cross_val_data,
229 |                                                                drop_nans=args.DROP_NANS,
230 |                                                                ignore_update_check=args.IGNORE_UPDATE_CHECK)
231 | 
232 |         if args.OUTPUT_FILENAME is None:
233 |             print('Cleaned training data set:')
234 |             print(clean_training_data)
235 |             print('')
236 |             print('Cleaned testing data set:')
237 |             print(clean_testing_data)
238 |             print('')
239 |             print('If you cannot view the entire data set, output it to a file instead. '
240 |                   'Type datacleaner --help for more information.')
241 |         else:
242 |             clean_training_data.to_csv(args.OUTPUT_FILENAME, sep=args.OUTPUT_SEPARATOR, index=False)
243 |             clean_testing_data.to_csv(args.OUTPUT_FILENAME, sep=args.OUTPUT_SEPARATOR, index=False)
244 | 
245 | if __name__ == '__main__':
246 |     main()
247 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | from setuptools import setup, find_packages
 4 | 
 5 | def calculate_version():
 6 |     initpy = open('datacleaner/_version.py').read().split('\n')
 7 |     version = list(filter(lambda x: '__version__' in x, initpy))[0].split('\'')[1]
 8 |     return version
 9 | 
10 | package_version = calculate_version()
11 | 
12 | setup(
13 |     name='datacleaner',
14 |     version=package_version,
15 |     author='Randal S. Olson',
16 |     author_email='rso@randalolson.com',
17 |     packages=find_packages(),
18 |     url='https://github.com/rhiever/datacleaner',
19 |     license='License :: OSI Approved :: MIT License',
20 |     entry_points={'console_scripts': ['datacleaner=datacleaner:main', ]},
21 |     description=('A Python tool that automatically cleans data sets and readies them for analysis.'),
22 |     long_description='''
23 | A Python tool that automatically cleans data sets and readies them for analysis.
24 | 
25 | Contact
26 | =============
27 | If you have any questions or comments about datacleaner, please feel free to contact me via:
28 | 
29 | E-mail: rso@randalolson.com
30 | 
31 | or Twitter: https://twitter.com/randal_olson
32 | 
33 | This project is hosted at https://github.com/rhiever/datacleaner
34 | ''',
35 |     zip_safe=True,
36 |     install_requires=['pandas', 'scikit-learn', 'update_checker'],
37 |     classifiers=[
38 |         'Intended Audience :: Developers',
39 |         'Intended Audience :: Information Technology',
40 |         'Intended Audience :: Science/Research',
41 |         'License :: OSI Approved :: MIT License',
42 |         'Programming Language :: Python :: 2',
43 |         'Programming Language :: Python :: 2.7',
44 |         'Programming Language :: Python :: 3',
45 |         'Programming Language :: Python :: 3.4',
46 |         'Programming Language :: Python :: 3.5',
47 |         'Topic :: Utilities'
48 |     ],
49 |     keywords=['data cleaning', 'csv', 'machine learning', 'data analysis', 'data engineering'],
50 | )
51 | 


--------------------------------------------------------------------------------
/tests.py:
--------------------------------------------------------------------------------
  1 | from datacleaner import autoclean, autoclean_cv
  2 | import pandas as pd
  3 | import numpy as np
  4 | from sklearn.preprocessing import LabelEncoder
  5 | 
  6 | np.random.seed(300)
  7 | 
  8 | def test_autoclean_already_clean_data():
  9 |     """Test autoclean() with already-clean data"""
 10 |     data = pd.DataFrame({'A': np.random.rand(1000),
 11 |                          'B': np.random.rand(1000),
 12 |                          'C': np.random.randint(0, 3, 1000)})
 13 | 
 14 |     cleaned_data = autoclean(data)
 15 | 
 16 |     # autoclean() should not change the data at all
 17 |     assert cleaned_data.equals(data)
 18 | 
 19 | def test_autoclean_cv_already_clean_data():
 20 |     """Test autoclean_cv() with already-clean data"""
 21 |     data = pd.DataFrame({'A': np.random.rand(1000),
 22 |                          'B': np.random.rand(1000),
 23 |                          'C': np.random.randint(0, 3, 1000)})
 24 | 
 25 |     training_data = data[:500].copy()
 26 |     testing_data = data[500:].copy()
 27 | 
 28 |     cleaned_training_data, cleaned_testing_data = autoclean_cv(training_data, testing_data)
 29 | 
 30 |     # autoclean_cv() should not change the data at all
 31 |     assert cleaned_training_data.equals(training_data)
 32 |     assert cleaned_testing_data.equals(testing_data)
 33 | 
 34 | def test_autoclean_with_nans_all_numerical():
 35 |     """Test autoclean() with a data set that has all numerical values and some NaNs"""
 36 |     data = pd.DataFrame({'A': np.random.rand(1000),
 37 |                          'B': np.random.rand(1000),
 38 |                          'C': np.random.randint(0, 3, 1000)})
 39 | 
 40 |     data.loc[10:20, 'A'] = np.nan
 41 |     data.loc[50:70, 'C'] = np.nan
 42 | 
 43 |     hand_cleaned_data = data.copy()
 44 |     hand_cleaned_data['A'].fillna(hand_cleaned_data['A'].median(), inplace=True)
 45 |     hand_cleaned_data['C'].fillna(hand_cleaned_data['C'].median(), inplace=True)
 46 | 
 47 |     cleaned_data = autoclean(data)
 48 | 
 49 |     assert cleaned_data.equals(hand_cleaned_data)
 50 | 
 51 | def test_autoclean_cv_with_nans_all_numerical():
 52 |     """Test autoclean_cv() with a data set that has all numerical values and some NaNs"""
 53 |     data = pd.DataFrame({'A': np.random.rand(1000),
 54 |                          'B': np.random.rand(1000),
 55 |                          'C': np.random.randint(0, 3, 1000)})
 56 | 
 57 |     training_data = data[:500].copy()
 58 |     testing_data = data[500:].copy()
 59 | 
 60 |     training_data.loc[10:20, 'A'] = np.nan
 61 |     training_data.loc[50:70, 'C'] = np.nan
 62 | 
 63 |     testing_data.loc[70:80, 'A'] = np.nan
 64 |     testing_data.loc[10:40, 'C'] = np.nan
 65 | 
 66 |     hand_cleaned_training_data = training_data.copy()
 67 |     hand_cleaned_testing_data = testing_data.copy()
 68 | 
 69 |     training_A_median = hand_cleaned_training_data['A'].median()
 70 |     training_C_median = hand_cleaned_training_data['C'].median()
 71 | 
 72 |     hand_cleaned_training_data['A'].fillna(training_A_median, inplace=True)
 73 |     hand_cleaned_training_data['C'].fillna(training_C_median, inplace=True)
 74 | 
 75 |     hand_cleaned_testing_data['A'].fillna(training_A_median, inplace=True)
 76 |     hand_cleaned_testing_data['C'].fillna(training_C_median, inplace=True)
 77 | 
 78 |     cleaned_training_data, cleaned_testing_data = autoclean_cv(training_data, testing_data)
 79 | 
 80 |     assert cleaned_training_data.equals(hand_cleaned_training_data)
 81 |     assert cleaned_testing_data.equals(hand_cleaned_testing_data)
 82 | 
 83 | def test_autoclean_no_nans_with_strings():
 84 |     """Test autoclean() with a data set that has some string-encoded categorical values and no NaNs"""
 85 |     data = pd.DataFrame({'A': np.random.rand(1000),
 86 |                          'B': np.random.rand(1000),
 87 |                          'C': np.random.randint(0, 3, 1000)})
 88 | 
 89 |     string_map = {0: 'oranges', 1: 'apples', 2: 'bananas'}
 90 |     data['C'] = data['C'].apply(lambda x: string_map[x])
 91 | 
 92 |     hand_cleaned_data = data.copy()
 93 |     hand_cleaned_data['C'] = LabelEncoder().fit_transform(hand_cleaned_data['C'].values)
 94 | 
 95 |     cleaned_data = autoclean(data)
 96 | 
 97 |     assert cleaned_data.equals(hand_cleaned_data)
 98 | 
 99 | def test_autoclean_cv_no_nans_with_strings():
100 |     """Test autoclean_cv() with a data set that has some string-encoded categorical values and no NaNs"""
101 |     data = pd.DataFrame({'A': np.random.rand(1000),
102 |                          'B': np.random.rand(1000),
103 |                          'C': np.random.randint(0, 3, 1000)})
104 | 
105 |     string_map = {0: 'oranges', 1: 'apples', 2: 'bananas'}
106 |     data['C'] = data['C'].apply(lambda x: string_map[x])
107 | 
108 |     training_data = data[:500].copy()
109 |     testing_data = data[500:].copy()
110 | 
111 |     cleaned_training_data, cleaned_testing_data = autoclean_cv(training_data, testing_data)
112 | 
113 |     hand_cleaned_training_data = training_data.copy()
114 |     hand_cleaned_testing_data = testing_data.copy()
115 | 
116 |     encoder = LabelEncoder()
117 |     hand_cleaned_training_data['C'] = encoder.fit_transform(hand_cleaned_training_data['C'].values)
118 |     hand_cleaned_testing_data['C'] = encoder.transform(hand_cleaned_testing_data['C'].values)
119 | 
120 |     assert cleaned_training_data.equals(hand_cleaned_training_data)
121 |     assert cleaned_testing_data.equals(hand_cleaned_testing_data)
122 | 
123 | def test_autoclean_with_nans_with_strings():
124 |     """Test autoclean() with a data set that has some string-encoded categorical values and some NaNs"""
125 |     data = pd.DataFrame({'A': np.random.rand(1000),
126 |                          'B': np.random.rand(1000),
127 |                          'C': np.random.randint(0, 3, 1000)})
128 | 
129 |     string_map = {0: 'oranges', 1: 'apples', 2: 'bananas'}
130 |     data['C'] = data['C'].apply(lambda x: string_map[x])
131 | 
132 |     data.loc[10:20, 'A'] = np.nan
133 |     data.loc[50:70, 'C'] = np.nan
134 | 
135 |     hand_cleaned_data = data.copy()
136 |     hand_cleaned_data['A'].fillna(hand_cleaned_data['A'].median(), inplace=True)
137 |     hand_cleaned_data['C'].fillna(hand_cleaned_data['C'].mode()[0], inplace=True)
138 |     hand_cleaned_data['C'] = LabelEncoder().fit_transform(hand_cleaned_data['C'].values)
139 | 
140 |     cleaned_data = autoclean(data)
141 | 
142 |     assert cleaned_data.equals(hand_cleaned_data)
143 | 
144 | def test_autoclean_cv_with_nans_with_strings():
145 |     """Test autoclean_cv() with a data set that has some string-encoded categorical values and some NaNs"""
146 |     data = pd.DataFrame({'A': np.random.rand(1000),
147 |                          'B': np.random.rand(1000),
148 |                          'C': np.random.randint(0, 3, 1000)})
149 | 
150 |     string_map = {0: 'oranges', 1: 'apples', 2: 'bananas'}
151 |     data['C'] = data['C'].apply(lambda x: string_map[x])
152 | 
153 |     training_data = data[:500].copy()
154 |     testing_data = data[500:].copy()
155 | 
156 |     training_data.loc[10:20, 'A'] = np.nan
157 |     training_data.loc[50:70, 'C'] = np.nan
158 | 
159 |     testing_data.loc[70:80, 'A'] = np.nan
160 |     testing_data.loc[10:40, 'C'] = np.nan
161 | 
162 |     hand_cleaned_training_data = training_data.copy()
163 |     hand_cleaned_testing_data = testing_data.copy()
164 | 
165 |     training_A_median = hand_cleaned_training_data['A'].median()
166 |     training_C_mode = hand_cleaned_training_data['C'].mode()[0]
167 |     hand_cleaned_training_data['A'].fillna(training_A_median, inplace=True)
168 |     hand_cleaned_training_data['C'].fillna(training_C_mode, inplace=True)
169 | 
170 |     hand_cleaned_testing_data['A'].fillna(training_A_median, inplace=True)
171 |     hand_cleaned_testing_data['C'].fillna(training_C_mode, inplace=True)
172 | 
173 |     encoder = LabelEncoder()
174 |     hand_cleaned_training_data['C'] = encoder.fit_transform(hand_cleaned_training_data['C'].values)
175 |     hand_cleaned_testing_data['C'] = encoder.transform(hand_cleaned_testing_data['C'].values)
176 | 
177 |     cleaned_training_data, cleaned_testing_data = autoclean_cv(training_data, testing_data)
178 | 
179 |     assert cleaned_training_data.equals(hand_cleaned_training_data)
180 |     assert cleaned_testing_data.equals(hand_cleaned_testing_data)
181 | 
182 | def test_autoclean_real_data():
183 |     """Test autoclean() with the adult data set"""
184 |     adult_data = pd.read_csv('adult.csv.gz', sep='\t', compression='gzip')
185 |     adult_data.loc[30:60, 'age'] = np.nan
186 |     adult_data.loc[90:100, 'education'] = np.nan
187 | 
188 |     hand_cleaned_adult_data = adult_data.copy()
189 | 
190 |     hand_cleaned_adult_data['age'].fillna(hand_cleaned_adult_data['age'].median(), inplace=True)
191 |     hand_cleaned_adult_data['education'].fillna(hand_cleaned_adult_data['education'].mode()[0], inplace=True)
192 | 
193 |     for column in ['workclass', 'education', 'marital-status',
194 |                    'occupation', 'relationship', 'race',
195 |                    'sex', 'native-country', 'label']:
196 |         hand_cleaned_adult_data[column] = LabelEncoder().fit_transform(hand_cleaned_adult_data[column].values)
197 | 
198 |     cleaned_adult_data = autoclean(adult_data)
199 | 
200 |     assert cleaned_adult_data.equals(hand_cleaned_adult_data)
201 | 
202 | def test_autoclean_cv_real_data():
203 |     """Test autoclean_cv() with the adult data set"""
204 |     adult_data = pd.read_csv('adult.csv.gz', sep='\t', compression='gzip')
205 | 
206 |     training_adult_data = adult_data[:int(len(adult_data) / 2.)].copy()
207 |     testing_adult_data = adult_data[int(len(adult_data) / 2.):].copy()
208 | 
209 |     training_adult_data.loc[30:60, 'age'] = np.nan
210 |     training_adult_data.loc[90:100, 'education'] = np.nan
211 | 
212 |     testing_adult_data.loc[90:110, 'age'] = np.nan
213 |     testing_adult_data.loc[20:40, 'education'] = np.nan
214 | 
215 |     hand_cleaned_training_adult_data = training_adult_data.copy()
216 |     hand_cleaned_testing_adult_data = testing_adult_data.copy()
217 | 
218 |     training_age_median = hand_cleaned_training_adult_data['age'].median()
219 |     training_education_mode = hand_cleaned_training_adult_data['education'].mode()[0]
220 | 
221 |     hand_cleaned_training_adult_data['age'].fillna(training_age_median, inplace=True)
222 |     hand_cleaned_training_adult_data['education'].fillna(training_education_mode, inplace=True)
223 | 
224 |     hand_cleaned_testing_adult_data['age'].fillna(training_age_median, inplace=True)
225 |     hand_cleaned_testing_adult_data['education'].fillna(training_education_mode, inplace=True)
226 | 
227 |     for column in ['workclass', 'education', 'marital-status',
228 |                    'occupation', 'relationship', 'race',
229 |                    'sex', 'native-country', 'label']:
230 |         encoder = LabelEncoder()
231 |         hand_cleaned_training_adult_data[column] = encoder.fit_transform(hand_cleaned_training_adult_data[column].values)
232 |         hand_cleaned_testing_adult_data[column] = encoder.transform(hand_cleaned_testing_adult_data[column].values)
233 | 
234 |     cleaned_adult_training_data, cleaned_adult_testing_data = autoclean_cv(training_adult_data, testing_adult_data)
235 | 
236 |     assert cleaned_adult_training_data.equals(hand_cleaned_training_adult_data)
237 |     assert cleaned_adult_testing_data.equals(hand_cleaned_testing_adult_data)
238 | 


--------------------------------------------------------------------------------