├── .coveragerc ├── .gitignore ├── .landscape.yaml ├── .travis.yml ├── ISSUE_TEMPLATE.md ├── LICENSE ├── MANIFEST.in ├── PULL_REQUEST_TEMPLATE.md ├── README.md ├── adult.csv.gz ├── ci ├── .travis_install.sh └── .travis_test.sh ├── datacleaner ├── __init__.py ├── _version.py └── datacleaner.py ├── setup.py └── tests.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | source = datacleaner 4 | include = */datacleaner/* 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | #Ipython Notebook 62 | .ipynb_checkpoints 63 | 64 | test_data/adult.csv.gz 65 | 66 | testing.ipynb 67 | 68 | tests.ipynb 69 | -------------------------------------------------------------------------------- /.landscape.yaml: -------------------------------------------------------------------------------- 1 | doc-warnings: yes 2 | 3 | ignore-patterns: 4 | - __init__.py 5 | 6 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | virtualenv: 3 | system_site_packages: true 4 | env: 5 | matrix: 6 | # let's start simple: 7 | - PYTHON_VERSION="2.7" LATEST="true" 8 | - PYTHON_VERSION="3.4" LATEST="true" 9 | - PYTHON_VERSION="3.5" COVERAGE="true" LATEST="true" 10 | - PYTHON_VERSION="3.5" LATEST="true" 11 | install: source ./ci/.travis_install.sh 12 | script: bash ./ci/.travis_test.sh 13 | after_success: 14 | # Ignore coveralls failures as the coveralls server is not very reliable 15 | # but we don't want travis to report a failure in the github UI just 16 | # because the coverage report failed to be published. 
17 | - if [[ "$COVERAGE" == "true" ]]; then coveralls || echo "failed"; fi 18 | cache: apt 19 | sudo: false 20 | -------------------------------------------------------------------------------- /ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | [provide general introduction to the issue and why it is relevant to this repository] 2 | 3 | ## Context of the issue 4 | 5 | [provide more detailed introduction to the issue itself and why it is relevant] 6 | 7 | [the remaining entries are only necessary if you are reporting a bug] 8 | 9 | ## Process to reproduce the issue 10 | 11 | [ordered list the process to finding and recreating the issue, example below] 12 | 13 | 1. User creates TPOT instance 14 | 2. User calls TPOT `fit()` function with training data 15 | 3. TPOT crashes with a `KeyError` after 5 generations 16 | 17 | ## Expected result 18 | 19 | [describe what you would expect to have resulted from this process] 20 | 21 | ## Current result 22 | 23 | [describe what you currently experience from this process, and thereby explain the bug] 24 | 25 | ## Possible fix 26 | 27 | [not necessary, but suggest fixes or reasons for the bug] 28 | 29 | ## `name of issue` screenshot 30 | 31 | [if relevant, include a screenshot] 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Randy Olson 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | 
The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md LICENSE 2 | recursive-include datacleaner *.py 3 | -------------------------------------------------------------------------------- /PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## What does this PR do? 2 | 3 | 4 | 5 | ## Where should the reviewer start? 6 | 7 | 8 | 9 | ## How should this PR be tested? 10 | 11 | 12 | 13 | ## Any background context you want to provide? 14 | 15 | 16 | 17 | ## What are the relevant issues? 18 | 19 | [you can link directly to issues by entering # then the number of the issue, for example, #3 links to issue 3] 20 | 21 | ## Screenshots (if appropriate) 22 | 23 | 24 | 25 | ## Questions: 26 | 27 | - Do the docs need to be updated? 28 | - Does this PR add new (Python) dependencies? 
29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/rhiever/datacleaner.svg?branch=master)](https://travis-ci.org/rhiever/datacleaner) 2 | [![Code Health](https://landscape.io/github/rhiever/datacleaner/master/landscape.svg?style=flat)](https://landscape.io/github/rhiever/datacleaner/master) 3 | [![Coverage Status](https://coveralls.io/repos/github/rhiever/datacleaner/badge.svg?branch=master)](https://coveralls.io/github/rhiever/datacleaner?branch=master) 4 | ![Python 2.7](https://img.shields.io/badge/python-2.7-blue.svg) 5 | ![Python 3.5](https://img.shields.io/badge/python-3.5-blue.svg) 6 | ![License](https://img.shields.io/badge/license-MIT%20License-blue.svg) 7 | [![PyPI version](https://badge.fury.io/py/datacleaner.svg)](https://badge.fury.io/py/datacleaner) 8 | 9 | 10 | # datacleaner 11 | 12 | [![Join the chat at https://gitter.im/rhiever/datacleaner](https://badges.gitter.im/rhiever/datacleaner.svg)](https://gitter.im/rhiever/datacleaner?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) 13 | 14 | A Python tool that automatically cleans data sets and readies them for analysis. 15 | 16 | ## datacleaner is not magic 17 | 18 | datacleaner works with data in [pandas DataFrames](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html). 19 | 20 | datacleaner is not magic, and it won't take an unorganized blob of text and automagically parse it out for you. 21 | 22 | What datacleaner *will* do is save you a ton of time encoding and cleaning your data once it's already in a format that pandas DataFrames can handle. 
23 | 24 | Currently, datacleaner does the following: 25 | 26 | * Optionally drops any row with a missing value 27 | 28 | * Replaces missing values with the mode (for categorical variables) or median (for continuous variables) on a column-by-column basis 29 | 30 | * Encodes non-numerical variables (e.g., categorical variables with strings) with numerical equivalents 31 | 32 | We plan to add more cleaning features as the project grows. 33 | 34 | ## License 35 | 36 | Please see the [repository license](https://github.com/rhiever/datacleaner/blob/master/LICENSE) for the licensing and usage information for datacleaner. 37 | 38 | Generally, we have licensed datacleaner to make it as widely usable as possible. 39 | 40 | ## Installation 41 | 42 | datacleaner is built to use pandas DataFrames and some scikit-learn modules for data preprocessing. As such, we recommend installing the [Anaconda Python distribution](https://www.continuum.io/downloads) prior to installing datacleaner. 43 | 44 | Once the prerequisites are installed, datacleaner can be installed with a simple `pip` command: 45 | 46 | ``` 47 | pip install datacleaner 48 | ``` 49 | 50 | ## Usage 51 | 52 | ### datacleaner on the command line 53 | 54 | datacleaner can be used on the command line. Use `--help` to see its usage instructions. 
55 | 56 | ``` 57 | usage: datacleaner [-h] [-cv CROSS_VAL_FILENAME] [-o OUTPUT_FILENAME] 58 | [-cvo CV_OUTPUT_FILENAME] [-is INPUT_SEPARATOR] 59 | [-os OUTPUT_SEPARATOR] [--drop-nans] 60 | [--ignore-update-check] [--version] 61 | INPUT_FILENAME 62 | 63 | A Python tool that automatically cleans data sets and readies them for analysis 64 | 65 | positional arguments: 66 | INPUT_FILENAME File name of the data file to clean 67 | 68 | optional arguments: 69 | -h, --help show this help message and exit 70 | -cv CROSS_VAL_FILENAME 71 | File name for the validation data set if performing 72 | cross-validation 73 | -o OUTPUT_FILENAME Data file to output the cleaned data set to 74 | -cvo CV_OUTPUT_FILENAME 75 | Data file to output the cleaned cross-validation data 76 | set to 77 | -is INPUT_SEPARATOR Column separator for the input file(s) (default: \t) 78 | -os OUTPUT_SEPARATOR Column separator for the output file(s) (default: \t) 79 | --drop-nans Drop all rows that have a NaN in any column (default: False) 80 | --ignore-update-check 81 | Do not check for the latest version of datacleaner 82 | (default: False) 83 | --version show program's version number and exit 84 | ``` 85 | 86 | An example command-line call to datacleaner may look like: 87 | 88 | ``` 89 | datacleaner my_data.csv -o my_clean.data.csv -is , -os , 90 | ``` 91 | 92 | which will read the data from `my_data.csv` (assuming columns are separated by commas), clean the data set, then output the resulting data set to `my_clean.data.csv`. 93 | 94 | ### datacleaner in scripts 95 | 96 | datacleaner can also be used as part of a script. There are two primary functions implemented in datacleaner: `autoclean` and `autoclean_cv`. 
97 | 98 | ``` 99 | autoclean(input_dataframe, drop_nans=False, copy=False, ignore_update_check=False) 100 | Performs a series of automated data cleaning transformations on the provided data set 101 | 102 | Parameters 103 | ---------- 104 | input_dataframe: pandas.DataFrame 105 | Data set to clean 106 | drop_nans: bool 107 | Drop all rows that have a NaN in any column (default: False) 108 | copy: bool 109 | Make a copy of the data set (default: False) 110 | encoder: category_encoders transformer 111 | The a valid category_encoders transformer which is passed an inferred cols list. Default (None: LabelEncoder) 112 | encoder_kwargs: category_encoders 113 | The a valid sklearn transformer to encode categorical features. Default (None) 114 | ignore_update_check: bool 115 | Do not check for the latest version of datacleaner 116 | 117 | Returns 118 | ---------- 119 | output_dataframe: pandas.DataFrame 120 | Cleaned data set 121 | ``` 122 | 123 | ``` 124 | autoclean_cv(training_dataframe, testing_dataframe, drop_nans=False, copy=False, ignore_update_check=False) 125 | Performs a series of automated data cleaning transformations on the provided training and testing data sets 126 | 127 | Unlike `autoclean()`, this function takes cross-validation into account by learning the data transformations 128 | from only the training set, then applying those transformations to both the training and testing set. 129 | By doing so, this function will prevent information leak from the training set into the testing set. 130 | 131 | Parameters 132 | ---------- 133 | training_dataframe: pandas.DataFrame 134 | Training data set 135 | testing_dataframe: pandas.DataFrame 136 | Testing data set 137 | drop_nans: bool 138 | Drop all rows that have a NaN in any column (default: False) 139 | copy: bool 140 | Make a copy of the data set (default: False) 141 | encoder: category_encoders transformer 142 | The a valid category_encoders transformer which is passed an inferred cols list. 
Default (None: LabelEncoder) 143 | encoder_kwargs: category_encoders 144 | The a valid sklearn transformer to encode categorical features. Default (None) 145 | ignore_update_check: bool 146 | Do not check for the latest version of datacleaner 147 | 148 | Returns 149 | ---------- 150 | output_training_dataframe: pandas.DataFrame 151 | Cleaned training data set 152 | output_testing_dataframe: pandas.DataFrame 153 | Cleaned testing data set 154 | ``` 155 | 156 | Below is an example of datacleaner performing basic cleaning on a data set. 157 | 158 | ```python 159 | from datacleaner import autoclean 160 | import pandas as pd 161 | 162 | my_data = pd.read_csv('my_data.csv', sep=',') 163 | my_clean_data = autoclean(my_data) 164 | my_data.to_csv('my_clean_data.csv', sep=',', index=False) 165 | ``` 166 | 167 | Note that because datacleaner works directly on [pandas DataFrames](http://pandas.pydata.org/pandas-docs/stable/10min.html), all [DataFrame operations](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html) are still available to the resulting data sets. 168 | 169 | ## Contributing to datacleaner 170 | 171 | We welcome you to [check the existing issues](https://github.com/rhiever/datacleaner/issues/) for bugs or enhancements to work on. If you have an idea for an extension to datacleaner, please [file a new issue](https://github.com/rhiever/datacleaner/issues/new) so we can discuss it. 
172 | 173 | ## Citing datacleaner 174 | 175 | If you use datacleaner as part of your workflow in a scientific publication, please consider citing the datacleaner repository with the following DOI: 176 | 177 | [![DOI](https://zenodo.org/badge/20747/rhiever/datacleaner.svg)](https://zenodo.org/badge/latestdoi/20747/rhiever/datacleaner) 178 | -------------------------------------------------------------------------------- /adult.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rhiever/datacleaner/f6f92d763ab385013b72776acf990857d4949e66/adult.csv.gz -------------------------------------------------------------------------------- /ci/.travis_install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # modified from https://github.com/trevorstephens/gplearn 4 | 5 | # This script is meant to be called by the "install" step defined in 6 | # .travis.yml. See http://docs.travis-ci.com/ for more details. 7 | # The behavior of the script is controlled by environment variabled defined 8 | # in the .travis.yml in the top level folder of the project. 9 | 10 | 11 | # License: GNU/GPLv3 12 | 13 | set -e 14 | 15 | # Fix the compilers to workaround avoid having the Python 3.4 build 16 | # lookup for g++44 unexpectedly. 
17 | export CC=gcc 18 | export CXX=g++ 19 | 20 | # Deactivate the travis-provided virtual environment and setup a 21 | # conda-based environment instead 22 | deactivate 23 | 24 | # Use the miniconda installer for faster download / install of conda 25 | # itself 26 | wget http://repo.continuum.io/miniconda/Miniconda-3.9.1-Linux-x86_64.sh \ 27 | -O miniconda.sh 28 | chmod +x miniconda.sh && ./miniconda.sh -b 29 | export PATH=/home/travis/miniconda/bin:$PATH 30 | conda update --yes conda 31 | 32 | # Configure the conda environment and put it in the path using the 33 | # provided versions 34 | if [[ "$LATEST" == "true" ]]; then 35 | conda create -n testenv --yes python=$PYTHON_VERSION pip nose \ 36 | numpy scipy scikit-learn cython pandas 37 | else 38 | conda create -n testenv --yes python=$PYTHON_VERSION pip nose \ 39 | numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION \ 40 | scikit-learn=$SKLEARN_VERSION \ 41 | pandas=$PANDAS_VERSION \ 42 | cython 43 | fi 44 | 45 | source activate testenv 46 | 47 | pip install update_checker 48 | 49 | if [[ "$COVERAGE" == "true" ]]; then 50 | pip install coverage coveralls 51 | fi 52 | 53 | # build output in the travis output when it succeeds. 54 | python --version 55 | python -c "import numpy; print('numpy %s' % numpy.__version__)" 56 | python -c "import scipy; print('scipy %s' % scipy.__version__)" 57 | python -c "import sklearn; print('sklearn %s' % sklearn.__version__)" 58 | python -c "import pandas; print('pandas %s' % pandas.__version__)" 59 | python -c "import update_checker; print('update_checker %s ' % update_checker.__version__)" 60 | python setup.py build_ext --inplace 61 | -------------------------------------------------------------------------------- /ci/.travis_test.sh: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/trevorstephens/gplearn 2 | 3 | # This script is meant to be called by the "install" step defined in 4 | # .travis.yml. 
See http://docs.travis-ci.com/ for more details. 5 | # The behavior of the script is controlled by environment variabled defined 6 | # in the .travis.yml in the top level folder of the project. 7 | 8 | # License: GNU/GPLv3 9 | 10 | set -e 11 | 12 | python --version 13 | python -c "import numpy; print('numpy %s' % numpy.__version__)" 14 | python -c "import scipy; print('scipy %s' % scipy.__version__)" 15 | python -c "import sklearn; print('sklearn %s' % sklearn.__version__)" 16 | python -c "import pandas; print('pandas %s' % pandas.__version__)" 17 | python -c "import update_checker; print('update_checker %s ' % update_checker.__version__)" 18 | 19 | if [[ "$COVERAGE" == "true" ]]; then 20 | nosetests -s -v --with-coverage 21 | else 22 | nosetests -s -v 23 | fi 24 | #make test-doc test-sphinxext 25 | -------------------------------------------------------------------------------- /datacleaner/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Copyright (c) 2016 Randal S. Olson 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software 7 | and associated documentation files (the "Software"), to deal in the Software without restriction, 8 | including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all copies or substantial 13 | portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT 16 | LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
17 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 18 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 19 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | """ 21 | 22 | from ._version import __version__ 23 | from .datacleaner import autoclean, autoclean_cv, main 24 | -------------------------------------------------------------------------------- /datacleaner/_version.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Copyright (c) 2016 Randal S. Olson 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software 7 | and associated documentation files (the "Software"), to deal in the Software without restriction, 8 | including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all copies or substantial 13 | portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT 16 | LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 17 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 18 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 19 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
20 | """ 21 | 22 | __version__ = '0.1.5' 23 | -------------------------------------------------------------------------------- /datacleaner/datacleaner.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Copyright (c) 2016 Randal S. Olson 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software 7 | and associated documentation files (the "Software"), to deal in the Software without restriction, 8 | including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all copies or substantial 13 | portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT 16 | LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 17 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 18 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 19 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
"""Automatically cleans data sets and readies them for analysis.

Copyright (c) 2016 Randal S. Olson, released under the MIT License.
"""

from __future__ import print_function
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import argparse
from update_checker import update_check

from ._version import __version__

# Module-level flag so the PyPI update check runs at most once per process.
update_checked = False


def _maybe_check_for_updates(ignore_update_check):
    """Check PyPI for a newer datacleaner release, at most once per process.

    Parameters
    ----------
    ignore_update_check: bool
        If True, skip (and permanently disable) the update check.
    """
    global update_checked
    if ignore_update_check:
        update_checked = True

    if not update_checked:
        update_check('datacleaner', __version__)
        update_checked = True


def autoclean(input_dataframe, drop_nans=False, copy=False, encoder=None,
              encoder_kwargs=None, ignore_update_check=False):
    """Performs a series of automated data cleaning transformations on the provided data set

    Parameters
    ----------
    input_dataframe: pandas.DataFrame
        Data set to clean
    drop_nans: bool
        Drop all rows that have a NaN in any column (default: False)
    copy: bool
        Make a copy of the data set (default: False)
    encoder: category_encoders transformer
        A valid category_encoders transformer class, instantiated with
        `encoder_kwargs` and fit on each string column
        (default: None, which uses sklearn's LabelEncoder)
    encoder_kwargs: dict
        Keyword arguments passed to `encoder` when it is instantiated
        (default: None, treated as an empty dict)
    ignore_update_check: bool
        Do not check for the latest version of datacleaner

    Returns
    ----------
    output_dataframe: pandas.DataFrame
        Cleaned data set

    """
    _maybe_check_for_updates(ignore_update_check)

    if copy:
        input_dataframe = input_dataframe.copy()

    if drop_nans:
        input_dataframe.dropna(inplace=True)

    if encoder_kwargs is None:
        encoder_kwargs = {}

    for column in input_dataframe.columns.values:
        # Replace NaNs with the median or mode of the column depending on the column type.
        # median() raises TypeError on non-numerical columns, which routes them to the
        # mode-based imputation below.
        try:
            input_dataframe[column].fillna(input_dataframe[column].median(), inplace=True)
        except TypeError:
            most_frequent = input_dataframe[column].mode()
            # If the mode can't be computed, use the nearest valid value
            # See https://github.com/rhiever/datacleaner/issues/8
            if len(most_frequent) > 0:
                input_dataframe[column].fillna(most_frequent[0], inplace=True)
            else:
                input_dataframe[column].fillna(method='bfill', inplace=True)
                input_dataframe[column].fillna(method='ffill', inplace=True)

        # Encode all strings with numerical equivalents
        if str(input_dataframe[column].values.dtype) == 'object':
            if encoder is not None:
                column_encoder = encoder(**encoder_kwargs).fit(input_dataframe[column].values)
            else:
                column_encoder = LabelEncoder().fit(input_dataframe[column].values)

            input_dataframe[column] = column_encoder.transform(input_dataframe[column].values)

    return input_dataframe


def autoclean_cv(training_dataframe, testing_dataframe, drop_nans=False, copy=False,
                 encoder=None, encoder_kwargs=None, ignore_update_check=False):
    """Performs a series of automated data cleaning transformations on the provided training and testing data sets

    Unlike `autoclean()`, this function takes cross-validation into account by learning the data transformations
    from only the training set, then applying those transformations to both the training and testing set.
    By doing so, this function will prevent information leak from the training set into the testing set.

    Parameters
    ----------
    training_dataframe: pandas.DataFrame
        Training data set
    testing_dataframe: pandas.DataFrame
        Testing data set
    drop_nans: bool
        Drop all rows that have a NaN in any column (default: False)
    copy: bool
        Make a copy of the data set (default: False)
    encoder: category_encoders transformer
        A valid category_encoders transformer class, instantiated with
        `encoder_kwargs` and fit on each string column of the training set
        (default: None, which uses sklearn's LabelEncoder)
    encoder_kwargs: dict
        Keyword arguments passed to `encoder` when it is instantiated
        (default: None, treated as an empty dict)
    ignore_update_check: bool
        Do not check for the latest version of datacleaner

    Returns
    ----------
    output_training_dataframe: pandas.DataFrame
        Cleaned training data set
    output_testing_dataframe: pandas.DataFrame
        Cleaned testing data set

    """
    _maybe_check_for_updates(ignore_update_check)

    if set(training_dataframe.columns.values) != set(testing_dataframe.columns.values):
        raise ValueError('The training and testing DataFrames do not have the same columns. '
                         'Make sure that you are providing the same columns.')

    if copy:
        training_dataframe = training_dataframe.copy()
        testing_dataframe = testing_dataframe.copy()

    if drop_nans:
        training_dataframe.dropna(inplace=True)
        testing_dataframe.dropna(inplace=True)

    if encoder_kwargs is None:
        encoder_kwargs = {}

    for column in training_dataframe.columns.values:
        # Replace NaNs with the median or mode of the column depending on the column type.
        # All imputation values are learned from the training set only, to avoid leakage.
        try:
            column_median = training_dataframe[column].median()
            training_dataframe[column].fillna(column_median, inplace=True)
            testing_dataframe[column].fillna(column_median, inplace=True)
        except TypeError:
            most_frequent = training_dataframe[column].mode()
            # If the mode can't be computed, use the nearest valid value instead.
            # (Previously this indexed mode()[0] unconditionally, which raises
            # IndexError on an all-NaN column; this mirrors autoclean() and
            # https://github.com/rhiever/datacleaner/issues/8.)
            if len(most_frequent) > 0:
                column_mode = most_frequent[0]
                training_dataframe[column].fillna(column_mode, inplace=True)
                testing_dataframe[column].fillna(column_mode, inplace=True)
            else:
                for dataframe in (training_dataframe, testing_dataframe):
                    dataframe[column].fillna(method='bfill', inplace=True)
                    dataframe[column].fillna(method='ffill', inplace=True)

        # Encode all strings with numerical equivalents learned from the training set
        if str(training_dataframe[column].values.dtype) == 'object':
            if encoder is not None:
                column_encoder = encoder(**encoder_kwargs).fit(training_dataframe[column].values)
            else:
                column_encoder = LabelEncoder().fit(training_dataframe[column].values)

            training_dataframe[column] = column_encoder.transform(training_dataframe[column].values)
            testing_dataframe[column] = column_encoder.transform(testing_dataframe[column].values)

    return training_dataframe, testing_dataframe


def main():
    """Main function that is called when datacleaner is run on the command line"""
    parser = argparse.ArgumentParser(description='A Python tool that automatically cleans data sets and readies them for analysis')

    parser.add_argument('INPUT_FILENAME', type=str, help='File name of the data file to clean')

    parser.add_argument('-cv', action='store', dest='CROSS_VAL_FILENAME', default=None,
                        type=str, help='File name for the validation data set if performing cross-validation')

    parser.add_argument('-o', action='store', dest='OUTPUT_FILENAME', default=None,
                        type=str, help='Data file to output the cleaned data set to')

    parser.add_argument('-cvo', action='store', dest='CV_OUTPUT_FILENAME', default=None,
                        type=str, help='Data file to output the cleaned cross-validation data set to')

    parser.add_argument('-is', action='store', dest='INPUT_SEPARATOR', default='\t',
                        type=str, help='Column separator for the input file(s) (default: \\t)')

    parser.add_argument('-os', action='store', dest='OUTPUT_SEPARATOR', default='\t',
                        type=str, help='Column separator for the output file(s) (default: \\t)')

    parser.add_argument('--drop-nans', action='store_true', dest='DROP_NANS', default=False,
                        help='Drop all rows that have a NaN in any column (default: False)')

    parser.add_argument('--ignore-update-check', action='store_true', dest='IGNORE_UPDATE_CHECK', default=False,
                        help='Do not check for the latest version of datacleaner (default: False)')

    parser.add_argument('--version', action='version', version='datacleaner v{version}'.format(version=__version__))

    args = parser.parse_args()

    input_data = pd.read_csv(args.INPUT_FILENAME, sep=args.INPUT_SEPARATOR)
    if args.CROSS_VAL_FILENAME is None:
        clean_data = autoclean(input_data, drop_nans=args.DROP_NANS, ignore_update_check=args.IGNORE_UPDATE_CHECK)
        if args.OUTPUT_FILENAME is None:
            print('Cleaned data set:')
            print(clean_data)
            print('')
            print('If you cannot view the entire data set, output it to a file instead. '
                  'Type datacleaner --help for more information.')
        else:
            clean_data.to_csv(args.OUTPUT_FILENAME, sep=args.OUTPUT_SEPARATOR, index=False)
    else:
        # Writing cross-validation output requires both file names (or neither).
        # The original check only caught -o without -cvo; this also catches
        # -cvo without -o.
        if (args.OUTPUT_FILENAME is None) != (args.CV_OUTPUT_FILENAME is None):
            print('You must specify both output file names. Type datacleaner --help for more information.')
            return

        cross_val_data = pd.read_csv(args.CROSS_VAL_FILENAME, sep=args.INPUT_SEPARATOR)
        clean_training_data, clean_testing_data = autoclean_cv(input_data, cross_val_data,
                                                               drop_nans=args.DROP_NANS,
                                                               ignore_update_check=args.IGNORE_UPDATE_CHECK)

        if args.OUTPUT_FILENAME is None:
            print('Cleaned training data set:')
            print(clean_training_data)
            print('')
            print('Cleaned testing data set:')
            print(clean_testing_data)
            print('')
            print('If you cannot view the entire data set, output it to a file instead. '
                  'Type datacleaner --help for more information.')
        else:
            clean_training_data.to_csv(args.OUTPUT_FILENAME, sep=args.OUTPUT_SEPARATOR, index=False)
            # Bug fix: the cleaned testing set was previously written to
            # OUTPUT_FILENAME, silently overwriting the cleaned training data.
            # It now goes to CV_OUTPUT_FILENAME as documented.
            clean_testing_data.to_csv(args.CV_OUTPUT_FILENAME, sep=args.OUTPUT_SEPARATOR, index=False)


if __name__ == '__main__':
    main()
"""Tests for datacleaner's autoclean() and autoclean_cv().

Each test builds a data set, hand-cleans a copy the same way datacleaner is
expected to work (median imputation for numeric columns, mode imputation plus
label encoding for string columns; the CV variants fit on the training half
only and apply those statistics to the testing half), then asserts that the
library output matches the hand-cleaned expectation.
"""
from datacleaner import autoclean, autoclean_cv
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

np.random.seed(300)

# Integer codes 0-2 mapped to string categories for the categorical tests.
_FRUIT_MAP = {0: 'oranges', 1: 'apples', 2: 'bananas'}

# String-valued columns of the adult data set that autoclean() label-encodes.
_ADULT_CATEGORICAL_COLUMNS = ('workclass', 'education', 'marital-status',
                              'occupation', 'relationship', 'race',
                              'sex', 'native-country', 'label')


def _random_data():
    """Return a fresh 1000-row frame: floats in A/B, ints {0, 1, 2} in C."""
    return pd.DataFrame({'A': np.random.rand(1000),
                         'B': np.random.rand(1000),
                         'C': np.random.randint(0, 3, 1000)})


def _set_nans(frame, column, start, stop):
    """NaN-out positional rows ``start..stop`` (inclusive) of *column*.

    Positional labels (``frame.index[...]``) are used deliberately: the
    train/test splits below keep the parent frame's labels, so label slices
    such as ``testing_data.loc[70:80]`` (labels 500-999) silently selected
    zero rows in the original tests and test-set imputation was never
    actually exercised.  ``stop + 1`` mirrors ``.loc``'s inclusive stop.
    """
    frame.loc[frame.index[start:stop + 1], column] = np.nan


def test_autoclean_already_clean_data():
    """Test autoclean() with already-clean data"""
    data = _random_data()

    # autoclean() should not change the data at all
    assert autoclean(data).equals(data)


def test_autoclean_cv_already_clean_data():
    """Test autoclean_cv() with already-clean data"""
    data = _random_data()

    training_data = data[:500].copy()
    testing_data = data[500:].copy()

    cleaned_training_data, cleaned_testing_data = autoclean_cv(training_data, testing_data)

    # autoclean_cv() should not change the data at all
    assert cleaned_training_data.equals(training_data)
    assert cleaned_testing_data.equals(testing_data)


def test_autoclean_with_nans_all_numerical():
    """Test autoclean() with a data set that has all numerical values and some NaNs"""
    data = _random_data()

    _set_nans(data, 'A', 10, 20)
    _set_nans(data, 'C', 50, 70)

    # Numeric NaNs are expected to be filled with the column median.
    hand_cleaned_data = data.copy()
    hand_cleaned_data['A'].fillna(hand_cleaned_data['A'].median(), inplace=True)
    hand_cleaned_data['C'].fillna(hand_cleaned_data['C'].median(), inplace=True)

    assert autoclean(data).equals(hand_cleaned_data)


def test_autoclean_cv_with_nans_all_numerical():
    """Test autoclean_cv() with a data set that has all numerical values and some NaNs"""
    data = _random_data()

    training_data = data[:500].copy()
    testing_data = data[500:].copy()

    _set_nans(training_data, 'A', 10, 20)
    _set_nans(training_data, 'C', 50, 70)
    _set_nans(testing_data, 'A', 70, 80)
    _set_nans(testing_data, 'C', 10, 40)

    hand_cleaned_training_data = training_data.copy()
    hand_cleaned_testing_data = testing_data.copy()

    # Both halves must be imputed with statistics from the TRAINING half.
    training_A_median = hand_cleaned_training_data['A'].median()
    training_C_median = hand_cleaned_training_data['C'].median()

    for frame in (hand_cleaned_training_data, hand_cleaned_testing_data):
        frame['A'].fillna(training_A_median, inplace=True)
        frame['C'].fillna(training_C_median, inplace=True)

    cleaned_training_data, cleaned_testing_data = autoclean_cv(training_data, testing_data)

    assert cleaned_training_data.equals(hand_cleaned_training_data)
    assert cleaned_testing_data.equals(hand_cleaned_testing_data)


def test_autoclean_no_nans_with_strings():
    """Test autoclean() with a data set that has some string-encoded categorical values and no NaNs"""
    data = _random_data()
    data['C'] = data['C'].map(_FRUIT_MAP)

    hand_cleaned_data = data.copy()
    hand_cleaned_data['C'] = LabelEncoder().fit_transform(hand_cleaned_data['C'].values)

    assert autoclean(data).equals(hand_cleaned_data)


def test_autoclean_cv_no_nans_with_strings():
    """Test autoclean_cv() with a data set that has some string-encoded categorical values and no NaNs"""
    data = _random_data()
    data['C'] = data['C'].map(_FRUIT_MAP)

    training_data = data[:500].copy()
    testing_data = data[500:].copy()

    cleaned_training_data, cleaned_testing_data = autoclean_cv(training_data, testing_data)

    hand_cleaned_training_data = training_data.copy()
    hand_cleaned_testing_data = testing_data.copy()

    # The encoder is fit on training only and reused for the testing half.
    encoder = LabelEncoder()
    hand_cleaned_training_data['C'] = encoder.fit_transform(hand_cleaned_training_data['C'].values)
    hand_cleaned_testing_data['C'] = encoder.transform(hand_cleaned_testing_data['C'].values)

    assert cleaned_training_data.equals(hand_cleaned_training_data)
    assert cleaned_testing_data.equals(hand_cleaned_testing_data)


def test_autoclean_with_nans_with_strings():
    """Test autoclean() with a data set that has some string-encoded categorical values and some NaNs"""
    data = _random_data()
    data['C'] = data['C'].map(_FRUIT_MAP)

    _set_nans(data, 'A', 10, 20)
    _set_nans(data, 'C', 50, 70)

    # String NaNs are filled with the column mode, then label-encoded.
    hand_cleaned_data = data.copy()
    hand_cleaned_data['A'].fillna(hand_cleaned_data['A'].median(), inplace=True)
    hand_cleaned_data['C'].fillna(hand_cleaned_data['C'].mode()[0], inplace=True)
    hand_cleaned_data['C'] = LabelEncoder().fit_transform(hand_cleaned_data['C'].values)

    assert autoclean(data).equals(hand_cleaned_data)


def test_autoclean_cv_with_nans_with_strings():
    """Test autoclean_cv() with a data set that has some string-encoded categorical values and some NaNs"""
    data = _random_data()
    data['C'] = data['C'].map(_FRUIT_MAP)

    training_data = data[:500].copy()
    testing_data = data[500:].copy()

    _set_nans(training_data, 'A', 10, 20)
    _set_nans(training_data, 'C', 50, 70)
    _set_nans(testing_data, 'A', 70, 80)
    _set_nans(testing_data, 'C', 10, 40)

    hand_cleaned_training_data = training_data.copy()
    hand_cleaned_testing_data = testing_data.copy()

    # Training statistics are applied to both halves.
    training_A_median = hand_cleaned_training_data['A'].median()
    training_C_mode = hand_cleaned_training_data['C'].mode()[0]

    for frame in (hand_cleaned_training_data, hand_cleaned_testing_data):
        frame['A'].fillna(training_A_median, inplace=True)
        frame['C'].fillna(training_C_mode, inplace=True)

    encoder = LabelEncoder()
    hand_cleaned_training_data['C'] = encoder.fit_transform(hand_cleaned_training_data['C'].values)
    hand_cleaned_testing_data['C'] = encoder.transform(hand_cleaned_testing_data['C'].values)

    cleaned_training_data, cleaned_testing_data = autoclean_cv(training_data, testing_data)

    assert cleaned_training_data.equals(hand_cleaned_training_data)
    assert cleaned_testing_data.equals(hand_cleaned_testing_data)


def test_autoclean_real_data():
    """Test autoclean() with the adult data set"""
    adult_data = pd.read_csv('adult.csv.gz', sep='\t', compression='gzip')
    _set_nans(adult_data, 'age', 30, 60)
    _set_nans(adult_data, 'education', 90, 100)

    hand_cleaned_adult_data = adult_data.copy()

    hand_cleaned_adult_data['age'].fillna(hand_cleaned_adult_data['age'].median(), inplace=True)
    hand_cleaned_adult_data['education'].fillna(hand_cleaned_adult_data['education'].mode()[0], inplace=True)

    for column in _ADULT_CATEGORICAL_COLUMNS:
        hand_cleaned_adult_data[column] = LabelEncoder().fit_transform(hand_cleaned_adult_data[column].values)

    assert autoclean(adult_data).equals(hand_cleaned_adult_data)


def test_autoclean_cv_real_data():
    """Test autoclean_cv() with the adult data set"""
    adult_data = pd.read_csv('adult.csv.gz', sep='\t', compression='gzip')
    split = int(len(adult_data) / 2.)

    training_adult_data = adult_data[:split].copy()
    testing_adult_data = adult_data[split:].copy()

    _set_nans(training_adult_data, 'age', 30, 60)
    _set_nans(training_adult_data, 'education', 90, 100)
    _set_nans(testing_adult_data, 'age', 90, 110)
    _set_nans(testing_adult_data, 'education', 20, 40)

    hand_cleaned_training_adult_data = training_adult_data.copy()
    hand_cleaned_testing_adult_data = testing_adult_data.copy()

    # Training statistics are applied to both halves.
    training_age_median = hand_cleaned_training_adult_data['age'].median()
    training_education_mode = hand_cleaned_training_adult_data['education'].mode()[0]

    for frame in (hand_cleaned_training_adult_data, hand_cleaned_testing_adult_data):
        frame['age'].fillna(training_age_median, inplace=True)
        frame['education'].fillna(training_education_mode, inplace=True)

    for column in _ADULT_CATEGORICAL_COLUMNS:
        encoder = LabelEncoder()
        hand_cleaned_training_adult_data[column] = encoder.fit_transform(hand_cleaned_training_adult_data[column].values)
        hand_cleaned_testing_adult_data[column] = encoder.transform(hand_cleaned_testing_adult_data[column].values)

    cleaned_adult_training_data, cleaned_adult_testing_data = autoclean_cv(training_adult_data, testing_adult_data)

    assert cleaned_adult_training_data.equals(hand_cleaned_training_adult_data)
    assert cleaned_adult_testing_data.equals(hand_cleaned_testing_adult_data)