├── .gitignore ├── Dora ├── __init__.py ├── main.py ├── spec.py └── spec_data.csv ├── README.md └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | __pycache__ -------------------------------------------------------------------------------- /Dora/__init__.py: -------------------------------------------------------------------------------- 1 | from .main import Dora -------------------------------------------------------------------------------- /Dora/main.py: -------------------------------------------------------------------------------- 1 | import math 2 | import pandas as pd 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from sklearn import preprocessing 6 | from sklearn.feature_extraction import DictVectorizer 7 | 8 | class Dora: 9 | def __init__(self, data = None, output = None): 10 | self.snapshots = {} 11 | self.logs = [] 12 | self.configure(data = data, output = output) 13 | 14 | def configure(self, data = None, output = None): 15 | if (type(output) is str or type(output) is int): 16 | self.output = output 17 | if (type(data) is str): 18 | self.initial_data = pd.read_csv(data) 19 | self.data = self.initial_data.copy() 20 | self.logs = [] 21 | if (type(data) is pd.DataFrame): 22 | self.initial_data = data 23 | self.data = self.initial_data.copy() 24 | self.logs = [] 25 | 26 | def remove_feature(self, feature_name): 27 | del self.data[feature_name] 28 | self._log("self.remove_feature('{0}')".format(feature_name)) 29 | 30 | def extract_feature(self, old_feat, new_feat, mapper): 31 | new_feature_column = map(mapper, self.data[old_feat]) 32 | self.data[new_feat] = list(new_feature_column) 33 | self._log("self.extract_feature({0}, {1}, {2})".format(old_feat, new_feat, mapper)) 34 | 35 | def impute_missing_values(self): 36 | column_names = self.input_columns() 37 | imp = preprocessing.Imputer() 38 | imp.fit(self.data[column_names]) 39 | self.data[column_names] = imp.transform(self.data[column_names]) 40 | self._log("self.impute_missing_values()") 41 | 42 | def scale_input_values(self): 43 | column_names = self.input_columns() 44 | self.data[column_names] = preprocessing.scale(self.data[column_names]) 45 | self._log("self.scale_input_values()") 46 | 47 | def extract_ordinal_feature(self, feature_name): 48 | feature = self.data[feature_name] 49 | feature_dictionaries = map( 50 | lambda x: { str(feature_name): str(x) }, 51 | feature 52 | ) 53 | vec = DictVectorizer() 54 | one_hot_matrix = vec.fit_transform(feature_dictionaries).toarray() 55 | one_hot_matrix = pd.DataFrame(one_hot_matrix) 56 | one_hot_matrix.columns = vec.get_feature_names() 57 | self.data = pd.concat( 58 | [ 59 | self.data, 60 | one_hot_matrix 61 | ], 62 | axis = 1 63 | ) 64 | del self.data[feature_name] 65 | self._log("self.extract_ordinal_feature('{0}')".format(feature_name)) 66 | 67 | def set_training_and_validation(self): 68 | training_rows = np.random.rand(len(self.data)) < 0.8 69 | self.training_data = self.data[training_rows] 70 | self.validation_data = self.data[~training_rows] 71 | 72 | def plot_feature(self, feature_name): 73 | x = self.data[feature_name] 74 | y = self.data[self.output] 75 | fit = np.polyfit(x, y, deg = 1) 76 | fig, ax = plt.subplots() 77 | ax.plot(x, fit[1] + fit[0] * x) 78 | ax.scatter(x, y) 79 | ax.set_title("{0} vs. {1}".format(feature_name, self.output)) 80 | fig.show() 81 | 82 | def input_columns(self): 83 | column_names = list(self.data.columns) 84 | column_names.remove(self.output) 85 | return column_names 86 | 87 | def explore(self): 88 | features = self.input_columns() 89 | row_count = math.floor(math.sqrt(len(features))) 90 | col_count = math.ceil(len(features) / row_count) 91 | figure = plt.figure(1) 92 | 93 | for index, feature in enumerate(features): 94 | figure.add_subplot(row_count, col_count, index + 1) 95 | x = self.data[feature] 96 | y = self.data[self.output] 97 | fit = np.polyfit(x, y, deg = 1) 98 | plt.plot(x, fit[0] * x + fit[1]) 99 | plt.scatter(x, y) 100 | plt.title("{0} vs. {1}".format(feature, self.output)) 101 | plt.show() 102 | 103 | def snapshot(self, name): 104 | snapshot = { 105 | "data": self.data.copy(), 106 | "logs": self.logs.copy() 107 | } 108 | self.snapshots[name] = snapshot 109 | 110 | def use_snapshot(self, name): 111 | self.data = self.snapshots[name]["data"] 112 | self.logs = self.snapshots[name]["logs"] 113 | 114 | def _log(self, string): 115 | self.logs.append(string) 116 | -------------------------------------------------------------------------------- /Dora/spec.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from main import Dora 3 | import pandas as pd 4 | 5 | class TestDora(unittest.TestCase): 6 | def setUp(self): 7 | self.dora = Dora() 8 | self.dora.configure(output = 'A', data = './spec_data.csv') 9 | 10 | def test_configure(self): 11 | data = pd.read_csv('./spec_data.csv') 12 | self.assertEqual(self.dora.output, 'A') 13 | self.assertTrue(self.dora.data.equals(data)) 14 | 15 | def test_remove_feature(self): 16 | self.dora.remove_feature('useless_feature') 17 | self.assertFalse('useless_feature' in self.dora.data.columns) 18 | 19 | def test_extract_feature(self): 20 | self.dora.extract_feature( 21 | 'useless_feature', 22 | 'another_useless_feature', 23 | lambda x: x * 2 24 | ) 25 | 26 | actual_column = list(self.dora.data['another_useless_feature']) 27 | expected_column = [2, 2, 2] 28 | self.assertEqual(actual_column, expected_column) 29 | 30 | def test_impute_missing_values(self): 31 | del self.dora.data['D'] 32 | self.dora.impute_missing_values() 33 | 34 | actual_column = list(self.dora.data['B']) 35 | expected_column = [2.0, 5.0, 8.0] 36 | self.assertEqual(actual_column, expected_column) 37 | 38 | def test_scale_input_values(self): 39 | del self.dora.data['D'], self.dora.data['B'] 40 | self.dora.scale_input_values() 41 | 42 | actual_column = list(self.dora.data['C']) 43 | expected_column = [-1.224745, 0.0, 1.224745] 44 | pairwise_diffs = map( 45 | lambda actual, expected: abs(actual - expected), 46 | actual_column, 47 | expected_column 48 | ) 49 | total_diff = sum(pairwise_diffs) 50 | self.assertAlmostEqual(total_diff, 0, places = 6) 51 | 52 | def test_extract_ordinal_feature(self): 53 | self.dora.extract_ordinal_feature('D') 54 | features = self.dora.data.columns 55 | self.assertTrue('D=left' in features and 'D=right' in features) 56 | 57 | def test_input_columns(self): 58 | actual_input_columns = list(self.dora.input_columns()) 59 | expected_input_columns = list(self.dora.data.columns) 60 | expected_input_columns.remove(self.dora.output) 61 | self.assertEqual(actual_input_columns, expected_input_columns) 62 | 63 | def test_logs(self): 64 | self.dora.extract_ordinal_feature('D') 65 | self.dora.impute_missing_values() 66 | self.dora.scale_input_values() 67 | 68 | actual_logs = self.dora.logs 69 | expected_logs = [ 70 | "self.extract_ordinal_feature('D')", 71 | 'self.impute_missing_values()', 72 | 'self.scale_input_values()' 73 | ] 74 | self.assertEqual(actual_logs, expected_logs) 75 | 76 | def test_snapshots(self): 77 | self.dora.snapshot('start') 78 | self.dora.extract_ordinal_feature('D') 79 | self.dora.use_snapshot('start') 80 | 81 | self.assertEqual(self.dora.logs, []) 82 | self.assertTrue(self.dora.data.equals(self.dora.initial_data)) 83 | 84 | if __name__ == '__main__': 85 | unittest.main() 86 | -------------------------------------------------------------------------------- /Dora/spec_data.csv: -------------------------------------------------------------------------------- 1 | "A","B","C","D","useless_feature" 2 | 1,2,0,"left",1 3 | 4,,1,"right",1 4 | 7,8,2,"left",1 -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Dora 2 | Exploratory data analysis toolkit for Python. 3 | 4 | ## Contents 5 | - [Summary](#summary) 6 | - [Setup](#setup) 7 | - [Usage](#use) 8 | - [Reading Data & Configuration](#config) 9 | - [Cleaning](#clean) 10 | - [Feature Selection & Extraction](#feature) 11 | - [Visualization](#visual) 12 | - [Model Validation](#model) 13 | - [Data Versioning](#version) 14 | - [Testing](#test) 15 | - [Contribute](#contribute) 16 | - [License](#license) 17 | 18 | 19 | ## Summary 20 | 21 | Dora is a Python library designed to automate the painful parts of exploratory data analysis. 22 | 23 | The library contains convenience functions for data cleaning, feature selection & extraction, visualization, partitioning data for model validation, and versioning transformations of data. 24 | 25 | The library uses and is intended to be a helpful addition to common Python data analysis tools such as pandas, scikit-learn, and matplotlib. 26 | 27 | 28 | ## Setup 29 | 30 | To ensure latest code, install this library from the Github repo. 31 | 32 | ``` 33 | >>> from Dora import Dora 34 | ``` 35 | 36 | 37 | ## Usage 38 | 39 | 40 | #### Reading Data & Configuration 41 | 42 | ```python 43 | # without initial config 44 | >>> dora = Dora() 45 | >>> dora.configure(output = 'A', data = 'path/to/data.csv') 46 | 47 | # is the same as 48 | >>> import pandas as pd 49 | >>> dataframe = pd.read_csv('path/to/data.csv') 50 | >>> dora = Dora(output = 'A', data = dataframe) 51 | 52 | >>> dora.data 53 | A B C D useless_feature 54 | 0 1 2 0 left 1 55 | 1 4 NaN 1 right 1 56 | 2 7 8 2 left 1 57 | ``` 58 | 59 | 60 | #### Cleaning 61 | 62 | ```python 63 | # read data with missing and poorly scaled values 64 | >>> import pandas as pd 65 | >>> df = pd.DataFrame([ 66 | ... [1, 2, 100], 67 | ... [2, None, 200], 68 | ... [1, 6, None] 69 | ... ]) 70 | >>> dora = Dora(output = 0, data = df) 71 | >>> dora.data 72 | 0 1 2 73 | 0 1 2 100 74 | 1 2 NaN 200 75 | 2 1 6 NaN 76 | 77 | # impute the missing values (using the average of each column) 78 | >>> dora.impute_missing_values() 79 | >>> dora.data 80 | 0 1 2 81 | 0 1 2 100 82 | 1 2 4 200 83 | 2 1 6 150 84 | 85 | # scale the values of the input variables (center to mean and scale to unit variance) 86 | >>> dora.scale_input_values() 87 | >>> dora.data 88 | 0 1 2 89 | 0 1 -1.224745 -1.224745 90 | 1 2 0.000000 1.224745 91 | 2 1 1.224745 0.000000 92 | ``` 93 | 94 | 95 | #### Feature Selection & Extraction 96 | 97 | ```python 98 | # feature selection / removing a feature 99 | >>> dora.data 100 | A B C D useless_feature 101 | 0 1 2 0 left 1 102 | 1 4 NaN 1 right 1 103 | 2 7 8 2 left 1 104 | 105 | >>> dora.remove_feature('useless_feature') 106 | >>> dora.data 107 | A B C D 108 | 0 1 2 0 left 109 | 1 4 NaN 1 right 110 | 2 7 8 2 left 111 | 112 | # extract an ordinal feature through one-hot encoding 113 | >>> dora.extract_ordinal_feature('D') 114 | >>> dora.data 115 | A B C D=left D=right 116 | 0 1 2 0 1 0 117 | 1 4 NaN 1 0 1 118 | 2 7 8 2 1 0 119 | 120 | # extract a transformation of another feature 121 | >>> dora.extract_feature('C', 'twoC', lambda x: x * 2) 122 | >>> dora.data 123 | A B C D=left D=right twoC 124 | 0 1 2 0 1 0 0 125 | 1 4 NaN 1 0 1 2 126 | 2 7 8 2 1 0 4 127 | ``` 128 | 129 | 130 | #### Visualization 131 | 132 | ```python 133 | # plot a single feature against the output variable 134 | dora.plot_feature('column-name') 135 | 136 | # render plots of each feature against the output variable 137 | dora.explore() 138 | ``` 139 | 140 | 141 | #### Model Validation 142 | 143 | ```python 144 | # create random partition of training / validation data (~ 80/20 split) 145 | dora.set_training_and_validation() 146 | 147 | # train a model on the data 148 | X = dora.training_data[dora.input_columns()] 149 | y = dora.training_data[dora.output] 150 | 151 | some_model.fit(X, y) 152 | 153 | # validate the model 154 | X = dora.validation_data[dora.input_columns()] 155 | y = dora.validation_data[dora.output] 156 | 157 | some_model.score(X, y) 158 | ``` 159 | 160 | 161 | #### Data Versioning 162 | 163 | ```python 164 | # save a version of your data 165 | >>> dora.data 166 | A B C D useless_feature 167 | 0 1 2 0 left 1 168 | 1 4 NaN 1 right 1 169 | 2 7 8 2 left 1 170 | >>> dora.snapshot('initial_data') 171 | 172 | # keep track of changes to data 173 | >>> dora.remove_feature('useless_feature') 174 | >>> dora.extract_ordinal_feature('D') 175 | >>> dora.impute_missing_values() 176 | >>> dora.scale_input_values() 177 | >>> dora.data 178 | A B C D=left D=right 179 | 0 1 -1.224745 -1.224745 0.707107 -0.707107 180 | 1 4 0.000000 0.000000 -1.414214 1.414214 181 | 2 7 1.224745 1.224745 0.707107 -0.707107 182 | 183 | >>> dora.logs 184 | ["self.remove_feature('useless_feature')", "self.extract_ordinal_feature('D')", 'self.impute_missing_values()', 'self.scale_input_values()'] 185 | 186 | # use a previous version of the data 187 | >>> dora.snapshot('transform1') 188 | >>> dora.use_snapshot('initial_data') 189 | >>> dora.data 190 | A B C D useless_feature 191 | 0 1 2 0 left 1 192 | 1 4 NaN 1 right 1 193 | 2 7 8 2 left 1 194 | >>> dora.logs 195 | [] 196 | 197 | # switch back to your transformation 198 | >>> dora.use_snapshot('transform1') 199 | >>> dora.data 200 | A B C D=left D=right 201 | 0 1 -1.224745 -1.224745 0.707107 -0.707107 202 | 1 4 0.000000 0.000000 -1.414214 1.414214 203 | 2 7 1.224745 1.224745 0.707107 -0.707107 204 | >>> dora.logs 205 | ["self.remove_feature('useless_feature')", "self.extract_ordinal_feature('D')", 'self.impute_missing_values()', 'self.scale_input_values()'] 206 | ``` 207 | 208 | 209 | ## Testing 210 | 211 | To run the test suite, simply run `python3 spec.py` from the `Dora` directory. 212 | 213 | 214 | ## Contribute 215 | 216 | Pull requests welcome! Feature requests / bugs will be addressed through issues on this repository. While not every feature request will necessarily be handled by me, maintaining a record for interested contributors is useful. 217 | 218 | Additionally, feel free to submit pull requests which add features or address bugs yourself. 219 | 220 | 221 | 222 | ## License 223 | 224 | **The MIT License (MIT)** 225 | 226 | > Copyright (c) 2016 Nathan Epstein 227 | > 228 | > Permission is hereby granted, free of charge, to any person obtaining a copy 229 | > of this software and associated documentation files (the "Software"), to deal 230 | > in the Software without restriction, including without limitation the rights 231 | > to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 232 | > copies of the Software, and to permit persons to whom the Software is 233 | > furnished to do so, subject to the following conditions: 234 | > 235 | > The above copyright notice and this permission notice shall be included in 236 | > all copies or substantial portions of the Software. 237 | > 238 | > THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 239 | > IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 240 | > FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 241 | > AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 242 | > LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 243 | > OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 244 | > THE SOFTWARE. 245 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name = "Dora", 5 | version = "0.0.4", 6 | author = "Nathan Epstein", 7 | author_email = "ne2210@columbia.edu", 8 | description = ("Exploratory data analysis toolkit for Python"), 9 | license = "MIT", 10 | keywords = "exploratory data analysis", 11 | install_requires = [ 12 | "matplotlib>=1.5.1", 13 | "pandas>=0.17.1", 14 | "numpy>=1.10.4", 15 | "scipy>=0.17.0", 16 | "scikit-learn", 17 | ], 18 | packages = ['Dora'] 19 | ) 20 | --------------------------------------------------------------------------------