├── .gitignore
├── Dora
    ├── __init__.py
    ├── main.py
    ├── spec.py
    └── spec_data.csv
├── README.md
└── setup.py


/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | __pycache__


--------------------------------------------------------------------------------
/Dora/__init__.py:
--------------------------------------------------------------------------------
1 | from .main import Dora


--------------------------------------------------------------------------------
/Dora/main.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import pandas as pd
  3 | import numpy as np
  4 | import matplotlib.pyplot as plt
  5 | from sklearn import preprocessing
  6 | from sklearn.feature_extraction import DictVectorizer
  7 | 
  8 | class Dora:
  9 |   def __init__(self, data = None, output = None):
 10 |     self.snapshots = {}
 11 |     self.logs = []
 12 |     self.configure(data = data, output = output)
 13 | 
 14 |   def configure(self, data = None, output = None):
 15 |     if (type(output) is str or type(output) is int):
 16 |       self.output = output
 17 |     if (type(data) is str):
 18 |       self.initial_data = pd.read_csv(data)
 19 |       self.data = self.initial_data.copy()
 20 |       self.logs = []
 21 |     if (type(data) is pd.DataFrame):
 22 |       self.initial_data = data
 23 |       self.data = self.initial_data.copy()
 24 |       self.logs = []
 25 | 
 26 |   def remove_feature(self, feature_name):
 27 |     del self.data[feature_name]
 28 |     self._log("self.remove_feature('{0}')".format(feature_name))
 29 | 
 30 |   def extract_feature(self, old_feat, new_feat, mapper):
 31 |     new_feature_column = map(mapper, self.data[old_feat])
 32 |     self.data[new_feat] = list(new_feature_column)
 33 |     self._log("self.extract_feature({0}, {1}, {2})".format(old_feat, new_feat, mapper))
 34 | 
 35 |   def impute_missing_values(self):
 36 |     column_names = self.input_columns()
 37 |     imp = preprocessing.Imputer()
 38 |     imp.fit(self.data[column_names])
 39 |     self.data[column_names] = imp.transform(self.data[column_names])
 40 |     self._log("self.impute_missing_values()")
 41 | 
 42 |   def scale_input_values(self):
 43 |     column_names = self.input_columns()
 44 |     self.data[column_names] = preprocessing.scale(self.data[column_names])
 45 |     self._log("self.scale_input_values()")
 46 | 
 47 |   def extract_ordinal_feature(self, feature_name):
 48 |     feature = self.data[feature_name]
 49 |     feature_dictionaries = map(
 50 |       lambda x: { str(feature_name): str(x) },
 51 |       feature
 52 |     )
 53 |     vec = DictVectorizer()
 54 |     one_hot_matrix = vec.fit_transform(feature_dictionaries).toarray()
 55 |     one_hot_matrix = pd.DataFrame(one_hot_matrix)
 56 |     one_hot_matrix.columns = vec.get_feature_names()
 57 |     self.data = pd.concat(
 58 |       [
 59 |         self.data,
 60 |         one_hot_matrix
 61 |       ],
 62 |       axis = 1
 63 |     )
 64 |     del self.data[feature_name]
 65 |     self._log("self.extract_ordinal_feature('{0}')".format(feature_name))
 66 | 
 67 |   def set_training_and_validation(self):
 68 |     training_rows = np.random.rand(len(self.data)) < 0.8
 69 |     self.training_data = self.data[training_rows]
 70 |     self.validation_data = self.data[~training_rows]
 71 | 
 72 |   def plot_feature(self, feature_name):
 73 |     x = self.data[feature_name]
 74 |     y = self.data[self.output]
 75 |     fit = np.polyfit(x, y, deg = 1)
 76 |     fig, ax = plt.subplots()
 77 |     ax.plot(x, fit[1] + fit[0] * x)
 78 |     ax.scatter(x, y)
 79 |     ax.set_title("{0} vs. {1}".format(feature_name, self.output))
 80 |     fig.show()
 81 | 
 82 |   def input_columns(self):
 83 |     column_names = list(self.data.columns)
 84 |     column_names.remove(self.output)
 85 |     return column_names
 86 | 
 87 |   def explore(self):
 88 |     features = self.input_columns()
 89 |     row_count = math.floor(math.sqrt(len(features)))
 90 |     col_count = math.ceil(len(features) / row_count)
 91 |     figure = plt.figure(1)
 92 | 
 93 |     for index, feature in enumerate(features):
 94 |       figure.add_subplot(row_count, col_count, index + 1)
 95 |       x = self.data[feature]
 96 |       y = self.data[self.output]
 97 |       fit = np.polyfit(x, y, deg = 1)
 98 |       plt.plot(x, fit[0] * x + fit[1])
 99 |       plt.scatter(x, y)
100 |       plt.title("{0} vs. {1}".format(feature, self.output))
101 |     plt.show()
102 | 
103 |   def snapshot(self, name):
104 |     snapshot = {
105 |       "data": self.data.copy(),
106 |       "logs": self.logs.copy()
107 |     }
108 |     self.snapshots[name] = snapshot
109 | 
110 |   def use_snapshot(self, name):
111 |     self.data = self.snapshots[name]["data"]
112 |     self.logs = self.snapshots[name]["logs"]
113 | 
114 |   def _log(self, string):
115 |     self.logs.append(string)
116 | 


--------------------------------------------------------------------------------
/Dora/spec.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | from main import Dora
 3 | import pandas as pd
 4 | 
 5 | class TestDora(unittest.TestCase):
 6 |   def setUp(self):
 7 |     self.dora = Dora()
 8 |     self.dora.configure(output = 'A', data = './spec_data.csv')
 9 | 
10 |   def test_configure(self):
11 |     data = pd.read_csv('./spec_data.csv')
12 |     self.assertEqual(self.dora.output, 'A')
13 |     self.assertTrue(self.dora.data.equals(data))
14 | 
15 |   def test_remove_feature(self):
16 |     self.dora.remove_feature('useless_feature')
17 |     self.assertFalse('useless_feature' in self.dora.data.columns)
18 | 
19 |   def test_extract_feature(self):
20 |     self.dora.extract_feature(
21 |       'useless_feature',
22 |       'another_useless_feature',
23 |       lambda x: x * 2
24 |     )
25 | 
26 |     actual_column = list(self.dora.data['another_useless_feature'])
27 |     expected_column = [2, 2, 2]
28 |     self.assertEqual(actual_column, expected_column)
29 | 
30 |   def test_impute_missing_values(self):
31 |     del self.dora.data['D']
32 |     self.dora.impute_missing_values()
33 | 
34 |     actual_column = list(self.dora.data['B'])
35 |     expected_column = [2.0, 5.0, 8.0]
36 |     self.assertEqual(actual_column, expected_column)
37 | 
38 |   def test_scale_input_values(self):
39 |     del self.dora.data['D'], self.dora.data['B']
40 |     self.dora.scale_input_values()
41 | 
42 |     actual_column = list(self.dora.data['C'])
43 |     expected_column = [-1.224745, 0.0, 1.224745]
44 |     pairwise_diffs = map(
45 |       lambda actual, expected: abs(actual - expected),
46 |       actual_column,
47 |       expected_column
48 |     )
49 |     total_diff = sum(pairwise_diffs)
50 |     self.assertAlmostEqual(total_diff, 0, places = 6)
51 | 
52 |   def test_extract_ordinal_feature(self):
53 |     self.dora.extract_ordinal_feature('D')
54 |     features = self.dora.data.columns
55 |     self.assertTrue('D=left' in features and 'D=right' in features)
56 | 
57 |   def test_input_columns(self):
58 |     actual_input_columns = list(self.dora.input_columns())
59 |     expected_input_columns = list(self.dora.data.columns)
60 |     expected_input_columns.remove(self.dora.output)
61 |     self.assertEqual(actual_input_columns, expected_input_columns)
62 | 
63 |   def test_logs(self):
64 |     self.dora.extract_ordinal_feature('D')
65 |     self.dora.impute_missing_values()
66 |     self.dora.scale_input_values()
67 | 
68 |     actual_logs = self.dora.logs
69 |     expected_logs = [
70 |       "self.extract_ordinal_feature('D')",
71 |        'self.impute_missing_values()',
72 |        'self.scale_input_values()'
73 |     ]
74 |     self.assertEqual(actual_logs, expected_logs)
75 | 
76 |   def test_snapshots(self):
77 |     self.dora.snapshot('start')
78 |     self.dora.extract_ordinal_feature('D')
79 |     self.dora.use_snapshot('start')
80 | 
81 |     self.assertEqual(self.dora.logs, [])
82 |     self.assertTrue(self.dora.data.equals(self.dora.initial_data))
83 | 
84 | if __name__ == '__main__':
85 |     unittest.main()
86 | 


--------------------------------------------------------------------------------
/Dora/spec_data.csv:
--------------------------------------------------------------------------------
1 | "A","B","C","D","useless_feature"
2 |  1,2,0,"left",1
3 |  4,,1,"right",1
4 |  7,8,2,"left",1


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Dora
  2 | Exploratory data analysis toolkit for Python.
  3 | 
  4 | ## Contents
  5 | - [Summary](#summary)
  6 | - [Setup](#setup)
  7 | - [Usage](#use)
  8 |   - [Reading Data & Configuration](#config)
  9 |   - [Cleaning](#clean)
 10 |   - [Feature Selection & Extraction](#feature)
 11 |   - [Visualization](#visual)
 12 |   - [Model Validation](#model)
 13 |   - [Data Versioning](#version)
 14 | - [Testing](#test)
 15 | - [Contribute](#contribute)
 16 | - [License](#license)
 17 | 
 18 | <a name="summary"></a>
 19 | ## Summary
 20 | 
 21 | Dora is a Python library designed to automate the painful parts of exploratory data analysis.
 22 | 
 23 | The library contains convenience functions for data cleaning, feature selection & extraction, visualization, partitioning data for model validation, and versioning transformations of data.
 24 | 
 25 | The library uses and is intended to be a helpful addition to common Python data analysis tools such as pandas, scikit-learn, and matplotlib.
 26 | 
 27 | <a name="setup"></a>
 28 | ## Setup
 29 | 
 30 | To ensure latest code, install this library from the Github repo. 
 31 | 
 32 | ```
 33 | >>> from Dora import Dora
 34 | ```
 35 | 
 36 | <a name="use"></a>
 37 | ## Usage
 38 | 
 39 | <a name="config" ></a>
 40 | #### Reading Data & Configuration
 41 | 
 42 | ```python
 43 | # without initial config
 44 | >>> dora = Dora()
 45 | >>> dora.configure(output = 'A', data = 'path/to/data.csv')
 46 | 
 47 | # is the same as
 48 | >>> import pandas as pd
 49 | >>> dataframe = pd.read_csv('path/to/data.csv')
 50 | >>> dora = Dora(output = 'A', data = dataframe)
 51 | 
 52 | >>> dora.data
 53 |    A   B  C      D  useless_feature
 54 | 0  1   2  0   left                1
 55 | 1  4 NaN  1  right                1
 56 | 2  7   8  2   left                1
 57 | ```
 58 | 
 59 | <a name="clean" ></a>
 60 | #### Cleaning
 61 | 
 62 | ```python
 63 | # read data with missing and poorly scaled values
 64 | >>> import pandas as pd
 65 | >>> df = pd.DataFrame([
 66 | ...   [1, 2, 100],
 67 | ...   [2, None, 200],
 68 | ...   [1, 6, None]
 69 | ... ])
 70 | >>> dora = Dora(output = 0, data = df)
 71 | >>> dora.data
 72 |    0   1    2
 73 | 0  1   2  100
 74 | 1  2 NaN  200
 75 | 2  1   6  NaN
 76 | 
 77 | # impute the missing values (using the average of each column)
 78 | >>> dora.impute_missing_values()
 79 | >>> dora.data
 80 |    0  1    2
 81 | 0  1  2  100
 82 | 1  2  4  200
 83 | 2  1  6  150
 84 | 
 85 | # scale the values of the input variables (center to mean and scale to unit variance)
 86 | >>> dora.scale_input_values()
 87 | >>> dora.data
 88 |    0         1         2
 89 | 0  1 -1.224745 -1.224745
 90 | 1  2  0.000000  1.224745
 91 | 2  1  1.224745  0.000000
 92 | ```
 93 | 
 94 | <a name="feature" ></a>
 95 | #### Feature Selection & Extraction
 96 | 
 97 | ```python
 98 | # feature selection / removing a feature
 99 | >>> dora.data
100 |    A   B  C      D  useless_feature
101 | 0  1   2  0   left                1
102 | 1  4 NaN  1  right                1
103 | 2  7   8  2   left                1
104 | 
105 | >>> dora.remove_feature('useless_feature')
106 | >>> dora.data
107 |    A   B  C      D
108 | 0  1   2  0   left
109 | 1  4 NaN  1  right
110 | 2  7   8  2   left
111 | 
112 | # extract an ordinal feature through one-hot encoding
113 | >>> dora.extract_ordinal_feature('D')
114 | >>> dora.data
115 |    A   B  C  D=left  D=right
116 | 0  1   2  0       1        0
117 | 1  4 NaN  1       0        1
118 | 2  7   8  2       1        0
119 | 
120 | # extract a transformation of another feature
121 | >>> dora.extract_feature('C', 'twoC', lambda x: x * 2)
122 | >>> dora.data
123 |    A   B  C  D=left  D=right  twoC
124 | 0  1   2  0       1        0     0
125 | 1  4 NaN  1       0        1     2
126 | 2  7   8  2       1        0     4
127 | ```
128 | 
129 | <a name="visual" ></a>
130 | #### Visualization
131 | 
132 | ```python
133 | # plot a single feature against the output variable
134 | dora.plot_feature('column-name')
135 | 
136 | # render plots of each feature against the output variable
137 | dora.explore()
138 | ```
139 | 
140 | <a name="model" ></a>
141 | #### Model Validation
142 | 
143 | ```python
144 | # create random partition of training / validation data (~ 80/20 split)
145 | dora.set_training_and_validation()
146 | 
147 | # train a model on the data
148 | X = dora.training_data[dora.input_columns()]
149 | y = dora.training_data[dora.output]
150 | 
151 | some_model.fit(X, y)
152 | 
153 | # validate the model
154 | X = dora.validation_data[dora.input_columns()]
155 | y = dora.validation_data[dora.output]
156 | 
157 | some_model.score(X, y)
158 | ```
159 | 
160 | <a name="version" ></a>
161 | #### Data Versioning
162 | 
163 | ```python
164 | # save a version of your data
165 | >>> dora.data
166 |    A   B  C      D  useless_feature
167 | 0  1   2  0   left                1
168 | 1  4 NaN  1  right                1
169 | 2  7   8  2   left                1
170 | >>> dora.snapshot('initial_data')
171 | 
172 | # keep track of changes to data
173 | >>> dora.remove_feature('useless_feature')
174 | >>> dora.extract_ordinal_feature('D')
175 | >>> dora.impute_missing_values()
176 | >>> dora.scale_input_values()
177 | >>> dora.data
178 |    A         B         C    D=left   D=right
179 | 0  1 -1.224745 -1.224745  0.707107 -0.707107
180 | 1  4  0.000000  0.000000 -1.414214  1.414214
181 | 2  7  1.224745  1.224745  0.707107 -0.707107
182 | 
183 | >>> dora.logs
184 | ["self.remove_feature('useless_feature')", "self.extract_ordinal_feature('D')", 'self.impute_missing_values()', 'self.scale_input_values()']
185 | 
186 | # use a previous version of the data
187 | >>> dora.snapshot('transform1')
188 | >>> dora.use_snapshot('initial_data')
189 | >>> dora.data
190 |    A   B  C      D  useless_feature
191 | 0  1   2  0   left                1
192 | 1  4 NaN  1  right                1
193 | 2  7   8  2   left                1
194 | >>> dora.logs
195 | []
196 | 
197 | # switch back to your transformation
198 | >>> dora.use_snapshot('transform1')
199 | >>> dora.data
200 |    A         B         C    D=left   D=right
201 | 0  1 -1.224745 -1.224745  0.707107 -0.707107
202 | 1  4  0.000000  0.000000 -1.414214  1.414214
203 | 2  7  1.224745  1.224745  0.707107 -0.707107
204 | >>> dora.logs
205 | ["self.remove_feature('useless_feature')", "self.extract_ordinal_feature('D')", 'self.impute_missing_values()', 'self.scale_input_values()']
206 | ```
207 | 
208 | <a name="test" ></a>
209 | ## Testing
210 | 
211 | To run the test suite, simply run `python3 spec.py` from the `Dora` directory.
212 | 
213 | <a name="contribute" ></a>
214 | ## Contribute
215 | 
216 | Pull requests welcome! Feature requests / bugs will be addressed through issues on this repository. While not every feature request will necessarily be handled by me, maintaining a record for interested contributors is useful.
217 | 
218 | Additionally, feel free to submit pull requests which add features or address bugs yourself.
219 | 
220 | 
221 | <a name="license" ></a>
222 | ## License
223 | 
224 | **The MIT License (MIT)**
225 | 
226 | > Copyright (c) 2016 Nathan Epstein
227 | >
228 | > Permission is hereby granted, free of charge, to any person obtaining a copy
229 | > of this software and associated documentation files (the "Software"), to deal
230 | > in the Software without restriction, including without limitation the rights
231 | > to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
232 | > copies of the Software, and to permit persons to whom the Software is
233 | > furnished to do so, subject to the following conditions:
234 | >
235 | > The above copyright notice and this permission notice shall be included in
236 | > all copies or substantial portions of the Software.
237 | >
238 | > THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
239 | > IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
240 | > FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
241 | > AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
242 | > LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
243 | > OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
244 | > THE SOFTWARE.
245 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | 
 3 | setup(
 4 |   name = "Dora",
 5 |   version = "0.0.4",
 6 |   author = "Nathan Epstein",
 7 |   author_email = "ne2210@columbia.edu",
 8 |   description = ("Exploratory data analysis toolkit for Python"),
 9 |   license = "MIT",
10 |   keywords = "exploratory data analysis",
11 |   install_requires = [
12 |     "matplotlib>=1.5.1",
13 |     "pandas>=0.17.1",
14 |     "numpy>=1.10.4",
15 |     "scipy>=0.17.0",
16 |     "scikit-learn",
17 |   ],
18 |   packages = ['Dora']
19 | )
20 | 


--------------------------------------------------------------------------------