├── .gitignore
├── Dora
│   ├── __init__.py
│   ├── main.py
│   ├── spec.py
│   └── spec_data.csv
├── README.md
└── setup.py
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | __pycache__
--------------------------------------------------------------------------------
/Dora/__init__.py:
--------------------------------------------------------------------------------
1 | from .main import Dora
--------------------------------------------------------------------------------
/Dora/main.py:
--------------------------------------------------------------------------------
1 | import math
2 | import pandas as pd
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 | from sklearn import impute, preprocessing
6 | from sklearn.feature_extraction import DictVectorizer
7 |
8 | class Dora:
9 |     def __init__(self, data = None, output = None):
10 |         self.snapshots = {}
11 |         self.logs = []
12 |         self.configure(data = data, output = output)
13 |
14 |     def configure(self, data = None, output = None):
15 |         if (type(output) is str or type(output) is int):
16 |             self.output = output
17 |         if (type(data) is str):
18 |             self.initial_data = pd.read_csv(data)
19 |             self.data = self.initial_data.copy()
20 |             self.logs = []
21 |         if (type(data) is pd.DataFrame):
22 |             self.initial_data = data
23 |             self.data = self.initial_data.copy()
24 |             self.logs = []
25 |
26 |     def remove_feature(self, feature_name):
27 |         del self.data[feature_name]
28 |         self._log("self.remove_feature('{0}')".format(feature_name))
29 |
30 |     def extract_feature(self, old_feat, new_feat, mapper):
31 |         new_feature_column = map(mapper, self.data[old_feat])
32 |         self.data[new_feat] = list(new_feature_column)
33 |         self._log("self.extract_feature({0}, {1}, {2})".format(old_feat, new_feat, mapper))
34 |
35 |     def impute_missing_values(self):
36 |         column_names = self.input_columns()
37 |         imp = impute.SimpleImputer()
38 |         imp.fit(self.data[column_names])
39 |         self.data[column_names] = imp.transform(self.data[column_names])
40 |         self._log("self.impute_missing_values()")
41 |
42 |     def scale_input_values(self):
43 |         column_names = self.input_columns()
44 |         self.data[column_names] = preprocessing.scale(self.data[column_names])
45 |         self._log("self.scale_input_values()")
46 |
47 |     def extract_ordinal_feature(self, feature_name):
48 |         feature = self.data[feature_name]
49 |         feature_dictionaries = map(
50 |             lambda x: { str(feature_name): str(x) },
51 |             feature
52 |         )
53 |         vec = DictVectorizer()
54 |         one_hot_matrix = vec.fit_transform(feature_dictionaries).toarray()
55 |         one_hot_matrix = pd.DataFrame(one_hot_matrix)
56 |         one_hot_matrix.columns = vec.get_feature_names_out()
57 |         self.data = pd.concat(
58 |             [
59 |                 self.data,
60 |                 one_hot_matrix
61 |             ],
62 |             axis = 1
63 |         )
64 |         del self.data[feature_name]
65 |         self._log("self.extract_ordinal_feature('{0}')".format(feature_name))
66 |
67 |     def set_training_and_validation(self):
68 |         training_rows = np.random.rand(len(self.data)) < 0.8
69 |         self.training_data = self.data[training_rows]
70 |         self.validation_data = self.data[~training_rows]
71 |
72 |     def plot_feature(self, feature_name):
73 |         x = self.data[feature_name]
74 |         y = self.data[self.output]
75 |         fit = np.polyfit(x, y, deg = 1)
76 |         fig, ax = plt.subplots()
77 |         ax.plot(x, fit[1] + fit[0] * x)
78 |         ax.scatter(x, y)
79 |         ax.set_title("{0} vs. {1}".format(feature_name, self.output))
80 |         fig.show()
81 |
82 |     def input_columns(self):
83 |         column_names = list(self.data.columns)
84 |         column_names.remove(self.output)
85 |         return column_names
86 |
87 |     def explore(self):
88 |         features = self.input_columns()
89 |         row_count = math.floor(math.sqrt(len(features)))
90 |         col_count = math.ceil(len(features) / row_count)
91 |         figure = plt.figure(1)
92 |
93 |         for index, feature in enumerate(features):
94 |             figure.add_subplot(row_count, col_count, index + 1)
95 |             x = self.data[feature]
96 |             y = self.data[self.output]
97 |             fit = np.polyfit(x, y, deg = 1)
98 |             plt.plot(x, fit[0] * x + fit[1])
99 |             plt.scatter(x, y)
100 |             plt.title("{0} vs. {1}".format(feature, self.output))
101 |         plt.show()
102 |
103 |     def snapshot(self, name):
104 |         snapshot = {
105 |             "data": self.data.copy(),
106 |             "logs": self.logs.copy()
107 |         }
108 |         self.snapshots[name] = snapshot
109 |
110 |     def use_snapshot(self, name):
111 |         self.data = self.snapshots[name]["data"].copy()
112 |         self.logs = self.snapshots[name]["logs"].copy()
113 |
114 |     def _log(self, string):
115 |         self.logs.append(string)
116 |
--------------------------------------------------------------------------------
/Dora/spec.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from main import Dora
3 | import pandas as pd
4 |
5 | class TestDora(unittest.TestCase):
6 |     def setUp(self):
7 |         self.dora = Dora()
8 |         self.dora.configure(output = 'A', data = './spec_data.csv')
9 |
10 |     def test_configure(self):
11 |         data = pd.read_csv('./spec_data.csv')
12 |         self.assertEqual(self.dora.output, 'A')
13 |         self.assertTrue(self.dora.data.equals(data))
14 |
15 |     def test_remove_feature(self):
16 |         self.dora.remove_feature('useless_feature')
17 |         self.assertFalse('useless_feature' in self.dora.data.columns)
18 |
19 |     def test_extract_feature(self):
20 |         self.dora.extract_feature(
21 |             'useless_feature',
22 |             'another_useless_feature',
23 |             lambda x: x * 2
24 |         )
25 |
26 |         actual_column = list(self.dora.data['another_useless_feature'])
27 |         expected_column = [2, 2, 2]
28 |         self.assertEqual(actual_column, expected_column)
29 |
30 |     def test_impute_missing_values(self):
31 |         del self.dora.data['D']
32 |         self.dora.impute_missing_values()
33 |
34 |         actual_column = list(self.dora.data['B'])
35 |         expected_column = [2.0, 5.0, 8.0]
36 |         self.assertEqual(actual_column, expected_column)
37 |
38 |     def test_scale_input_values(self):
39 |         del self.dora.data['D'], self.dora.data['B']
40 |         self.dora.scale_input_values()
41 |
42 |         actual_column = list(self.dora.data['C'])
43 |         expected_column = [-1.224745, 0.0, 1.224745]
44 |         pairwise_diffs = map(
45 |             lambda actual, expected: abs(actual - expected),
46 |             actual_column,
47 |             expected_column
48 |         )
49 |         total_diff = sum(pairwise_diffs)
50 |         self.assertAlmostEqual(total_diff, 0, places = 6)
51 |
52 |     def test_extract_ordinal_feature(self):
53 |         self.dora.extract_ordinal_feature('D')
54 |         features = self.dora.data.columns
55 |         self.assertTrue('D=left' in features and 'D=right' in features)
56 |
57 |     def test_input_columns(self):
58 |         actual_input_columns = list(self.dora.input_columns())
59 |         expected_input_columns = list(self.dora.data.columns)
60 |         expected_input_columns.remove(self.dora.output)
61 |         self.assertEqual(actual_input_columns, expected_input_columns)
62 |
63 |     def test_logs(self):
64 |         self.dora.extract_ordinal_feature('D')
65 |         self.dora.impute_missing_values()
66 |         self.dora.scale_input_values()
67 |
68 |         actual_logs = self.dora.logs
69 |         expected_logs = [
70 |             "self.extract_ordinal_feature('D')",
71 |             'self.impute_missing_values()',
72 |             'self.scale_input_values()'
73 |         ]
74 |         self.assertEqual(actual_logs, expected_logs)
75 |
76 |     def test_snapshots(self):
77 |         self.dora.snapshot('start')
78 |         self.dora.extract_ordinal_feature('D')
79 |         self.dora.use_snapshot('start')
80 |
81 |         self.assertEqual(self.dora.logs, [])
82 |         self.assertTrue(self.dora.data.equals(self.dora.initial_data))
83 |
84 | if __name__ == '__main__':
85 |     unittest.main()
86 |
--------------------------------------------------------------------------------
/Dora/spec_data.csv:
--------------------------------------------------------------------------------
1 | "A","B","C","D","useless_feature"
2 | 1,2,0,"left",1
3 | 4,,1,"right",1
4 | 7,8,2,"left",1
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Dora
2 | Exploratory data analysis toolkit for Python.
3 |
4 | ## Contents
5 | - [Summary](#summary)
6 | - [Setup](#setup)
7 | - [Usage](#usage)
8 |   - [Reading Data & Configuration](#reading-data--configuration)
9 |   - [Cleaning](#cleaning)
10 |   - [Feature Selection & Extraction](#feature-selection--extraction)
11 |   - [Visualization](#visualization)
12 |   - [Model Validation](#model-validation)
13 |   - [Data Versioning](#data-versioning)
14 | - [Testing](#testing)
15 | - [Contribute](#contribute)
16 | - [License](#license)
17 |
18 |
19 | ## Summary
20 |
21 | Dora is a Python library designed to automate the painful parts of exploratory data analysis.
22 |
23 | The library contains convenience functions for data cleaning, feature selection & extraction, visualization, partitioning data for model validation, and versioning transformations of data.
24 |
25 | The library builds on, and is intended to complement, common Python data analysis tools such as pandas, scikit-learn, and matplotlib.
26 |
27 |
28 | ## Setup
29 |
30 | To ensure you have the latest code, install this library directly from the GitHub repo.
31 |
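One way to do this, assuming `pip` and `git` are available (the repository URL below is a placeholder rather than one taken from this README; substitute the repo's actual location):

```
$ pip install git+https://github.com/<repo-owner>/Dora.git
```

Once installed, import the `Dora` class: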
32 | ```python
33 | >>> from Dora import Dora
34 | ```
35 |
36 |
37 | ## Usage
38 |
39 |
40 | #### Reading Data & Configuration
41 |
42 | ```python
43 | # without initial config
44 | >>> dora = Dora()
45 | >>> dora.configure(output = 'A', data = 'path/to/data.csv')
46 |
47 | # is the same as
48 | >>> import pandas as pd
49 | >>> dataframe = pd.read_csv('path/to/data.csv')
50 | >>> dora = Dora(output = 'A', data = dataframe)
51 |
52 | >>> dora.data
53 |    A    B  C      D  useless_feature
54 | 0  1    2  0   left                1
55 | 1  4  NaN  1  right                1
56 | 2  7    8  2   left                1
57 | ```
58 |
59 |
60 | #### Cleaning
61 |
62 | ```python
63 | # read data with missing and poorly scaled values
64 | >>> import pandas as pd
65 | >>> df = pd.DataFrame([
66 | ... [1, 2, 100],
67 | ... [2, None, 200],
68 | ... [1, 6, None]
69 | ... ])
70 | >>> dora = Dora(output = 0, data = df)
71 | >>> dora.data
72 |    0    1    2
73 | 0  1    2  100
74 | 1  2  NaN  200
75 | 2  1    6  NaN
76 |
77 | # impute the missing values (using the average of each column)
78 | >>> dora.impute_missing_values()
79 | >>> dora.data
80 |    0  1    2
81 | 0  1  2  100
82 | 1  2  4  200
83 | 2  1  6  150
84 |
85 | # scale the values of the input variables (center to mean and scale to unit variance)
86 | >>> dora.scale_input_values()
87 | >>> dora.data
88 |    0         1         2
89 | 0  1 -1.224745 -1.224745
90 | 1  2  0.000000  1.224745
91 | 2  1  1.224745  0.000000
92 | ```
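
The cleaning helpers operate on every input column (everything except the configured output), so non-numeric features should be removed or encoded before calling them. A minimal sketch using the sample CSV from the configuration example above (the file path and column names are assumptions carried over from that example):

```python
from Dora import Dora

dora = Dora(output = 'A', data = 'path/to/data.csv')

# drop and encode the non-numeric columns first, since imputing and
# scaling both expect numeric inputs
dora.remove_feature('useless_feature')
dora.extract_ordinal_feature('D')

dora.impute_missing_values()
dora.scale_input_values()
```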
93 |
94 |
95 | #### Feature Selection & Extraction
96 |
97 | ```python
98 | # feature selection / removing a feature
99 | >>> dora.data
100 |    A    B  C      D  useless_feature
101 | 0  1    2  0   left                1
102 | 1  4  NaN  1  right                1
103 | 2  7    8  2   left                1
104 |
105 | >>> dora.remove_feature('useless_feature')
106 | >>> dora.data
107 |    A    B  C      D
108 | 0  1    2  0   left
109 | 1  4  NaN  1  right
110 | 2  7    8  2   left
111 |
112 | # extract an ordinal feature through one-hot encoding
113 | >>> dora.extract_ordinal_feature('D')
114 | >>> dora.data
115 |    A    B  C  D=left  D=right
116 | 0  1    2  0       1        0
117 | 1  4  NaN  1       0        1
118 | 2  7    8  2       1        0
119 |
120 | # extract a transformation of another feature
121 | >>> dora.extract_feature('C', 'twoC', lambda x: x * 2)
122 | >>> dora.data
123 |    A    B  C  D=left  D=right  twoC
124 | 0  1    2  0       1        0     0
125 | 1  4  NaN  1       0        1     2
126 | 2  7    8  2       1        0     4
127 | ```
128 |
129 |
130 | #### Visualization
131 |
132 | ```python
133 | # plot a single feature against the output variable
134 | dora.plot_feature('column-name')
135 |
136 | # render plots of each feature against the output variable
137 | dora.explore()
138 | ```
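
Both helpers fit a straight line of the feature against the output with `numpy.polyfit`, so the plotted columns need to be numeric with no missing values; run the cleaning steps shown above first. A short sketch with the sample data from the configuration example (the column names are assumptions carried over from that example):

```python
# scatter plot of feature 'C' against output 'A', with a fitted line
dora.plot_feature('C')

# the same kind of plot for every input feature, arranged in a grid
dora.explore()
```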
139 |
140 |
141 | #### Model Validation
142 |
143 | ```python
144 | # create random partition of training / validation data (~ 80/20 split)
145 | dora.set_training_and_validation()
146 |
147 | # train a model on the data
148 | X = dora.training_data[dora.input_columns()]
149 | y = dora.training_data[dora.output]
150 |
151 | some_model.fit(X, y)
152 |
153 | # validate the model
154 | X = dora.validation_data[dora.input_columns()]
155 | y = dora.validation_data[dora.output]
156 |
157 | some_model.score(X, y)
158 | ```
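
For instance, with a scikit-learn estimator standing in for `some_model` (linear regression, the `'A'` output column, and the CSV path are assumptions for illustration; any estimator with `fit` and `score` methods works, provided the input columns are numeric and free of missing values):

```python
from sklearn.linear_model import LinearRegression
from Dora import Dora

# configure and clean the data as in the earlier examples (path and columns assumed)
dora = Dora(output = 'A', data = 'path/to/data.csv')
dora.remove_feature('useless_feature')
dora.extract_ordinal_feature('D')
dora.impute_missing_values()

# random ~80/20 split into dora.training_data / dora.validation_data
dora.set_training_and_validation()

# fit on the training partition
model = LinearRegression()
model.fit(dora.training_data[dora.input_columns()], dora.training_data[dora.output])

# score on the held-out validation partition (R^2 for a regressor)
print(model.score(dora.validation_data[dora.input_columns()], dora.validation_data[dora.output]))
```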
159 |
160 |
161 | #### Data Versioning
162 |
163 | ```python
164 | # save a version of your data
165 | >>> dora.data
166 |    A    B  C      D  useless_feature
167 | 0  1    2  0   left                1
168 | 1  4  NaN  1  right                1
169 | 2  7    8  2   left                1
170 | >>> dora.snapshot('initial_data')
171 |
172 | # keep track of changes to data
173 | >>> dora.remove_feature('useless_feature')
174 | >>> dora.extract_ordinal_feature('D')
175 | >>> dora.impute_missing_values()
176 | >>> dora.scale_input_values()
177 | >>> dora.data
178 |    A         B         C    D=left   D=right
179 | 0  1 -1.224745 -1.224745  0.707107 -0.707107
180 | 1  4  0.000000  0.000000 -1.414214  1.414214
181 | 2  7  1.224745  1.224745  0.707107 -0.707107
182 |
183 | >>> dora.logs
184 | ["self.remove_feature('useless_feature')", "self.extract_ordinal_feature('D')", 'self.impute_missing_values()', 'self.scale_input_values()']
185 |
186 | # use a previous version of the data
187 | >>> dora.snapshot('transform1')
188 | >>> dora.use_snapshot('initial_data')
189 | >>> dora.data
190 |    A    B  C      D  useless_feature
191 | 0  1    2  0   left                1
192 | 1  4  NaN  1  right                1
193 | 2  7    8  2   left                1
194 | >>> dora.logs
195 | []
196 |
197 | # switch back to your transformation
198 | >>> dora.use_snapshot('transform1')
199 | >>> dora.data
200 |    A         B         C    D=left   D=right
201 | 0  1 -1.224745 -1.224745  0.707107 -0.707107
202 | 1  4  0.000000  0.000000 -1.414214  1.414214
203 | 2  7  1.224745  1.224745  0.707107 -0.707107
204 | >>> dora.logs
205 | ["self.remove_feature('useless_feature')", "self.extract_ordinal_feature('D')", 'self.impute_missing_values()', 'self.scale_input_values()']
206 | ```
207 |
208 |
209 | ## Testing
210 |
211 | To run the test suite, simply run `python3 spec.py` from the `Dora` directory.
212 |
213 |
214 | ## Contribute
215 |
216 | Pull requests are welcome! Feature requests and bug reports will be tracked as issues on this repository. Not every feature request will necessarily be handled by me, but keeping a record of them is useful for interested contributors.
217 |
218 | Additionally, feel free to submit pull requests which add features or address bugs yourself.
219 |
220 |
221 |
222 | ## License
223 |
224 | **The MIT License (MIT)**
225 |
226 | > Copyright (c) 2016 Nathan Epstein
227 | >
228 | > Permission is hereby granted, free of charge, to any person obtaining a copy
229 | > of this software and associated documentation files (the "Software"), to deal
230 | > in the Software without restriction, including without limitation the rights
231 | > to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
232 | > copies of the Software, and to permit persons to whom the Software is
233 | > furnished to do so, subject to the following conditions:
234 | >
235 | > The above copyright notice and this permission notice shall be included in
236 | > all copies or substantial portions of the Software.
237 | >
238 | > THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
239 | > IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
240 | > FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
241 | > AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
242 | > LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
243 | > OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
244 | > THE SOFTWARE.
245 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 |
3 | setup(
4 |     name = "Dora",
5 |     version = "0.0.4",
6 |     author = "Nathan Epstein",
7 |     author_email = "ne2210@columbia.edu",
8 |     description = ("Exploratory data analysis toolkit for Python"),
9 |     license = "MIT",
10 |     keywords = "exploratory data analysis",
11 |     install_requires = [
12 |         "matplotlib>=1.5.1",
13 |         "pandas>=0.17.1",
14 |         "numpy>=1.10.4",
15 |         "scipy>=0.17.0",
16 |         "scikit-learn>=1.0",
17 |     ],
18 |     packages = ['Dora']
19 | )
20 |
--------------------------------------------------------------------------------