├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── dldb ├── __init__.py ├── dldb.py └── preprocessor.py ├── dldb_fig.png ├── setup.cfg ├── setup.py └── tests ├── labeling_utils.py ├── test.py └── testing_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | **/.DS_Store 6 | .DS_Store 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | env/ 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # dotenv 85 | .env 86 | 87 | # virtualenv 88 | .venv 89 | venv/ 90 | ENV/ 91 | 92 | # Spyder project settings 93 | .spyderproject 94 | .spyproject 95 | 96 | # Rope project settings 97 | .ropeproject 98 | 99 | # mkdocs documentation 100 | /site 101 | 102 | # mypy 103 | .mypy_cache/ 104 | 105 | # pickle files 106 | *.p 107 | *.pickle 108 | 109 | src 110 | 111 | retail_binary_files 112 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2017, Feature Labs, Inc. 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | include LICENSE 3 | recursive-exclude * __pycache__ 4 | recursive-exclude * *.py[co] 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DLDB 2 | 3 | Deep learning for time-varying multi-entity datasets 4 | 5 | # Installation 6 | 7 | You should be able to just run: 8 | ``` 9 | pip install dldb 10 | ``` 11 | 12 | If that fails due to TensorFlow, please visit [https://www.tensorflow.org/install/](https://www.tensorflow.org/install/) and follow their instructions for installing TensorFlow on your system. 13 | You can also follow their instructions to install the GPU version to allow DLDB to use the GPU. 14 | 15 | Be aware that users have recently reported issues installing TensorFlow on Macs because a new version of gRPC fails to build. If that happens, try installing grpcio==1.9.1 and tensorflow without "-U" or "--upgrade": 16 | 17 | ``` 18 | pip install grpcio==1.9.1 tensorflow 19 | ``` 20 | 21 | # API 22 | 23 | See docstrings in `dldb/preprocessor.py` and `dldb/dldb.py` 24 | 25 | # Graphic 26 | 27 | ![DL Layers Graphic](dldb_fig.png) 28 | 29 | # Usage 30 | ### `DLDB` class 31 | 32 | Builds a recurrent neural network model using Keras from a feature tensor (flattened along the time/sequence dimension into a 2D Pandas DataFrame) and a list of categorical feature names.
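For example, such a feature tensor can be generated with Featuretools DFS by keeping cutoff times in the index, which mirrors how `tests/test.py` builds its input. The snippet below is a minimal sketch; it assumes you already have an `EntitySet` named `es`, a `cutoffs` DataFrame of instance ids and cutoff times, and a target entity called "sessions":

```
import featuretools as ft

feature_tensor, feature_defs = ft.dfs(entityset=es,
                                      target_entity="sessions",
                                      cutoff_time=cutoffs,
                                      cutoff_time_in_index=True)
# sort so each instance's rows are ordered by cutoff time
feature_tensor.sort_index(inplace=True)
```

The resulting DataFrame is indexed by instance id and cutoff time, and `feature_defs` is what `DLDB` uses to identify which features are categorical.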
33 | 34 | Specify hyperparameters in the constructor: 35 | 36 | ``` 37 | dldb = DLDB(regression=False, classes=[False, True], 38 | cell_type='gru') 39 | ``` 40 | 41 | Then fit with the feature tensor, the feature definitions from DFS, and a label for each instance: 42 | ``` 43 | labels = pd.Series([False, True, True], 44 | index=[13458, 13602, 15222]) 45 | dldb.fit(feature_tensor, labels, fl=feature_defs, batch_size=3, epochs=1) 46 | ``` 47 | 48 | Or, if the feature tensor was not generated from DFS, explicitly pass in the categorical feature names: 49 | ``` 50 | dldb.fit(feature_tensor_not_from_dfs, labels, 51 | categorical_feature_names=['categorical1', 'categorical2'], 52 | batch_size=3, epochs=1) 53 | ``` 54 | 55 | And predict: 56 | 57 | ``` 58 | predictions = dldb.predict(feature_tensor) 59 | predictions 60 | >>> array([[0.50211424], 61 | [0.5629099 ], 62 | [0.57218206]], dtype=float32) 63 | ``` 64 | 65 | ### `MLPreprocessor` class 66 | -------------------------------------------------------------------------------- /dldb/__init__.py: -------------------------------------------------------------------------------- 1 | from .dldb import DLDB 2 | from .preprocessor import MLPreprocessor 3 | __version__ = '0.0.2' 4 | -------------------------------------------------------------------------------- /dldb/dldb.py: -------------------------------------------------------------------------------- 1 | from keras.layers import Dense, LSTM, GRU, Embedding, Input, Dropout, BatchNormalization, Conv1D, MaxPooling1D 2 | from keras.models import Model 3 | from keras.preprocessing.sequence import pad_sequences 4 | from keras.utils import Sequence 5 | from .preprocessor import MLPreprocessor 6 | from itertools import groupby 7 | import keras 8 | import numpy as np 9 | import re 10 | import uuid 11 | from math import ceil 12 | 13 | 14 | RNN_CELLS = { 15 | 'lstm': LSTM, 16 | 'gru': GRU, 17 | } 18 | 19 | 20 | def feature_name_to_valid_keras_name(fname): 21 | return re.sub(r'[(.]', '_', fname).replace(')', '') 22 | 23 | 24 | class DLDBInputGenerator(Sequence): 25 | def __init__(self, ftens, 26 | categorical_feature_names, 27 | numeric_input_name, 28 | name_mapping, 29 | numeric_columns, 30 | batch_size=32, 31 | labels=None): 32 | self.ftens = ftens 33 | self.labels = labels 34 | self.instance_id_name = self.ftens.index.names[0] 35 | self.ftens.reset_index(self.instance_id_name, drop=False, inplace=True) 36 | self.batch_size = batch_size 37 | if self.batch_size: 38 | self.batch_col = uuid.uuid4() 39 | 40 | self.ftens[self.batch_col] = self.ftens[self.instance_id_name].astype( 41 | 'category').cat.codes // self.batch_size 42 | self.ftens.set_index(self.batch_col, inplace=True) 43 | 44 | # TODO: figure out what to do about these 45 | self.name_mapping = name_mapping 46 | self.categorical_feature_names = categorical_feature_names 47 | self.numeric_input_name = numeric_input_name 48 | self.numeric_columns = numeric_columns 49 | 50 | self._length = 1 51 | if self.labels is not None: 52 | self.labels = self.labels.to_frame().reset_index( 53 | self.instance_id_name, 54 | drop=False) 55 | if self.batch_size: 56 | self.labels[self.batch_col] = self.labels[self.instance_id_name].astype( 57 | 'category').cat.codes // self.batch_size 58 | self.labels.set_index(self.batch_col, inplace=True) 59 | 60 | self._length = int(ceil(self.labels.shape[0] / self.batch_size)) 61 | else: 62 | self._length = 1  # no batching: the full tensor is served as a single batch 63 | elif self.batch_size: 64 | self._length = int(ceil(self.ftens[self.instance_id_name].nunique() / self.batch_size)) 65 | 66 |
def __len__(self): 67 | return self._length 68 | 69 | def __getitem__(self, idx): 70 | labels = None 71 | if self.batch_size: 72 | ftens = self.ftens.loc[idx].set_index(self.instance_id_name) 73 | if self.labels is not None: 74 | labels = self.labels.loc[idx].set_index(self.instance_id_name) 75 | else: 76 | ftens = self.ftens.set_index(self.instance_id_name) 77 | if self.labels is not None: 78 | labels = self.labels.set_index(self.instance_id_name) 79 | 80 | inputs = {self.name_mapping[f]: self._sequences_from_ftens( 81 | ftens[[f]])[:, :, 0] 82 | for f in self.categorical_feature_names} 83 | if self.numeric_columns: 84 | inputs[self.numeric_input_name] = self._sequences_from_ftens( 85 | ftens[self.numeric_columns]) 86 | if labels is None: 87 | return inputs 88 | else: 89 | return inputs, labels 90 | 91 | def _sequences_from_ftens(self, ftens): 92 | cols = list(ftens.columns) 93 | instance_id_name = ftens.index.names[0] 94 | ftens.reset_index(inplace=True, drop=False) 95 | ftens = ftens[cols + [instance_id_name]] 96 | # TODO: revert back to pandas here? since its batched 97 | sequences = [np.array(list(group))[:, :-1] 98 | for _, group in groupby(ftens.values, lambda row: row[-1])] 99 | sequence_input = pad_sequences(sequences, 100 | padding='pre') 101 | return sequence_input 102 | 103 | 104 | class DLDB(object): 105 | numeric_input_name = 'numeric_input' 106 | 107 | def __init__(self, 108 | regression=False, 109 | classes=None, 110 | cell_type='lstm', 111 | recurrent_layer_sizes=(64, 64), 112 | dense_layer_sizes=(10,), 113 | dense_activation='relu', 114 | dropout_fraction=0.2, 115 | recurrent_dropout_fraction=0.2, 116 | categorical_max_vocab=None, 117 | categorical_embedding_size=10, 118 | conv_kernel_dim=None, 119 | conv_activation='relu', 120 | pool_size=4, 121 | conv_batch_normalization=False, 122 | loss=None, 123 | metrics=None, 124 | optimizer='rmsprop'): 125 | ''' 126 | regression (bool): If True, labels represent continuous values to predict (otherwise represent class labels) 127 | classes (list[object] or np.ndarray[object] or pd.Series[object]): If regression is False, classes contains all possible class labels 128 | cell_type (str or keras.layers.Layer, optional): Type of Keras cell to use for the recurrent layers. Either provide 129 | a Keras layer object, or one of ['lstm', 'gru'] 130 | recurrent_layer_sizes (tuple, optional): Number of units in each recurrent layer in network 131 | dense_layer_sizes (tuple, optional): Number of units in each dense layer in network (which come after recurrent layers) 132 | dense_activation (str, optional): Keras activation function to use for each dense layer 133 | dropout_fraction (float, optional): Fraction of outputs to drop out of each (non-recurrent portion of each) layer 134 | recurrent_dropout_fraction (float, optional): Fraction of outputs to drop out of each recurrent iteration 135 | categorical_max_vocab (int, optional): If provided, will take the top categorical_max_vocab - 1 categories from 136 | each categorical variable, and will set the rest to a single "unknown" category. 
137 | categorical_embedding_size (int, optional): If categorical features provided, will embed them each into 138 | a dense vector of this size 139 | conv_kernel_dim (int, optional): If provided, will add a 1D Convolutional layer prior to the recurrent layers 140 | conv_activation (str, optional): Activation to use for the optional convolutional layer 141 | pool_size (int, optional): Size of max pooling layer that will be used after the convolutional layer if it is present 142 | conv_batch_normalization (bool, optional): If true, will apply batch normalization to the outputs of the convolutional layer 143 | loss (str, optional): loss function to use for gradient calculation. If labels is a Boolean Series, defaults 144 | to `binary_crossentropy`. If labels is an object (multiclass), defaults to `categorical_crossentropy`. 145 | If labels is numeric, defaults to 'mse'. 146 | metrics (list[str], optional): List of metrics for Keras to compute internally on validation set. 147 | If labels is a Boolean Series, defaults 148 | to ['accuracy', 'f1', 'roc_auc']. If labels is an object (multiclass), defaults to ['accuracy', 'f1_macro']. 149 | If labels is numeric, defaults to ['mse', 'r2']. 150 | optimizer (str, optional): Optimizer to use for gradient descent 151 | 152 | ''' 153 | self.regression = regression 154 | self.classes = classes 155 | 156 | if self.regression: 157 | self.output_size = 1 158 | self.loss = loss or 'mse' 159 | elif len(self.classes) == 2: 160 | self.output_size = 1 161 | self.loss = loss or 'binary_crossentropy' 162 | else: 163 | self.output_size = len(self.classes) 164 | self.loss = loss or 'categorical_crossentropy' 165 | 166 | self.cell_type = cell_type 167 | self.recurrent_layer_sizes = recurrent_layer_sizes 168 | self.dense_layer_sizes = dense_layer_sizes 169 | self.dense_activation = dense_activation 170 | self.dropout_fraction = dropout_fraction 171 | self.recurrent_dropout_fraction = recurrent_dropout_fraction 172 | self.categorical_max_vocab = categorical_max_vocab 173 | self.categorical_embedding_size = categorical_embedding_size 174 | self.conv_kernel_dim = conv_kernel_dim 175 | self.conv_activation = conv_activation 176 | self.pool_size = pool_size 177 | self.conv_batch_normalization = conv_batch_normalization 178 | self.metrics = metrics 179 | self.optimizer = optimizer 180 | self.max_values_per_instance = None 181 | self.name_mapping = None 182 | self.ml_preprocessor = MLPreprocessor( 183 | categorical_max_vocab=self.categorical_max_vocab, 184 | classes=self.classes, 185 | regression=self.regression) 186 | 187 | @property 188 | def categorical_vocab(self): 189 | return self.ml_preprocessor.categorical_vocab 190 | 191 | @property 192 | def numeric_columns(self): 193 | return self.ml_preprocessor.numeric_columns 194 | 195 | @property 196 | def categorical_feature_names(self): 197 | return self.ml_preprocessor.categorical_feature_names 198 | 199 | def _preprocess(self, ftens, labels=None, 200 | fl=None, categorical_feature_names=None, 201 | batch_size=32, 202 | fit=True): 203 | if fit: 204 | ftens = self.ml_preprocessor.fit_transform( 205 | ftens, fl=fl, 206 | categorical_feature_names=categorical_feature_names) 207 | self.name_mapping = {c: feature_name_to_valid_keras_name(c) 208 | for c in ftens.columns} 209 | else: 210 | ftens = self.ml_preprocessor.transform(ftens) 211 | return DLDBInputGenerator(ftens, 212 | self.categorical_feature_names, 213 | self.numeric_input_name, 214 | self.name_mapping, 215 | self.numeric_columns, 216 | batch_size=batch_size, 217 | 
labels=labels) 218 | 219 | def partial_fit(self, 220 | ftens=None, 221 | labels=None, 222 | generator=None, 223 | batch_size=32, 224 | **kwargs): 225 | if generator is None: 226 | generator = self._preprocess(ftens, 227 | labels, 228 | batch_size=batch_size, 229 | fit=False) 230 | 231 | return (self.model.fit_generator(generator, 232 | **kwargs), 233 | generator) 234 | 235 | def fit(self, 236 | ftens, 237 | labels, 238 | fl=None, categorical_feature_names=None, 239 | batch_size=32, 240 | **kwargs): 241 | generator = self._preprocess( 242 | ftens, 243 | labels, 244 | fl=fl, 245 | categorical_feature_names=categorical_feature_names, 246 | batch_size=batch_size, 247 | fit=True) 248 | self._compile_keras_model() 249 | return (self.model.fit_generator(generator, 250 | **kwargs), 251 | generator) 252 | 253 | def predict(self, ftens, verbose=1, **kwargs): 254 | if verbose > 0: 255 | print("Transforming input tensor into numeric sequences") 256 | generator = self._preprocess(ftens, batch_size=None, fit=False) 257 | if verbose > 0: 258 | print("Predicting using Keras model") 259 | predictions = self.model.predict_generator(generator, **kwargs) 260 | if verbose > 0: 261 | print("Transforming outputs") 262 | if not self.regression and len(self.classes) > 2: 263 | predictions = np.array([self.ml_preprocessor.lb.classes_[i] 264 | for i in predictions.argmax(axis=1)]) 265 | return predictions 266 | 267 | def _compile_keras_model(self): 268 | inputs = [] 269 | cat_embedding_layers = [] 270 | for i, f in enumerate(self.categorical_feature_names): 271 | feature_max_vocab = len(self.categorical_vocab[f]) + 1 272 | if self.categorical_max_vocab is not None: 273 | feature_max_vocab = min(feature_max_vocab, 274 | self.categorical_max_vocab + 1) 275 | cat_input = Input(shape=(None,), 276 | dtype='int32', 277 | name=self.name_mapping[f]) 278 | inputs.append(cat_input) 279 | embedding = Embedding(output_dim=self.categorical_embedding_size, 280 | input_dim=feature_max_vocab, 281 | mask_zero=True) 282 | embedding = embedding(cat_input) 283 | cat_embedding_layers.append(embedding) 284 | 285 | numeric_input = None 286 | if len(self.numeric_columns) > 0: 287 | numeric_input = Input(shape=(None, 288 | len(self.numeric_columns)), 289 | dtype='float32', 290 | name=self.numeric_input_name) 291 | inputs.append(numeric_input) 292 | 293 | rnn_inputs = [] 294 | rnn_input_size = 0 295 | if len(cat_embedding_layers): 296 | rnn_inputs.extend(cat_embedding_layers) 297 | rnn_input_size += (self.categorical_embedding_size * 298 | len(cat_embedding_layers)) 299 | if numeric_input is not None: 300 | rnn_inputs.append(numeric_input) 301 | rnn_input_size += len(self.numeric_columns) 302 | if len(rnn_inputs) > 1: 303 | rnn_inputs = keras.layers.concatenate(rnn_inputs) 304 | else: 305 | rnn_inputs = rnn_inputs[0] 306 | 307 | if self.conv_kernel_dim is not None: 308 | 309 | conv_layer = Conv1D(self.categorical_embedding_size//2, 310 | self.conv_kernel_dim, 311 | activation=self.conv_activation) 312 | if self.conv_batch_normalization: 313 | rnn_inputs = BatchNormalization()(rnn_inputs) 314 | conv_layer = conv_layer(rnn_inputs) 315 | mp_layer = MaxPooling1D(pool_size=self.pool_size) 316 | rnn_inputs = mp_layer(conv_layer) 317 | 318 | if isinstance(self.cell_type, str): 319 | self.RNNCell = RNN_CELLS[self.cell_type] 320 | else: 321 | self.RNNCell = self.cell_type 322 | prev_layer = rnn_inputs 323 | for i, layer_size in enumerate(self.recurrent_layer_sizes): 324 | return_sequences = True 325 | if i == len(self.recurrent_layer_sizes) - 1: 326 |
return_sequences = False 327 | layer = self.RNNCell( 328 | layer_size, 329 | return_sequences=return_sequences, 330 | dropout=self.dropout_fraction, 331 | recurrent_dropout=self.recurrent_dropout_fraction) 332 | layer = layer(prev_layer) 333 | prev_layer = layer 334 | for layer_size in self.dense_layer_sizes: 335 | layer = Dense(layer_size, 336 | activation=self.dense_activation)(prev_layer) 337 | dropout_layer = Dropout(self.dropout_fraction)(layer) 338 | prev_layer = dropout_layer 339 | 340 | output_layer = Dense(self.output_size, activation='sigmoid', 341 | name='target')(prev_layer) 342 | self.model = Model(inputs=inputs, outputs=output_layer) 343 | self.model.compile(optimizer=self.optimizer, loss=self.loss) 344 | -------------------------------------------------------------------------------- /dldb/preprocessor.py: -------------------------------------------------------------------------------- 1 | from featuretools.variable_types import Discrete, Boolean 2 | import numpy as np 3 | from sklearn.preprocessing import MinMaxScaler, LabelBinarizer 4 | import pandas as pd 5 | 6 | 7 | class MLPreprocessor(object): 8 | def __init__(self, 9 | categorical_max_vocab=None, 10 | classes=None, 11 | regression=False): 12 | self.categorical_max_vocab = categorical_max_vocab 13 | self.classes = classes 14 | self.regression = regression 15 | self.categorical_vocab = None 16 | 17 | def fit_transform(self, ftens, fl=None, categorical_feature_names=None, labels=None): 18 | if categorical_feature_names is not None: 19 | self.categorical_feature_names = categorical_feature_names 20 | elif fl is not None: 21 | self.categorical_feature_names = [f.get_name() for f in fl 22 | if issubclass(f.variable_type, 23 | Discrete) 24 | and not 25 | f.variable_type == Boolean] 26 | else: 27 | self.categorical_feature_names = [c for c in ftens.columns 28 | if ftens[c].dtype == object] 29 | 30 | # Can't handle multiindex 31 | if len(ftens.index.names) > 1: 32 | index_name = ftens.index.names[0] 33 | ftens = ftens.reset_index(index_name, drop=False).set_index(index_name) 34 | self.categorical_vocab = self._gen_categorical_mapping(ftens) 35 | 36 | self.numeric_columns = [f for f in ftens.columns 37 | if f not in self.categorical_feature_names] 38 | 39 | ftens = self.fit_transform_scaler_imputer(ftens) 40 | 41 | if not self.regression: 42 | self.lb = LabelBinarizer().fit(self.classes) 43 | 44 | if labels is not None: 45 | return ftens, self.transform_labels(labels) 46 | else: 47 | return ftens 48 | 49 | def fit_transform_scaler_imputer(self, ftens): 50 | self.fill_vals = {} 51 | new_ftens = ftens 52 | if len(self.numeric_columns) > 0: 53 | numeric_ftens = ftens[self.numeric_columns] 54 | 55 | numeric_ftens = numeric_ftens.astype(np.float32) 56 | for f in self.numeric_columns: 57 | if ftens[f].dropna().shape[0] == 0: 58 | fill_val = 0 59 | else: 60 | fill_val = numeric_ftens[f].dropna().mean() 61 | self.fill_vals[f] = fill_val 62 | numeric_ftens.loc[~np.isfinite(numeric_ftens[f]), f] = np.nan 63 | numeric_ftens.fillna(value=self.fill_vals, inplace=True) 64 | self.scaler = MinMaxScaler() 65 | numeric_ftens = self.scaler.fit_transform(numeric_ftens) 66 | new_ftens[self.numeric_columns] = numeric_ftens 67 | 68 | return self._map_categorical_ftens_to_int(new_ftens) 69 | 70 | def transform(self, ftens, labels=None): 71 | ftens = self._map_categorical_ftens_to_int(ftens) 72 | if len(self.numeric_columns) > 0: 73 | numeric_ftens = ftens[self.numeric_columns] 74 | numeric_ftens = numeric_ftens.astype(np.float32) 75 | for f in 
self.numeric_columns: 76 | vals = numeric_ftens[f] 77 | numeric_ftens.loc[~np.isfinite(numeric_ftens[f]), f] = np.nan 78 | if vals.dropna().shape[0] != vals.shape[0]: 79 | numeric_ftens[f].fillna(self.fill_vals[f], inplace=True) 80 | numeric_ftens = self.scaler.transform(numeric_ftens) 81 | ftens[self.numeric_columns] = numeric_ftens 82 | if labels is not None: 83 | return ftens, self.transform_labels(labels) 84 | else: 85 | return ftens 86 | 87 | def transform_labels(self, labels): 88 | if not self.regression: 89 | labels = pd.Series(labels).astype(int) 90 | if len(self.classes) > 2: 91 | labels = self.lb.transform(labels) 92 | return labels 93 | 94 | def _map_categorical_ftens_to_int(self, ftens): 95 | new_ftens = ftens 96 | for f in self.categorical_feature_names: 97 | numeric_series, new_mapping = self._map_categorical_series_to_int( 98 | ftens[f], 99 | self.categorical_vocab.get(f, None)) 100 | new_ftens[f] = numeric_series 101 | self.categorical_vocab[f] = new_mapping 102 | return new_ftens 103 | 104 | def _gen_categorical_mapping(self, ftens): 105 | categorical_vocab = {} 106 | for f in self.categorical_feature_names: 107 | val_counts = ftens[f].dropna().value_counts() 108 | mapping = {v: k + 1 for k, v in enumerate(val_counts.index)} 109 | mapping[np.nan] = 0 110 | if (self.categorical_max_vocab is not None and 111 | self.categorical_max_vocab < len(val_counts)): 112 | num_unique = len(val_counts) - self.categorical_max_vocab 113 | unknown = val_counts.tail(num_unique).index.tolist() 114 | mapping.update({u: 0 for u in unknown}) 115 | categorical_vocab[f] = mapping 116 | return categorical_vocab 117 | 118 | def _map_categorical_series_to_int(self, input_series, 119 | mapping): 120 | unique_vals = set(input_series.unique()) 121 | # make sure we don't add any new nans 122 | # since id(np.float64('nan')) != id(np.nan), 123 | # and so we could end up with multiple nans in the 124 | # mapping dict 125 | new_mapping = {u: 0 for u in unique_vals if not pd.isnull(u)} 126 | new_mapping.update(mapping) 127 | numeric = input_series.map(new_mapping) 128 | numeric.fillna(0, inplace=True) 129 | return numeric, new_mapping 130 | -------------------------------------------------------------------------------- /dldb_fig.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alteryx/DL-DB/32e75ed9235aaf5a6183d7f20cfd2dbd59ccbe18/dldb_fig.png -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup, find_packages 3 | 4 | PACKAGE_NAME = 'dldb' 5 | 6 | 7 | def read_package_variable(key): 8 | """Read the value of a variable from the package without importing.""" 9 | module_path = os.path.join(PACKAGE_NAME, '__init__.py') 10 | with open(module_path) as module: 11 | for line in module: 12 | parts = line.strip().split(' ') 13 | if parts and parts[0] == key: 14 | return parts[-1].strip("'") 15 | assert False, "'{0}' not found in '{1}'".format(key, module_path) 16 | 17 | 18 | setup( 19 | name=PACKAGE_NAME, 20 | version=read_package_variable('__version__'), 21 | description='Deep learning for relational datasets with a time-component', 22 | 
packages=find_packages(), 23 | python_requires='>=3', 24 | install_requires=[ 25 | 'featuretools>=0.1.20', 26 | 'keras>=2.1.4', 27 | 'scikit-learn>=0.19.1', 28 | 'tensorflow>=1.6.0', 29 | ], 30 | url='https://github.com/HDI-Project/DL-DB', 31 | ) 32 | -------------------------------------------------------------------------------- /tests/labeling_utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from tqdm import tqdm 4 | 5 | def create_labels(entityset, 6 | min_training_data='28 days', 7 | lead='7 days', 8 | window='28 days', 9 | reduce='sum', 10 | binarize=None, 11 | iterate_by=None): 12 | label_cols = ['quantity', 'price'] 13 | time_index = "order_date" 14 | index = "customer_id" 15 | df = entityset['orders'].df.merge( 16 | entityset['order_products'].df, how='outer') 17 | 18 | tqdm.pandas(desc="Creating Labels", unit="customer") 19 | 20 | # # Only use data after one of the label columns has been non-null 21 | # for i, v in df[label_cols].iterrows(): 22 | # if v.dropna(how='all').shape[0] > 0: 23 | # df = df.loc[slice(i, None), :] 24 | # break 25 | grouped = df.groupby(index, as_index=True) 26 | 27 | project_cutoff_dates = grouped.progress_apply( 28 | lambda df: make_labels_from_windows( 29 | df, 30 | cols=label_cols, 31 | min_training_data=min_training_data, 32 | lead=lead, window=window, 33 | index_col=index, 34 | date_col=time_index, 35 | reduce=reduce, 36 | iterate_by=iterate_by)) 37 | 38 | project_cutoff_dates = project_cutoff_dates.dropna() 39 | 40 | cutoff_with_labels = (project_cutoff_dates.reset_index(level=0) 41 | .reset_index() 42 | .rename(columns={'index': 'time', 43 | 0: 'label'})) 44 | if binarize: 45 | cutoff_with_labels['label'] = binarize(cutoff_with_labels['label']) 46 | 47 | return (cutoff_with_labels[[index, "time", "label"]] 48 | .sort_values(["time", index])) 49 | 50 | 51 | def sample_labels(labels, random_seed=1, n=1, gap=None): 52 | """ 53 | Select 1 label per customer 54 | """ 55 | def sample(df): 56 | if gap is not None: 57 | samples = [df.iloc[0]] 58 | for i, row in df.iloc[1:].iterrows(): 59 | if row['time'] - samples[-1]['time'] > gap: 60 | samples.append(row) 61 | samples = pd.DataFrame(samples) 62 | return samples.sample(min(n, samples.shape[0]), random_state=random_seed) 63 | else: 64 | return df.sample(min(n, df.shape[0]), random_state=random_seed) 65 | 66 | labels = labels.groupby(labels['customer_id']).apply(sample) 67 | return labels.sort_values(['time', 'customer_id']) 68 | 69 | 70 | def make_labels_from_windows(df, cols, 71 | min_training_data, lead, window, 72 | index_col, date_col, 73 | reduce='min', iterate_by=None): 74 | customer_id = df[index_col].iloc[0] 75 | 76 | if iterate_by is not None: 77 | iterate_by = pd.Timedelta(iterate_by) 78 | 79 | vals = df[[date_col] + cols] 80 | 81 | date_series = vals[date_col] 82 | start = date_series.min() + pd.Timedelta(min_training_data) 83 | end = date_series.max() 84 | 85 | if end - start < pd.Timedelta(lead): 86 | return pd.Series([np.nan], index=[pd.NaT], name=customer_id) 87 | else: 88 | labels = iterate_through_cutoffs(vals, start, end, 89 | pd.Timedelta(window), 90 | pd.Timedelta(lead), 91 | cols, 92 | date_col, 93 | reduce, 94 | iterate_by=iterate_by) 95 | labels.name = customer_id 96 | return labels 97 | 98 | 99 | def iterate_through_cutoffs(vals, start, end, window, lead, cols, 100 | date_col, 101 | reduce, iterate_by): 102 | labels = [] 103 | cutoffs = [] 104 | cutoff = start 105 | if iterate_by is None: 
106 | iterate_by = window 107 | 108 | while cutoff + lead < end: 109 | start_window = cutoff + lead 110 | end_window = start_window + window 111 | _vals = vals[(vals[date_col] > start_window) & 112 | (vals[date_col] < end_window)] 113 | 114 | label_vals = np.multiply(*[_vals[c] for c in cols]) 115 | label = getattr(label_vals.dropna(), reduce)() 116 | labels.append(label) 117 | cutoffs.append(cutoff) 118 | cutoff = cutoff + iterate_by 119 | return pd.Series(labels, index=cutoffs) 120 | -------------------------------------------------------------------------------- /tests/test.py: -------------------------------------------------------------------------------- 1 | from featuretools.tests.testing_utils import make_ecommerce_entityset 2 | from sklearn.model_selection import train_test_split 3 | from sklearn.metrics import roc_auc_score, f1_score, mean_absolute_error 4 | from sklearn.ensemble import RandomForestClassifier 5 | from sklearn.svm import SVC 6 | from sklearn.linear_model import LogisticRegression 7 | from sklearn.preprocessing import StandardScaler, Imputer 8 | import pandas as pd 9 | import numpy as np 10 | import featuretools as ft 11 | from featuretools.selection import remove_low_information_features 12 | from dldb import DLDB 13 | from testing_utils import construct_retail_example 14 | 15 | 16 | def f1_macro(actual, predicted): 17 | return f1_score(actual, predicted, average='macro') 18 | 19 | 20 | def test_ecommerce(): 21 | es = make_ecommerce_entityset() 22 | cutoffs = es['log'].df[['session_id', 'datetime']] 23 | cutoffs = cutoffs.rename(columns={'session_id': 'id'}) 24 | ftens, fl = ft.dfs(entityset=es, 25 | cutoff_time=cutoffs, 26 | target_entity="sessions", 27 | cutoff_time_in_index=True) 28 | ftens.sort_index(inplace=True) 29 | 30 | ids = ftens.index.get_level_values('id').drop_duplicates() 31 | n_instances = ids.shape[0] 32 | 33 | labels_binary = [i % 2 for i in range(n_instances)] 34 | labels_multiclass = np.random.randint(10, size=(n_instances,)) 35 | labels_regression = np.random.random(size=(n_instances,)) 36 | labels = pd.DataFrame({'label_binary': labels_binary, 37 | 'label_multiclass': labels_multiclass, 38 | 'label_regression': labels_regression}, 39 | index=ids) 40 | 41 | ftens = (ftens.reset_index('id', drop=False) 42 | .merge(labels, left_on='id', 43 | right_index=True, 44 | how='left') 45 | .set_index('id', append=True) 46 | ) 47 | 48 | train_ftens, test_ftens = train_test_split( 49 | ftens, test_size=0.4, shuffle=False) 50 | train_labels = train_ftens[labels.columns] 51 | test_labels = test_ftens[labels.columns] 52 | for c in labels.columns: 53 | del train_ftens[c] 54 | del test_ftens[c] 55 | 56 | scores = {} 57 | scoring_functions = {'label_regression': mean_absolute_error, 58 | 'label_binary': roc_auc_score, 59 | 'label_multiclass': f1_macro} 60 | for label_type in labels.columns: 61 | classes = labels[label_type].unique() 62 | dl_model = DLDB( 63 | regression=label_type == 'label_regression', 64 | classes=classes, 65 | categorical_max_vocab=10) 66 | dl_model.fit(train_ftens, 67 | train_labels[label_type].values, 68 | fl=fl, 69 | epochs=1, 70 | batch_size=4) 71 | predictions = dl_model.predict(test_ftens) 72 | score = scoring_functions[label_type](test_labels[label_type].values, 73 | predictions) 74 | scores[label_type] = score 75 | return scores 76 | 77 | 78 | def test_retail_binary(ftens_file='retail_binary_files/ftens.csv', 79 | labels_file='retail_binary_files/labels.csv', 80 | fl_file='retail_binary_files/fl.p'): 81 | ftens, labels, fl = 
construct_retail_example(ftens_file, labels_file, fl_file) 82 | baseline_ftens = (ftens.reset_index('customer_id', drop=False) 83 | .drop_duplicates('customer_id', keep='last') 84 | .set_index('customer_id')) 85 | baseline_ftens, baseline_fl = ft.encode_features(baseline_ftens, fl) 86 | baseline_ftens, baseline_fl = remove_low_information_features(baseline_ftens, baseline_fl) 87 | train_customers, test_customers = train_test_split(baseline_ftens.index.values, shuffle=True, test_size=0.1) 88 | train_labels = labels.loc[train_customers] 89 | test_labels = labels.loc[test_customers] 90 | train_ftens = ftens.loc[(train_customers, slice(None)), :] 91 | test_ftens = ftens.loc[(test_customers, slice(None)), :] 92 | baseline_train_fm = baseline_ftens.loc[train_customers, :] 93 | baseline_test_fm = baseline_ftens.loc[test_customers, :] 94 | 95 | dl_model = DLDB( 96 | regression=False, 97 | classes=[False, True], 98 | recurrent_layer_sizes=(32,), 99 | dense_layer_sizes=(32, 32), 100 | categorical_max_vocab=10) 101 | dl_model.fit( 102 | train_ftens, 103 | train_labels, 104 | fl=fl, 105 | epochs=1, 106 | batch_size=32) 107 | predictions = dl_model.predict(test_ftens) 108 | score = roc_auc_score(test_labels, predictions) 109 | 110 | baseline_scores = score_baseline_pipeline(baseline_train_fm, 111 | train_labels, 112 | baseline_test_fm, 113 | test_labels) 114 | return score, baseline_scores 115 | 116 | 117 | def score_baseline_pipeline(X_train, y_train, X_test, y_test, **hyperparams): 118 | feature_names = X_train.columns 119 | imputer = Imputer(missing_values='NaN', strategy='mean', axis=0) 120 | X_train = imputer.fit_transform(X_train) 121 | scaler = StandardScaler() 122 | X_train = scaler.fit_transform(X_train) 123 | X_train = pd.DataFrame(X_train, columns=feature_names) 124 | 125 | original_train_fm = X_train 126 | select_n_features = hyperparams.get('select_n_features', 200) 127 | selector_rf = RandomForestClassifier(n_estimators=hyperparams.get('selector_n_estimators', 1000), 128 | class_weight='balanced', 129 | n_jobs=-1, 130 | verbose=True) 131 | selector_rf.fit(original_train_fm, y_train) 132 | 133 | importances = sorted(zip(selector_rf.feature_importances_, feature_names), 134 | key=lambda x: x[0], reverse=True) 135 | selected = [i[1] for i in importances[:select_n_features]] 136 | 137 | X_train = original_train_fm[selected] 138 | 139 | # Train another Random Forest on selected features as our model 140 | 141 | model_rf = RandomForestClassifier(n_estimators=hyperparams.get('n_estimators', 400), 142 | class_weight='balanced', 143 | n_jobs=-1) 144 | model_rf.fit(X_train, y_train) 145 | 146 | model_svm = SVC() 147 | model_svm.fit(X_train, y_train) 148 | 149 | model_lr = LogisticRegression() 150 | model_lr.fit(X_train, y_train) 151 | 152 | X_test = imputer.transform(X_test) 153 | X_test = scaler.transform(X_test) 154 | X_test = pd.DataFrame(X_test, columns=feature_names) 155 | X_test = X_test[selected] 156 | 157 | # Predict targets for test data 158 | 159 | predicted_targets = model_rf.predict(X_test) 160 | predicted_targets_svm = model_svm.predict(X_test) 161 | predicted_targets_lr = model_lr.predict(X_test) 162 | 163 | # Compute metrics 164 | 165 | score_rf = roc_auc_score(y_test, predicted_targets) 166 | score_svm = roc_auc_score(y_test, predicted_targets_svm) 167 | score_lr = roc_auc_score(y_test, predicted_targets_lr) 168 | return {'rf': score_rf, 'svm': score_svm, 'lr': score_lr} 169 | 170 | 171 | if __name__ == '__main__': 172 | #scores = test_ecommerce() 173 | score, baseline_scores = 
test_retail_binary() 174 | print("ROC score:", score) 175 | print("Baseline ROC scores (using RF, SVM, LogisticRegression):", baseline_scores) 176 | -------------------------------------------------------------------------------- /tests/testing_utils.py: -------------------------------------------------------------------------------- 1 | from labeling_utils import create_labels, sample_labels 2 | import pandas as pd 3 | import featuretools as ft 4 | import os 5 | 6 | 7 | def construct_retail_example(ftens_file='retail_binary_files/ftens.csv', 8 | labels_file='retail_binary_files/labels.csv', 9 | fl_file='retail_binary_files/fl.p'): 10 | es = ft.demo.load_retail() 11 | if os.path.exists(ftens_file): 12 | ftens = pd.read_csv(ftens_file, index_col=['customer_id', 'time'], parse_dates=['time']) 13 | labels = pd.read_csv(labels_file, index_col='customer_id')['label'] 14 | fl = ft.load_features(fl_file, es) 15 | else: 16 | labels = create_labels(es, 17 | min_training_data='8 days', 18 | lead='7 days', 19 | window='30 days', 20 | reduce='sum', 21 | binarize=None, 22 | iterate_by=None) 23 | labels_binary = labels.copy() 24 | labels_binary['label'] = labels_binary['label'] > 300 25 | sampled = sample_labels(labels_binary, n=1) 26 | sampled = sampled[['customer_id', 'time', 'label']] 27 | sampled = sampled.sample(300) 28 | 29 | ftens, fl = ft.tdfs(target_entity='customers', 30 | entityset=es, 31 | cutoffs=sampled, 32 | window_size='30d', 33 | num_windows=5, 34 | verbose=True) 35 | 36 | ftens = (ftens.reset_index('customer_id', drop=False) 37 | .reset_index(drop=False) 38 | .merge(sampled[['customer_id', 'label']], 39 | on='customer_id', 40 | how='left') 41 | .set_index('customer_id') 42 | .set_index('time', append=True)) 43 | 44 | labels = (ftens['label'] 45 | .reset_index('customer_id', drop=False) 46 | .drop_duplicates('customer_id') 47 | .set_index('customer_id')) 48 | del ftens['label'] 49 | ftens.to_csv(ftens_file) 50 | labels.to_csv(labels_file) 51 | labels = labels['label'] 52 | ft.save_features(fl, fl_file) 53 | return ftens, labels, fl 54 | 55 | 56 | if __name__ == '__main__': 57 | if not os.path.exists('retail_binary_files'): 58 | os.makedirs('retail_binary_files') 59 | construct_retail_example() 60 | --------------------------------------------------------------------------------