├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── dldb ├── __init__.py ├── dldb.py └── preprocessor.py ├── dldb_fig.png ├── setup.cfg ├── setup.py └── tests ├── labeling_utils.py ├── test.py └── testing_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | **/.DS_Store 6 | .DS_Store 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | env/ 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # dotenv 85 | .env 86 | 87 | # virtualenv 88 | .venv 89 | venv/ 90 | ENV/ 91 | 92 | # Spyder project settings 93 | .spyderproject 94 | .spyproject 95 | 96 | # Rope project settings 97 | .ropeproject 98 | 99 | # mkdocs documentation 100 | /site 101 | 102 | # mypy 103 | .mypy_cache/ 104 | 105 | # pickle files 106 | *.p 107 | *.pickle 108 | 109 | src 110 | 111 | retail_binary_files 112 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2017, Feature Labs, Inc. 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | include LICENSE 3 | recursive-exclude * __pycache__ 4 | recursive-exclude * *.py[co] 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DLDB 2 | 3 | Deep learning for time-varying multi-entity datasets 4 | 5 | # Installation 6 | 7 | You should be able to just run: 8 | ``` 9 | pip install dldb 10 | ``` 11 | 12 | If that fails due to TensorFlow, please visit [https://www.tensorflow.org/install/](https://www.tensorflow.org/install/) and follow their instructions for installing TensorFlow on your system. 13 | You can also follow their instructions to install the GPU version to allow DLDB to use the GPU. 14 | 15 | Be aware that users have recently reported issues installing TensorFlow on Macs because a new version of gRPC fails to build. If that happens, try installing grpcio==1.9.1 and tensorflow without "-U" or "--upgrade": 16 | 17 | ``` 18 | pip install grpcio==1.9.1 tensorflow 19 | ``` 20 | 21 | # API 22 | 23 | See docstrings in `dldb/preprocessor.py` and `dldb/dldb.py` 24 | 25 | # Graphic 26 | 27 | ![DL Layers Graphic](dldb_fig.png) 28 | 29 | # Usage 30 | ### `DLDB` class 31 | 32 | Builds a recurrent neural network model using Keras from a feature tensor (flattened along the time/sequence dimension into a 2D Pandas DataFrame) and a list of categorical feature names.
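For example, such a feature tensor can be generated with Featuretools DFS by keeping cutoff times in the index, which mirrors how `tests/test.py` builds its input. The snippet below is a minimal sketch; it assumes you already have an `EntitySet` named `es`, a `cutoffs` DataFrame of instance ids and cutoff times, and a target entity called "sessions":

```
import featuretools as ft

feature_tensor, feature_defs = ft.dfs(entityset=es,
                                      target_entity="sessions",
                                      cutoff_time=cutoffs,
                                      cutoff_time_in_index=True)
# sort so each instance's rows are ordered by cutoff time
feature_tensor.sort_index(inplace=True)
```

The resulting DataFrame is indexed by instance id and cutoff time, and `feature_defs` is what `DLDB` uses to identify which features are categorical.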
33 | 34 | Specify hyperparameters in the constructor: 35 | 36 | ``` 37 | dldb = DLDB(regression=False, classes=[False, True], 38 | cell_type='gru') 39 | ``` 40 | 41 | Then fit with the feature tensor, the feature definitions from DFS, and a label for each instance: 42 | ``` 43 | labels = pd.Series([False, True, True], 44 | index=[13458, 13602, 15222]) 45 | dldb.fit(feature_tensor, labels, fl=feature_defs, batch_size=3, epochs=1) 46 | ``` 47 | 48 | Or, if the feature tensor was not generated from DFS, explicitly pass in the categorical feature names: 49 | ``` 50 | dldb.fit(feature_tensor_not_from_dfs, labels, 51 | categorical_feature_names=['categorical1', 'categorical2'], 52 | batch_size=3, epochs=1) 53 | ``` 54 | 55 | And predict: 56 | 57 | ``` 58 | predictions = dldb.predict(feature_tensor) 59 | predictions 60 | >>> array([[0.50211424], 61 | [0.5629099 ], 62 | [0.57218206]], dtype=float32) 63 | ``` 64 | 65 | ### `MLPreprocessor` class 66 | -------------------------------------------------------------------------------- /dldb/__init__.py: -------------------------------------------------------------------------------- 1 | from .dldb import DLDB 2 | from .preprocessor import MLPreprocessor 3 | __version__ = '0.0.2' 4 | -------------------------------------------------------------------------------- /dldb/dldb.py: -------------------------------------------------------------------------------- 1 | from keras.layers import Dense, LSTM, GRU, Embedding, Input, Dropout, BatchNormalization, Conv1D, MaxPooling1D 2 | from keras.models import Model 3 | from keras.preprocessing.sequence import pad_sequences 4 | from keras.utils import Sequence 5 | from .preprocessor import MLPreprocessor 6 | from itertools import groupby 7 | import keras 8 | import numpy as np 9 | import re 10 | import uuid 11 | from math import ceil 12 | 13 | 14 | RNN_CELLS = { 15 | 'lstm': LSTM, 16 | 'gru': GRU, 17 | } 18 | 19 | 20 | def feature_name_to_valid_keras_name(fname): 21 | return re.sub(r'[(.]', '_', fname).replace(')', '') 22 | 23 | 24 | class DLDBInputGenerator(Sequence): 25 | def __init__(self, ftens, 26 | categorical_feature_names, 27 | numeric_input_name, 28 | name_mapping, 29 | numeric_columns, 30 | batch_size=32, 31 | labels=None): 32 | self.ftens = ftens 33 | self.labels = labels 34 | self.instance_id_name = self.ftens.index.names[0] 35 | self.ftens.reset_index(self.instance_id_name, drop=False, inplace=True) 36 | self.batch_size = batch_size 37 | if self.batch_size: 38 | self.batch_col = uuid.uuid4() 39 | 40 | self.ftens[self.batch_col] = self.ftens[self.instance_id_name].astype( 41 | 'category').cat.codes // self.batch_size 42 | self.ftens.set_index(self.batch_col, inplace=True) 43 | 44 | # TODO: figure out what to do about these 45 | self.name_mapping = name_mapping 46 | self.categorical_feature_names = categorical_feature_names 47 | self.numeric_input_name = numeric_input_name 48 | self.numeric_columns = numeric_columns 49 | 50 | self._length = 1 51 | if self.labels is not None: 52 | self.labels = self.labels.to_frame().reset_index( 53 | self.instance_id_name, 54 | drop=False) 55 | if self.batch_size: 56 | self.labels[self.batch_col] = self.labels[self.instance_id_name].astype( 57 | 'category').cat.codes // self.batch_size 58 | self.labels.set_index(self.batch_col, inplace=True) 59 | 60 | self._length = int(ceil(self.labels.shape[0] / self.batch_size)) 61 | else: 62 | self._length = 1  # no batching: the full tensor is served as a single batch 63 | elif self.batch_size: 64 | self._length = int(ceil(self.ftens[self.instance_id_name].nunique() / self.batch_size)) 65 | 66 |
def __len__(self): 67 | return self._length 68 | 69 | def __getitem__(self, idx): 70 | labels = None 71 | if self.batch_size: 72 | ftens = self.ftens.loc[idx].set_index(self.instance_id_name) 73 | if self.labels is not None: 74 | labels = self.labels.loc[idx].set_index(self.instance_id_name) 75 | else: 76 | ftens = self.ftens.set_index(self.instance_id_name) 77 | if self.labels is not None: 78 | labels = self.labels.set_index(self.instance_id_name) 79 | 80 | inputs = {self.name_mapping[f]: self._sequences_from_ftens( 81 | ftens[[f]])[:, :, 0] 82 | for f in self.categorical_feature_names} 83 | if self.numeric_columns: 84 | inputs[self.numeric_input_name] = self._sequences_from_ftens( 85 | ftens[self.numeric_columns]) 86 | if labels is None: 87 | return inputs 88 | else: 89 | return inputs, labels 90 | 91 | def _sequences_from_ftens(self, ftens): 92 | cols = list(ftens.columns) 93 | instance_id_name = ftens.index.names[0] 94 | ftens.reset_index(inplace=True, drop=False) 95 | ftens = ftens[cols + [instance_id_name]] 96 | # TODO: revert back to pandas here? since its batched 97 | sequences = [np.array(list(group))[:, :-1] 98 | for _, group in groupby(ftens.values, lambda row: row[-1])] 99 | sequence_input = pad_sequences(sequences, 100 | padding='pre') 101 | return sequence_input 102 | 103 | 104 | class DLDB(object): 105 | numeric_input_name = 'numeric_input' 106 | 107 | def __init__(self, 108 | regression=False, 109 | classes=None, 110 | cell_type='lstm', 111 | recurrent_layer_sizes=(64, 64), 112 | dense_layer_sizes=(10,), 113 | dense_activation='relu', 114 | dropout_fraction=0.2, 115 | recurrent_dropout_fraction=0.2, 116 | categorical_max_vocab=None, 117 | categorical_embedding_size=10, 118 | conv_kernel_dim=None, 119 | conv_activation='relu', 120 | pool_size=4, 121 | conv_batch_normalization=False, 122 | loss=None, 123 | metrics=None, 124 | optimizer='rmsprop'): 125 | ''' 126 | regression (bool): If True, labels represent continuous values to predict (otherwise represent class labels) 127 | classes (list[object] or np.ndarray[object] or pd.Series[object]): If regression is False, classes contains all possible class labels 128 | cell_type (str or keras.layers.Layer, optional): Type of Keras cell to use for the recurrent layers. Either provide 129 | a Keras layer object, or one of ['lstm', 'gru'] 130 | recurrent_layer_sizes (tuple, optional): Number of units in each recurrent layer in network 131 | dense_layer_sizes (tuple, optional): Number of units in each dense layer in network (which come after recurrent layers) 132 | dense_activation (str, optional): Keras activation function to use for each dense layer 133 | dropout_fraction (float, optional): Fraction of outputs to drop out of each (non-recurrent portion of each) layer 134 | recurrent_dropout_fraction (float, optional): Fraction of outputs to drop out of each recurrent iteration 135 | categorical_max_vocab (int, optional): If provided, will take the top categorical_max_vocab - 1 categories from 136 | each categorical variable, and will set the rest to a single "unknown" category. 
137 | categorical_embedding_size (int, optional): If categorical features provided, will embed them each into 138 | a dense vector of this size 139 | conv_kernel_dim (int, optional): If provided, will add a 1D Convolutional layer prior to the recurrent layers 140 | conv_activation (str, optional): Activation to use for the optional convolutional layer 141 | pool_size (int, optional): Size of max pooling layer that will be used after the convolutional layer if it is present 142 | conv_batch_normalization (bool, optional): If true, will apply batch normalization to the outputs of the convolutional layer 143 | loss (str, optional): loss function to use for gradient calculation. If labels is a Boolean Series, defaults 144 | to `binary_crossentropy`. If labels is an object (multiclass), defaults to `categorical_crossentropy`. 145 | If labels is numeric, defaults to 'mse'. 146 | metrics (list[str], optional): List of metrics for Keras to compute internally on validation set. 147 | If labels is a Boolean Series, defaults 148 | to ['accuracy', 'f1', 'roc_auc']. If labels is an object (multiclass), defaults to ['accuracy', 'f1_macro']. 149 | If labels is numeric, defaults to ['mse', 'r2']. 150 | optimizer (str, optional): Optimizer to use for gradient descent 151 | 152 | ''' 153 | self.regression = regression 154 | self.classes = classes 155 | 156 | if self.regression: 157 | self.output_size = 1 158 | self.loss = loss or 'mse' 159 | elif len(self.classes) == 2: 160 | self.output_size = 1 161 | self.loss = loss or 'binary_crossentropy' 162 | else: 163 | self.output_size = len(self.classes) 164 | self.loss = loss or 'categorical_crossentropy' 165 | 166 | self.cell_type = cell_type 167 | self.recurrent_layer_sizes = recurrent_layer_sizes 168 | self.dense_layer_sizes = dense_layer_sizes 169 | self.dense_activation = dense_activation 170 | self.dropout_fraction = dropout_fraction 171 | self.recurrent_dropout_fraction = recurrent_dropout_fraction 172 | self.categorical_max_vocab = categorical_max_vocab 173 | self.categorical_embedding_size = categorical_embedding_size 174 | self.conv_kernel_dim = conv_kernel_dim 175 | self.conv_activation = conv_activation 176 | self.pool_size = pool_size 177 | self.conv_batch_normalization = conv_batch_normalization 178 | self.metrics = metrics 179 | self.optimizer = optimizer 180 | self.max_values_per_instance = None 181 | self.name_mapping = None 182 | self.ml_preprocessor = MLPreprocessor( 183 | categorical_max_vocab=self.categorical_max_vocab, 184 | classes=self.classes, 185 | regression=self.regression) 186 | 187 | @property 188 | def categorical_vocab(self): 189 | return self.ml_preprocessor.categorical_vocab 190 | 191 | @property 192 | def numeric_columns(self): 193 | return self.ml_preprocessor.numeric_columns 194 | 195 | @property 196 | def categorical_feature_names(self): 197 | return self.ml_preprocessor.categorical_feature_names 198 | 199 | def _preprocess(self, ftens, labels=None, 200 | fl=None, categorical_feature_names=None, 201 | batch_size=32, 202 | fit=True): 203 | if fit: 204 | ftens = self.ml_preprocessor.fit_transform( 205 | ftens, fl=fl, 206 | categorical_feature_names=categorical_feature_names) 207 | self.name_mapping = {c: feature_name_to_valid_keras_name(c) 208 | for c in ftens.columns} 209 | else: 210 | ftens = self.ml_preprocessor.transform(ftens) 211 | return DLDBInputGenerator(ftens, 212 | self.categorical_feature_names, 213 | self.numeric_input_name, 214 | self.name_mapping, 215 | self.numeric_columns, 216 | batch_size=batch_size, 217 | 
labels=labels) 218 | 219 | def partial_fit(self, 220 | ftens=None, 221 | labels=None, 222 | generator=None, 223 | batch_size=32, 224 | **kwargs): 225 | if generator is None: 226 | generator = self._preprocess(ftens, 227 | labels, 228 | batch_size=batch_size, 229 | fit=False) 230 | 231 | return (self.model.fit_generator(generator, 232 | **kwargs), 233 | generator) 234 | 235 | def fit(self, 236 | ftens, 237 | labels, 238 | fl=None, categorical_feature_names=None, 239 | batch_size=32, 240 | **kwargs): 241 | generator = self._preprocess( 242 | ftens, 243 | labels, 244 | fl=fl, 245 | categorical_feature_names=categorical_feature_names, 246 | batch_size=batch_size, 247 | fit=True) 248 | self._compile_keras_model() 249 | return (self.model.fit_generator(generator, 250 | **kwargs), 251 | generator) 252 | 253 | def predict(self, ftens, verbose=1, **kwargs): 254 | if verbose > 0: 255 | print("Transforming input tensor into numeric sequences") 256 | generator = self._preprocess(ftens, batch_size=None, fit=False) 257 | if verbose > 0: 258 | print("Predicting using Keras model") 259 | predictions = self.model.predict_generator(generator, **kwargs) 260 | if verbose > 0: 261 | print("Transforming outputs") 262 | if not self.regression and len(self.classes) > 2: 263 | predictions = np.array([self.ml_preprocessor.lb.classes_[i] 264 | for i in predictions.argmax(axis=1)]) 265 | return predictions 266 | 267 | def _compile_keras_model(self): 268 | inputs = [] 269 | cat_embedding_layers = [] 270 | for i, f in enumerate(self.categorical_feature_names): 271 | feature_max_vocab = len(self.categorical_vocab[f]) + 1 272 | if self.categorical_max_vocab is not None: 273 | feature_max_vocab = min(feature_max_vocab, 274 | self.categorical_max_vocab + 1) 275 | cat_input = Input(shape=(None,), 276 | dtype='int32', 277 | name=self.name_mapping[f]) 278 | inputs.append(cat_input) 279 | embedding = Embedding(output_dim=self.categorical_embedding_size, 280 | input_dim=feature_max_vocab, 281 | mask_zero=True) 282 | embedding = embedding(cat_input) 283 | cat_embedding_layers.append(embedding) 284 | 285 | numeric_input = None 286 | if len(self.numeric_columns) > 0: 287 | numeric_input = Input(shape=(None, 288 | len(self.numeric_columns)), 289 | dtype='float32', 290 | name=self.numeric_input_name) 291 | inputs.append(numeric_input) 292 | 293 | rnn_inputs = [] 294 | rnn_input_size = 0 295 | if len(cat_embedding_layers): 296 | rnn_inputs.extend(cat_embedding_layers) 297 | rnn_input_size += (self.categorical_embedding_size * 298 | len(cat_embedding_layers)) 299 | if numeric_input is not None: 300 | rnn_inputs.append(numeric_input) 301 | rnn_input_size += len(self.numeric_columns) 302 | if len(rnn_inputs) > 1: 303 | rnn_inputs = keras.layers.concatenate(rnn_inputs) 304 | else: 305 | rnn_inputs = rnn_inputs[0] 306 | 307 | if self.conv_kernel_dim is not None: 308 | 309 | conv_layer = Conv1D(self.categorical_embedding_size//2, 310 | self.conv_kernel_dim, 311 | activation=self.conv_activation) 312 | if self.conv_batch_normalization: 313 | rnn_inputs = BatchNormalization()(rnn_inputs) 314 | conv_layer = conv_layer(rnn_inputs) 315 | mp_layer = MaxPooling1D(pool_size=self.pool_size) 316 | rnn_inputs = mp_layer(conv_layer) 317 | 318 | if isinstance(self.cell_type, str): 319 | self.RNNCell = RNN_CELLS[self.cell_type] 320 | else: 321 | self.RNNCell = self.cell_type 322 | prev_layer = rnn_inputs 323 | for i, layer_size in enumerate(self.recurrent_layer_sizes): 324 | return_sequences = True 325 | if i == len(self.recurrent_layer_sizes) - 1: 326 |
return_sequences = False 327 | layer = self.RNNCell( 328 | layer_size, 329 | return_sequences=return_sequences, 330 | dropout=self.dropout_fraction, 331 | recurrent_dropout=self.recurrent_dropout_fraction) 332 | layer = layer(prev_layer) 333 | prev_layer = layer 334 | for layer_size in self.dense_layer_sizes: 335 | layer = Dense(layer_size, 336 | activation=self.dense_activation)(prev_layer) 337 | dropout_layer = Dropout(self.dropout_fraction)(layer) 338 | prev_layer = dropout_layer 339 | 340 | output_layer = Dense(self.output_size, activation='sigmoid', 341 | name='target')(prev_layer) 342 | self.model = Model(inputs=inputs, outputs=output_layer) 343 | self.model.compile(optimizer=self.optimizer, loss=self.loss) 344 | -------------------------------------------------------------------------------- /dldb/preprocessor.py: -------------------------------------------------------------------------------- 1 | from featuretools.variable_types import Discrete, Boolean 2 | import numpy as np 3 | from sklearn.preprocessing import MinMaxScaler, LabelBinarizer 4 | import pandas as pd 5 | 6 | 7 | class MLPreprocessor(object): 8 | def __init__(self, 9 | categorical_max_vocab=None, 10 | classes=None, 11 | regression=False): 12 | self.categorical_max_vocab = categorical_max_vocab 13 | self.classes = classes 14 | self.regression = regression 15 | self.categorical_vocab = None 16 | 17 | def fit_transform(self, ftens, fl=None, categorical_feature_names=None, labels=None): 18 | if categorical_feature_names is not None: 19 | self.categorical_feature_names = categorical_feature_names 20 | elif fl is not None: 21 | self.categorical_feature_names = [f.get_name() for f in fl 22 | if issubclass(f.variable_type, 23 | Discrete) 24 | and not 25 | f.variable_type == Boolean] 26 | else: 27 | self.categorical_feature_names = [c for c in ftens.columns 28 | if ftens[c].dtype == object] 29 | 30 | # Can't handle multiindex 31 | if len(ftens.index.names) > 1: 32 | index_name = ftens.index.names[0] 33 | ftens = ftens.reset_index(index_name, drop=False).set_index(index_name) 34 | self.categorical_vocab = self._gen_categorical_mapping(ftens) 35 | 36 | self.numeric_columns = [f for f in ftens.columns 37 | if f not in self.categorical_feature_names] 38 | 39 | ftens = self.fit_transform_scaler_imputer(ftens) 40 | 41 | if not self.regression: 42 | self.lb = LabelBinarizer().fit(self.classes) 43 | 44 | if labels is not None: 45 | return ftens, self.transform_labels(labels) 46 | else: 47 | return ftens 48 | 49 | def fit_transform_scaler_imputer(self, ftens): 50 | self.fill_vals = {} 51 | new_ftens = ftens 52 | if len(self.numeric_columns) > 0: 53 | numeric_ftens = ftens[self.numeric_columns] 54 | 55 | numeric_ftens = numeric_ftens.astype(np.float32) 56 | for f in self.numeric_columns: 57 | if ftens[f].dropna().shape[0] == 0: 58 | fill_val = 0 59 | else: 60 | fill_val = numeric_ftens[f].dropna().mean() 61 | self.fill_vals[f] = fill_val 62 | numeric_ftens.loc[~np.isfinite(numeric_ftens[f]), f] = np.nan 63 | numeric_ftens.fillna(value=self.fill_vals, inplace=True) 64 | self.scaler = MinMaxScaler() 65 | numeric_ftens = self.scaler.fit_transform(numeric_ftens) 66 | new_ftens[self.numeric_columns] = numeric_ftens 67 | 68 | return self._map_categorical_ftens_to_int(new_ftens) 69 | 70 | def transform(self, ftens, labels=None): 71 | ftens = self._map_categorical_ftens_to_int(ftens) 72 | if len(self.numeric_columns) > 0: 73 | numeric_ftens = ftens[self.numeric_columns] 74 | numeric_ftens = numeric_ftens.astype(np.float32) 75 | for f in 
self.numeric_columns: 76 | vals = numeric_ftens[f] 77 | numeric_ftens.loc[~np.isfinite(numeric_ftens[f]), f] = np.nan 78 | if vals.dropna().shape[0] != vals.shape[0]: 79 | numeric_ftens[f].fillna(self.fill_vals[f], inplace=True) 80 | numeric_ftens = self.scaler.transform(numeric_ftens) 81 | ftens[self.numeric_columns] = numeric_ftens 82 | if labels is not None: 83 | return ftens, self.transform_labels(labels) 84 | else: 85 | return ftens 86 | 87 | def transform_labels(self, labels): 88 | if not self.regression: 89 | labels = pd.Series(labels).astype(int) 90 | if len(self.classes) > 2: 91 | labels = self.lb.transform(labels) 92 | return labels 93 | 94 | def _map_categorical_ftens_to_int(self, ftens): 95 | new_ftens = ftens 96 | for f in self.categorical_feature_names: 97 | numeric_series, new_mapping = self._map_categorical_series_to_int( 98 | ftens[f], 99 | self.categorical_vocab.get(f, None)) 100 | new_ftens[f] = numeric_series 101 | self.categorical_vocab[f] = new_mapping 102 | return new_ftens 103 | 104 | def _gen_categorical_mapping(self, ftens): 105 | categorical_vocab = {} 106 | for f in self.categorical_feature_names: 107 | val_counts = ftens[f].dropna().value_counts() 108 | mapping = {v: k + 1 for k, v in enumerate(val_counts.index)} 109 | mapping[np.nan] = 0 110 | if (self.categorical_max_vocab is not None and 111 | self.categorical_max_vocab < len(val_counts)): 112 | num_unique = len(val_counts) - self.categorical_max_vocab 113 | unknown = val_counts.tail(num_unique).index.tolist() 114 | mapping.update({u: 0 for u in unknown}) 115 | categorical_vocab[f] = mapping 116 | return categorical_vocab 117 | 118 | def _map_categorical_series_to_int(self, input_series, 119 | mapping): 120 | unique_vals = set(input_series.unique()) 121 | # make sure we don't add any new nans 122 | # since id(np.float64('nan')) != id(np.nan), 123 | # and so we could end up with multiple nans in the 124 | # mapping dict 125 | new_mapping = {u: 0 for u in unique_vals if not pd.isnull(u)} 126 | new_mapping.update(mapping) 127 | numeric = input_series.map(new_mapping) 128 | numeric.fillna(0, inplace=True) 129 | return numeric, new_mapping 130 | -------------------------------------------------------------------------------- /dldb_fig.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alteryx/DL-DB/32e75ed9235aaf5a6183d7f20cfd2dbd59ccbe18/dldb_fig.png -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup, find_packages 3 | 4 | PACKAGE_NAME = 'dldb' 5 | 6 | 7 | def read_package_variable(key): 8 | """Read the value of a variable from the package without importing.""" 9 | module_path = os.path.join(PACKAGE_NAME, '__init__.py') 10 | with open(module_path) as module: 11 | for line in module: 12 | parts = line.strip().split(' ') 13 | if parts and parts[0] == key: 14 | return parts[-1].strip("'") 15 | assert False, "'{0}' not found in '{1}'".format(key, module_path) 16 | 17 | 18 | setup( 19 | name=PACKAGE_NAME, 20 | version=read_package_variable('__version__'), 21 | description='Deep learning for relational datasets with a time-component', 22 | 
packages=find_packages(), 23 | python_requires='>=3', 24 | install_requires=[ 25 | 'featuretools>=0.1.20', 26 | 'keras>=2.1.4', 27 | 'scikit-learn>=0.19.1', 28 | 'tensorflow>=1.6.0', 29 | ], 30 | url='https://github.com/HDI-Project/DL-DB', 31 | ) 32 | -------------------------------------------------------------------------------- /tests/labeling_utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from tqdm import tqdm 4 | 5 | def create_labels(entityset, 6 | min_training_data='28 days', 7 | lead='7 days', 8 | window='28 days', 9 | reduce='sum', 10 | binarize=None, 11 | iterate_by=None): 12 | label_cols = ['quantity', 'price'] 13 | time_index = "order_date" 14 | index = "customer_id" 15 | df = entityset['orders'].df.merge( 16 | entityset['order_products'].df, how='outer') 17 | 18 | tqdm.pandas(desc="Creating Labels", unit="customer") 19 | 20 | # # Only use data after one of the label columns has been non-null 21 | # for i, v in df[label_cols].iterrows(): 22 | # if v.dropna(how='all').shape[0] > 0: 23 | # df = df.loc[slice(i, None), :] 24 | # break 25 | grouped = df.groupby(index, as_index=True) 26 | 27 | project_cutoff_dates = grouped.progress_apply( 28 | lambda df: make_labels_from_windows( 29 | df, 30 | cols=label_cols, 31 | min_training_data=min_training_data, 32 | lead=lead, window=window, 33 | index_col=index, 34 | date_col=time_index, 35 | reduce=reduce, 36 | iterate_by=iterate_by)) 37 | 38 | project_cutoff_dates = project_cutoff_dates.dropna() 39 | 40 | cutoff_with_labels = (project_cutoff_dates.reset_index(level=0) 41 | .reset_index() 42 | .rename(columns={'index': 'time', 43 | 0: 'label'})) 44 | if binarize: 45 | cutoff_with_labels['label'] = binarize(cutoff_with_labels['label']) 46 | 47 | return (cutoff_with_labels[[index, "time", "label"]] 48 | .sort_values(["time", index])) 49 | 50 | 51 | def sample_labels(labels, random_seed=1, n=1, gap=None): 52 | """ 53 | Select 1 label per customer 54 | """ 55 | def sample(df): 56 | if gap is not None: 57 | samples = [df.iloc[0]] 58 | for i, row in df.iloc[1:].iterrows(): 59 | if row['time'] - samples[-1]['time'] > gap: 60 | samples.append(row) 61 | samples = pd.DataFrame(samples) 62 | return samples.sample(min(n, samples.shape[0]), random_state=random_seed) 63 | else: 64 | return df.sample(min(n, df.shape[0]), random_state=random_seed) 65 | 66 | labels = labels.groupby(labels['customer_id']).apply(sample) 67 | return labels.sort_values(['time', 'customer_id']) 68 | 69 | 70 | def make_labels_from_windows(df, cols, 71 | min_training_data, lead, window, 72 | index_col, date_col, 73 | reduce='min', iterate_by=None): 74 | customer_id = df[index_col].iloc[0] 75 | 76 | if iterate_by is not None: 77 | iterate_by = pd.Timedelta(iterate_by) 78 | 79 | vals = df[[date_col] + cols] 80 | 81 | date_series = vals[date_col] 82 | start = date_series.min() + pd.Timedelta(min_training_data) 83 | end = date_series.max() 84 | 85 | if end - start < pd.Timedelta(lead): 86 | return pd.Series([np.nan], index=[pd.NaT], name=customer_id) 87 | else: 88 | labels = iterate_through_cutoffs(vals, start, end, 89 | pd.Timedelta(window), 90 | pd.Timedelta(lead), 91 | cols, 92 | date_col, 93 | reduce, 94 | iterate_by=iterate_by) 95 | labels.name = customer_id 96 | return labels 97 | 98 | 99 | def iterate_through_cutoffs(vals, start, end, window, lead, cols, 100 | date_col, 101 | reduce, iterate_by): 102 | labels = [] 103 | cutoffs = [] 104 | cutoff = start 105 | if iterate_by is None: 
106 | iterate_by = window 107 | 108 | while cutoff + lead < end: 109 | start_window = cutoff + lead 110 | end_window = start_window + window 111 | _vals = vals[(vals[date_col] > start_window) & 112 | (vals[date_col] < end_window)] 113 | 114 | label_vals = np.multiply(*[_vals[c] for c in cols]) 115 | label = getattr(label_vals.dropna(), reduce)() 116 | labels.append(label) 117 | cutoffs.append(cutoff) 118 | cutoff = cutoff + iterate_by 119 | return pd.Series(labels, index=cutoffs) 120 | -------------------------------------------------------------------------------- /tests/test.py: -------------------------------------------------------------------------------- 1 | from featuretools.tests.testing_utils import make_ecommerce_entityset 2 | from sklearn.model_selection import train_test_split 3 | from sklearn.metrics import roc_auc_score, f1_score, mean_absolute_error 4 | from sklearn.ensemble import RandomForestClassifier 5 | from sklearn.svm import SVC 6 | from sklearn.linear_model import LogisticRegression 7 | from sklearn.preprocessing import StandardScaler, Imputer 8 | import pandas as pd 9 | import numpy as np 10 | import featuretools as ft 11 | from featuretools.selection import remove_low_information_features 12 | from dldb import DLDB 13 | from testing_utils import construct_retail_example 14 | 15 | 16 | def f1_macro(actual, predicted): 17 | return f1_score(actual, predicted, average='macro') 18 | 19 | 20 | def test_ecommerce(): 21 | es = make_ecommerce_entityset() 22 | cutoffs = es['log'].df[['session_id', 'datetime']] 23 | cutoffs = cutoffs.rename(columns={'session_id': 'id'}) 24 | ftens, fl = ft.dfs(entityset=es, 25 | cutoff_time=cutoffs, 26 | target_entity="sessions", 27 | cutoff_time_in_index=True) 28 | ftens.sort_index(inplace=True) 29 | 30 | ids = ftens.index.get_level_values('id').drop_duplicates() 31 | n_instances = ids.shape[0] 32 | 33 | labels_binary = [i % 2 for i in range(n_instances)] 34 | labels_multiclass = np.random.randint(10, size=(n_instances,)) 35 | labels_regression = np.random.random(size=(n_instances,)) 36 | labels = pd.DataFrame({'label_binary': labels_binary, 37 | 'label_multiclass': labels_multiclass, 38 | 'label_regression': labels_regression}, 39 | index=ids) 40 | 41 | ftens = (ftens.reset_index('id', drop=False) 42 | .merge(labels, left_on='id', 43 | right_index=True, 44 | how='left') 45 | .set_index('id', append=True) 46 | ) 47 | 48 | train_ftens, test_ftens = train_test_split( 49 | ftens, test_size=0.4, shuffle=False) 50 | train_labels = train_ftens[labels.columns] 51 | test_labels = test_ftens[labels.columns] 52 | for c in labels.columns: 53 | del train_ftens[c] 54 | del test_ftens[c] 55 | 56 | scores = {} 57 | scoring_functions = {'label_regression': mean_absolute_error, 58 | 'label_binary': roc_auc_score, 59 | 'label_multiclass': f1_macro} 60 | for label_type in labels.columns: 61 | classes = labels[label_type].unique() 62 | dl_model = DLDB( 63 | regression=label_type == 'label_regression', 64 | classes=classes, 65 | categorical_max_vocab=10) 66 | dl_model.fit(train_ftens, 67 | train_labels[label_type].values, 68 | fl=fl, 69 | epochs=1, 70 | batch_size=4) 71 | predictions = dl_model.predict(test_ftens) 72 | score = scoring_functions[label_type](test_labels[label_type].values, 73 | predictions) 74 | scores[label_type] = score 75 | return scores 76 | 77 | 78 | def test_retail_binary(ftens_file='retail_binary_files/ftens.csv', 79 | labels_file='retail_binary_files/labels.csv', 80 | fl_file='retail_binary_files/fl.p'): 81 | ftens, labels, fl = 
construct_retail_example(ftens_file, labels_file, fl_file) 82 | baseline_ftens = (ftens.reset_index('customer_id', drop=False) 83 | .drop_duplicates('customer_id', keep='last') 84 | .set_index('customer_id')) 85 | baseline_ftens, baseline_fl = ft.encode_features(baseline_ftens, fl) 86 | baseline_ftens, baseline_fl = remove_low_information_features(baseline_ftens, baseline_fl) 87 | train_customers, test_customers = train_test_split(baseline_ftens.index.values, shuffle=True, test_size=0.1) 88 | train_labels = labels.loc[train_customers] 89 | test_labels = labels.loc[test_customers] 90 | train_ftens = ftens.loc[(train_customers, slice(None)), :] 91 | test_ftens = ftens.loc[(test_customers, slice(None)), :] 92 | baseline_train_fm = baseline_ftens.loc[train_customers, :] 93 | baseline_test_fm = baseline_ftens.loc[test_customers, :] 94 | 95 | dl_model = DLDB( 96 | regression=False, 97 | classes=[False, True], 98 | recurrent_layer_sizes=(32,), 99 | dense_layer_sizes=(32, 32), 100 | categorical_max_vocab=10) 101 | dl_model.fit( 102 | train_ftens, 103 | train_labels, 104 | fl=fl, 105 | epochs=1, 106 | batch_size=32) 107 | predictions = dl_model.predict(test_ftens) 108 | score = roc_auc_score(test_labels, predictions) 109 | 110 | baseline_scores = score_baseline_pipeline(baseline_train_fm, 111 | train_labels, 112 | baseline_test_fm, 113 | test_labels) 114 | return score, baseline_scores 115 | 116 | 117 | def score_baseline_pipeline(X_train, y_train, X_test, y_test, **hyperparams): 118 | feature_names = X_train.columns 119 | imputer = Imputer(missing_values='NaN', strategy='mean', axis=0) 120 | X_train = imputer.fit_transform(X_train) 121 | scaler = StandardScaler() 122 | X_train = scaler.fit_transform(X_train) 123 | X_train = pd.DataFrame(X_train, columns=feature_names) 124 | 125 | original_train_fm = X_train 126 | select_n_features = hyperparams.get('select_n_features', 200) 127 | selector_rf = RandomForestClassifier(n_estimators=hyperparams.get('selector_n_estimators', 1000), 128 | class_weight='balanced', 129 | n_jobs=-1, 130 | verbose=True) 131 | selector_rf.fit(original_train_fm, y_train) 132 | 133 | importances = sorted(zip(selector_rf.feature_importances_, feature_names), 134 | key=lambda x: x[0], reverse=True) 135 | selected = [i[1] for i in importances[:select_n_features]] 136 | 137 | X_train = original_train_fm[selected] 138 | 139 | # Train another Random Forest on selected features as our model 140 | 141 | model_rf = RandomForestClassifier(n_estimators=hyperparams.get('n_estimators', 400), 142 | class_weight='balanced', 143 | n_jobs=-1) 144 | model_rf.fit(X_train, y_train) 145 | 146 | model_svm = SVC() 147 | model_svm.fit(X_train, y_train) 148 | 149 | model_lr = LogisticRegression() 150 | model_lr.fit(X_train, y_train) 151 | 152 | X_test = imputer.transform(X_test) 153 | X_test = scaler.transform(X_test) 154 | X_test = pd.DataFrame(X_test, columns=feature_names) 155 | X_test = X_test[selected] 156 | 157 | # Predict targets for test data 158 | 159 | predicted_targets = model_rf.predict(X_test) 160 | predicted_targets_svm = model_svm.predict(X_test) 161 | predicted_targets_lr = model_lr.predict(X_test) 162 | 163 | # Compute metrics 164 | 165 | score_rf = roc_auc_score(y_test, predicted_targets) 166 | score_svm = roc_auc_score(y_test, predicted_targets_svm) 167 | score_lr = roc_auc_score(y_test, predicted_targets_lr) 168 | return {'rf': score_rf, 'svm': score_svm, 'lr': score_lr} 169 | 170 | 171 | if __name__ == '__main__': 172 | #scores = test_ecommerce() 173 | score, baseline_scores = 
test_retail_binary() 174 | print("ROC score:", score) 175 | print("Baseline ROC scores (using RF, SVM, LogisticRegression):", baseline_scores) 176 | -------------------------------------------------------------------------------- /tests/testing_utils.py: -------------------------------------------------------------------------------- 1 | from labeling_utils import create_labels, sample_labels 2 | import pandas as pd 3 | import featuretools as ft 4 | import os 5 | 6 | 7 | def construct_retail_example(ftens_file='retail_binary_files/ftens.csv', 8 | labels_file='retail_binary_files/labels.csv', 9 | fl_file='retail_binary_files/fl.p'): 10 | es = ft.demo.load_retail() 11 | if os.path.exists(ftens_file): 12 | ftens = pd.read_csv(ftens_file, index_col=['customer_id', 'time'], parse_dates=['time']) 13 | labels = pd.read_csv(labels_file, index_col='customer_id')['label'] 14 | fl = ft.load_features(fl_file, es) 15 | else: 16 | labels = create_labels(es, 17 | min_training_data='8 days', 18 | lead='7 days', 19 | window='30 days', 20 | reduce='sum', 21 | binarize=None, 22 | iterate_by=None) 23 | labels_binary = labels.copy() 24 | labels_binary['label'] = labels_binary['label'] > 300 25 | sampled = sample_labels(labels_binary, n=1) 26 | sampled = sampled[['customer_id', 'time', 'label']] 27 | sampled = sampled.sample(300) 28 | 29 | ftens, fl = ft.tdfs(target_entity='customers', 30 | entityset=es, 31 | cutoffs=sampled, 32 | window_size='30d', 33 | num_windows=5, 34 | verbose=True) 35 | 36 | ftens = (ftens.reset_index('customer_id', drop=False) 37 | .reset_index(drop=False) 38 | .merge(sampled[['customer_id', 'label']], 39 | on='customer_id', 40 | how='left') 41 | .set_index('customer_id') 42 | .set_index('time', append=True)) 43 | 44 | labels = (ftens['label'] 45 | .reset_index('customer_id', drop=False) 46 | .drop_duplicates('customer_id') 47 | .set_index('customer_id')) 48 | del ftens['label'] 49 | ftens.to_csv(ftens_file) 50 | labels.to_csv(labels_file) 51 | labels = labels['label'] 52 | ft.save_features(fl, fl_file) 53 | return ftens, labels, fl 54 | 55 | 56 | if __name__ == '__main__': 57 | if not os.path.exists('retail_binary_files'): 58 | os.makedirs('retail_binary_files') 59 | construct_retail_example() 60 | --------------------------------------------------------------------------------