# Repository layout (as listed in the original dump):
# ├── .gitignore
# ├── utils
#     ├── misc.py
#     ├── deep_learning.py
#     ├── data.py
#     └── measuring_performance.py
# ├── 01_data_splitting.ipynb
# ├── 03_creating_xlearn_dataset.ipynb
# ├── 05_creating_deepctr_dataset.ipynb
# ├── 02_data_preprocessing.ipynb
# └── 06_training_deepctr_model.ipynb
#
# /.gitignore:
# --------------------------------------------------------------------------------
# .idea/
# .ipynb_checkpoints/
# *.DS_Store
# utils/__pycache__/
# logs/
# models/
#
# --------------------------------------------------------------------------------
# /utils/misc.py:
# --------------------------------------------------------------------------------
import pickle
import time
from contextlib import contextmanager


def dump_pickle(file_path, obj):
    """Pickle ``obj`` and write it to ``file_path`` (opened in binary mode)."""
    with open(file_path, 'wb') as f:
        pickle.dump(obj, f)


def load_pickle(file_path):
    """Return the object unpickled from ``file_path``.

    NOTE: unpickling can execute arbitrary code; only load files you trust.
    """
    with open(file_path, 'rb') as f:
        return pickle.load(f)


@contextmanager
def get_elapsed_time(format_string='Elapsed time: %d sec', verbose=True):
    """Context manager that prints the wall-clock time spent in its body.

    Args:
        format_string: %-style template that receives the elapsed seconds
            (a float; the default ``%d`` truncates it to whole seconds).
        verbose: When False nothing is printed.
    """
    start_time = time.perf_counter()
    try:
        yield
    finally:
        # Report in ``finally`` so the timing is still printed when the body raises.
        if verbose:
            print(format_string % (time.perf_counter() - start_time))


# --------------------------------------------------------------------------------
# /utils/deep_learning.py:
# --------------------------------------------------------------------------------
import numpy as np
import tensorflow as tf
from tensorflow.python.client import device_lib


def _from_tfrecord(serialized, features, target_name, key_names):
    """Parse one serialized ``tf.train.Example``.

    Key columns listed in ``key_names`` are parsed but dropped from the result.

    Returns:
        ``(features_dict, target)`` when ``target_name`` is given, otherwise
        just ``features_dict``.
    """
    example = tf.io.parse_single_example(serialized=serialized, features=features)
    if key_names is not None:
        for key_name in key_names:
            example.pop(key_name, None)
    if target_name is None:
        return example
    target = example.pop(target_name, None)
    return example, target


def extract_dataset(file_paths, compression_type=None, shuffle_buffer_size=1024, is_training=True):
    """Build a dataset of raw TFRecord strings from ``file_paths``.

    For training, files are interleaved in parallel and the records are
    shuffled with a fixed seed (42); otherwise the files are read in listing
    order without shuffling.
    """
    files = tf.data.Dataset.list_files(file_paths, shuffle=False)
    if not is_training:
        return tf.data.TFRecordDataset(files, compression_type=compression_type)
    interleaved = files.interleave(
        lambda file: tf.data.TFRecordDataset(file, compression_type=compression_type),
        num_parallel_calls=tf.data.experimental.AUTOTUNE)
    return interleaved.shuffle(shuffle_buffer_size, seed=42)


def transform_dataset(dataset, num_feature_names, cat_feature_names, target_name=None, key_names=None):
    """Map a dataset of serialized examples to parsed feature dictionaries.

    The parsing schema is built from the arguments: key columns are scalar
    strings, the target and categorical features are scalar int64, numeric
    features are scalar float32. Elements of the returned dataset have the
    shape produced by ``_from_tfrecord``.
    """
    features = dict()
    if key_names is not None:
        for key_name in key_names:
            features[key_name] = tf.io.FixedLenFeature([], tf.string)
    if target_name is not None:
        features[target_name] = tf.io.FixedLenFeature([], tf.int64)
    for feature in num_feature_names:
        features[feature] = tf.io.FixedLenFeature([], tf.float32)
    for feature in cat_feature_names:
        features[feature] = tf.io.FixedLenFeature([], tf.int64)
    return dataset.map(
        lambda serialized: _from_tfrecord(
            serialized=serialized, features=features, target_name=target_name, key_names=key_names),
        num_parallel_calls=tf.data.experimental.AUTOTUNE)


def load_dataset(dataset, batch_size=32, is_training=True):
    """Batch and prefetch ``dataset``; repeat it indefinitely when training."""
    dataset = dataset.batch(batch_size).prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    return dataset.repeat() if is_training else dataset


def get_available_gpus():
    """Return the names of the local devices whose type is ``'GPU'``."""
    return [device.name for device in device_lib.list_local_devices() if device.device_type == 'GPU']


def get_n_examples(dataset):
    """Count the examples in ``dataset`` by iterating over all of it.

    Each element must be a sequence whose first item is a feature dictionary.
    Scalar (unbatched) features count as one example; batched features
    contribute the size of their leading dimension.
    """
    n_examples = 0
    for batch in dataset.take(-1):
        # Any feature tensor will do: they all share the batch dimension.
        shape = next(iter(batch[0].values())).shape
        n_examples += 1 if len(shape) == 0 else shape[0]
    return n_examples


def get_n_steps(total_size, batch_size):
    """Return the number of batches needed to cover ``total_size`` examples (ceiling division)."""
    full_batches, remainder = divmod(total_size, batch_size)
    return int(full_batches) + (1 if remainder > 0 else 0)


# This function is not yet checked whether it is thread-safe or not.
def get_target(dataset):
    """Return every target value in ``dataset`` as one flat array.

    Elements must be ``(features, target)`` pairs with batched targets.
    """
    # Collect first and concatenate once; concatenating inside the loop copies
    # the growing array on every batch. The empty float array is kept as the
    # first operand so the result dtype matches the previous implementation.
    batches = [batch[1].numpy() for batch in dataset.take(-1)]
    return np.concatenate([np.array([])] + batches)


# --------------------------------------------------------------------------------
# /utils/data.py:
# --------------------------------------------------------------------------------
import hashlib
import tensorflow as tf


def _hash_str(string, n_bins):
    """Hash ``string`` into an integer bucket in ``[1, n_bins - 1]`` using MD5.

    ``n_bins`` is coerced to ``int``: callers pass the float default ``1e6``,
    and ``int % float`` would return a (lossy) float such as ``123456.0``,
    which then breaks the index strings built from the result.

    Raises:
        ValueError: If ``n_bins`` is smaller than 2 (no bucket available).
    """
    n_bins = int(n_bins)
    if n_bins < 2:
        raise ValueError('n_bins must be at least 2, got %r' % n_bins)
    digest = hashlib.md5(string.encode('utf8')).hexdigest()
    return int(digest, 16) % (n_bins - 1) + 1


def _get_feature_to_index(num_feature_names, cat_feature_names, n_categories, use_field):
    """Build the name -> column index mapping used by ``dump_libsvm_file``.

    With ``use_field`` every feature (numeric or categorical) gets one index,
    enumerated over the union of both name collections. Otherwise numeric
    features take the first indices and every ``(categorical feature, label)``
    pair gets its own subsequent index under the key ``'<feature>_<label>'``.
    """
    if use_field:
        # ``.union`` instead of ``|``: on a pandas Index the ``|`` operator is no
        # longer a set union in pandas >= 2.0, while ``.union`` keeps the old result
        # (and also exists on ``set``).
        all_feature_names = num_feature_names.union(cat_feature_names)
        return {feature: i for i, feature in enumerate(all_feature_names)}

    feature_to_index = {feature: i for i, feature in enumerate(num_feature_names)}
    next_index = len(num_feature_names)
    for feature in cat_feature_names:
        for label in range(n_categories[feature]):
            feature_to_index['_'.join([feature, str(label)])] = next_index
            next_index += 1
    return feature_to_index


def dump_libsvm_file(X, y, file, num_feature_names, cat_feature_names, n_categories, use_field=False, decimals=8,
                     use_hash=False, n_bins=1e6):
    """Write ``X`` (and optionally ``y``) to ``file`` in libsvm / libffm text format.

    One line is written per row of ``X``: the label (when ``y`` is given, looked
    up with ``y.loc`` on the row's index label) followed by space-separated
    entries. Without ``use_field`` entries are ``index:value`` for numeric
    features and ``index:1`` for categorical ones; with ``use_field`` they are
    ``index:index:value`` and ``field:label:1`` respectively.

    Args:
        X: DataFrame holding the numeric and (integer-encoded) categorical columns.
        y: Labels indexed like ``X``, or None to omit the label.
        file: Output path.
        num_feature_names: Names of the numeric columns.
        cat_feature_names: Names of the categorical columns.
        n_categories: Mapping from categorical feature name to its cardinality.
        use_field: Emit field-aware (libffm) entries.
        decimals: Number of decimals numeric values are rounded to.
        use_hash: Hash labels of features whose cardinality exceeds ``n_bins``.
        n_bins: Number of hash buckets.
    """
    feature_to_index = _get_feature_to_index(num_feature_names, cat_feature_names, n_categories, use_field)
    with open(file, 'w') as f:
        for i, row in X.iterrows():
            tokens = [] if y is None else [str(y.loc[i])]
            for feature in num_feature_names:
                index = str(feature_to_index[feature])
                value = str(round(row[feature], decimals))
                # In field mode a numeric feature's field id equals its feature index.
                tokens.append(':'.join([index, index, value] if use_field else [index, value]))
            for feature in cat_feature_names:
                label = int(row[feature])
                if use_hash and n_categories[feature] > n_bins:
                    label = _hash_str(str(label), n_bins)
                if use_field:
                    tokens.append('%s:%s:1' % (feature_to_index[feature], label))
                else:
                    tokens.append('%s:1' % feature_to_index['_'.join([feature, str(label)])])
            f.write(' '.join(tokens) + '\n')


def _bytes_feature(value):
    """Wrap ``value`` in a bytes ``tf.train.Feature``; ``str`` is UTF-8 encoded first."""
    if isinstance(value, str):
        value = value.encode('utf-8')
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


def _float_feature(value):
    """Wrap a single float in a ``tf.train.Feature``."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))


def _int64_feature(value):
    """Wrap a single integer in a ``tf.train.Feature``."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))


def _serialize_example(feature):
    """Serialize a ``{name: tf.train.Feature}`` mapping as a ``tf.train.Example`` string."""
    example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
    return example_proto.SerializeToString()


def dump_tfrecord_file(X, y, file, num_feature_names, cat_feature_names, target_name=None, key_names=None,
                       decimals=8, compression_type=None):
    """Write ``X`` (and optionally ``y``) to ``file`` as TFRecord examples.

    Each row becomes one ``tf.train.Example`` holding the key columns as bytes,
    the target (stored under ``target_name``) as int64, numeric features as
    floats rounded to ``decimals`` and categorical features as int64.

    Args:
        X: DataFrame holding key, numeric and categorical columns.
        y: Labels indexed like ``X``, or None to omit the target.
        file: Output path.
        num_feature_names: Names of the numeric columns.
        cat_feature_names: Names of the categorical columns.
        target_name: Feature name for the target; needed when ``y`` is given.
        key_names: Optional names of key columns to store as bytes.
        decimals: Number of decimals numeric values are rounded to.
        compression_type: Passed to ``tf.io.TFRecordOptions``.
    """
    options = tf.io.TFRecordOptions(compression_type=compression_type)
    with tf.io.TFRecordWriter(path=file, options=options) as writer:
        for i, row in X.iterrows():
            example = dict()
            if key_names is not None:
                for key_name in key_names:
                    example[key_name] = _bytes_feature(row[key_name])
            if y is not None:
                example[target_name] = _int64_feature(y.loc[i])
            for feature in num_feature_names:
                example[feature] = _float_feature(round(row[feature], decimals))
            for feature in cat_feature_names:
                example[feature] = _int64_feature(int(row[feature]))
            writer.write(_serialize_example(example))
# --------------------------------------------------------------------------------
# /utils/measuring_performance.py:
# --------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.calibration import calibration_curve
from sklearn.metrics import (average_precision_score, confusion_matrix, log_loss, precision_recall_curve,
                             roc_auc_score, roc_curve)

# The bundled seaborn style sheets were renamed to 'seaborn-v0_8*' in newer matplotlib
# releases, where the old name raises OSError; fall back to the new name in that case.
try:
    plt.style.use('seaborn')
except OSError:
    plt.style.use('seaborn-v0_8')


def get_norm_entropy(y_true, y_score, eps=1e-08):
    """Return the log loss of ``y_score`` divided by the entropy of the base rate.

    Scores are clipped to ``[eps, 1 - eps]`` before the log loss is computed.
    The base rate is ``y_true.mean()``; when it is exactly 0 or 1 the entropy
    term is not finite, so the result is not meaningful.
    """
    p = y_true.mean()
    y_score = np.clip(np.asarray(y_score), eps, 1.0 - eps)
    base_entropy = -1.0 * (p * np.log(p) + (1.0 - p) * np.log(1.0 - p))
    return log_loss(y_true, y_score) / base_entropy


def get_threshold_at_precision(y_true, y_score, precision):
    """Return a score threshold derived from the target ``precision``.

    Examples are ranked by descending score and the running precision of every
    prefix is computed. The number of prefixes whose precision exceeds the
    target is used as a position in that ranking, and the score found there is
    returned.
    """
    sorted_index = (-1.0 * y_score).argsort()
    precisions = y_true[sorted_index].cumsum() / (np.arange(y_true.shape[0]) + 1)
    # When every prefix beats the target the count equals the number of examples,
    # which is one past the end; clamp it so the lowest score is returned instead
    # of raising IndexError.
    position = min((precisions > precision).sum(), y_true.shape[0] - 1)
    return y_score[sorted_index][position]


def get_y_pred(y_score, threshold=0.5):
    """Return 1 where ``y_score >= threshold`` and 0 elsewhere."""
    return np.where(y_score >= threshold, 1, 0)


def plot_class_density(y_true, y_score, threshold=0.5, class_names=('0', '1')):
    """Plot the score density of each class with a dashed vertical line at ``threshold``."""
    _, ax = plt.subplots(figsize=(6, 4))
    labels = y_true.astype('int')
    # Class 1 is drawn first, then class 0, matching the legend order.
    for class_value in (1, 0):
        sns.kdeplot(y_score[labels == class_value], shade=True, linewidth=0.8,
                    label=class_names[class_value], ax=ax)
    ax.axvline(x=threshold, color='black', linestyle='--', linewidth=1)
    ax.set_xlabel('Score')
    ax.legend(title='Class', loc='best')


def plot_calibration_curve(y_true, y_score):
    """Plot the reliability curve (20 bins) against the perfectly calibrated diagonal."""
    plt.figure(figsize=(6, 6))
    ax = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
    fraction_of_positives, mean_predicted_value = calibration_curve(y_true, y_score, n_bins=20)
    ax.plot(mean_predicted_value, fraction_of_positives, color='mediumblue', marker='s', label='Model',
            linewidth=1.0)
    ax.plot([0.0, 1.0], [0.0, 1.0], color='orange', linestyle='--', label='Perfectly Calibrated', linewidth=0.8)
    ax.legend(loc='best')
    ax.set_title('Calibration Plot (Reliability Curve)', fontsize=12, fontweight='bold')
    ax.set_xlabel('Mean Predicted Value')
    ax.set_ylabel('Fraction of Positives')
    ax.set_xlim([-0.05, 1.05])
    ax.set_ylim([-0.05, 1.05])


def plot_confusion_matrix(y_true, y_pred, normalize=False, class_names=('0', '1')):
    """Draw the confusion matrix as a heatmap and return it.

    With ``normalize`` each row is divided by its sum (so rows are per actual
    class) and cells are annotated as percentages; otherwise raw counts are shown.
    """
    conf_mat = confusion_matrix(y_true, y_pred)
    _, ax = plt.subplots(figsize=(4, 4))
    fmt = 'd'
    if normalize:
        conf_mat = conf_mat / conf_mat.sum(axis=1)[:, np.newaxis]
        fmt = '.2%'
    sns.heatmap(conf_mat, cmap='coolwarm', annot=True, fmt=fmt, linewidths=0.5, square=True,
                xticklabels=class_names, yticklabels=class_names, ax=ax)
    ax.set_xlabel('Predicted Class')
    ax.set_ylabel('Actual Class')
    ax.set_title('Confusion Matrix')
    return conf_mat


def plot_lift_curve(y_true, y_score):
    """Plot the cumulative share of positives found versus the share of samples tested.

    The shaded region lies between the diagonal and the curve of an ideal
    ranking that lists every positive first.
    """
    n_samples = len(y_true)
    positive_rate = y_true.sum() / n_samples
    tested_sample_percent = (np.arange(n_samples) + 1) / n_samples
    found_sample_percent = y_true[(-1.0 * y_score).argsort()].cumsum() / y_true.sum()
    _, ax = plt.subplots(figsize=(6, 4))
    ax.plot(tested_sample_percent, found_sample_percent, color='mediumblue', label='Lift Curve', linewidth=1.0)
    ax.fill_between([0.0, positive_rate, 1.0], [0.0, 1.0, 1.0], [0.0, positive_rate, 1.0],
                    alpha=0.3, color='lightsteelblue')
    ax.set_xlabel('% Samples Tested')
    ax.set_ylabel('% Samples Found')
    ax.legend(loc='best')
    ax.set_title('Lift Chart', fontsize=12, fontweight='bold')
    ax.set_xlim([-0.01, 1.01])
    ax.set_ylim([-0.01, 1.01])


def plot_pr_curve(y_true, y_score):
    """Plot the precision-recall curve and return the average precision (AUPRC)."""
    precision, recall, _ = precision_recall_curve(y_true, y_score)
    auprc = average_precision_score(y_true, y_score)
    _, ax = plt.subplots(figsize=(6, 4))
    ax.plot(recall, precision, color='mediumblue', linewidth=1.0,
            label='PR Curve (AUPRC: {0:0.4%})'.format(auprc))
    ax.fill_between(recall, precision, step='mid', alpha=0.3, color='lightsteelblue')
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.legend(loc='best')
    ax.set_title('Precision - Recall', fontsize=12, fontweight='bold')
    ax.set_xlim([-0.01, 1.01])
    ax.set_ylim([0, 1.01])
    return auprc


def plot_roc_curve(y_true, y_score):
    """Plot the ROC curve with the chance diagonal and return the AUROC."""
    fpr, tpr, _ = roc_curve(y_true, y_score)
    auroc = roc_auc_score(y_true, y_score)
    _, ax = plt.subplots(figsize=(6, 4))
    ax.plot(fpr, tpr, color='mediumblue', linewidth=1.0, label='ROC Curve (AUROC: {0:0.4%})'.format(auroc))
    ax.plot([0.0, 1.0], [0.0, 1.0], color='orange', linestyle='--', linewidth=0.8)
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.legend(loc='best')
    ax.set_title('Receiver Operating Characteristic', fontsize=12, fontweight='bold')
    ax.set_xlim([-0.01, 1.01])
    ax.set_ylim([0, 1.01])
    return auroc


# --------------------------------------------------------------------------------
# /01_data_splitting.ipynb:
# --------------------------------------------------------------------------------
# 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload\n", 10 | "%autoreload 2" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import gc\n", 20 | "import os\n", 21 | "import pandas as pd\n", 22 | "from utils.misc import *" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "The dataset can be downloaded from https://labs.criteo.com/2014/02/download-kaggle-display-advertising-challenge-dataset."
30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "DATA_DIR = os.path.abspath('../../Data/display_advertising_challenge')\n", 39 | "old_data_dir = os.path.join(DATA_DIR, 'dac')\n", 40 | "new_data_dir = os.path.join(DATA_DIR, 'processed')\n", 41 | "\n", 42 | "if not os.path.exists(new_data_dir):\n", 43 | " os.makedirs(new_data_dir)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 4, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "column_names = pd.Index(['label'] + ['I%d' % (i + 1) for i in range(13)] + ['C%d' % (i + 1) for i in range(26)])" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 5, 58 | "metadata": {}, 59 | "outputs": [ 60 | { 61 | "name": "stdout", 62 | "output_type": "stream", 63 | "text": [ 64 | "Elapsed time: 1357 sec\n" 65 | ] 66 | } 67 | ], 68 | "source": [ 69 | "with get_elapsed_time():\n", 70 | " df_train = pd.read_csv(os.path.join(old_data_dir, 'train.txt'), sep='\\t', names=column_names)\n", 71 | " df_train.to_pickle(os.path.join(new_data_dir, 'df_train+valid+test.pkl'))" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 6, 77 | "metadata": {}, 78 | "outputs": [ 79 | { 80 | "name": "stdout", 81 | "output_type": "stream", 82 | "text": [ 83 | "# of obs in train set: 45840617\n" 84 | ] 85 | } 86 | ], 87 | "source": [ 88 | "print('# of obs in train set:', df_train.shape[0])" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 7, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "test_size = 0.2\n", 98 | "split_index = round(df_train.shape[0] * test_size)\n", 99 | "df_train, df_test = df_train[:-split_index], df_train[-split_index:]" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 8, 105 | "metadata": {}, 106 | "outputs": [ 107 | { 108 | "name": "stdout", 109 | "output_type": "stream", 110 | "text": [ 111 | "Elapsed 
time: 711 sec\n" 112 | ] 113 | } 114 | ], 115 | "source": [ 116 | "with get_elapsed_time():\n", 117 | " df_train.to_pickle(os.path.join(new_data_dir, 'df_train+valid.pkl'))\n", 118 | " df_test.to_pickle(os.path.join(new_data_dir, 'df_test.pkl'))" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 9, 124 | "metadata": {}, 125 | "outputs": [ 126 | { 127 | "name": "stdout", 128 | "output_type": "stream", 129 | "text": [ 130 | "# of obs in train set: 36672494 \n", 131 | "# of obs in test set: 9168123\n" 132 | ] 133 | } 134 | ], 135 | "source": [ 136 | "print('# of obs in train set:', df_train.shape[0], '\\n# of obs in test set:', df_test.shape[0])" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 10, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "valid_size = 0.2\n", 146 | "split_index = round(df_train.shape[0] * valid_size)\n", 147 | "df_train, df_valid = df_train[:-split_index], df_train[-split_index:]" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 11, 153 | "metadata": {}, 154 | "outputs": [ 155 | { 156 | "name": "stdout", 157 | "output_type": "stream", 158 | "text": [ 159 | "Elapsed time: 507 sec\n" 160 | ] 161 | } 162 | ], 163 | "source": [ 164 | "with get_elapsed_time():\n", 165 | " df_train.to_pickle(os.path.join(new_data_dir, 'df_train.pkl'))\n", 166 | " df_valid.to_pickle(os.path.join(new_data_dir, 'df_valid.pkl'))" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 12, 172 | "metadata": {}, 173 | "outputs": [ 174 | { 175 | "name": "stdout", 176 | "output_type": "stream", 177 | "text": [ 178 | "# of obs in train set: 29337995 \n", 179 | "# of obs in valid set: 7334499\n" 180 | ] 181 | } 182 | ], 183 | "source": [ 184 | "print('# of obs in train set:', df_train.shape[0], '\\n# of obs in valid set:', df_valid.shape[0])" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 13, 190 | "metadata": {}, 191 | 
"outputs": [], 192 | "source": [ 193 | "del df_train, df_valid, df_test\n", 194 | "_ = gc.collect()" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 14, 200 | "metadata": {}, 201 | "outputs": [ 202 | { 203 | "name": "stdout", 204 | "output_type": "stream", 205 | "text": [ 206 | "Elapsed time: 53 sec\n" 207 | ] 208 | } 209 | ], 210 | "source": [ 211 | "with get_elapsed_time():\n", 212 | " df_quiz = pd.read_csv(os.path.join(old_data_dir, 'test.txt'), sep='\\t', names=column_names[1:])\n", 213 | " df_quiz.to_pickle(os.path.join(new_data_dir, 'df_quiz.pkl'))" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 15, 219 | "metadata": {}, 220 | "outputs": [ 221 | { 222 | "name": "stdout", 223 | "output_type": "stream", 224 | "text": [ 225 | "# of obs in quiz set: 6042135\n" 226 | ] 227 | } 228 | ], 229 | "source": [ 230 | "print('# of obs in quiz set:', df_quiz.shape[0])" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 16, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "del df_quiz\n", 240 | "_ = gc.collect()" 241 | ] 242 | } 243 | ], 244 | "metadata": { 245 | "kernelspec": { 246 | "display_name": "Python 3", 247 | "language": "python", 248 | "name": "python3" 249 | }, 250 | "language_info": { 251 | "codemirror_mode": { 252 | "name": "ipython", 253 | "version": 3 254 | }, 255 | "file_extension": ".py", 256 | "mimetype": "text/x-python", 257 | "name": "python", 258 | "nbconvert_exporter": "python", 259 | "pygments_lexer": "ipython3", 260 | "version": "3.7.6" 261 | } 262 | }, 263 | "nbformat": 4, 264 | "nbformat_minor": 2 265 | } 266 | -------------------------------------------------------------------------------- /03_creating_xlearn_dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | 
"%load_ext autoreload\n", 10 | "%autoreload 2" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import gc\n", 20 | "import os\n", 21 | "import pickle\n", 22 | "import pandas as pd\n", 23 | "from utils.data import *\n", 24 | "from utils.misc import *" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "DATA_DIR = os.path.abspath('../../Data/display_advertising_challenge/processed')\n", 34 | "USE_QUIZ_SET = False\n", 35 | "USE_TEST_SET = False\n", 36 | "USE_FIELD = False\n", 37 | "USE_HASH = False\n", 38 | "TRAIN_SAMPLING_RATE = 1.0\n", 39 | "TEST_SAMPLING_RATE = 1.0" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 4, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "if USE_QUIZ_SET:\n", 49 | " train_dataset_type = 'train+valid+test'\n", 50 | " test_dataset_type = 'quiz'\n", 51 | " \n", 52 | "elif USE_TEST_SET:\n", 53 | " train_dataset_type = 'train+valid'\n", 54 | " test_dataset_type = 'test'\n", 55 | " \n", 56 | "else: \n", 57 | " train_dataset_type = 'train'\n", 58 | " test_dataset_type = 'valid'" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 5, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "df_y_train = pd.read_pickle(os.path.join(DATA_DIR, '_'.join(['df', 'y', train_dataset_type]) + '.pkl'))\n", 68 | "df_X_train = pd.read_pickle(os.path.join(DATA_DIR, '_'.join(['df', 'X', train_dataset_type]) + '.pkl'))" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 6, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "df_y_train.index = list(range(df_y_train.shape[0]))\n", 78 | "df_X_train.index = list(range(df_X_train.shape[0]))\n", 79 | "\n", 80 | "if TRAIN_SAMPLING_RATE < 1.0:\n", 81 | " df_y_train = df_y_train.sample(frac=TRAIN_SAMPLING_RATE, random_state=42)\n", 82 | " df_X_train = 
df_X_train.loc[df_y_train.index, :]" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 7, 88 | "metadata": {}, 89 | "outputs": [ 90 | { 91 | "name": "stdout", 92 | "output_type": "stream", 93 | "text": [ 94 | "# of obs in sampled train set: 45840617\n" 95 | ] 96 | } 97 | ], 98 | "source": [ 99 | "print('# of obs in sampled train set:', df_X_train.shape[0])" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 8, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "full_pipeline = load_pickle(os.path.join(DATA_DIR, '_'.join(['pipeline', train_dataset_type]) + '.pkl'))\n", 109 | "target_name, num_feature_names, cat_feature_names, n_categories = load_pickle(\n", 110 | " os.path.join(DATA_DIR, '_'.join([train_dataset_type, 'metadata.pkl'])))" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 9, 116 | "metadata": {}, 117 | "outputs": [ 118 | { 119 | "name": "stdout", 120 | "output_type": "stream", 121 | "text": [ 122 | "Elapsed time: 24827 sec\n" 123 | ] 124 | } 125 | ], 126 | "source": [ 127 | "with get_elapsed_time():\n", 128 | " model_type = 'ffm' if USE_FIELD else 'fm'\n", 129 | " train_dataset_path = os.path.join(DATA_DIR, '_'.join([model_type, 'dataset', train_dataset_type]) + '.libsvm')\n", 130 | " dump_libsvm_file(df_X_train, df_y_train, train_dataset_path, num_feature_names, cat_feature_names, \n", 131 | " n_categories, use_field=USE_FIELD, decimals=6, use_hash=USE_HASH)" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 10, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "del df_y_train, df_X_train\n", 141 | "_ = gc.collect()" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 11, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "df_y_test = pd.read_pickle(\n", 151 | " os.path.join(DATA_DIR, '_'.join(['df', 'y', test_dataset_type]) + '.pkl')) if not USE_QUIZ_SET else None\n", 152 | 
"df_X_test = pd.read_pickle(os.path.join(DATA_DIR, '_'.join(['df', 'X', test_dataset_type]) + '.pkl'))" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 12, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "if not USE_QUIZ_SET:\n", 162 | " df_y_test.index = list(range(df_y_test.shape[0]))\n", 163 | "df_X_test.index = list(range(df_X_test.shape[0]))\n", 164 | "\n", 165 | "if not USE_QUIZ_SET and TEST_SAMPLING_RATE < 1.0:\n", 166 | " df_y_test = df_y_test.sample(frac=TEST_SAMPLING_RATE, random_state=42)\n", 167 | " df_X_test = df_X_test.loc[df_y_test.index, :]" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 13, 173 | "metadata": {}, 174 | "outputs": [ 175 | { 176 | "name": "stdout", 177 | "output_type": "stream", 178 | "text": [ 179 | "# of obs in sampled test set: 6042135\n" 180 | ] 181 | } 182 | ], 183 | "source": [ 184 | "print('# of obs in sampled test set:', df_X_test.shape[0])" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 14, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "with get_elapsed_time():\n", 194 | " test_dataset_path = os.path.join(DATA_DIR, '_'.join([model_type, 'dataset', test_dataset_type]) + '.libsvm')\n", 195 | " dump_libsvm_file(df_X_test, df_y_test, test_dataset_path, num_feature_names, cat_feature_names, \n", 196 | " n_categories, use_field=USE_FIELD, decimals=8, use_hash=USE_HASH)" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 15, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "if not USE_QUIZ_SET:\n", 206 | " del df_y_test, df_X_test\n", 207 | " \n", 208 | "else:\n", 209 | " del df_X_test\n", 210 | "_ = gc.collect()" 211 | ] 212 | } 213 | ], 214 | "metadata": { 215 | "kernelspec": { 216 | "display_name": "Python 3", 217 | "language": "python", 218 | "name": "python3" 219 | }, 220 | "language_info": { 221 | "codemirror_mode": { 222 | "name": "ipython", 223 | 
"version": 3 224 | }, 225 | "file_extension": ".py", 226 | "mimetype": "text/x-python", 227 | "name": "python", 228 | "nbconvert_exporter": "python", 229 | "pygments_lexer": "ipython3", 230 | "version": "3.7.5" 231 | } 232 | }, 233 | "nbformat": 4, 234 | "nbformat_minor": 2 235 | } 236 | -------------------------------------------------------------------------------- /05_creating_deepctr_dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload\n", 10 | "%autoreload 2" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import gc\n", 20 | "import os\n", 21 | "import pandas as pd\n", 22 | "from utils.data import *\n", 23 | "from utils.misc import *" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 3, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "DATA_DIR = os.path.abspath('../../Data/display_advertising_challenge/processed')\n", 33 | "MODEL_DIR = os.path.abspath('models')\n", 34 | "USE_QUIZ_SET = False\n", 35 | "USE_TEST_SET = True\n", 36 | "TRAIN_SAMPLING_RATE = 1.0\n", 37 | "TEST_SAMPLING_RATE = 1.0" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 4, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "if USE_QUIZ_SET:\n", 47 | " train_dataset_type = 'train+valid+test'\n", 48 | " test_dataset_type = 'quiz'\n", 49 | " \n", 50 | "elif USE_TEST_SET:\n", 51 | " train_dataset_type = 'train+valid'\n", 52 | " test_dataset_type = 'test'\n", 53 | " \n", 54 | "else: \n", 55 | " train_dataset_type = 'train'\n", 56 | " test_dataset_type = 'valid'" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 5, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "df_y_train = 
pd.read_pickle(os.path.join(DATA_DIR, '_'.join(['df', 'y', train_dataset_type]) + '.pkl'))\n", 66 | "df_X_train = pd.read_pickle(os.path.join(DATA_DIR, '_'.join(['df', 'X', train_dataset_type]) + '.pkl'))" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 6, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "df_y_train.index = list(range(df_y_train.shape[0]))\n", 76 | "df_X_train.index = list(range(df_X_train.shape[0]))\n", 77 | "\n", 78 | "if TRAIN_SAMPLING_RATE < 1.0:\n", 79 | " df_y_train = df_y_train.sample(frac=TRAIN_SAMPLING_RATE, random_state=42)\n", 80 | " df_X_train = df_X_train.loc[df_y_train.index, :]" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 7, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "name": "stdout", 90 | "output_type": "stream", 91 | "text": [ 92 | "# of obs in sampled train set: 36672494\n" 93 | ] 94 | } 95 | ], 96 | "source": [ 97 | "print('# of obs in sampled train set:', df_X_train.shape[0])" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 8, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "full_pipeline = load_pickle(os.path.join(DATA_DIR, '_'.join(['pipeline', train_dataset_type]) + '.pkl'))\n", 107 | "target_name, num_feature_names, cat_feature_names, n_categories = load_pickle(\n", 108 | " os.path.join(DATA_DIR, '_'.join([train_dataset_type, 'metadata.pkl'])))" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 9, 114 | "metadata": {}, 115 | "outputs": [ 116 | { 117 | "name": "stdout", 118 | "output_type": "stream", 119 | "text": [ 120 | "Elapsed time: 38383 sec\n" 121 | ] 122 | } 123 | ], 124 | "source": [ 125 | "with get_elapsed_time():\n", 126 | " train_dataset_path = os.path.join(DATA_DIR, '_'.join(['dataset', train_dataset_type]) + '.tfrecord')\n", 127 | " dump_tfrecord_file(df_X_train, df_y_train, train_dataset_path, num_feature_names, cat_feature_names, \n", 128 | " target_name=target_name, 
decimals=6, compression_type='GZIP')" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 10, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "del df_y_train, df_X_train\n", 138 | "_ = gc.collect()" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 11, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "df_y_test = pd.read_pickle(\n", 148 | " os.path.join(DATA_DIR, '_'.join(['df', 'y', test_dataset_type]) + '.pkl')) if not USE_QUIZ_SET else None\n", 149 | "df_X_test = pd.read_pickle(os.path.join(DATA_DIR, '_'.join(['df', 'X', test_dataset_type]) + '.pkl'))" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 12, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "df_y_test.index = list(range(df_y_test.shape[0]))\n", 159 | "df_X_test.index = list(range(df_X_test.shape[0]))\n", 160 | "\n", 161 | "if not USE_QUIZ_SET and TEST_SAMPLING_RATE < 1.0:\n", 162 | " df_y_test = df_y_test.sample(frac=TEST_SAMPLING_RATE, random_state=42)\n", 163 | " df_X_test = df_X_test.loc[df_y_test.index, :]" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 13, 169 | "metadata": {}, 170 | "outputs": [ 171 | { 172 | "name": "stdout", 173 | "output_type": "stream", 174 | "text": [ 175 | "# of obs in sampled test set: 9168123\n" 176 | ] 177 | } 178 | ], 179 | "source": [ 180 | "print('# of obs in sampled test set:', df_X_test.shape[0])" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 14, 186 | "metadata": {}, 187 | "outputs": [ 188 | { 189 | "name": "stdout", 190 | "output_type": "stream", 191 | "text": [ 192 | "Elapsed time: 9490 sec\n" 193 | ] 194 | } 195 | ], 196 | "source": [ 197 | "with get_elapsed_time():\n", 198 | " test_dataset_path = os.path.join(DATA_DIR, '_'.join(['dataset', test_dataset_type]) + '.tfrecord')\n", 199 | " dump_tfrecord_file(df_X_test, df_y_test, test_dataset_path, num_feature_names, 
cat_feature_names, \n", 200 | " target_name=target_name, decimals=8, compression_type='GZIP')" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 15, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "if not USE_QUIZ_SET:\n", 210 | " del df_y_test, df_X_test\n", 211 | " \n", 212 | "else:\n", 213 | " del df_X_test\n", 214 | "_ = gc.collect()" 215 | ] 216 | } 217 | ], 218 | "metadata": { 219 | "kernelspec": { 220 | "display_name": "Python 3", 221 | "language": "python", 222 | "name": "python3" 223 | }, 224 | "language_info": { 225 | "codemirror_mode": { 226 | "name": "ipython", 227 | "version": 3 228 | }, 229 | "file_extension": ".py", 230 | "mimetype": "text/x-python", 231 | "name": "python", 232 | "nbconvert_exporter": "python", 233 | "pygments_lexer": "ipython3", 234 | "version": "3.7.5" 235 | } 236 | }, 237 | "nbformat": 4, 238 | "nbformat_minor": 2 239 | } 240 | -------------------------------------------------------------------------------- /02_data_preprocessing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload\n", 10 | "%autoreload 2" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import bisect\n", 20 | "import gc\n", 21 | "import os\n", 22 | "import pickle\n", 23 | "import numpy as np\n", 24 | "import pandas as pd\n", 25 | "from sklearn.compose import make_column_transformer\n", 26 | "from sklearn.impute import SimpleImputer\n", 27 | "from sklearn.pipeline import make_pipeline\n", 28 | "from sklearn.preprocessing import OrdinalEncoder, StandardScaler\n", 29 | "from utils.misc import *" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "DATA_DIR = 
os.path.abspath('../../Data/display_advertising_challenge/processed')\n", 39 | "USE_QUIZ_SET = False\n", 40 | "USE_TEST_SET = False" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 4, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "if USE_QUIZ_SET:\n", 50 | " train_dataset_type = 'train+valid+test'\n", 51 | " test_dataset_type = 'quiz'\n", 52 | " \n", 53 | "elif USE_TEST_SET:\n", 54 | " train_dataset_type = 'train+valid'\n", 55 | " test_dataset_type = 'test'\n", 56 | " \n", 57 | "else: \n", 58 | " train_dataset_type = 'train'\n", 59 | " test_dataset_type = 'valid'" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 5, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "df_train = pd.read_pickle(os.path.join(DATA_DIR, '_'.join(['df', train_dataset_type]) + '.pkl'))" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 6, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "target_name = 'label'\n", 78 | "num_feature_names = df_train.columns[df_train.columns.str.startswith('I')]\n", 79 | "cat_feature_names = df_train.columns[df_train.columns.str.startswith('C')]\n", 80 | "all_feature_names = pd.Index(num_feature_names.to_list() + cat_feature_names.to_list())" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 7, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "name": "stdout", 90 | "output_type": "stream", 91 | "text": [ 92 | "# of num features: 13 \n", 93 | "# of cat features: 26\n" 94 | ] 95 | } 96 | ], 97 | "source": [ 98 | "print('# of num features:', len(num_feature_names), '\\n# of cat features:', len(cat_feature_names))" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 8, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "num_pipeline = make_pipeline(SimpleImputer(strategy='constant', fill_value=0.0, copy=False), \n", 108 | " StandardScaler(copy=False))\n", 109 | "cat_pipeline = 
make_pipeline(SimpleImputer(strategy='constant', fill_value='', copy=False), \n", 110 | " OrdinalEncoder(dtype=np.int))\n", 111 | "full_pipeline = make_column_transformer((num_pipeline, num_feature_names), (cat_pipeline, cat_feature_names))" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 9, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "df_y_train = df_train[target_name]\n", 121 | "df_y_train.to_pickle(os.path.join(DATA_DIR, '_'.join(['df', 'y', train_dataset_type]) + '.pkl'))" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 10, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "full_pipeline = full_pipeline.fit(df_train[all_feature_names])\n", 131 | "\n", 132 | "for i in range(len(full_pipeline.transformers_[1][2])):\n", 133 | " categories = set(full_pipeline.transformers_[1][1].steps[1][1].categories_[i])\n", 134 | " if '' not in categories:\n", 135 | " categories = list(categories)\n", 136 | " bisect.insort_left(categories, '')\n", 137 | " full_pipeline.transformers_[1][1].steps[1][1].categories_[i] = np.array(categories)\n", 138 | " \n", 139 | "n_categories = {feature: len(categories) for feature, categories in zip(\n", 140 | " full_pipeline.transformers_[1][2], full_pipeline.transformers_[1][1].steps[1][1].categories_)}" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 11, 146 | "metadata": {}, 147 | "outputs": [ 148 | { 149 | "name": "stdout", 150 | "output_type": "stream", 151 | "text": [ 152 | "Elapsed time: 3881 sec\n" 153 | ] 154 | } 155 | ], 156 | "source": [ 157 | "with get_elapsed_time():\n", 158 | " df_X_train = full_pipeline.transform(df_train[all_feature_names])\n", 159 | " df_X_train = pd.DataFrame(df_X_train, columns=all_feature_names)\n", 160 | " df_X_train = df_X_train.astype({feature_name: 'int' for feature_name in cat_feature_names}, copy=False)\n", 161 | " df_X_train.to_pickle(os.path.join(DATA_DIR, '_'.join(['df', 'X', 
train_dataset_type]) + '.pkl'))" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 12, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "dump_pickle(os.path.join(DATA_DIR, '_'.join([train_dataset_type, 'pipeline.pkl'])), full_pipeline)\n", 171 | "dump_pickle(os.path.join(DATA_DIR, '_'.join([train_dataset_type, 'metadata.pkl'])), \n", 172 | " (target_name, num_feature_names, cat_feature_names, n_categories))" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 13, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "del df_train, df_y_train, df_X_train\n", 182 | "_ = gc.collect()" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 14, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "df_test = pd.read_pickle(os.path.join(DATA_DIR, '_'.join(['df', test_dataset_type]) + '.pkl'))" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 15, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "for i, feature in enumerate(full_pipeline.transformers_[1][2]):\n", 201 | " categories = set(full_pipeline.transformers_[1][1].steps[1][1].categories_[i])\n", 202 | " df_test[feature] = df_test[feature].map(lambda x: np.nan if x not in categories else x)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 16, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "if not USE_QUIZ_SET:\n", 212 | " df_y_test = df_test[target_name]\n", 213 | " df_y_test.to_pickle(os.path.join(DATA_DIR, '_'.join(['df', 'y', test_dataset_type]) + '.pkl'))" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 17, 219 | "metadata": {}, 220 | "outputs": [ 221 | { 222 | "name": "stdout", 223 | "output_type": "stream", 224 | "text": [ 225 | "Elapsed time: 114 sec\n" 226 | ] 227 | } 228 | ], 229 | "source": [ 230 | "with get_elapsed_time():\n", 231 | " df_X_test = 
full_pipeline.transform(df_test)\n", 232 | " df_X_test = pd.DataFrame(df_X_test, columns=all_feature_names)\n", 233 | " df_X_test = df_X_test.astype({feature_name: 'int' for feature_name in cat_feature_names}, copy=False)\n", 234 | " df_X_test.to_pickle(os.path.join(DATA_DIR, '_'.join(['df', 'X', test_dataset_type]) + '.pkl'))" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 18, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "if not USE_QUIZ_SET:\n", 244 | " del df_y_test, df_X_test\n", 245 | " \n", 246 | "else:\n", 247 | " del df_X_test\n", 248 | "_ = gc.collect()" 249 | ] 250 | } 251 | ], 252 | "metadata": { 253 | "kernelspec": { 254 | "display_name": "Python 3", 255 | "language": "python", 256 | "name": "python3" 257 | }, 258 | "language_info": { 259 | "codemirror_mode": { 260 | "name": "ipython", 261 | "version": 3 262 | }, 263 | "file_extension": ".py", 264 | "mimetype": "text/x-python", 265 | "name": "python", 266 | "nbconvert_exporter": "python", 267 | "pygments_lexer": "ipython3", 268 | "version": "3.7.5" 269 | } 270 | }, 271 | "nbformat": 4, 272 | "nbformat_minor": 2 273 | } 274 | -------------------------------------------------------------------------------- /06_training_deepctr_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload\n", 10 | "%autoreload 2" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import os\n", 20 | "import numpy as np\n", 21 | "import pandas as pd\n", 22 | "import tensorflow as tf\n", 23 | "from sklearn.metrics import (accuracy_score, f1_score, precision_score, recall_score)\n", 24 | "from utils.data import *\n", 25 | "from utils.deep_learning import *\n", 26 | "from utils.measuring_performance 
import *\n", 27 | "from utils.misc import *\n", 28 | "from deepctr.inputs import DenseFeat, SparseFeat, get_feature_names\n", 29 | "from deepctr.layers import custom_objects\n", 30 | "from deepctr.models import DeepFM" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 3, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "def show_metrics_per_epoch(history, smoothing=False):\n", 40 | " metrics_per_epoch = pd.DataFrame(history)\n", 41 | " \n", 42 | " if smoothing:\n", 43 | " losses = metrics_per_epoch['loss'].rolling(window=10).mean()\n", 44 | " val_losses = metrics_per_epoch['val_loss'].rolling(window=10).mean()\n", 45 | " aucs = metrics_per_epoch['auc'].rolling(window=10).mean()\n", 46 | " val_aucs = metrics_per_epoch['val_auc'].rolling(window=10).mean()\n", 47 | " \n", 48 | " else:\n", 49 | " losses = metrics_per_epoch['loss']\n", 50 | " val_losses = metrics_per_epoch['val_loss']\n", 51 | " aucs = metrics_per_epoch['auc']\n", 52 | " val_aucs = metrics_per_epoch['val_auc']\n", 53 | " \n", 54 | " fig = plt.figure(figsize=(10, 4))\n", 55 | "\n", 56 | " ax1 = plt.subplot(1, 2, 1)\n", 57 | " _ = ax1.plot(losses, linewidth=1.2, label='Training Loss')\n", 58 | " _ = ax1.plot(val_losses, linestyle='--', linewidth=1.2, label='Validation Loss')\n", 59 | " _ = ax1.set_title('Loss per Epoch')\n", 60 | " _ = ax1.legend(loc='best')\n", 61 | " \n", 62 | " ax2 = plt.subplot(1, 2, 2)\n", 63 | " _ = ax2.plot(aucs, linewidth=1.2, label='Training AUC')\n", 64 | " _ = ax2.plot(val_aucs, linestyle='--', linewidth=1.2, label='Validation AUC')\n", 65 | " _ = ax2.set_title('AUC per Epoch')\n", 66 | " _ = ax2.legend(loc='best')\n", 67 | " \n", 68 | " return metrics_per_epoch" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 4, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "DATA_DIR = os.path.abspath('../../Data/display_advertising_challenge/processed')\n", 78 | "MODEL_DIR = os.path.abspath('models')\n", 79 | 
"USE_TFRECORD = True" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 5, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "if not os.path.exists(MODEL_DIR):\n", 89 | " os.makedirs(MODEL_DIR)" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 6, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "train_dataset_type = 'train+valid'\n", 99 | "test_dataset_type = 'test'\n", 100 | "model_type = 'deepfm'\n", 101 | "model_path = os.path.join(MODEL_DIR, '_'.join([model_type, 'model', train_dataset_type]))\n", 102 | "score_path = os.path.join(MODEL_DIR, '_'.join([model_type, 'score', test_dataset_type]) + '.pkl')" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 7, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "target_name, num_feature_names, cat_feature_names, n_categories = load_pickle(\n", 112 | " os.path.join(DATA_DIR, '_'.join([train_dataset_type, 'metadata.pkl'])))" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 8, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "if USE_TFRECORD:\n", 122 | " train_dataset_path = os.path.join(DATA_DIR, '_'.join(['dataset', train_dataset_type]) + '.tfrecord')\n", 123 | " test_dataset_path = os.path.join(DATA_DIR, '_'.join(['dataset', test_dataset_type]) + '.tfrecord')\n", 124 | " \n", 125 | " shuffle_buffer_size = 2 ** 20\n", 126 | " train_dataset = extract_dataset(train_dataset_path, compression_type='GZIP', \n", 127 | " shuffle_buffer_size=shuffle_buffer_size, is_training=True)\n", 128 | " test_dataset = extract_dataset(test_dataset_path, compression_type='GZIP', \n", 129 | " shuffle_buffer_size=shuffle_buffer_size, is_training=True)\n", 130 | " \n", 131 | " n = get_n_examples(train_dataset)\n", 132 | " m = get_n_examples(test_dataset)\n", 133 | " \n", 134 | "else:\n", 135 | " df_y_train = pd.read_pickle(os.path.join(DATA_DIR, '_'.join(['df', 'y', train_dataset_type]) 
+ '.pkl'))\n", 136 | " df_X_train = pd.read_pickle(os.path.join(DATA_DIR, '_'.join(['df', 'X', train_dataset_type]) + '.pkl'))\n", 137 | " df_y_test = pd.read_pickle(os.path.join(DATA_DIR, '_'.join(['df', 'y', test_dataset_type]) + '.pkl'))\n", 138 | " df_X_test = pd.read_pickle(os.path.join(DATA_DIR, '_'.join(['df', 'X', test_dataset_type]) + '.pkl'))\n", 139 | "\n", 140 | " train_model_input = {column: df_X_train[column].values for column in df_X_train.columns}\n", 141 | " test_model_input = {column: df_X_test[column].values for column in df_X_test.columns}\n", 142 | " \n", 143 | " n = df_y_train.shape[0]\n", 144 | " m = df_y_test.shape[0]" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 9, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "embedding_dim = 4\n", 154 | "num_features = [DenseFeat(feature, 1) for feature in num_feature_names]\n", 155 | "cat_features = [SparseFeat(feature, vocabulary_size=n_categories[feature], \n", 156 | " embedding_dim=embedding_dim, use_hash=False) for feature in cat_feature_names]\n", 157 | "linear_features = num_features + cat_features\n", 158 | "dnn_features = num_features + cat_features\n", 159 | "all_feature_names = get_feature_names(num_features + cat_features)" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 10, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "model = DeepFM(linear_features, dnn_features, task='binary')\n", 169 | "\n", 170 | "if len(get_available_gpus()) > 1:\n", 171 | " model = tf.keras.utils.multi_gpu_model(model, gpus=n_gpus)\n", 172 | " \n", 173 | "model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=[tf.keras.metrics.AUC()])" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 11, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_auc', patience=3, mode='max'),\n", 183 | " 
tf.keras.callbacks.ModelCheckpoint(\n", 184 | " filepath=model_path + '.h5', monitor='val_auc', save_best_only=True)]" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 12, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "epochs = 300\n", 194 | "batch_size = 2 ** 17" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 13, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "if USE_TFRECORD:\n", 204 | " steps_per_epoch = n // batch_size\n", 205 | " validation_steps = m // batch_size\n", 206 | " \n", 207 | " train_dataset = transform_dataset(train_dataset, num_feature_names, cat_feature_names, \n", 208 | " target_name=target_name)\n", 209 | " test_dataset = transform_dataset(test_dataset, num_feature_names, cat_feature_names, \n", 210 | " target_name=target_name)\n", 211 | " \n", 212 | " train_generator = load_dataset(train_dataset, batch_size=batch_size, is_training=True)\n", 213 | " valid_generator = load_dataset(test_dataset, batch_size=batch_size, is_training=True)\n", 214 | " \n", 215 | " history = model.fit_generator(train_generator, steps_per_epoch=steps_per_epoch, epochs=epochs, \n", 216 | " verbose=False, validation_data=valid_generator, \n", 217 | " validation_steps=validation_steps, callbacks=callbacks)\n", 218 | " \n", 219 | "else:\n", 220 | " history = model.fit(train_model_input, df_y_train.values, batch_size=batch_size, epochs=epochs, \n", 221 | " verbose=True, validation_data=(test_model_input, df_y_test.values), callbacks=callbacks)\n", 222 | " \n", 223 | "history = history.history" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 14, 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "model.save(model_path + '.h5')\n", 233 | "dump_pickle(model_path + '_history.pkl', history)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 15, 239 | "metadata": {}, 240 | "outputs": [ 241 | { 242 | "name": 
"stderr", 243 | "output_type": "stream", 244 | "text": [ 245 | "/Users/a406127/anaconda3/envs/ctr_prediction/lib/python3.7/site-packages/tensorflow_core/python/framework/indexed_slices.py:424: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n", 246 | " \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. \"\n", 247 | "/Users/a406127/anaconda3/envs/ctr_prediction/lib/python3.7/site-packages/tensorflow_core/python/framework/indexed_slices.py:424: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n", 248 | " \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. \"\n" 249 | ] 250 | } 251 | ], 252 | "source": [ 253 | "model = tf.keras.models.load_model(model_path + '.h5', custom_objects=custom_objects)\n", 254 | "history = load_pickle(model_path + '_history.pkl')" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 16, 260 | "metadata": {}, 261 | "outputs": [ 262 | { 263 | "data": { 264 | "image/png": "\n", 265 | "text/plain": [ 266 | "
" 267 | ] 268 | }, 269 | "metadata": { 270 | "needs_background": "light" 271 | }, 272 | "output_type": "display_data" 273 | } 274 | ], 275 | "source": [ 276 | "metrics_per_epoch = show_metrics_per_epoch(history, smoothing=True)" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 17, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "if USE_TFRECORD:\n", 286 | " test_dataset = extract_dataset(test_dataset_path, compression_type='GZIP', \n", 287 | " shuffle_buffer_size=shuffle_buffer_size, is_training=False)\n", 288 | " test_dataset = transform_dataset(test_dataset, num_feature_names, cat_feature_names, \n", 289 | " target_name=target_name)\n", 290 | " test_generator = load_dataset(test_dataset, batch_size=batch_size, is_training=False)\n", 291 | " \n", 292 | " y_true = get_target(test_generator)\n", 293 | " y_score = model.predict_generator(test_generator).ravel()\n", 294 | " \n", 295 | "else:\n", 296 | " y_true = df_y_test.values\n", 297 | " y_score = model.predict(test_model_input).ravel()\n", 298 | "\n", 299 | "dump_pickle(score_path, y_score)" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 18, 305 | "metadata": {}, 306 | "outputs": [ 307 | { 308 | "data": { 309 | "image/png": "\n", 310 | "text/plain": [ 311 | "
" 312 | ] 313 | }, 314 | "metadata": { 315 | "needs_background": "light" 316 | }, 317 | "output_type": "display_data" 318 | }, 319 | { 320 | "data": { 321 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYMAAAETCAYAAADUAmpRAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAgAElEQVR4nOzdd3gU1RrA4d+29A6B0Jt4aAoIKPbeAbFeRcV+EdGrotgLAqKCAiqioqICNsQCKs1rR8BLUxDkACJChEB63z73j9nEgBACZLPZzfc+T56tM/Ptye755pwzc8ZiGAZCCCEaNmuoAxBCCBF6kgyEEEJIMhBCCCHJQAghBJIMhBBCIMlACCEEYA91AOLwKaXaAn9UecoASoHVwDCt9dogbPMt4DogVWtdUNvrD2zDBowIbKc95mf6FnhEa70+GNvcTxwdgWeBx7TWvwTrswc+773ATUBrYAcwGxiltS5RSo0EHgd6aq1/rq3tHkR8vYCJwICD/dw1jb2uylr8k7QMIsvHwNnAucADwInA5CBta1xgWyVBWj+YlcJTwCrgKuAR4DhgqVJKBXG7e7saGABYAo+D9dlfwfy884ArgDmYyXB6LW/nUN0BnHyIy07HLLPNB3hfXZW12Iu0DCLLDmAxZpLPBjwE/sdKqRTgBaA/5h72DMw9bJ9SqnPgteOAMuBVYKTW2lBKPQLcCsQBXwJ3aK13A/dh7rGlKaWWAVatdcfAth4FRgG9gSxgCnA6kAu8oLWeWKU1MxfoA6zTWp9d8UECr98FLNJaX13l+ZXAMuBB4Hql1LdAU2AR5h71n8BtWuvvlFLRwDPAIMzW0qfAcK11qVJqK7AbcACtgA6BmAcByYHY7sGshB4PbH61Uup04PrAZ09VSvUAvsGsxPsB7QKxXKO1LldK9cGs5DtjVu6tgJZa67ZV/3FKqXaB+D/UWt8VeHquUmoLsIs9DVRKfQQkAhO11k8FWhUT9o5fa/15YO/6UmA+cAFwEdAWM7m2CPxfXtJajwnEciPmzkQrQAN3Bj7XdYHt5wfijcP83/YGMjFbMO8qpU4LlMmHwJmBct8eKMeeSqk/gGnAGUA0Zgv230B6dWUNFOwrNq31d4jDJi2DyHI7UI5Z2a8C1gLDAq9NAM7B3NMcDwwHhiqlHJgVsgJuAT4HHgX6KaUGA6OBlwLv743546/KCDx3hFLqFKWUBfMH/D+t9UrMpKMCsb0JTFBK9auy/CmYle4Te623V+B2QdUntdY/AUWYCaRCJyABuBGzgpytlEoE7sesZEZjVnyX7bWdPsC7wM1AF8zKfApwOWZF9wTwS+AzgJkUf2HfbsDci/0SuAS4WCkVBXwENMes6P/CbK3tSy/MveHFe33eF7XWs/Z67xmYFWIWMEYp1RQ4dj/xV0gAXIHPsBzze7EsEOuvwCilVHKgIn8Ds4IejFlZv4NZuS8KrGsgkAN8Enh9SOBzz1BKda+yzd6B7e3dOr0usN0HqmxjEAco6/3FFkj64jBJMogsH2Du+a0DCoH7q/TP9sPcg34NmAREBd57JHAE8LrW+gPMH2Gi1vqzwDIAYzEr8vbA+fvY7luYCehG4LTA+15WSsUHHivgbf6unC6osuw8rfV7Wus9KkH+brVa+Ke9n/MCtwYqzeeBxkCPQPyxmK2eqUDaXtveobUer7X+VGu9FLPVVIjZRZMEpGmt84Etgff/FHi8Ly9rrWditgLA3Ms9EnMPdmrgM96LmRD2xRa4rclv8mGt9YeYSdwKNN1f/HstN0prPVNrXQScB3yFWUYdMcs0Bbgw8N4RgW0cC
7TWWv8J7Ay89h3mmMaRQF9gJmayt2J2UVZ4T2v9odZ69V5x/IjZAn0Es5XyEjC2BmW9v9hc1ZaWqBFJBpElS2s9H/MH5gA+VUq1CrzmwGwtHB/4Ox+zq6dC1S7DPkqpRoFlfMBJgWXOA85USu3xvdFaF2JWCJdiJpM8zMRkx6xkvggsfxJm62RilcVz9/NZVgVu90g+SqmTMPf+V1R52lol/orJtvyB+Hfu9Zmv3de2lVKXAGuAZsDLmHul+0pE+1MUuHUHbi3sO6HtbzKwigrz+KpPKqU+V0q9GmhxVSgO3HoDt9Yaxp8bWGd7zB2GizATyof7iLMi9sZA90ALsqqKx68HYj4Ns8Xy/t7b21ugxdgVs0vIidlS/d++3rsfB4pNHAJJBhFIa/078DBmpTkp8PQC4GjMpvup/N1/rDEH9W5WSl0JPIfZJXBBYBkbZhO+A/Ae5lEe/n1s9iXMrogrgLe01uWBJLEMc9CxE2ayWAT0rLLcvtaF1noTZmviLKXU+0qpS5RSt2MeXVOC2UdfwQq8qpS6DLP7YzdmZbgAs3I8N/DZ52J21+xr26cFPmseZquiN3/vrVdU8OcppVrsK979WI/Zl36LUupKpdRzQMv9fN6NmAn1cqXUeKXUAKXU65h7wx6t9YFmlKwu/goVn7cHZkVajNkaGBh43oaZuAGeDZTnbMwjuKL5uxwuA7YC2zB3ENpgdjH+F7NLbO/t7UEp9RhmC6AZ5jhKJtAqMO5RXVlXF5s4TJIMIteLBPqElVLnAkMxK/NHMQdfXwOe11p7MY/e+C3w3BXAaK31jMDjRzH3qF/D3Fu/dV8bCxy++gPmnu8rVV66DPMHOwFzr3wsf++JHsjNmP3KPTD3OMcAPwEnaK03VHlfMWaCeB1zzOQyrXUJMBKz2+gWzCQ3F3hoP9uaHPh8jwO3AUuADKVUemC5rZjjLd1qGDtaazdmRZuFubcejTmw697PIjdiDmJfzt+Dr6MwB3APpLr49zYfsxIeiLlXXtEqOUpr/S1mwjwKs//eClwUKM8PMAezx2B2f50HbML8bpwG3KW1XlaDWJ8NLHNn4HMamP8zH9WU9QFiE4fJIlNYi3AWOJqoh9Y6JdSx7C3QtTMHKABmAY0wE+V8rfUloYxNiL1Jy0CIIAl07XyMOdD5IWbr4HvMo6eEqFekZSCEEEJaBkIIISQZCCGEIMjTUSiljgOe0Vqfttfz/YHHMI+Tnqa1fu1A68rOLq7sz0pNjSM/v6yWow0/Ug4mKQcpgwpSDqaq5ZCenlij82WC1jJQSt2HeahfzF7POzBPOjoH83j3fwdOp68xu33vw6cbJikHk5SDlEEFKQfToZRDMFsGv2POPzJjr+c7A5srTjVXSi3GnJ+mpseeCyFE2DAMA6fToLjYT0mJ+Vdc7Ke01E9JiUFpqXm/rMygrMxPeblBebn52On043KZyzudBm63+edymbdeb8UfeL0GJx7xEyd32cCDMyYeOLC9BC0ZaK0/Csw8ubckzPlTKhRjzrJYrdTUuD2yXXp64uGGGBGkHExSDlIGFYJVDmVlfnJyvOTmesnL85KX5/vHbUFBxZ+XggIfhYXmn81mISnJRnKyjcREK4mJNhISzNvERCvx8Vbi4200aWLexsdbiY21EBtrJSbGSmyslehoCzEx5q35Z8XhsGC3g8NhISZzKo7d86DPy9jsloMuh1BMYV2EOU1ChUTMk3KqVbUfMD09kezs4mre3TBIOZikHKQMKhxMOTidfrKzfWRn+9i920tOjnk/J8cXqPR95OX5yc/3kZfnwzAgLc1GaqqVlBQbKSlW0tLMCj411UrXrjZSUhyBSt9KUpKVxETzNjq6NnrkDcypwvYUlbMIiyef4iaXQJNrwGkhPZHKcqhpUghFMvgN6KiUSsOcQuAUzNPThRDisLlcfnbu9LFzp5esrL//du3ysWuXl+xs87aszE96up0mTWykp9to3NhOerqN1q3t9OwZTVqaj
caNbaSm2khLM/fW6xVvCYkb7sHiK6O407Ngiz+s1dVZMlBKDQIStNZTlVLDgYWYA9jTtNb7m9ZXCCEq+XwGO3d6ycz0smOHl8xMDzt2eCv/srJ85Od7adrUTrNm5l9Ghp2mTW107RpN06Zm5d+0qZ2UFCtW68FMTFt/2Av+hzfpGJwZl+FpfPaBF6iBsDkDueqhpdIkNkk5mKQcIqcMDMNg924f27Z5+PNPD9u2edm61c327V62b/eQleUjLc1KixYOWra007y5nRYt7DRv7qB5cztHHZWM1erEZgvPSv5ALO5sEjaMAIud4q5TwBq1z/dV/T7U9NBSueylEKJO+f0GO3Z42bLFw5Ytbv74w8Mff5iV/59/eoiJsdCmjYPWrR20aeOgT59YLr3UQatWdlq2dBAVtf+6LT09iuzsCLzWjWGA4SVmx/u4mv0Ld/q+rjF1eCQZCCGCoqDAx+bNbjZv9vD77242b3bz++9mhZ+cbKV9+yjatXPQrp2D3r1jaNs2irZt7SQmyrkCVVmdO0jYMBxXxmWUt70jaNuRZCCEOCzZ2V60dlf+bdrkZuNGN2Vlfjp2jKJDhyg6dHBw0UWJdOgQRfv2jvo3GFtPWZ2ZJK/+FyVHjsbT6IygbkuSgRCiRoqKfKxf7+a331xo7ea338xK3+czUCoq8BfN+efHo1QUzZrZsVgis+8+2Kzl24jd9gqlRz5J/rH/BVts0LcpyUAIsQfDMMjM9LJunYu1a138+quLdetc5OT46Nw5mk6dzIr/ggsSUCqKJk1sUunXopi/phOz/TVK1dNgsdRJIgBJBkI0aIZhsHWrhzVrXKxZ4+KXX5ysXevC4bDQrVs03bpFc8kliTz2WGPatnVE7FE69YG17HewxuGN60hBn4Vgi6vT7UsyEKKBMAzzKJ5Vq5z8/LNZ8a9Z4yIuzsrRR0dz9NHRDBmSytFHm8fjizpi+Ij9cwrRu2ZT3PVlvKnHhyQM+Y8LEaGKinysXu1i5Uonq1aZfwA9e0bTs2eMVPz1gd+NrXQTFm8+BX0WgTU6ZKHIt0CICGAYBps3u1m+vJwVK5ysWOHkzz89HH10NMccE8O//pXIM8+k07y5DOrWC34PcVsnYS9eS1H36ZQldg11RJIMhAhHXq/BqlVOVq50smxZOStWuIiOhj59YujTJ5Zrr02ma9doHA6p+OujpLU34U06mqKj3gh1KJUkGQgRBjweg9WrnSxZUs7Spebef7Nmdo45JpqLLkrklVfaERsbgWfeRhK/i7gt43A1vYSio14LaZfQvkgyEKIe8vkM1qxxsXhxGT/8YFb+bds6OPHEWAYPTmbKlAwaNap6fY8InYYhQthK1pO4biiujMvxJXQCS/07y1qSgRD1gGEYbNzo5vvvy/nhhzKWLi0nI8POiSfGct11ybz6agapqfWvAhEH4CvHXvQz/tg2FHd7A1/8EaGOaL8kGQgRIrm5PhYtKmHp0nK++64cux1OOSWOgQMTefbZJjRpIj/PcObIX0K8fgBny5twhuhw0YMh3zYh6ojXa7B8uZPvvitj/vwSNmxwc8458fTtG8udd6bRvr1DjvSJBD4nWKOJ2v0ZRd1n4I9tE+qIakSSgRBBlJ/v4+uvS/nyyzK++aaUJk3snH12PE8+mU7v3jHExMiEbZHEkfsN8ZsepajH+5Sqp0IdzkGRZCBELfvjDzcLFpSycGEpa9a4OPHEWM46K55HH21EixaOUIcngiQ6azbRWbMp6vEh/phmoQ7noEkyEOIwGYbBr7+6+eKLEubPLyEnx8e558Zz220pnHxyHLGxsvcfyaKy52Mr3Ux561txNb3UnFwuDEkyEOIQ+HwG//ufk3nzSpg3rwSr1cKFF8YzfnwTevWKkQndGgK/m8R1t4HhpaTTeLCGd6tPkoEQNeT3GyxdWs6cOWYCaNzYRr9+CcyY0ZzOnaNk8LcBceR8iSftdMpb3YI35bhQh1MrJBkIUQ3DMFixwsncuSXMm
VNMWpqNiy9OZO7clrRvv++LkYvIZXHtInHDvRi2ODwpx0dMIgBJBkL8gzkG4OLjj0uYO7eYmBgLF12UyOzZLTnySEkADZJhgL+MqLxvcTa/Gnf6eaGOqNZJMhAiYPt2Dx9/XMyHHxbjdPq5+OJE3n67OV27ShdQQ2Z1/kXCb3fjbnwOzlY3hzqcoJFkIBq00lI/8+aV8N57Raxf76J//0See64Jxx4bIwlAYPEUkvTLIEqPeAJPo9NCHU5QSTIQDY5hGKxc6eSdd4r4/PMSeveO4frrkzn33Hiio+UwUAHW8q3E/z6W4i6TKTj263o5sVxtk2QgGozcXB8ffljEO+8U4XYbXH11EosXt5ErfYk9RO94l9htr1DSaRxYG84YkfwKREQzDIMffihn5sxCvv66jHPOiWfcuCb07SvdQGJPttJNYHjwJvUMXJA+NtQh1SlJBiIi5ef7+OCDIt5+u5CoKAuDByczblwTUlIiv7kvDpLfS+y2l4je9QnFXV7Al3h0qCMKCUkGIqL88ouTadMKmTevhLPOimfSpKYyGCz2y+ItxuItweJ3Bi5I33C6hfYmyUCEPbfbz+zZRbzxRiG7dnm5/vpkli5tQ+PG8vUW++H3ELd1Ao78JRQe8yll7e8PdUQhJ78WEbby8nxMn17IW29tpUMHO3fckcq558bLvEDigBI2jMAX25rCnh+F7cRytU2SgQg7mze7efnlfObOLeH88+NZsOAIMjK8oQ5L1Hc+J/FbnsGddgolnSeARQ4jrkpKQ4SN5cvLGTx4BwMGZNKkiZ0lS9rwwgsZHHVUwzrqQxw8W+lmUpafiz+qMZ60UyQR7IO0DES9ZhgGX31Vxgsv5LFjh5ehQ1N55ZUM4uLkxyxqwFdGVO43uBudRtHRb+KPax/qiOotSQaiXvL5DObMKeGFF/IwDLjzzjQGDEjAbpf+XVEzjrwfiN/4EM6WN4ItXhLBAQQtGSilrMAUoDvgAm7WWm+u8vo9wCDAD4zVWn8SrFhE+PB4DGbPLuKFF/JJS7PxyCONOfPMODk0VNSYxVuMYY3FXrSSou7v4o9tFeqQwkIwWwYDgRit9fFKqb7Ac8BFAEqpFOBO4AggHvgZkGTQgDmdft5/v4jJk/Np08bBuHFNOOmkWEkC4uDsWEDKivso6jaV8rZ3hTqasBLMZHASsABAa71MKdW7ymulwJ+YiSAes3UgGiCXy8/MmWZLoEuXKKZMyeDYY2VAWBw8R+7XsPstCo/5CH90RqjDCTvBTAZJQGGVxz6llF1rXXEM4HZgPWADnjrQylJT47Db/55KID09sRZDDV/hWg4ej8Hbb+cyatROjjoqljlzOtCnT/whry9cy6E2NdgyyJwD2YuhxzhQA2gkrUng4L8PwUwGRUDVaKxVEsH5QDOgXeDxQqXUj1rr/+1vZfn5ZZX309MTyc4uruVww084loPfbzB3bglPP51L8+Z2XnmlKX36xAL+Q/4s4VgOta1BloFhkLju32D4KFHjMHJKGmY57EPVcqhpUghmMvgR6A/MCowZrK3yWj5QDri01oZSqgBICWIsIsQMw+Dbb8sYMyYXmw2eeaYJp5wiYwLiEBgGUbvn4Ek7nbK29+JLUKGOKCIEMxl8ApytlFoCWIAblFLDgc1a67lKqbOAZUopP7AY+DKIsYgQ+vlnJ6NG5bBzp5eHHmpEv34JkgTEIbG6skj4bTiGIxlP2umSCGpR0JKB1toP3LrX0xuqvP448Hiwti9CLzPTw9ixufzwQxkjRjRi0KAkOU9AHBrDwOLJw1aygfKWN+JpfFaoI4o4ctKZqHXFxT5eeCGf6dMLueGGFJYubUtCgpwxLA6NtXw7CRvuxpN6khwuGkSSDESt8fsNPvigiLFjczn11Di++aY1zZs7Qh2WCGd+N4nrbqWs/QN40k4OdTQRTZKBqBWrVjl58MHdWK0Wpk9vTs+eMaEOSYQxa9kWEjY+RHHXVyns9blMM10HJBmIw1JQ4
OPJJ3OZP7+ERx9tzOWXJ2K1yg9XHLrorI+J3fo8JZ3GYTiSQx1OgyHJQBwSwzCYNauY0aNzuOCCBBYvbiPXFxaHxVa6EatrJ56U43D1WQg2aV3WJUkG4qBp7eK++7IpK/MzY4Z0CYnD5PcS++cLRO/+jJLOE/HHtAh1RA2SJANRY06nn0mT8nn77ULuvTeN669PlktMisNice3GsMeDNYqCPgsb9AXpQ02SgaiRn34q5667dqFUFF9/3ZpmzeSrIw6D303cH8/iyPuewl5zKW9ze6gjavDkFy2qVVrq56mncpkzp5innmpCv34JoQ5JRIC4Lc9g2JPMI4WsUg3VB/JfEPu1ZEkZd921m969Y/juuzakpckAsTgMvnLitzyNN6ErZR0ekcNF6xlJBuIfSkr8PPlkDvPmlTJuXDrnniutAXF4rM4dJP18Fc7mg3BlXCaJoB6SOQLEHpYtK+eMM7ZRXOznu+9aSyIQh8dbQkzmNPzRTSnq8Q7O1kPAItVOfSQtAwGA220wblwu779fxLhxTbjgAkkC4vA48r4jfuPDOFveDFjxx7QMdUiiGpIMBJs3u7n11iyaNbPzzTetSU+Xr4U4dBZvEWDB6txBUY8P5LyBMFGj9ppSKl4pdbRSyqKUOvRrE4p6xTAMZswopF+/7QwalMT06c0kEYjDEpW9kJTl52MrWY+r+VWSCMLIAX/5SqkzgVcxr1V8ArBGKXW11npRsIMTwVNQ4GP48N388YebOXNaolR0qEMSYc5WvJaYHTMpOOZjjOimoQ5HHKSatAzGAicBBVrrncCpwPigRiWCatUqJ2edtY2MDBvz57eSRCAOS9Tuz0j89d/4Eo+iqPsMSQRhqibJwKq1zqp4oLVeH8R4RBAZhsErr+RzzTU7GDkynbFjmxATI0d2iEOXsP4/RO+aQ8mRT4U6FHGYatJBnKmU6gcYSqkUYBiwLbhhidpWWOjjP//ZRVaWl/nzW9GmjVx0RhwiwyA660M8KcdS1v4B/DHNQx2RqAU12S0cAlwNtAJ+B3oAtwQzKFG7fv3Vxdlnb6dZMztz57aURCAOmdW5k6RfrsKR/wOGPUUSQQSpScugu9b6qqpPKKUuAT4OTkiiNr3/fhFPPJHDk0+mc8kliaEOR4Qrw8Dq3I7FV0J5qyF4Gp0e6ohELdtvMlBK/QuIBkYppR7ba5mHkGRQr7lcfh56KJulS8v55JMWdOokg8Ti0FjLt5H42514kvtQ1uEhfAldQh2SCILqWgZJmIeSJgJVdwO8wMPBDEocnt27vdxww04aN7axaFFrEhJkkFgcAsMAIGHjI5S1G4En9YQQBySCab/JQGv9GvCaUupMrfVXdRiTOAy//OLk+ut3cuWVSYwYkSbXIxaHxFa6mYQNwynu8hJF3aeHOhxRB2oyZuBSSs0BEgAL5slnbbTWbYMZmDh4c+cWc//92YwfL9cdEIcuKns+cVuepqTTs/hjW4U6HFFHapIMXgeeAa4HXgDOB1YFMSZxkAzDYOLEfGbMKGTWrBYcdZSMD4iDZyvZgL14Le7GZ1PQZxFY5XvUkNQkGZRrrd9USrUF8jEPK10Z1KhEjblcfoYP383GjW4WLGhF06Yyt5A4SH4vcVsnEpWzgOLOEzEcKaGOSIRATUYWnUqpNEADfbXWBiCT1dUDeXleLr/8L8rK/MyZ01ISgTho1rLfwWLBH51BQe8F+BKPDnVIIkRqkgwmAB8AnwGDlVLrkJZByP35p4cTTtD06BHDG280Iy5OjhgSB8HvIm7zKJLW3YbFW4KzxbVglZMRG7ID1iBa6w+Bc7TWxUAv4BrMs5JFiKxb56J//+0MG5bOqFHpcsSQODiGQcxfb2M40ijoPQ/DkRzqiEQ9UN1JZ+nAcCAPmIh5fkE55rkHCwCZmjAEfvqpnBtu2MnYsenccksTsrOLQx2SCBe+MuJ/fxJ/dDPK29we6mhEPVNdJ/M7QDHQGIhSSs0DZgBxwN11EJvYy3//W
8odd+zipZeacsYZMmwjas7iKSR5ZT+cLQbjbHlTqMMR9VB13UQdtNaXAv2Aq4DPgZlAJ631u3URnPjbRx8V8Z//7GL69GaSCETNeUuI2zIewxZH4TGf4mx1i1yQXuxTdd+KIoDAWEEacJnW+mmttbtOIhOV3nijgFGjcvnooxb06RMb6nBEmHDkfUfKivPwRTcDix0jqlGoQxL1WHXdREaV+7u01kuDHYzYk2EYTJiQxwcfFDNnTkvatpWjPcSBWTwFWHzlYHgp6vEh/phmoQ5JhIHqkkGiUupkzNZDfOB+5WErWuvvq1uxUsoKTAG6Ay7gZq315iqvnw88HljnSmBY4BwGgZkInnwyly+/LOWzz+QcAlEzUdnzid88mpIjx+JpdGaowxFhpLoaJhMYFbj/V5X7YLYazjjAugcCMVrr45VSfYHngIsAlFKJmNdRPk1rnaOUug9zoDr74D9C5DEMg1Gjcvn221I+/rgljRrZQh2SqO8MA6szk+is2RT0moMRlR7qiESYqW7W0sO9esVJmIegorVeppTqXeW1E4C1wHNKqfbA61prSQSYieDxx3NYsqScjz9uSWqqJAJRvahdn8Lamfi7fUjxUW+EOhwRpoLZ95AEFFZ57FNK2bXWXsxWwOmYl9AsAX5QSi3VWm/c38pSU+Ow2/+uGNPTI++qXYZh8MADO/jf/1x8+60iNfXA/55ILIdD0WDLYeXd4M6DE98jPTop1NHUCw32u7CXgy2HYCaDIswL41SwBhIBQC6wXGudBaCU+h4zMew3GeTnl1XeT09PjMiTrZ5+OpcFC0r4+OOWeL3lZB+grRSp5XCwGlw5GAbRWR/gi22Pr9lwDEcK6dENrAz2o8F9F/ajajnUNCkE84DjH4ELAAJjBmurvLYK6KaUaqyUsgN9gfVBjKXee+GFPD7/vITZs1uQliZdQ2LfrK4skn7+F478ZfgSOskMo6LWHLBloJRKBcYBHYDLMQd+79Fa5x9g0U+As5VSSzCPGLpBKTUc2Ky1nquUehBYGHjvLK31r4f6IcLd668XMHNmEXPmtKRxYzlqSOyDYWArWYc/qgnlbYbhSTs11BGJCFOTmuc1YBFwLOb0FDsxz0S+sLqFtNZ+4Na9nt5Q5fX3gfcPJthI9OGHRUyenM9nn7WkWTNJBOKfrOVbSVx/J96k7pR2HIUnukmoQxIRqCbdRO201lMBv9barbV+GGgZ5LgahC+/LGXkyBw++KA5rVrJCWViL4YfDIO4P1+ktMODlHYcdeBlhDhENUkGXqVUMnIvT8sAACAASURBVIEzkpVSHQF/UKNqAJYuLefOO3fx9tvNUEouLyj2ZCvdRPLK/thK1lLS6Tm8KX1DHZKIcDXpl3gc+BZorZT6FDgeuDGYQUW6detc3HTTTl5+OYPevWWuIbEnR95i4jc9Skmn5+TKY6LO1CQZfAmsAI4DbMAQrfWuoEYVwf76y8PVV+9g7Nh0Tj01LtThiHrEVryOqLxvKG91CwV9FoI1KtQhiQakJslgG+aRQTO11suCHE9EKyz0ceWVOxgyJIWBA+XEGBFgGMRteYao3P9S3HkSWKXbUNS9miSDbsClwJNKqRaYRwDNrDrpnDgwj8fgxht3cvLJsQwdmhrqcEQ9YSteiy9e4U08mrJ294JVjigToVGTayDna61f11qfiXn94/5UOURUHJg5zcRuYmOtjB4tE4gJwOckftMTJGy4D4snH3eTCyQRiJCqyUln6Zgnm12JeZGbd4GLgxxXRHn11QJWrHDyxRetsNnk4vUNnuEjKmcB/uh0Cnt/DhY541yEXk12RX4GZgF3a61XBjmeiPP116VMnpzP/PmtSEiQyw02aL4y4jePxrDFUnbEY6GORog91CQZtAqcTSwO0ubNbm6/fRfTpjWTk8oaOr+blJX9cDa/FmeL60IdjRD/sN9koJRapbU+BvOks6pXILMAhtZa2rbVKCnxM3jwDh58sBF9+8q5BA2VxVtE3
JZxlLUbQUGvz8EmhxOL+qm6i9scE7j9R9+GklNmq+X3GwwblsUJJ8Ry7bXJoQ5HhIgj73sS9IOUtRmGYU8Ci4wXifrrgJ3YSqmlez22Yp6EJvZj8uR8du/2MXasTCjWEFk8+dhKNIY9icJjPsLVfJAkAlHvVddN9DVwWuB+1TEDLzA3uGGFryVLynj11QIWLWpFVJRUAA1N1O4viP/9SUqPeBx3+rmhDkeIGquum+gMAKXU81rrO+supPC1a5eXW2/NYvLkprRoIQPGDYrfg8VXRlTOQgp6zcWIahzqiIQ4KNW1DPpprT8HVimlBu/9utZ6elAjCzM+n8GQIVlcc00yp58eH+pwRF0xDKJ3fUTstikU9F5ASZcXQh2REIekukNL+wCfE+gq2osBSDKoYuLEPKxWuOeetFCHIupQ3JaxWJ1/UdjzI5lYToS16rqJHg/c3lDxnFIqCfO8g3V1EFvY+PHHMt56q5CvvmotZxg3BIZB9M53MezJlLW9B2wxoY5IiMNWk6OJblJKTQtMS7EemK2UGhP80MJDfr6PYcN28cILTWnaVOaWiXQWdw5Jqy/DUbgCT9opkghExKjJ/Ai3AfcCVwFzgKOA84IZVLgwDIN7791Nv34JnHGGjBNENMOPI28xhi2Bsnb3UtJ5onnugBARokaT5Wit84ALgC+01l5ATqkF3n+/iM2b3TzySKNQhyKCyFq2heRVA4nKmQ/WaLypx4c6JCFqXU36NdYppT4H2gP/VUrNApYHN6z6b8sWN6NG5fLxxy2IiZEJ6CKS4QPDT0zWbEo7PII35dhQRyRE0NSkFrsRGAccp7V2AzOAm4MaVT3n9RrcdlsW99yTRufOMjNHJLKVaJJXXIijYAll7e+TRCAiXk2SQRTQD/hSKfUzcAbQoGvAKVPyiY+3ctNNMu9QJLIVryVx/VBK1dN40k4NdThC1ImaJIPJQBxmC+E6wAG8Esyg6rONG91MmZLPhAlNsch8MxHFVvwr8RsfxZfQjYLeC/Em9Qh1SELUmZqMGfTSWnev8vh2pdT6YAVUn/l8BnfeuYv7729EmzYy3UQkidsynqicRRR3ed6cVM4i/1/RsNSkZWBVSqVUPAjc9wYvpPrrzTcLsdvhuuukeyhS2AtXYfHk4U47hYLe8/EldAl1SEKERE1aBhOA5UqpiplKBwBPBS+k+mnnTi/PPpvLZ5+1wmqV7qGw5ysnfstT2AtXUtz1Fbwpx4U6IiFC6oAtA631m8DFwBZgK3CJ1npakOOqdx56aDfXX59Cx44y/0zY85XjKFqNL7oFhb0+wx/bKtQRCRFy1c1aagWGAUcCi7XWL9VZVPXMwoUl/Pabm5dfzgh1KOJweEuI3/wEFsNLSeeJeFJPCHVEQtQb1bUMpgCXA6XAQ0qpx+ompPqltNTPQw9lM358Ezm5LJwZBsm/XI0voRslnSaEOhoh6p3qardTgVO11g9gnltwad2EVL9MmpRHnz4xnHyyXMg8HFk8hST8djdWlznNtLPldXIJSiH2obpk4NRaGwBa61zMaxg0KL//7mbGjEJGjkwPdSjiENjzl5Ky4gI8KX3xR7cAq8wqK8T+VPfr2Lvy9+/zXRFszJgcbrstlYwMqUTCicWTh610I/6Y5hQc8wlGdJNQhyREvVddLddGKTVtf4+11jcGL6zQW7GinNWrXUyZIoPG4SRq11zitzwdmFiub6jDESJsVJcMhu/1+LuDWXHgaKQpQHfABdystd68j/d8AczRWtebKS4Mw2DMmFxGjEgjNlYGjcOCtwSsUTgKl1PQ6zOMKJlWXIiDUd1lL98+zHUPBGK01scrpfoCzwEX7fWeMUDqYW6n1n39dRnZ2V7+9S+5eEm9Zxiw9V1S1j5DYa/PKT1ydKgjEiIsBXO39yRgAYDWehnQu+qLSqnLMMchFgQxhoPm9xuMHp3Dww83xm6Xo07qu9jtL8Oubyns9QWGI+XACwgh9imYI6NJQGGVxz6llF1r7
VVKdQMGAZcBNTp/ITU1DrvdVvk4PT2xNmOtNHNmLsnJDgYPzgiLWUmDVQ71mmHAljfB74Ge94LVRuNQx1QPNMjvwj5IOZgOthxqlAyUUvFAB2AtEKe1Lq3BYkVA1WisgUtmAgwGWgBfA20Bt1Jqq9Z6v62E/Pyyyvvp6YlkZxfXJPSD4nL5efjhv3jxxQxyckpqff21LVjlUJ9ZvEUkrbkOb9wRlB7xOOSWNchy2JuUgUnKwVS1HGqaFA6YDJRSZwKvAjbgBGCNUupqrfWiAyz6I9AfmBUYM1hb8YLW+r4q6x8JZFWXCOrK9OlFKBXF8cfLJZ7rHcNP1O7PcTfpR0nHJ/AlHh3qiISIKDUZMxiL2f9foLXeiXlm8vgaLPcJ4FRKLQEmAncrpYYrpQYccrRBVFzsY+LEPB5+WDoc6htb6WaSVw7AUbgcDJ8kAiGCoCbdRFatdZZSCgCt9fqK+9XRWvuBW/d6esM+3jeyBjEE3ZQpBZx+ehxdujToK3rWL4YP/E4c+T9S2vFxvMl9Qh2REBGrJskgUynVDzACF7YZBmwLblh1a/duL9OmFfDll61DHYoIsJX8RuJvd1HWepg5n5AQIqhqkgyGAM8DrTCvafAV8O9gBlXXJk7M44orkmjdWi51WB9YnZkkrr+D4s4TpEtIiDpywGSgtd4NXFUHsYTEjh0ePvqomCVL2oQ6lAbPXvQLsdteorjrqxT0WQQWOftbiLpSk6OJ/mAfM5ZqrdsHJaI6NmVKAVdemUTjxjIZXSjFbp1EVPYCSjpPCkwxXf/P8RAiktSkBjytyn0H5iUwI2KUNSfHy6xZRXz/vbQKQsVeuBzDnoq78fmUt7kDLLYDLySEqHU16Sb6c6+nxiulVmDOKxTWXnutgAEDEmSK6lDwlRG/eQz2krUUd3kRf+wRoY5IiAatJt1Ep1R5aAG6AmF/VlZJiZ+33y5k/ny5GHpds3jysbpz8MW1o/TIMTI2IEQ9UJNd4ieq3DeAHCDsj/V7770iTjghjnbtokIdSoNh8RYTv3kkFk8hxUe9ji++Y6hDEkIE1CQZzNJavxz0SOqQz2fw6qv5vPyyXLimLiWsvwN3o7NwNb861KEIIfZSk/b5sKBHUccWLCglPd1Onz5h39tV71k8BSSsG4ateA3FR72Jq8U1ckF6IeqhmrQMtiulvgZ+AsorntRajwpaVEE2dWoB//63zH0fbPbCVSSuv52ytnfjSzhKkoAQ9VhNksGyKvfD/te8dq2Tbds89OuXEOpQIpbFnYujYCme1BMp6DUHIyo91CEJIQ5gv8lAKXWd1vptrfUT+3tPOHrttQJuuCEZhyPs81r9YxhE7f6U+C3jKe3wMIaj3l3RVAixH9W1DO4EDvc6yPVKbq6P+fNLWbpUpqmubRZ3LoYjFVv5nxT0/hzDkRbqkIQQB6FBHeD97ruFnHdevEw9UZsMg+gd75GycgBW1w7K294liUCIMFRdrdhVKbVlH89bACPc5iby+w2mTy/i1VflcNLaFJ01C0fBTxT0mY9hTwp1OEKIQ1RdMtgMXFBXgQTbd9+VkZRkpWfPiJhWKbQMg5i/3sLqyqKs/QO4mv0r1BEJIQ5TdcnAvY95icLWu+8WcfXVSVjk8MbD43eT/PMVeOMVpUc8JoeLChEhqhsz+LHOogiy3Fwf33xTxiWXJIY6lPBl+In+ayYYfoo7TaRUPQO2+FBHJYSoJftNBlrr2+sykGCaPbuIs8+OJyVFpkc+FLbSTSSv7Ie9bBMA/rh2IY5ICFHbGsRhNe+/X8SoUXLi00Hze7F4C7CV/U5pxzF4k48JdURCiCCJ+ENL1693kZ/v58QTZR6ig2ErXkfKivOJylmEO/08SQRCRLiIbxnMnl3MpZcmYrXKQGdNWTyFJGx8gOLOE/Eldgt1OEKIOhDRLQO/3+Djj4u57DIZOK4Je9FqklZfhmGLpbDXZ5IIhGhAIrplsHRpOSkpVjp3lnMLDiRm2ytE75pDS
ZcXwCoX/BGioYnoZPDppyVyOOkB2At+AgzcTfrjbHWLXJBeiAYqYruJvF6DL74oYcAASQb75CslXj9A/O9j8Uc1xR/TQhKBEA1YxLYMli4tp3lzO23bOkIdSr1jdWZi2OLwJXSh9Min5CxiIUTkJoMvviihf3+5gE1VFm8R8Zsex+raRVH3d3C2GBzqkIQQ9UREJgO/32DevBI++aRlqEOpV+I2j8GT3AdXs6ukNSCE2ENEJoNVq5wkJ9vo0EGOirF48knY+BDOjMso7TQu1OEIIeqpiBxAXrCglPPOk0nUbCUbSFnRD1fjc/CknRHqcIQQ9VhEJoOFC0s577yGO15gcecQs/0NfHHtKeg1F3fTi6VbSAhRrYhLBtu2ecjN9TXYi9hEZ80mZeUA/NHNwBqFEdUo1CEJIcJAxI0ZfPNNGaedFtfg5iKyurLw21PA76Gg9zwMR0qoQxJChJGgJQOllBWYAnQHXMDNWuvNVV6/G7gy8HCe1vqJ2tju11+X0q9fA+oiMgyid7xD3LaXKTr6TVzNrwp1REKIMBTMbqKBQIzW+njgAeC5iheUUu2Bq4ETgL7AOUqpow93gx6PwY8/lnPaaXGHu6rwYBg48r7BUbSSgj4L8MUfGeqIhBBhKpjdRCcBCwC01suUUr2rvLYdOE9r7QNQSjkA5+FucOVKJ23bOkhPj7jerz0ZfmIyp8HWTXjaPoOnkRwpJIQ4PMGsNZOAwiqPfUopu9baq7X2ADlKKQswHlittd5Y3cpSU+Ow2/+eOyc9/Z9zDv30UzEXXpiyz9cihmHAdwMg6Ujo+Szp9gbSCjqAiP6f15CUgUnKwXSw5RDMZFAEVI3GqrX2VjxQSsUA04Bi4LYDrSw/v6zyfnp6ItnZxf94zxdf5DNyZON9vhb2DB8x21/D1fQS6Pg8RlQ66fa4yPysB2l/34eGRMrAJOVgqloONU0KwRwz+BG4AEAp1RdYW/FCoEUwB/hFaz2korvocOTkePn9dw+9e0fe5S1tpRtJXnEhVvduDHsSRpRcz1kIUbuC2TL4BDhbKbUEsAA3KKWGA5sBG3AqEK2UOj/w/ge11ksPdWNLl5Zz3HExREVF0CGlfg9W119YvCWUqmfwJnUPdURCiAgVtGSgtfYDt+719IYq92Nqc3vLljnp2zdyWgW24jUk/nY3zmaDcLa6KdThCCEiXMScgbxsWXlkJAPDAL+b+C3PUNzlRUkEQog6ERHJoLjYx++/u+nRo1YbG3XOXriSlBXnYvGVUdT9HXwJXUIdkhCigYiIA/KXL3fSo0d4jxdE/zWTmJ3vUdzlJZlKQghR5yIiGfzvf06OOy48WwX2/KVY3btxN7kQV/NBYImIxpoQIsxERM2zfHk5ffqE2XiBr5z4DSOI3/I03qTuGI5USQRCiJAJ+9rH7zdYtqycnj3Dp2VgK9kAFhvelL4UHvMp/ti2oQ5JCNHAhX0y2LTJTVqajUaNbAd+c4hZPIUkrL+D+E2PA+DKuFQuOiOEqBfCfsxg9WoXJ5wQBl1Ehp+YzNfxpJ6EK+MKSQJCiHolApKBs153EVncuSRsfAB3ozMpb3dPqMMRQoh9CvtuojVrXHTvXj+TgdWZScrK/rjS++FqduWBFxBCiBAJ62Tg9RqsXOmka9eoUIeyB4trN3G/P4k/ugUFvefjbnpRqEMSQohqhXUy2LzZTevWdpKS6s/gcXTWR6SsGog3qRdYLBiO5FCHJIQQBxTWYwa//lp/uoiszh0Y1ij89mQKes+XJCCECCth3TJYt85Nt27RoQ3CMIj5azpJP1+BzZmJp/FZkgiEEGEnrJPBb7+56Nw5hOMFhg9byTpsxWsp6L0Ab1KP0MUihBCHIayTwcaNbo48MgTJwPATs30qSWtuwJfYjdJO48GeUPdxCCFELQnbZFBS4ic310ebNo4633bi2puwOTMp6vZqnW9bCCGCIWwHkP/4w03btg5stjo6k9fwEfvnS7gbn0NxlxelJSCEi
Chh2zL4/XcP7dvXTavAVrqJlBXnY/EV44trJ4lACBFxwrZlsGWLh/btgzxe4PdgL1mP35FCcadn8SUeHdztCSFEiIRty2DLFjcdOgSvZWAv+oWUFefiKPgRf2wbSQRCiIgWti2DP/7wMGhQUu2v2O8Fi42YzGkUd3kZX4Kq/W2IBm3VqhU89tiDtG3bDovFQmlpKc2bt+Dxx8fgcDjIz8/npZcmkZW1E7/fT5MmTbnjjrtp1KgxAL/8spo333wNr9eL0+nkggv6M2TIjf/Yzvfff8uHH76HYRi4XC4GDbqW008/K6ifze1288wzo3n44SewWq24XC4uv3wAV155NYMGDQZg584dPP74Q0yd+lblcp9+Opvc3FxuumkIp53Wl27dzJ0vn8+Lz+dn5Mgnad68BV6vlxkz3mT58p+wWq3Y7XZuueU2unbtFlj3Tp54Ygz5+Xm4XC6U6sydd96Dw7HvHcesrCzGjHkMwzBISkri8cefJCYmhsWLv+ett17HZrNx4YUDGDDg4j2W27RJM3HieKxWK1FRUTzyyBOkpTVi5sy3+O9/FxEfH8+gQYM58cSTef7559i0SQOQl5dLQkJi5WfPz89n6NCbePvt94iOjmb37l089tiDWK1WnnhiLOnpTVi4cB42m42zzjoXgKVLfyQ3N5t+/QbW2v8NwjgZbNvmqfUjieyFy0nYcC9FR8+kpMvztbpuIarq1as3TzzxVOXjkSMfZvHi7zjttDN5+OERXHXVNZx88mkALF/+E/fddzdTp75FVtZOJk0az3PPvUhaWiNcLid33HErnTsfQefOPSvXt3btL8ya9S7jxk0iLi6OwsIChgy5gbZt29OuXfugfa5Zs97l9NPPxmo1Ox2+++5rzjzzHObN+5wrr7ym8vnqJCUlM3ny1MrHn376Ee+/P5Phw+/n9ddfwe/3MXnyVKxWK1lZOxkx4k6eeWYiTZtmcNttt3HnnfdVJodJk57l9ddfYejQO/YT7zucccbZXHLJ5bz66kt8/vmnDBx4GS++OIHXXptObGwsQ4fexEknnUJaWqPK5Z5//jnuvnsEHTsqPv30I955520uuGAAX365sLKiHzr0Rnr16sOdd5qzFXu9XoYOvYn7738EgJ9+Wsorr7xIXl5u5Xq//vpLBg0ajGEYfP31lwwceCmLF3/PqFF/f1eOP/5E7rnnP5x++lnEx9fe+GVYJgOXy09+vp+MjNoLP2rXp8RmvkHRUdPwx7aqtfWK+u+UU/5kwwZ3ra2vU6covv++TY3f7/F4yM3NITExCa1/IyEhoTIRAPTpcxyfffYpv/yymp9/XsV5511YWTFFR8cwYcJkWrduQl5eWeUyn332KZdffhVxcXEAJCenMHXq2yQmJvLGG6/SqFEjBg68jD//3Mr48WOZPHkq1157Ba1atcHhsJOZmcmYMc/QrFlzvvnmv/zyy8/cfPOtPP30KAoLCwG4664RdOhwROU2DcNg4cJ5vPnmu3vE8Z//3EN+fh5Ll/7IiSeefNDluWtXFomJZi/AokXzmTVrTmVSychoxiWXXMH8+Z9zzDG9ycjIqEwEAEOH3oFhGACMHv0Yt9xyGxkZGZWvd+yo2L17FwBlZaU0bdqUrVv/oEWLViQlmds8+uju/Pzzas444+9W1ciRY2nc2Gyp+Xw+oqKi2br1D3r27EV0tDkrQsuWrdm8eRPduh0FwOzZ73PssX0ry8xqtTBp0hRuuunayvXGxsbhcjkxDIiJieWDD97l8suvxLLX9U+OP/4E5s37nMsvr73ZkMMyGWRmesjIsNXKYaWOvMXYStfjbH4t7iYD5DrEDdDBVNy1ZeXKFdx++78pKMjHYrEwYMAl9O59LF999SXNm7f8x/ubN29BVtZOcnKy6djxyD1eS0hIwGbbc7LGnJxsmjdvscdzFZXb/pSXl3P99Tdx5JGd+OST2SxY8AU33HAL8+Z9xtChdzB9+jR69TqWiy++jO3btzF27BO8/PIbl
ctv376NhIQE7HZ75WOns5yOHY/kwgsH8P7771SbDCoqvKKiQm6//d+UlZVSVFTEqaeezk033Up+fh6JiUmV669aNuvX/0pOTjatWu25I1dRMQM8+uiof2wzPb0Jr7zyIl9+uRCPx82NN/6brVu3kpDw9x53XFw8paUleyxXkQjWrv2Fjz+exeTJr1FYWMDMmW9SVlaKx+Ph11/XVHYveTwe5sz5mNdem165jj59+v4jnrPPPo8XX5yA1Wpl0KDBvP32G3Tv3pPx48fSqVMX+vc3u4Y6dOjIhx++L8lg2zY3LVocZheR302CfgBb+VaKOz8PtjC4WpqIGBXdRIWFBdx99zCaNWsOQHp6OllZO/7x/szMbfTpcxw5OdmVe7IVNm3aSHZ2LOnpf1eETZs2Y/fuXXskjjVrft6jqwOo3Guu0Lp1W8CslIYNu5n+/QdSWlpK+/ZHsGXLZlatWsFXXy0CoLi4aI9lCwsLSE39e/2fffYp5eVOhg+/AzBYu3YNmZnbiYuLw+PZsyVWVlZeWXFXdBP5fD7Gjh2J3e4gLi4Oh8NBcXERXq93j4SQmbmNpk0zyMhoxrJlP/wjprVr13DSSaf8o0wBpkx5noceGslxxx3PkiWLGTPmcYYMuZ2ystIqsZXukRwqfPXVIqZPn8a4cZNITU0lNTWVSy+9gnvuuYMmTTLo0qUryckpAKxY8RM9ehyzz/VUFRcXV9mNNHHiOK677iYmTBjHuHETeeSR+znrrHOJjY2lUaPGFBUVVruugxWWu8Hbt7tp3vzQ85i94Cew2HGlX0Bhz4+kW0iETHJyCo8+OppnnhlDTk4ORx3VndzcXBYv/r7yPcuWLSEzM5MePY7h7LPP47PP5pCfnw9AWVkZ48ePJTs7e4/1Xnhhf957bwbl5eUA5OfnMXbsKJxOJ1FR0eTmmv3UGzdu2GO5ir3zhIQElOrMCy9M4IIL+gPQpk1brrhiEJMnT2X06Kc555zz91g2NTWNkpJiwOwf/+qrRUyZ8hoTJrzIhAmTueaa6/jkkw9JTU2jrKyMP/7YApjdLCtW/ETnzl33WJ/NZuO++x7m+++/YcmSxTgcDk4//SymTp2C3+8H4K+/Mvnkk9mcf34/unY9iszMTNav/xUwE920aVNZs2b1fss/MTGpst+9cePGFBcX07ZtOzIzt1NUVIjH4+Hnn1dXDmhXWLhwHh99NIsXX3yVFi1aBso4n7KyMl5+eRojRjzErl27aN++AwArVvyPvn1P2G8ce9uyZTPR0dG0aNESl8uJxWLB7/dVJtHi4mJSUlJrvL6aCMuWwV9/eQ5pvMDiKSB+48NYPbkUd5uKp3Fwj6wQoibatWvPZZf9i0mTxjNmzDOMGzeR559/jhkz3gSgSZOmjB8/CZvNRrNmzbnttv/w8MMjsFqtlJWV0b//QE499VSys4sr19mt29EMGHAxd989DLvdjsvl5NZbh3HEER2Ji4vjscceZPXqlSjVeb9x9e8/kHvu+Q8PPvgYAIMH38jTT49m7tyPKSsr5cYb/73H+1u2bEV+fh5er5cff/wepTqTlPT3DL4XXjiA66+/iltuuY2HHhrJU0+Nwmq14vV6OemkUzjmmN7/iCE6OoYHHniUMWNG0rNnL4YOvYNp06YyZMj12O0OoqKiuP/+Ryor5Oeff55HHnkMp9NJeXk5Xbt245ZbbgP2PWZw110jmDhxHH6/H8MwGD78Pux2O7fffjfDh9+B3+/nwgsHkJ7ehD/+2MJHH83i7rtHMGnSszRtmsFDD40AoGfPXoEupj+4+ebBOBx2hg27s7L7btu2PznvvAtr+pVg+vQ3GT78fgDOP78fQ4bcQKdOXSrLc/36X+ndu0+N11cTlr2bifVVdnZxZaCjRxeQnm5w660HkRn9HmJ2vo9hi8XV9NKIuCB9enriHhVAQyXlUH/KYMaMN2ndui2nnnp6SLZfX8oh2IYPv4PRo5/a79FEVcshPT2xRpVdWHYTZWV5a
NKkZi0DizuXxLU3EZv5Os4W1+LKuCwiEoEQ9dEVV1zFN9/8t7IbR9S+JUsWc9ppZ9TqYaUQpt1E2dle0tMPfKlLi6eQ5FUXUdb+ftxN+tdBZEI0bNHRMYwc+WSow4hoJ5xwUlDWG5Ytg+xsL2lp+08GFtcuEn4bjmGLpaDPfyURCCHEAYRlMsjJ8dKo0b6TQdSuT0lZdQnu9PPAGgW2+nGNZCGEqM/CrpvIyYo0DQAACGVJREFUMAzy832kpOyZx6zOTCy+cvwxrSnoMx/DHoR5i4QQIkKFXcvA5TIPKoqJCYRuGMRkvknSz1di8eTjTT5GEoEQQhykoLUMlFJWYArQHXABN2utN1d5/RZgCOAFxmitP6/Jep1Og7i4QCLwu7C6c7CVbaagz0KwxdfypxBCiIYhmC2DgUCM1vr4/7d390FW1XUcx9+AyJOIQ9ADaY8T3yx8otHF7UE0UGiCHpkpnRpskBwsKhgt0oCclJgEixonH4acwlEpNXAyd3JyVSBIwwWs/DhWhlNJhWbgBCLQH9/fymm79+xdd++5D/t9zezAPfee8/vud3fP75zfOef7A74CrOh8w8xeC8wH3g2cBywzsyElt9LFvn2HGTH8MMN2XseoR2dxaMg4Xhh/VXQEIYTQC9XsDN4D3AsgaTOQfbzwDGCjpP2SngeeBCqaPebAgcMsn7WMgfv/zvOnro1nBkIIoQ9U8wLysUC2ktJBMztK0ksl3tsDjCJH51N0Y8fCxIm3AzC8T8NtTGPHjqx1CHUh8hA56BR5cD3NQzXPDP4NZKMZmDqCUu+NBP5VxVhCCCHkqGZnsBH4AICZTQJ2ZN77NfBeMxtqZqOAE4HHqhhLCCGEHFUrVJe5m+hkYABwId45PClpfbqbaC7eIV0t6Y6qBBJCCKFbDVO1NIQQQvU03ENnIYQQ+l50BiGEEKIzCCGEUOeF6qpV0qKRVJCDLwGfSC/vkfT14qOsvu7ykPnMz4B1kr5ffJTVV8Hvw3RgCX7Txm+ASyQ11YXBCnKwEDgfOITfnHJXTQItiJm1AMslTe6yfAawGN8/rpZ0Y9526v3MoColLRpMXg7eAlwAtAKTgHPNrKInuRtQ2TxkfAPo21nC60/e78NI4FvAByW1AE8BY2oRZJXl5eA44AvAmcC5wLdrEmFBzOwy4CZgaJflg4Fr8RycBcw1s9fkbaveO4OqlLRoMHk5eBqYJulgOvobDOwrPsRC5OUBM/s4fiR4b/GhFSovD6348zwrzOwhYJekfxQfYtXl5eAF4M/AiPTV7PNv/gH4aInlJ+K38T8n6UVgA/C+vA3Ve2dQsqRFmfe6LWnRoMrmQNIBSf80swFmdg3wqKQnahJl9ZXNg5lNwIcFFtcisILl/U2MAc4GvgxMB75oZuMLjq8IeTkAP0j6HbAVWFVkYEVLz2cdKPFWj/eP9d4ZREmL/BxgZkOBW9Jn5hUcW5Hy8vBp4PXAL4HZwAIzm1ZseIXJy8Nu4GFJz0jaCzwInFp0gAXIy8F04HXAm4E3AB82szMKjq8e9Hj/WO+dQZS0yMmBmQ0A1gHbJH1W0sHahFiIsnmQdJmklnQB7WZgpaRmHS7K+5vYCkwwszHpSHkSfoTcbPJy8BzwH2C/pH34DvC4wiOsvd8DbzOz0WZ2ND5E9Ku8Fer6biLgLmCqmW0ilbQwswUcKWmxCngI79QuTz/8ZlM2B8Ag/OLQkHQXCcAiSbk/9AaV+7tQ29AK1d3fxCKgLX12raRmPEDqLgdTgM1mdggfK/9FDWMtlJmdDxwj6YaUkzZ8/7ha0l/y1o1yFCGEEOp+mCiEEEIBojMIIYQQnUEIIYToDEIIIRCdQQghBOr/1tLQT5jZm4An+P/74mdIerrMOksBJC3tRbuzgZXAzrRoGPAAMC/7cF+F27oSeCTd3ni/pLPT8g5JvXr4y8zageOBvWnRscAfgQsk7cpZby6wR9KtvWk/NL/oDEI9+Wtvd5qv0HpJswHMbBDQD
lwCfKcnG5GULYcxObO8r76nOZLa4eXKnT8BFuDlJ8ppxb+fEHJFZxDqXqo99F3gGODVwApJqzLvDwZWAxPSousk3ZiqNF4PnIAXLFsk6b68tiQdTA8zjU/bvhBYCBzGS0J/Di+bXKq9m/Ed78S07hZJLWbWWURwJ3CapF1mNhp/Yv6NwPuBK9Nn/gRcJGl3N2kZgdci2pLampXiHJa+5gBHAzOBc8zsb0BHT/MR+o+4ZhDqyTgz68h8XZqWz8HnqzgdL8R2VZf1WoHRkk4DpuBlzcGP7FdLehe+U7w+lXkuy8xehde32WhmJwGXA2dJOgmviLkkpz0AJM1P/7Zklr0E/BiYlRZ9DPgpXirhm8B5aXttwPIy4d1kZtvSjn0z/mTtteks4WK8dPUpaXuXph39emCxpLZXko/Qf8SZQagn5YaJFgLTUqmFk/EzhKzHADOzNuAejgybTAHensbywY+834ofIWfNNLMOvLTBQOBO4FZ8qOjuzFH6DcAP8J1tqfa68yO8vv73gE8CVwAteEG1+80MvMTIs2XWnyOp3cxagTvwyYxexIP5CDDDfCOTgVJ1qirNR+iHojMIjWAtXoDsbuA2jszsBoCk3Wb2TmAqXsBsa3o9CDhH0rMAZjYOKHWx9eVrBlnpiDtrAHBUTnu5JD2SCoedDhwvaZOZfQjYIGlmanMo/1ttstR2NqW6XD80s1PwiU0exjubB4Ht+HBWV5XmI/RDMUwUGsFUfKhjHV6Yr/NCL+n/M4E1+JSX8/E7bk7AS1rPS595B76THN6Ddtvxs4bR6fVF+BF8ufayutbY73QLPm5/W3q9BTgzM+/A1/DZyrqzEr9ucDF+feMQcDX+PU/Hd/zgUx52xtHbfIQmFp1BaARLgQ1mthWf4vQpvF59p5/jZYt/i5c2v1PSDuDzwCQz2w7cDnxK0p5KG5W0HVgGPGBmj+Pj+1fktJe1DtiWjvSz1uBzDKxJbTwDfAZYa2Y78IvPCyuIbT9+PWMJPttVB/A4XsZ6L35hGuA+4KtpJrhe5SM0t6haGkIIIc4MQgghRGcQQgiB6AxCCCEQnUEIIQSiMwghhEB0BiGEEIjOIIQQAvBfYR8k9VS9nZ8AAAAASUVORK5CYII=\n", 322 | "text/plain": [ 323 | "
" 324 | ] 325 | }, 326 | "metadata": { 327 | "needs_background": "light" 328 | }, 329 | "output_type": "display_data" 330 | }, 331 | { 332 | "data": { 333 | "image/png": "\n", 334 | "text/plain": [ 335 | "
" 336 | ] 337 | }, 338 | "metadata": { 339 | "needs_background": "light" 340 | }, 341 | "output_type": "display_data" 342 | }, 343 | { 344 | "data": { 345 | "image/png": "\n", 346 | "text/plain": [ 347 | "
" 348 | ] 349 | }, 350 | "metadata": { 351 | "needs_background": "light" 352 | }, 353 | "output_type": "display_data" 354 | }, 355 | { 356 | "data": { 357 | "image/png": "\n", 358 | "text/plain": [ 359 | "
" 360 | ] 361 | }, 362 | "metadata": { 363 | "needs_background": "light" 364 | }, 365 | "output_type": "display_data" 366 | } 367 | ], 368 | "source": [ 369 | "ctr = y_true.mean()\n", 370 | "y_pred = get_y_pred(y_score, threshold=ctr)\n", 371 | "\n", 372 | "norm_entropy = get_norm_entropy(y_true, y_score)\n", 373 | "calibration = y_score.mean() / ctr\n", 374 | "accuracy, precision, recall, f1 = accuracy_score(y_true, y_pred), precision_score(y_true, y_pred), \\\n", 375 | " recall_score(y_true, y_pred), f1_score(y_true, y_pred)\n", 376 | "\n", 377 | "confusion_matrix = plot_confusion_matrix(y_true, y_pred)\n", 378 | "auroc = plot_roc_curve(y_true, y_score)\n", 379 | "auprc = plot_pr_curve(y_true, y_score)\n", 380 | "_ = plot_lift_curve(y_true, y_score)\n", 381 | "_ = plot_class_density(y_true, y_score, threshold=ctr)" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": 19, 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [ 390 | "dump_pickle(os.path.join(MODEL_DIR, '_'.join([model_type, 'metric', train_dataset_type]) + '.pkl'), \n", 391 | " (norm_entropy, calibration, accuracy, precision, recall, f1, confusion_matrix, auroc, auprc))" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": 20, 397 | "metadata": {}, 398 | "outputs": [], 399 | "source": [ 400 | "test_dataset_type = 'quiz'\n", 401 | "score_path = os.path.join(MODEL_DIR, '_'.join([model_type, 'score', test_dataset_type]) + '.pkl')" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": 21, 407 | "metadata": {}, 408 | "outputs": [], 409 | "source": [ 410 | "if USE_TFRECORD:\n", 411 | " test_dataset_path = os.path.join(DATA_DIR, '_'.join([model_type, 'dataset', test_dataset_type]) + '.tfrecord')\n", 412 | " test_dataset = extract_dataset(test_dataset_path, compression_type='GZIP', \n", 413 | " shuffle_buffer_size=shuffle_buffer_size, is_training=False)\n", 414 | " test_dataset = transform_dataset(test_dataset, num_feature_names, 
cat_feature_names, target_name=target_name) \n", 415 | " test_generator = load_dataset(test_dataset, batch_size=batch_size, is_training=False)\n", 416 | " \n", 417 | " y_score = model.predict_generator(test_generator).ravel()\n", 418 | " \n", 419 | "else:\n", 420 | " df_X_test = pd.read_pickle(os.path.join(DATA_DIR, '_'.join(['df', 'X', test_dataset_type]) + '.pkl'))\n", 421 | " test_model_input = {column: df_X_test[column].values for column in df_X_test.columns}\n", 422 | " \n", 423 | " y_score = model.predict(test_model_input).ravel()\n", 424 | " \n", 425 | "dump_pickle(score_path, y_score)" 426 | ] 427 | } 428 | ], 429 | "metadata": { 430 | "kernelspec": { 431 | "display_name": "Python 3", 432 | "language": "python", 433 | "name": "python3" 434 | }, 435 | "language_info": { 436 | "codemirror_mode": { 437 | "name": "ipython", 438 | "version": 3 439 | }, 440 | "file_extension": ".py", 441 | "mimetype": "text/x-python", 442 | "name": "python", 443 | "nbconvert_exporter": "python", 444 | "pygments_lexer": "ipython3", 445 | "version": "3.7.5" 446 | } 447 | }, 448 | "nbformat": 4, 449 | "nbformat_minor": 2 450 | } 451 | --------------------------------------------------------------------------------