├── README.md
├── TF_LSTM_LOB.ipynb
├── data
│   └── README.md
├── ml_analysis.py
└── slides
    └── webinar.pdf

/README.md:
--------------------------------------------------------------------------------
# tensorflow

## Overview
This repository contains the IPython notebooks used in our webinar demos, along with all the components needed to run them.

### System requirements
1. Python >= 2.7 with ``pip``
2. ``virtualenv``; install it with ``pip install virtualenv`` if necessary

### Installation
1. Check that your system satisfies the requirements above
2. Clone the repository to your local machine
3. Download the virtual environment folder from this [link](https://drive.google.com/open?id=0Bxe4BGRtR8QYVVJGY1FqZTFmVlE) and place it in the repository folder

### Data
1. Download the data from this [link](https://drive.google.com/open?id=0Bxe4BGRtR8QYMzd4YTlENmpROXM) and place it in the repository's `data` folder

### Use
1. Open a terminal and navigate to the repository
2. Activate the `dnn_env` virtual environment by executing ``source dnn_env/bin/activate``
3. `(dnn_env)` will be displayed before the terminal prompt
4. Start the notebook server and open the notebook
5. After shutting down the notebook server, deactivate the virtual environment by executing ``deactivate``

### Misc
1. **virtualenv**: [reference link](http://python-guide-pt-br.readthedocs.io/en/latest/dev/virtualenvs/)

--------------------------------------------------------------------------------
/data/README.md:
--------------------------------------------------------------------------------
# README #

This is where you should place the data locally.
--------------------------------------------------------------------------------
/ml_analysis.py:
--------------------------------------------------------------------------------
from sklearn import ensemble
from sklearn import naive_bayes
from sklearn import svm
from sklearn import linear_model
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
import numpy as np
import xgboost as xgb
import pandas as pd
import matplotlib.pyplot as plt
import itertools


class MLOperator(object):
    def __init__(self):
        pass

    def set_data_schema(self, data_schema):
        self.data_schema = data_schema

    def get_classifier(self, classifier, **kwargs):
        """Instantiate a classifier by its short name, forwarding kwargs."""
        clf_name = classifier.lower()
        if clf_name == 'rf':
            return ensemble.RandomForestClassifier(**kwargs)
        elif clf_name == 'gb':
            return ensemble.GradientBoostingClassifier(**kwargs)
        elif clf_name == 'linearsvc':
            return svm.LinearSVC(**kwargs)
        elif clf_name == 'svc':
            return svm.SVC(**kwargs)
        elif clf_name == 'gaussiannb':
            return naive_bayes.GaussianNB(**kwargs)
        elif clf_name == 'xgb':
            return xgb.XGBClassifier(**kwargs)
        elif clf_name == 'logit':
            return linear_model.LogisticRegression(**kwargs)
        else:
            raise ValueError('unknown classifier: {}'.format(classifier))

    def get_samples_index(self, y, sampling):
        """Return a resampled index of y according to the sampling spec.

        sampling can be:
          - dict: an explicit {label: sample_size} mapping
          - 'min': sample every class down to the smallest class size
          - a class label as a string: sample every class to that label's count
          - 'multi:<k>': sample every class to k times the smallest class size
          - int: sample every class to that fixed size
        Classes smaller than the requested size are drawn with replacement.
        """
        if isinstance(sampling, dict):
            sample_sizes = sampling
        elif isinstance(sampling, str) and sampling == 'min':
            sample_sizes = dict([(label, y.value_counts().min())
                                 for label in y.unique()])
        elif isinstance(sampling, str) and \
                sampling in list(y.value_counts().index.astype(str)):
            # a class label passed as a string; assumes integer class labels
            sample_sizes = dict([(label, y.value_counts()[int(sampling)])
                                 for label in y.unique()])
        elif isinstance(sampling, str) and 'multi:' in sampling:
            sample_sizes = dict([(label,
                                  int(sampling.replace('multi:', '')) *
                                  y.value_counts().min())
                                 for label in y.unique()])
        elif isinstance(sampling, int):
            sample_sizes = dict([(label, sampling) for label in y.unique()])
        else:
            print('unknown sampling method or no sampling')
            return y.index

        select_index_all = []
        for label in y.unique():
            label_index = y[y == label].index
            sample_size = sample_sizes[label]

            # draw with replacement only when the class is smaller than
            # the requested sample size
            replace = sample_size > len(label_index)
            select_index = np.random.choice(label_index, sample_size,
                                            replace=replace)
            select_index_all = select_index_all + list(select_index)

        return select_index_all
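
    # Usage sketch for get_samples_index (toy labels for illustration only;
    # the real labels come from the webinar notebook):
    #
    #   y = pd.Series([0, 0, 0, 0, 1, 1, 2])
    #   op = MLOperator()
    #   op.get_samples_index(y, 'min')       # one sample per class
    #   op.get_samples_index(y, 'multi:2')   # two samples per class
    #   op.get_samples_index(y, {0: 3, 1: 3, 2: 3})
    #                                        # three per class; classes with
    #                                        # fewer than 3 rows are drawn
    #                                        # with replacement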

    def train(self, x_train, y_train, classifier='rf', **kwargs):
        # resampling via get_samples_index is currently disabled:
        # if sampling is not None:
        #     idx = self.get_samples_index(y_train, sampling)
        # else:
        #     idx = x_train.index

        y_sampled = y_train
        X_sampled = x_train

        clf = self.get_classifier(classifier=classifier, **kwargs)
        print(clf)
        print('fitting.....')
        clf.fit(X_sampled, y_sampled)

        return clf

    def test(self, clf, x_test, y_test):
        print('predicting.....')
        y_pred = pd.Series(clf.predict(x_test), index=x_test.index)
        y_pred.name = 'predict'

        # one probability column per class, keyed by the integer class label
        y_prob = pd.DataFrame(clf.predict_proba(x_test), index=x_test.index)
        y_prob.columns = [int(c) for c in clf.classes_]
        pred_df = pd.concat([y_test, y_pred, y_prob], axis=1)

        return pred_df

    def get_pred_dfs(self, x_train, x_test, y_train, y_test,
                     classifier='rf', **kwargs):
        if classifier == 'tensorflow':
            # a pre-trained TensorFlow model is passed in via kwargs['model'];
            # here x/y are arrays with one-hot encoded labels and
            # kwargs['columns'] names the label columns
            clf = kwargs['model']
            prediction = clf.predict(x_train)
            pred_df_train = pd.DataFrame(prediction, columns=kwargs['columns'])
            pred_df_train['predict'] = pred_df_train.idxmax(axis=1)
            pred_df_train['true'] = pd.DataFrame(
                y_train, columns=kwargs['columns']).idxmax(axis=1)

            prediction = clf.predict(x_test)
            pred_df_test = pd.DataFrame(prediction, columns=kwargs['columns'])
            pred_df_test['predict'] = pred_df_test.idxmax(axis=1)
            pred_df_test['true'] = pd.DataFrame(
                y_test, columns=kwargs['columns']).idxmax(axis=1)
        else:
            clf = self.train(x_train, y_train, classifier=classifier, **kwargs)
            pred_df_train = self.test(clf, x_train, y_train)
            pred_df_test = self.test(clf, x_test, y_test)

        return pred_df_train, pred_df_test, clf

    def get_feature_importances(self, clf_type, X_train=None, y_train=None,
                                clf=None, X_labels=None):
        if clf_type != 'pre-trained':
            clf = self.get_classifier(clf_type)
            clf.fit(X_train, y_train)
            X_labels = X_train.columns

        # assumes an ensemble whose members expose feature_importances_
        # (e.g. a random forest); std is the spread across the estimators
        importances = clf.feature_importances_
        std = np.std([estimator_.feature_importances_
                      for estimator_ in clf.estimators_], axis=0)

        return {'importances': importances, 'std': std, 'X_labels': X_labels}
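
# Usage sketch for the pipeline above (hypothetical frames; in the webinar
# the features and labels come from the limit order book data set):
#
#   op = MLOperator()
#   pred_train, pred_test, clf = op.get_pred_dfs(
#       x_train, x_test, y_train, y_test,
#       classifier='rf', n_estimators=100)
#
# y_train/y_test should be pandas Series named 'true' so that the
# MLEvaluator methods below can locate the 'true' column of the pred_dfs.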

class MLEvaluator(object):
    def __init__(self):
        pass

    def set_pred_df(self, pred_df):
        self.pred_df = pred_df

    def generate_confusion_matrix(self):
        # columns 0 and 1 of pred_df are 'true' and 'predict'; the
        # remaining columns are the class labels
        idx = list('actual_' + self.pred_df.columns[2:].astype(str))
        cols = list('predict_' + self.pred_df.columns[2:].astype(str))

        conf_mx = pd.DataFrame(confusion_matrix(self.pred_df.true,
                                                self.pred_df.predict),
                               index=idx, columns=cols)

        return conf_mx

    def plot_confusion_matrix(self, cm, classes,
                              normalize=False,
                              title='Confusion matrix',
                              cmap=plt.cm.Blues):
        """
        This function prints and plots the confusion matrix.
        Normalization can be applied by setting `normalize=True`.
        `cm` is expected to be a numpy array (e.g. conf_mx.values).
        """
        fig = plt.figure(figsize=(10, 8))
        plt.imshow(cm, interpolation='nearest', cmap=cmap)
        plt.title(title)
        plt.colorbar()
        tick_marks = np.arange(len(classes))
        plt.xticks(tick_marks, classes, rotation=45)
        plt.yticks(tick_marks, classes)

        if normalize:
            cm_text = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
            print("Normalized confusion matrix")
        else:
            cm_text = cm
            print('Confusion matrix, without normalization')

        print(cm_text)

        thresh = cm.max() / 2.
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(j, i, round(cm_text[i, j], 4),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

        plt.tight_layout(pad=4)
        plt.ylabel('True label')
        plt.xlabel('Predicted label')

        return fig

    def plot_feature_importance(self, importances_dict,
                                num_features_imp_plt=50, **kwargs):
        # 'date' and 'feature_type' are attached by the caller
        # (get_feature_importances does not set them), so fall back to
        # empty strings when they are absent
        date = importances_dict.get('date', '')
        print('start getting features importance plot for: {}'.format(date))
        importances = importances_dict['importances']
        std = importances_dict['std']
        X_labels = importances_dict['X_labels']

        indices = np.argsort(importances)[::-1]
        fig, ax = plt.subplots(figsize=(25, 14))

        plt.title("Feature importances (top {0}) {1} {2} features".format(
            num_features_imp_plt, date, kwargs.get('feature_type', '')))

        # horizontal bars, so the importance spread goes on the x axis (xerr)
        plt.barh(range(num_features_imp_plt),
                 importances[indices][:num_features_imp_plt],
                 color="b", xerr=std[indices][:num_features_imp_plt],
                 align="center")

        plt.yticks(range(num_features_imp_plt),
                   [X_labels[i] for i in indices[:num_features_imp_plt]])

        plt.ylim([-1, num_features_imp_plt])

        return fig

    def plot_feature_importance_xgb(self, clf):
        fig, ax = plt.subplots(figsize=(25, 14))

        ax = xgb.plot_importance(clf, ax=ax,
                                 title='Feature importance',
                                 xlabel='Importance', ylabel='Features',
                                 importance_type='gain')
        return fig

    def plot_roc(self):
        # one-vs-rest ROC curve per class; the probability columns of
        # pred_df are keyed by the integer class label (see MLOperator.test)
        fig = plt.figure(figsize=(12, 9))
        for lab in self.pred_df.true.unique():
            fpr, tpr, thresholds = roc_curve(self.pred_df.true,
                                             self.pred_df[lab], pos_label=lab)
            roc_auc = auc(fpr, tpr)
            plt.plot(fpr, tpr, lw=2,
                     label='Label {0} (area = {1:0.2f})'.format(lab, roc_auc))

        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate', fontsize=20)
        plt.ylabel('True Positive Rate', fontsize=20)
        plt.legend(loc="lower right", fontsize=20)
        plt.grid()

        return fig

    def plot_learning_curve(self, df, label, variable_name):
        # df is expected to hold one score per (train/test, variable) pair,
        # indexed by 'train'/'test'
        train_f1 = df[df.index == 'train']
        test_f1 = df[df.index == 'test']

        variable = sorted(list(train_f1[variable_name].unique()))
        train_m = train_f1.groupby(variable_name)[label].mean()
        train_std = train_f1.groupby(variable_name)[label].std()

        test_m = test_f1.groupby(variable_name)[label].mean()
        test_std = test_f1.groupby(variable_name)[label].std()

        fig = plt.figure(figsize=(12, 9))

        plt.fill_between(variable, train_m - train_std, train_m + train_std,
                         alpha=0.1, color="r")

        plt.fill_between(variable, test_m - test_std, test_m + test_std,
                         alpha=0.1, color="g")

        train_line = plt.plot(variable, train_m, 'o-', color="r",
                              label="Training score", markersize=10)

        test_line = plt.plot(variable, test_m, 'o-', color="g",
                             label="Testing score", markersize=10)

        # plt.ylim(min(test_f1[label].min(), train_f1[label].min())*0.95, 1.05)
        plt.ylim(0, 1)
        plt.xlim(train_f1[variable_name].min() * 0.8,
                 train_f1[variable_name].max() * 1.1)
        plt.xticks(train_f1[variable_name].unique(),
                   train_f1[variable_name].unique())
        plt.tick_params(axis='both', labelsize=15)
        plt.title('learning curve ({} label)'.format(label), fontsize=20)
        plt.xlabel(variable_name, fontsize=20)
        plt.ylabel('f1 score', fontsize=20)
        plt.legend(loc="best", fontsize=20)
        plt.grid()

        return fig
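
if __name__ == '__main__':
    # Minimal end-to-end sketch on synthetic data -- an illustration only;
    # in the webinar these classes are driven from the notebook with the
    # real limit-order-book features. make_classification and the split
    # sizes below are stand-ins, not part of the original pipeline.
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=300, n_features=10,
                               n_informative=5, n_classes=3, random_state=0)
    X = pd.DataFrame(X)
    y = pd.Series(y, name='true')  # 'true' is the column MLEvaluator expects

    x_train, x_test = X.iloc[:200], X.iloc[200:]
    y_train, y_test = y.iloc[:200], y.iloc[200:]

    operator = MLOperator()
    pred_df_train, pred_df_test, clf = operator.get_pred_dfs(
        x_train, x_test, y_train, y_test, classifier='rf', n_estimators=50)

    evaluator = MLEvaluator()
    evaluator.set_pred_df(pred_df_test)
    print(evaluator.generate_confusion_matrix())
    evaluator.plot_roc()
    plt.show()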
--------------------------------------------------------------------------------
/slides/webinar.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quiota/tensorflow/0ef362d5c156f6f4350ed377ab745c5333d482a9/slides/webinar.pdf
--------------------------------------------------------------------------------