├── README.md
├── TF_LSTM_LOB.ipynb
├── data
│   └── README.md
├── ml_analysis.py
└── slides
    └── webinar.pdf

/README.md:
--------------------------------------------------------------------------------
# tensorflow

## Overview
This repository contains the IPython notebooks used in our webinar demos, along with all the components needed to run them.

### System requirements
1. Python >= 2.7 with ``pip``
2. ``virtualenv``; install it with ``pip install virtualenv`` if necessary

### Installation
1. Check that your system satisfies the requirements above
2. Clone the repository to your local machine
3. Download the virtual environment folder from this [link](https://drive.google.com/open?id=0Bxe4BGRtR8QYVVJGY1FqZTFmVlE) and place it in the repository folder

### Data
1. Download the data from this [link](https://drive.google.com/open?id=0Bxe4BGRtR8QYMzd4YTlENmpROXM) and place it in the repository's `data` folder

### Use
1. Open a terminal and navigate to the repository
2. Activate the `dnn_env` virtual environment by executing ``source dnn_env/bin/activate``
3. `(dnn_env)` will be displayed before the terminal prompt
4. Start the notebook server and open the notebook
5. After shutting down the notebook server, deactivate the virtual environment by executing ``deactivate``

### Misc
1. **virtualenv**: [reference link](http://python-guide-pt-br.readthedocs.io/en/latest/dev/virtualenvs/)

--------------------------------------------------------------------------------
/data/README.md:
--------------------------------------------------------------------------------
# README #

This is where you should place the data locally.
--------------------------------------------------------------------------------
/ml_analysis.py:
--------------------------------------------------------------------------------
from sklearn import ensemble
from sklearn import naive_bayes
from sklearn import svm
from sklearn import linear_model
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
import numpy as np
import xgboost as xgb
import pandas as pd
import matplotlib.pyplot as plt
import itertools


class MLOperator(object):
    def __init__(self):
        pass

    def set_data_schema(self, data_schema):
        self.data_schema = data_schema

    def get_classifier(self, classifier, **kwargs):
        """Instantiate a classifier by its short name, forwarding kwargs."""
        clf_name = classifier.lower()
        if clf_name == 'rf':
            return ensemble.RandomForestClassifier(**kwargs)
        elif clf_name == 'gb':
            return ensemble.GradientBoostingClassifier(**kwargs)
        elif clf_name == 'linearsvc':
            return svm.LinearSVC(**kwargs)
        elif clf_name == 'svc':
            return svm.SVC(**kwargs)
        elif clf_name == 'gaussiannb':
            return naive_bayes.GaussianNB(**kwargs)
        elif clf_name == 'xgb':
            return xgb.XGBClassifier(**kwargs)
        elif clf_name == 'logit':
            return linear_model.LogisticRegression(**kwargs)
        else:
            raise ValueError('unknown classifier: {}'.format(classifier))

    def get_samples_index(self, y, sampling):
        """Return a resampled index of y according to the sampling spec.

        sampling can be:
          - dict: an explicit {label: sample_size} mapping
          - 'min': sample every class down to the smallest class size
          - a class label as a string: sample every class to that label's count
          - 'multi:<k>': sample every class to k times the smallest class size
          - int: sample every class to that fixed size
        Classes smaller than the requested size are drawn with replacement.
        """
        if isinstance(sampling, dict):
            sample_sizes = sampling
        elif isinstance(sampling, str) and sampling == 'min':
            sample_sizes = dict([(label, y.value_counts().min())
                                 for label in y.unique()])
        elif isinstance(sampling, str) and \
                sampling in list(y.value_counts().index.astype(str)):
            # a class label passed as a string; assumes integer class labels
            sample_sizes = dict([(label, y.value_counts()[int(sampling)])
                                 for label in y.unique()])
        elif isinstance(sampling, str) and 'multi:' in sampling:
            sample_sizes = dict([(label,
                                  int(sampling.replace('multi:', '')) *
                                  y.value_counts().min())
                                 for label in y.unique()])
        elif isinstance(sampling, int):
            sample_sizes = dict([(label, sampling) for label in y.unique()])
        else:
            print('unknown sampling method or no sampling')
            return y.index

        select_index_all = []
        for label in y.unique():
            label_index = y[y == label].index
            sample_size = sample_sizes[label]

            # draw with replacement only when the class is smaller than
            # the requested sample size
            replace = sample_size > len(label_index)
            select_index = np.random.choice(label_index, sample_size,
                                            replace=replace)
            select_index_all = select_index_all + list(select_index)

        return select_index_all
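
    # Usage sketch for get_samples_index (toy labels for illustration only;
    # the real labels come from the webinar notebook):
    #
    #   y = pd.Series([0, 0, 0, 0, 1, 1, 2])
    #   op = MLOperator()
    #   op.get_samples_index(y, 'min')       # one sample per class
    #   op.get_samples_index(y, 'multi:2')   # two samples per class
    #   op.get_samples_index(y, {0: 3, 1: 3, 2: 3})
    #                                        # three per class; classes with
    #                                        # fewer than 3 rows are drawn
    #                                        # with replacement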

    def train(self, x_train, y_train, classifier='rf', **kwargs):
        # resampling via get_samples_index is currently disabled:
        # if sampling is not None:
        #     idx = self.get_samples_index(y_train, sampling)
        # else:
        #     idx = x_train.index

        y_sampled = y_train
        X_sampled = x_train

        clf = self.get_classifier(classifier=classifier, **kwargs)
        print(clf)
        print('fitting.....')
        clf.fit(X_sampled, y_sampled)

        return clf

    def test(self, clf, x_test, y_test):
        print('predicting.....')
        y_pred = pd.Series(clf.predict(x_test), index=x_test.index)
        y_pred.name = 'predict'

        # one probability column per class, keyed by the integer class label
        y_prob = pd.DataFrame(clf.predict_proba(x_test), index=x_test.index)
        y_prob.columns = [int(c) for c in clf.classes_]
        pred_df = pd.concat([y_test, y_pred, y_prob], axis=1)

        return pred_df

    def get_pred_dfs(self, x_train, x_test, y_train, y_test,
                     classifier='rf', **kwargs):
        if classifier == 'tensorflow':
            # a pre-trained TensorFlow model is passed in via kwargs['model'];
            # here x/y are arrays with one-hot encoded labels and
            # kwargs['columns'] names the label columns
            clf = kwargs['model']
            prediction = clf.predict(x_train)
            pred_df_train = pd.DataFrame(prediction, columns=kwargs['columns'])
            pred_df_train['predict'] = pred_df_train.idxmax(axis=1)
            pred_df_train['true'] = pd.DataFrame(
                y_train, columns=kwargs['columns']).idxmax(axis=1)

            prediction = clf.predict(x_test)
            pred_df_test = pd.DataFrame(prediction, columns=kwargs['columns'])
            pred_df_test['predict'] = pred_df_test.idxmax(axis=1)
            pred_df_test['true'] = pd.DataFrame(
                y_test, columns=kwargs['columns']).idxmax(axis=1)
        else:
            clf = self.train(x_train, y_train, classifier=classifier, **kwargs)
            pred_df_train = self.test(clf, x_train, y_train)
            pred_df_test = self.test(clf, x_test, y_test)

        return pred_df_train, pred_df_test, clf

    def get_feature_importances(self, clf_type, X_train=None, y_train=None,
                                clf=None, X_labels=None):
        if clf_type != 'pre-trained':
            clf = self.get_classifier(clf_type)
            clf.fit(X_train, y_train)
            X_labels = X_train.columns

        # assumes an ensemble whose members expose feature_importances_
        # (e.g. a random forest); std is the spread across the estimators
        importances = clf.feature_importances_
        std = np.std([estimator_.feature_importances_
                      for estimator_ in clf.estimators_], axis=0)

        return {'importances': importances, 'std': std, 'X_labels': X_labels}
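
# Usage sketch for the pipeline above (hypothetical frames; in the webinar
# the features and labels come from the limit order book data set):
#
#   op = MLOperator()
#   pred_train, pred_test, clf = op.get_pred_dfs(
#       x_train, x_test, y_train, y_test,
#       classifier='rf', n_estimators=100)
#
# y_train/y_test should be pandas Series named 'true' so that the
# MLEvaluator methods below can locate the 'true' column of the pred_dfs.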

class MLEvaluator(object):
    def __init__(self):
        pass

    def set_pred_df(self, pred_df):
        self.pred_df = pred_df

    def generate_confusion_matrix(self):
        # columns 0 and 1 of pred_df are 'true' and 'predict'; the
        # remaining columns are the class labels
        idx = list('actual_' + self.pred_df.columns[2:].astype(str))
        cols = list('predict_' + self.pred_df.columns[2:].astype(str))

        conf_mx = pd.DataFrame(confusion_matrix(self.pred_df.true,
                                                self.pred_df.predict),
                               index=idx, columns=cols)

        return conf_mx

    def plot_confusion_matrix(self, cm, classes,
                              normalize=False,
                              title='Confusion matrix',
                              cmap=plt.cm.Blues):
        """
        This function prints and plots the confusion matrix.
        Normalization can be applied by setting `normalize=True`.
        `cm` is expected to be a numpy array (e.g. conf_mx.values).
        """
        fig = plt.figure(figsize=(10, 8))
        plt.imshow(cm, interpolation='nearest', cmap=cmap)
        plt.title(title)
        plt.colorbar()
        tick_marks = np.arange(len(classes))
        plt.xticks(tick_marks, classes, rotation=45)
        plt.yticks(tick_marks, classes)

        if normalize:
            cm_text = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
            print("Normalized confusion matrix")
        else:
            cm_text = cm
            print('Confusion matrix, without normalization')

        print(cm_text)

        thresh = cm.max() / 2.
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(j, i, round(cm_text[i, j], 4),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

        plt.tight_layout(pad=4)
        plt.ylabel('True label')
        plt.xlabel('Predicted label')

        return fig

    def plot_feature_importance(self, importances_dict,
                                num_features_imp_plt=50, **kwargs):
        # 'date' and 'feature_type' are attached by the caller
        # (get_feature_importances does not set them), so fall back to
        # empty strings when they are absent
        date = importances_dict.get('date', '')
        print('start getting features importance plot for: {}'.format(date))
        importances = importances_dict['importances']
        std = importances_dict['std']
        X_labels = importances_dict['X_labels']

        indices = np.argsort(importances)[::-1]
        fig, ax = plt.subplots(figsize=(25, 14))

        plt.title("Feature importances (top {0}) {1} {2} features".format(
            num_features_imp_plt, date, kwargs.get('feature_type', '')))

        # horizontal bars, so the importance spread goes on the x axis (xerr)
        plt.barh(range(num_features_imp_plt),
                 importances[indices][:num_features_imp_plt],
                 color="b", xerr=std[indices][:num_features_imp_plt],
                 align="center")

        plt.yticks(range(num_features_imp_plt),
                   [X_labels[i] for i in indices[:num_features_imp_plt]])

        plt.ylim([-1, num_features_imp_plt])

        return fig

    def plot_feature_importance_xgb(self, clf):
        fig, ax = plt.subplots(figsize=(25, 14))

        ax = xgb.plot_importance(clf, ax=ax,
                                 title='Feature importance',
                                 xlabel='Importance', ylabel='Features',
                                 importance_type='gain')
        return fig

    def plot_roc(self):
        # one-vs-rest ROC curve per class; the probability columns of
        # pred_df are keyed by the integer class label (see MLOperator.test)
        fig = plt.figure(figsize=(12, 9))
        for lab in self.pred_df.true.unique():
            fpr, tpr, thresholds = roc_curve(self.pred_df.true,
                                             self.pred_df[lab], pos_label=lab)
            roc_auc = auc(fpr, tpr)
            plt.plot(fpr, tpr, lw=2,
                     label='Label {0} (area = {1:0.2f})'.format(lab, roc_auc))

        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate', fontsize=20)
        plt.ylabel('True Positive Rate', fontsize=20)
        plt.legend(loc="lower right", fontsize=20)
        plt.grid()

        return fig

    def plot_learning_curve(self, df, label, variable_name):
        # df is expected to hold one score per (train/test, variable) pair,
        # indexed by 'train'/'test'
        train_f1 = df[df.index == 'train']
        test_f1 = df[df.index == 'test']

        variable = sorted(list(train_f1[variable_name].unique()))
        train_m = train_f1.groupby(variable_name)[label].mean()
        train_std = train_f1.groupby(variable_name)[label].std()

        test_m = test_f1.groupby(variable_name)[label].mean()
        test_std = test_f1.groupby(variable_name)[label].std()

        fig = plt.figure(figsize=(12, 9))

        plt.fill_between(variable, train_m - train_std, train_m + train_std,
                         alpha=0.1, color="r")

        plt.fill_between(variable, test_m - test_std, test_m + test_std,
                         alpha=0.1, color="g")

        train_line = plt.plot(variable, train_m, 'o-', color="r",
                              label="Training score", markersize=10)

        test_line = plt.plot(variable, test_m, 'o-', color="g",
                             label="Testing score", markersize=10)

        # plt.ylim(min(test_f1[label].min(), train_f1[label].min())*0.95, 1.05)
        plt.ylim(0, 1)
        plt.xlim(train_f1[variable_name].min() * 0.8,
                 train_f1[variable_name].max() * 1.1)
        plt.xticks(train_f1[variable_name].unique(),
                   train_f1[variable_name].unique())
        plt.tick_params(axis='both', labelsize=15)
        plt.title('learning curve ({} label)'.format(label), fontsize=20)
        plt.xlabel(variable_name, fontsize=20)
        plt.ylabel('f1 score', fontsize=20)
        plt.legend(loc="best", fontsize=20)
        plt.grid()

        return fig
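
if __name__ == '__main__':
    # Minimal end-to-end sketch on synthetic data -- an illustration only;
    # in the webinar these classes are driven from the notebook with the
    # real limit-order-book features. make_classification and the split
    # sizes below are stand-ins, not part of the original pipeline.
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=300, n_features=10,
                               n_informative=5, n_classes=3, random_state=0)
    X = pd.DataFrame(X)
    y = pd.Series(y, name='true')  # 'true' is the column MLEvaluator expects

    x_train, x_test = X.iloc[:200], X.iloc[200:]
    y_train, y_test = y.iloc[:200], y.iloc[200:]

    operator = MLOperator()
    pred_df_train, pred_df_test, clf = operator.get_pred_dfs(
        x_train, x_test, y_train, y_test, classifier='rf', n_estimators=50)

    evaluator = MLEvaluator()
    evaluator.set_pred_df(pred_df_test)
    print(evaluator.generate_confusion_matrix())
    evaluator.plot_roc()
    plt.show()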
--------------------------------------------------------------------------------
/slides/webinar.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quiota/tensorflow/0ef362d5c156f6f4350ed377ab745c5333d482a9/slides/webinar.pdf
--------------------------------------------------------------------------------