├── Chapter01
│   ├── algos_demo.py
│   └── datasets_demo.py
├── Chapter02
│   ├── bias_variance.py
│   └── motivation.py
├── Chapter03
│   ├── custom_voting_implementation.py
│   ├── custom_voting_implementation_analysis.py
│   ├── scikit_hard_voting.py
│   ├── scikit_soft_voting.py
│   ├── scikit_soft_voting_2knn.py
│   └── scikit_soft_voting_analysis.py
├── Chapter04
│   ├── linear_nonlinear_example.py
│   ├── stacking_classification.py
│   ├── stacking_classification_analysis.py
│   ├── stacking_classifiers.py
│   ├── stacking_regression.py
│   └── stacking_regression_analysis.py
├── Chapter05
│   ├── bagging_custom.py
│   ├── bagging_custom_parallel.py
│   ├── bagging_sklearn_classification.py
│   ├── bagging_sklearn_regression.py
│   ├── bootstrapping.py
│   └── validation_curves.py
├── Chapter06
│   ├── adaboost_custom.py
│   ├── adaboost_sklearn_classification.py
│   ├── adaboost_sklearn_regression.py
│   ├── boosting_overfit.py
│   ├── dataset_segmentation.py
│   ├── gradient_boosting_custom.py
│   ├── gradient_boosting_sklearn_classification.py
│   ├── gradient_boosting_sklearn_regression.py
│   ├── xgb_classification.py
│   └── xgb_regression.py
├── Chapter07
│   ├── extra_tree_classification.py
│   ├── extra_tree_classification_validation_curves.py
│   ├── extra_tree_regression.py
│   ├── probability_to_choose.py
│   ├── rf_classification.py
│   ├── rf_classification_validation_curves.py
│   └── rf_regression.py
├── Chapter08
│   ├── agglomerative.py
│   ├── kmeans_cluster.py
│   ├── kmeans_intro.py
│   ├── kmeans_raw.py
│   ├── oe_co_occurence.py
│   ├── oe_graph_closure.py
│   ├── oe_vote.py
│   ├── oe_vote_tsne.py
│   └── voting_example.py
├── Chapter09
│   ├── adaboost.py
│   ├── bagging.py
│   ├── base.py
│   ├── dt_optimize.py
│   ├── exploratory.py
│   ├── logistic_regression.py
│   ├── random_forest.py
│   ├── stacking.py
│   ├── stacking_classifier.py
│   ├── unrelated_presentation_phd.py
│   ├── voting.py
│   └── xgboosting.py
├── Chapter10
│   ├── bagging.py
│   ├── boosting.py
│   ├── exploratory.py
│   ├── random_forest.py
│   ├── regression.py
│   ├── simulator.py
│   ├── simulator_plain.py
│   ├── stacking.py
│   ├── stacking_regressor.py
│   ├── voting.py
│   └── voting_regressor.py
├── Chapter11
│   ├── base_learners_twitter.py
│   ├── comparisons.py
│   ├── data_cleaning.py
│   ├── exploratory.py
│   └── stream_sentiment.py
├── Chapter12
│   ├── ensemble_fc_models.py
│   ├── exploratory.py
│   ├── single_dense_model.py
│   └── single_dot_model.py
├── Chapter13
│   ├── clustering.py
│   ├── ensemble_cluster.py
│   ├── ensemble_cluster_normalized.py
│   ├── ensemble_cluster_tsne.py
│   ├── exploratory.py
│   └── insights.py
├── LICENSE
└── README.md
/Chapter01/algos_demo.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Fri Feb 15 19:37:48 2019
4 |
5 | @author: George Kyriakides
6 | ge.kyriakides@gmail.com
7 | """
8 |
9 |
10 |
11 | import numpy as np
12 | import matplotlib.pyplot as plt
13 | from mpl_toolkits.mplot3d import Axes3D
14 | from sklearn.decomposition import KernelPCA
15 |
16 |
17 | # =============================================================================
18 | # OLS
19 | # =============================================================================
20 | # --- SECTION 1 ---
21 | # Libraries and data loading
22 | from sklearn.datasets import load_diabetes
23 | from sklearn.linear_model import LinearRegression
24 | from sklearn import metrics
25 | diabetes = load_diabetes()
26 |
27 |
28 | # --- SECTION 2 ---
29 | # Split the data into train and test set
30 | train_x, train_y = diabetes.data[:400], diabetes.target[:400]
31 | test_x, test_y = diabetes.data[400:], diabetes.target[400:]
32 |
33 | # --- SECTION 3 ---
34 | # Instantiate, train and evaluate the model
35 | ols = LinearRegression()
36 | ols.fit(train_x, train_y)
37 | err = metrics.mean_squared_error(test_y, ols.predict(test_x))
38 | r2 = metrics.r2_score(test_y, ols.predict(test_x))
39 |
40 | # --- SECTION 4 ---
41 | # Print the model
42 | print('---OLS on diabetes dataset.---')
43 | print('Coefficients:')
44 | print('Intercept (b): %.2f'%ols.intercept_)
45 | for i in range(len(diabetes.feature_names)):
46 | print(diabetes.feature_names[i]+': %.2f'%ols.coef_[i])
47 | print('-'*30)
48 | print('R-squared: %.2f'%r2, ' MSE: %.2f \n'%err)
49 |
50 |
51 | # =============================================================================
52 | # LOGIT
53 | # =============================================================================
54 | # --- SECTION 1 ---
55 | # Libraries and data loading
56 | from sklearn.linear_model import LogisticRegression
57 | from sklearn.datasets import load_breast_cancer
58 | from sklearn import metrics
59 | bc = load_breast_cancer()
60 |
61 | # --- SECTION 2 ---
62 | # Split the data into train and test set
63 | train_x, train_y = bc.data[:400], bc.target[:400]
64 | test_x, test_y = bc.data[400:], bc.target[400:]
65 |
66 | # --- SECTION 3 ---
67 | # Instantiate, train and evaluate the model
68 | logit = LogisticRegression()
69 | logit.fit(train_x, train_y)
70 | acc = metrics.accuracy_score(test_y, logit.predict(test_x))
71 |
72 | # --- SECTION 4 ---
73 | # Print the model
74 | print('---Logistic Regression on breast cancer dataset.---')
75 | print('Coefficients:')
76 | print('Intercept (b): %.2f'%logit.intercept_)
77 | for i in range(len(bc.feature_names)):
78 | print(bc.feature_names[i]+': %.2f'%logit.coef_[0][i])
79 | print('-'*30)
80 | print('Accuracy: %.2f \n'%acc)
81 | print(metrics.confusion_matrix(test_y, logit.predict(test_x)))
82 |
83 | # =============================================================================
84 | # SVM FIGURE
85 | # =============================================================================
86 | f = lambda x: 2 * x - 5
87 | f_upp = lambda x: 2 * x - 5 + 2
88 | f_lower = lambda x: 2 * x - 5 - 2
89 |
90 | pos = []
91 | neg = []
92 |
93 | np.random.seed(345234)
94 | for i in range(80):
95 | x = np.random.randint(15)
96 | y = np.random.randint(15)
97 |
98 | d = np.abs(2*x-y-5)/np.sqrt(2**2+1)
99 | if f(x) < y and d>=1:
100 | pos.append([x,y])
101 | elif f(x) > y and d>=1 :
102 | neg.append([x,y])
103 |
104 | pos.append([4, f_upp(4)])
105 | neg.append([8, f_lower(8)])
106 |
107 |
108 | plt.figure()
109 | plt.xticks([])
110 | plt.yticks([])
111 | plt.scatter(*zip(*pos))
112 | plt.scatter(*zip(*neg))
113 |
114 | plt.plot([0,10],[f(0),f(10)], linestyle='--', color='m')
115 | plt.plot([0,10],[f_upp(0),f_upp(10)], linestyle='--', color='red')
116 | plt.plot([0,10],[f_lower(0),f_lower(10)], linestyle='--', color='red')
117 | plt.plot([4,3],[f_lower(4),f_upp(3)], linestyle='-', color='black')
118 | plt.plot([7,6],[f_lower(7),f_upp(6)], linestyle='-', color='black')
119 | plt.xlabel('x')
120 | plt.ylabel('y')
121 | plt.title('SVM')
122 |
123 | # =============================================================================
124 | # SVC
125 | # =============================================================================
126 | # --- SECTION 1 ---
127 | # Libraries and data loading
128 | from sklearn.svm import SVC
129 | from sklearn.datasets import load_breast_cancer
130 | from sklearn import metrics
131 |
132 | # --- SECTION 2 ---
133 | # Split the data into train and test set
134 | train_x, train_y = bc.data[:400], bc.target[:400]
135 | test_x, test_y = bc.data[400:], bc.target[400:]
136 |
137 | # --- SECTION 3 ---
138 | # Instantiate, train and evaluate the model
139 | svc = SVC(kernel='linear')
140 | svc.fit(train_x, train_y)
141 | acc = metrics.accuracy_score(test_y, svc.predict(test_x))
142 |
143 | # --- SECTION 4 ---
144 | # Print the model's accuracy
145 | print('---SVM on breast cancer dataset.---')
146 | print('Accuracy: %.2f \n'%acc)
147 | print(metrics.confusion_matrix(test_y, svc.predict(test_x)))
148 |
149 | # =============================================================================
150 | # SVR
151 | # =============================================================================
152 | # --- SECTION 1 ---
153 | # Libraries and data loading
154 | from sklearn.datasets import load_diabetes
155 | from sklearn.svm import SVR
156 | from sklearn import metrics
157 | diabetes = load_diabetes()
158 |
159 |
160 | # --- SECTION 2 ---
161 | # Split the data into train and test set
162 | train_x, train_y = diabetes.data[:400], diabetes.target[:400]
163 | test_x, test_y = diabetes.data[400:], diabetes.target[400:]
164 |
165 | # --- SECTION 3 ---
166 | # Instantiate, train and evaluate the model
167 | svr = SVR(kernel='linear', C=1000)
168 | svr.fit(train_x, train_y)
169 | err = metrics.mean_squared_error(test_y, svr.predict(test_x))
170 | r2 = metrics.r2_score(test_y, svr.predict(test_x))
171 |
172 | # --- SECTION 4 ---
173 | # Print the model
174 | print('---SVM on diabetes dataset.---')
175 | print('R-squared: %.2f'%r2, ' MSE: %.2f \n'%err)
176 |
177 |
178 |
179 | # =============================================================================
180 | # MLP REGRESSION
181 | # =============================================================================
182 |
183 | # --- SECTION 1 ---
184 | # Libraries and data loading
185 | from sklearn.datasets import load_diabetes
186 | from sklearn.neural_network import MLPRegressor
187 | from sklearn import metrics
188 | diabetes = load_diabetes()
189 |
190 |
191 | # --- SECTION 2 ---
192 | # Split the data into train and test set
193 | train_x, train_y = diabetes.data[:400], diabetes.target[:400]
194 | test_x, test_y = diabetes.data[400:], diabetes.target[400:]
195 |
196 | # --- SECTION 3 ---
197 | # Instantiate, train and evaluate the model
198 | mlpr = MLPRegressor(solver='sgd')
199 | mlpr.fit(train_x, train_y)
200 | err = metrics.mean_squared_error(test_y, mlpr.predict(test_x))
201 | r2 = metrics.r2_score(test_y, mlpr.predict(test_x))
202 |
203 | # --- SECTION 4 ---
204 | # Print the model
205 | print('---Neural Networks on diabetes dataset.---')
206 | print('R-squared: %.2f'%r2, ' MSE: %.2f \n'%err)
207 |
208 | # =============================================================================
209 | # MLP CLASSIFICATION
210 | # =============================================================================
211 |
212 | # --- SECTION 1 ---
213 | # Libraries and data loading
214 | from sklearn.datasets import load_breast_cancer
215 | from sklearn.neural_network import MLPClassifier
216 | from sklearn import metrics
217 | bc = load_breast_cancer()
218 |
219 |
220 |
221 |
222 | # --- SECTION 2 ---
223 | # Split the data into train and test set
224 | train_x, train_y = bc.data[:400], bc.target[:400]
225 | test_x, test_y = bc.data[400:], bc.target[400:]
226 |
227 | # --- SECTION 3 ---
228 | # Instantiate, train and evaluate the model
229 | mlpc = MLPClassifier(solver='lbfgs', random_state=12418)
230 | mlpc.fit(train_x, train_y)
231 | acc = metrics.accuracy_score(test_y, mlpc.predict(test_x))
232 |
233 | # --- SECTION 4 ---
234 | # Print the model's accuracy
235 | print('---Neural Networks on breast cancer dataset.---')
236 | print('Accuracy: %.2f \n'%acc)
237 | print(metrics.confusion_matrix(test_y, mlpc.predict(test_x)))
238 |
239 | # =============================================================================
240 | # MLP REGRESSION
241 | # =============================================================================
242 |
243 | # --- SECTION 1 ---
244 | # Libraries and data loading
245 | from sklearn.datasets import load_diabetes
246 | from sklearn.neural_network import MLPRegressor
247 | from sklearn import metrics
248 | diabetes = load_diabetes()
249 |
250 |
251 | # --- SECTION 2 ---
252 | # Split the data into train and test set
253 | train_x, train_y = diabetes.data[:400], diabetes.target[:400]
254 | test_x, test_y = diabetes.data[400:], diabetes.target[400:]
255 |
256 | # --- SECTION 3 ---
257 | # Instantiate, train and evaluate the model
258 | mlpr = MLPRegressor(solver='sgd')
259 | mlpr.fit(train_x, train_y)
260 | err = metrics.mean_squared_error(test_y, mlpr.predict(test_x))
261 | r2 = metrics.r2_score(test_y, mlpr.predict(test_x))
262 |
263 | # --- SECTION 4 ---
264 | # Print the model
265 | print('---Neural Networks on diabetes dataset.---')
266 | print('R-squared: %.2f'%r2, ' MSE: %.2f \n'%err)
267 |
268 | # =============================================================================
269 | # DTREE REGRESSION
270 | # =============================================================================
271 |
272 | # --- SECTION 1 ---
273 | # Libraries and data loading
274 | from sklearn.datasets import load_diabetes
275 | from sklearn.tree import DecisionTreeRegressor
276 | from sklearn import metrics
277 | diabetes = load_diabetes()
278 |
279 |
280 | # --- SECTION 2 ---
281 | # Split the data into train and test set
282 | train_x, train_y = diabetes.data[:400], diabetes.target[:400]
283 | test_x, test_y = diabetes.data[400:], diabetes.target[400:]
284 |
285 | # --- SECTION 3 ---
286 | # Instantiate, train and evaluate the model
287 | dtr = DecisionTreeRegressor(max_depth=2)
288 | dtr.fit(train_x, train_y)
289 | err = metrics.mean_squared_error(test_y, dtr.predict(test_x))
290 | r2 = metrics.r2_score(test_y, dtr.predict(test_x))
291 |
292 | # --- SECTION 4 ---
293 | # Print the model
294 | print('---Decision Tree Regression on diabetes dataset.---')
295 | print('R-squared: %.2f'%r2, ' MSE: %.2f \n'%err)
296 |
297 | # =============================================================================
298 | # DTREE CLASSIFICATION
299 | # =============================================================================
300 |
301 | # --- SECTION 1 ---
302 | # Libraries and data loading
303 | from sklearn.datasets import load_breast_cancer
304 | from sklearn.tree import DecisionTreeClassifier
305 | from sklearn import metrics
306 | bc = load_breast_cancer()
307 |
308 |
309 |
310 | # --- SECTION 2 ---
311 | # Split the data into train and test set
312 | train_x, train_y = bc.data[:400], bc.target[:400]
313 | test_x, test_y = bc.data[400:], bc.target[400:]
314 |
315 | # --- SECTION 3 ---
316 | # Instantiate, train and evaluate the model
317 | dtc = DecisionTreeClassifier(max_depth=2)
318 | dtc.fit(train_x, train_y)
319 | acc = metrics.accuracy_score(test_y, dtc.predict(test_x))
320 |
321 | # --- SECTION 4 ---
322 | # Print the model's accuracy
323 | print('---Decision Tree Classification on breast cancer dataset.---')
324 | print('Accuracy: %.2f \n'%acc)
325 | print(metrics.confusion_matrix(test_y, dtc.predict(test_x)))
326 | from sklearn.tree import export_graphviz
327 | export_graphviz(dtc, feature_names=bc.feature_names,
328 | class_names=bc.target_names, impurity=False)
329 |
330 |
331 |
332 | # =============================================================================
333 | # KNN REGRESSION
334 | # =============================================================================
335 |
336 | # --- SECTION 1 ---
337 | # Libraries and data loading
338 | from sklearn.datasets import load_diabetes
339 | from sklearn.neighbors import KNeighborsRegressor
340 | from sklearn import metrics
341 | diabetes = load_diabetes()
342 |
343 |
344 | # --- SECTION 2 ---
345 | # Split the data into train and test set
346 | train_x, train_y = diabetes.data[:400], diabetes.target[:400]
347 | test_x, test_y = diabetes.data[400:], diabetes.target[400:]
348 |
349 | # --- SECTION 3 ---
350 | # Instantiate, train and evaluate the model
351 | knnr = KNeighborsRegressor(n_neighbors=14)
352 | knnr.fit(train_x, train_y)
353 | err = metrics.mean_squared_error(test_y, knnr.predict(test_x))
354 | r2 = metrics.r2_score(test_y, knnr.predict(test_x))
355 |
356 | # --- SECTION 4 ---
357 | # Print the model
358 | print('---KNN Regression on diabetes dataset.---')
359 | print('R-squared: %.2f'%r2, ' MSE: %.2f \n'%err)
360 |
361 | # =============================================================================
362 | # KNN CLASSIFICATION
363 | # =============================================================================
364 |
365 | # --- SECTION 1 ---
366 | # Libraries and data loading
367 | from sklearn.datasets import load_breast_cancer
368 | from sklearn.neighbors import KNeighborsClassifier
369 | from sklearn import metrics
370 | bc = load_breast_cancer()
371 |
372 |
373 |
374 | # --- SECTION 2 ---
375 | # Split the data into train and test set
376 | train_x, train_y = bc.data[:400], bc.target[:400]
377 | test_x, test_y = bc.data[400:], bc.target[400:]
378 |
379 | # --- SECTION 3 ---
380 | # Instantiate, train and evaluate the model
381 | knnc = KNeighborsClassifier(n_neighbors=5)
382 | knnc.fit(train_x, train_y)
383 | acc = metrics.accuracy_score(test_y, knnc.predict(test_x))
384 |
385 | # --- SECTION 4 ---
386 | # Print the model's accuracy
387 | print('---KNN Classification on breast cancer dataset.---')
388 | print('Accuracy: %.2f \n'%acc)
389 | print(metrics.confusion_matrix(test_y, knnc.predict(test_x)))
390 |
391 |
392 | # =============================================================================
393 | # K-MEANS
394 | # =============================================================================
395 |
396 | # --- SECTION 1 ---
397 | # Libraries and data loading
398 | from sklearn.datasets import load_breast_cancer
399 | from sklearn.cluster import KMeans
400 | bc = load_breast_cancer()
401 |
402 |
403 | bc.data=bc.data[:,:2]
404 |
405 | # --- SECTION 2 ---
406 | # Instantiate and train
407 | km = KMeans(n_clusters=3)
408 | km.fit(bc.data)
409 |
410 | # --- SECTION 3 ---
411 | # Create a point mesh to plot cluster areas
412 |
413 | # Step size of the mesh.
414 | h = .02
415 |
416 | # Plot the decision boundary. For that, we will assign a color to each
417 | x_min, x_max = bc.data[:, 0].min() - 1, bc.data[:, 0].max() + 1
418 | y_min, y_max = bc.data[:, 1].min() - 1, bc.data[:, 1].max() + 1
419 |
420 | # Create the actual mesh and cluster it
421 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
422 | Z = km.predict(np.c_[xx.ravel(), yy.ravel()])
423 |
424 | # Put the result into a color plot
425 | Z = Z.reshape(xx.shape)
426 | plt.figure(1)
427 | plt.clf()
428 | plt.imshow(Z, interpolation='nearest',
429 | extent=(xx.min(), xx.max(), yy.min(), yy.max()),
430 | aspect='auto', origin='lower')
431 |
432 | # --- SECTION 4 ---
433 | # Plot the actual data
434 | c = km.predict(bc.data)
435 |
436 | r = c == 0
437 | b = c == 1
438 | g = c == 2
439 |
440 |
441 | plt.scatter(bc.data[r, 0], bc.data[r, 1], label='cluster 1', color='silver')
442 | plt.scatter(bc.data[b, 0], bc.data[b, 1], label='cluster 2', color='white')
443 | plt.scatter(bc.data[g, 0], bc.data[g, 1], label='cluster 3', color='black')
444 | plt.title('K-means')
445 | plt.xlim(x_min, x_max)
446 | plt.ylim(y_min, y_max)
447 | plt.xticks(())
448 | plt.yticks(())
449 | plt.xlabel(bc.feature_names[0])
450 | plt.ylabel(bc.feature_names[1])
451 | plt.legend()
452 | plt.show()
--------------------------------------------------------------------------------
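
Note on the train/test splits used throughout algos_demo.py: every dataset is split
by slicing the first 400 rows for training and keeping the rest for testing. A
minimal alternative sketch (not part of the script) using scikit-learn's
train_test_split, which shuffles before splitting and keeps the split reproducible:

    from sklearn.datasets import load_diabetes
    from sklearn.model_selection import train_test_split

    diabetes = load_diabetes()
    # Hold out roughly the same number of test instances as the slicing above
    # (the diabetes dataset has 442 rows), but shuffle first so both sets
    # follow the same distribution of the data.
    train_x, test_x, train_y, test_y = train_test_split(
        diabetes.data, diabetes.target, test_size=42, random_state=123456)
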
/Chapter01/datasets_demo.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Tue Feb 12 23:01:53 2019
4 |
5 | @author: George Kyriakides
6 | ge.kyriakides@gmail.com
7 | """
8 |
9 | from sklearn.datasets import load_digits, load_breast_cancer, load_diabetes
10 |
11 | import numpy as np
12 | import matplotlib.pyplot as plt
13 | from mpl_toolkits.mplot3d import Axes3D
14 | from sklearn.cluster import KMeans
15 | from sklearn.decomposition import KernelPCA
16 |
17 | # =============================================================================
18 | # DATASETS
19 | # =============================================================================
20 | diabetes = load_diabetes()
21 | bc = load_breast_cancer()
22 | digits = load_digits()
23 | images_and_labels = list(zip(digits.images, digits.target))
24 | for index, (image, label) in enumerate(images_and_labels[10:20]):
25 | plt.subplot(2, 5, index + 1)
26 | plt.axis('off')
27 | plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
28 | plt.title('Target: %i' % label)
29 |
30 |
31 | # =============================================================================
32 | # CLASSIFICATION
33 | # =============================================================================
34 | f = lambda x: 2 * x - 5
35 |
36 | pos = []
37 | neg = []
38 |
39 | for i in range(30):
40 | x = np.random.randint(15)
41 | y = np.random.randint(15)
42 |
43 | if f(x) < y:
44 | pos.append([x,y])
45 | else:
46 | neg.append([x,y])
47 |
48 |
49 | plt.figure()
50 | plt.xticks([])
51 | plt.yticks([])
52 | plt.scatter(*zip(*pos))
53 | plt.scatter(*zip(*neg))
54 | plt.plot([0,10],[f(0),f(10)], linestyle='--', color='m')
55 | plt.xlabel('x')
56 | plt.ylabel('y')
57 | plt.title('Classification')
58 |
59 | # =============================================================================
60 | # REGRESSION
61 | # =============================================================================
62 |
63 | dat = []
64 |
65 |
66 | for i in range(30):
67 | x = np.random.uniform(10)
68 | y = f(x) + np.random.uniform(-2.0,2.0)
69 |
70 |
71 | dat.append([x,y])
72 |
73 |
74 | plt.figure()
75 | plt.xticks([])
76 | plt.yticks([])
77 | plt.scatter(*zip(*dat))
78 | plt.plot([0,10],[f(0),f(10)], linestyle='--', color='m')
79 | plt.xlabel('x')
80 | plt.ylabel('y')
81 | plt.title('Regression')
82 |
83 | # =============================================================================
84 | # CLUSTERING
85 | # =============================================================================
86 |
87 | km = KMeans(n_clusters=3)
88 | dat = []
89 |
90 | t = 0.5
91 |
92 | for i in range(300):
93 |
94 |
95 | c = np.random.randint(3)
96 | a = np.random.uniform() * 2 * 3.14
97 | r = t * np.sqrt(np.random.uniform())
98 |
99 | x = r * np.cos(a)
100 | y = r * np.sin(a)
101 |
102 |
103 | dat.append([c+x, c+y])
104 |
105 |
106 | c = km.fit_predict(dat)
107 | plt.figure()
108 | plt.xticks([])
109 | plt.yticks([])
110 | plt.scatter(*zip(*dat),c=c)
111 | plt.xlabel('x')
112 | plt.ylabel('y')
113 | plt.title('Clustering')
114 |
115 |
116 | # =============================================================================
117 | # PCA
118 | # =============================================================================
119 |
120 | from sklearn.datasets import make_circles
121 |
122 | pca = KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=10)
123 | x, y = make_circles(n_samples=400, factor=.3, noise=.05)
124 |
125 |
126 | pp = pca.fit_transform(x)
127 | plt.figure()
128 | plt.xticks([])
129 | plt.yticks([])
130 | plt.scatter(pp[:,0], pp[:,1], c=y)
131 | plt.xlabel('x')
132 | plt.ylabel('y')
133 | plt.title('Kernel PCA')
134 |
135 | # =============================================================================
136 | # TSNE
137 | # =============================================================================
138 |
139 | from sklearn.manifold import TSNE
140 |
141 | tsne = TSNE()
142 |
143 | dat = tsne.fit_transform(bc.data)
144 | reds = bc.target == 0
145 | blues = bc.target == 1
146 | plt.scatter(dat[reds,0], dat[reds,1], label='malignant')
147 | plt.scatter(dat[blues,0], dat[blues,1], label='benign')
148 | plt.xlabel('1st Component')
149 | plt.ylabel('2nd Component')
150 | plt.title('Breast Cancer Data')
151 | plt.legend()
152 |
153 | # =============================================================================
154 | # ROC
155 | # =============================================================================
156 | import numpy as np
157 | from sklearn import metrics
158 | ax1 = plt.subplot()
159 | ax1.margins(0)
160 | np.random.seed(856522)
161 | y = np.random.choice([1,2], 30)
162 | scores = np.random.choice([i/100 for i in range(0,100)], 30)
163 | fpr, tpr, thresholds = metrics.roc_curve(y, scores, pos_label=2)
164 |
165 | x = [i/100 for i in range(0,100)]
166 | y = [i/100 for i in range(0,100)]
167 | plt.plot(x, y, linestyle='-.')
168 | plt.plot(fpr, tpr, label='ROC curve')
169 |
170 | plt.xlabel('False Positive Rate (1 - Specificity)')
171 | plt.ylabel('Sensitivity')
172 | plt.title('ROC')
173 | plt.legend()
174 |
--------------------------------------------------------------------------------
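
The ROC section of datasets_demo.py only plots the curve. A short follow-up sketch
(assuming the fpr and tpr arrays computed by metrics.roc_curve above are still in
scope) that summarizes the curve with the area under it:

    from sklearn import metrics

    # metrics.auc integrates the (fpr, tpr) points with the trapezoidal rule,
    # giving the area under the ROC curve (0.5 corresponds to random guessing).
    auc_value = metrics.auc(fpr, tpr)
    print('Area under the ROC curve: %.2f' % auc_value)
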
/Chapter02/bias_variance.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sat Feb 23 19:44:13 2019
4 |
5 | @author: George Kyriakides
6 | ge.kyriakides@gmail.com
7 | """
8 |
9 | import matplotlib.pyplot as plt
10 | import numpy as np
11 |
12 | np.random.seed(123456)
13 |
14 | def f(x):
15 | return np.sin(x)
16 |
17 | def sample(size):
18 | max_v = 20
19 | step = size/max_v
20 | x = [x/step for x in range(size)]
21 | y = [f(x)+np.random.uniform(-0.25,0.25) for x in x]
22 | return np.array(x).reshape(-1,1), np.array(y).reshape(-1,1)
23 |
24 |
25 | # =============================================================================
26 | # HIGH BIAS - UNDERFIT
27 | # =============================================================================
28 | from sklearn.linear_model import LinearRegression
29 | x, y = sample(100)
30 |
31 |
32 | lr = LinearRegression()
33 | lr.fit(x, y)
34 | preds = lr.predict(x)
35 | plt.figure()
36 | plt.scatter(x, y, label='data')
37 | plt.plot(x, preds, color='orange', label='model')
38 | plt.title('Biased Model')
39 | plt.legend()
40 |
41 | # =============================================================================
42 | # HIGH VARIANCE - OVERFIT
43 | # =============================================================================
44 | from sklearn.tree import DecisionTreeRegressor
45 | x, y = sample(100)
46 |
47 | dt = DecisionTreeRegressor()
48 | dt.fit(x, y)
49 | plt.figure()
50 | plt.scatter(x, y, label='training data')
51 | x, y = sample(100)
52 | preds = dt.predict(x)
53 | plt.plot(x, preds, color='orange', label='model')
54 | plt.scatter(x, y, label='test data')
55 | plt.title('High Variance Model')
56 | plt.legend()
57 |
58 |
59 | # =============================================================================
60 | # TRADEOFF
61 | # =============================================================================
62 | def bias(complexity):
63 | return 100/complexity
64 |
65 | def variance(complexity):
66 | return np.exp(complexity/28)
67 |
68 | r = range(5, 100)
69 |
70 | variance_ = np.array([variance(x) for x in r])
71 | bias_ = np.array([bias(x) for x in r])
72 | sum_ = variance_ + bias_
73 | mins = np.argmin(sum_)
74 | min_line = [mins for x in range(0, int(max(sum_)))]
75 |
76 |
77 | plt.figure()
78 | plt.plot(bias_, label=r'$bias^2$', linestyle='-')
79 | plt.plot(variance_, label='variance', linestyle=':')
80 | plt.plot(sum_, label='error', linestyle='-.')
81 | plt.plot(min_line, [x for x in range(0, int(max(sum_)))], linestyle='--')
82 | plt.title('Minimizing Error')
83 | plt.legend()
84 |
85 |
86 | # =============================================================================
87 | # BEST MODEL
88 | # =============================================================================
89 | from sklearn.tree import DecisionTreeRegressor
90 | x, y = sample(100)
91 |
92 | plt.figure()
93 | plt.scatter(x, y, label='training data')
94 |
95 | preds = f(x)
96 | plt.plot(x, preds, color='orange', label='model')
97 | plt.title('Perfect Model')
98 | plt.legend()
--------------------------------------------------------------------------------
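
A small numerical companion to the plots in bias_variance.py (assuming the sample()
helper defined at the top of the script): an overfit model such as the unrestricted
decision tree achieves near-zero training error, while its error on a fresh sample
from the same process is much larger.

    from sklearn.metrics import mean_squared_error
    from sklearn.tree import DecisionTreeRegressor

    x_tr, y_tr = sample(100)   # training sample
    x_te, y_te = sample(100)   # independent sample from the same sine process
    dt = DecisionTreeRegressor()
    dt.fit(x_tr, y_tr)
    # Memorizing the training noise: tiny train MSE, noticeably larger test MSE
    print('train MSE: %.3f' % mean_squared_error(y_tr, dt.predict(x_tr)))
    print('test MSE:  %.3f' % mean_squared_error(y_te, dt.predict(x_te)))
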
/Chapter02/motivation.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Mon Feb 25 23:15:52 2019
4 |
5 | @author: George Kyriakides
6 | ge.kyriakides@gmail.com
7 | """
8 |
9 | import numpy as np
10 | from scipy.special import binom
11 | import matplotlib.pyplot as plt
12 |
13 | # =============================================================================
14 | # ENSEMBLE SIZE - ERROR PLOT
15 | # =============================================================================
16 | def prob(size):
17 | err = 0.15
18 | half = int(np.ceil(size/2))
19 | s = 0
20 |     for i in range(half, size + 1):
21 | s += binom(size, i)*np.power(err,i)*np.power((1-err),(size-i))
22 | return s
23 |
24 |
25 | probs = [15]
26 | rg = range(3,14, 2)
27 | for sz in rg:
28 | probs.append(prob(sz)*100)
29 | print(sz, '%.2f'%(prob(sz)*100))
30 |
31 | rg = range(1,14, 2)
32 | plt.figure()
33 | plt.bar([x for x in rg], probs)
34 | plt.title('Probability of error for ensemble')
35 | plt.xlabel('Number of base learners')
36 | plt.ylabel('Error %')
37 | plt.xticks([x for x in rg])
38 |
39 |
40 | # =============================================================================
41 | # VALIDATION CURVES
42 | # =============================================================================
43 |
44 | # --- SECTION 1 ---
45 | # Libraries and data loading
46 | from sklearn.datasets import load_breast_cancer
47 | from sklearn.model_selection import validation_curve
48 | from sklearn.neighbors import KNeighborsClassifier
49 |
50 | bc = load_breast_cancer()
51 |
52 |
53 | # --- SECTION 2 ---
54 | # Create in-sample and out-of-sample scores
55 | x, y = bc.data, bc.target
56 | learner = KNeighborsClassifier()
57 | param_range = [2,3,4,5]
58 | train_scores, test_scores = validation_curve(learner, x, y,
59 | param_name='n_neighbors',
60 | param_range=param_range,
61 | cv=10,
62 | scoring="accuracy")
63 |
64 | # --- SECTION 3 ---
65 | # Calculate the average and standard deviation for each hyperparameter
66 | train_scores_mean = np.mean(train_scores, axis=1)
67 | train_scores_std = np.std(train_scores, axis=1)
68 | test_scores_mean = np.mean(test_scores, axis=1)
69 | test_scores_std = np.std(test_scores, axis=1)
70 |
71 |
72 | # --- SECTION 4 ---
73 | # Plot the scores
74 | plt.figure()
75 | plt.title('Validation curves')
76 | # Plot the standard deviations
77 | plt.fill_between(param_range, train_scores_mean - train_scores_std,
78 | train_scores_mean + train_scores_std, alpha=0.1,
79 | color="C1")
80 | plt.fill_between(param_range, test_scores_mean - test_scores_std,
81 | test_scores_mean + test_scores_std, alpha=0.1, color="C0")
82 |
83 | # Plot the means
84 | plt.plot(param_range, train_scores_mean, 'o-', color="C1",
85 | label="Training score")
86 | plt.plot(param_range, test_scores_mean, 'o-', color="C0",
87 | label="Cross-validation score")
88 |
89 | plt.xticks(param_range)
90 | plt.xlabel('Number of neighbors')
91 | plt.ylabel('Accuracy')
92 | plt.legend(loc="best")
93 | plt.show()
94 |
95 |
96 | # =============================================================================
97 | # LEARNING CURVES
98 | # =============================================================================
99 |
100 | # --- SECTION 1 ---
101 | # Libraries and data loading
102 | from sklearn.datasets import load_breast_cancer
103 | from sklearn.neighbors import KNeighborsClassifier
104 | from sklearn.model_selection import learning_curve
105 | bc = load_breast_cancer()
106 |
107 |
108 | # --- SECTION 2 ---
109 | # Create in-sample and out-of-sample scores
110 | x, y = bc.data, bc.target
111 | learner = KNeighborsClassifier()
112 | train_sizes = [50, 100, 150, 200, 250, 300]
113 | train_sizes, train_scores, test_scores = learning_curve(learner, x, y,
114 | train_sizes=train_sizes,
115 | cv=10)
116 |
117 |
118 | # --- SECTION 3 ---
119 | # Calculate the average and standard deviation for each hyperparameter
120 | train_scores_mean = np.mean(train_scores, axis=1)
121 | train_scores_std = np.std(train_scores, axis=1)
122 | test_scores_mean = np.mean(test_scores, axis=1)
123 | test_scores_std = np.std(test_scores, axis=1)
124 |
125 | # --- SECTION 4 ---
126 | # Plot the scores
127 | plt.figure()
128 | plt.title('Learning curves')
129 | # Plot the standard deviations
130 | plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
131 | train_scores_mean + train_scores_std, alpha=0.1,
132 | color="C1")
133 | plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
134 | test_scores_mean + test_scores_std, alpha=0.1, color="C0")
135 |
136 | # Plot the means
137 | plt.plot(train_sizes, train_scores_mean, 'o-', color="C1",
138 | label="Training score")
139 | plt.plot(train_sizes, test_scores_mean, 'o-', color="C0",
140 | label="Cross-validation score")
141 |
142 | plt.xticks(train_sizes)
143 | plt.xlabel('Size of training set (instances)')
144 | plt.ylabel('Accuracy')
145 | plt.legend(loc="best")
146 |
147 |
148 |
149 |
--------------------------------------------------------------------------------
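
The prob() function in motivation.py implements the binomial argument behind
majority voting: with size independent base learners, each wrong with probability
err, the ensemble errs when at least ceil(size/2) of them err. A quick worked check
for three base learners with err = 0.15 (not part of the original script):

    from scipy.special import binom

    err, n = 0.15, 3
    # P(2 wrong) + P(3 wrong) = 3 * 0.15**2 * 0.85 + 0.15**3 = 0.06075
    p = sum(binom(n, i) * err**i * (1 - err)**(n - i) for i in range(2, n + 1))
    print('%.4f' % p)  # ~0.061, i.e. roughly 6.1% versus 15% for a single learner
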
/Chapter03/custom_voting_implementation.py:
--------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Import the required libraries
3 | from sklearn import datasets, linear_model, svm, neighbors
4 | from sklearn.metrics import accuracy_score
5 | from numpy import argmax
6 | # Load the dataset
7 | breast_cancer = datasets.load_breast_cancer()
8 | x, y = breast_cancer.data, breast_cancer.target
9 |
10 | # --- SECTION 2 ---
11 | # Instantiate the learners (classifiers)
12 | learner_1 = neighbors.KNeighborsClassifier(n_neighbors=5)
13 | learner_2 = linear_model.Perceptron(tol=1e-2, random_state=0)
14 | learner_3 = svm.SVC(gamma=0.001)
15 |
16 | # --- SECTION 3 ---
17 | # Split the train and test samples
18 | test_samples = 100
19 | x_train, y_train = x[:-test_samples], y[:-test_samples]
20 | x_test, y_test = x[-test_samples:], y[-test_samples:]
21 |
22 | # Fit learners with the train data
23 | learner_1.fit(x_train, y_train)
24 | learner_2.fit(x_train, y_train)
25 | learner_3.fit(x_train, y_train)
26 |
27 | # --- SECTION 4 ---
28 | # Each learner predicts the classes of the test data
29 | predictions_1 = learner_1.predict(x_test)
30 | predictions_2 = learner_2.predict(x_test)
31 | predictions_3 = learner_3.predict(x_test)
32 |
33 | # --- SECTION 5 ---
34 | # We combine the predictions with hard voting
35 | hard_predictions = []
36 | # For each predicted sample
37 | for i in range(test_samples):
38 | # Count the votes for each class
39 | counts = [0 for _ in range(2)]
40 | counts[predictions_1[i]] = counts[predictions_1[i]]+1
41 | counts[predictions_2[i]] = counts[predictions_2[i]]+1
42 | counts[predictions_3[i]] = counts[predictions_3[i]]+1
43 | # Find the class with most votes
44 | final = argmax(counts)
45 | # Add the class to the final predictions
46 | hard_predictions.append(final)
47 |
48 | # --- SECTION 6 ---
49 | # Accuracies of base learners
50 | print('L1:', accuracy_score(y_test, predictions_1))
51 | print('L2:', accuracy_score(y_test, predictions_2))
52 | print('L3:', accuracy_score(y_test, predictions_3))
53 | # Accuracy of hard voting
54 | print('-'*30)
55 | print('Hard Voting:', accuracy_score(y_test, hard_predictions))
56 |
--------------------------------------------------------------------------------
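
The voting loop in SECTION 5 can also be written in vectorized form. A minimal
sketch using numpy (it assumes the predictions_1/2/3 arrays produced by the script
above and reproduces the same majority vote, with ties resolved towards class 0,
exactly as argmax does in the loop):

    import numpy as np

    # Stack the three prediction vectors into a (3, n_samples) array and,
    # for every column (sample), pick the label with the most votes.
    all_predictions = np.stack([predictions_1, predictions_2, predictions_3])
    vectorized_hard = np.array([np.bincount(col, minlength=2).argmax()
                                for col in all_predictions.T])
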
/Chapter03/custom_voting_implementation_analysis.py:
--------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Import the required libraries
3 | import matplotlib as mpl
4 | import matplotlib.pyplot as plt
5 | mpl.style.use('seaborn-paper')
6 |
7 | # --- SECTION 2 ---
8 | # Calculate the errors
9 | errors_1 = y_test-predictions_1
10 | errors_2 = y_test-predictions_2
11 | errors_3 = y_test-predictions_3
12 |
13 |
14 | # --- SECTION 3 ---
15 | # Discard correct predictions and plot each learner's errors
16 | x=[]
17 | y=[]
18 | for i in range(len(errors_1)):
19 | if not errors_1[i] == 0:
20 | x.append(i)
21 | y.append(errors_1[i])
22 | plt.scatter(x, y, s=120, label='Learner 1 Errors')
23 |
24 | x=[]
25 | y=[]
26 | for i in range(len(errors_2)):
27 | if not errors_2[i] == 0:
28 | x.append(i)
29 | y.append(errors_2[i])
30 | plt.scatter(x, y, marker='x', s=60, label='Learner 2 Errors')
31 |
32 | x=[]
33 | y=[]
34 | for i in range(len(errors_3)):
35 | if not errors_3[i] == 0:
36 | x.append(i)
37 | y.append(errors_3[i])
38 | plt.scatter(x, y, s=20, label='Learner 3 Errors')
39 |
40 | plt.title('Learner errors')
41 | plt.xlabel('Test sample')
42 | plt.ylabel('Error')
43 | plt.legend()
44 | plt.show()
--------------------------------------------------------------------------------
/Chapter03/scikit_hard_voting.py:
--------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Import the required libraries
3 | from sklearn import datasets, linear_model, svm, neighbors
4 | from sklearn.ensemble import VotingClassifier
5 | from sklearn.metrics import accuracy_score
6 | # Load the dataset
7 | breast_cancer = datasets.load_breast_cancer()
8 | x, y = breast_cancer.data, breast_cancer.target
9 |
10 | # Split the train and test samples
11 | test_samples = 100
12 | x_train, y_train = x[:-test_samples], y[:-test_samples]
13 | x_test, y_test = x[-test_samples:], y[-test_samples:]
14 |
15 |
16 | # --- SECTION 2 ---
17 | # Instantiate the learners (classifiers)
18 | learner_1 = neighbors.KNeighborsClassifier(n_neighbors=5)
19 | learner_2 = linear_model.Perceptron(tol=1e-2, random_state=0)
20 | learner_3 = svm.SVC(gamma=0.001)
21 |
22 | # --- SECTION 3 ---
23 | # Instantiate the voting classifier
24 | voting = VotingClassifier([('KNN', learner_1),
25 | ('Prc', learner_2),
26 | ('SVM', learner_3)])
27 |
28 |
29 | # --- SECTION 4 ---
30 | # Fit classifier with the training data
31 | voting.fit(x_train, y_train)
32 |
33 | # --- SECTION 5 ---
34 | # Predict the most voted class
35 | hard_predictions = voting.predict(x_test)
36 |
37 | # --- SECTION 6 ---
38 | # Accuracy of hard voting
39 | print('-'*30)
40 | print('Hard Voting:', accuracy_score(y_test, hard_predictions))
41 |
--------------------------------------------------------------------------------
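
VotingClassifier also accepts per-learner weights, which bias the vote towards
selected base learners. A brief sketch with arbitrary, illustrative weights (it
reuses the learners and the train/test split from the script above):

    from sklearn.ensemble import VotingClassifier

    weighted_voting = VotingClassifier([('KNN', learner_1),
                                        ('Prc', learner_2),
                                        ('SVM', learner_3)],
                                       weights=[1, 1, 2])  # count the SVM's vote twice
    weighted_voting.fit(x_train, y_train)
    print('Weighted Voting:', weighted_voting.score(x_test, y_test))
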
/Chapter03/scikit_soft_voting.py:
--------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Import the required libraries
3 | from sklearn import datasets, naive_bayes, svm, neighbors
4 | from sklearn.ensemble import VotingClassifier
5 | from sklearn.metrics import accuracy_score
6 | # Load the dataset
7 | breast_cancer = datasets.load_breast_cancer()
8 | x, y = breast_cancer.data, breast_cancer.target
9 |
10 | # Split the train and test samples
11 | test_samples = 100
12 | x_train, y_train = x[:-test_samples], y[:-test_samples]
13 | x_test, y_test = x[-test_samples:], y[-test_samples:]
14 |
15 | # --- SECTION 2 ---
16 | # Instantiate the learners (classifiers)
17 | learner_1 = neighbors.KNeighborsClassifier(n_neighbors=5)
18 | learner_2 = naive_bayes.GaussianNB()
19 | learner_3 = svm.SVC(gamma=0.001, probability=True)
20 |
21 | # --- SECTION 3 ---
22 | # Instantiate the voting classifier
23 | voting = VotingClassifier([('KNN', learner_1),
24 | ('NB', learner_2),
25 | ('SVM', learner_3)],
26 | voting='soft')
27 |
28 |
29 |
30 |
31 | # --- SECTION 4 ---
32 | # Fit classifier with the training data
33 | voting.fit(x_train, y_train)
34 | learner_1.fit(x_train, y_train)
35 | learner_2.fit(x_train, y_train)
36 | learner_3.fit(x_train, y_train)
37 |
38 | # --- SECTION 5 ---
39 | # Predict the most probable class
40 | hard_predictions = voting.predict(x_test)
41 |
42 | # --- SECTION 6 ---
43 | # Get the base learner predictions
44 | predictions_1 = learner_1.predict(x_test)
45 | predictions_2 = learner_2.predict(x_test)
46 | predictions_3 = learner_3.predict(x_test)
47 |
48 | # --- SECTION 7 ---
49 | # Accuracies of base learners
50 | print('L1:', accuracy_score(y_test, predictions_1))
51 | print('L2:', accuracy_score(y_test, predictions_2))
52 | print('L3:', accuracy_score(y_test, predictions_3))
53 | # Accuracy of soft voting
54 | print('-'*30)
55 | print('Soft Voting:', accuracy_score(y_test, hard_predictions))
56 |
57 |
58 |
--------------------------------------------------------------------------------
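
Soft voting averages the class probabilities reported by each base learner and
predicts the class with the highest mean probability. A minimal sketch that
reproduces the ensemble's decision by hand (assuming the three fitted learners from
the script above; small deviations are possible because the SVC's probability
calibration is not seeded):

    import numpy as np

    mean_probabilities = (learner_1.predict_proba(x_test) +
                          learner_2.predict_proba(x_test) +
                          learner_3.predict_proba(x_test)) / 3
    manual_soft = np.argmax(mean_probabilities, axis=1)
    # manual_soft should (approximately) match voting.predict(x_test)
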
/Chapter03/scikit_soft_voting_2knn.py:
--------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Import the required libraries
3 | from sklearn import datasets, naive_bayes, svm, neighbors
4 | from sklearn.ensemble import VotingClassifier
5 | from sklearn.metrics import accuracy_score
6 | # Load the dataset
7 | breast_cancer = datasets.load_breast_cancer()
8 | x, y = breast_cancer.data, breast_cancer.target
9 |
10 | # Split the train and test samples
11 | test_samples = 100
12 | x_train, y_train = x[:-test_samples], y[:-test_samples]
13 | x_test, y_test = x[-test_samples:], y[-test_samples:]
14 |
15 | # --- SECTION 2 ---
16 | # Instantiate the learners (classifiers)
17 | learner_1 = neighbors.KNeighborsClassifier(n_neighbors=5)
18 | learner_2 = naive_bayes.GaussianNB()
19 | learner_3 = neighbors.KNeighborsClassifier(n_neighbors=50)
20 |
21 | # --- SECTION 3 ---
22 | # Instantiate the voting classifier
23 | voting = VotingClassifier([('5NN', learner_1),
24 | ('NB', learner_2),
25 | ('50NN', learner_3)],
26 | voting='soft')
27 |
28 |
29 |
30 |
31 | # --- SECTION 4 ---
32 | # Fit classifier with the training data
33 | voting.fit(x_train, y_train)
34 | learner_1.fit(x_train, y_train)
35 | learner_2.fit(x_train, y_train)
36 | learner_3.fit(x_train, y_train)
37 |
38 | # --- SECTION 5 ---
39 | # Predict the most probable class
40 | hard_predictions = voting.predict(x_test)
41 |
42 | # --- SECTION 6 ---
43 | # Get the base learner predictions
44 | predictions_1 = learner_1.predict(x_test)
45 | predictions_2 = learner_2.predict(x_test)
46 | predictions_3 = learner_3.predict(x_test)
47 |
48 | # --- SECTION 7 ---
49 | # Accuracies of base learners
50 | print('L1:', accuracy_score(y_test, predictions_1))
51 | print('L2:', accuracy_score(y_test, predictions_2))
52 | print('L3:', accuracy_score(y_test, predictions_3))
53 | # Accuracy of soft voting
54 | print('-'*30)
55 | print('Soft Voting:', accuracy_score(y_test, hard_predictions))
56 |
57 | # --- SECTION 1 ---
58 | # Import the required libraries
59 | import matplotlib as mpl
60 | import matplotlib.pyplot as plt
61 | mpl.style.use('seaborn-paper')
62 |
63 |
64 | # --- SECTION 2 ---
65 | # Get the wrongly predicted instances
66 | # and the predicted probabilities for the whole test set
67 | errors = y_test-hard_predictions
68 |
69 | probabilities_1 = learner_1.predict_proba(x_test)
70 | probabilities_2 = learner_2.predict_proba(x_test)
71 | probabilities_3 = learner_3.predict_proba(x_test)
72 |
73 |
74 | # --- SECTION 3 ---
75 | # Store the predicted probability for
76 | # each wrongly predicted instance, for each base learner
77 | # as well as the average predicted probability
78 | #
79 | x=[]
80 | y_1=[]
81 | y_2=[]
82 | y_3=[]
83 | y_avg=[]
84 |
85 | for i in range(len(errors)):
86 | if not errors[i] == 0:
87 | x.append(i)
88 | y_1.append(probabilities_1[i][0])
89 | y_2.append(probabilities_2[i][0])
90 | y_3.append(probabilities_3[i][0])
91 | y_avg.append((probabilities_1[i][0]+probabilities_2[i][0]+probabilities_3[i][0])/3)
92 |
93 | # --- SECTION 4 ---
94 | # Plot the predicted probability of each base learner as
95 | # a bar and the average probability as an X
96 | plt.bar(x, y_1, 3, label='5NN')
97 | plt.bar(x, y_2, 2, label='NB')
98 | plt.bar(x, y_3, 1, label='50NN')
99 | plt.scatter(x, y_avg, marker='x', c='k', s=150, label='Average Positive', zorder=10)
100 |
101 | y = [0.5 for x in range(len(errors))]
102 | plt.plot(y, c='k', linestyle='--')
103 |
104 | plt.title('Positive Probability')
105 | plt.xlabel('Test sample')
106 | plt.ylabel('probability')
107 | plt.legend()
108 |
109 |
110 |
111 |
112 |
113 |
114 |
--------------------------------------------------------------------------------
/Chapter03/scikit_soft_voting_analysis.py:
--------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Import the required libraries
3 | import matplotlib as mpl
4 | import matplotlib.pyplot as plt
5 | mpl.style.use('seaborn-paper')
6 |
7 |
8 | # --- SECTION 2 ---
9 | # Get the wrongly predicted instances
10 | # and the predicted probabilities for the whole test set
11 | errors = y_test-hard_predictions
12 |
13 | probabilities_1 = learner_1.predict_proba(x_test)
14 | probabilities_2 = learner_2.predict_proba(x_test)
15 | probabilities_3 = learner_3.predict_proba(x_test)
16 |
17 |
18 | # --- SECTION 3 ---
19 | # Store the predicted probability for
20 | # each wrongly predicted instance, for each base learner
21 | # as well as the average predicted probability
22 | #
23 | x=[]
24 | y_1=[]
25 | y_2=[]
26 | y_3=[]
27 | y_avg=[]
28 |
29 | for i in range(len(errors)):
30 | if not errors[i] == 0:
31 | x.append(i)
32 | y_1.append(probabilities_1[i][0])
33 | y_2.append(probabilities_2[i][0])
34 | y_3.append(probabilities_3[i][0])
35 | y_avg.append((probabilities_1[i][0]+probabilities_2[i][0]+probabilities_3[i][0])/3)
36 |
37 | # --- SECTION 4 ---
38 | # Plot the predicted probability of each base learner as
39 | # a bar and the average probability as an X
40 | plt.bar(x, y_1, 3, label='KNN')
41 | plt.bar(x, y_2, 2, label='NB')
42 | plt.bar(x, y_3, 1, label='SVM')
43 | plt.scatter(x, y_avg, marker='x', c='k', s=150, label='Average Positive', zorder=10)
44 |
45 | y = [0.5 for x in range(len(errors))]
46 | plt.plot(y, c='k', linestyle='--')
47 |
48 | plt.title('Positive Probability')
49 | plt.xlabel('Test sample')
50 | plt.ylabel('probability')
51 | plt.legend()
52 | plt.show()
--------------------------------------------------------------------------------
/Chapter04/linear_nonlinear_example.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 |
4 | x = [i for i in range(100)]
5 | y = [5 for i in range(100)]
6 |
7 | for i in range(30, 60):
8 | y[i] = 4+((i-45)**2)/230
9 |
10 | for i in range(100):
11 | y[i] = y[i] + np.random.uniform(-0.03, 0.03)
12 |
13 | plt.scatter(x, y, label='Data')
14 |
15 | y = [5 for i in range(100)]
16 |
17 | for i in range(20, 70):
18 | y[i] = 4+((i-45)**2)/230
19 |
20 | plt.plot([5 for i in range(100)], label='Linear $y=5$', color='C1')
21 | plt.plot(x[20:70], y[20:70], label='Non-Linear $y=x^2$', color='C2')
22 | plt.xlabel('x')
23 | plt.ylabel('y')
24 | plt.title('Linear and Non-Linear Relationships')
25 | plt.legend()
26 |
27 |
--------------------------------------------------------------------------------
/Chapter04/stacking_classification.py:
--------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Libraries and data loading
3 | from sklearn.datasets import load_breast_cancer
4 | from sklearn.neighbors import KNeighborsClassifier
5 | from sklearn.tree import DecisionTreeClassifier
6 | from sklearn.neural_network import MLPClassifier
7 | from sklearn.naive_bayes import GaussianNB
8 | from sklearn.linear_model import LogisticRegression
9 | from sklearn.model_selection import KFold
10 | from sklearn import metrics
11 | import numpy as np
12 | bc = load_breast_cancer()
13 |
14 |
15 | train_x, train_y = bc.data[:400], bc.target[:400]
16 | test_x, test_y = bc.data[400:], bc.target[400:]
17 |
18 | # --- SECTION 2 ---
19 | # Create the ensemble's base learners and meta learner
20 | # Append base learners to a list for ease of access
21 | base_learners = []
22 |
23 | knn = KNeighborsClassifier(n_neighbors=2)
24 | base_learners.append(knn)
25 |
26 | dtr = DecisionTreeClassifier(max_depth=4, random_state=123456)
27 | base_learners.append(dtr)
28 |
29 | mlpc = MLPClassifier(hidden_layer_sizes =(100, ), solver='lbfgs', random_state=123456)
30 | base_learners.append(mlpc)
31 |
32 |
33 | meta_learner = LogisticRegression(solver='lbfgs')
34 |
35 |
36 | # --- SECTION 3 ---
37 | # Create the training meta data
38 |
39 | # Create variables to store meta data and their targets
40 | meta_data = np.zeros((len(base_learners), len(train_x)))
41 | meta_targets = np.zeros(len(train_x))
42 |
43 | # Create the cross-validation folds
44 | KF = KFold(n_splits=5)
45 | meta_index = 0
46 | for train_indices, test_indices in KF.split(train_x):
47 | # Train each learner on the K-1 folds and create meta data for the Kth fold
48 | for i in range(len(base_learners)):
49 | learner = base_learners[i]
50 |
51 | learner.fit(train_x[train_indices], train_y[train_indices])
52 | predictions = learner.predict_proba(train_x[test_indices])[:,0]
53 |
54 | meta_data[i][meta_index:meta_index+len(test_indices)] = predictions
55 |
56 | meta_targets[meta_index:meta_index+len(test_indices)] = train_y[test_indices]
57 | meta_index += len(test_indices)
58 |
59 | # Transpose the meta data to be fed into the meta learner
60 | meta_data = meta_data.transpose()
61 |
62 | # --- SECTION 4 ---
63 | # Create the meta data for the test set and evaluate the base learners
64 | test_meta_data = np.zeros((len(base_learners), len(test_x)))
65 | base_acc = []
66 | for i in range(len(base_learners)):
67 | learner = base_learners[i]
68 | learner.fit(train_x, train_y)
69 | predictions = learner.predict_proba(test_x)[:,0]
70 | test_meta_data[i] = predictions
71 |
72 | acc = metrics.accuracy_score(test_y, learner.predict(test_x))
73 |
74 |
75 | base_acc.append(acc)
76 |
77 | test_meta_data = test_meta_data.transpose()
78 |
79 | # --- SECTION 5 ---
80 | # Fit the meta learner on the train set and evaluate it on the test set
81 | meta_learner.fit(meta_data, meta_targets)
82 | ensemble_predictions = meta_learner.predict(test_meta_data)
83 |
84 | acc = metrics.accuracy_score(test_y, ensemble_predictions)
85 |
86 | # --- SECTION 6 ---
87 | # Print the results
88 | print('Acc Name')
89 | print('-'*20)
90 | for i in range(len(base_learners)):
91 | learner = base_learners[i]
92 |
93 | print(f'{base_acc[i]:.2f} {learner.__class__.__name__}')
94 | print(f'{acc:.2f} Ensemble')
95 |
--------------------------------------------------------------------------------
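
Newer scikit-learn releases (0.22 and later) ship a StackingClassifier that
automates the cross-validated construction of meta features performed manually
above. A rough equivalent of this script's ensemble, assuming the base learners
and meta learner defined in SECTION 2:

    from sklearn.ensemble import StackingClassifier

    stack = StackingClassifier(estimators=[('knn', knn), ('dt', dtr), ('mlp', mlpc)],
                               final_estimator=meta_learner,
                               cv=5)
    stack.fit(train_x, train_y)
    print('Stacking accuracy: %.2f' % stack.score(test_x, test_y))
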
/Chapter04/stacking_classification_analysis.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Mon Mar 18 20:19:55 2019
4 |
5 | @author: George Kyriakides
6 | ge.kyriakides@gmail.com
7 | """
8 |
9 | base_errors = (test_meta_data.transpose() - test_y).transpose()
10 | prediction_errors = ensemble_predictions - test_y
11 |
12 | for i in range(len(prediction_errors)):
13 | if not prediction_errors[i] == 0.0:
14 | print(base_errors[i,:])
--------------------------------------------------------------------------------
/Chapter04/stacking_classifiers.py:
--------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Libraries
3 | import numpy as np
4 |
5 | from sklearn.model_selection import KFold
6 | from copy import deepcopy
7 |
8 |
9 | class StackingRegressor():
10 |
11 | # --- SECTION 2 ---
12 | # The constructor
13 | def __init__(self, learners):
14 | # Create a list of sizes for each stacking level
15 | # And a list of deep copied learners
16 | self.level_sizes = []
17 | self.learners = []
18 | for learning_level in learners:
19 |
20 | self.level_sizes.append(len(learning_level))
21 | level_learners = []
22 | for learner in learning_level:
23 | level_learners.append(deepcopy(learner))
24 | self.learners.append(level_learners)
25 |
26 |
27 |
28 | # --- SECTION 3 ---
29 | # The fit function. Creates training meta data for every level and trains
30 | # each level on the previous level's meta data
31 | def fit(self, x, y):
32 | # Create a list of training meta data, one for each stacking level
33 | # and another one for the targets. For the first level, the actual data
34 | # is used.
35 | meta_data = [x]
36 | meta_targets = [y]
37 | for i in range(len(self.learners)):
38 | level_size = self.level_sizes[i]
39 |
40 | # Create the meta data and target variables for this level
41 | data_z = np.zeros((level_size, len(x)))
42 | target_z = np.zeros(len(x))
43 |
44 | train_x = meta_data[i]
45 | train_y = meta_targets[i]
46 |
47 | # Create the cross-validation folds
48 | KF = KFold(n_splits=5)
49 | meta_index = 0
50 | for train_indices, test_indices in KF.split(x):
51 | # Train each learner on the K-1 folds and create
52 | # meta data for the Kth fold
53 | for j in range(len(self.learners[i])):
54 |
55 | learner = self.learners[i][j]
56 | learner.fit(train_x[train_indices], train_y[train_indices])
57 | predictions = learner.predict(train_x[test_indices])
58 |
59 | data_z[j][meta_index:meta_index+len(test_indices)] = predictions
60 |
61 | target_z[meta_index:meta_index+len(test_indices)] = train_y[test_indices]
62 | meta_index += len(test_indices)
63 |
64 | # Add the data and targets to the meta data lists
65 | data_z = data_z.transpose()
66 | meta_data.append(data_z)
67 | meta_targets.append(target_z)
68 |
69 |
70 | # Train the learner on the whole previous meta data
71 | for learner in self.learners[i]:
72 | learner.fit(train_x, train_y)
73 |
74 |
75 |
76 |
77 |
78 |
79 | # --- SECTION 4 ---
80 | # The predict function. Creates meta data for the test data and returns
81 | # all of them. The actual predictions can be accessed with meta_data[-1]
82 | def predict(self, x):
83 |
84 | # Create a list of training meta data, one for each stacking level
85 | meta_data = [x]
86 | for i in range(len(self.learners)):
87 | level_size = self.level_sizes[i]
88 |
89 | data_z = np.zeros((level_size, len(x)))
90 |
91 | test_x = meta_data[i]
92 |
93 |             # Each learner of this level predicts on the previous
94 |             # level's meta data. No cross-validation is needed here:
95 |             # the learners were already fitted in fit(), so we simply
96 |             # generate the meta features for the next level
97 |             for j in range(len(self.learners[i])):
98 |
99 |                 learner = self.learners[i][j]
100 |                 predictions = learner.predict(test_x)
101 |                 data_z[j] = predictions
102 |
103 |
104 |
105 |
106 | # Add the data and targets to the meta data lists
107 | data_z = data_z.transpose()
108 | meta_data.append(data_z)
109 |
110 |         # Return the meta data; the final level's predictions
111 |         # can be accessed with meta_data[-1]
112 | return meta_data
113 |
114 |
115 |
116 | # --- SECTION 5 ---
117 | # Use the stacking regressor on the diabetes dataset
118 | from sklearn.datasets import load_diabetes
119 | from sklearn.neighbors import KNeighborsRegressor
120 | from sklearn.tree import DecisionTreeRegressor
121 | from sklearn.linear_model import LinearRegression, Ridge
122 | from sklearn import metrics
123 | diabetes = load_diabetes()
124 |
125 | train_x, train_y = diabetes.data[:400], diabetes.target[:400]
126 | test_x, test_y = diabetes.data[400:], diabetes.target[400:]
127 |
128 | base_learners = []
129 |
130 | knn = KNeighborsRegressor(n_neighbors=5)
131 | base_learners.append(knn)
132 |
133 | dtr = DecisionTreeRegressor(max_depth=4, random_state=123456)
134 | base_learners.append(dtr)
135 |
136 | ridge = Ridge()
137 | base_learners.append(ridge)
138 |
139 | meta_learner = LinearRegression()
140 |
141 | # Instantiate the stacking regressor
142 | sc = StackingRegressor([[knn,dtr,ridge],[meta_learner]])
143 |
144 | # Fit and predict
145 | sc.fit(train_x, train_y)
146 | meta_data = sc.predict(test_x)
147 |
148 | # Evaluate base learners and meta learner
149 | base_errors = []
150 | base_r2 = []
151 | for i in range(len(base_learners)):
152 | learner = base_learners[i]
153 |
154 | predictions = meta_data[1][:,i]
155 |
156 | err = metrics.mean_squared_error(test_y, predictions)
157 | r2 = metrics.r2_score(test_y, predictions)
158 |
159 | base_errors.append(err)
160 | base_r2.append(r2)
161 |
162 | err = metrics.mean_squared_error(test_y, meta_data[-1])
163 | r2 = metrics.r2_score(test_y, meta_data[-1])
164 |
165 | # Print the results
166 | print('ERROR R2 Name')
167 | print('-'*20)
168 | for i in range(len(base_learners)):
169 | learner = base_learners[i]
170 |
171 | print(f'{base_errors[i]:.1f} {base_r2[i]:.2f} {learner.__class__.__name__}')
172 | print(f'{err:.1f} {r2:.2f} Ensemble')
173 |
--------------------------------------------------------------------------------
/Chapter04/stacking_regression.py:
--------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Libraries and data loading
3 | from sklearn.datasets import load_diabetes
4 | from sklearn.neighbors import KNeighborsRegressor
5 | from sklearn.tree import DecisionTreeRegressor
6 | from sklearn.linear_model import LinearRegression, Ridge
7 | from sklearn.model_selection import KFold
8 | from sklearn import metrics
9 | import numpy as np
10 | diabetes = load_diabetes()
11 |
12 | train_x, train_y = diabetes.data[:400], diabetes.target[:400]
13 | test_x, test_y = diabetes.data[400:], diabetes.target[400:]
14 |
15 | # --- SECTION 2 ---
16 | # Create the ensemble's base learners and meta learner
17 | # Append base learners to a list for ease of access
18 | base_learners = []
19 |
20 | knn = KNeighborsRegressor(n_neighbors=5)
21 | base_learners.append(knn)
22 |
23 | dtr = DecisionTreeRegressor(max_depth=4, random_state=123456)
24 | base_learners.append(dtr)
25 |
26 | ridge = Ridge()
27 | base_learners.append(ridge)
28 |
29 | meta_learner = LinearRegression()
30 |
31 |
32 | # --- SECTION 3 ---
33 | # Create the training meta data
34 |
35 | # Create variables to store meta data and their targets
36 | meta_data = np.zeros((len(base_learners), len(train_x)))
37 | meta_targets = np.zeros(len(train_x))
38 |
39 | # Create the cross-validation folds
40 | KF = KFold(n_splits=5)
41 | meta_index = 0
42 | for train_indices, test_indices in KF.split(train_x):
43 | # Train each learner on the K-1 folds and create meta data for the Kth fold
44 | for i in range(len(base_learners)):
45 | learner = base_learners[i]
46 |
47 | learner.fit(train_x[train_indices], train_y[train_indices])
48 | predictions = learner.predict(train_x[test_indices])
49 |
50 | meta_data[i][meta_index:meta_index+len(test_indices)] = predictions
51 |
52 | meta_targets[meta_index:meta_index+len(test_indices)] = train_y[test_indices]
53 | meta_index += len(test_indices)
54 |
55 | # Transpose the meta data to be fed into the meta learner
56 | meta_data = meta_data.transpose()
57 |
58 | # --- SECTION 4 ---
59 | # Create the meta data for the test set and evaluate the base learners
60 | test_meta_data = np.zeros((len(base_learners), len(test_x)))
61 | base_errors = []
62 | base_r2 = []
63 | for i in range(len(base_learners)):
64 | learner = base_learners[i]
65 | learner.fit(train_x, train_y)
66 | predictions = learner.predict(test_x)
67 | test_meta_data[i] = predictions
68 |
69 | err = metrics.mean_squared_error(test_y, predictions)
70 | r2 = metrics.r2_score(test_y, predictions)
71 |
72 | base_errors.append(err)
73 | base_r2.append(r2)
74 |
75 | test_meta_data = test_meta_data.transpose()
76 |
77 | # --- SECTION 5 ---
78 | # Fit the meta learner on the train set and evaluate it on the test set
79 | meta_learner.fit(meta_data, meta_targets)
80 | ensemble_predictions = meta_learner.predict(test_meta_data)
81 |
82 | err = metrics.mean_squared_error(test_y, ensemble_predictions)
83 | r2 = metrics.r2_score(test_y, ensemble_predictions)
84 |
85 | # --- SECTION 6 ---
86 | # Print the results
87 | print('ERROR R2 Name')
88 | print('-'*20)
89 | for i in range(len(base_learners)):
90 | learner = base_learners[i]
91 |
92 | print(f'{base_errors[i]:.1f} {base_r2[i]:.2f} {learner.__class__.__name__}')
93 | print(f'{err:.1f} {r2:.2f} Ensemble')
94 |
--------------------------------------------------------------------------------
/Chapter04/stacking_regression_analysis.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 |
3 | knn_d = test_meta_data[:,0]-test_y
4 | dtr_d = test_meta_data[:,1]-test_y
5 | ridge_d = test_meta_data[:,2]-test_y
6 | meta_d = ensemble_predictions-test_y
7 |
8 | plt.plot(knn_d, label='KNN')
9 | plt.plot(dtr_d, label='DTree')
10 | plt.plot(ridge_d, label='Ridge')
11 | plt.plot(meta_d, label='Ensemble')
12 | plt.legend()
--------------------------------------------------------------------------------
/Chapter05/bagging_custom.py:
--------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Libraries and data loading
3 | from sklearn.datasets import load_digits
4 | from sklearn.tree import DecisionTreeClassifier
5 | from sklearn import metrics
6 | import numpy as np
7 | import time
8 |
9 | start = time.time()
10 |
11 | digits = load_digits()
12 |
13 | train_size = 1500
14 | train_x, train_y = digits.data[:train_size], digits.target[:train_size]
15 | test_x, test_y = digits.data[train_size:], digits.target[train_size:]
16 |
17 | # --- SECTION 2 ---
18 | # Create our bootstrap samples and train the classifiers
19 |
20 | ensemble_size = 10
21 | base_learners = []
22 |
23 | for _ in range(ensemble_size):
24 | # We sample indices in order to access features and targets
25 | bootstrap_sample_indices = np.random.randint(0, train_size, size=train_size)
26 | bootstrap_x = train_x[bootstrap_sample_indices]
27 | bootstrap_y = train_y[bootstrap_sample_indices]
28 | dtree = DecisionTreeClassifier()
29 | dtree.fit(bootstrap_x, bootstrap_y)
30 | base_learners.append(dtree)
31 |
32 | # --- SECTION 3 ---
33 | # Predict with the base learners and evaluate them
34 | base_predictions = []
35 | base_accuracy = []
36 | for learner in base_learners:
37 | predictions = learner.predict(test_x)
38 | base_predictions.append(predictions)
39 | acc = metrics.accuracy_score(test_y, predictions)
40 | base_accuracy.append(acc)
41 |
42 | # --- SECTION 4 ---
43 | # Combine the base learners' predictions
44 |
45 | ensemble_predictions = []
46 | # Find the most voted class for each test instance
47 | for i in range(len(test_y)):
48 | # Count the votes for each class
49 | counts = [0 for _ in range(10)]
50 | for learner_predictions in base_predictions:
51 | counts[learner_predictions[i]] = counts[learner_predictions[i]]+1
52 |
53 | # Find the class with most votes
54 | final = np.argmax(counts)
55 | # Add the class to the final predictions
56 | ensemble_predictions.append(final)
57 |
58 | ensemble_acc = metrics.accuracy_score(test_y, ensemble_predictions)
59 |
60 | end = time.time()
61 |
62 |
63 | # --- SECTION 5 ---
64 | # Print the accuracies
65 | print('Base Learners:')
66 | print('-'*30)
67 | for index, acc in enumerate(sorted(base_accuracy)):
68 |     print(f'Learner {index+1}: {acc:.2f}')
69 | print('-'*30)
70 | print('Bagging: %.2f' % ensemble_acc)
71 |
72 | print('Total time: %.2f' % (end - start))
73 |
--------------------------------------------------------------------------------
/Chapter05/bagging_custom_parallel.py:
--------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Libraries
3 | from sklearn.datasets import load_digits
4 | from sklearn.tree import DecisionTreeClassifier
5 | from sklearn import metrics
6 | import numpy as np
7 | import time
8 |
9 |
10 | from concurrent.futures import ProcessPoolExecutor
11 |
12 | # --- SECTION 2 ---
13 | # Define required functions
14 | train_size = 1500
15 |
16 |
17 | def create_learner(train_x, train_y):
18 | # We sample indices in order to access features and targets
19 | bootstrap_sample_indices = np.random.randint(0, train_size, size=train_size)
20 | bootstrap_x = train_x[bootstrap_sample_indices]
21 | bootstrap_y = train_y[bootstrap_sample_indices]
22 | dtree = DecisionTreeClassifier()
23 | dtree.fit(bootstrap_x, bootstrap_y)
24 | return dtree
25 |
26 |
27 | def predict(learner, test_x):
28 | return learner.predict(test_x)
29 |
30 |
31 | # --- SECTION 3 ---
32 | # Protect our main
33 | if __name__ == '__main__':
34 |
35 | start = time.time()
36 | digits = load_digits()
37 |
38 | train_x, train_y = digits.data[:train_size], digits.target[:train_size]
39 | test_x, test_y = digits.data[train_size:], digits.target[train_size:]
40 |
41 | ensemble_size = 1000
42 | base_learners = []
43 |
44 | # --- SECTION 4 ---
45 | # Create the base learners
46 | with ProcessPoolExecutor() as executor:
47 | futures = []
48 | for _ in range(ensemble_size):
49 | future = executor.submit(create_learner, train_x, train_y)
50 | futures.append(future)
51 |
52 | for future in futures:
53 | base_learners.append(future.result())
54 |
55 | # --- SECTION 5 ---
56 | # Predict with the base learners and evaluate them
57 | base_predictions = []
58 | base_accuracy = []
59 | with ProcessPoolExecutor() as executor:
60 | futures = []
61 | for learner in base_learners:
62 | future = executor.submit(predict, learner, test_x)
63 | futures.append(future)
64 |
65 | for future in futures:
66 | predictions = future.result()
67 | base_predictions.append(predictions)
68 | acc = metrics.accuracy_score(test_y, predictions)
69 | base_accuracy.append(acc)
70 |
71 | # --- SECTION 6 ---
72 | # Combine the base learners' predictions
73 | ensemble_predictions = []
74 | # Find the most voted class for each test instance
75 | for i in range(len(test_y)):
76 | # Count the votes for each class
77 | counts = [0 for _ in range(10)]
78 | for learner_predictions in base_predictions:
79 | counts[learner_predictions[i]] = counts[learner_predictions[i]]+1
80 |
81 | # Find the class with most votes
82 | final = np.argmax(counts)
83 | # Add the class to the final predictions
84 | ensemble_predictions.append(final)
85 |
86 | ensemble_acc = metrics.accuracy_score(test_y, ensemble_predictions)
87 |
88 | end = time.time()
89 |
90 | # --- SECTION 7 ---
91 | # Print the accuracies
92 | print('Base Learners:')
93 | print('-'*30)
94 | for index, acc in enumerate(sorted(base_accuracy)):
95 |         print(f'Learner {index+1}: {acc:.2f}')
96 | print('-'*30)
97 | print('Bagging: %.2f' % ensemble_acc)
98 | print('Total time: %.2f' % (end - start))
99 |
--------------------------------------------------------------------------------
/Chapter05/bagging_sklearn_classification.py:
--------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Libraries and data loading
3 | from sklearn.datasets import load_digits
4 | from sklearn.tree import DecisionTreeClassifier
5 | from sklearn.ensemble import BaggingClassifier
6 | from sklearn import metrics
7 |
8 |
9 | digits = load_digits()
10 |
11 | train_size = 1500
12 | train_x, train_y = digits.data[:train_size], digits.target[:train_size]
13 | test_x, test_y = digits.data[train_size:], digits.target[train_size:]
14 |
15 | # --- SECTION 2 ---
16 | # Create the ensemble
17 | ensemble_size = 10
18 | ensemble = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
19 | n_estimators=ensemble_size,
20 | oob_score=True)
21 |
22 | # --- SECTION 3 ---
23 | # Train the ensemble
24 | ensemble.fit(train_x, train_y)
25 |
26 | # --- SECTION 4 ---
27 | # Evaluate the ensemble
28 | ensemble_predictions = ensemble.predict(test_x)
29 |
30 | ensemble_acc = metrics.accuracy_score(test_y, ensemble_predictions)
31 |
32 | # --- SECTION 5 ---
33 | # Print the accuracy
34 | print('Bagging: %.2f' % ensemble_acc)
35 |
--------------------------------------------------------------------------------
/Chapter05/bagging_sklearn_regression.py:
--------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Libraries and data loading
3 | from sklearn.datasets import load_diabetes
4 | from sklearn.tree import DecisionTreeRegressor
5 | from sklearn.ensemble import BaggingRegressor
6 | from sklearn import metrics
7 | import numpy as np
8 | diabetes = load_diabetes()
9 |
10 | np.random.seed(1234)
11 |
12 | train_x, train_y = diabetes.data[:400], diabetes.target[:400]
13 | test_x, test_y = diabetes.data[400:], diabetes.target[400:]
14 |
15 | # --- SECTION 2 ---
16 | # Create the ensemble and a single base learner for comparison
17 | estimator = DecisionTreeRegressor(max_depth=6)
18 | ensemble = BaggingRegressor(base_estimator=estimator,
19 | n_estimators=10)
20 |
21 | # --- SECTION 3 ---
22 | # Train and evaluate both the ensemble and the base learner
23 | ensemble.fit(train_x, train_y)
24 | ensemble_predictions = ensemble.predict(test_x)
25 |
26 | estimator.fit(train_x, train_y)
27 | single_predictions = estimator.predict(test_x)
28 |
29 | ensemble_r2 = metrics.r2_score(test_y, ensemble_predictions)
30 | ensemble_mse = metrics.mean_squared_error(test_y, ensemble_predictions)
31 |
32 | single_r2 = metrics.r2_score(test_y, single_predictions)
33 | single_mse = metrics.mean_squared_error(test_y, single_predictions)
34 |
35 | # --- SECTION 4 ---
36 | # Print the metrics
37 | print('Bagging r-squared: %.2f' % ensemble_r2)
38 | print('Bagging MSE: %.2f' % ensemble_mse)
39 | print('-'*30)
40 | print('Decision Tree r-squared: %.2f' % single_r2)
41 | print('Decision Tree MSE: %.2f' % single_mse)
42 |
--------------------------------------------------------------------------------
/Chapter05/bootstrapping.py:
--------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Libraries and data loading
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 | from sklearn.datasets import load_diabetes
6 |
7 | diabetes = load_diabetes()
8 |
9 | # --- SECTION 2 ---
10 | # Print the original sample's statistics
11 | target = diabetes.target
12 |
13 | print(np.mean(target))
14 | print(np.std(target))
15 |
16 | # --- SECTION 3 ---
17 | # Create the bootstrap samples and statistics
18 | bootstrap_stats = []
19 |
20 | for _ in range(10000):
21 |     bootstrap_sample = np.random.choice(target, size=len(target))
22 | mean = np.mean(bootstrap_sample)
23 | std = np.std(bootstrap_sample)
24 | bootstrap_stats.append((mean, std))
25 |
26 | bootstrap_stats = np.array(bootstrap_stats)
27 |
28 |
29 | # --- SECTION 4 ---
30 | # plot the histograms
31 | plt.figure()
32 | plt.subplot(2,1,1)
33 | std_err = np.std(bootstrap_stats[:,0])
34 | plt.title('Mean, Std. Error: %.2f'%std_err)
35 | plt.hist(bootstrap_stats[:,0], bins=20)
36 |
37 | plt.subplot(2,1,2)
38 | std_err = np.std(bootstrap_stats[:,1])
39 | plt.title('Std. Dev, Std. Error: %.2f'%std_err)
40 | plt.hist(bootstrap_stats[:,1], bins=20)
41 | plt.show()
--------------------------------------------------------------------------------
/Chapter05/validation_curves.py:
--------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Libraries and data loading
3 | from sklearn.datasets import load_digits
4 | from sklearn.tree import DecisionTreeClassifier
5 | from sklearn.ensemble import BaggingClassifier
6 | from sklearn.model_selection import validation_curve
7 | import warnings
8 | import matplotlib.pyplot as plt
9 | import numpy as np
10 |
11 | warnings.filterwarnings("ignore")
12 |
13 | digits = load_digits()
14 |
15 | train_size = 1500
16 | train_x, train_y = digits.data[:train_size], digits.target[:train_size]
17 | test_x, test_y = digits.data[train_size:], digits.target[train_size:]
18 |
19 |
20 | # --- SECTION 2 ---
21 | # Create in-sample and out-of-sample scores
22 | x, y = train_x, train_y
23 | learner = BaggingClassifier()
24 | param_range = [x for x in range(1, 40, 2)]
25 | train_scores, test_scores = validation_curve(learner, x, y,
26 | param_name='n_estimators',
27 | param_range=param_range,
28 | cv=10,
29 | scoring="accuracy")
30 |
31 | # --- SECTION 3 ---
32 | # Calculate the average and standard deviation for each hyperparameter
33 | train_scores_mean = np.mean(train_scores, axis=1)
34 | train_scores_std = np.std(train_scores, axis=1)
35 | test_scores_mean = np.mean(test_scores, axis=1)
36 | test_scores_std = np.std(test_scores, axis=1)
37 |
38 |
39 | # --- SECTION 4 ---
40 | # Plot the scores
41 | plt.figure()
42 | plt.title('Validation curves')
43 | # Plot the standard deviations
44 | plt.fill_between(param_range, train_scores_mean - train_scores_std,
45 | train_scores_mean + train_scores_std, alpha=0.1,
46 | color="C1")
47 | plt.fill_between(param_range, test_scores_mean - test_scores_std,
48 | test_scores_mean + test_scores_std, alpha=0.1, color="C0")
49 |
50 | # Plot the means
51 | plt.plot(param_range, train_scores_mean, 'o-', color="C1",
52 | label="Training score")
53 | plt.plot(param_range, test_scores_mean, 'o-', color="C0",
54 | label="Cross-validation score")
55 |
56 | plt.xticks(param_range)
57 | plt.xlabel('Ensemble Size')
58 | plt.ylabel('Accuracy')
59 | plt.legend(loc="best")
--------------------------------------------------------------------------------
/Chapter06/adaboost_custom.py:
--------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Libraries and data loading
3 | from copy import deepcopy
4 | from sklearn.datasets import load_breast_cancer
5 | from sklearn.tree import DecisionTreeClassifier
6 | from sklearn import metrics
7 |
8 | import numpy as np
9 |
10 | bc = load_breast_cancer()
11 |
12 | train_size = 400
13 | train_x, train_y = bc.data[:train_size], bc.target[:train_size]
14 | test_x, test_y = bc.data[train_size:], bc.target[train_size:]
15 |
16 | np.random.seed(123456)
17 |
18 | # --- SECTION 2 ---
19 | # Create the ensemble
20 | ensemble_size = 100
21 | base_classifier = DecisionTreeClassifier(max_depth=1)
22 |
23 | # Create the initial weights
24 | data_weights = np.zeros(train_size) + 1/train_size
25 | # Create a list of indices for the train set
26 | indices = [x for x in range(train_size)]
27 |
28 | base_learners = []
29 | learners_errors = np.zeros(ensemble_size)
30 | learners_weights = np.zeros(ensemble_size)
31 |
32 | errs = []
33 | # Create each base learner
34 | for i in range(ensemble_size):
35 | weak_learner = deepcopy(base_classifier)
36 |
37 | # Choose the samples by sampling with replacement.
38 | # Each instance's probability is dictated by its weight.
39 | data_indices = np.random.choice(indices, train_size, p=data_weights)
40 | sample_x, sample_y = train_x[data_indices], train_y[data_indices]
41 |
42 | # Fit the weak learner and evaluate it
43 | weak_learner.fit(sample_x, sample_y)
44 | predictions = weak_learner.predict(train_x)
45 |
46 | errors = predictions != train_y
47 | corrects = predictions == train_y
48 |
49 | # Calculate the weighted errors
50 | weighted_errors = data_weights*errors
51 |
52 |
53 | # The base learner's error is the average of the weighted errors
54 | learner_error = np.mean(weighted_errors)
55 | learners_errors[i] = learner_error
56 |
57 | # The learner's weight
58 | learner_weight = np.log((1-learner_error)/learner_error)/2
59 | learners_weights[i] = learner_weight
60 |
61 |     # Update the data weights: up-weight errors, down-weight correct predictions
62 |     data_weights[errors] = data_weights[errors] * np.exp(learner_weight)
63 |     data_weights[corrects] = data_weights[corrects] * np.exp(-learner_weight)
64 |
65 | data_weights = data_weights/sum(data_weights)
66 | # Save the learner
67 | base_learners.append(weak_learner)
68 |
69 |
70 |
71 | # --- SECTION 3 ---
72 | # Evaluate the ensemble
73 | ensemble_predictions = []
74 | for learner, weight in zip(base_learners, learners_weights):
75 | # Calculate the weighted predictions
76 | prediction = learner.predict(test_x)
77 | ensemble_predictions.append(prediction*weight)
78 |
79 | # The final prediction is the weighted mean of the individual predictions
80 | ensemble_predictions = np.mean(ensemble_predictions, axis=0) >= 0.5
81 |
82 | ensemble_acc = metrics.accuracy_score(test_y, ensemble_predictions)
83 |
84 | # --- SECTION 4 ---
85 | # Print the accuracy
86 | print('Boosting: %.2f' % ensemble_acc)
87 |
--------------------------------------------------------------------------------
/Chapter06/adaboost_sklearn_classification.py:
--------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Libraries and data loading
3 | import numpy as np
4 |
5 | from sklearn.datasets import load_digits
6 | from sklearn.tree import DecisionTreeClassifier
7 | from sklearn.ensemble import AdaBoostClassifier
8 | from sklearn import metrics
9 |
10 |
11 | digits = load_digits()
12 |
13 | train_size = 1500
14 | train_x, train_y = digits.data[:train_size], digits.target[:train_size]
15 | test_x, test_y = digits.data[train_size:], digits.target[train_size:]
16 |
17 | np.random.seed(123456)
18 | # --- SECTION 2 ---
19 | # Create the ensemble
20 | ensemble_size = 1000
21 | ensemble = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
22 | algorithm="SAMME",
23 | n_estimators=ensemble_size)
24 |
25 | # --- SECTION 3 ---
26 | # Train the ensemble
27 | ensemble.fit(train_x, train_y)
28 |
29 | # --- SECTION 4 ---
30 | # Evaluate the ensemble
31 | ensemble_predictions = ensemble.predict(test_x)
32 |
33 | ensemble_acc = metrics.accuracy_score(test_y, ensemble_predictions)
34 |
35 | # --- SECTION 5 ---
36 | # Print the accuracy
37 | print('Boosting: %.2f' % ensemble_acc)
38 |
39 |
--------------------------------------------------------------------------------
/Chapter06/adaboost_sklearn_regression.py:
--------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Libraries and data loading
3 | from copy import deepcopy
4 | from sklearn.datasets import load_diabetes
5 | from sklearn.ensemble import AdaBoostRegressor
6 | from sklearn.tree import DecisionTreeRegressor
7 | from sklearn import metrics
8 |
9 | import numpy as np
10 |
11 | diabetes = load_diabetes()
12 |
13 | train_size = 400
14 | train_x, train_y = diabetes.data[:train_size], diabetes.target[:train_size]
15 | test_x, test_y = diabetes.data[train_size:], diabetes.target[train_size:]
16 |
17 | np.random.seed(123456)
18 |
19 | # --- SECTION 2 ---
20 | # Create the ensemble
21 | ensemble_size = 1000
22 | ensemble = AdaBoostRegressor(n_estimators=ensemble_size)
23 |
24 | # --- SECTION 3 ---
25 | # Evaluate the ensemble
26 | ensemble.fit(train_x, train_y)
27 | predictions = ensemble.predict(test_x)
28 |
29 | # --- SECTION 4 ---
30 | # Print the metrics
31 | r2 = metrics.r2_score(test_y, predictions)
32 | mse = metrics.mean_squared_error(test_y, predictions)
33 |
34 | print('AdaBoost:')
35 | print('R-squared: %.2f' % r2)
36 | print('MSE: %.2f' % mse)
--------------------------------------------------------------------------------
/Chapter06/boosting_overfit.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 |
4 | np.random.seed(12345)
5 |
6 | points = np.random.multivariate_normal([1, 1], ([1, 0.5],[0.5, 1]), 10)
7 | points2 = np.random.multivariate_normal([4, 4], ([1, 0.5],[0.5, 1]), 10)
8 |
9 | plt.scatter(*zip(*points), label='Class 1', marker='+', s=150)
10 | plt.scatter(*zip(*points2), label='Class 2', marker='_', s=150)
11 | plt.plot([-x+6 for x in range(0,10)], linestyle='--',
12 | color='black', label='class boundary')
13 |
14 | #plt.text(0,5, '+', fontsize=18)
15 | #plt.text(1.5,5.5, '_', fontsize=18)
16 |
17 | r = range(-5,10)
18 | stable = [x for x in r]
19 |
20 | plt.plot([1.45 for x in r], stable, linestyle='--',
21 | color='gray', label='outlier rules')
22 | plt.plot([1.9 for x in r], stable, linestyle='--',
23 | color='gray')
24 |
25 | plt.plot(stable,[0.85 for x in r], linestyle='--',
26 | color='gray')
27 | plt.plot(stable,[0.55 for x in r], linestyle='--',
28 | color='gray')
29 | plt.xticks([], [])
30 | plt.yticks([], [])
31 |
32 |
33 |
34 | plt.legend()
--------------------------------------------------------------------------------
/Chapter06/dataset_segmentation.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 |
4 | np.random.seed(654321)
5 |
6 | points = np.random.randint(0, 10, size=(10, 2))
7 | classes = np.random.randint(0, 2, size=(10,))
8 |
9 |
10 | positives = points[classes == 0]
11 | negatives = points[classes == 1]
12 |
13 | plt.scatter(*positives.T, marker='+', s=150)
14 | plt.scatter(*negatives.T, marker='_', s=150)
15 | plt.xticks([], [])
16 | plt.yticks([], [])
17 |
18 | plt.plot([1.5 for _ in range(12)], [x for x in range(-1, 11)], linestyle='--', color='black')
19 | plt.plot([4.5 for _ in range(12)], [x for x in range(-1, 11)], linestyle='--', color='black')
20 |
21 | plt.plot([x for x in range(-1, 8)], [1.5 for _ in range(9)], linestyle='--', color='black')
--------------------------------------------------------------------------------
/Chapter06/gradient_boosting_custom.py:
--------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Libraries and data loading
3 | from copy import deepcopy
4 | from sklearn.datasets import load_diabetes
5 | from sklearn.tree import DecisionTreeRegressor
6 | from sklearn import metrics
7 |
8 | import numpy as np
9 |
10 | diabetes = load_diabetes()
11 |
12 | train_size = 400
13 | train_x, train_y = diabetes.data[:train_size], diabetes.target[:train_size]
14 | test_x, test_y = diabetes.data[train_size:], diabetes.target[train_size:]
15 |
16 | np.random.seed(123456)
17 |
18 | # --- SECTION 2 ---
19 | # Create the ensemble
20 |
21 | # Define the ensemble's size, learning rate and decision tree depth
22 | ensemble_size = 50
23 | learning_rate = 0.1
24 | base_classifier = DecisionTreeRegressor(max_depth=3)
25 |
26 | # Create placeholders for the base learners and each step's prediction
27 | base_learners = []
28 | # Note that the initial prediction is the target variable's mean
29 | previous_predictions = np.zeros(len(train_y)) + np.mean(train_y)
30 |
31 | # Create the base learners
32 | for _ in range(ensemble_size):
33 |     # Start by calculating the pseudo-residuals
34 | errors = train_y - previous_predictions
35 |
36 | # Make a deep copy of the base classifier and train it on the
37 | # pseudo-residuals
38 | learner = deepcopy(base_classifier)
39 | learner.fit(train_x, errors)
40 |
41 | # Predict the residuals on the train set
42 | predictions = learner.predict(train_x)
43 |
44 | # Multiply the predictions with the learning rate and add the results
45 | # to the previous prediction
46 | previous_predictions = previous_predictions + learning_rate*predictions
47 |
48 | # Save the base learner
49 | base_learners.append(learner)
50 |
51 | # --- SECTION 3 ---
52 | # Evaluate the ensemble
53 |
54 | # Start with the train set's mean
55 | previous_predictions = np.zeros(len(test_y)) + np.mean(train_y)
56 |
57 | # For each base learner predict the pseudo-residuals for the test set and
58 | # add them to the previous prediction, after multiplying with the learning rate
59 | for learner in base_learners:
60 | predictions = learner.predict(test_x)
61 | previous_predictions = previous_predictions + learning_rate*predictions
62 |
63 | # --- SECTION 4 ---
64 | # Print the metrics
65 | r2 = metrics.r2_score(test_y, previous_predictions)
66 | mse = metrics.mean_squared_error(test_y, previous_predictions)
67 |
68 | print('Gradient Boosting:')
69 | print('R-squared: %.2f' % r2)
70 | print('MSE: %.2f' % mse)
--------------------------------------------------------------------------------
/Chapter06/gradient_boosting_sklearn_classification.py:
--------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Libraries and data loading
3 | import numpy as np
4 |
5 | from sklearn.datasets import load_digits
6 | from sklearn.tree import DecisionTreeClassifier
7 | from sklearn.ensemble import GradientBoostingClassifier
8 | from sklearn import metrics
9 |
10 |
11 | digits = load_digits()
12 |
13 | train_size = 1500
14 | train_x, train_y = digits.data[:train_size], digits.target[:train_size]
15 | test_x, test_y = digits.data[train_size:], digits.target[train_size:]
16 |
17 | np.random.seed(123456)
18 | # --- SECTION 2 ---
19 | # Create the ensemble
20 | ensemble_size = 200
21 | learning_rate = 0.1
22 | ensemble = GradientBoostingClassifier(n_estimators=ensemble_size,
23 | learning_rate=learning_rate)
24 |
25 | # --- SECTION 3 ---
26 | # Train the ensemble
27 | ensemble.fit(train_x, train_y)
28 |
29 | # --- SECTION 4 ---
30 | # Evaluate the ensemble
31 | ensemble_predictions = ensemble.predict(test_x)
32 |
33 | ensemble_acc = metrics.accuracy_score(test_y, ensemble_predictions)
34 |
35 | # --- SECTION 5 ---
36 | # Print the accuracy
37 | print('Boosting: %.2f' % ensemble_acc)
38 |
39 |
40 | import matplotlib.pyplot as plt
41 | diffs = [ensemble.train_score_[i] - ensemble.train_score_[i-1] for i in range(1, len(ensemble.train_score_))]
42 |
43 | fig, ax1 = plt.subplots()
44 | ax1.plot(ensemble.train_score_, linestyle='--', label='Errors (Left axis)')
45 |
46 |
47 | ax2 = ax1.twinx()
48 | ax2.plot(diffs, label='Errors Differences (Right axis)')
49 | fig.legend()
50 |
--------------------------------------------------------------------------------
/Chapter06/gradient_boosting_sklearn_regression.py:
--------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Libraries and data loading
3 | from sklearn.datasets import load_diabetes
4 | from sklearn.ensemble import GradientBoostingRegressor
5 | from sklearn import metrics
6 |
7 | import numpy as np
8 |
9 | diabetes = load_diabetes()
10 |
11 | train_size = 400
12 | train_x, train_y = diabetes.data[:train_size], diabetes.target[:train_size]
13 | test_x, test_y = diabetes.data[train_size:], diabetes.target[train_size:]
14 |
15 | np.random.seed(123456)
16 |
17 | # --- SECTION 2 ---
18 | # Create the ensemble
19 | ensemble_size = 200
20 | learning_rate = 0.1
21 | ensemble = GradientBoostingRegressor(n_estimators=ensemble_size,
22 | learning_rate=learning_rate)
23 |
24 | # --- SECTION 3 ---
25 | # Evaluate the ensemble
26 | ensemble.fit(train_x, train_y)
27 | predictions = ensemble.predict(test_x)
28 |
29 | # --- SECTION 4 ---
30 | # Print the metrics
31 | r2 = metrics.r2_score(test_y, predictions)
32 | mse = metrics.mean_squared_error(test_y, predictions)
33 |
34 | print('Gradient Boosting:')
35 | print('R-squared: %.2f' % r2)
36 | print('MSE: %.2f' % mse)
37 |
38 |
39 | import matplotlib.pyplot as plt
40 | diffs = [ensemble.train_score_[i] - ensemble.train_score_[i-1] for i in range(1, len(ensemble.train_score_))]
41 |
42 | fig, ax1 = plt.subplots()
43 | ax1.plot(ensemble.train_score_, linestyle='--', label='Errors (Left axis)')
44 |
45 |
46 | ax2 = ax1.twinx()
47 | ax2.plot(diffs, label='Errors Differences (Right axis)')
48 | fig.legend()
49 |
--------------------------------------------------------------------------------
/Chapter06/xgb_classification.py:
--------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Libraries and data loading
3 | from sklearn.datasets import load_digits
4 | from xgboost import XGBClassifier
5 | from sklearn import metrics
6 |
7 | import numpy as np
8 |
9 | digits = load_digits()
10 |
11 | train_size = 1500
12 | train_x, train_y = digits.data[:train_size], digits.target[:train_size]
13 | test_x, test_y = digits.data[train_size:], digits.target[train_size:]
14 |
15 | np.random.seed(123456)
16 | # --- SECTION 2 ---
17 | # Create the ensemble
18 | ensemble_size = 100
19 | ensemble = XGBClassifier(n_estimators=ensemble_size, n_jobs=4)
20 |
21 | # --- SECTION 3 ---
22 | # Train the ensemble
23 | ensemble.fit(train_x, train_y)
24 |
25 | # --- SECTION 4 ---
26 | # Evaluate the ensemble
27 | ensemble_predictions = ensemble.predict(test_x)
28 |
29 | ensemble_acc = metrics.accuracy_score(test_y, ensemble_predictions)
30 |
31 | # --- SECTION 5 ---
32 | # Print the accuracy
33 | print('Boosting: %.2f' % ensemble_acc)
34 |
--------------------------------------------------------------------------------
/Chapter06/xgb_regression.py:
--------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Libraries and data loading
3 | from sklearn.datasets import load_diabetes
4 | from xgboost import XGBRegressor
5 | from sklearn import metrics
6 |
7 | import numpy as np
8 |
9 | diabetes = load_diabetes()
10 |
11 | train_size = 400
12 | train_x, train_y = diabetes.data[:train_size], diabetes.target[:train_size]
13 | test_x, test_y = diabetes.data[train_size:], diabetes.target[train_size:]
14 |
15 | np.random.seed(123456)
16 |
17 | # --- SECTION 2 ---
18 | # Create the ensemble
19 | ensemble_size = 200
20 | ensemble = XGBRegressor(n_estimators=ensemble_size, n_jobs=4,
21 | max_depth=1, learning_rate=0.1,
22 |                         objective='reg:squarederror')
23 |
24 | # --- SECTION 3 ---
25 | # Evaluate the ensemble
26 | ensemble.fit(train_x, train_y)
27 | predictions = ensemble.predict(test_x)
28 |
29 | # --- SECTION 4 ---
30 | # Print the metrics
31 | r2 = metrics.r2_score(test_y, predictions)
32 | mse = metrics.mean_squared_error(test_y, predictions)
33 |
34 | print('XGBoost:')
35 | print('R-squared: %.2f' % r2)
36 | print('MSE: %.2f' % mse)
37 |
--------------------------------------------------------------------------------
/Chapter07/extra_tree_classification.py:
--------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Libraries and data loading
3 | from sklearn.datasets import load_digits
4 | from sklearn.ensemble import ExtraTreesClassifier
5 | from sklearn import metrics
6 | import numpy as np
7 |
8 | digits = load_digits()
9 |
10 |
11 | train_size = 1500
12 | train_x, train_y = digits.data[:train_size], digits.target[:train_size]
13 | test_x, test_y = digits.data[train_size:], digits.target[train_size:]
14 |
15 | np.random.seed(123456)
16 | # --- SECTION 2 ---
17 | # Create the ensemble
18 | ensemble_size = 500
19 | ensemble = ExtraTreesClassifier(n_estimators=ensemble_size, n_jobs=4)
20 |
21 | # --- SECTION 3 ---
22 | # Train the ensemble
23 | ensemble.fit(train_x, train_y)
24 |
25 | # --- SECTION 4 ---
26 | # Evaluate the ensemble
27 | ensemble_predictions = ensemble.predict(test_x)
28 |
29 | ensemble_acc = metrics.accuracy_score(test_y, ensemble_predictions)
30 |
31 | # --- SECTION 5 ---
32 | # Print the accuracy
33 | print('Extra Tree Forest: %.2f' % ensemble_acc)
34 |
--------------------------------------------------------------------------------
/Chapter07/extra_tree_classification_validation_curves.py:
--------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Libraries and data loading
3 | from sklearn.datasets import load_digits
4 | from sklearn.ensemble import ExtraTreesClassifier
5 | from sklearn.model_selection import validation_curve
6 | from sklearn import metrics
7 | import numpy as np
8 | import matplotlib.pyplot as plt
9 |
10 | digits = load_digits()
11 |
12 |
13 | train_size = 1500
14 | train_x, train_y = digits.data[:train_size], digits.target[:train_size]
15 | test_x, test_y = digits.data[train_size:], digits.target[train_size:]
16 |
17 | np.random.seed(123456)
18 | # --- SECTION 2 ---
19 | # Create the ensemble
20 | ensemble_size = 500
21 | ensemble = ExtraTreesClassifier(n_estimators=ensemble_size, n_jobs=4)
22 |
23 | param_range = [10, 50, 100, 150, 200, 250, 300, 350, 400]
24 | train_scores, test_scores = validation_curve(ensemble, train_x, train_y, param_name='n_estimators',
25 |                                              param_range=param_range, cv=10, scoring='accuracy')
26 |
27 | # --- SECTION 3 ---
28 | # Calculate the average and standard deviation for each hyperparameter
29 | train_scores_mean = np.mean(train_scores, axis=1)
30 | train_scores_std = np.std(train_scores, axis=1)
31 | test_scores_mean = np.mean(test_scores, axis=1)
32 | test_scores_std = np.std(test_scores, axis=1)
33 |
34 |
35 | # --- SECTION 4 ---
36 | # Plot the scores
37 | plt.figure()
38 | plt.title('Validation curves (Extra Trees)')
39 | # Plot the standard deviations
40 | plt.fill_between(param_range, train_scores_mean - train_scores_std,
41 | train_scores_mean + train_scores_std, alpha=0.1,
42 | color="C1")
43 | plt.fill_between(param_range, test_scores_mean - test_scores_std,
44 | test_scores_mean + test_scores_std, alpha=0.1, color="C0")
45 |
46 | # Plot the means
47 | plt.plot(param_range, train_scores_mean, 'o-', color="C1",
48 | label="Training score")
49 | plt.plot(param_range, test_scores_mean, 'o-', color="C0",
50 | label="Cross-validation score")
51 |
52 | plt.xticks(param_range)
53 | plt.xlabel('Number of trees')
54 | plt.ylabel('Accuracy')
55 | plt.legend(loc="best")
56 |
--------------------------------------------------------------------------------
/Chapter07/extra_tree_regression.py:
--------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Libraries and data loading
3 | from copy import deepcopy
4 | from sklearn.datasets import load_diabetes
5 | from sklearn.ensemble import ExtraTreesRegressor
6 | from sklearn import metrics
7 |
8 | import numpy as np
9 |
10 | diabetes = load_diabetes()
11 |
12 | train_size = 400
13 | train_x, train_y = diabetes.data[:train_size], diabetes.target[:train_size]
14 | test_x, test_y = diabetes.data[train_size:], diabetes.target[train_size:]
15 |
16 | np.random.seed(123456)
17 |
18 | # --- SECTION 2 ---
19 | # Create the ensemble
20 | ensemble_size = 1000
21 | ensemble = ExtraTreesRegressor(n_estimators=ensemble_size,
22 | min_samples_leaf=10, n_jobs=4)
23 |
24 | # --- SECTION 3 ---
25 | # Evaluate the ensemble
26 | ensemble.fit(train_x, train_y)
27 | predictions = ensemble.predict(test_x)
28 |
29 | # --- SECTION 4 ---
30 | # Print the metrics
31 | r2 = metrics.r2_score(test_y, predictions)
32 | mse = metrics.mean_squared_error(test_y, predictions)
33 |
34 | print('Extra Trees:')
35 | print('R-squared: %.2f' % r2)
36 | print('MSE: %.2f' % mse)
--------------------------------------------------------------------------------
/Chapter07/probability_to_choose.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 |
4 |
5 | p = 0
6 | def prob(relevant, irrelevant, select):
7 | p = 1 - relevant/(relevant+irrelevant)
8 | p_none = np.power(p, select)
9 | at_least_one = 1 - p_none
10 | return at_least_one
11 |
12 |
13 | data = np.zeros((10,10))
14 | for i in range(1, 11):
15 | for j in range(1, 11):
16 | select = int(np.floor(np.sqrt(j*10)))
17 | data[-1+i,-1+j] = prob(i,j*10,select)
18 |
19 |
20 | fig, ax = plt.subplots()
21 | plt.gray()
22 | cs = ax.imshow(data, extent=[10,100,10,1])
23 | ax.set_aspect(10)
24 | plt.xlabel('Irrelevant Features')
25 | plt.ylabel('Relevant Features')
26 | plt.title('Probability to select at least one relevant feature')
27 | fig.colorbar(cs)
--------------------------------------------------------------------------------
/Chapter07/rf_classification.py:
--------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Libraries and data loading
3 | from sklearn.datasets import load_digits
4 | from sklearn.ensemble import RandomForestClassifier
5 | from sklearn import metrics
6 | import numpy as np
7 |
8 | digits = load_digits()
9 |
10 |
11 | train_size = 1500
12 | train_x, train_y = digits.data[:train_size], digits.target[:train_size]
13 | test_x, test_y = digits.data[train_size:], digits.target[train_size:]
14 |
15 | np.random.seed(123456)
16 | # --- SECTION 2 ---
17 | # Create the ensemble
18 | ensemble_size = 500
19 | ensemble = RandomForestClassifier(n_estimators=ensemble_size, n_jobs=4)
20 |
21 | # --- SECTION 3 ---
22 | # Train the ensemble
23 | ensemble.fit(train_x, train_y)
24 |
25 | # --- SECTION 4 ---
26 | # Evaluate the ensemble
27 | ensemble_predictions = ensemble.predict(test_x)
28 |
29 | ensemble_acc = metrics.accuracy_score(test_y, ensemble_predictions)
30 |
31 | # --- SECTION 5 ---
32 | # Print the accuracy
33 | print('Random Forest: %.2f' % ensemble_acc)
34 |
--------------------------------------------------------------------------------
/Chapter07/rf_classification_validation_curves.py:
--------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Libraries and data loading
3 | from sklearn.datasets import load_digits
4 | from sklearn.ensemble import RandomForestClassifier
5 | from sklearn.model_selection import validation_curve
6 | from sklearn import metrics
7 | import numpy as np
8 | import matplotlib.pyplot as plt
9 |
10 | digits = load_digits()
11 |
12 |
13 | train_size = 1500
14 | train_x, train_y = digits.data[:train_size], digits.target[:train_size]
15 | test_x, test_y = digits.data[train_size:], digits.target[train_size:]
16 |
17 | np.random.seed(123456)
18 | # --- SECTION 2 ---
19 | # Create the ensemble
20 | ensemble_size = 500
21 | ensemble = RandomForestClassifier(n_estimators=ensemble_size, n_jobs=4)
22 |
23 | param_range = [10, 50, 100, 150, 200, 250, 300, 350, 400]
24 | train_scores, test_scores = validation_curve(ensemble, train_x, train_y, param_name='n_estimators',
25 |                                              param_range=param_range, cv=10, scoring='accuracy')
26 |
27 | # --- SECTION 3 ---
28 | # Calculate the average and standard deviation for each hyperparameter
29 | train_scores_mean = np.mean(train_scores, axis=1)
30 | train_scores_std = np.std(train_scores, axis=1)
31 | test_scores_mean = np.mean(test_scores, axis=1)
32 | test_scores_std = np.std(test_scores, axis=1)
33 |
34 |
35 | # --- SECTION 4 ---
36 | # Plot the scores
37 | plt.figure()
38 | plt.title('Validation curves (Random Forest)')
39 | # Plot the standard deviations
40 | plt.fill_between(param_range, train_scores_mean - train_scores_std,
41 | train_scores_mean + train_scores_std, alpha=0.1,
42 | color="C1")
43 | plt.fill_between(param_range, test_scores_mean - test_scores_std,
44 | test_scores_mean + test_scores_std, alpha=0.1, color="C0")
45 |
46 | # Plot the means
47 | plt.plot(param_range, train_scores_mean, 'o-', color="C1",
48 | label="Training score")
49 | plt.plot(param_range, test_scores_mean, 'o-', color="C0",
50 | label="Cross-validation score")
51 |
52 | plt.xticks(param_range)
53 | plt.xlabel('Number of trees')
54 | plt.ylabel('Accuracy')
55 | plt.legend(loc="best")
56 |
--------------------------------------------------------------------------------
/Chapter07/rf_regression.py:
--------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Libraries and data loading
3 | from copy import deepcopy
4 | from sklearn.datasets import load_diabetes
5 | from sklearn.ensemble import RandomForestRegressor
6 | from sklearn import metrics
7 |
8 | import numpy as np
9 |
10 | diabetes = load_diabetes()
11 |
12 | train_size = 400
13 | train_x, train_y = diabetes.data[:train_size], diabetes.target[:train_size]
14 | test_x, test_y = diabetes.data[train_size:], diabetes.target[train_size:]
15 |
16 | np.random.seed(123456)
17 |
18 | # --- SECTION 2 ---
19 | # Create the ensemble
20 | ensemble_size = 1000
21 | ensemble = RandomForestRegressor(n_estimators=ensemble_size,
22 | min_samples_leaf=20, n_jobs=4)
23 |
24 | # --- SECTION 3 ---
25 | # Evaluate the ensemble
26 | ensemble.fit(train_x, train_y)
27 | predictions = ensemble.predict(test_x)
28 |
29 | # --- SECTION 4 ---
30 | # Print the metrics
31 | r2 = metrics.r2_score(test_y, predictions)
32 | mse = metrics.mean_squared_error(test_y, predictions)
33 |
34 | print('Random Forest:')
35 | print('R-squared: %.2f' % r2)
36 | print('MSE: %.2f' % mse)
--------------------------------------------------------------------------------
/Chapter08/agglomerative.py:
--------------------------------------------------------------------------------
1 | from scipy.cluster import hierarchy
2 | import matplotlib.pyplot as plt
3 | import numpy as np
4 |
5 | a = np.random.uniform(size=10)
6 | Z = hierarchy.linkage(a, 'single')
7 | plt.figure()
8 | dn = hierarchy.dendrogram(Z)
9 | plt.title('Hierarchical Clustering Dendrogram')
--------------------------------------------------------------------------------
/Chapter08/kmeans_cluster.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 |
4 | from sklearn.cluster import KMeans
5 | from sklearn.datasets import load_breast_cancer
6 | from sklearn.manifold import TSNE
7 | from sklearn import metrics
8 |
9 | np.random.seed(123456)
10 |
11 | bc = load_breast_cancer()
12 | tsne = TSNE()
13 |
14 | data = tsne.fit_transform(bc.data)
15 | reds = bc.target == 0
16 | blues = bc.target == 1
17 | plt.scatter(data[reds, 0], data[reds, 1], label='malignant')
18 | plt.scatter(data[blues, 0], data[blues, 1], label='benign')
19 | plt.xlabel('1st Component')
20 | plt.ylabel('2nd Component')
21 | plt.title('Breast Cancer data')
22 | plt.legend()
23 |
24 |
25 | plt.figure()
26 | plt.title('2, 4, and 6 clusters.')
27 | for clusters in [2, 4, 6]:
28 | km = KMeans(n_clusters=clusters)
29 | preds = km.fit_predict(data)
30 |     plt.subplot(1, 3, clusters // 2)
31 | plt.scatter(*zip(*data), c=preds)
32 |
33 | classified = {x: {'m': 0, 'b': 0} for x in range(clusters)}
34 |
35 | for i in range(len(data)):
36 | cluster = preds[i]
37 | label = bc.target[i]
38 | label = 'm' if label == 0 else 'b'
39 | classified[cluster][label] = classified[cluster][label]+1
40 |
41 | print('-'*40)
42 | for c in classified:
43 | print('Cluster %d. Malignant percentage: ' % c, end=' ')
44 | print(classified[c], end=' ')
45 | print('%.3f' % (classified[c]['m'] /
46 | (classified[c]['m'] + classified[c]['b'])))
47 |
48 | print(metrics.homogeneity_score(bc.target, preds))
49 | print(metrics.silhouette_score(data, preds))
50 |
--------------------------------------------------------------------------------
/Chapter08/kmeans_intro.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 |
4 | from sklearn.cluster import KMeans
5 |
6 | np.random.seed(87654)
7 |
8 | dat = []
9 |
10 | t = 0.5
11 |
12 | for i in range(20):
13 |
14 | # dat.append(np.random.uniform(size=2))
15 | c = np.random.randint(3)
16 | a = np.random.uniform() * 2 * 3.14
17 | r = t * np.sqrt(np.random.uniform())
18 |
19 | x = r * np.cos(a)
20 | y = r * np.sin(a)
21 |
22 |
23 | dat.append([c/4+x, c/4+y])
24 |
25 | plt.figure()
26 | for i in range(1, 5):
27 | np.random.seed(98765432)
28 |
29 |     inits = np.array([[0.95, 0.95], [0.95, 0.95], [0.95, 0.95]])
30 |
31 |
32 | km = KMeans(n_clusters=3, init=inits, max_iter=i, n_init=1)
33 | plt.subplot(2, 2, i)
34 | plt.xticks([])
35 | plt.yticks([])
36 | km.fit(dat)
37 | km.cluster_centers_ = np.sort(km.cluster_centers_, axis=0)
38 | c = km.predict(dat)
39 | plt.scatter(*zip(*dat), c=c)
40 | c = km.fit_predict(km.cluster_centers_)
41 | plt.scatter(*zip(*km.cluster_centers_), c='w', marker='*', s=240, edgecolors='r')
42 | plt.title('Iteration: %d'%i)
43 | print(km.cluster_centers_)
44 |
45 |
--------------------------------------------------------------------------------
/Chapter08/kmeans_raw.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 |
4 | from sklearn.cluster import KMeans
5 | from sklearn.datasets import load_breast_cancer
6 | from sklearn.manifold import TSNE
7 | from sklearn import metrics
8 |
9 | np.random.seed(123456)
10 |
11 | bc = load_breast_cancer()
12 | data = bc.data
13 |
14 |
15 | #plt.figure()
16 | #plt.title('2, 4, and 6 clusters.')
17 | for clusters in [2, 4, 6]:
18 | km = KMeans(n_clusters=clusters)
19 | preds = km.fit_predict(data)
20 | # plt.subplot(1,3,clusters/2)
21 | # plt.scatter(*zip(*data), c=preds)
22 |
23 | classified = {x: {'m': 0, 'b': 0} for x in range(clusters)}
24 |
25 | for i in range(len(data)):
26 | cluster = preds[i]
27 | label = bc.target[i]
28 | label = 'm' if label == 0 else 'b'
29 | classified[cluster][label] = classified[cluster][label]+1
30 |
31 | print('-'*40)
32 | for c in classified:
33 | print('Cluster %d. Malignant percentage: ' % c, end=' ')
34 | print(classified[c], end=' ')
35 | print('%.3f' % (classified[c]['m'] /
36 | (classified[c]['m'] + classified[c]['b'])))
37 |
38 | print(metrics.homogeneity_score(bc.target, preds))
39 | print(metrics.silhouette_score(data, preds))
40 |
--------------------------------------------------------------------------------
/Chapter08/oe_co_occurence.py:
--------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Libraries and data loading
3 | import openensembles as oe
4 | import numpy as np
5 | import pandas as pd
6 | import sklearn.metrics
7 |
8 | from sklearn.datasets import load_breast_cancer
9 |
10 |
11 | bc = load_breast_cancer()
12 |
13 | # --- SECTION 2 ---
14 | # Create the data object
15 | cluster_data = oe.data(pd.DataFrame(bc.data), bc.feature_names)
16 |
17 | np.random.seed(123456)
18 |
19 |
20 | # --- SECTION 3 ---
21 | # Create the ensembles and calculate the homogeneity score
22 | for K in [2, 3, 4, 5, 6, 7]:
23 | for ensemble_size in [3, 4, 5]:
24 | ensemble = oe.cluster(cluster_data)
25 | for i in range(ensemble_size):
26 | name = f'kmeans_{ensemble_size}_{i}'
27 | ensemble.cluster('parent', 'kmeans', name, K)
28 |
29 | preds = ensemble.finish_co_occ_linkage(threshold=0.5)
30 | print(f'K: {K}, size {ensemble_size}:', end=' ')
31 | print('%.2f' % sklearn.metrics.homogeneity_score(
32 | bc.target, preds.labels['co_occ_linkage']))
33 |
34 |
35 |
--------------------------------------------------------------------------------
/Chapter08/oe_graph_closure.py:
--------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Libraries and data loading
3 | import openensembles as oe
4 | import numpy as np
5 | import pandas as pd
6 | import sklearn.metrics
7 |
8 | from sklearn.datasets import load_breast_cancer
9 |
10 | bc = load_breast_cancer()
11 |
12 | # --- SECTION 2 ---
13 | # Create the data object
14 | cluster_data = oe.data(pd.DataFrame(bc.data), bc.feature_names)
15 |
16 | np.random.seed(123456)
17 | # --- SECTION 3 ---
18 | # Create the ensembles and calculate the homogeneity score
19 | for K in [2, 3, 4, 5, 6, 7]:
20 | for ensemble_size in [3, 4, 5]:
21 | ensemble = oe.cluster(cluster_data)
22 | for i in range(ensemble_size):
23 | name = f'kmeans_{ensemble_size}_{i}'
24 | ensemble.cluster('parent', 'kmeans', name, K)
25 |
26 | preds = ensemble.finish_graph_closure(threshold=0.5)
27 | print(f'K: {K}, size {ensemble_size}:', end=' ')
28 | print('%.2f' % sklearn.metrics.homogeneity_score(
29 | bc.target, preds.labels['graph_closure']))
30 |
31 |
32 |
--------------------------------------------------------------------------------
/Chapter08/oe_vote.py:
--------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Libraries and data loading
3 | import openensembles as oe
4 | import numpy as np
5 | import pandas as pd
6 | import sklearn.metrics
7 |
8 | from sklearn.datasets import load_breast_cancer
9 |
10 |
11 | bc = load_breast_cancer()
12 |
13 | # --- SECTION 2 ---
14 | # Create the data object
15 | cluster_data = oe.data(pd.DataFrame(bc.data), bc.feature_names)
16 |
17 | np.random.seed(123456)
18 | # --- SECTION 3 ---
19 | # Create the ensembles and calculate the homogeneity score
20 | for K in [2, 3, 4, 5, 6, 7]:
21 | for ensemble_size in [3, 4, 5]:
22 | ensemble = oe.cluster(cluster_data)
23 | for i in range(ensemble_size):
24 | name = f'kmeans_{ensemble_size}_{i}'
25 | ensemble.cluster('parent', 'kmeans', name, K)
26 |
27 | preds = ensemble.finish_majority_vote(threshold=0.5)
28 | print(f'K: {K}, size {ensemble_size}:', end=' ')
29 | print('%.2f' % sklearn.metrics.homogeneity_score(
30 | bc.target, preds.labels['majority_vote']))
31 |
32 |
33 |
--------------------------------------------------------------------------------
/Chapter08/oe_vote_tsne.py:
--------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Libraries and data loading
3 | import openensembles as oe
4 | import numpy as np
5 | import pandas as pd
6 | import sklearn.metrics
7 |
8 | from sklearn.datasets import load_breast_cancer
9 | from sklearn.manifold import TSNE
10 |
11 | bc = load_breast_cancer()
12 | t = TSNE()
13 | # --- SECTION 2 ---
14 | # Create the data object
15 | cluster_data = oe.data(pd.DataFrame(t.fit_transform(bc.data)), [0,1])
16 |
17 | np.random.seed(123456)
18 | # --- SECTION 3 ---
19 | # Create the ensembles and calculate the homogeneity score
20 | for K in [2, 3, 4, 5, 6, 7]:
21 | for ensemble_size in [3, 4, 5]:
22 | ensemble = oe.cluster(cluster_data)
23 | for i in range(ensemble_size):
24 | name = f'kmeans_{ensemble_size}_{i}'
25 | ensemble.cluster('parent', 'kmeans', name, K)
26 |
27 | preds = ensemble.finish_majority_vote(threshold=0.5)
28 | print(f'K: {K}, size {ensemble_size}:', end=' ')
29 | print('%.2f' % sklearn.metrics.homogeneity_score(
30 | bc.target, preds.labels['majority_vote']))
31 |
32 |
33 |
--------------------------------------------------------------------------------
/Chapter08/voting_example.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 |
4 | from sklearn.cluster import KMeans
5 |
6 | np.random.seed(123456)
7 |
8 | dat = []
9 |
10 | t = 0.5
11 |
12 | for i in range(10):
13 |
14 | # dat.append(np.random.uniform(size=2))
15 | c = np.random.randint(3)
16 | a = np.random.uniform() * 2 * 3.14
17 | r = t * np.sqrt(np.random.uniform())
18 |
19 | x = r * np.cos(a)
20 | y = r * np.sin(a)
21 |
22 |
23 | dat.append([c/2+x, c/2+y])
24 |
25 |
26 | clusterers = []
27 | for _ in range(3):
28 | km = KMeans(n_clusters=3)
29 | noise = np.random.uniform(low=-0.5, high=0.5, size=(3,2))
30 | km.cluster_centers_ = np.array([[0, 0], [0.5, 0.5], [1, 1]]) + noise
31 | clusterers.append(km)
32 |
33 |
34 |
35 | plt.figure()
36 | for i, clusterer in enumerate(clusterers):
37 |
38 | plt.subplot(1, 3, i+1)
39 | plt.xticks([])
40 | plt.yticks([])
41 | c = clusterer.predict(dat)
42 | print(c)
43 | plt.scatter(*zip(*dat), c=c)
44 | c = clusterer.predict(clusterer.cluster_centers_)
45 | plt.scatter(*zip(*clusterer.cluster_centers_), c='w', marker='*', s=240, edgecolors='r')
46 | plt.title('Clustering: %d'%i)
47 |
48 |
49 | plt.figure()
50 | dat = np.array(dat)
51 | plt.xticks([])
52 | plt.yticks([])
53 | c = np.array([0, 0, 1, 1, 1, 0, 0, 0, 0, 1])
54 | plt.scatter(*zip(*dat[c == 0]), c='C0', label='Cluster 0')
55 | plt.scatter(*zip(*dat[c == 1]), c='C1', label='Cluster 1')
56 | plt.legend()
57 | plt.title('Voting Clustering')
58 |
59 |
60 |
--------------------------------------------------------------------------------
/Chapter09/adaboost.py:
--------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Libraries and data loading
3 | import numpy as np
4 | import pandas as pd
5 |
6 | from sklearn.ensemble import AdaBoostClassifier
7 | from sklearn.model_selection import train_test_split
8 | from sklearn.utils import shuffle
9 | from sklearn import metrics
10 |
11 |
12 |
13 | np.random.seed(123456)
14 | data = pd.read_csv('creditcard.csv')
15 | data.Time = (data.Time-data.Time.min())/data.Time.std()
16 | data.Amount = (data.Amount-data.Amount.mean())/data.Amount.std()
17 |
18 | # Train-Test split of 70%-30%
19 | x_train, x_test, y_train, y_test = train_test_split(
20 | data.drop('Class', axis=1).values, data.Class.values, test_size=0.3)
21 |
22 | # --- SECTION 2 ---
23 | # Ensemble evaluation
24 | ensemble = AdaBoostClassifier(n_estimators=70, learning_rate=1.0)
25 |
26 | ensemble.fit(x_train, y_train)
27 |
28 | print('AdaBoost f1', metrics.f1_score(y_test, ensemble.predict(x_test)))
29 | print('AdaBoost recall', metrics.recall_score(y_test, ensemble.predict(x_test)))
30 |
31 |
32 |
33 | # --- SECTION 3 ---
34 | # Filter features according to their correlation to the target
35 | np.random.seed(123456)
36 | threshold = 0.1
37 |
38 | correlations = data.corr()['Class'].drop('Class')
39 | fs = list(correlations[(abs(correlations)>threshold)].index.values)
40 | fs.append('Class')
41 | data = data[fs]
42 |
43 | x_train, x_test, y_train, y_test = train_test_split(
44 | data.drop('Class', axis=1).values, data.Class.values, test_size=0.3)
45 |
46 | ensemble = AdaBoostClassifier(n_estimators=70, learning_rate=1.0)
47 |
48 | ensemble.fit(x_train, y_train)
49 |
50 | print('AdaBoost f1', metrics.f1_score(y_test, ensemble.predict(x_test)))
51 | print('AdaBoost recall', metrics.recall_score(y_test, ensemble.predict(x_test)))
52 |
--------------------------------------------------------------------------------
/Chapter09/bagging.py:
--------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Libraries and data loading
3 | import numpy as np
4 | import pandas as pd
5 |
6 | from sklearn.ensemble import BaggingClassifier
7 | from sklearn.tree import DecisionTreeClassifier
8 | from sklearn.model_selection import train_test_split
9 | from sklearn import metrics
10 |
11 |
12 |
13 |
14 | np.random.seed(123456)
15 | data = pd.read_csv('creditcard.csv')
16 | data.Time = (data.Time-data.Time.min())/data.Time.std()
17 | data.Amount = (data.Amount-data.Amount.mean())/data.Amount.std()
18 |
19 | # Train-Test split of 70%-30%
20 | x_train, x_test, y_train, y_test = train_test_split(
21 | data.drop('Class', axis=1).values, data.Class.values, test_size=0.3)
22 |
23 | # --- SECTION 2 ---
24 | # Ensemble evaluation
25 | ensemble = BaggingClassifier(n_estimators=10,
26 | base_estimator=DecisionTreeClassifier(max_depth=8))
27 |
28 | ensemble.fit(x_train, y_train)
29 |
30 | print('Bagging f1', metrics.f1_score(y_test, ensemble.predict(x_test)))
31 | print('Bagging recall', metrics.recall_score(y_test, ensemble.predict(x_test)))
32 |
33 |
34 |
35 | # --- SECTION 3 ---
36 | # Filter features according to their correlation to the target
37 | np.random.seed(123456)
38 | threshold = 0.1
39 |
40 | correlations = data.corr()['Class'].drop('Class')
41 | fs = list(correlations[(abs(correlations)>threshold)].index.values)
42 | fs.append('Class')
43 | data = data[fs]
44 |
45 | x_train, x_test, y_train, y_test = train_test_split(
46 | data.drop('Class', axis=1).values, data.Class.values, test_size=0.3)
47 |
48 | ensemble = BaggingClassifier(n_estimators=10,
49 | base_estimator=DecisionTreeClassifier(max_depth=8))
50 |
51 | ensemble.fit(x_train, y_train)
52 |
53 | print('Bagging f1', metrics.f1_score(y_test, ensemble.predict(x_test)))
54 | print('Bagging recall', metrics.recall_score(y_test, ensemble.predict(x_test)))
55 |
--------------------------------------------------------------------------------
/Chapter09/base.py:
--------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Libraries and data loading
3 | import numpy as np
4 | import pandas as pd
5 |
6 | from sklearn.tree import DecisionTreeClassifier
7 | from sklearn.linear_model import LogisticRegression
8 | from sklearn.naive_bayes import GaussianNB
9 | from sklearn.model_selection import train_test_split
10 | from sklearn import metrics
11 |
12 |
13 |
14 |
15 | np.random.seed(123456)
16 | data = pd.read_csv('creditcard.csv')
17 | data.Time = (data.Time-data.Time.min())/data.Time.std()
18 | data.Amount = (data.Amount-data.Amount.mean())/data.Amount.std()
19 |
20 | # Train-Test split of 70%-30%
21 | x_train, x_test, y_train, y_test = train_test_split(
22 | data.drop('Class', axis=1).values, data.Class.values, test_size=0.3)
23 |
24 | # --- SECTION 2 ---
25 | # Base learners evaluation
26 | base_classifiers = [('DT', DecisionTreeClassifier(max_depth=6)),
27 | ('NB', GaussianNB()),
28 | ('LR', LogisticRegression())]
29 |
30 | for bc in base_classifiers:
31 | lr = bc[1]
32 | lr.fit(x_train, y_train)
33 |
34 | predictions = lr.predict(x_test)
35 | print(bc[0]+' f1', metrics.f1_score(y_test, predictions))
36 | print(bc[0]+' recall', metrics.recall_score(y_test, predictions))
37 | print(metrics.confusion_matrix(y_test, predictions))
38 |
39 | # --- SECTION 3 ---
40 | # Filter features according to their correlation to the target
41 | np.random.seed(123456)
42 | threshold = 0.1
43 |
44 | correlations = data.corr()['Class'].drop('Class')
45 | fs = list(correlations[(abs(correlations)>threshold)].index.values)
46 | fs.append('Class')
47 | data = data[fs]
48 |
49 | x_train, x_test, y_train, y_test = train_test_split(
50 | data.drop('Class', axis=1).values, data.Class.values, test_size=0.3)
51 |
52 | for bc in base_classifiers:
53 | lr = bc[1]
54 | lr.fit(x_train, y_train)
55 |
56 | predictions = lr.predict(x_test)
57 | print(bc[0]+' f1', metrics.f1_score(y_test, predictions))
58 | print(bc[0]+' recall', metrics.recall_score(y_test, predictions))
59 | print(metrics.confusion_matrix(y_test, predictions))
--------------------------------------------------------------------------------
/Chapter09/dt_optimize.py:
--------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Libraries and data loading
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 | import pandas as pd
6 |
7 | from sklearn.tree import DecisionTreeClassifier
8 | from sklearn.linear_model import LogisticRegression
9 | from sklearn.naive_bayes import GaussianNB
10 | from sklearn.model_selection import train_test_split
11 | from sklearn import metrics
12 |
13 |
14 |
15 |
16 | np.random.seed(123456)
17 | data = pd.read_csv('creditcard.csv')
18 | data.Time = (data.Time-data.Time.min())/data.Time.std()
19 | data.Amount = (data.Amount-data.Amount.mean())/data.Amount.std()
20 |
21 | # Train-Test split of 70%-30%
22 | x_train, x_test, y_train, y_test = train_test_split(
23 | data.drop('Class', axis=1).values, data.Class.values, test_size=0.3)
24 |
25 | # --- SECTION 2 ---
26 | # Base learners evaluation
27 | base_classifiers = [('DT', DecisionTreeClassifier(max_depth=6)),
28 | ('NB', GaussianNB()),
29 | ('LR', LogisticRegression())]
30 |
31 | raw_f1 = []
32 | raw_recall = []
33 | range_ = [x for x in range(3,12)]
34 | for max_d in range_:
35 | lr = DecisionTreeClassifier(max_depth=max_d)
36 | lr.fit(x_train, y_train)
37 |
38 | predictions = lr.predict(x_test)
39 | raw_f1.append(metrics.f1_score(y_test, predictions))
40 | raw_recall.append(metrics.recall_score(y_test, predictions))
41 |
42 | plt.plot(range_, raw_f1, label='Raw F1')
43 | plt.plot(range_, raw_recall, label='Raw Recall')
44 | print(raw_f1)
45 | print(raw_recall)
46 | # --- SECTION 3 ---
47 | # Filter features according to their correlation to the target
48 | np.random.seed(123456)
49 | threshold = 0.1
50 |
51 | correlations = data.corr()['Class'].drop('Class')
52 | fs = list(correlations[(abs(correlations)>threshold)].index.values)
53 | fs.append('Class')
54 | data = data[fs]
55 |
56 | x_train, x_test, y_train, y_test = train_test_split(data.drop('Class', axis=1).values, data.Class.values, test_size=0.3)
57 |
58 | filter_f1 = []
59 | filter_recall = []
60 | for max_d in range_:
61 | lr = DecisionTreeClassifier(max_depth=max_d)
62 | lr.fit(x_train, y_train)
63 |
64 | predictions = lr.predict(x_test)
65 | filter_f1.append(metrics.f1_score(y_test, predictions))
66 | filter_recall.append(metrics.recall_score(y_test, predictions))
67 |
68 | print(filter_f1)
69 | print(filter_recall)
70 |
71 | plt.plot(range_, filter_f1, label='Filtered F1')
72 | plt.plot(range_, filter_recall, label='Filtered Recall')
73 | plt.legend()
--------------------------------------------------------------------------------
/Chapter09/exploratory.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import pandas as pd
3 |
4 | from sklearn.model_selection import train_test_split
5 |
6 | import warnings
7 |
8 | warnings.filterwarnings("ignore")
9 |
10 | data = pd.read_csv('creditcard.csv')
11 |
12 | data.Time = (data.Time-data.Time.min())/data.Time.std()
13 | data.Amount = (data.Amount-data.Amount.mean())/data.Amount.std()
14 |
15 | plt.figure()
16 | data.groupby('Class').V1.count().plot(kind='bar')
17 | plt.title('0-1 Class distribution')
18 |
19 | plt.figure()
20 | ax = data.Amount.hist(grid=False, bins=50)
21 | ax.set_yscale("log", nonposy='clip')
22 | plt.title('Amount')
23 |
24 | plt.figure()
25 | data.Time.hist(grid=False, bins=50)
26 | plt.title('Time')
27 |
28 | plt.figure()
29 | correlations = data.corr()['Class'].drop('Class')
30 | correlations.sort_values().plot(kind='bar')
31 | plt.title('Correlations to Class')
32 |
33 |
34 |
35 |
36 |
37 | frauds = data[data.Class == 1]
38 | non_frauds = data[data.Class == 0]
39 |
40 | frauds_no = len(frauds)
41 |
42 | balanced_data = pd.concat([frauds, non_frauds.sample(frauds_no)])
43 |
44 | plt.figure()
45 | balanced_data.groupby('Class').V1.count().plot(kind='bar')
46 | plt.title('0-1 Class distribution (subsampled)')
47 |
48 | plt.figure()
49 | ax = balanced_data.Amount.hist(grid=False, bins=50)
50 | ax.set_yscale("log", nonposy='clip')
51 | plt.title('Amount (subsampled)')
52 |
53 | plt.figure()
54 | correlations = balanced_data.corr()['Class'].drop('Class')
55 | correlations.sort_values().plot(kind='bar')
56 | plt.title('Correlations to Class (subsampled)')
57 |
58 |
59 |
60 |
61 |
--------------------------------------------------------------------------------
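
The exploratory plots above show how heavily skewed the credit card dataset is toward the non-fraud class. As a quick numeric sanity check, the following is a minimal sketch (assuming the same creditcard.csv used throughout Chapter09) that prints the class counts and the fraud percentage directly:

import pandas as pd

data = pd.read_csv('creditcard.csv')
# Absolute counts per class and the percentage of fraudulent transactions
print(data.Class.value_counts())
print('Fraud ratio: %.4f%%' % (100 * data.Class.mean()))
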
/Chapter09/logistic_regression.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed May 15 19:20:55 2019
4 |
5 | @author: George Kyriakides
6 | ge.kyriakides@gmail.com
7 | """
8 | import numpy as np
9 | import pandas as pd
10 |
11 |
12 | from sklearn.linear_model import LogisticRegression
13 | from sklearn.model_selection import train_test_split
14 | from sklearn.utils import shuffle
15 | from sklearn import metrics
16 |
17 |
18 | np.random.seed(123456)
19 | data = pd.read_csv('creditcard.csv')
20 | data.Time = (data.Time-data.Time.min())/data.Time.std()
21 | data.Amount = (data.Amount-data.Amount.mean())/data.Amount.std()
22 |
23 | x_train, x_test, y_train, y_test = train_test_split(data.drop('Class', axis=1).values, data.Class.values, test_size=0.3)
24 |
25 |
26 | X, Y = shuffle(x_train, y_train)
27 |
28 | lr = LogisticRegression()
29 | lr.fit(X, Y)
30 |
31 | print('f1', metrics.f1_score(y_test, lr.predict(x_test)))
32 | print('recall', metrics.recall_score(y_test, lr.predict(x_test)))
33 |
34 |
35 | # =============================================================================
36 | # Selected Features
37 | # =============================================================================
38 |
39 | np.random.seed(123456)
40 | threshold = 0.1
41 |
42 | correlations = data.corr()['Class'].drop('Class')
43 | fs = list(correlations[(abs(correlations)>threshold)].index.values)
44 | fs.append('Class')
45 | data = data[fs]
46 |
47 | x_train, x_test, y_train, y_test = train_test_split(data.drop('Class', axis=1).values, data.Class.values, test_size=0.3)
48 |
49 | X, Y = shuffle(x_train, y_train)
50 | lr = LogisticRegression()
51 | lr.fit(X, Y)
52 |
53 | print('f1', metrics.f1_score(y_test, lr.predict(x_test)))
54 | print('recall', metrics.recall_score(y_test, lr.predict(x_test)))
55 |
--------------------------------------------------------------------------------
/Chapter09/random_forest.py:
--------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Libraries and data loading
3 | import numpy as np
4 | import pandas as pd
5 |
6 |
7 | from sklearn.ensemble import RandomForestClassifier
8 | from sklearn.model_selection import train_test_split
9 | from sklearn.utils import shuffle
10 | from sklearn import metrics
11 |
12 |
13 | np.random.seed(123456)
14 | data = pd.read_csv('creditcard.csv')
15 | data.Time = (data.Time-data.Time.min())/data.Time.std()
16 | data.Amount = (data.Amount-data.Amount.mean())/data.Amount.std()
17 |
23 | # Train-Test split of 70%-30%
24 | x_train, x_test, y_train, y_test = train_test_split(
25 | data.drop('Class', axis=1).values, data.Class.values, test_size=0.3)
26 |
27 | # --- SECTION 2 ---
28 | # Ensemble evaluation
29 | ensemble = RandomForestClassifier(criterion='entropy', n_jobs=4)
30 |
31 | ensemble.fit(x_train, y_train)
32 |
33 | print('RF f1', metrics.f1_score(y_test, ensemble.predict(x_test)))
34 | print('RF recall', metrics.recall_score(y_test, ensemble.predict(x_test)))
35 |
36 |
37 |
38 | # --- SECTION 3 ---
39 | # Filter features according to their correlation to the target
40 | np.random.seed(123456)
41 | threshold = 0.1
42 |
43 | correlations = data.corr()['Class'].drop('Class')
44 | fs = list(correlations[(abs(correlations)>threshold)].index.values)
45 | fs.append('Class')
46 | data = data[fs]
47 |
48 | x_train, x_test, y_train, y_test = train_test_split(
49 | data.drop('Class', axis=1).values, data.Class.values, test_size=0.3)
50 |
51 | ensemble = RandomForestClassifier(criterion='entropy', n_jobs=4)
52 |
53 | ensemble.fit(x_train, y_train)
54 |
55 | print('RF f1', metrics.f1_score(y_test, ensemble.predict(x_test)))
56 | print('RF recall', metrics.recall_score(y_test, ensemble.predict(x_test)))
--------------------------------------------------------------------------------
/Chapter09/stacking.py:
--------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Libraries and data loading
3 | import numpy as np
4 | import pandas as pd
5 |
6 | from stacking_classifier import Stacking
7 | from sklearn.tree import DecisionTreeClassifier
8 | from sklearn.linear_model import LogisticRegression
9 | from sklearn.naive_bayes import GaussianNB
10 | from sklearn.svm import LinearSVC
11 | from sklearn.model_selection import train_test_split
12 | from sklearn import metrics
13 |
14 |
15 | np.random.seed(123456)
16 | data = pd.read_csv('creditcard.csv')
17 | data.Time = (data.Time-data.Time.min())/data.Time.std()
18 | data.Amount = (data.Amount-data.Amount.mean())/data.Amount.std()
19 |
20 | # Train-Test split of 70%-30%
21 | x_train, x_test, y_train, y_test = train_test_split(
22 | data.drop('Class', axis=1).values, data.Class.values, test_size=0.3)
23 |
24 | # --- SECTION 2 ---
25 | # Ensemble evaluation
26 | base_classifiers = [DecisionTreeClassifier(max_depth=5),
27 | GaussianNB(),
28 | LogisticRegression(),
29 | DecisionTreeClassifier(max_depth=3),
30 | DecisionTreeClassifier(max_depth=8)]
31 |
32 | l1_classifiers = [DecisionTreeClassifier(max_depth=2), LinearSVC()]
33 |
34 | ensemble = Stacking(learner_levels=[base_classifiers, l1_classifiers,
35 | [LogisticRegression()]])
36 |
37 |
38 | ensemble.fit(x_train, y_train)
39 |
40 | print('Stacking f1', metrics.f1_score(y_test, ensemble.predict(x_test)))
41 | print('Stacking recall', metrics.recall_score(y_test, ensemble.predict(x_test)))
42 |
43 |
44 | # --- SECTION 3 ---
45 | # Filter features according to their correlation to the target
46 | np.random.seed(123456)
47 | threshold = 0.1
48 |
49 | correlations = data.corr()['Class'].drop('Class')
50 | fs = list(correlations[(abs(correlations) > threshold)].index.values)
51 | fs.append('Class')
52 | data = data[fs]
53 |
54 | x_train, x_test, y_train, y_test = train_test_split(
55 | data.drop('Class', axis=1).values, data.Class.values, test_size=0.3)
56 |
57 | ensemble = Stacking(learner_levels=[base_classifiers, l1_classifiers,
58 | [LogisticRegression()]])
59 |
60 | ensemble.fit(x_train, y_train)
61 |
62 | print('Stacking f1', metrics.f1_score(y_test, ensemble.predict(x_test)))
63 | print('Stacking recall', metrics.recall_score(y_test, ensemble.predict(x_test)))
64 |
--------------------------------------------------------------------------------
/Chapter09/stacking_classifier.py:
--------------------------------------------------------------------------------
1 | from sklearn.base import BaseEstimator
2 | from copy import deepcopy
3 | from sklearn.model_selection import KFold
4 |
5 | import numpy as np
6 |
7 | class Stacking(BaseEstimator):
8 |
9 | # --- SECTION 2 ---
10 | # The constructor
11 | def __init__(self, learner_levels):
12 | # Create a list of sizes for each stacking level
13 | # And a list of deep copied learners
14 | self.level_sizes = []
15 | self.learners = []
16 | self.learner_levels = learner_levels
17 | for learning_level in self.learner_levels:
18 |
19 | self.level_sizes.append(len(learning_level))
20 | level_learners = []
21 | for learner in learning_level:
22 | level_learners.append(deepcopy(learner))
23 | self.learners.append(level_learners)
24 |
25 |
26 |
27 | # --- SECTION 3 ---
28 | # The fit function. Creates training meta data for every level and trains
29 | # each level on the previous level's meta data
30 | def fit(self, x, y):
31 | # Create a list of training meta data, one for each stacking level
32 | # and another one for the targets. For the first level, the actual data
33 | # is used.
34 | meta_data = [x]
35 | meta_targets = [y]
36 | for i in range(len(self.learners)):
37 | level_size = self.level_sizes[i]
38 |
39 | # Create the meta data and target variables for this level
40 | data_z = np.zeros((level_size, len(x)))
41 | target_z = np.zeros(len(x))
42 |
43 | train_x = meta_data[i]
44 | train_y = meta_targets[i]
45 |
46 | # Create the cross-validation folds
47 | KF = KFold(n_splits=3)
48 | meta_index = 0
49 | for train_indices, test_indices in KF.split(x):
50 | # Train each learner on the K-1 folds and create
51 | # meta data for the Kth fold
52 | for j in range(len(self.learners[i])):
53 |
54 | learner = self.learners[i][j]
55 | learner.fit(train_x[train_indices], train_y[train_indices])
56 | predictions = learner.predict(train_x[test_indices])
57 |
58 | data_z[j][meta_index:meta_index+len(test_indices)] = predictions
59 |
60 | target_z[meta_index:meta_index+len(test_indices)] = train_y[test_indices]
61 | meta_index += len(test_indices)
62 |
63 | # Add the data and targets to the meta data lists
64 | data_z = data_z.transpose()
65 | meta_data.append(data_z)
66 | meta_targets.append(target_z)
67 |
68 |
69 | # Train the learner on the whole previous meta data
70 | for learner in self.learners[i]:
71 | learner.fit(train_x, train_y)
72 |
73 |
74 |
75 |
76 |
77 |
78 | # --- SECTION 4 ---
79 |     # The predict function. Creates meta data for the test data and returns
80 |     # the final level's predictions, i.e. meta_data[-1]
81 | def predict(self, x):
82 |
83 | # Create a list of training meta data, one for each stacking level
84 | meta_data = [x]
85 | for i in range(len(self.learners)):
86 | level_size = self.level_sizes[i]
87 |
88 | data_z = np.zeros((level_size, len(x)))
89 |
90 | test_x = meta_data[i]
91 |
92 | # Create the cross-validation folds
93 | KF = KFold(n_splits=3)
94 | for train_indices, test_indices in KF.split(x):
95 |                 # Have each learner of this level predict on the
96 |                 # previous level's meta data
97 | for j in range(len(self.learners[i])):
98 |
99 | learner = self.learners[i][j]
100 | predictions = learner.predict(test_x)
101 | data_z[j] = predictions
102 |
103 |
104 |
105 | # Add the data and targets to the meta data lists
106 | data_z = data_z.transpose()
107 | meta_data.append(data_z)
108 |
109 |         # Return the final level's predictions; they are stored
110 |         # in meta_data[-1]
111 |         return meta_data[-1]
112 |
113 | def predict_proba(self, x):
114 |
115 | # Create a list of training meta data, one for each stacking level
116 | meta_data = [x]
117 | for i in range(len(self.learners)-1):
118 | level_size = self.level_sizes[i]
119 |
120 | data_z = np.zeros((level_size, len(x)))
121 |
122 | test_x = meta_data[i]
123 |
124 | # Create the cross-validation folds
125 | KF = KFold(n_splits=5)
126 | for train_indices, test_indices in KF.split(x):
127 |                 # Have each learner of this level predict on the
128 |                 # previous level's meta data
129 | for j in range(len(self.learners[i])):
130 |
131 | learner = self.learners[i][j]
132 | predictions = learner.predict(test_x)
133 | data_z[j] = predictions
134 |
135 |
136 |
137 | # Add the data and targets to the meta data lists
138 | data_z = data_z.transpose()
139 | meta_data.append(data_z)
140 |
141 | learner = self.learners[-1][-1]
142 | return learner.predict_proba(meta_data[-1])
--------------------------------------------------------------------------------
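
The Stacking class also exposes predict_proba, which none of the chapter's scripts exercise. The following is a minimal sketch of how it could feed a probability-based metric such as ROC AUC, assuming the same creditcard.csv layout used by the other Chapter09 scripts:

import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from stacking_classifier import Stacking

np.random.seed(123456)
data = pd.read_csv('creditcard.csv')
data.Time = (data.Time-data.Time.min())/data.Time.std()
data.Amount = (data.Amount-data.Amount.mean())/data.Amount.std()

x_train, x_test, y_train, y_test = train_test_split(
        data.drop('Class', axis=1).values, data.Class.values, test_size=0.3)

# Two stacking levels: decision tree + naive Bayes, then logistic regression
ensemble = Stacking(learner_levels=[[DecisionTreeClassifier(max_depth=5), GaussianNB()],
                                    [LogisticRegression()]])
ensemble.fit(x_train, y_train)

# Probability of the positive (fraud) class from the final-level learner
probs = ensemble.predict_proba(x_test)[:, 1]
print('Stacking ROC AUC', metrics.roc_auc_score(y_test, probs))
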
/Chapter09/unrelated_presentation_phd.py:
--------------------------------------------------------------------------------
1 | # Assumes a pandas DataFrame 'publisher' with a 'Publisher' column is already loaded in the session
2 | percentage_f = lambda x: '%.2f %%'%x if x>2.5 else ''
3 |
4 |
5 | counts = publisher.groupby('Publisher').Publisher.count()
6 | sorted_vals = counts.sort_values(ascending=False)
7 |
8 | explode = [0.6 if x<10 else 0 for x in sorted_vals.values ]
9 |
10 | sorted_vals.plot.pie(autopct=percentage_f, explode=explode)
--------------------------------------------------------------------------------
/Chapter09/voting.py:
--------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Libraries and data loading
3 | import numpy as np
4 | import pandas as pd
5 |
6 | from sklearn.ensemble import VotingClassifier
7 | from sklearn.tree import DecisionTreeClassifier
8 | from sklearn.linear_model import LogisticRegression
9 | from sklearn.naive_bayes import GaussianNB
10 | from sklearn.model_selection import train_test_split
11 | from sklearn import metrics
12 |
13 |
14 |
15 |
16 | np.random.seed(123456)
17 | data = pd.read_csv('creditcard.csv')
18 | data.Time = (data.Time-data.Time.min())/data.Time.std()
19 | data.Amount = (data.Amount-data.Amount.mean())/data.Amount.std()
20 |
21 | # Train-Test split of 70%-30%
22 | x_train, x_test, y_train, y_test = train_test_split(
23 | data.drop('Class', axis=1).values, data.Class.values, test_size=0.3)
24 |
25 | # --- SECTION 2 ---
26 | # Ensemble evaluation
27 | base_classifiers = [('DT', DecisionTreeClassifier(max_depth=5)),
28 | ('NB', GaussianNB()),
29 |                     ('LR', LogisticRegression()),
30 | ('DT2', DecisionTreeClassifier(max_depth=3)),
31 | ('DT3', DecisionTreeClassifier(max_depth=8))]
32 |
33 | ensemble = VotingClassifier(base_classifiers)
34 | ensemble.fit(x_train, y_train)
35 |
36 | print('Voting f1', metrics.f1_score(y_test, ensemble.predict(x_test)))
37 | print('Voting recall', metrics.recall_score(y_test, ensemble.predict(x_test)))
38 |
39 |
40 |
41 | # --- SECTION 3 ---
42 | # Filter features according to their correlation to the target
43 | np.random.seed(123456)
44 | threshold = 0.1
45 |
46 | correlations = data.corr()['Class'].drop('Class')
47 | fs = list(correlations[(abs(correlations)>threshold)].index.values)
48 | fs.append('Class')
49 | data = data[fs]
50 |
51 | x_train, x_test, y_train, y_test = train_test_split(
52 | data.drop('Class', axis=1).values, data.Class.values, test_size=0.3)
53 |
54 | ensemble = VotingClassifier(base_classifiers)
55 | ensemble.fit(x_train, y_train)
56 |
57 | print('Voting f1', metrics.f1_score(y_test, ensemble.predict(x_test)))
58 | print('Voting recall', metrics.recall_score(y_test, ensemble.predict(x_test)))
59 |
--------------------------------------------------------------------------------
/Chapter09/xgboosting.py:
--------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Libraries and data loading
3 | import numpy as np
4 | import pandas as pd
5 |
6 |
7 | from sklearn.model_selection import train_test_split
8 | from sklearn.utils import shuffle
9 | from sklearn import metrics
10 |
11 | from xgboost import XGBClassifier
12 |
13 |
14 | np.random.seed(123456)
15 | data = pd.read_csv('creditcard.csv')
16 | data.Time = (data.Time-data.Time.min())/data.Time.std()
17 | data.Amount = (data.Amount-data.Amount.mean())/data.Amount.std()
18 |
20 | # Train-Test split of 70%-30%
20 | x_train, x_test, y_train, y_test = train_test_split(
21 | data.drop('Class', axis=1).values, data.Class.values, test_size=0.3)
22 |
23 | # --- SECTION 2 ---
24 | # Ensemble evaluation
25 | ensemble = XGBClassifier(max_depth=5, n_jobs=4)
26 |
27 | ensemble.fit(x_train, y_train)
28 |
29 | print('XGB f1', metrics.f1_score(y_test, ensemble.predict(x_test)))
30 | print('XGB recall', metrics.recall_score(y_test, ensemble.predict(x_test)))
31 |
32 |
33 |
34 | # --- SECTION 3 ---
35 | # Filter features according to their correlation to the target
36 | np.random.seed(123456)
37 | threshold = 0.1
38 |
39 | correlations = data.corr()['Class'].drop('Class')
40 | fs = list(correlations[(abs(correlations)>threshold)].index.values)
41 | fs.append('Class')
42 | data = data[fs]
43 |
44 | x_train, x_test, y_train, y_test = train_test_split(
45 | data.drop('Class', axis=1).values, data.Class.values, test_size=0.3)
46 |
47 | ensemble = XGBClassifier(max_depth=5, n_jobs=4)
48 |
49 | ensemble.fit(x_train, y_train)
50 |
51 | print('XGB f1', metrics.f1_score(y_test, ensemble.predict(x_test)))
52 | print('XGB recall', metrics.recall_score(y_test, ensemble.predict(x_test)))
53 |
--------------------------------------------------------------------------------
/Chapter10/bagging.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 |
4 | from simulator import simulate
5 | from sklearn import metrics
6 | from sklearn.ensemble import BaggingRegressor
7 | from sklearn.tree import DecisionTreeRegressor
8 | from sklearn.model_selection import train_test_split
9 |
10 | np.random.seed(123456)
11 |
12 | lr = BaggingRegressor(base_estimator=DecisionTreeRegressor(max_depth=1))
13 |
14 | data = pd.read_csv('BTC-USD.csv')
15 | data = data.dropna()
16 | data.Date = pd.to_datetime(data.Date)
17 | data.set_index('Date', drop=True, inplace=True)
18 | diffs = (data.Close.diff()/data.Close).values[1:]
19 |
20 | diff_len = len(diffs)
21 |
22 |
23 |
24 | def create_x_data(lags=1):
25 | diff_data = np.zeros((diff_len, lags))
26 |
27 | for lag in range(1, lags+1):
28 | this_data = diffs[:-lag]
29 | diff_data[lag:, lag-1] = this_data
30 |
31 | return diff_data
32 |
33 | x_data = create_x_data(lags=20)*100
34 | y_data = diffs*100
35 |
36 | # REPRODUCIBILITY
37 | x_data = np.around(x_data, decimals=8)
38 | y_data = np.around(y_data, decimals=8)
39 |
40 | # =============================================================================
41 | # WALK FORWARD
42 | # =============================================================================
43 |
44 | window = 150
45 | preds = np.zeros(diff_len-window)
46 | for i in range(diff_len-window-1):
47 | x_train = x_data[i:i+window, :]
48 | y_train = y_data[i:i+window]
49 | lr.fit(x_train, y_train)
50 | preds[i] = lr.predict(x_data[i+window+1, :].reshape(1, -1))
51 |
52 |
53 | print('Percentages MSE: %.2f'%metrics.mean_squared_error(y_data[window:], preds))
54 | simulate(data, preds)
55 |
56 |
57 |
58 |
--------------------------------------------------------------------------------
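
The create_x_data helper above packs the lagged percentage changes into a feature matrix in a fairly terse way: column k holds the series shifted by k+1 steps, with zeros where not enough history exists yet. A tiny worked sketch on a made-up five-point series (hypothetical values, for illustration only) makes the layout explicit:

import numpy as np

diffs = np.array([0.1, 0.2, 0.3, 0.4, 0.5])
diff_len = len(diffs)
lags = 3

diff_data = np.zeros((diff_len, lags))
for lag in range(1, lags + 1):
    # Column lag-1 holds the series shifted by `lag` steps
    diff_data[lag:, lag - 1] = diffs[:-lag]

print(diff_data)
# Row t holds diffs[t-1], diffs[t-2], diffs[t-3];
# the last row, for example, is [0.4, 0.3, 0.2]
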
/Chapter10/boosting.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 |
4 | from simulator import simulate
5 | from sklearn import metrics
6 | from sklearn.model_selection import train_test_split
7 |
8 | from xgboost import XGBRegressor
9 | np.random.seed(123456)
10 |
11 |
12 | lr = XGBRegressor(n_jobs=5, max_depth=2, n_estimators=10, reg_alpha=0.5)
13 |
14 | data = pd.read_csv('BTC-USD.csv')
15 | data = data.dropna()
16 | data.Date = pd.to_datetime(data.Date)
17 | data.set_index('Date', drop=True, inplace=True)
18 | diffs = (data.Close.diff()/data.Close).values[1:]
19 |
20 | diff_len = len(diffs)
21 |
22 |
23 |
24 | def create_x_data(lags=1):
25 | diff_data = np.zeros((diff_len, lags))
26 | ma_data = np.zeros((diff_len, lags))
27 |
28 | diff_ma = (data.Close.diff()/data.Close).rolling(15).mean().fillna(0).values[1:]
29 | for lag in range(1, lags+1):
30 | this_data = diffs[:-lag]
31 | diff_data[lag:, lag-1] = this_data
32 |
33 | this_data = diff_ma[:-lag]
34 | ma_data[lag:, lag-1] = this_data
35 | return np.concatenate((diff_data, ma_data), axis=1)
36 |
37 |
38 | x_data = create_x_data(lags=30)*100
39 | y_data = diffs*100
40 |
41 | # REPRODUCIBILITY
42 | x_data = np.around(x_data, decimals=8)
43 | y_data = np.around(y_data, decimals=8)
44 |
45 | # =============================================================================
46 | # WALK FORWARD
47 | # =============================================================================
48 |
49 | window = 150
50 | preds = np.zeros(diff_len-window)
51 | for i in range(diff_len-window-1):
52 | x_train = x_data[i:i+window, :]
53 | y_train = y_data[i:i+window]
54 | lr.fit(x_train, y_train)
55 | preds[i] = lr.predict(x_data[i+window+1, :].reshape(1, -1))
56 |
57 |
58 | print('Percentages MSE: %.2f'%metrics.mean_squared_error(y_data[window:], preds))
59 | simulate(data, preds)
60 |
61 |
62 |
63 |
--------------------------------------------------------------------------------
/Chapter10/exploratory.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | import pandas as pd
4 |
5 |
6 | from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
7 |
8 | # Read the data, parse the dates and set the dates as index
9 | data = pd.read_csv('BTC-USD.csv')
10 | data = data.dropna()
11 | data.Date = pd.to_datetime(data.Date)
12 | data.set_index('Date', drop=True, inplace=True)
13 |
14 |
15 | # =============================================================================
16 | # ORIGINAL
17 | # =============================================================================
18 | # Plot ACF-> Non-Stationary
19 | plot_acf(data.Close, lags=30)
20 | plt.xlabel('Date')
21 | plt.ylabel('Correlation')
22 |
23 | # =============================================================================
24 | # Percentage Differences
25 | # =============================================================================
26 |
27 | # Make two subplots
28 | fig, axes = plt.subplots(nrows=2, ncols=1)
29 |
30 | # Calculate the percentage differences
31 | diffs = data.Close.diff()/data.Close
32 |
33 | # Plot the rolling deviation
34 | diffs.rolling(30).std().plot(ax=axes[0])
35 | plt.xlabel('Date')
36 | plt.ylabel('Std. Dev.')
37 | axes[0].title.set_text('Transformed Data Rolling Std.Dev.')
38 |
39 | diffs = diffs.dropna()
40 |
41 | # Plot ACF for percentage diffs
42 | plot_acf(diffs, lags=60, ax=axes[1])
43 | plt.xlabel('Date')
44 | plt.ylabel('Correlation')
45 |
46 | # Plot the changes
47 | plt.figure()
48 | diffs.plot()
49 | plt.xlabel('Date')
50 | plt.ylabel('Change %')
51 | plt.title('Transformed Data')
52 |
53 |
54 |
55 |
56 |
--------------------------------------------------------------------------------
/Chapter10/random_forest.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 |
4 | from simulator import simulate
5 | from sklearn import metrics
6 | from sklearn.ensemble import RandomForestRegressor
7 | from sklearn.model_selection import train_test_split
8 |
9 | np.random.seed(123456)
10 | lr = RandomForestRegressor(n_estimators=50, max_depth=2, n_jobs=5)
11 |
12 | data = pd.read_csv('BTC-USD.csv')
13 | data = data.dropna()
14 | data.Date = pd.to_datetime(data.Date)
15 | data.set_index('Date', drop=True, inplace=True)
16 | diffs = (data.Close.diff()/data.Close).values[1:]
17 |
18 | diff_len = len(diffs)
19 |
20 |
21 |
22 | def create_x_data(lags=1):
23 | diff_data = np.zeros((diff_len, lags))
24 | ma_data = np.zeros((diff_len, lags))
25 |
26 | diff_ma = (data.Close.diff()/data.Close).rolling(15).mean().fillna(0).values[1:]
27 | for lag in range(1, lags+1):
28 | this_data = diffs[:-lag]
29 | diff_data[lag:, lag-1] = this_data
30 |
31 | this_data = diff_ma[:-lag]
32 | ma_data[lag:, lag-1] = this_data
33 | return np.concatenate((diff_data, ma_data), axis=1)
34 |
35 | x_data = create_x_data(lags=30)*100
36 | y_data = diffs*100
37 |
38 | # REPRODUCIBILITY
39 | x_data = np.around(x_data, decimals=8)
40 | y_data = np.around(y_data, decimals=8)
41 |
42 | # =============================================================================
43 | # WALK FORWARD
44 | # =============================================================================
45 |
46 | window = 150
47 | preds = np.zeros(diff_len-window)
48 | for i in range(diff_len-window-1):
49 | x_train = x_data[i:i+window, :]
50 | y_train = y_data[i:i+window]
51 | lr.fit(x_train, y_train)
52 | preds[i] = lr.predict(x_data[i+window+1, :].reshape(1, -1))
53 |
54 |
55 | print('Percentages MSE: %.2f'%metrics.mean_squared_error(y_data[window:], preds))
56 | simulate(data, preds)
57 |
58 |
59 |
60 |
--------------------------------------------------------------------------------
/Chapter10/regression.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 |
4 | from simulator import simulate
5 | from sklearn import metrics
6 | from sklearn.linear_model import LinearRegression
7 | from sklearn.model_selection import train_test_split
8 |
9 | np.random.seed(123456)
10 | lr = LinearRegression()
11 |
12 | data = pd.read_csv('BTC-USD.csv')
13 | data = data.dropna()
14 | data.Date = pd.to_datetime(data.Date)
15 | data.set_index('Date', drop=True, inplace=True)
16 | diffs = (data.Close.diff()/data.Close).values[1:]
17 |
18 | diff_len = len(diffs)
19 |
20 |
21 |
22 | def create_x_data(lags=1):
23 | diff_data = np.zeros((diff_len, lags))
24 |
25 | for lag in range(1, lags+1):
26 | this_data = diffs[:-lag]
27 | diff_data[lag:, lag-1] = this_data
28 |
29 | return diff_data
30 |
31 | # REPRODUCIBILITY
32 | x_data = create_x_data(lags=20)*100
33 | y_data = diffs*100
34 |
35 |
36 | x_data = np.around(x_data, decimals=8)
37 | y_data = np.around(y_data, decimals=8)
38 |
39 | # =============================================================================
40 | # WALK FORWARD
41 | # =============================================================================
42 |
43 | window = 150
44 | preds = np.zeros(diff_len-window)
45 | for i in range(diff_len-window-1):
46 | x_train = x_data[i:i+window, :]
47 | y_train = y_data[i:i+window]
48 | lr.fit(x_train, y_train)
49 | preds[i] = lr.predict(x_data[i+window+1, :].reshape(1, -1))
50 |
51 |
52 | print('Percentages MSE: %.2f'%metrics.mean_squared_error(y_data[window:], preds))
53 | simulate(data, preds)
54 |
55 |
--------------------------------------------------------------------------------
/Chapter10/simulator.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | import pandas as pd
4 |
5 | from sklearn import metrics
6 |
7 |
8 | def simulate(data, preds):
9 | # Constants and placeholders
10 | buy_threshold = 0.5
11 | stake = 100
12 |
13 | true, pred, balances = [], [], []
14 |
15 | buy_price = 0
16 | buy_points, sell_points = [], []
17 | balance = 0
18 |
19 | start_index = len(data)-len(preds)-1
20 |     # Calculate predicted values
21 | for i in range(len(preds)):
22 |
23 | last_close = data.Close[i+start_index-1]
24 | current_close = data.Close[i+start_index]
25 |
26 | # Save predicted values and true values
27 | true.append(current_close)
28 | pred.append(last_close*(1+preds[i]/100))
29 |
30 |
31 | # Buy/Sell according to signal
32 | if preds[i] > buy_threshold and buy_price == 0:
33 | buy_price = true[-1]
34 | buy_points.append(i)
35 |
36 | elif preds[i] < -buy_threshold and not buy_price == 0:
37 | profit = (current_close - buy_price) * stake/buy_price
38 | balance += profit
39 | buy_price = 0
40 | sell_points.append(i)
41 |
42 | balances.append(balance)
43 |
44 |
45 | true = np.array(true)
46 | pred = np.array(pred)
47 |
48 | # Create plots
49 | plt.figure()
50 |
51 | plt.subplot(2, 1, 1)
52 | plt.plot(true, label='True')
53 | plt.plot(pred, label='pred')
54 | plt.scatter(buy_points, true[buy_points]+500, marker='v',
55 | c='blue', s=5, zorder=10)
56 | plt.scatter(sell_points, true[sell_points]-500, marker='^'
57 | , c='red', s=5, zorder=10)
58 | plt.title('Trades')
59 |
60 | plt.subplot(2, 1, 2)
61 | plt.plot(balances)
62 | plt.title('Profit')
63 | print('MSE: %.2f'%metrics.mean_squared_error(true, pred))
64 | balance_df = pd.DataFrame(balances)
65 |
66 | pct_returns = balance_df.diff()/stake
67 | pct_returns = pct_returns[pct_returns != 0].dropna()
68 |
69 |
70 | print('Sharpe: %.2f'%(np.mean(pct_returns)/np.std(pct_returns)))
--------------------------------------------------------------------------------
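
simulate expects the full price DataFrame (anything with a Close column and a couple more rows than the prediction array) together with an array of predicted percentage changes. The following is a minimal sketch with synthetic prices (illustrative values only), just to show the expected call signature without needing BTC-USD.csv:

import numpy as np
import pandas as pd

from simulator import simulate

np.random.seed(0)

# Synthetic closing prices and predicted percentage changes
close = 10000 + np.cumsum(np.random.randn(300)) * 50
data = pd.DataFrame({'Close': close})
preds = np.random.randn(150)

simulate(data, preds)
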
/Chapter10/simulator_plain.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | import pandas as pd
4 |
5 | from sklearn import metrics
6 |
7 |
8 | def simulate(data, preds):
9 | true, pred= [], []
10 |
11 | start_index = len(data)-len(preds)-1
12 | for i in range(len(preds)):
13 |
14 | last_close = data.Close[i+start_index-1]
15 | current_close = data.Close[i+start_index]
16 |
17 | true.append(current_close)
18 | pred.append(last_close*(1+preds[i]/100))
19 |
20 |
21 |
22 |
23 |
24 | true = np.array(true)
25 | pred = np.array(pred)
26 |
27 | plt.figure()
28 |
29 | plt.plot(true, label='True')
30 | plt.plot(pred, label='pred')
31 |
32 | print('MSE: %.2f'%metrics.mean_squared_error(true, pred))
33 |
--------------------------------------------------------------------------------
/Chapter10/stacking.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 |
4 | from simulator import simulate
5 | from sklearn import metrics
6 | from sklearn.neighbors import KNeighborsRegressor
7 | from sklearn.linear_model import LinearRegression
8 | from sklearn.svm import SVR
9 | from sklearn.model_selection import train_test_split
10 | from stacking_regressor import StackingRegressor
11 |
12 | np.random.seed(123456)
13 |
14 |
15 | lr = SVR()
16 |
17 | data = pd.read_csv('BTC-USD.csv')
18 | data = data.dropna()
19 | data.Date = pd.to_datetime(data.Date)
20 | data.set_index('Date', drop=True, inplace=True)
21 | diffs = (data.Close.diff()/data.Close).values[1:]
22 |
23 | diff_len = len(diffs)
24 |
25 | base_learners = [[SVR(), KNeighborsRegressor()],
26 | [LinearRegression()]]
27 |
28 | lr = StackingRegressor(base_learners)
29 |
30 | def create_x_data(lags=1):
31 | diff_data = np.zeros((diff_len, lags))
32 |
33 | for lag in range(1, lags+1):
34 | this_data = diffs[:-lag]
35 | diff_data[lag:, lag-1] = this_data
36 |
37 | return diff_data
38 |
39 | x_data = create_x_data(lags=20)*100
40 | y_data = diffs*100
41 |
42 | # REPRODUCIBILITY
43 | x_data = np.around(x_data, decimals=8)
44 | y_data = np.around(y_data, decimals=8)
45 |
46 | # =============================================================================
47 | # WALK FORWARD
48 | # =============================================================================
49 |
50 | window = 150
51 | preds = np.zeros(diff_len-window)
52 | for i in range(diff_len-window-1):
53 | x_train = x_data[i:i+window, :]
54 | y_train = y_data[i:i+window]
55 | lr.fit(x_train, y_train)
56 | preds[i] = lr.predict(x_data[i+window+1, :].reshape(1, -1))[-1]
57 |
58 |
59 | print('Percentages MSE: %.2f'%metrics.mean_squared_error(y_data[window:], preds))
60 | simulate(data, preds)
61 |
62 |
--------------------------------------------------------------------------------
/Chapter10/stacking_regressor.py:
--------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Libraries
3 | import numpy as np
4 |
5 | from sklearn.model_selection import KFold
6 | from copy import deepcopy
7 |
8 |
9 | class StackingRegressor():
10 |
11 | # --- SECTION 2 ---
12 | # The constructor
13 | def __init__(self, learners):
14 | # Create a list of sizes for each stacking level
15 | # And a list of deep copied learners
16 | self.level_sizes = []
17 | self.learners = []
18 | for learning_level in learners:
19 |
20 | self.level_sizes.append(len(learning_level))
21 | level_learners = []
22 | for learner in learning_level:
23 | level_learners.append(deepcopy(learner))
24 | self.learners.append(level_learners)
25 |
26 |
27 |
28 | # --- SECTION 3 ---
29 | # The fit function. Creates training meta data for every level and trains
30 | # each level on the previous level's meta data
31 | def fit(self, x, y):
32 | # Create a list of training meta data, one for each stacking level
33 | # and another one for the targets. For the first level, the actual data
34 | # is used.
35 | meta_data = [x]
36 | meta_targets = [y]
37 | for i in range(len(self.learners)):
38 | level_size = self.level_sizes[i]
39 |
40 | # Create the meta data and target variables for this level
41 | data_z = np.zeros((level_size, len(x)))
42 | target_z = np.zeros(len(x))
43 |
44 | train_x = meta_data[i]
45 | train_y = meta_targets[i]
46 |
47 | # Create the cross-validation folds
48 | KF = KFold(n_splits=5)
49 | meta_index = 0
50 | for train_indices, test_indices in KF.split(x):
51 | # Train each learner on the K-1 folds and create
52 | # meta data for the Kth fold
53 | for j in range(len(self.learners[i])):
54 |
55 | learner = self.learners[i][j]
56 | learner.fit(train_x[train_indices], train_y[train_indices])
57 | predictions = learner.predict(train_x[test_indices])
58 |
59 | data_z[j][meta_index:meta_index+len(test_indices)] = predictions
60 |
61 | target_z[meta_index:meta_index+len(test_indices)] = train_y[test_indices]
62 | meta_index += len(test_indices)
63 |
64 | # Add the data and targets to the meta data lists
65 | data_z = data_z.transpose()
66 | meta_data.append(data_z)
67 | meta_targets.append(target_z)
68 |
69 |
70 | # Train the learner on the whole previous meta data
71 | for learner in self.learners[i]:
72 | learner.fit(train_x, train_y)
73 |
74 |
75 |
76 |
77 |
78 |
79 | # --- SECTION 4 ---
80 | # The predict function. Creates meta data for the test data and returns
81 | # all of them. The actual predictions can be accessed with meta_data[-1]
82 | def predict(self, x):
83 |
84 | # Create a list of training meta data, one for each stacking level
85 | meta_data = [x]
86 | for i in range(len(self.learners)):
87 | level_size = self.level_sizes[i]
88 |
89 | data_z = np.zeros((level_size, len(x)))
90 |
91 | test_x = meta_data[i]
92 |
93 |
94 | for j in range(len(self.learners[i])):
95 |
96 | learner = self.learners[i][j]
97 | predictions = learner.predict(test_x)
98 | data_z[j] = predictions
99 |
100 |
101 |
102 | # Add the data and targets to the meta data lists
103 | data_z = data_z.transpose()
104 | meta_data.append(data_z)
105 |
106 |         # Return the meta data; the final level's predictions
107 |         # can be accessed with meta_data[-1]
108 |         return meta_data
109 |
110 |
--------------------------------------------------------------------------------
/Chapter10/voting.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 |
4 | from simulator import simulate
5 | from sklearn import metrics
6 | from sklearn.neighbors import KNeighborsRegressor
7 | from sklearn.linear_model import LinearRegression
8 | from sklearn.svm import SVR
9 | from voting_regressor import VotingRegressor
10 |
11 | np.random.seed(123456)
12 |
13 |
14 | lr = SVR()
15 |
16 | data = pd.read_csv('BTC-USD.csv')
17 | data = data.dropna()
18 | data.Date = pd.to_datetime(data.Date)
19 | data.set_index('Date', drop=True, inplace=True)
20 | diffs = (data.Close.diff()/data.Close).values[1:]
21 |
22 | diff_len = len(diffs)
23 |
24 | base_learners = [('SVR', SVR()),
25 | ('LR', LinearRegression()),
26 | ('KNN', KNeighborsRegressor())]
27 |
28 | lr = VotingRegressor(base_learners)
29 | def create_x_data(lags=1):
30 | diff_data = np.zeros((diff_len, lags))
31 | ma_data = np.zeros((diff_len, lags))
32 |
33 | diff_ma = (data.Close.diff()/data.Close).rolling(15).mean().fillna(0).values[1:]
34 | for lag in range(1, lags+1):
35 | this_data = diffs[:-lag]
36 | diff_data[lag:, lag-1] = this_data
37 |
38 | this_data = diff_ma[:-lag]
39 | ma_data[lag:, lag-1] = this_data
40 | return np.concatenate((diff_data, ma_data), axis=1)
41 |
42 | x_data = create_x_data(lags=20)*100
43 | y_data = diffs*100
44 |
45 | # REPRODUCIBILITY
46 | x_data = np.around(x_data, decimals=8)
47 | y_data = np.around(y_data, decimals=8)
48 |
49 | # =============================================================================
50 | # WALK FORWARD
51 | # =============================================================================
52 |
53 | window = 150
54 | preds = np.zeros(diff_len-window)
55 | for i in range(diff_len-window-1):
56 | x_train = x_data[i:i+window, :]
57 | y_train = y_data[i:i+window]
58 | lr.fit(x_train, y_train)
59 | preds[i] = lr.predict(x_data[i+window+1, :].reshape(1, -1))
60 |
61 |
62 | print('Percentages MSE: %.2f'%metrics.mean_squared_error(y_data[window:], preds))
63 | simulate(data, preds)
64 |
--------------------------------------------------------------------------------
/Chapter10/voting_regressor.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from copy import deepcopy
3 |
4 | class VotingRegressor():
5 |
6 | # Accepts a list of (name, classifier) tuples
7 | def __init__(self, base_learners):
8 | self.base_learners = {}
9 | for name, learner in base_learners:
10 | self.base_learners[name] = deepcopy(learner)
11 |
12 |
13 | # Fits each individual base learner
14 | def fit(self, x_data, y_data):
15 | for name in self.base_learners:
16 | learner = self.base_learners[name]
17 | learner.fit(x_data, y_data)
18 |
19 | # Generates the predictions
20 | def predict(self, x_data):
21 |
22 | # Create the predictions matrix
23 | predictions = np.zeros((len(x_data), len(self.base_learners)))
24 |
25 | names = list(self.base_learners.keys())
26 |
27 | # For each base learner
28 | for i in range(len(self.base_learners)):
29 | name = names[i]
30 | learner = self.base_learners[name]
31 |
32 | # Store the predictions in a column
33 | preds = learner.predict(x_data)
34 | predictions[:,i] = preds
35 |
36 | # Take the row-average
37 | predictions = np.mean(predictions, axis=1)
38 | return predictions
--------------------------------------------------------------------------------
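
As a quick check of what VotingRegressor does, a minimal sketch on made-up data (illustrative only) confirms that its output is simply the row-wise average of the base learners' predictions:

import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from voting_regressor import VotingRegressor

np.random.seed(123456)

# Toy regression data
x = np.random.rand(100, 3)
y = x.sum(axis=1) + np.random.randn(100) * 0.1

vr = VotingRegressor([('LR', LinearRegression()),
                      ('KNN', KNeighborsRegressor())])
vr.fit(x, y)

# The ensemble's prediction equals the mean of the base learners' predictions
avg = np.mean([vr.base_learners[name].predict(x) for name in vr.base_learners], axis=0)
print(np.allclose(vr.predict(x), avg))  # expected: True
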
/Chapter11/base_learners_twitter.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 |
4 | from sklearn import metrics
5 | from sklearn.ensemble import VotingClassifier
6 | from sklearn.feature_extraction.text import TfidfVectorizer
7 | from sklearn.linear_model import LogisticRegression, RidgeClassifier
8 | from sklearn.naive_bayes import MultinomialNB
9 | # Load the data
10 | data = pd.read_csv('sent140_preprocessed.csv')
11 | data = data.dropna()
12 |
13 |
14 |
15 |
16 | # Set the train and test sizes
17 | train_size = 10000
18 | test_start = 10000
19 | test_end = 100000
20 |
21 |
22 |
23 | def check_features_ngrams(features, n_grams, classifiers):
24 |
25 | print(features, n_grams)
26 |
27 |     # Create the TF-IDF feature extractor
28 | tf = TfidfVectorizer(max_features=features, ngram_range=n_grams,
29 | stop_words='english')
30 |
31 |     # Create the TF-IDF features
32 | tf.fit(data.text)
33 | transformed = tf.transform(data.text)
34 |
35 | np.random.seed(123456)
36 |
37 | def check_classifier(name, classifier):
38 | print('--'+name+'--')
39 |
40 | # Train the classifier
41 | x_data = transformed[:train_size].toarray()
42 | y_data = data.polarity[:train_size].values
43 |
44 | classifier.fit(x_data, y_data)
45 | i_s = metrics.accuracy_score(y_data, classifier.predict(x_data))
46 |
47 | # Evaluate on the test set
48 | x_data = transformed[test_start:test_end].toarray()
49 | y_data = data.polarity[test_start:test_end].values
50 | oos = metrics.accuracy_score(y_data, classifier.predict(x_data))
51 |
52 |         # Export the results
53 | with open("outs.txt","a") as f:
54 | f.write(str(features)+',')
55 | f.write(str(n_grams[-1])+',')
56 | f.write(name+',')
57 | f.write('%.4f'%i_s+',')
58 | f.write('%.4f'%oos+'\n')
59 |
60 | for name, classifier in classifiers:
61 | check_classifier(name, classifier)
62 |
63 |
64 | # Create csv header
65 | with open("outs.txt","a") as f:
66 |     f.write('features,ngram_range,classifier,train_acc,test_acc\n')
67 |
68 | # Test all features and n-grams combinations
69 | for features in [500, 1000, 5000, 10000, 20000, 30000]:
70 | for n_grams in [(1, 1), (1, 2), (1, 3)]:
71 |
72 | # Create the ensemble
73 | voting = VotingClassifier([('LR', LogisticRegression()),
74 | ('NB', MultinomialNB()),
75 | ('Ridge', RidgeClassifier())])
76 |
77 | # Create the named classifiers
78 | classifiers = [('LR', LogisticRegression()),
79 | ('NB', MultinomialNB()),
80 | ('Ridge', RidgeClassifier()),
81 | ('Voting', voting)]
82 |
83 | # Evaluate them
84 | check_features_ngrams(features, n_grams, classifiers)
85 |
86 |
87 |
--------------------------------------------------------------------------------
/Chapter11/comparisons.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import pandas as pd
3 |
4 |
5 |
6 | res = pd.read_csv('outs_old.csv')
7 |
8 | i = 1
9 | for key, grp in res.groupby(['classifier']):
10 | plt.subplot(2, 2, i)
11 | i += 1
12 | plt.title(str(key))
13 | for key2, grp2 in grp.groupby(['ngram_range']):
14 | plt.plot(grp2.features.values, grp2['test_acc'].values, label=str(key2)+'-gram')
15 | # plt.xscale('log')
16 | plt.legend()
17 | plt.xlabel('features')
18 | plt.ylabel('accuracy')
19 | plt.xscale('log')
--------------------------------------------------------------------------------
/Chapter11/data_cleaning.py:
--------------------------------------------------------------------------------
1 |
2 | import pandas as pd
3 | import re
4 | from nltk.corpus import stopwords
5 | from nltk.stem import PorterStemmer
6 | from string import punctuation
7 |
8 | # Read the data and assign labels
9 | labels = ['polarity', 'id', 'date', 'query', 'user', 'text']
10 | data = pd.read_csv("sent140.csv", names=labels)
11 |
12 | # Keep only text and polarity, change polarity to 0-1
13 | data = data[['text', 'polarity']]
14 | data.polarity.replace(4, 1, inplace=True)
15 |
16 | # Create a list of stopwords
17 | stops = stopwords.words("english")
18 |
19 | # Add stop variants without single quotes
20 | no_quotes = []
21 | for word in stops:
22 | if "'" in word:
23 | no_quotes.append(re.sub(r'\'', '', word))
24 | stops.extend(no_quotes)
25 |
26 |
27 | def clean_string(string):
28 | # Remove HTML entities
29 | tmp = re.sub(r'\&\w*;', '', string)
30 | # Remove @user
31 | tmp = re.sub(r'@(\w+)', '', tmp)
32 | # Remove links
33 | tmp = re.sub(r'(http|https|ftp)://[a-zA-Z0-9\\./]+', '', tmp)
34 | # Lowercase
35 | tmp = tmp.lower()
36 | # Remove Hashtags
37 | tmp = re.sub(r'#(\w+)', '', tmp)
38 | # Remove repeating chars
39 | tmp = re.sub(r'(.)\1{1,}', r'\1\1', tmp)
40 | # Remove anything that is not letters
41 | tmp = re.sub("[^a-zA-Z]", " ", tmp)
42 | # Remove anything that is less than two characters
43 | tmp = re.sub(r'\b\w{1,2}\b', '', tmp)
44 | # Remove multiple spaces
45 | tmp = re.sub(r'\s\s+', ' ', tmp)
46 | return tmp
47 |
48 |
49 |
50 | def preprocess(string):
51 |
52 | stemmer = PorterStemmer()
53 | # Remove any punctuation character
54 | removed_punc = ''.join([char for char in string if char not in punctuation])
55 |
56 | cleaned = []
57 | # Remove any stopword
58 | for word in removed_punc.split(' '):
59 | if word not in stops:
60 | cleaned.append(stemmer.stem(word.lower()))
61 | return ' '.join(cleaned)
62 |
63 |
64 |
65 |
66 | # Shuffle
67 | data = data.sample(frac=1).reset_index(drop=True)
68 | # Clean
69 | data.text = data.text.apply(clean_string)
70 | # Pre-process
71 | data.text = data.text.apply(preprocess)
72 | # Save to CSV
73 | data.to_csv('sent140_preprocessed.csv', index=False)
--------------------------------------------------------------------------------
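
To make the effect of clean_string concrete, the following sketch replays its substitutions step by step on a made-up tweet (the sample text is for illustration only; preprocess would afterwards strip stop words and stem what remains):

import re

sample = "@user Loving the new phone!!! Check it out at http://example.com #happy &amp;"

tmp = re.sub(r'\&\w*;', '', sample)                              # HTML entities
tmp = re.sub(r'@(\w+)', '', tmp)                                 # @mentions
tmp = re.sub(r'(http|https|ftp)://[a-zA-Z0-9\\./]+', '', tmp)    # links
tmp = tmp.lower()                                                # lowercase
tmp = re.sub(r'#(\w+)', '', tmp)                                 # hashtags
tmp = re.sub(r'(.)\1{1,}', r'\1\1', tmp)                         # repeated characters
tmp = re.sub("[^a-zA-Z]", " ", tmp)                              # anything that is not a letter
tmp = re.sub(r'\b\w{1,2}\b', '', tmp)                            # words of one or two characters
tmp = re.sub(r'\s\s+', ' ', tmp)                                 # multiple spaces
print(tmp)  # mentions, links, hashtags and short words are gone
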
/Chapter11/exploratory.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import pandas as pd
3 |
4 | from collections import Counter
5 |
6 | # Read the data and assign labels
7 | labels = ['polarity', 'id', 'date', 'query', 'user', 'text']
8 | data = pd.read_csv("sent140.csv", names=labels)
9 |
10 | # Plot polarities
11 | data.groupby('polarity').id.count().plot(kind='bar')
12 |
13 | # Get most frequent words
14 | data['words'] = data.text.str.split()
15 |
16 | words = []
17 | # Get a list of all words
18 | for w in data.words:
19 | words.extend(w)
20 |
21 | # Get the frequencies and plot
22 | freqs = Counter(words).most_common(30)
23 | plt.plot(*zip(*freqs))
24 | plt.xticks(rotation=80)
25 | plt.ylabel('Count')
26 | plt.title('30 most common words.')
27 |
28 |
--------------------------------------------------------------------------------
/Chapter11/stream_sentiment.py:
--------------------------------------------------------------------------------
1 | import json
2 | import pandas as pd
3 |
4 | from sklearn.ensemble import VotingClassifier
5 | from sklearn.feature_extraction.text import TfidfVectorizer
6 | from sklearn.linear_model import LogisticRegression, RidgeClassifier
7 | from sklearn.naive_bayes import MultinomialNB
8 | from tweepy import OAuthHandler, Stream, StreamListener
9 |
10 | # Please fill your API keys as strings
11 | consumer_key=""
12 | consumer_secret=""
13 |
14 | access_token=""
15 | access_token_secret=""
16 |
17 |
18 |
19 | # Load the data
20 | data = pd.read_csv('sent140_preprocessed.csv')
21 | data = data.dropna()
22 | # Replicate our voting classifier for 30,000 features and 1-3 n-grams
23 | train_size = 10000
24 |
25 | tf = TfidfVectorizer(max_features=30000, ngram_range=(1, 3),
26 | stop_words='english')
27 | tf.fit(data.text)
28 | transformed = tf.transform(data.text)
29 |
30 | x_data = transformed[:train_size].toarray()
31 | y_data = data.polarity[:train_size].values
32 |
33 | voting = VotingClassifier([('LR', LogisticRegression()),
34 | ('NB', MultinomialNB()),
35 | ('Ridge', RidgeClassifier())])
36 |
37 | voting.fit(x_data, y_data)
38 |
39 |
40 | # Define the streaming classifier
41 | class StreamClassifier(StreamListener):
42 |
43 | def __init__(self, classifier, vectorizer, api=None):
44 | super().__init__(api)
45 | self.clf = classifier
46 | self.vec = vectorizer
47 |
48 | # What to do when a tweet arrives
49 | def on_data(self, data):
50 | # Create a json object
51 | json_format = json.loads(data)
52 | # Get the tweet's text
53 | text = json_format['text']
54 |
55 | features = self.vec.transform([text]).toarray()
56 | print(text, self.clf.predict(features))
57 | return True
58 |
59 | # If an error occurs, print the status
60 | def on_error(self, status):
61 | print(status)
62 |
63 | # Create the classifier and authentication handlers
64 | classifier = StreamClassifier(classifier=voting, vectorizer=tf)
65 | auth = OAuthHandler(consumer_key, consumer_secret)
66 | auth.set_access_token(access_token, access_token_secret)
67 |
68 | # Listen for specific hashtags
69 | stream = Stream(auth, classifier)
70 | stream.filter(track=['basketball'])
--------------------------------------------------------------------------------
/Chapter12/ensemble_fc_models.py:
--------------------------------------------------------------------------------
1 | from keras.layers import Input, Embedding, Flatten, Dot, Dense, Concatenate
2 | from keras.models import Model
3 | from sklearn import metrics
4 | from sklearn.model_selection import train_test_split
5 | from sklearn.linear_model import BayesianRidge
6 |
7 |
8 | import matplotlib.pyplot as plt
9 | import numpy as np
10 | import pandas as pd
11 |
12 | np.random.seed(123456)
13 |
14 | def get_data():
15 | # Read the data and drop timestamp
16 | data = pd.read_csv('ratings.csv')
17 | data.drop('timestamp', axis=1, inplace=True)
18 |
19 | # Re-map the indices
20 | users = data.userId.unique()
21 | movies = data.movieId.unique()
22 | # Create maps from old to new indices
23 | moviemap={}
24 | for i in range(len(movies)):
25 | moviemap[movies[i]]=i
26 | usermap={}
27 | for i in range(len(users)):
28 | usermap[users[i]]=i
29 |
30 | # Change the indices
31 | data.movieId = data.movieId.apply(lambda x: moviemap[x])
32 | data.userId = data.userId.apply(lambda x: usermap[x])
33 |
34 | # Shuffle the data
35 | data = data.sample(frac=1.0).reset_index(drop=True)
36 |
37 | # Create a train/test split
38 | train, test = train_test_split(data, test_size=0.2)
39 |
40 | n_users = len(users)
41 | n_movies = len(movies)
42 |
43 | return train, test, n_users, n_movies
44 |
45 |
46 | train, test, n_users, n_movies = get_data()
47 |
48 |
49 | def create_model(n_features=5, train_model=True, load_weights=False):
50 | fts = n_features
51 |
52 | # Movie part. Input accepts the index as input
53 | # and passes it to the Embedding layer. Finally,
54 | # Flatten transforms Embedding's output to a
55 | # one-dimensional tensor.
56 | movie_in = Input(shape=[1], name="Movie")
57 | mov_embed = Embedding(n_movies, fts, name="Movie_Embed")(movie_in)
58 | flat_movie = Flatten(name="FlattenM")(mov_embed)
59 |
60 | # Repeat for the user.
61 | user_in = Input(shape=[1], name="User")
62 | user_inuser_embed = Embedding(n_users, fts, name="User_Embed")(user_in)
63 | flat_user = Flatten(name="FlattenU")(user_inuser_embed)
64 |
65 | # Concatenate the Embedding layers and feed them
66 | # to the Dense part of the network
67 | concat = Concatenate()([flat_movie, flat_user])
68 | dense_1 = Dense(128)(concat)
69 | dense_2 = Dense(32)(dense_1)
70 | out = Dense(1)(dense_2)
71 |
72 | # Create and compile the model
73 | model = Model([user_in, movie_in], out)
74 | model.compile('adam', 'mean_squared_error')
75 | # Train the model
76 | model.fit([train.userId, train.movieId], train.rating, epochs=10, verbose=1)
77 |
78 | return model
79 |
80 | def predictions(model):
81 | preds = model.predict([test.userId, test.movieId])
82 | return preds
83 |
84 | # Create base and meta learner
85 | model5 = create_model(5)
86 | model10 = create_model(10)
87 | model15 = create_model(15)
88 | meta_learner = BayesianRidge()
89 |
90 | # Predict on the test set
91 | preds5 = predictions(model5)
92 | preds10 = predictions(model10)
93 | preds15 = predictions(model15)
94 | # Create a single array with the predictions
95 | preds = np.stack([preds5, preds10, preds15], axis=-1).reshape(-1, 3)
96 |
97 |
98 | # Fit the meta learner on all but the last 1000 test samples
99 | meta_learner.fit(preds[:-1000], test.rating[:-1000])
100 |
101 | # Evaluate the base learners and the meta learner on the last
102 | # 1000 test samples
103 | print('Base Learner 5 Features')
104 | print(metrics.mean_squared_error(test.rating[-1000:], preds5[-1000:]))
105 | print('Base Learner 10 Features')
106 | print(metrics.mean_squared_error(test.rating[-1000:], preds10[-1000:]))
107 | print('Base Learner 15 Features')
108 | print(metrics.mean_squared_error(test.rating[-1000:], preds15[-1000:]))
109 | print('Ensemble')
110 | print(metrics.mean_squared_error(test.rating[-1000:], meta_learner.predict(preds[-1000:])))
111 |
112 |
--------------------------------------------------------------------------------
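
A natural baseline to compare the BayesianRidge meta-learner against is a plain, unweighted average of the three base networks. This is a hypothetical follow-up rather than part of the original script, and it assumes the arrays preds5, preds10, preds15, the test DataFrame, and the metrics import from ensemble_fc_models.py are still in scope:

# Unweighted average of the three base learners, evaluated on the same
# last 1000 test samples used for the meta-learner above
avg_preds = (preds5 + preds10 + preds15) / 3
print('Simple average ensemble')
print(metrics.mean_squared_error(test.rating[-1000:], avg_preds[-1000:]))
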
/Chapter12/exploratory.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import pandas as pd
3 |
4 | data = pd.read_csv('ratings.csv')
5 |
6 | print(data.head())
7 | data.drop('timestamp', axis=1, inplace=True)
8 |
9 |
10 | data.rating.hist(grid=False)
11 | plt.ylabel('Frequency')
12 | plt.xlabel('Rating')
13 | plt.title('Rating Distribution')
14 |
15 | print(data.describe())
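16 | # Not in the original listing: when this file is run as a plain script rather
17 | # than interactively, the histogram above needs an explicit call to appear.
18 | plt.show()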
--------------------------------------------------------------------------------
/Chapter12/single_dense_model.py:
--------------------------------------------------------------------------------
1 | from keras.layers import Input, Embedding, Flatten, Dot, Dense, Concatenate
2 | from keras.models import Model
3 | from sklearn.model_selection import train_test_split
4 | from sklearn import metrics
5 |
6 | import numpy as np
7 | import pandas as pd
8 |
9 | np.random.seed(123456)
10 |
11 |
12 | def get_data():
13 | # Read the data and drop timestamp
14 | data = pd.read_csv('ratings.csv')
15 | data.drop('timestamp', axis=1, inplace=True)
16 |
17 | # Re-map the indices
18 | users = data.userId.unique()
19 | movies = data.movieId.unique()
20 | # Create maps from old to new indices
21 | moviemap={}
22 | for i in range(len(movies)):
23 | moviemap[movies[i]]=i
24 | usermap={}
25 | for i in range(len(users)):
26 | usermap[users[i]]=i
27 |
28 | # Change the indices
29 | data.movieId = data.movieId.apply(lambda x: moviemap[x])
30 | data.userId = data.userId.apply(lambda x: usermap[x])
31 |
32 | # Shuffle the data
33 | data = data.sample(frac=1.0).reset_index(drop=True)
34 |
35 | # Create a train/test split
36 | train, test = train_test_split(data, test_size=0.2)
37 |
38 | n_users = len(users)
39 | n_movies = len(movies)
40 |
41 | return train, test, n_users, n_movies
42 |
43 |
44 | train, test, n_users, n_movies = get_data()
45 |
46 | fts = 5
47 |
48 | # Movie part. Input accepts the index as input
49 | # and passes it to the Embedding layer. Finally,
50 | # Flatten transforms Embedding's output to a
51 | # one-dimensional tensor.
52 | movie_in = Input(shape=[1], name="Movie")
53 | mov_embed = Embedding(n_movies, fts, name="Movie_Embed")(movie_in)
54 | flat_movie = Flatten(name="FlattenM")(mov_embed)
55 |
56 | # Repeat for the user.
57 | user_in = Input(shape=[1], name="User")
58 | user_embed = Embedding(n_users, fts, name="User_Embed")(user_in)
59 | flat_user = Flatten(name="FlattenU")(user_embed)
60 |
61 | # Concatenate the Embedding layers and feed them
62 | # to the Dense part of the network
63 | concat = Concatenate()([flat_movie, flat_user])
64 | dense_1 = Dense(128)(concat)
65 | dense_2 = Dense(32)(dense_1)
66 | out = Dense(1)(dense_2)
67 |
68 | # Create and compile the model
69 | model = Model([user_in, movie_in], out)
70 | model.compile('adam', 'mean_squared_error')
71 |
72 | # Train the model on the train set
73 | model.fit([train.userId, train.movieId], train.rating, epochs=10, verbose=1)
74 |
75 | # Evaluate on the test set
76 | print(metrics.mean_squared_error(test.rating,
77 | model.predict([test.userId, test.movieId])))
78 |
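79 | # Not in the original listing: a minimal usage sketch that scores every movie
80 | # for one (arbitrarily chosen) user and prints the indices of the five movies
81 | # with the highest predicted rating. These are the re-mapped movie indices,
82 | # not the original movieId values.
83 | user_id = 0
84 | all_movies = np.arange(n_movies)
85 | scores = model.predict([np.full(n_movies, user_id), all_movies]).flatten()
86 | print('Top-5 movie indices for user 0:', np.argsort(scores)[::-1][:5])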
--------------------------------------------------------------------------------
/Chapter12/single_dot_model.py:
--------------------------------------------------------------------------------
1 | from keras.layers import Input, Embedding, Flatten, Dot
2 | from keras.models import Model
3 | from sklearn.model_selection import train_test_split
4 | from sklearn import metrics
5 |
6 | import numpy as np
7 | import pandas as pd
8 |
9 | np.random.seed(123456)
10 |
11 |
12 | def get_data():
13 | # Read the data and drop timestamp
14 | data = pd.read_csv('ratings.csv')
15 | data.drop('timestamp', axis=1, inplace=True)
16 |
17 | # Re-map the indices
18 | users = data.userId.unique()
19 | movies = data.movieId.unique()
20 | # Create maps from old to new indices
21 | moviemap={}
22 | for i in range(len(movies)):
23 | moviemap[movies[i]]=i
24 | usermap={}
25 | for i in range(len(users)):
26 | usermap[users[i]]=i
27 |
28 | # Change the indices
29 | data.movieId = data.movieId.apply(lambda x: moviemap[x])
30 | data.userId = data.userId.apply(lambda x: usermap[x])
31 |
32 | # Shuffle the data
33 | data = data.sample(frac=1.0).reset_index(drop=True)
34 |
35 | # Create a train/test split
36 | train, test = train_test_split(data, test_size=0.2)
37 |
38 | n_users = len(users)
39 | n_movies = len(movies)
40 |
41 | return train, test, n_users, n_movies
42 |
43 |
44 | train, test, n_users, n_movies = get_data()
45 |
46 | fts = 5
47 |
48 | # Movie part. Input accepts the index as input
49 | # and passes it to the Embedding layer. Finally,
50 | # Flatten transforms Embedding's output to a
51 | # one-dimensional tensor.
52 | movie_in = Input(shape=[1], name="Movie")
53 | mov_embed = Embedding(n_movies, fts, name="Movie_Embed")(movie_in)
54 | flat_movie = Flatten(name="FlattenM")(mov_embed)
55 |
56 | # Repeat for the user.
57 | user_in = Input(shape=[1], name="User")
58 | user_embed = Embedding(n_users, fts, name="User_Embed")(user_in)
59 | flat_user = Flatten(name="FlattenU")(user_embed)
60 |
61 | # Calculate the dot-product of the two embeddings
62 | prod = Dot(name="Mult", axes=1)([flat_movie, flat_user])
63 |
64 | # Create and compile the model
65 | model = Model([user_in, movie_in], prod)
66 | model.compile('adam', 'mean_squared_error')
67 |
68 | # Train the model on the train set
69 | model.fit([train.userId, train.movieId], train.rating, epochs=10, verbose=1)
70 |
71 | # Evaluate on the test set
72 | print(metrics.mean_squared_error(test.rating,
73 | model.predict([test.userId, test.movieId])))
74 |
--------------------------------------------------------------------------------
/Chapter13/clustering.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import pandas as pd
3 | from sklearn import cluster
4 |
5 | data = pd.read_csv('WHR.csv')
6 | regs = pd.read_csv('Regions.csv')
7 |
8 | recents = data[data.Year == 2017]
9 | recents = recents.dropna(axis=1, how="all")
10 | recents = recents.fillna(recents.median())
11 |
12 |
13 | def find_region(country):
14 | return regs[regs['Country name']==country].Region.values[-1]
15 |
16 | def find_region_size(region):
17 | return regs.groupby('Region')['Country name'].count()[region]
18 |
19 | km = cluster.KMeans(3)
20 | fits = recents[['Log GDP per capita',
21 | 'Social support', 'Healthy life expectancy at birth',
22 | 'Freedom to make life choices', 'Generosity',
23 | 'Perceptions of corruption','Positive affect', 'Negative affect',
24 | 'Confidence in national government', 'Democratic Quality',
25 | 'Delivery Quality']].values
26 | preds = km.fit_predict(fits)
27 | recents['Cluster'] = preds
28 |
29 | # For each cluster, count its member countries per region and plot the share of each region it contains
30 | grouped = recents.groupby('Cluster')['Country name']
31 | for key, item in grouped:
32 | countries = grouped.get_group(key).values
33 | regions = {x: 0 for x in regs.Region.unique()}
34 | for country in countries:
35 | regions[find_region(country)] = regions[find_region(country)]+1
36 | print(key, countries, regions, "\n\n")
37 | x, y = [], []
38 | for k in regions:
39 | x.append(k)
40 | y.append(regions[k]/find_region_size(k))
41 | plt.figure()
42 | plt.bar(x, y)
43 | plt.xticks(rotation=90)
44 |
45 |
46 |
47 |
48 | # Keep only the columns that have no missing values at all
49 | recents = recents.dropna(axis=1, how="any")
50 |
51 |
52 | # Repeat the analysis with 10 clusters, using every remaining numeric feature
53 | km = cluster.KMeans(10)
54 | preds = km.fit_predict(recents.drop(['Year', 'Country name', 'Cluster'], axis=1).values)  # exclude the earlier cluster labels
55 | recents['Cluster'] = preds
56 |
57 | grouped = recents.groupby('Cluster')['Country name']
58 | for key, item in grouped:
59 | countries = grouped.get_group(key).values
60 | regions = {x: 0 for x in regs.Region.unique()}
61 | for country in countries:
62 | regions[find_region(country)] = regions[find_region(country)]+1
63 | print(key, countries, regions, "\n\n")
64 | x, y = [], []
65 | for k in regions:
66 | x.append(k)
67 | y.append(regions[k]/find_region_size(k))
68 | plt.figure()
69 | plt.bar(x, y)
70 | plt.xticks(rotation=90)
71 |
72 |
73 |
74 |
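75 | # Not in the original listing: when run as a standalone script, the bar charts
76 | # created above need an explicit call in order to be displayed.
77 | plt.show()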
--------------------------------------------------------------------------------
/Chapter13/ensemble_cluster.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | import openensembles as oe
4 | import pandas as pd
5 |
6 |
7 | from sklearn import metrics
8 |
9 |
10 | # Load the datasets
11 | data = pd.read_csv('WHR.csv')
12 | regs = pd.read_csv('Regions.csv')
13 |
14 | # Use the 2017 data and fill any NaNs
15 | recents = data[data.Year == 2017]
16 | recents = recents.dropna(axis=1, how="all")
17 | recents = recents.fillna(recents.median())
18 |
19 |
20 | # Use only these specific features
21 | columns = ['Log GDP per capita',
22 | 'Social support', 'Healthy life expectancy at birth',
23 | 'Freedom to make life choices', 'Generosity',
24 | 'Perceptions of corruption','Positive affect', 'Negative affect',
25 | 'Confidence in national government', 'Democratic Quality',
26 | 'Delivery Quality']
27 |
28 | # Create the data object
29 | cluster_data = oe.data(recents[columns], columns)
30 |
31 |
32 | np.random.seed(123456)
33 | results = {'K':[], 'size':[], 'silhouette': []}
34 | # Test different ensemble setups
35 | Ks = [2, 4, 6, 8, 10, 12, 14]
36 | sizes = [5, 10, 20, 50]
37 | for K in Ks:
38 | for ensemble_size in sizes:
39 | ensemble = oe.cluster(cluster_data)
40 | for i in range(ensemble_size):
41 | name = f'kmeans_{ensemble_size}_{i}'
42 | ensemble.cluster('parent', 'kmeans', name, K)
43 |
44 | preds = ensemble.finish_co_occ_linkage(threshold=0.5)
45 | print(f'K: {K}, size {ensemble_size}:', end=' ')
46 | silhouette = metrics.silhouette_score(recents[columns],
47 | preds.labels['co_occ_linkage'])
48 | print('%.2f' % silhouette)
49 | results['K'].append(K)
50 | results['size'].append(ensemble_size)
51 | results['silhouette'].append(silhouette)
52 |
53 | results_df = pd.DataFrame(results)
54 | cross = pd.crosstab(results_df.K, results_df['size'],
55 | results_df['silhouette'], aggfunc=lambda x: x)
56 |
57 |
58 |
59 |
60 |
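61 | # Not in the original listing: display the silhouette score for every
62 | # (K, ensemble size) combination.
63 | print(cross)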
--------------------------------------------------------------------------------
/Chapter13/ensemble_cluster_normalized.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | import openensembles as oe
4 | import pandas as pd
5 |
6 |
7 | from sklearn import metrics
8 |
9 |
10 | # Load the datasets
11 | data = pd.read_csv('WHR.csv')
12 | regs = pd.read_csv('Regions.csv')
13 |
14 | # Use the 2017 data and fill any NaNs
15 | recents = data[data.Year == 2017]
16 | recents = recents.dropna(axis=1, how="all")
17 | recents = recents.fillna(recents.median())
18 |
19 |
20 | # Use only these specific features
21 | columns = ['Log GDP per capita',
22 | 'Social support', 'Healthy life expectancy at birth',
23 | 'Freedom to make life choices', 'Generosity',
24 | 'Perceptions of corruption','Positive affect', 'Negative affect',
25 | 'Confidence in national government', 'Democratic Quality',
26 | 'Delivery Quality']
27 |
28 | # Normalize the features by subtracting the mean
29 | # and dividing by the standard deviation
30 | normalized = recents[columns]
31 | normalized = normalized - normalized.mean()
32 | normalized = normalized / normalized.std()
33 | # Create the data object from the normalized features
34 | cluster_data = oe.data(normalized, columns)
35 |
36 |
37 | np.random.seed(123456)
38 | results = {'K':[], 'size':[], 'silhouette': []}
39 | # Test different ensemble setups
40 | Ks = [2, 4, 6, 8, 10, 12, 14]
41 | sizes = [5, 10, 20, 50]
42 | for K in Ks:
43 | for ensemble_size in sizes:
44 | ensemble = oe.cluster(cluster_data)
45 | for i in range(ensemble_size):
46 | name = f'kmeans_{ensemble_size}_{i}'
47 | ensemble.cluster('parent', 'kmeans', name, K)
48 |
49 | preds = ensemble.finish_co_occ_linkage(threshold=0.5)
50 | print(f'K: {K}, size {ensemble_size}:', end=' ')
51 | silhouette = metrics.silhouette_score(recents[columns],
52 | preds.labels['co_occ_linkage'])
53 | print('%.2f' % silhouette)
54 | results['K'].append(K)
55 | results['size'].append(ensemble_size)
56 | results['silhouette'].append(silhouette)
57 |
58 | results_df = pd.DataFrame(results)
59 | cross = pd.crosstab(results_df.K, results_df['size'],
60 | results_df['silhouette'], aggfunc=lambda x: x)
61 |
62 |
--------------------------------------------------------------------------------
/Chapter13/ensemble_cluster_tsne.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | import openensembles as oe
4 | import pandas as pd
5 |
6 |
7 | from sklearn import metrics
8 | from sklearn.manifold import t_sne
9 |
10 |
11 | # Load the datasets
12 | data = pd.read_csv('WHR.csv')
13 | regs = pd.read_csv('Regions.csv')
14 |
15 | # Use the 2017 data and fill any NaNs
16 | recents = data[data.Year == 2017]
17 | recents = recents.dropna(axis=1, how="all")
18 | recents = recents.fillna(recents.median())
19 |
20 |
21 | # Use only these specific features
22 | columns = ['Log GDP per capita',
23 | 'Social support', 'Healthy life expectancy at birth',
24 | 'Freedom to make life choices', 'Generosity',
25 | 'Perceptions of corruption','Positive affect', 'Negative affect',
26 | 'Confidence in national government', 'Democratic Quality',
27 | 'Delivery Quality']
28 |
29 | # Transform the data with TSNE
30 | tsne = t_sne.TSNE()
31 | transformed = pd.DataFrame(tsne.fit_transform(recents[columns]))
32 | # Create the data object
33 | cluster_data = oe.data(transformed, [0, 1])
34 |
35 |
36 | np.random.seed(123456)
37 | results = {'K':[], 'size':[], 'silhouette': []}
38 | # Test different ensemble setups
39 | Ks = [2, 4, 6, 8, 10, 12, 14]
40 | sizes = [5, 10, 20, 50]
41 | for K in Ks:
42 | for ensemble_size in sizes:
43 | ensemble = oe.cluster(cluster_data)
44 | for i in range(ensemble_size):
45 | name = f'kmeans_{ensemble_size}_{i}'
46 | ensemble.cluster('parent', 'kmeans', name, K)
47 |
48 | preds = ensemble.finish_co_occ_linkage(threshold=0.5)
49 | print(f'K: {K}, size {ensemble_size}:', end=' ')
50 | silhouette = metrics.silhouette_score(recents[columns],
51 | preds.labels['co_occ_linkage'])
52 | print('%.2f' % silhouette)
53 | results['K'].append(K)
54 | results['size'].append(ensemble_size)
55 | results['silhouette'].append(silhouette)
56 |
57 | results_df = pd.DataFrame(results)
58 | cross = pd.crosstab(results_df.K, results_df['size'],
59 | results_df['silhouette'], aggfunc=lambda x: x)
60 |
61 |
62 |
63 |
64 |
65 |
--------------------------------------------------------------------------------
/Chapter13/exploratory.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import pandas as pd
3 |
4 | from matplotlib import cm
5 |
6 |
7 | data = pd.read_csv('WHR.csv')
8 | regs = pd.read_csv('Regions.csv')
9 |
10 | def find_region(country):
11 | if country in list(regs['Country name'].values):
12 | return regs[regs['Country name']==country].Region.values[-1]
13 | return 'None'
14 |
15 | recents = data[data.Year == 2018]
16 | recents = recents.dropna(axis=1, how="all")
17 | recents = recents.fillna(recents.median())
18 | recents['Region'] = recents['Country name'].apply(lambda x: find_region(x))
19 |
20 |
21 |
22 |
23 | cmap = cm.get_cmap('viridis')
24 | recents.groupby('Region')['Country name'].count().plot(kind='pie', labels=None, cmap=cmap, autopct='%1.0f%%', textprops={'color':"w"})
25 | plt.ylabel('')
26 | plt.xticks()
27 | plt.legend(labels = recents.groupby('Region')['Country name'].count().index, bbox_to_anchor=(1, 1.05))
28 |
29 |
30 | data[['Year', 'Life Ladder']].set_index('Year').boxplot(by='Year', grid=False)
31 | plt.suptitle("")
32 | plt.title('Life Ladder')
33 | plt.xlabel('Year')
34 |
35 | data.groupby('Year')['Life Ladder'].count().plot()
36 | plt.title('Countries per Year')
37 | plt.xlabel('Year')
38 | plt.ylabel('Countries')
39 |
40 |
41 | def create_scatter(col, nc, nr, index):
42 | plt.subplot(nc, nr, index)
43 | render = data.sample(frac=0.3)
44 | plt.scatter(render[col], render['Life Ladder'])
45 | plt.title(str(col)[:20])
46 |
47 | i = 1
48 | for key in ['Log GDP per capita',
49 | 'Social support', 'Healthy life expectancy at birth',
50 | 'Freedom to make life choices', 'Generosity',
51 | 'Perceptions of corruption','Positive affect', 'Negative affect',
52 | 'Confidence in national government', 'Democratic Quality',
53 | 'Delivery Quality']:
54 | create_scatter(key, 4, 3, i)
55 | i += 1
56 |
57 |
58 | t = data[data['Year']==2005].copy()
59 | countries = list(t['Country name'].values)
60 | filtered = data[data['Country name'].isin(countries)]
61 |
62 | filtered[['Year', 'Life Ladder']].set_index('Year').boxplot(by='Year', grid=False)
63 | plt.suptitle("")
64 | plt.title('Life Ladder - Same Countries')
65 | plt.xlabel('Year')
66 |
67 | from sklearn.manifold import t_sne
68 |
69 | t = t_sne.TSNE()
70 | data = data.fillna(data.median())
71 | transformed = t.fit_transform(data[['Log GDP per capita',
72 | 'Social support', 'Healthy life expectancy at birth',
73 | 'Freedom to make life choices', 'Generosity',
74 | 'Perceptions of corruption','Positive affect', 'Negative affect',
75 | 'Confidence in national government', 'Democratic Quality',
76 | 'Delivery Quality']].values)
77 |
78 | plt.scatter(transformed[:,0], transformed[:,1], c=data['Life Ladder'].values)
79 |
80 | regions = {x: 0 for x in regs.Region.unique()}
81 | i = 0
82 | for r in regions:
83 | regions[r] = i
84 | i += 1
85 | regions['None'] = i
86 |
87 | plt.scatter(transformed[:,0], transformed[:,1], c=data['Country name'].apply(lambda x: regions[find_region(x)]).values)
88 |
--------------------------------------------------------------------------------
/Chapter13/insights.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | import openensembles as oe
4 | import pandas as pd
5 |
6 | from sklearn import metrics
7 | from sklearn.manifold import t_sne
8 |
9 | np.random.seed(123456)
10 |
11 | # Load the datasets
12 | data = pd.read_csv('WHR.csv')
13 | regs = pd.read_csv('Regions.csv')
14 |
15 | # Use the 2017 data and fill any NaNs
16 | recents = data[data.Year == 2017]
17 | recents = recents.dropna(axis=1, how="all")
18 | recents = recents.fillna(recents.median())
19 |
20 |
21 | # Use only these specific features
22 | columns = ['Log GDP per capita',
23 | 'Social support', 'Healthy life expectancy at birth',
24 | 'Freedom to make life choices', 'Generosity',
25 | 'Perceptions of corruption','Positive affect', 'Negative affect',
26 | 'Confidence in national government', 'Democratic Quality',
27 | 'Delivery Quality']
28 |
29 |
30 | # Transform the data with TSNE
31 | tsne = t_sne.TSNE()
32 | transformed = pd.DataFrame(tsne.fit_transform(recents[columns]))
33 | # Create the data object
34 | cluster_data = oe.data(transformed, [0, 1])
35 |
36 | # Create the ensemble
37 | ensemble = oe.cluster(cluster_data)
38 | for i in range(20):
39 |     name = f'kmeans_{i}-tsne'
40 | ensemble.cluster('parent', 'kmeans', name, 10)
41 |
42 | # Create the cluster labels
43 | preds = ensemble.finish_co_occ_linkage(threshold=0.5)
44 |
45 |
46 | # Add Life Ladder to columns
47 | columns = ['Life Ladder', 'Log GDP per capita',
48 | 'Social support', 'Healthy life expectancy at birth',
49 | 'Freedom to make life choices', 'Generosity',
50 | 'Perceptions of corruption','Positive affect', 'Negative affect',
51 | 'Confidence in national government', 'Democratic Quality',
52 | 'Delivery Quality']
53 | # Add the cluster to the dataframe and group by the cluster
54 | recents['Cluster'] = preds.labels['co_occ_linkage']
55 | grouped = recents.groupby('Cluster')
56 | # Get the means
57 | means = grouped.mean()[columns]
58 |
59 | # Create barplots
60 | def create_bar(col, nc, nr, index):
61 | plt.subplot(nc, nr, index)
62 | values = means.sort_values('Life Ladder')[col]
63 | mn = min(values) * 0.98
64 | mx = max(values) * 1.02
65 | values.plot(kind='bar', ylim=[mn, mx])
66 | plt.title(col[:18])
67 |
68 | # Plot for each feature
69 | plt.figure(1)
70 | i = 1
71 | for col in columns:
72 | create_bar(col, 4, 3, i)
73 | i += 1
74 |
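75 | # Not in the original listing: the raw cluster means are also worth a direct
76 | # look, and the figures need an explicit call when run as a plain script.
77 | print(means.sort_values('Life Ladder'))
78 | plt.show()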
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Packt
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | # Hands-On Ensemble Learning with Python
5 |
6 |
7 |
8 | This is the code repository for [Hands-On Ensemble Learning with Python](https://www.packtpub.com/data/hands-on-ensemble-learning-with-python), published by Packt.
9 |
10 | **Build highly optimized ensemble machine learning models using scikit-learn and Keras**
11 |
12 | ## What is this book about?
13 | Ensembling is a technique for combining two or more similar or dissimilar machine learning algorithms to create a model that delivers superior predictive power. This book will demonstrate how you can use a variety of weak algorithms to make a strong predictive model.
14 |
15 | With its hands-on approach, you'll not only get up to speed on the basic theory, but also the application of various ensemble learning techniques. Using examples and real-world datasets, you'll be able to produce better machine learning models to solve supervised learning problems such as classification and regression. Later in the book, you'll go on to leverage ensemble learning techniques such as clustering to produce unsupervised machine learning models. As you progress, the chapters will cover different machine learning algorithms that are widely used in the practical world to make predictions and classifications. You'll even get to grips with using Python libraries such as scikit-learn and Keras to implement different ensemble models.
16 |
17 | By the end of this book, you will be well versed in ensemble learning and have the skills you need to understand which ensemble method is required for which problem, in order to successfully implement them in real-world scenarios.
18 |
19 | This book covers the following exciting features:
20 | * Implement ensemble methods to generate models with high accuracy
21 | * Overcome challenges such as bias and variance
22 | * Explore machine learning algorithms to evaluate model performance
23 | * Understand how to construct, evaluate, and apply ensemble models
24 | * Analyze tweets in real time using Twitter's streaming API
25 | * Use Keras to build an ensemble of neural networks for the MovieLens dataset
26 |
27 | If you feel this book is for you, get your [copy](https://www.amazon.com/dp/1789612853) today!
28 |
29 |
31 |
32 | ## Instructions and Navigations
33 | All of the code is organized into folders. For example, Chapter03.
34 |
35 | The code will look like the following:
36 | ```
37 | # Accuracy of hard voting
38 | print('-'*30)
39 | print('Hard Voting:', accuracy_score(y_test, hard_predictions))
40 | ```
41 |
42 | **Following is what you need for this book:**
43 | This book is for data analysts, data scientists, machine learning engineers, and other professionals who are looking to generate advanced models using ensemble techniques. Some understanding of machine learning concepts and Python programming will be beneficial.
44 |
45 | With the following software and hardware list you can run all code files present in the book (Chapters 1-13).
46 | ### Software and Hardware List
47 | | Chapter | Software required | OS required |
48 | | -------- | ------------------------------------ | ----------------------------------- |
49 | | All | Python (Jupyter Notebook) | Windows, Mac OS X, and Linux (Any) |
50 |
51 | We also provide a PDF file that has color images of the screenshots/diagrams used in this book. [Click here to download it](https://static.packt-cdn.com/downloads/9781789612851_ColorImages.pdf).
52 |
53 | ### Related products
54 | * Ensemble Machine Learning Cookbook [[Packt]](https://www.packtpub.com/big-data-and-business-intelligence/ensemble-machine-learning-cookbook) [[Amazon]](https://www.amazon.com/dp/1789136601)
55 |
56 |
57 | ## Get to Know the Author
58 | **George Kyriakides** is a Ph.D. researcher, studying distributed neural architecture search. His interests and experience include the automated generation and optimization of predictive models for a wide array of applications, such as image recognition, time series analysis, and financial applications. He holds an M.Sc. in computational methods and applications, and a B.Sc. in applied informatics, both from the University of Macedonia, Thessaloniki, Greece.
59 |
60 | **Konstantinos G. Margaritis** has been a teacher and researcher in computer science for more than 30 years. His research interests include parallel and distributed computing, as well as computational intelligence and machine learning. He holds an M.Eng. in electrical engineering (Aristotle University of Thessaloniki, Greece), as well as an M.Sc. and a Ph.D. in computer science (Loughborough University, UK). He is a professor at the Department of Applied Informatics, University of Macedonia, Thessaloniki, Greece.
61 |
62 |
63 | ### Suggestions and Feedback
64 | [Click here](https://docs.google.com/forms/d/e/1FAIpQLSdy7dATC6QmEL81FIUuymZ0Wy9vH1jHkvpY57OiMeKGqib_Ow/viewform) if you have any feedback or suggestions.
65 |
66 |
67 | ### Download a free PDF
68 |
69 | If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost. Simply click on the link to claim your free PDF.
70 |