├── Chapter01 ├── algos_demo.py └── datasets_demo.py ├── Chapter02 ├── bias_variance.py └── motivation.py ├── Chapter03 ├── custom_voting_implementation.py ├── custom_voting_implementation_analysis.py ├── scikit_hard_voting.py ├── scikit_soft_voting.py ├── scikit_soft_voting_2knn.py └── scikit_soft_voting_analysis.py ├── Chapter04 ├── linear_nonlinear_example.py ├── stacking_classification.py ├── stacking_classification_analysis.py ├── stacking_classifiers.py ├── stacking_regression.py └── stacking_regression_analysis.py ├── Chapter05 ├── bagging_custom.py ├── bagging_custom_parallel.py ├── bagging_sklearn_classification.py ├── bagging_sklearn_regression.py ├── bootstrapping.py └── validation_curves.py ├── Chapter06 ├── adaboost_custom.py ├── adaboost_sklearn_classification.py ├── adaboost_sklearn_regression.py ├── boosting_overfit.py ├── dataset_segmentation.py ├── gradient_boosting_custom.py ├── gradient_boosting_sklearn_classification.py ├── gradient_boosting_sklearn_regression.py ├── xgb_classification.py └── xgb_regression.py ├── Chapter07 ├── extra_tree_classification.py ├── extra_tree_classification_validation_curves.py ├── extra_tree_regression.py ├── probability_to_choose.py ├── rf_classification.py ├── rf_classification_validation_curves.py └── rf_regression.py ├── Chapter08 ├── agglomerative.py ├── kmeans_cluster.py ├── kmeans_intro.py ├── kmeans_raw.py ├── oe_co_occurence.py ├── oe_graph_closure.py ├── oe_vote.py ├── oe_vote_tsne.py └── voting_example.py ├── Chapter09 ├── adaboost.py ├── bagging.py ├── base.py ├── dt_optimize.py ├── exploratory.py ├── logistic_regression.py ├── random_forest.py ├── stacking.py ├── stacking_classifier.py ├── unrelated_presentation_phd.py ├── voting.py └── xgboosting.py ├── Chapter10 ├── bagging.py ├── boosting.py ├── exploratory.py ├── random_forest.py ├── regression.py ├── simulator.py ├── simulator_plain.py ├── stacking.py ├── stacking_regressor.py ├── voting.py └── voting_regressor.py ├── Chapter11 ├── base_learners_twitter.py ├── comparisons.py ├── data_cleaning.py ├── exploratory.py └── stream_sentiment.py ├── Chapter12 ├── ensemble_fc_models.py ├── exploratory.py ├── single_dense_model.py └── single_dot_model.py ├── Chapter13 ├── clustering.py ├── ensemble_cluster.py ├── ensemble_cluster_normalized.py ├── ensemble_cluster_tsne.py ├── exploratory.py └── insights.py ├── LICENSE └── README.md /Chapter01/algos_demo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Feb 15 19:37:48 2019 4 | 5 | @author: George Kyriakides 6 | ge.kyriakides@gmail.com 7 | """ 8 | 9 | 10 | 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | from mpl_toolkits.mplot3d import Axes3D 14 | from sklearn.decomposition import KernelPCA 15 | 16 | 17 | # ============================================================================= 18 | # OLS 19 | # ============================================================================= 20 | # --- SECTION 1 --- 21 | # Libraries and data loading 22 | from sklearn.datasets import load_diabetes 23 | from sklearn.linear_model import LinearRegression 24 | from sklearn import metrics 25 | diabetes = load_diabetes() 26 | 27 | 28 | # --- SECTION 2 --- 29 | # Split the data into train and test set 30 | train_x, train_y = diabetes.data[:400], diabetes.target[:400] 31 | test_x, test_y = diabetes.data[400:], diabetes.target[400:] 32 | 33 | # --- SECTION 3 --- 34 | # Instantiate, train and evaluate the model 35 | ols = LinearRegression() 36 | 
ols.fit(train_x, train_y) 37 | err = metrics.mean_squared_error(test_y, ols.predict(test_x)) 38 | r2 = metrics.r2_score(test_y, ols.predict(test_x)) 39 | 40 | # --- SECTION 4 --- 41 | # Print the model 42 | print('---OLS on diabetes dataset.---') 43 | print('Coefficients:') 44 | print('Intercept (b): %.2f'%ols.intercept_) 45 | for i in range(len(diabetes.feature_names)): 46 | print(diabetes.feature_names[i]+': %.2f'%ols.coef_[i]) 47 | print('-'*30) 48 | print('R-squared: %.2f'%r2, ' MSE: %.2f \n'%err) 49 | 50 | 51 | # ============================================================================= 52 | # LOGIT 53 | # ============================================================================= 54 | # --- SECTION 1 --- 55 | # Libraries and data loading 56 | from sklearn.linear_model import LogisticRegression 57 | from sklearn.datasets import load_breast_cancer 58 | from sklearn import metrics 59 | bc = load_breast_cancer() 60 | 61 | # --- SECTION 2 --- 62 | # Split the data into train and test set 63 | train_x, train_y = bc.data[:400], bc.target[:400] 64 | test_x, test_y = bc.data[400:], bc.target[400:] 65 | 66 | # --- SECTION 3 --- 67 | # Instantiate, train and evaluate the model 68 | logit = LogisticRegression() 69 | logit.fit(train_x, train_y) 70 | acc = metrics.accuracy_score(test_y, logit.predict(test_x)) 71 | 72 | # --- SECTION 4 --- 73 | # Print the model 74 | print('---Logistic Regression on breast cancer dataset.---') 75 | print('Coefficients:') 76 | print('Intercept (b): %.2f'%logit.intercept_) 77 | for i in range(len(bc.feature_names)): 78 | print(bc.feature_names[i]+': %.2f'%logit.coef_[0][i]) 79 | print('-'*30) 80 | print('Accuracy: %.2f \n'%acc) 81 | print(metrics.confusion_matrix(test_y, logit.predict(test_x))) 82 | 83 | # ============================================================================= 84 | # SVM FIGURE 85 | # ============================================================================= 86 | f = lambda x: 2 * x - 5 87 | f_upp = lambda x: 2 * x - 5 + 2 88 | f_lower = lambda x: 2 * x - 5 - 2 89 | 90 | pos = [] 91 | neg = [] 92 | 93 | np.random.seed(345234) 94 | for i in range(80): 95 | x = np.random.randint(15) 96 | y = np.random.randint(15) 97 | 98 | d = np.abs(2*x-y-5)/np.sqrt(2**2+1) 99 | if f(x) < y and d>=1: 100 | pos.append([x,y]) 101 | elif f(x) > y and d>=1 : 102 | neg.append([x,y]) 103 | 104 | pos.append([4, f_upp(4)]) 105 | neg.append([8, f_lower(8)]) 106 | 107 | 108 | plt.figure() 109 | plt.xticks([]) 110 | plt.yticks([]) 111 | plt.scatter(*zip(*pos)) 112 | plt.scatter(*zip(*neg)) 113 | 114 | plt.plot([0,10],[f(0),f(10)], linestyle='--', color='m') 115 | plt.plot([0,10],[f_upp(0),f_upp(10)], linestyle='--', color='red') 116 | plt.plot([0,10],[f_lower(0),f_lower(10)], linestyle='--', color='red') 117 | plt.plot([4,3],[f_lower(4),f_upp(3)], linestyle='-', color='black') 118 | plt.plot([7,6],[f_lower(7),f_upp(6)], linestyle='-', color='black') 119 | plt.xlabel('x') 120 | plt.ylabel('y') 121 | plt.title('SVM') 122 | 123 | # ============================================================================= 124 | # SVC 125 | # ============================================================================= 126 | # --- SECTION 1 --- 127 | # Libraries and data loading 128 | from sklearn.svm import SVC 129 | from sklearn.datasets import load_breast_cancer 130 | from sklearn import metrics 131 | 132 | # --- SECTION 2 --- 133 | # Split the data into train and test set 134 | train_x, train_y = bc.data[:400], bc.target[:400] 135 | test_x, test_y = bc.data[400:], 
bc.target[400:] 136 | 137 | # --- SECTION 3 --- 138 | # Instantiate, train and evaluate the model 139 | svc = SVC(kernel='linear') 140 | svc.fit(train_x, train_y) 141 | acc = metrics.accuracy_score(test_y, svc.predict(test_x)) 142 | 143 | # --- SECTION 4 --- 144 | # Print the model's accuracy 145 | print('---SVM on breast cancer dataset.---') 146 | print('Accuracy: %.2f \n'%acc) 147 | print(metrics.confusion_matrix(test_y, svc.predict(test_x))) 148 | 149 | # ============================================================================= 150 | # SVR 151 | # ============================================================================= 152 | # --- SECTION 1 --- 153 | # Libraries and data loading 154 | from sklearn.datasets import load_diabetes 155 | from sklearn.svm import SVR 156 | from sklearn import metrics 157 | diabetes = load_diabetes() 158 | 159 | 160 | # --- SECTION 2 --- 161 | # Split the data into train and test set 162 | train_x, train_y = diabetes.data[:400], diabetes.target[:400] 163 | test_x, test_y = diabetes.data[400:], diabetes.target[400:] 164 | 165 | # --- SECTION 3 --- 166 | # Instantiate, train and evaluate the model 167 | svr = SVR(kernel='linear', C=1000) 168 | svr.fit(train_x, train_y) 169 | err = metrics.mean_squared_error(test_y, svr.predict(test_x)) 170 | r2 = metrics.r2_score(test_y, svr.predict(test_x)) 171 | 172 | # --- SECTION 4 --- 173 | # Print the model 174 | print('---SVM on diabetes dataset.---') 175 | print('R-squared: %.2f'%r2, ' MSE: %.2f \n'%err) 176 | 177 | 178 | 179 | # ============================================================================= 180 | # MLP REGRESSION 181 | # ============================================================================= 182 | 183 | # --- SECTION 1 --- 184 | # Libraries and data loading 185 | from sklearn.datasets import load_diabetes 186 | from sklearn.neural_network import MLPRegressor 187 | from sklearn import metrics 188 | diabetes = load_diabetes() 189 | 190 | 191 | # --- SECTION 2 --- 192 | # Split the data into train and test set 193 | train_x, train_y = diabetes.data[:400], diabetes.target[:400] 194 | test_x, test_y = diabetes.data[400:], diabetes.target[400:] 195 | 196 | # --- SECTION 3 --- 197 | # Instantiate, train and evaluate the model 198 | mlpr = MLPRegressor(solver='sgd') 199 | mlpr.fit(train_x, train_y) 200 | err = metrics.mean_squared_error(test_y, mlpr.predict(test_x)) 201 | r2 = metrics.r2_score(test_y, mlpr.predict(test_x)) 202 | 203 | # --- SECTION 4 --- 204 | # Print the model 205 | print('---Neural Networks on diabetes dataset.---') 206 | print('R-squared: %.2f'%r2, ' MSE: %.2f \n'%err) 207 | 208 | # ============================================================================= 209 | # MLP CLASSIFICATION 210 | # ============================================================================= 211 | 212 | # --- SECTION 1 --- 213 | # Libraries and data loading 214 | from sklearn.datasets import load_breast_cancer 215 | from sklearn.neural_network import MLPClassifier 216 | from sklearn import metrics 217 | bc = load_breast_cancer() 218 | 219 | 220 | 221 | 222 | # --- SECTION 2 --- 223 | # Split the data into train and test set 224 | train_x, train_y = bc.data[:400], bc.target[:400] 225 | test_x, test_y = bc.data[400:], bc.target[400:] 226 | 227 | # --- SECTION 3 --- 228 | # Instantiate, train and evaluate the model 229 | mlpc = MLPClassifier(solver='lbfgs', random_state=12418) 230 | mlpc.fit(train_x, train_y) 231 | acc = metrics.accuracy_score(test_y, mlpc.predict(test_x)) 232 | 233 | # --- SECTION 4 --- 
234 | # Print the model's accuracy
235 | print('---Neural Networks on breast cancer dataset.---')
236 | print('Accuracy: %.2f \n'%acc)
237 | print(metrics.confusion_matrix(test_y, mlpc.predict(test_x)))
238 | 
239 | # =============================================================================
240 | # MLP REGRESSION
241 | # =============================================================================
242 | 
243 | # --- SECTION 1 ---
244 | # Libraries and data loading
245 | from sklearn.datasets import load_diabetes
246 | from sklearn.neural_network import MLPRegressor
247 | from sklearn import metrics
248 | diabetes = load_diabetes()
249 | 
250 | 
251 | # --- SECTION 2 ---
252 | # Split the data into train and test set
253 | train_x, train_y = diabetes.data[:400], diabetes.target[:400]
254 | test_x, test_y = diabetes.data[400:], diabetes.target[400:]
255 | 
256 | # --- SECTION 3 ---
257 | # Instantiate, train and evaluate the model
258 | mlpr = MLPRegressor(solver='sgd')
259 | mlpr.fit(train_x, train_y)
260 | err = metrics.mean_squared_error(test_y, mlpr.predict(test_x))
261 | r2 = metrics.r2_score(test_y, mlpr.predict(test_x))
262 | 
263 | # --- SECTION 4 ---
264 | # Print the model
265 | print('---Neural Networks on diabetes dataset.---')
266 | print('R-squared: %.2f'%r2, ' MSE: %.2f \n'%err)
267 | 
268 | # =============================================================================
269 | # DTREE REGRESSION
270 | # =============================================================================
271 | 
272 | # --- SECTION 1 ---
273 | # Libraries and data loading
274 | from sklearn.datasets import load_diabetes
275 | from sklearn.tree import DecisionTreeRegressor
276 | from sklearn import metrics
277 | diabetes = load_diabetes()
278 | 
279 | 
280 | # --- SECTION 2 ---
281 | # Split the data into train and test set
282 | train_x, train_y = diabetes.data[:400], diabetes.target[:400]
283 | test_x, test_y = diabetes.data[400:], diabetes.target[400:]
284 | 
285 | # --- SECTION 3 ---
286 | # Instantiate, train and evaluate the model
287 | dtr = DecisionTreeRegressor(max_depth=2)
288 | dtr.fit(train_x, train_y)
289 | err = metrics.mean_squared_error(test_y, dtr.predict(test_x))
290 | r2 = metrics.r2_score(test_y, dtr.predict(test_x))
291 | 
292 | # --- SECTION 4 ---
293 | # Print the model
294 | print('---Decision Tree on diabetes dataset.---')
295 | print('R-squared: %.2f'%r2, ' MSE: %.2f \n'%err)
296 | 
297 | # =============================================================================
298 | # DTREE CLASSIFICATION
299 | # =============================================================================
300 | 
301 | # --- SECTION 1 ---
302 | # Libraries and data loading
303 | from sklearn.datasets import load_breast_cancer
304 | from sklearn.tree import DecisionTreeClassifier
305 | from sklearn import metrics
306 | bc = load_breast_cancer()
307 | 
308 | 
309 | 
310 | # --- SECTION 2 ---
311 | # Split the data into train and test set
312 | train_x, train_y = bc.data[:400], bc.target[:400]
313 | test_x, test_y = bc.data[400:], bc.target[400:]
314 | 
315 | # --- SECTION 3 ---
316 | # Instantiate, train and evaluate the model
317 | dtc = DecisionTreeClassifier(max_depth=2)
318 | dtc.fit(train_x, train_y)
319 | acc = metrics.accuracy_score(test_y, dtc.predict(test_x))
320 | 
321 | # --- SECTION 4 ---
322 | # Print the model's accuracy
323 | print('---Decision Tree on breast cancer dataset.---')
324 | print('Accuracy: %.2f \n'%acc)
325 | print(metrics.confusion_matrix(test_y, dtc.predict(test_x)))
326 | from sklearn.tree import export_graphviz
327 | export_graphviz(dtc, feature_names=bc.feature_names,
328 | class_names=bc.target_names, impurity=False)
329 | 
330 | 
331 | 
332 | # =============================================================================
333 | # KNN REGRESSION
334 | # =============================================================================
335 | 
336 | # --- SECTION 1 ---
337 | # Libraries and data loading
338 | from sklearn.datasets import load_diabetes
339 | from sklearn.neighbors import KNeighborsRegressor
340 | from sklearn import metrics
341 | diabetes = load_diabetes()
342 | 
343 | 
344 | # --- SECTION 2 ---
345 | # Split the data into train and test set
346 | train_x, train_y = diabetes.data[:400], diabetes.target[:400]
347 | test_x, test_y = diabetes.data[400:], diabetes.target[400:]
348 | 
349 | # --- SECTION 3 ---
350 | # Instantiate, train and evaluate the model
351 | knnr = KNeighborsRegressor(n_neighbors=14)
352 | knnr.fit(train_x, train_y)
353 | err = metrics.mean_squared_error(test_y, knnr.predict(test_x))
354 | r2 = metrics.r2_score(test_y, knnr.predict(test_x))
355 | 
356 | # --- SECTION 4 ---
357 | # Print the model
358 | print('---KNN on diabetes dataset.---')
359 | print('R-squared: %.2f'%r2, ' MSE: %.2f \n'%err)
360 | 
361 | # =============================================================================
362 | # KNN CLASSIFICATION
363 | # =============================================================================
364 | 
365 | # --- SECTION 1 ---
366 | # Libraries and data loading
367 | from sklearn.datasets import load_breast_cancer
368 | from sklearn.neighbors import KNeighborsClassifier
369 | from sklearn import metrics
370 | bc = load_breast_cancer()
371 | 
372 | 
373 | 
374 | # --- SECTION 2 ---
375 | # Split the data into train and test set
376 | train_x, train_y = bc.data[:400], bc.target[:400]
377 | test_x, test_y = bc.data[400:], bc.target[400:]
378 | 
379 | # --- SECTION 3 ---
380 | # Instantiate, train and evaluate the model
381 | knnc = KNeighborsClassifier(n_neighbors=5)
382 | knnc.fit(train_x, train_y)
383 | acc = metrics.accuracy_score(test_y, knnc.predict(test_x))
384 | 
385 | # --- SECTION 4 ---
386 | # Print the model's accuracy
387 | print('---KNN on breast cancer dataset.---')
388 | print('Accuracy: %.2f \n'%acc)
389 | print(metrics.confusion_matrix(test_y, knnc.predict(test_x)))
390 | 
391 | 
392 | # =============================================================================
393 | # K-MEANS
394 | # =============================================================================
395 | 
396 | # --- SECTION 1 ---
397 | # Libraries and data loading
398 | from sklearn.datasets import load_breast_cancer
399 | from sklearn.cluster import KMeans
400 | bc = load_breast_cancer()
401 | 
402 | 
403 | bc.data=bc.data[:,:2]
404 | 
405 | # --- SECTION 2 ---
406 | # Instantiate and train
407 | km = KMeans(n_clusters=3)
408 | km.fit(bc.data)
409 | 
410 | # --- SECTION 3 ---
411 | # Create a point mesh to plot cluster areas
412 | 
413 | # Step size of the mesh.
414 | h = .02
415 | 
416 | # Plot the decision boundary.
For that, we will assign a color to each 417 | x_min, x_max = bc.data[:, 0].min() - 1, bc.data[:, 0].max() + 1 418 | y_min, y_max = bc.data[:, 1].min() - 1, bc.data[:, 1].max() + 1 419 | 420 | # Create the actual mesh and cluster it 421 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) 422 | Z = km.predict(np.c_[xx.ravel(), yy.ravel()]) 423 | 424 | # Put the result into a color plot 425 | Z = Z.reshape(xx.shape) 426 | plt.figure(1) 427 | plt.clf() 428 | plt.imshow(Z, interpolation='nearest', 429 | extent=(xx.min(), xx.max(), yy.min(), yy.max()), 430 | aspect='auto', origin='lower') 431 | 432 | # --- SECTION 4 --- 433 | # Plot the actual data 434 | c = km.predict(bc.data) 435 | 436 | r = c == 0 437 | b = c == 1 438 | g = c == 2 439 | 440 | 441 | plt.scatter(bc.data[r, 0], bc.data[r, 1], label='cluster 1', color='silver') 442 | plt.scatter(bc.data[b, 0], bc.data[b, 1], label='cluster 2', color='white') 443 | plt.scatter(bc.data[g, 0], bc.data[g, 1], label='cluster 3', color='black') 444 | plt.title('K-means') 445 | plt.xlim(x_min, x_max) 446 | plt.ylim(y_min, y_max) 447 | plt.xticks(()) 448 | plt.yticks(()) 449 | plt.xlabel(bc.feature_names[0]) 450 | plt.ylabel(bc.feature_names[1]) 451 | plt.show() 452 | plt.legend() -------------------------------------------------------------------------------- /Chapter01/datasets_demo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Feb 12 23:01:53 2019 4 | 5 | @author: George Kyriakides 6 | ge.kyriakides@gmail.com 7 | """ 8 | 9 | from sklearn.datasets import load_digits, load_breast_cancer, load_diabetes 10 | 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | from mpl_toolkits.mplot3d import Axes3D 14 | from sklearn.cluster import KMeans 15 | from sklearn.decomposition import KernelPCA 16 | 17 | # ============================================================================= 18 | # DATASETS 19 | # ============================================================================= 20 | diabetes = load_diabetes() 21 | bc = load_breast_cancer() 22 | digits = load_digits() 23 | images_and_labels = list(zip(digits.images, digits.target)) 24 | for index, (image, label) in enumerate(images_and_labels[10:20]): 25 | plt.subplot(2, 5, index + 1) 26 | plt.axis('off') 27 | plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest') 28 | plt.title('Target: %i' % label) 29 | 30 | 31 | # ============================================================================= 32 | # CLASSIFICATION 33 | # ============================================================================= 34 | f = lambda x: 2 * x - 5 35 | 36 | pos = [] 37 | neg = [] 38 | 39 | for i in range(30): 40 | x = np.random.randint(15) 41 | y = np.random.randint(15) 42 | 43 | if f(x) < y: 44 | pos.append([x,y]) 45 | else: 46 | neg.append([x,y]) 47 | 48 | 49 | plt.figure() 50 | plt.xticks([]) 51 | plt.yticks([]) 52 | plt.scatter(*zip(*pos)) 53 | plt.scatter(*zip(*neg)) 54 | plt.plot([0,10],[f(0),f(10)], linestyle='--', color='m') 55 | plt.xlabel('x') 56 | plt.ylabel('y') 57 | plt.title('Classification') 58 | 59 | # ============================================================================= 60 | # REGRESSION 61 | # ============================================================================= 62 | 63 | dat = [] 64 | 65 | 66 | for i in range(30): 67 | x = np.random.uniform(10) 68 | y = f(x) + np.random.uniform(-2.0,2.0) 69 | 70 | 71 | dat.append([x,y]) 72 | 73 | 74 | plt.figure() 75 | 
plt.xticks([])
76 | plt.yticks([])
77 | plt.scatter(*zip(*dat))
78 | plt.plot([0,10],[f(0),f(10)], linestyle='--', color='m')
79 | plt.xlabel('x')
80 | plt.ylabel('y')
81 | plt.title('Regression')
82 | 
83 | # =============================================================================
84 | # CLUSTERING
85 | # =============================================================================
86 | 
87 | km = KMeans(n_clusters=3)
88 | dat = []
89 | 
90 | t = 0.5
91 | 
92 | for i in range(300):
93 | 
94 | 
95 | c = np.random.randint(3)
96 | a = np.random.uniform() * 2 * 3.14
97 | r = t * np.sqrt(np.random.uniform())
98 | 
99 | x = r * np.cos(a)
100 | y = r * np.sin(a)
101 | 
102 | 
103 | dat.append([c+x, c+y])
104 | 
105 | 
106 | c = km.fit_predict(dat)
107 | plt.figure()
108 | plt.xticks([])
109 | plt.yticks([])
110 | plt.scatter(*zip(*dat),c=c)
111 | plt.xlabel('x')
112 | plt.ylabel('y')
113 | plt.title('Clustering')
114 | 
115 | 
116 | # =============================================================================
117 | # PCA
118 | # =============================================================================
119 | 
120 | from sklearn.datasets import make_circles
121 | 
122 | pca = KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=10)
123 | x, y = make_circles(n_samples=400, factor=.3, noise=.05)
124 | 
125 | 
126 | pp = pca.fit_transform(x)
127 | plt.figure()
128 | plt.xticks([])
129 | plt.yticks([])
130 | plt.scatter(pp[:,0], pp[:,1], c=y)
131 | plt.xlabel('x')
132 | plt.ylabel('y')
133 | plt.title('Kernel PCA')
134 | 
135 | # =============================================================================
136 | # TSNE
137 | # =============================================================================
138 | 
139 | from sklearn.manifold import TSNE
140 | 
141 | tsne = TSNE()
142 | 
143 | dat = tsne.fit_transform(bc.data)
144 | reds = bc.target == 0
145 | blues = bc.target == 1
146 | plt.scatter(dat[reds,0], dat[reds,1], label='malignant')
147 | plt.scatter(dat[blues,0], dat[blues,1], label='benign')
148 | plt.xlabel('1st Component')
149 | plt.ylabel('2nd Component')
150 | plt.title('Breast Cancer Data')
151 | plt.legend()
152 | 
153 | # =============================================================================
154 | # ROC
155 | # =============================================================================
156 | import numpy as np
157 | from sklearn import metrics
158 | ax1 = plt.subplot()
159 | ax1.margins(0)
160 | np.random.seed(856522)
161 | y = np.random.choice([1,2], 30)
162 | scores = np.random.choice([i/100 for i in range(0,100)], 30)
163 | fpr, tpr, thresholds = metrics.roc_curve(y, scores, pos_label=2)
164 | 
165 | x = [i/100 for i in range(0,100)]
166 | y = [i/100 for i in range(0,100)]
167 | plt.plot(x, y, linestyle='-.')
168 | plt.plot(fpr, tpr, label='ROC curve')
169 | 
170 | plt.xlabel('1 - Specificity')
171 | plt.ylabel('Sensitivity')
172 | plt.title('ROC')
173 | plt.legend()
174 | 
-------------------------------------------------------------------------------- /Chapter02/bias_variance.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sat Feb 23 19:44:13 2019
4 | 
5 | @author: George Kyriakides
6 | ge.kyriakides@gmail.com
7 | """
8 | 
9 | import matplotlib.pyplot as plt
10 | import numpy as np
11 | 
12 | np.random.seed(123456)
13 | 
14 | def f(x):
15 | return np.sin(x)
16 | 
17 | def sample(size):
18 | max_v = 20
19 | step = size/max_v
20 | x = [x/step for x in range(size)]
21 | y =
[f(x)+np.random.uniform(-0.25,0.25) for x in x] 22 | return np.array(x).reshape(-1,1), np.array(y).reshape(-1,1) 23 | 24 | 25 | # ============================================================================= 26 | # HIGH BIAS - UNDERFIT 27 | # ============================================================================= 28 | from sklearn.linear_model import LinearRegression 29 | x, y = sample(100) 30 | 31 | 32 | lr = LinearRegression() 33 | lr.fit(x, y) 34 | preds = lr.predict(x) 35 | plt.figure() 36 | plt.scatter(x, y, label='data') 37 | plt.plot(x, preds, color='orange', label='model') 38 | plt.title('Biased Model') 39 | plt.legend() 40 | 41 | # ============================================================================= 42 | # HIGH VARIANCE - OVERFIT 43 | # ============================================================================= 44 | from sklearn.tree import DecisionTreeRegressor 45 | x, y = sample(100) 46 | 47 | dt = DecisionTreeRegressor() 48 | dt.fit(x, y) 49 | plt.figure() 50 | plt.scatter(x, y, label='training data') 51 | x, y = sample(100) 52 | preds = dt.predict(x) 53 | plt.plot(x, preds, color='orange', label='model') 54 | plt.scatter(x, y, label='test data') 55 | plt.title('High Variance Model') 56 | plt.legend() 57 | 58 | 59 | # ============================================================================= 60 | # TRADEOFF 61 | # ============================================================================= 62 | def bias(complexity): 63 | return 100/complexity 64 | 65 | def variance(complexity): 66 | return np.exp(complexity/28) 67 | 68 | r = range(5, 100) 69 | 70 | variance_ = np.array([variance(x) for x in r]) 71 | bias_ = np.array([bias(x) for x in r]) 72 | sum_ = variance_ + bias_ 73 | mins = np.argmin(sum_) 74 | min_line = [mins for x in range(0, int(max(sum_)))] 75 | 76 | 77 | plt.figure() 78 | plt.plot(bias_, label=r'$bias^2$', linestyle='-') 79 | plt.plot(variance_, label='variance', linestyle=':') 80 | plt.plot(sum_, label='error', linestyle='-.') 81 | plt.plot(min_line, [x for x in range(0, int(max(sum_)))], linestyle='--') 82 | plt.title('Minimizing Error') 83 | plt.legend() 84 | 85 | 86 | # ============================================================================= 87 | # BEST MODEL 88 | # ============================================================================= 89 | from sklearn.tree import DecisionTreeRegressor 90 | x, y = sample(100) 91 | 92 | plt.figure() 93 | plt.scatter(x, y, label='training data') 94 | 95 | preds = f(x) 96 | plt.plot(x, preds, color='orange', label='model') 97 | plt.title('Perfect Model') 98 | plt.legend() -------------------------------------------------------------------------------- /Chapter02/motivation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Feb 25 23:15:52 2019 4 | 5 | @author: George Kyriakides 6 | ge.kyriakides@gmail.com 7 | """ 8 | 9 | import numpy as np 10 | from scipy.special import binom 11 | import matplotlib.pyplot as plt 12 | 13 | # ============================================================================= 14 | # ENSEMBLE SIZE - ERROR PLOT 15 | # ============================================================================= 16 | def prob(size): 17 | err = 0.15 18 | half = int(np.ceil(size/2)) 19 | s = 0 20 | for i in range(half, size): 21 | s += binom(size, i)*np.power(err,i)*np.power((1-err),(size-i)) 22 | return s 23 | 24 | 25 | probs = [15] 26 | rg = range(3,14, 2) 27 | for sz in rg: 28 | probs.append(prob(sz)*100) 
29 | print(sz, '%.2f'%(prob(sz)*100)) 30 | 31 | rg = range(1,14, 2) 32 | plt.figure() 33 | plt.bar([x for x in rg], probs) 34 | plt.title('Probability of error for ensemble') 35 | plt.xlabel('Number of base learners') 36 | plt.ylabel('Error %') 37 | plt.xticks([x for x in rg]) 38 | 39 | 40 | # ============================================================================= 41 | # VALIDATION CURVES 42 | # ============================================================================= 43 | 44 | # --- SECTION 1 --- 45 | # Libraries and data loading 46 | from sklearn.datasets import load_breast_cancer 47 | from sklearn.model_selection import validation_curve 48 | from sklearn.neighbors import KNeighborsClassifier 49 | 50 | bc = load_breast_cancer() 51 | 52 | 53 | # --- SECTION 2 --- 54 | # Create in-sample and out-of-sample scores 55 | x, y = bc.data, bc.target 56 | learner = KNeighborsClassifier() 57 | param_range = [2,3,4,5] 58 | train_scores, test_scores = validation_curve(learner, x, y, 59 | param_name='n_neighbors', 60 | param_range=param_range, 61 | cv=10, 62 | scoring="accuracy") 63 | 64 | # --- SECTION 3 --- 65 | # Calculate the average and standard deviation for each hyperparameter 66 | train_scores_mean = np.mean(train_scores, axis=1) 67 | train_scores_std = np.std(train_scores, axis=1) 68 | test_scores_mean = np.mean(test_scores, axis=1) 69 | test_scores_std = np.std(test_scores, axis=1) 70 | 71 | 72 | # --- SECTION 4 --- 73 | # Plot the scores 74 | plt.figure() 75 | plt.title('Validation curves') 76 | # Plot the standard deviations 77 | plt.fill_between(param_range, train_scores_mean - train_scores_std, 78 | train_scores_mean + train_scores_std, alpha=0.1, 79 | color="C1") 80 | plt.fill_between(param_range, test_scores_mean - test_scores_std, 81 | test_scores_mean + test_scores_std, alpha=0.1, color="C0") 82 | 83 | # Plot the means 84 | plt.plot(param_range, train_scores_mean, 'o-', color="C1", 85 | label="Training score") 86 | plt.plot(param_range, test_scores_mean, 'o-', color="C0", 87 | label="Cross-validation score") 88 | 89 | plt.xticks(param_range) 90 | plt.xlabel('Number of neighbors') 91 | plt.ylabel('Accuracy') 92 | plt.legend(loc="best") 93 | plt.show() 94 | 95 | 96 | # ============================================================================= 97 | # LEARNING CURVES 98 | # ============================================================================= 99 | 100 | # --- SECTION 1 --- 101 | # Libraries and data loading 102 | from sklearn.datasets import load_breast_cancer 103 | from sklearn.neighbors import KNeighborsClassifier 104 | from sklearn.model_selection import learning_curve 105 | bc = load_breast_cancer() 106 | 107 | 108 | # --- SECTION 2 --- 109 | # Create in-sample and out-of-sample scores 110 | x, y = bc.data, bc.target 111 | learner = KNeighborsClassifier() 112 | train_sizes = [50, 100, 150, 200, 250, 300] 113 | train_sizes, train_scores, test_scores = learning_curve(learner, x, y, 114 | train_sizes=train_sizes, 115 | cv=10) 116 | 117 | 118 | # --- SECTION 3 --- 119 | # Calculate the average and standard deviation for each hyperparameter 120 | train_scores_mean = np.mean(train_scores, axis=1) 121 | train_scores_std = np.std(train_scores, axis=1) 122 | test_scores_mean = np.mean(test_scores, axis=1) 123 | test_scores_std = np.std(test_scores, axis=1) 124 | 125 | # --- SECTION 4 --- 126 | # Plot the scores 127 | plt.figure() 128 | plt.title('Learning curves') 129 | # Plot the standard deviations 130 | plt.fill_between(train_sizes, train_scores_mean - 
train_scores_std, 131 | train_scores_mean + train_scores_std, alpha=0.1, 132 | color="C1") 133 | plt.fill_between(train_sizes, test_scores_mean - test_scores_std, 134 | test_scores_mean + test_scores_std, alpha=0.1, color="C0") 135 | 136 | # Plot the means 137 | plt.plot(train_sizes, train_scores_mean, 'o-', color="C1", 138 | label="Training score") 139 | plt.plot(train_sizes, test_scores_mean, 'o-', color="C0", 140 | label="Cross-validation score") 141 | 142 | plt.xticks(train_sizes) 143 | plt.xlabel('Size of training set (instances)') 144 | plt.ylabel('Accuracy') 145 | plt.legend(loc="best") 146 | 147 | 148 | 149 | -------------------------------------------------------------------------------- /Chapter03/custom_voting_implementation.py: -------------------------------------------------------------------------------- 1 | # --- SECTION 1 --- 2 | # Import the required libraries 3 | from sklearn import datasets, linear_model, svm, neighbors 4 | from sklearn.metrics import accuracy_score 5 | from numpy import argmax 6 | # Load the dataset 7 | breast_cancer = datasets.load_breast_cancer() 8 | x, y = breast_cancer.data, breast_cancer.target 9 | 10 | # --- SECTION 2 --- 11 | # Instantiate the learners (classifiers) 12 | learner_1 = neighbors.KNeighborsClassifier(n_neighbors=5) 13 | learner_2 = linear_model.Perceptron(tol=1e-2, random_state=0) 14 | learner_3 = svm.SVC(gamma=0.001) 15 | 16 | # --- SECTION 3 --- 17 | # Split the train and test samples 18 | test_samples = 100 19 | x_train, y_train = x[:-test_samples], y[:-test_samples] 20 | x_test, y_test = x[-test_samples:], y[-test_samples:] 21 | 22 | # Fit learners with the train data 23 | learner_1.fit(x_train, y_train) 24 | learner_2.fit(x_train, y_train) 25 | learner_3.fit(x_train, y_train) 26 | 27 | # --- SECTION 4 --- 28 | # Each learner predicts the classes of the test data 29 | predictions_1 = learner_1.predict(x_test) 30 | predictions_2 = learner_2.predict(x_test) 31 | predictions_3 = learner_3.predict(x_test) 32 | 33 | # --- SECTION 5 --- 34 | # We combine the predictions with hard voting 35 | hard_predictions = [] 36 | # For each predicted sample 37 | for i in range(test_samples): 38 | # Count the votes for each class 39 | counts = [0 for _ in range(2)] 40 | counts[predictions_1[i]] = counts[predictions_1[i]]+1 41 | counts[predictions_2[i]] = counts[predictions_2[i]]+1 42 | counts[predictions_3[i]] = counts[predictions_3[i]]+1 43 | # Find the class with most votes 44 | final = argmax(counts) 45 | # Add the class to the final predictions 46 | hard_predictions.append(final) 47 | 48 | # --- SECTION 6 --- 49 | # Accuracies of base learners 50 | print('L1:', accuracy_score(y_test, predictions_1)) 51 | print('L2:', accuracy_score(y_test, predictions_2)) 52 | print('L3:', accuracy_score(y_test, predictions_3)) 53 | # Accuracy of hard voting 54 | print('-'*30) 55 | print('Hard Voting:', accuracy_score(y_test, hard_predictions)) 56 | -------------------------------------------------------------------------------- /Chapter03/custom_voting_implementation_analysis.py: -------------------------------------------------------------------------------- 1 | # --- SECTION 1 --- 2 | # Import the required libraries 3 | import matplotlib as mpl 4 | import matplotlib.pyplot as plt 5 | mpl.style.use('seaborn-paper') 6 | 7 | # --- SECTION 2 --- 8 | # Calculate the errors 9 | errors_1 = y_test-predictions_1 10 | errors_2 = y_test-predictions_2 11 | errors_3 = y_test-predictions_3 12 | 13 | 14 | # --- SECTION 3 --- 15 | # Discard correct predictions and plot 
each learner's errors 16 | x=[] 17 | y=[] 18 | for i in range(len(errors_1)): 19 | if not errors_1[i] == 0: 20 | x.append(i) 21 | y.append(errors_1[i]) 22 | plt.scatter(x, y, s=120, label='Learner 1 Errors') 23 | 24 | x=[] 25 | y=[] 26 | for i in range(len(errors_2)): 27 | if not errors_2[i] == 0: 28 | x.append(i) 29 | y.append(errors_2[i]) 30 | plt.scatter(x, y, marker='x', s=60, label='Learner 2 Errors') 31 | 32 | x=[] 33 | y=[] 34 | for i in range(len(errors_3)): 35 | if not errors_3[i] == 0: 36 | x.append(i) 37 | y.append(errors_3[i]) 38 | plt.scatter(x, y, s=20, label='Learner 3 Errors') 39 | 40 | plt.title('Learner errors') 41 | plt.xlabel('Test sample') 42 | plt.ylabel('Error') 43 | plt.legend() 44 | plt.show() -------------------------------------------------------------------------------- /Chapter03/scikit_hard_voting.py: -------------------------------------------------------------------------------- 1 | # --- SECTION 1 --- 2 | # Import the required libraries 3 | from sklearn import datasets, linear_model, svm, neighbors 4 | from sklearn.ensemble import VotingClassifier 5 | from sklearn.metrics import accuracy_score 6 | # Load the dataset 7 | breast_cancer = datasets.load_breast_cancer() 8 | x, y = breast_cancer.data, breast_cancer.target 9 | 10 | # Split the train and test samples 11 | test_samples = 100 12 | x_train, y_train = x[:-test_samples], y[:-test_samples] 13 | x_test, y_test = x[-test_samples:], y[-test_samples:] 14 | 15 | 16 | # --- SECTION 2 --- 17 | # Instantiate the learners (classifiers) 18 | learner_1 = neighbors.KNeighborsClassifier(n_neighbors=5) 19 | learner_2 = linear_model.Perceptron(tol=1e-2, random_state=0) 20 | learner_3 = svm.SVC(gamma=0.001) 21 | 22 | # --- SECTION 3 --- 23 | # Instantiate the voting classifier 24 | voting = VotingClassifier([('KNN', learner_1), 25 | ('Prc', learner_2), 26 | ('SVM', learner_3)]) 27 | 28 | 29 | # --- SECTION 4 --- 30 | # Fit classifier with the training data 31 | voting.fit(x_train, y_train) 32 | 33 | # --- SECTION 5 --- 34 | # Predict the most voted class 35 | hard_predictions = voting.predict(x_test) 36 | 37 | # --- SECTION 6 --- 38 | # Accuracy of hard voting 39 | print('-'*30) 40 | print('Hard Voting:', accuracy_score(y_test, hard_predictions)) 41 | -------------------------------------------------------------------------------- /Chapter03/scikit_soft_voting.py: -------------------------------------------------------------------------------- 1 | # --- SECTION 1 --- 2 | # Import the required libraries 3 | from sklearn import datasets, naive_bayes, svm, neighbors 4 | from sklearn.ensemble import VotingClassifier 5 | from sklearn.metrics import accuracy_score 6 | # Load the dataset 7 | breast_cancer = datasets.load_breast_cancer() 8 | x, y = breast_cancer.data, breast_cancer.target 9 | 10 | # Split the train and test samples 11 | test_samples = 100 12 | x_train, y_train = x[:-test_samples], y[:-test_samples] 13 | x_test, y_test = x[-test_samples:], y[-test_samples:] 14 | 15 | # --- SECTION 2 --- 16 | # Instantiate the learners (classifiers) 17 | learner_1 = neighbors.KNeighborsClassifier(n_neighbors=5) 18 | learner_2 = naive_bayes.GaussianNB() 19 | learner_3 = svm.SVC(gamma=0.001, probability=True) 20 | 21 | # --- SECTION 3 --- 22 | # Instantiate the voting classifier 23 | voting = VotingClassifier([('KNN', learner_1), 24 | ('NB', learner_2), 25 | ('SVM', learner_3)], 26 | voting='soft') 27 | 28 | 29 | 30 | 31 | # --- SECTION 4 --- 32 | # Fit classifier with the training data 33 | voting.fit(x_train, y_train) 34 | 
learner_1.fit(x_train, y_train)
35 | learner_2.fit(x_train, y_train)
36 | learner_3.fit(x_train, y_train)
37 | 
38 | # --- SECTION 5 ---
39 | # Predict the most probable class
40 | hard_predictions = voting.predict(x_test)
41 | 
42 | # --- SECTION 6 ---
43 | # Get the base learner predictions
44 | predictions_1 = learner_1.predict(x_test)
45 | predictions_2 = learner_2.predict(x_test)
46 | predictions_3 = learner_3.predict(x_test)
47 | 
48 | # --- SECTION 7 ---
49 | # Accuracies of base learners
50 | print('L1:', accuracy_score(y_test, predictions_1))
51 | print('L2:', accuracy_score(y_test, predictions_2))
52 | print('L3:', accuracy_score(y_test, predictions_3))
53 | # Accuracy of soft voting
54 | print('-'*30)
55 | print('Soft Voting:', accuracy_score(y_test, hard_predictions))
56 | 
57 | 
58 | 
-------------------------------------------------------------------------------- /Chapter03/scikit_soft_voting_2knn.py: --------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Import the required libraries
3 | from sklearn import datasets, naive_bayes, svm, neighbors
4 | from sklearn.ensemble import VotingClassifier
5 | from sklearn.metrics import accuracy_score
6 | # Load the dataset
7 | breast_cancer = datasets.load_breast_cancer()
8 | x, y = breast_cancer.data, breast_cancer.target
9 | 
10 | # Split the train and test samples
11 | test_samples = 100
12 | x_train, y_train = x[:-test_samples], y[:-test_samples]
13 | x_test, y_test = x[-test_samples:], y[-test_samples:]
14 | 
15 | # --- SECTION 2 ---
16 | # Instantiate the learners (classifiers)
17 | learner_1 = neighbors.KNeighborsClassifier(n_neighbors=5)
18 | learner_2 = naive_bayes.GaussianNB()
19 | learner_3 = neighbors.KNeighborsClassifier(n_neighbors=50)
20 | 
21 | # --- SECTION 3 ---
22 | # Instantiate the voting classifier
23 | voting = VotingClassifier([('5NN', learner_1),
24 | ('NB', learner_2),
25 | ('50NN', learner_3)],
26 | voting='soft')
27 | 
28 | 
29 | 
30 | 
31 | # --- SECTION 4 ---
32 | # Fit classifier with the training data
33 | voting.fit(x_train, y_train)
34 | learner_1.fit(x_train, y_train)
35 | learner_2.fit(x_train, y_train)
36 | learner_3.fit(x_train, y_train)
37 | 
38 | # --- SECTION 5 ---
39 | # Predict the most probable class
40 | hard_predictions = voting.predict(x_test)
41 | 
42 | # --- SECTION 6 ---
43 | # Get the base learner predictions
44 | predictions_1 = learner_1.predict(x_test)
45 | predictions_2 = learner_2.predict(x_test)
46 | predictions_3 = learner_3.predict(x_test)
47 | 
48 | # --- SECTION 7 ---
49 | # Accuracies of base learners
50 | print('L1:', accuracy_score(y_test, predictions_1))
51 | print('L2:', accuracy_score(y_test, predictions_2))
52 | print('L3:', accuracy_score(y_test, predictions_3))
53 | # Accuracy of soft voting
54 | print('-'*30)
55 | print('Soft Voting:', accuracy_score(y_test, hard_predictions))
56 | 
57 | # --- SECTION 1 ---
58 | # Import the required libraries
59 | import matplotlib as mpl
60 | import matplotlib.pyplot as plt
61 | mpl.style.use('seaborn-paper')
62 | 
63 | 
64 | # --- SECTION 2 ---
65 | # Get the wrongly predicted instances
66 | # and the predicted probabilities for the whole test set
67 | errors = y_test-hard_predictions
68 | 
69 | probabilities_1 = learner_1.predict_proba(x_test)
70 | probabilities_2 = learner_2.predict_proba(x_test)
71 | probabilities_3 = learner_3.predict_proba(x_test)
72 | 
73 | 
74 | # --- SECTION 2 ---
75 | # Store the predicted probability for
76 | # each wrongly predicted instance, for each base learner
77 | # as well as the average predicted probability
78 | #
79 | x=[]
80 | y_1=[]
81 | y_2=[]
82 | y_3=[]
83 | y_avg=[]
84 | 
85 | for i in range(len(errors)):
86 | if not errors[i] == 0:
87 | x.append(i)
88 | y_1.append(probabilities_1[i][0])
89 | y_2.append(probabilities_2[i][0])
90 | y_3.append(probabilities_3[i][0])
91 | y_avg.append((probabilities_1[i][0]+probabilities_2[i][0]+probabilities_3[i][0])/3)
92 | 
93 | # --- SECTION 3 ---
94 | # Plot the predicted probability of each base learner as
95 | # a bar and the average probability as an X
96 | plt.bar(x, y_1, 3, label='5NN')
97 | plt.bar(x, y_2, 2, label='NB')
98 | plt.bar(x, y_3, 1, label='50NN')
99 | plt.scatter(x, y_avg, marker='x', c='k', s=150, label='Average Positive', zorder=10)
100 | 
101 | y = [0.5 for x in range(len(errors))]
102 | plt.plot(y, c='k', linestyle='--')
103 | 
104 | plt.title('Positive Probability')
105 | plt.xlabel('Test sample')
106 | plt.ylabel('probability')
107 | plt.legend()
108 | 
109 | 
110 | 
111 | 
112 | 
113 | 
114 | 
-------------------------------------------------------------------------------- /Chapter03/scikit_soft_voting_analysis.py: --------------------------------------------------------------------------------
1 | # --- SECTION 1 ---
2 | # Import the required libraries
3 | import matplotlib as mpl
4 | import matplotlib.pyplot as plt
5 | mpl.style.use('seaborn-paper')
6 | 
7 | 
8 | # --- SECTION 2 ---
9 | # Get the wrongly predicted instances
10 | # and the predicted probabilities for the whole test set
11 | errors = y_test-hard_predictions
12 | 
13 | probabilities_1 = learner_1.predict_proba(x_test)
14 | probabilities_2 = learner_2.predict_proba(x_test)
15 | probabilities_3 = learner_3.predict_proba(x_test)
16 | 
17 | 
18 | # --- SECTION 2 ---
19 | # Store the predicted probability for
20 | # each wrongly predicted instance, for each base learner
21 | # as well as the average predicted probability
22 | #
23 | x=[]
24 | y_1=[]
25 | y_2=[]
26 | y_3=[]
27 | y_avg=[]
28 | 
29 | for i in range(len(errors)):
30 | if not errors[i] == 0:
31 | x.append(i)
32 | y_1.append(probabilities_1[i][0])
33 | y_2.append(probabilities_2[i][0])
34 | y_3.append(probabilities_3[i][0])
35 | y_avg.append((probabilities_1[i][0]+probabilities_2[i][0]+probabilities_3[i][0])/3)
36 | 
37 | # --- SECTION 3 ---
38 | # Plot the predicted probability of each base learner as
39 | # a bar and the average probability as an X
40 | plt.bar(x, y_1, 3, label='KNN')
41 | plt.bar(x, y_2, 2, label='NB')
42 | plt.bar(x, y_3, 1, label='SVM')
43 | plt.scatter(x, y_avg, marker='x', c='k', s=150, label='Average Positive', zorder=10)
44 | 
45 | y = [0.5 for x in range(len(errors))]
46 | plt.plot(y, c='k', linestyle='--')
47 | 
48 | plt.title('Positive Probability')
49 | plt.xlabel('Test sample')
50 | plt.ylabel('probability')
51 | plt.legend()
52 | plt.show() -------------------------------------------------------------------------------- /Chapter04/linear_nonlinear_example.py: --------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | 
4 | x = [i for i in range(100)]
5 | y = [5 for i in range(100)]
6 | 
7 | for i in range(30, 60):
8 | y[i] = 4+((i-45)**2)/230
9 | 
10 | for i in range(100):
11 | y[i] = y[i] + np.random.uniform(-0.03, 0.03)
12 | 
13 | plt.scatter(x, y, label='Data')
14 | 
15 | y = [5 for i in range(100)]
16 | 
17 | for i in range(20, 70):
18 | y[i] = 4+((i-45)**2)/230
19 | 
20 | plt.plot([5 for i in range(100)], label='Linear $y=5$', color='C1')
21 | plt.plot(x[20:70],
y[20:70], label='Non-Linear $y=x^2$', color='C2') 22 | plt.xlabel('x') 23 | plt.ylabel('y') 24 | plt.title('Linear and Non-Linear Relationships') 25 | plt.legend() 26 | 27 | -------------------------------------------------------------------------------- /Chapter04/stacking_classification.py: -------------------------------------------------------------------------------- 1 | # --- SECTION 1 --- 2 | # Libraries and data loading 3 | from sklearn.datasets import load_breast_cancer 4 | from sklearn.neighbors import KNeighborsClassifier 5 | from sklearn.tree import DecisionTreeClassifier 6 | from sklearn.neural_network import MLPClassifier 7 | from sklearn.naive_bayes import GaussianNB 8 | from sklearn.linear_model import LogisticRegression 9 | from sklearn.model_selection import KFold 10 | from sklearn import metrics 11 | import numpy as np 12 | bc = load_breast_cancer() 13 | 14 | 15 | train_x, train_y = bc.data[:400], bc.target[:400] 16 | test_x, test_y = bc.data[400:], bc.target[400:] 17 | 18 | # --- SECTION 2 --- 19 | # Create the ensemble's base learners and meta learner 20 | # Append base learners to a list for ease of access 21 | base_learners = [] 22 | 23 | knn = KNeighborsClassifier(n_neighbors=2) 24 | base_learners.append(knn) 25 | 26 | dtr = DecisionTreeClassifier(max_depth=4, random_state=123456) 27 | base_learners.append(dtr) 28 | 29 | mlpc = MLPClassifier(hidden_layer_sizes =(100, ), solver='lbfgs', random_state=123456) 30 | base_learners.append(mlpc) 31 | 32 | 33 | meta_learner = LogisticRegression(solver='lbfgs') 34 | 35 | 36 | # --- SECTION 3 --- 37 | # Create the training meta data 38 | 39 | # Create variables to store meta data and their targets 40 | meta_data = np.zeros((len(base_learners), len(train_x))) 41 | meta_targets = np.zeros(len(train_x)) 42 | 43 | # Create the cross-validation folds 44 | KF = KFold(n_splits=5) 45 | meta_index = 0 46 | for train_indices, test_indices in KF.split(train_x): 47 | # Train each learner on the K-1 folds and create meta data for the Kth fold 48 | for i in range(len(base_learners)): 49 | learner = base_learners[i] 50 | 51 | learner.fit(train_x[train_indices], train_y[train_indices]) 52 | predictions = learner.predict_proba(train_x[test_indices])[:,0] 53 | 54 | meta_data[i][meta_index:meta_index+len(test_indices)] = predictions 55 | 56 | meta_targets[meta_index:meta_index+len(test_indices)] = train_y[test_indices] 57 | meta_index += len(test_indices) 58 | 59 | # Transpose the meta data to be fed into the meta learner 60 | meta_data = meta_data.transpose() 61 | 62 | # --- SECTION 4 --- 63 | # Create the meta data for the test set and evaluate the base learners 64 | test_meta_data = np.zeros((len(base_learners), len(test_x))) 65 | base_acc = [] 66 | for i in range(len(base_learners)): 67 | learner = base_learners[i] 68 | learner.fit(train_x, train_y) 69 | predictions = learner.predict_proba(test_x)[:,0] 70 | test_meta_data[i] = predictions 71 | 72 | acc = metrics.accuracy_score(test_y, learner.predict(test_x)) 73 | 74 | 75 | base_acc.append(acc) 76 | 77 | test_meta_data = test_meta_data.transpose() 78 | 79 | # --- SECTION 5 --- 80 | # Fit the meta learner on the train set and evaluate it on the test set 81 | meta_learner.fit(meta_data, meta_targets) 82 | ensemble_predictions = meta_learner.predict(test_meta_data) 83 | 84 | acc = metrics.accuracy_score(test_y, ensemble_predictions) 85 | 86 | # --- SECTION 6 --- 87 | # Print the results 88 | print('Acc Name') 89 | print('-'*20) 90 | for i in range(len(base_learners)): 91 | learner = 
base_learners[i] 92 | 93 | print(f'{base_acc[i]:.2f} {learner.__class__.__name__}') 94 | print(f'{acc:.2f} Ensemble') 95 | -------------------------------------------------------------------------------- /Chapter04/stacking_classification_analysis.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Mar 18 20:19:55 2019 4 | 5 | @author: George Kyriakides 6 | ge.kyriakides@gmail.com 7 | """ 8 | 9 | base_errors = (test_meta_data.transpose() - test_y).transpose() 10 | prediction_errors = ensemble_predictions - test_y 11 | 12 | for i in range(len(prediction_errors)): 13 | if not prediction_errors[i] == 0.0: 14 | print(base_errors[i,:]) -------------------------------------------------------------------------------- /Chapter04/stacking_classifiers.py: -------------------------------------------------------------------------------- 1 | # --- SECTION 1 --- 2 | # Libraries 3 | import numpy as np 4 | 5 | from sklearn.model_selection import KFold 6 | from copy import deepcopy 7 | 8 | 9 | class StackingRegressor(): 10 | 11 | # --- SECTION 2 --- 12 | # The constructor 13 | def __init__(self, learners): 14 | # Create a list of sizes for each stacking level 15 | # And a list of deep copied learners 16 | self.level_sizes = [] 17 | self.learners = [] 18 | for learning_level in learners: 19 | 20 | self.level_sizes.append(len(learning_level)) 21 | level_learners = [] 22 | for learner in learning_level: 23 | level_learners.append(deepcopy(learner)) 24 | self.learners.append(level_learners) 25 | 26 | 27 | 28 | # --- SECTION 3 --- 29 | # The fit function. Creates training meta data for every level and trains 30 | # each level on the previous level's meta data 31 | def fit(self, x, y): 32 | # Create a list of training meta data, one for each stacking level 33 | # and another one for the targets. For the first level, the actual data 34 | # is used. 35 | meta_data = [x] 36 | meta_targets = [y] 37 | for i in range(len(self.learners)): 38 | level_size = self.level_sizes[i] 39 | 40 | # Create the meta data and target variables for this level 41 | data_z = np.zeros((level_size, len(x))) 42 | target_z = np.zeros(len(x)) 43 | 44 | train_x = meta_data[i] 45 | train_y = meta_targets[i] 46 | 47 | # Create the cross-validation folds 48 | KF = KFold(n_splits=5) 49 | meta_index = 0 50 | for train_indices, test_indices in KF.split(x): 51 | # Train each learner on the K-1 folds and create 52 | # meta data for the Kth fold 53 | for j in range(len(self.learners[i])): 54 | 55 | learner = self.learners[i][j] 56 | learner.fit(train_x[train_indices], train_y[train_indices]) 57 | predictions = learner.predict(train_x[test_indices]) 58 | 59 | data_z[j][meta_index:meta_index+len(test_indices)] = predictions 60 | 61 | target_z[meta_index:meta_index+len(test_indices)] = train_y[test_indices] 62 | meta_index += len(test_indices) 63 | 64 | # Add the data and targets to the meta data lists 65 | data_z = data_z.transpose() 66 | meta_data.append(data_z) 67 | meta_targets.append(target_z) 68 | 69 | 70 | # Train the learner on the whole previous meta data 71 | for learner in self.learners[i]: 72 | learner.fit(train_x, train_y) 73 | 74 | 75 | 76 | 77 | 78 | 79 | # --- SECTION 4 --- 80 | # The predict function. Creates meta data for the test data and returns 81 | # all of them. 
The actual predictions can be accessed with meta_data[-1] 82 | def predict(self, x): 83 | 84 | # Create a list of training meta data, one for each stacking level 85 | meta_data = [x] 86 | for i in range(len(self.learners)): 87 | level_size = self.level_sizes[i] 88 | 89 | data_z = np.zeros((level_size, len(x))) 90 | 91 | test_x = meta_data[i] 92 | 93 | # Create the cross-validation folds 94 | KF = KFold(n_splits=5) 95 | for train_indices, test_indices in KF.split(x): 96 | # Train each learner on the K-1 folds and create 97 | # meta data for the Kth fold 98 | for j in range(len(self.learners[i])): 99 | 100 | learner = self.learners[i][j] 101 | predictions = learner.predict(test_x) 102 | data_z[j] = predictions 103 | 104 | 105 | 106 | # Add the data and targets to the meta data lists 107 | data_z = data_z.transpose() 108 | meta_data.append(data_z) 109 | 110 | # Return the meta_data the final layer's prediction can be accessed 111 | # With meta_data[-1] 112 | return meta_data 113 | 114 | 115 | 116 | # --- SECTION 5 --- 117 | # Use the classifier 118 | from sklearn.datasets import load_diabetes 119 | from sklearn.neighbors import KNeighborsRegressor 120 | from sklearn.tree import DecisionTreeRegressor 121 | from sklearn.linear_model import LinearRegression, Ridge 122 | from sklearn import metrics 123 | diabetes = load_diabetes() 124 | 125 | train_x, train_y = diabetes.data[:400], diabetes.target[:400] 126 | test_x, test_y = diabetes.data[400:], diabetes.target[400:] 127 | 128 | base_learners = [] 129 | 130 | knn = KNeighborsRegressor(n_neighbors=5) 131 | base_learners.append(knn) 132 | 133 | dtr = DecisionTreeRegressor(max_depth=4, random_state=123456) 134 | base_learners.append(dtr) 135 | 136 | ridge = Ridge() 137 | base_learners.append(ridge) 138 | 139 | meta_learner = LinearRegression() 140 | 141 | # Instantiate the stacking regressor 142 | sc = StackingRegressor([[knn,dtr,ridge],[meta_learner]]) 143 | 144 | # Fit and predict 145 | sc.fit(train_x, train_y) 146 | meta_data = sc.predict(test_x) 147 | 148 | # Evaluate base learners and meta learner 149 | base_errors = [] 150 | base_r2 = [] 151 | for i in range(len(base_learners)): 152 | learner = base_learners[i] 153 | 154 | predictions = meta_data[1][:,i] 155 | 156 | err = metrics.mean_squared_error(test_y, predictions) 157 | r2 = metrics.r2_score(test_y, predictions) 158 | 159 | base_errors.append(err) 160 | base_r2.append(r2) 161 | 162 | err = metrics.mean_squared_error(test_y, meta_data[-1]) 163 | r2 = metrics.r2_score(test_y, meta_data[-1]) 164 | 165 | # Print the results 166 | print('ERROR R2 Name') 167 | print('-'*20) 168 | for i in range(len(base_learners)): 169 | learner = base_learners[i] 170 | 171 | print(f'{base_errors[i]:.1f} {base_r2[i]:.2f} {learner.__class__.__name__}') 172 | print(f'{err:.1f} {r2:.2f} Ensemble') 173 | -------------------------------------------------------------------------------- /Chapter04/stacking_regression.py: -------------------------------------------------------------------------------- 1 | # --- SECTION 1 --- 2 | # Libraries and data loading 3 | from sklearn.datasets import load_diabetes 4 | from sklearn.neighbors import KNeighborsRegressor 5 | from sklearn.tree import DecisionTreeRegressor 6 | from sklearn.linear_model import LinearRegression, Ridge 7 | from sklearn.model_selection import KFold 8 | from sklearn import metrics 9 | import numpy as np 10 | diabetes = load_diabetes() 11 | 12 | train_x, train_y = diabetes.data[:400], diabetes.target[:400] 13 | test_x, test_y = diabetes.data[400:], 
diabetes.target[400:] 14 | 15 | # --- SECTION 2 --- 16 | # Create the ensemble's base learners and meta learner 17 | # Append base learners to a list for ease of access 18 | base_learners = [] 19 | 20 | knn = KNeighborsRegressor(n_neighbors=5) 21 | base_learners.append(knn) 22 | 23 | dtr = DecisionTreeRegressor(max_depth=4, random_state=123456) 24 | base_learners.append(dtr) 25 | 26 | ridge = Ridge() 27 | base_learners.append(ridge) 28 | 29 | meta_learner = LinearRegression() 30 | 31 | 32 | # --- SECTION 3 --- 33 | # Create the training meta data 34 | 35 | # Create variables to store meta data and their targets 36 | meta_data = np.zeros((len(base_learners), len(train_x))) 37 | meta_targets = np.zeros(len(train_x)) 38 | 39 | # Create the cross-validation folds 40 | KF = KFold(n_splits=5) 41 | meta_index = 0 42 | for train_indices, test_indices in KF.split(train_x): 43 | # Train each learner on the K-1 folds and create meta data for the Kth fold 44 | for i in range(len(base_learners)): 45 | learner = base_learners[i] 46 | 47 | learner.fit(train_x[train_indices], train_y[train_indices]) 48 | predictions = learner.predict(train_x[test_indices]) 49 | 50 | meta_data[i][meta_index:meta_index+len(test_indices)] = predictions 51 | 52 | meta_targets[meta_index:meta_index+len(test_indices)] = train_y[test_indices] 53 | meta_index += len(test_indices) 54 | 55 | # Transpose the meta data to be fed into the meta learner 56 | meta_data = meta_data.transpose() 57 | 58 | # --- SECTION 4 --- 59 | # Create the meta data for the test set and evaluate the base learners 60 | test_meta_data = np.zeros((len(base_learners), len(test_x))) 61 | base_errors = [] 62 | base_r2 = [] 63 | for i in range(len(base_learners)): 64 | learner = base_learners[i] 65 | learner.fit(train_x, train_y) 66 | predictions = learner.predict(test_x) 67 | test_meta_data[i] = predictions 68 | 69 | err = metrics.mean_squared_error(test_y, predictions) 70 | r2 = metrics.r2_score(test_y, predictions) 71 | 72 | base_errors.append(err) 73 | base_r2.append(r2) 74 | 75 | test_meta_data = test_meta_data.transpose() 76 | 77 | # --- SECTION 5 --- 78 | # Fit the meta learner on the train set and evaluate it on the test set 79 | meta_learner.fit(meta_data, meta_targets) 80 | ensemble_predictions = meta_learner.predict(test_meta_data) 81 | 82 | err = metrics.mean_squared_error(test_y, ensemble_predictions) 83 | r2 = metrics.r2_score(test_y, ensemble_predictions) 84 | 85 | # --- SECTION 6 --- 86 | # Print the results 87 | print('ERROR R2 Name') 88 | print('-'*20) 89 | for i in range(len(base_learners)): 90 | learner = base_learners[i] 91 | 92 | print(f'{base_errors[i]:.1f} {base_r2[i]:.2f} {learner.__class__.__name__}') 93 | print(f'{err:.1f} {r2:.2f} Ensemble') 94 | -------------------------------------------------------------------------------- /Chapter04/stacking_regression_analysis.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | knn_d = test_meta_data[:,0]-test_y 4 | dtr_d = test_meta_data[:,1]-test_y 5 | ridge_d = test_meta_data[:,2]-test_y 6 | meta_d = ensemble_predictions-test_y 7 | 8 | plt.plot(knn_d, label='KNN') 9 | plt.plot(dtr_d, label='DTree') 10 | plt.plot(ridge_d, label='Ridge') 11 | plt.plot(meta_d, label='Ensemble') 12 | plt.legend() -------------------------------------------------------------------------------- /Chapter05/bagging_custom.py: -------------------------------------------------------------------------------- 1 | # --- SECTION 1 --- 2 | # 
Libraries and data loading 3 | from sklearn.datasets import load_digits 4 | from sklearn.tree import DecisionTreeClassifier 5 | from sklearn import metrics 6 | import numpy as np 7 | import time 8 | 9 | start = time.time() 10 | 11 | digits = load_digits() 12 | 13 | train_size = 1500 14 | train_x, train_y = digits.data[:train_size], digits.target[:train_size] 15 | test_x, test_y = digits.data[train_size:], digits.target[train_size:] 16 | 17 | # --- SECTION 2 --- 18 | # Create our bootstrap samples and train the classifiers 19 | 20 | ensemble_size = 10 21 | base_learners = [] 22 | 23 | for _ in range(ensemble_size): 24 | # We sample indices in order to access features and targets 25 | bootstrap_sample_indices = np.random.randint(0, train_size, size=train_size) 26 | bootstrap_x = train_x[bootstrap_sample_indices] 27 | bootstrap_y = train_y[bootstrap_sample_indices] 28 | dtree = DecisionTreeClassifier() 29 | dtree.fit(bootstrap_x, bootstrap_y) 30 | base_learners.append(dtree) 31 | 32 | # --- SECTION 3 --- 33 | # Predict with the base learners and evaluate them 34 | base_predictions = [] 35 | base_accuracy = [] 36 | for learner in base_learners: 37 | predictions = learner.predict(test_x) 38 | base_predictions.append(predictions) 39 | acc = metrics.accuracy_score(test_y, predictions) 40 | base_accuracy.append(acc) 41 | 42 | # --- SECTION 4 --- 43 | # Combine the base learners' predictions 44 | 45 | ensemble_predictions = [] 46 | # Find the most voted class for each test instance 47 | for i in range(len(test_y)): 48 | # Count the votes for each class 49 | counts = [0 for _ in range(10)] 50 | for learner_predictions in base_predictions: 51 | counts[learner_predictions[i]] = counts[learner_predictions[i]]+1 52 | 53 | # Find the class with most votes 54 | final = np.argmax(counts) 55 | # Add the class to the final predictions 56 | ensemble_predictions.append(final) 57 | 58 | ensemble_acc = metrics.accuracy_score(test_y, ensemble_predictions) 59 | 60 | end = time.time() 61 | 62 | 63 | # --- SECTION 5 --- 64 | # Print the accuracies 65 | print('Base Learners:') 66 | print('-'*30) 67 | for index, acc in enumerate(sorted(base_accuracy)): 68 | print(f'Learner {index+1}: %.2f' % acc) 69 | print('-'*30) 70 | print('Bagging: %.2f' % ensemble_acc) 71 | 72 | print('Total time: %.2f' % (end - start)) 73 | -------------------------------------------------------------------------------- /Chapter05/bagging_custom_parallel.py: -------------------------------------------------------------------------------- 1 | # --- SECTION 1 --- 2 | # Libraries 3 | from sklearn.datasets import load_digits 4 | from sklearn.tree import DecisionTreeClassifier 5 | from sklearn import metrics 6 | import numpy as np 7 | import time 8 | 9 | 10 | from concurrent.futures import ProcessPoolExecutor 11 | 12 | # --- SECTION 2 --- 13 | # Define required functions 14 | train_size = 1500 15 | 16 | 17 | def create_learner(train_x, train_y): 18 | # We sample indices in order to access features and targets 19 | bootstrap_sample_indices = np.random.randint(0, train_size, size=train_size) 20 | bootstrap_x = train_x[bootstrap_sample_indices] 21 | bootstrap_y = train_y[bootstrap_sample_indices] 22 | dtree = DecisionTreeClassifier() 23 | dtree.fit(bootstrap_x, bootstrap_y) 24 | return dtree 25 | 26 | 27 | def predict(learner, test_x): 28 | return learner.predict(test_x) 29 | 30 | 31 | # --- SECTION 3 --- 32 | # Protect our main 33 | if __name__ == '__main__': 34 | 35 | start = time.time() 36 | digits = load_digits() 37 | 38 | train_x, train_y = 
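# The per-instance vote counting in bagging_custom.py above (and repeated in
# the parallel version below) can also be expressed with numpy. A minimal
# sketch, assuming base_predictions is the list of per-learner prediction
# arrays from that script and that the digits dataset has 10 classes:
import numpy as np
votes = np.array(base_predictions)          # shape: (ensemble_size, n_test)
ensemble_predictions = [np.argmax(np.bincount(votes[:, i], minlength=10))
                        for i in range(votes.shape[1])]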
digits.data[:train_size], digits.target[:train_size] 39 | test_x, test_y = digits.data[train_size:], digits.target[train_size:] 40 | 41 | ensemble_size = 1000 42 | base_learners = [] 43 | 44 | # --- SECTION 4 --- 45 | # Create the base learners 46 | with ProcessPoolExecutor() as executor: 47 | futures = [] 48 | for _ in range(ensemble_size): 49 | future = executor.submit(create_learner, train_x, train_y) 50 | futures.append(future) 51 | 52 | for future in futures: 53 | base_learners.append(future.result()) 54 | 55 | # --- SECTION 5 --- 56 | # Predict with the base learners and evaluate them 57 | base_predictions = [] 58 | base_accuracy = [] 59 | with ProcessPoolExecutor() as executor: 60 | futures = [] 61 | for learner in base_learners: 62 | future = executor.submit(predict, learner, test_x) 63 | futures.append(future) 64 | 65 | for future in futures: 66 | predictions = future.result() 67 | base_predictions.append(predictions) 68 | acc = metrics.accuracy_score(test_y, predictions) 69 | base_accuracy.append(acc) 70 | 71 | # --- SECTION 6 --- 72 | # Combine the base learners' predictions 73 | ensemble_predictions = [] 74 | # Find the most voted class for each test instance 75 | for i in range(len(test_y)): 76 | # Count the votes for each class 77 | counts = [0 for _ in range(10)] 78 | for learner_predictions in base_predictions: 79 | counts[learner_predictions[i]] = counts[learner_predictions[i]]+1 80 | 81 | # Find the class with most votes 82 | final = np.argmax(counts) 83 | # Add the class to the final predictions 84 | ensemble_predictions.append(final) 85 | 86 | ensemble_acc = metrics.accuracy_score(test_y, ensemble_predictions) 87 | 88 | end = time.time() 89 | 90 | # --- SECTION 7 --- 91 | # Print the accuracies 92 | print('Base Learners:') 93 | print('-'*30) 94 | for index, acc in enumerate(sorted(base_accuracy)): 95 | print(f'Learner {index+1}: %.2f' % acc) 96 | print('-'*30) 97 | print('Bagging: %.2f' % ensemble_acc) 98 | print('Total time: %.2f' % (end - start)) 99 | -------------------------------------------------------------------------------- /Chapter05/bagging_sklearn_classification.py: -------------------------------------------------------------------------------- 1 | # --- SECTION 1 --- 2 | # Libraries and data loading 3 | from sklearn.datasets import load_digits 4 | from sklearn.tree import DecisionTreeClassifier 5 | from sklearn.ensemble import BaggingClassifier 6 | from sklearn import metrics 7 | 8 | 9 | digits = load_digits() 10 | 11 | train_size = 1500 12 | train_x, train_y = digits.data[:train_size], digits.target[:train_size] 13 | test_x, test_y = digits.data[train_size:], digits.target[train_size:] 14 | 15 | # --- SECTION 2 --- 16 | # Create the ensemble 17 | ensemble_size = 10 18 | ensemble = BaggingClassifier(base_estimator=DecisionTreeClassifier(), 19 | n_estimators=ensemble_size, 20 | oob_score=True) 21 | 22 | # --- SECTION 3 --- 23 | # Train the ensemble 24 | ensemble.fit(train_x, train_y) 25 | 26 | # --- SECTION 4 --- 27 | # Evaluate the ensemble 28 | ensemble_predictions = ensemble.predict(test_x) 29 | 30 | ensemble_acc = metrics.accuracy_score(test_y, ensemble_predictions) 31 | 32 | # --- SECTION 5 --- 33 | # Print the accuracy 34 | print('Bagging: %.2f' % ensemble_acc) 35 | -------------------------------------------------------------------------------- /Chapter05/bagging_sklearn_regression.py: -------------------------------------------------------------------------------- 1 | # --- SECTION 1 --- 2 | # Libraries and data loading 3 | from sklearn.datasets 
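# bagging_sklearn_classification.py above passes oob_score=True, so the fitted
# BaggingClassifier also carries an out-of-bag accuracy estimate: each bootstrap
# sample leaves out roughly (1 - 1/n)^n, about 36.8%, of the training rows, and
# those held-out rows score the corresponding learner. A one-line sketch,
# assuming the ensemble from that script has been fitted:
print('Out-of-bag accuracy estimate: %.2f' % ensemble.oob_score_)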
import load_diabetes 4 | from sklearn.tree import DecisionTreeRegressor 5 | from sklearn.ensemble import BaggingRegressor 6 | from sklearn import metrics 7 | import numpy as np 8 | diabetes = load_diabetes() 9 | 10 | np.random.seed(1234) 11 | 12 | train_x, train_y = diabetes.data[:400], diabetes.target[:400] 13 | test_x, test_y = diabetes.data[400:], diabetes.target[400:] 14 | 15 | # --- SECTION 2 --- 16 | # Create the ensemble and a single base learner for comparison 17 | estimator = DecisionTreeRegressor(max_depth=6) 18 | ensemble = BaggingRegressor(base_estimator=estimator, 19 | n_estimators=10) 20 | 21 | # --- SECTION 3 --- 22 | # Train and evaluate both the ensemble and the base learner 23 | ensemble.fit(train_x, train_y) 24 | ensemble_predictions = ensemble.predict(test_x) 25 | 26 | estimator.fit(train_x, train_y) 27 | single_predictions = estimator.predict(test_x) 28 | 29 | ensemble_r2 = metrics.r2_score(test_y, ensemble_predictions) 30 | ensemble_mse = metrics.mean_squared_error(test_y, ensemble_predictions) 31 | 32 | single_r2 = metrics.r2_score(test_y, single_predictions) 33 | single_mse = metrics.mean_squared_error(test_y, single_predictions) 34 | 35 | # --- SECTION 4 --- 36 | # Print the metrics 37 | print('Bagging r-squared: %.2f' % ensemble_r2) 38 | print('Bagging MSE: %.2f' % ensemble_mse) 39 | print('-'*30) 40 | print('Decision Tree r-squared: %.2f' % single_r2) 41 | print('Decision Tree MSE: %.2f' % single_mse) 42 | -------------------------------------------------------------------------------- /Chapter05/bootstrapping.py: -------------------------------------------------------------------------------- 1 | # --- SECTION 1 --- 2 | # Libraries and data loading 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from sklearn.datasets import load_diabetes 6 | 7 | diabetes = load_diabetes() 8 | 9 | # --- SECTION 2 --- 10 | # Print the original sample's statistics 11 | target = diabetes.target 12 | 13 | print(np.mean(target)) 14 | print(np.std(target)) 15 | 16 | # --- SECTION 3 --- 17 | # Create the bootstrap samples and statistics 18 | bootstrap_stats = [] 19 | 20 | for _ in range(10000): 21 | bootstrap_sample = np.random.choice(target, size=int(len(target)/1)) 22 | mean = np.mean(bootstrap_sample) 23 | std = np.std(bootstrap_sample) 24 | bootstrap_stats.append((mean, std)) 25 | 26 | bootstrap_stats = np.array(bootstrap_stats) 27 | 28 | 29 | # --- SECTION 4 --- 30 | # plot the histograms 31 | plt.figure() 32 | plt.subplot(2,1,1) 33 | std_err = np.std(bootstrap_stats[:,0]) 34 | plt.title('Mean, Std. Error: %.2f'%std_err) 35 | plt.hist(bootstrap_stats[:,0], bins=20) 36 | 37 | plt.subplot(2,1,2) 38 | std_err = np.std(bootstrap_stats[:,1]) 39 | plt.title('Std. Dev, Std. 
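# A small extension of bootstrapping.py above, assuming bootstrap_stats has been
# computed: the 2.5th and 97.5th percentiles of the bootstrap means give an
# approximate 95% confidence interval for the population mean.
low, high = np.percentile(bootstrap_stats[:, 0], [2.5, 97.5])
print('Approximate 95%% CI for the mean: [%.2f, %.2f]' % (low, high))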
Error: %.2f'%std_err) 40 | plt.hist(bootstrap_stats[:,1], bins=20) 41 | plt.show() -------------------------------------------------------------------------------- /Chapter05/validation_curves.py: -------------------------------------------------------------------------------- 1 | # --- SECTION 1 --- 2 | # Libraries and data loading 3 | from sklearn.datasets import load_digits 4 | from sklearn.tree import DecisionTreeClassifier 5 | from sklearn.ensemble import BaggingClassifier 6 | from sklearn.model_selection import validation_curve 7 | import warnings 8 | import matplotlib.pyplot as plt 9 | import numpy as np 10 | 11 | warnings.filterwarnings("ignore") 12 | 13 | digits = load_digits() 14 | 15 | train_size = 1500 16 | train_x, train_y = digits.data[:train_size], digits.target[:train_size] 17 | test_x, test_y = digits.data[train_size:], digits.target[train_size:] 18 | 19 | 20 | # --- SECTION 2 --- 21 | # Create in-sample and out-of-sample scores 22 | x, y = train_x, train_y 23 | learner = BaggingClassifier() 24 | param_range = [x for x in range(1, 40, 2)] 25 | train_scores, test_scores = validation_curve(learner, x, y, 26 | param_name='n_estimators', 27 | param_range=param_range, 28 | cv=10, 29 | scoring="accuracy") 30 | 31 | # --- SECTION 3 --- 32 | # Calculate the average and standard deviation for each hyperparameter 33 | train_scores_mean = np.mean(train_scores, axis=1) 34 | train_scores_std = np.std(train_scores, axis=1) 35 | test_scores_mean = np.mean(test_scores, axis=1) 36 | test_scores_std = np.std(test_scores, axis=1) 37 | 38 | 39 | # --- SECTION 4 --- 40 | # Plot the scores 41 | plt.figure() 42 | plt.title('Validation curves') 43 | # Plot the standard deviations 44 | plt.fill_between(param_range, train_scores_mean - train_scores_std, 45 | train_scores_mean + train_scores_std, alpha=0.1, 46 | color="C1") 47 | plt.fill_between(param_range, test_scores_mean - test_scores_std, 48 | test_scores_mean + test_scores_std, alpha=0.1, color="C0") 49 | 50 | # Plot the means 51 | plt.plot(param_range, train_scores_mean, 'o-', color="C1", 52 | label="Training score") 53 | plt.plot(param_range, test_scores_mean, 'o-', color="C0", 54 | label="Cross-validation score") 55 | 56 | plt.xticks(param_range) 57 | plt.xlabel('Ensemble Size') 58 | plt.ylabel('Accuracy') 59 | plt.legend(loc="best") -------------------------------------------------------------------------------- /Chapter06/adaboost_custom.py: -------------------------------------------------------------------------------- 1 | # --- SECTION 1 --- 2 | # Libraries and data loading 3 | from copy import deepcopy 4 | from sklearn.datasets import load_breast_cancer 5 | from sklearn.tree import DecisionTreeClassifier 6 | from sklearn import metrics 7 | 8 | import numpy as np 9 | 10 | bc = load_breast_cancer() 11 | 12 | train_size = 400 13 | train_x, train_y = bc.data[:train_size], bc.target[:train_size] 14 | test_x, test_y = bc.data[train_size:], bc.target[train_size:] 15 | 16 | np.random.seed(123456) 17 | 18 | # --- SECTION 2 --- 19 | # Create the ensemble 20 | ensemble_size = 100 21 | base_classifier = DecisionTreeClassifier(max_depth=1) 22 | 23 | # Create the initial weights 24 | data_weights = np.zeros(train_size) + 1/train_size 25 | # Create a list of indices for the train set 26 | indices = [x for x in range(train_size)] 27 | 28 | base_learners = [] 29 | learners_errors = np.zeros(ensemble_size) 30 | learners_weights = np.zeros(ensemble_size) 31 | 32 | errs = [] 33 | # Create each base learner 34 | for i in range(ensemble_size): 35 | 
weak_learner = deepcopy(base_classifier) 36 | 37 | # Choose the samples by sampling with replacement. 38 | # Each instance's probability is dictated by its weight. 39 | data_indices = np.random.choice(indices, train_size, p=data_weights) 40 | sample_x, sample_y = train_x[data_indices], train_y[data_indices] 41 | 42 | # Fit the weak learner and evaluate it 43 | weak_learner.fit(sample_x, sample_y) 44 | predictions = weak_learner.predict(train_x) 45 | 46 | errors = predictions != train_y 47 | corrects = predictions == train_y 48 | 49 | # Calculate the weighted errors 50 | weighted_errors = data_weights*errors 51 | 52 | 53 | # The base learner's error is the average of the weighted errors 54 | learner_error = np.mean(weighted_errors) 55 | learners_errors[i] = learner_error 56 | 57 | # The learner's weight 58 | learner_weight = np.log((1-learner_error)/learner_error)/2 59 | learners_weights[i] = learner_weight 60 | 61 | # Update the data weights 62 | data_weights[errors] = np.exp(data_weights[errors] * learner_weight) 63 | data_weights[corrects] = np.exp(-data_weights[corrects] * learner_weight) 64 | 65 | data_weights = data_weights/sum(data_weights) 66 | # Save the learner 67 | base_learners.append(weak_learner) 68 | 69 | 70 | 71 | # --- SECTION 3 --- 72 | # Evaluate the ensemble 73 | ensemble_predictions = [] 74 | for learner, weight in zip(base_learners, learners_weights): 75 | # Calculate the weighted predictions 76 | prediction = learner.predict(test_x) 77 | ensemble_predictions.append(prediction*weight) 78 | 79 | # The final prediction is the weighted mean of the individual predictions 80 | ensemble_predictions = np.mean(ensemble_predictions, axis=0) >= 0.5 81 | 82 | ensemble_acc = metrics.accuracy_score(test_y, ensemble_predictions) 83 | 84 | # --- SECTION 4 --- 85 | # Print the accuracy 86 | print('Boosting: %.2f' % ensemble_acc) 87 | -------------------------------------------------------------------------------- /Chapter06/adaboost_sklearn_classification.py: -------------------------------------------------------------------------------- 1 | # --- SECTION 1 --- 2 | # Libraries and data loading 3 | import numpy as np 4 | 5 | from sklearn.datasets import load_digits 6 | from sklearn.tree import DecisionTreeClassifier 7 | from sklearn.ensemble import AdaBoostClassifier 8 | from sklearn import metrics 9 | 10 | 11 | digits = load_digits() 12 | 13 | train_size = 1500 14 | train_x, train_y = digits.data[:train_size], digits.target[:train_size] 15 | test_x, test_y = digits.data[train_size:], digits.target[train_size:] 16 | 17 | np.random.seed(123456) 18 | # --- SECTION 2 --- 19 | # Create the ensemble 20 | ensemble_size = 1000 21 | ensemble = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), 22 | algorithm="SAMME", 23 | n_estimators=ensemble_size) 24 | 25 | # --- SECTION 3 --- 26 | # Train the ensemble 27 | ensemble.fit(train_x, train_y) 28 | 29 | # --- SECTION 4 --- 30 | # Evaluate the ensemble 31 | ensemble_predictions = ensemble.predict(test_x) 32 | 33 | ensemble_acc = metrics.accuracy_score(test_y, ensemble_predictions) 34 | 35 | # --- SECTION 5 --- 36 | # Print the accuracy 37 | print('Boosting: %.2f' % ensemble_acc) 38 | 39 | -------------------------------------------------------------------------------- /Chapter06/adaboost_sklearn_regression.py: -------------------------------------------------------------------------------- 1 | # --- SECTION 1 --- 2 | # Libraries and data loading 3 | from copy import deepcopy 4 | from sklearn.datasets import load_diabetes 5 | from 
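# The weight update in adaboost_custom.py above exponentiates the product of
# the current weight and the learner weight (exp(w * a)). The more common
# textbook AdaBoost update instead multiplies the existing weight by exp(+/- a).
# A hedged sketch of that variant, reusing the loop's own variables; it is an
# alternative to the two update lines above, not the author's implementation:
data_weights[errors] = data_weights[errors] * np.exp(learner_weight)
data_weights[corrects] = data_weights[corrects] * np.exp(-learner_weight)
data_weights = data_weights / sum(data_weights)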
sklearn.ensemble import AdaBoostRegressor 6 | from sklearn.tree import DecisionTreeRegressor 7 | from sklearn import metrics 8 | 9 | import numpy as np 10 | 11 | diabetes = load_diabetes() 12 | 13 | train_size = 400 14 | train_x, train_y = diabetes.data[:train_size], diabetes.target[:train_size] 15 | test_x, test_y = diabetes.data[train_size:], diabetes.target[train_size:] 16 | 17 | np.random.seed(123456) 18 | 19 | # --- SECTION 2 --- 20 | # Create the ensemble 21 | ensemble_size = 1000 22 | ensemble = AdaBoostRegressor(n_estimators=ensemble_size) 23 | 24 | # --- SECTION 3 --- 25 | # Evaluate the ensemble 26 | ensemble.fit(train_x, train_y) 27 | predictions = ensemble.predict(test_x) 28 | 29 | # --- SECTION 4 --- 30 | # Print the metrics 31 | r2 = metrics.r2_score(test_y, predictions) 32 | mse = metrics.mean_squared_error(test_y, predictions) 33 | 34 | print('AdaBoost:') 35 | print('R-squared: %.2f' % r2) 36 | print('MSE: %.2f' % mse) -------------------------------------------------------------------------------- /Chapter06/boosting_overfit.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | np.random.seed(12345) 5 | 6 | points = np.random.multivariate_normal([1, 1], ([1, 0.5],[0.5, 1]), 10) 7 | points2 = np.random.multivariate_normal([4, 4], ([1, 0.5],[0.5, 1]), 10) 8 | 9 | plt.scatter(*zip(*points), label='Class 1', marker='+', s=150) 10 | plt.scatter(*zip(*points2), label='Class 2', marker='_', s=150) 11 | plt.plot([-x+6 for x in range(0,10)], linestyle='--', 12 | color='black', label='class boundary') 13 | 14 | #plt.text(0,5, '+', fontsize=18) 15 | #plt.text(1.5,5.5, '_', fontsize=18) 16 | 17 | r = range(-5,10) 18 | stable = [x for x in r] 19 | 20 | plt.plot([1.45 for x in r], stable, linestyle='--', 21 | color='gray', label='outlier rules') 22 | plt.plot([1.9 for x in r], stable, linestyle='--', 23 | color='gray') 24 | 25 | plt.plot(stable,[0.85 for x in r], linestyle='--', 26 | color='gray') 27 | plt.plot(stable,[0.55 for x in r], linestyle='--', 28 | color='gray') 29 | plt.xticks([], []) 30 | plt.yticks([], []) 31 | 32 | 33 | 34 | plt.legend() -------------------------------------------------------------------------------- /Chapter06/dataset_segmentation.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | np.random.seed(654321) 5 | 6 | points = np.random.randint(0, 10, size=(10, 2)) 7 | classes = np.random.randint(0, 2, size=(10,)) 8 | 9 | 10 | positives = points[classes == 0] 11 | negatives = points[classes == 1] 12 | 13 | plt.scatter(*positives.T, marker='+', s=150) 14 | plt.scatter(*negatives.T, marker='_', s=150) 15 | plt.xticks([], []) 16 | plt.yticks([], []) 17 | 18 | plt.plot([1.5 for _ in range(12)], [x for x in range(-1, 11)], linestyle='--', color='black') 19 | plt.plot([4.5 for _ in range(12)], [x for x in range(-1, 11)], linestyle='--', color='black') 20 | 21 | plt.plot([x for x in range(-1, 8)], [1.5 for _ in range(9)], linestyle='--', color='black') -------------------------------------------------------------------------------- /Chapter06/gradient_boosting_custom.py: -------------------------------------------------------------------------------- 1 | # --- SECTION 1 --- 2 | # Libraries and data loading 3 | from copy import deepcopy 4 | from sklearn.datasets import load_diabetes 5 | from sklearn.tree import DecisionTreeRegressor 6 | from sklearn import metrics 7 
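# gradient_boosting_custom.py (below) builds its prediction as a running sum of
# small corrections. Starting from the mean of the training targets, each new
# tree h_m is fitted to the current pseudo-residuals and added with shrinkage:
#
#     F_0(x) = mean(y)
#     F_m(x) = F_(m-1)(x) + learning_rate * h_m(x)
#
# so after all rounds the prediction is the initial mean plus the scaled sum of
# every tree's output, which is exactly what SECTION 3 of that script recomputes
# for the test set.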
| 8 | import numpy as np 9 | 10 | diabetes = load_diabetes() 11 | 12 | train_size = 400 13 | train_x, train_y = diabetes.data[:train_size], diabetes.target[:train_size] 14 | test_x, test_y = diabetes.data[train_size:], diabetes.target[train_size:] 15 | 16 | np.random.seed(123456) 17 | 18 | # --- SECTION 2 --- 19 | # Create the ensemble 20 | 21 | # Define the ensemble's size, learning rate and decision tree depth 22 | ensemble_size = 50 23 | learning_rate = 0.1 24 | base_classifier = DecisionTreeRegressor(max_depth=3) 25 | 26 | # Create placeholders for the base learners and each step's prediction 27 | base_learners = [] 28 | # Note that the initial prediction is the target variable's mean 29 | previous_predictions = np.zeros(len(train_y)) + np.mean(train_y) 30 | 31 | # Create the base learners 32 | for _ in range(ensemble_size): 33 | # Start by calcualting the pseudo-residuals 34 | errors = train_y - previous_predictions 35 | 36 | # Make a deep copy of the base classifier and train it on the 37 | # pseudo-residuals 38 | learner = deepcopy(base_classifier) 39 | learner.fit(train_x, errors) 40 | 41 | # Predict the residuals on the train set 42 | predictions = learner.predict(train_x) 43 | 44 | # Multiply the predictions with the learning rate and add the results 45 | # to the previous prediction 46 | previous_predictions = previous_predictions + learning_rate*predictions 47 | 48 | # Save the base learner 49 | base_learners.append(learner) 50 | 51 | # --- SECTION 3 --- 52 | # Evaluate the ensemble 53 | 54 | # Start with the train set's mean 55 | previous_predictions = np.zeros(len(test_y)) + np.mean(train_y) 56 | 57 | # For each base learner predict the pseudo-residuals for the test set and 58 | # add them to the previous prediction, after multiplying with the learning rate 59 | for learner in base_learners: 60 | predictions = learner.predict(test_x) 61 | previous_predictions = previous_predictions + learning_rate*predictions 62 | 63 | # --- SECTION 4 --- 64 | # Print the metrics 65 | r2 = metrics.r2_score(test_y, previous_predictions) 66 | mse = metrics.mean_squared_error(test_y, previous_predictions) 67 | 68 | print('Gradient Boosting:') 69 | print('R-squared: %.2f' % r2) 70 | print('MSE: %.2f' % mse) -------------------------------------------------------------------------------- /Chapter06/gradient_boosting_sklearn_classification.py: -------------------------------------------------------------------------------- 1 | # --- SECTION 1 --- 2 | # Libraries and data loading 3 | import numpy as np 4 | 5 | from sklearn.datasets import load_digits 6 | from sklearn.tree import DecisionTreeClassifier 7 | from sklearn.ensemble import GradientBoostingClassifier 8 | from sklearn import metrics 9 | 10 | 11 | digits = load_digits() 12 | 13 | train_size = 1500 14 | train_x, train_y = digits.data[:train_size], digits.target[:train_size] 15 | test_x, test_y = digits.data[train_size:], digits.target[train_size:] 16 | 17 | np.random.seed(123456) 18 | # --- SECTION 2 --- 19 | # Create the ensemble 20 | ensemble_size = 200 21 | learning_rate = 0.1 22 | ensemble = GradientBoostingClassifier(n_estimators=ensemble_size, 23 | learning_rate=learning_rate) 24 | 25 | # --- SECTION 3 --- 26 | # Train the ensemble 27 | ensemble.fit(train_x, train_y) 28 | 29 | # --- SECTION 4 --- 30 | # Evaluate the ensemble 31 | ensemble_predictions = ensemble.predict(test_x) 32 | 33 | ensemble_acc = metrics.accuracy_score(test_y, ensemble_predictions) 34 | 35 | # --- SECTION 5 --- 36 | # Print the accuracy 37 | print('Boosting: %.2f' 
% ensemble_acc) 38 | 39 | 40 | import matplotlib.pyplot as plt 41 | diffs = [ensemble.train_score_[i] - ensemble.train_score_[i-1] for i in range(1, len(ensemble.train_score_))] 42 | 43 | fig, ax1 = plt.subplots() 44 | ax1.plot(ensemble.train_score_, linestyle='--', label='Errors (Left axis)') 45 | 46 | 47 | ax2 = ax1.twinx() 48 | ax2.plot(diffs, label='Errors Differences (Right axis)') 49 | fig.legend() 50 | -------------------------------------------------------------------------------- /Chapter06/gradient_boosting_sklearn_regression.py: -------------------------------------------------------------------------------- 1 | # --- SECTION 1 --- 2 | # Libraries and data loading 3 | from sklearn.datasets import load_diabetes 4 | from sklearn.ensemble import GradientBoostingRegressor 5 | from sklearn import metrics 6 | 7 | import numpy as np 8 | 9 | diabetes = load_diabetes() 10 | 11 | train_size = 400 12 | train_x, train_y = diabetes.data[:train_size], diabetes.target[:train_size] 13 | test_x, test_y = diabetes.data[train_size:], diabetes.target[train_size:] 14 | 15 | np.random.seed(123456) 16 | 17 | # --- SECTION 2 --- 18 | # Create the ensemble 19 | ensemble_size = 200 20 | learning_rate = 0.1 21 | ensemble = GradientBoostingRegressor(n_estimators=ensemble_size, 22 | learning_rate=learning_rate) 23 | 24 | # --- SECTION 3 --- 25 | # Evaluate the ensemble 26 | ensemble.fit(train_x, train_y) 27 | predictions = ensemble.predict(test_x) 28 | 29 | # --- SECTION 4 --- 30 | # Print the metrics 31 | r2 = metrics.r2_score(test_y, predictions) 32 | mse = metrics.mean_squared_error(test_y, predictions) 33 | 34 | print('Gradient Boosting:') 35 | print('R-squared: %.2f' % r2) 36 | print('MSE: %.2f' % mse) 37 | 38 | 39 | import matplotlib.pyplot as plt 40 | diffs = [ensemble.train_score_[i] - ensemble.train_score_[i-1] for i in range(1, len(ensemble.train_score_))] 41 | 42 | fig, ax1 = plt.subplots() 43 | ax1.plot(ensemble.train_score_, linestyle='--', label='Errors (Left axis)') 44 | 45 | 46 | ax2 = ax1.twinx() 47 | ax2.plot(diffs, label='Errors Differences (Right axis)') 48 | fig.legend() 49 | -------------------------------------------------------------------------------- /Chapter06/xgb_classification.py: -------------------------------------------------------------------------------- 1 | # --- SECTION 1 --- 2 | # Libraries and data loading 3 | from sklearn.datasets import load_digits 4 | from xgboost import XGBClassifier 5 | from sklearn import metrics 6 | 7 | import numpy as np 8 | 9 | digits = load_digits() 10 | 11 | train_size = 1500 12 | train_x, train_y = digits.data[:train_size], digits.target[:train_size] 13 | test_x, test_y = digits.data[train_size:], digits.target[train_size:] 14 | 15 | np.random.seed(123456) 16 | # --- SECTION 2 --- 17 | # Create the ensemble 18 | ensemble_size = 100 19 | ensemble = XGBClassifier(n_estimators=ensemble_size, n_jobs=4) 20 | 21 | # --- SECTION 3 --- 22 | # Train the ensemble 23 | ensemble.fit(train_x, train_y) 24 | 25 | # --- SECTION 4 --- 26 | # Evaluate the ensemble 27 | ensemble_predictions = ensemble.predict(test_x) 28 | 29 | ensemble_acc = metrics.accuracy_score(test_y, ensemble_predictions) 30 | 31 | # --- SECTION 5 --- 32 | # Print the accuracy 33 | print('Boosting: %.2f' % ensemble_acc) 34 | -------------------------------------------------------------------------------- /Chapter06/xgb_regression.py: -------------------------------------------------------------------------------- 1 | # --- SECTION 1 --- 2 | # Libraries and data loading 3 | from 
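# XGBoost can also watch a held-out set while boosting and stop once the score
# stops improving. A minimal sketch, assuming the digits train/test split from
# xgb_classification.py above; the exact home of the early-stopping options
# differs between xgboost releases (fit arguments in older versions, constructor
# arguments or callbacks in newer ones), so treat this as a sketch rather than
# a fixed API:
from xgboost import XGBClassifier
monitored = XGBClassifier(n_estimators=100, n_jobs=4)
monitored.fit(train_x, train_y,
              eval_set=[(test_x, test_y)],
              early_stopping_rounds=10,
              verbose=False)
print('Best iteration:', monitored.best_iteration)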
sklearn.datasets import load_diabetes 4 | from xgboost import XGBRegressor 5 | from sklearn import metrics 6 | 7 | import numpy as np 8 | 9 | diabetes = load_diabetes() 10 | 11 | train_size = 400 12 | train_x, train_y = diabetes.data[:train_size], diabetes.target[:train_size] 13 | test_x, test_y = diabetes.data[train_size:], diabetes.target[train_size:] 14 | 15 | np.random.seed(123456) 16 | 17 | # --- SECTION 2 --- 18 | # Create the ensemble 19 | ensemble_size = 200 20 | ensemble = XGBRegressor(n_estimators=ensemble_size, n_jobs=4, 21 | max_depth=1, learning_rate=0.1, 22 | objective ='reg:squarederror') 23 | 24 | # --- SECTION 3 --- 25 | # Evaluate the ensemble 26 | ensemble.fit(train_x, train_y) 27 | predictions = ensemble.predict(test_x) 28 | 29 | # --- SECTION 4 --- 30 | # Print the metrics 31 | r2 = metrics.r2_score(test_y, predictions) 32 | mse = metrics.mean_squared_error(test_y, predictions) 33 | 34 | print('Gradient Boosting:') 35 | print('R-squared: %.2f' % r2) 36 | print('MSE: %.2f' % mse) 37 | -------------------------------------------------------------------------------- /Chapter07/extra_tree_classification.py: -------------------------------------------------------------------------------- 1 | # --- SECTION 1 --- 2 | # Libraries and data loading 3 | from sklearn.datasets import load_digits 4 | from sklearn.ensemble import ExtraTreesClassifier 5 | from sklearn import metrics 6 | import numpy as np 7 | 8 | digits = load_digits() 9 | 10 | 11 | train_size = 1500 12 | train_x, train_y = digits.data[:train_size], digits.target[:train_size] 13 | test_x, test_y = digits.data[train_size:], digits.target[train_size:] 14 | 15 | np.random.seed(123456) 16 | # --- SECTION 2 --- 17 | # Create the ensemble 18 | ensemble_size = 500 19 | ensemble = ExtraTreesClassifier(n_estimators=ensemble_size, n_jobs=4) 20 | 21 | # --- SECTION 3 --- 22 | # Train the ensemble 23 | ensemble.fit(train_x, train_y) 24 | 25 | # --- SECTION 4 --- 26 | # Evaluate the ensemble 27 | ensemble_predictions = ensemble.predict(test_x) 28 | 29 | ensemble_acc = metrics.accuracy_score(test_y, ensemble_predictions) 30 | 31 | # --- SECTION 5 --- 32 | # Print the accuracy 33 | print('Extra Tree Forest: %.2f' % ensemble_acc) 34 | -------------------------------------------------------------------------------- /Chapter07/extra_tree_classification_validation_curves.py: -------------------------------------------------------------------------------- 1 | # --- SECTION 1 --- 2 | # Libraries and data loading 3 | from sklearn.datasets import load_digits 4 | from sklearn.ensemble import ExtraTreesClassifier 5 | from sklearn.model_selection import validation_curve 6 | from sklearn import metrics 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | 10 | digits = load_digits() 11 | 12 | 13 | train_size = 1500 14 | train_x, train_y = digits.data[:train_size], digits.target[:train_size] 15 | test_x, test_y = digits.data[train_size:], digits.target[train_size:] 16 | 17 | np.random.seed(123456) 18 | # --- SECTION 2 --- 19 | # Create the ensemble 20 | ensemble_size = 500 21 | ensemble = ExtraTreesClassifier(n_estimators=ensemble_size, n_jobs=4) 22 | 23 | param_range = [10, 50, 100, 150, 200, 250, 300, 350, 400] 24 | train_scores, test_scores = validation_curve(ensemble, train_x, train_y, 'n_estimators', param_range, 25 | cv=10, scoring='accuracy') 26 | 27 | # --- SECTION 3 --- 28 | # Calculate the average and standard deviation for each hyperparameter 29 | train_scores_mean = np.mean(train_scores, axis=1) 30 | train_scores_std = 
np.std(train_scores, axis=1) 31 | test_scores_mean = np.mean(test_scores, axis=1) 32 | test_scores_std = np.std(test_scores, axis=1) 33 | 34 | 35 | # --- SECTION 4 --- 36 | # Plot the scores 37 | plt.figure() 38 | plt.title('Validation curves (Extra Trees)') 39 | # Plot the standard deviations 40 | plt.fill_between(param_range, train_scores_mean - train_scores_std, 41 | train_scores_mean + train_scores_std, alpha=0.1, 42 | color="C1") 43 | plt.fill_between(param_range, test_scores_mean - test_scores_std, 44 | test_scores_mean + test_scores_std, alpha=0.1, color="C0") 45 | 46 | # Plot the means 47 | plt.plot(param_range, train_scores_mean, 'o-', color="C1", 48 | label="Training score") 49 | plt.plot(param_range, test_scores_mean, 'o-', color="C0", 50 | label="Cross-validation score") 51 | 52 | plt.xticks(param_range) 53 | plt.xlabel('Number of trees') 54 | plt.ylabel('Accuracy') 55 | plt.legend(loc="best") 56 | -------------------------------------------------------------------------------- /Chapter07/extra_tree_regression.py: -------------------------------------------------------------------------------- 1 | # --- SECTION 1 --- 2 | # Libraries and data loading 3 | from copy import deepcopy 4 | from sklearn.datasets import load_diabetes 5 | from sklearn.ensemble import ExtraTreesRegressor 6 | from sklearn import metrics 7 | 8 | import numpy as np 9 | 10 | diabetes = load_diabetes() 11 | 12 | train_size = 400 13 | train_x, train_y = diabetes.data[:train_size], diabetes.target[:train_size] 14 | test_x, test_y = diabetes.data[train_size:], diabetes.target[train_size:] 15 | 16 | np.random.seed(123456) 17 | 18 | # --- SECTION 2 --- 19 | # Create the ensemble 20 | ensemble_size = 1000 21 | ensemble = ExtraTreesRegressor(n_estimators=ensemble_size, 22 | min_samples_leaf=10, n_jobs=4) 23 | 24 | # --- SECTION 3 --- 25 | # Evaluate the ensemble 26 | ensemble.fit(train_x, train_y) 27 | predictions = ensemble.predict(test_x) 28 | 29 | # --- SECTION 4 --- 30 | # Print the metrics 31 | r2 = metrics.r2_score(test_y, predictions) 32 | mse = metrics.mean_squared_error(test_y, predictions) 33 | 34 | print('Extra Trees:') 35 | print('R-squared: %.2f' % r2) 36 | print('MSE: %.2f' % mse) -------------------------------------------------------------------------------- /Chapter07/probability_to_choose.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | 5 | p = 0 6 | def prob(relevant, irrelevant, select): 7 | p = 1 - relevant/(relevant+irrelevant) 8 | p_none = np.power(p, select) 9 | at_least_one = 1 - p_none 10 | return at_least_one 11 | 12 | 13 | data = np.zeros((10,10)) 14 | for i in range(1, 11): 15 | for j in range(1, 11): 16 | select = int(np.floor(np.sqrt(j*10))) 17 | data[-1+i,-1+j] = prob(i,j*10,select) 18 | 19 | 20 | fig, ax = plt.subplots() 21 | plt.gray() 22 | cs = ax.imshow(data, extent=[10,100,10,1]) 23 | ax.set_aspect(10) 24 | plt.xlabel('Irrelevant Features') 25 | plt.ylabel('Relevant Features') 26 | plt.title('Probability to select at least one relevant feature') 27 | fig.colorbar(cs) -------------------------------------------------------------------------------- /Chapter07/rf_classification.py: -------------------------------------------------------------------------------- 1 | # --- SECTION 1 --- 2 | # Libraries and data loading 3 | from sklearn.datasets import load_digits 4 | from sklearn.ensemble import RandomForestClassifier 5 | from sklearn import metrics 6 | import numpy as np 7 | 8 | 
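# A worked instance of the probability plotted in probability_to_choose.py
# above: with 5 relevant and 95 irrelevant features, a split that samples 10 of
# the 100 candidates (roughly the square root) picks at least one relevant
# feature with probability 1 - (1 - 5/100)**10, about 0.40.
p = 1 - (1 - 5 / 100) ** 10
print('P(at least one relevant feature): %.2f' % p)   # ~0.40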
digits = load_digits() 9 | 10 | 11 | train_size = 1500 12 | train_x, train_y = digits.data[:train_size], digits.target[:train_size] 13 | test_x, test_y = digits.data[train_size:], digits.target[train_size:] 14 | 15 | np.random.seed(123456) 16 | # --- SECTION 2 --- 17 | # Create the ensemble 18 | ensemble_size = 500 19 | ensemble = RandomForestClassifier(n_estimators=ensemble_size, n_jobs=4) 20 | 21 | # --- SECTION 3 --- 22 | # Train the ensemble 23 | ensemble.fit(train_x, train_y) 24 | 25 | # --- SECTION 4 --- 26 | # Evaluate the ensemble 27 | ensemble_predictions = ensemble.predict(test_x) 28 | 29 | ensemble_acc = metrics.accuracy_score(test_y, ensemble_predictions) 30 | 31 | # --- SECTION 5 --- 32 | # Print the accuracy 33 | print('Random Forest: %.2f' % ensemble_acc) 34 | -------------------------------------------------------------------------------- /Chapter07/rf_classification_validation_curves.py: -------------------------------------------------------------------------------- 1 | # --- SECTION 1 --- 2 | # Libraries and data loading 3 | from sklearn.datasets import load_digits 4 | from sklearn.ensemble import RandomForestClassifier 5 | from sklearn.model_selection import validation_curve 6 | from sklearn import metrics 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | 10 | digits = load_digits() 11 | 12 | 13 | train_size = 1500 14 | train_x, train_y = digits.data[:train_size], digits.target[:train_size] 15 | test_x, test_y = digits.data[train_size:], digits.target[train_size:] 16 | 17 | np.random.seed(123456) 18 | # --- SECTION 2 --- 19 | # Create the ensemble 20 | ensemble_size = 500 21 | ensemble = RandomForestClassifier(n_estimators=ensemble_size, n_jobs=4) 22 | 23 | param_range = [10, 50, 100, 150, 200, 250, 300, 350, 400] 24 | train_scores, test_scores = validation_curve(ensemble, train_x, train_y, 'n_estimators', param_range, 25 | cv=10, scoring='accuracy') 26 | 27 | # --- SECTION 3 --- 28 | # Calculate the average and standard deviation for each hyperparameter 29 | train_scores_mean = np.mean(train_scores, axis=1) 30 | train_scores_std = np.std(train_scores, axis=1) 31 | test_scores_mean = np.mean(test_scores, axis=1) 32 | test_scores_std = np.std(test_scores, axis=1) 33 | 34 | 35 | # --- SECTION 4 --- 36 | # Plot the scores 37 | plt.figure() 38 | plt.title('Validation curves (Random Forest)') 39 | # Plot the standard deviations 40 | plt.fill_between(param_range, train_scores_mean - train_scores_std, 41 | train_scores_mean + train_scores_std, alpha=0.1, 42 | color="C1") 43 | plt.fill_between(param_range, test_scores_mean - test_scores_std, 44 | test_scores_mean + test_scores_std, alpha=0.1, color="C0") 45 | 46 | # Plot the means 47 | plt.plot(param_range, train_scores_mean, 'o-', color="C1", 48 | label="Training score") 49 | plt.plot(param_range, test_scores_mean, 'o-', color="C0", 50 | label="Cross-validation score") 51 | 52 | plt.xticks(param_range) 53 | plt.xlabel('Number of trees') 54 | plt.ylabel('Accuracy') 55 | plt.legend(loc="best") 56 | -------------------------------------------------------------------------------- /Chapter07/rf_regression.py: -------------------------------------------------------------------------------- 1 | # --- SECTION 1 --- 2 | # Libraries and data loading 3 | from copy import deepcopy 4 | from sklearn.datasets import load_diabetes 5 | from sklearn.ensemble import RandomForestRegressor 6 | from sklearn import metrics 7 | 8 | import numpy as np 9 | 10 | diabetes = load_diabetes() 11 | 12 | train_size = 400 13 | train_x, train_y = 
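# Random forests also expose an impurity-based importance score per input
# feature. A short sketch, assuming the fitted ensemble from rf_classification.py
# above; it lists the five most informative digit pixels.
import numpy as np
importances = ensemble.feature_importances_
for idx in np.argsort(importances)[::-1][:5]:
    print('Pixel %d: importance %.3f' % (idx, importances[idx]))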
diabetes.data[:train_size], diabetes.target[:train_size] 14 | test_x, test_y = diabetes.data[train_size:], diabetes.target[train_size:] 15 | 16 | np.random.seed(123456) 17 | 18 | # --- SECTION 2 --- 19 | # Create the ensemble 20 | ensemble_size = 1000 21 | ensemble = RandomForestRegressor(n_estimators=ensemble_size, 22 | min_samples_leaf=20, n_jobs=4) 23 | 24 | # --- SECTION 3 --- 25 | # Evaluate the ensemble 26 | ensemble.fit(train_x, train_y) 27 | predictions = ensemble.predict(test_x) 28 | 29 | # --- SECTION 4 --- 30 | # Print the metrics 31 | r2 = metrics.r2_score(test_y, predictions) 32 | mse = metrics.mean_squared_error(test_y, predictions) 33 | 34 | print('Random Forest:') 35 | print('R-squared: %.2f' % r2) 36 | print('MSE: %.2f' % mse) -------------------------------------------------------------------------------- /Chapter08/agglomerative.py: -------------------------------------------------------------------------------- 1 | from scipy.cluster import hierarchy 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | 5 | a = np.random.uniform(size=10) 6 | Z = hierarchy.linkage(a, 'single') 7 | plt.figure() 8 | dn = hierarchy.dendrogram(Z) 9 | plt.title('Hierarchical Clustering Dendrogram') -------------------------------------------------------------------------------- /Chapter08/kmeans_cluster.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | from sklearn.cluster import KMeans 5 | from sklearn.datasets import load_breast_cancer 6 | from sklearn.manifold import TSNE 7 | 8 | 9 | np.random.seed(123456) 10 | 11 | bc = load_breast_cancer() 12 | tsne = TSNE() 13 | 14 | data = tsne.fit_transform(bc.data) 15 | reds = bc.target == 0 16 | blues = bc.target == 1 17 | plt.scatter(data[reds, 0], data[reds, 1], label='malignant') 18 | plt.scatter(data[blues, 0], data[blues, 1], label='benign') 19 | plt.xlabel('1st Component') 20 | plt.ylabel('2nd Component') 21 | plt.title('Breast Cancer dataa') 22 | plt.legend() 23 | 24 | 25 | plt.figure() 26 | plt.title('2, 4, and 6 clusters.') 27 | for clusters in [2, 4, 6]: 28 | km = KMeans(n_clusters=clusters) 29 | preds = km.fit_predict(data) 30 | plt.subplot(1, 3, clusters/2) 31 | plt.scatter(*zip(*data), c=preds) 32 | 33 | classified = {x: {'m': 0, 'b': 0} for x in range(clusters)} 34 | 35 | for i in range(len(data)): 36 | cluster = preds[i] 37 | label = bc.target[i] 38 | label = 'm' if label == 0 else 'b' 39 | classified[cluster][label] = classified[cluster][label]+1 40 | 41 | print('-'*40) 42 | for c in classified: 43 | print('Cluster %d. 
Malignant percentage: ' % c, end=' ') 44 | print(classified[c], end=' ') 45 | print('%.3f' % (classified[c]['m'] / 46 | (classified[c]['m'] + classified[c]['b']))) 47 | 48 | print(metrics.homogeneity_score(bc.target, preds)) 49 | print(metrics.silhouette_score(data, preds)) 50 | -------------------------------------------------------------------------------- /Chapter08/kmeans_intro.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | from sklearn.cluster import KMeans 5 | 6 | np.random.seed(87654) 7 | 8 | dat = [] 9 | 10 | t = 0.5 11 | 12 | for i in range(20): 13 | 14 | # dat.append(np.random.uniform(size=2)) 15 | c = np.random.randint(3) 16 | a = np.random.uniform() * 2 * 3.14 17 | r = t * np.sqrt(np.random.uniform()) 18 | 19 | x = r * np.cos(a) 20 | y = r * np.sin(a) 21 | 22 | 23 | dat.append([c/4+x, c/4+y]) 24 | 25 | plt.figure() 26 | for i in range(1, 5): 27 | np.random.seed(98765432) 28 | 29 | inits = np.array([[0.95,0.95],[0.95,0.95],[0.95,0.95] 30 | 31 | ]) 32 | km = KMeans(n_clusters=3, init=inits, max_iter=i, n_init=1) 33 | plt.subplot(2, 2, i) 34 | plt.xticks([]) 35 | plt.yticks([]) 36 | km.fit(dat) 37 | km.cluster_centers_ = np.sort(km.cluster_centers_, axis=0) 38 | c = km.predict(dat) 39 | plt.scatter(*zip(*dat), c=c) 40 | c = km.fit_predict(km.cluster_centers_) 41 | plt.scatter(*zip(*km.cluster_centers_), c='w', marker='*', s=240, edgecolors='r') 42 | plt.title('Iteration: %d'%i) 43 | print(km.cluster_centers_) 44 | 45 | -------------------------------------------------------------------------------- /Chapter08/kmeans_raw.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | from sklearn.cluster import KMeans 5 | from sklearn.datasets import load_breast_cancer 6 | from sklearn.manifold import TSNE 7 | 8 | 9 | np.random.seed(123456) 10 | 11 | bc = load_breast_cancer() 12 | data = bc.data 13 | 14 | 15 | #plt.figure() 16 | #plt.title('2, 4, and 6 clusters.') 17 | for clusters in [2, 4, 6]: 18 | km = KMeans(n_clusters=clusters) 19 | preds = km.fit_predict(data) 20 | # plt.subplot(1,3,clusters/2) 21 | # plt.scatter(*zip(*data), c=preds) 22 | 23 | classified = {x: {'m': 0, 'b': 0} for x in range(clusters)} 24 | 25 | for i in range(len(data)): 26 | cluster = preds[i] 27 | label = bc.target[i] 28 | label = 'm' if label == 0 else 'b' 29 | classified[cluster][label] = classified[cluster][label]+1 30 | 31 | print('-'*40) 32 | for c in classified: 33 | print('Cluster %d. 
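# A common companion to the cluster plots above is the elbow curve: k-means
# inertia (within-cluster sum of squares) over a range of K. A minimal sketch,
# assuming KMeans and the data array from kmeans_cluster.py (t-SNE embedding)
# or kmeans_raw.py (raw features) above:
for k in range(2, 8):
    inertia = KMeans(n_clusters=k).fit(data).inertia_
    print('K=%d inertia=%.1f' % (k, inertia))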
Malignant percentage: ' % c, end=' ') 34 | print(classified[c], end=' ') 35 | print('%.3f' % (classified[c]['m'] / 36 | (classified[c]['m'] + classified[c]['b']))) 37 | 38 | print(metrics.homogeneity_score(bc.target, preds)) 39 | print(metrics.silhouette_score(data, preds)) 40 | -------------------------------------------------------------------------------- /Chapter08/oe_co_occurence.py: -------------------------------------------------------------------------------- 1 | # --- SECTION 1 --- 2 | # Libraries and data loading 3 | import openensembles as oe 4 | import numpy as np 5 | import pandas as pd 6 | import sklearn.metrics 7 | 8 | from sklearn.datasets import load_breast_cancer 9 | 10 | 11 | bc = load_breast_cancer() 12 | 13 | # --- SECTION 2 --- 14 | # Create the data object 15 | cluster_data = oe.data(pd.DataFrame(bc.data), bc.feature_names) 16 | 17 | np.random.seed(123456) 18 | 19 | 20 | # --- SECTION 3 --- 21 | # Create the ensembles and calculate the homogeneity score 22 | for K in [2, 3, 4, 5, 6, 7]: 23 | for ensemble_size in [3, 4, 5]: 24 | ensemble = oe.cluster(cluster_data) 25 | for i in range(ensemble_size): 26 | name = f'kmeans_{ensemble_size}_{i}' 27 | ensemble.cluster('parent', 'kmeans', name, K) 28 | 29 | preds = ensemble.finish_co_occ_linkage(threshold=0.5) 30 | print(f'K: {K}, size {ensemble_size}:', end=' ') 31 | print('%.2f' % sklearn.metrics.homogeneity_score( 32 | bc.target, preds.labels['co_occ_linkage'])) 33 | 34 | 35 | -------------------------------------------------------------------------------- /Chapter08/oe_graph_closure.py: -------------------------------------------------------------------------------- 1 | # --- SECTION 1 --- 2 | # Libraries and data loading 3 | import openensembles as oe 4 | import numpy as np 5 | import pandas as pd 6 | import sklearn.metrics 7 | 8 | from sklearn.datasets import load_breast_cancer 9 | 10 | bc = load_breast_cancer() 11 | 12 | # --- SECTION 2 --- 13 | # Create the data object 14 | cluster_data = oe.data(pd.DataFrame(bc.data), bc.feature_names) 15 | 16 | np.random.seed(123456) 17 | # --- SECTION 3 --- 18 | # Create the ensembles and calculate the homogeneity score 19 | for K in [2, 3, 4, 5, 6, 7]: 20 | for ensemble_size in [3, 4, 5]: 21 | ensemble = oe.cluster(cluster_data) 22 | for i in range(ensemble_size): 23 | name = f'kmeans_{ensemble_size}_{i}' 24 | ensemble.cluster('parent', 'kmeans', name, K) 25 | 26 | preds = ensemble.finish_graph_closure(threshold=0.5) 27 | print(f'K: {K}, size {ensemble_size}:', end=' ') 28 | print('%.2f' % sklearn.metrics.homogeneity_score( 29 | bc.target, preds.labels['graph_closure'])) 30 | 31 | 32 | -------------------------------------------------------------------------------- /Chapter08/oe_vote.py: -------------------------------------------------------------------------------- 1 | # --- SECTION 1 --- 2 | # Libraries and data loading 3 | import openensembles as oe 4 | import numpy as np 5 | import pandas as pd 6 | import sklearn.metrics 7 | 8 | from sklearn.datasets import load_breast_cancer 9 | 10 | 11 | bc = load_breast_cancer() 12 | 13 | # --- SECTION 2 --- 14 | # Create the data object 15 | cluster_data = oe.data(pd.DataFrame(bc.data), bc.feature_names) 16 | 17 | np.random.seed(123456) 18 | # --- SECTION 3 --- 19 | # Create the ensembles and calculate the homogeneity score 20 | for K in [2, 3, 4, 5, 6, 7]: 21 | for ensemble_size in [3, 4, 5]: 22 | ensemble = oe.cluster(cluster_data) 23 | for i in range(ensemble_size): 24 | name = f'kmeans_{ensemble_size}_{i}' 25 | 
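# Note that kmeans_cluster.py and kmeans_raw.py above call
# metrics.homogeneity_score and metrics.silhouette_score but never import the
# module, so they stop with a NameError as written. Adding the import next to
# their other scikit-learn imports makes them run:
from sklearn import metrics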
ensemble.cluster('parent', 'kmeans', name, K) 26 | 27 | preds = ensemble.finish_majority_vote(threshold=0.5) 28 | print(f'K: {K}, size {ensemble_size}:', end=' ') 29 | print('%.2f' % sklearn.metrics.homogeneity_score( 30 | bc.target, preds.labels['majority_vote'])) 31 | 32 | 33 | -------------------------------------------------------------------------------- /Chapter08/oe_vote_tsne.py: -------------------------------------------------------------------------------- 1 | # --- SECTION 1 --- 2 | # Libraries and data loading 3 | import openensembles as oe 4 | import numpy as np 5 | import pandas as pd 6 | import sklearn.metrics 7 | 8 | from sklearn.datasets import load_breast_cancer 9 | from sklearn.manifold import TSNE 10 | 11 | bc = load_breast_cancer() 12 | t = TSNE() 13 | # --- SECTION 2 --- 14 | # Create the data object 15 | cluster_data = oe.data(pd.DataFrame(t.fit_transform(bc.data)), [0,1]) 16 | 17 | np.random.seed(123456) 18 | # --- SECTION 3 --- 19 | # Create the ensembles and calculate the homogeneity score 20 | for K in [2, 3, 4, 5, 6, 7]: 21 | for ensemble_size in [3, 4, 5]: 22 | ensemble = oe.cluster(cluster_data) 23 | for i in range(ensemble_size): 24 | name = f'kmeans_{ensemble_size}_{i}' 25 | ensemble.cluster('parent', 'kmeans', name, K) 26 | 27 | preds = ensemble.finish_majority_vote(threshold=0.5) 28 | print(f'K: {K}, size {ensemble_size}:', end=' ') 29 | print('%.2f' % sklearn.metrics.homogeneity_score( 30 | bc.target, preds.labels['majority_vote'])) 31 | 32 | 33 | -------------------------------------------------------------------------------- /Chapter08/voting_example.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | from sklearn.cluster import KMeans 5 | 6 | np.random.seed(123456) 7 | 8 | dat = [] 9 | 10 | t = 0.5 11 | 12 | for i in range(10): 13 | 14 | # dat.append(np.random.uniform(size=2)) 15 | c = np.random.randint(3) 16 | a = np.random.uniform() * 2 * 3.14 17 | r = t * np.sqrt(np.random.uniform()) 18 | 19 | x = r * np.cos(a) 20 | y = r * np.sin(a) 21 | 22 | 23 | dat.append([c/2+x, c/2+y]) 24 | 25 | 26 | clusterers = [] 27 | for _ in range(3): 28 | km = KMeans(n_clusters=3) 29 | noise = np.random.uniform(low=-0.5, high=0.5, size=(3,2)) 30 | km.cluster_centers_ = np.array([[0, 0], [0.5, 0.5], [1, 1]]) + noise 31 | clusterers.append(km) 32 | 33 | 34 | 35 | plt.figure() 36 | for i, clusterer in enumerate(clusterers): 37 | 38 | plt.subplot(1, 3, i+1) 39 | plt.xticks([]) 40 | plt.yticks([]) 41 | c = clusterer.predict(dat) 42 | print(c) 43 | plt.scatter(*zip(*dat), c=c) 44 | c = clusterer.predict(clusterer.cluster_centers_) 45 | plt.scatter(*zip(*clusterer.cluster_centers_), c='w', marker='*', s=240, edgecolors='r') 46 | plt.title('Clustering: %d'%i) 47 | 48 | 49 | plt.figure() 50 | dat = np.array(dat) 51 | plt.xticks([]) 52 | plt.yticks([]) 53 | c = np.array([0, 0, 1, 1, 1, 0, 0, 0, 0, 1]) 54 | plt.scatter(*zip(*dat[c == 0]), c='C0', label='Cluster 0') 55 | plt.scatter(*zip(*dat[c == 1]), c='C1', label='Cluster 1') 56 | plt.legend() 57 | plt.title('Voting Clustering') 58 | 59 | 60 | -------------------------------------------------------------------------------- /Chapter09/adaboost.py: -------------------------------------------------------------------------------- 1 | # --- SECTION 1 --- 2 | # Libraries and data loading 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from sklearn.ensemble import AdaBoostClassifier 7 | from sklearn.model_selection import 
train_test_split 8 | from sklearn.utils import shuffle 9 | from sklearn import metrics 10 | 11 | 12 | 13 | np.random.seed(123456) 14 | data = pd.read_csv('creditcard.csv') 15 | data.Time = (data.Time-data.Time.min())/data.Time.std() 16 | data.Amount = (data.Amount-data.Amount.mean())/data.Amount.std() 17 | 18 | # Train-Test slpit of 70%-30% 19 | x_train, x_test, y_train, y_test = train_test_split( 20 | data.drop('Class', axis=1).values, data.Class.values, test_size=0.3) 21 | 22 | # --- SECTION 2 --- 23 | # Ensemble evaluation 24 | ensemble = AdaBoostClassifier(n_estimators=70, learning_rate=1.0) 25 | 26 | ensemble.fit(x_train, y_train) 27 | 28 | print('AdaBoost f1', metrics.f1_score(y_test, ensemble.predict(x_test))) 29 | print('AdaBoost recall', metrics.recall_score(y_test, ensemble.predict(x_test))) 30 | 31 | 32 | 33 | # --- SECTION 3 --- 34 | # Filter features according to their correlation to the target 35 | np.random.seed(123456) 36 | threshold = 0.1 37 | 38 | correlations = data.corr()['Class'].drop('Class') 39 | fs = list(correlations[(abs(correlations)>threshold)].index.values) 40 | fs.append('Class') 41 | data = data[fs] 42 | 43 | x_train, x_test, y_train, y_test = train_test_split( 44 | data.drop('Class', axis=1).values, data.Class.values, test_size=0.3) 45 | 46 | ensemble = AdaBoostClassifier(n_estimators=70, learning_rate=1.0) 47 | 48 | ensemble.fit(x_train, y_train) 49 | 50 | print('AdaBoost f1', metrics.f1_score(y_test, ensemble.predict(x_test))) 51 | print('AdaBoost recall', metrics.recall_score(y_test, ensemble.predict(x_test))) 52 | -------------------------------------------------------------------------------- /Chapter09/bagging.py: -------------------------------------------------------------------------------- 1 | # --- SECTION 1 --- 2 | # Libraries and data loading 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from sklearn.ensemble import BaggingClassifier 7 | from sklearn.tree import DecisionTreeClassifier 8 | from sklearn.model_selection import train_test_split 9 | from sklearn import metrics 10 | 11 | 12 | 13 | 14 | np.random.seed(123456) 15 | data = pd.read_csv('creditcard.csv') 16 | data.Time = (data.Time-data.Time.min())/data.Time.std() 17 | data.Amount = (data.Amount-data.Amount.mean())/data.Amount.std() 18 | 19 | # Train-Test slpit of 70%-30% 20 | x_train, x_test, y_train, y_test = train_test_split( 21 | data.drop('Class', axis=1).values, data.Class.values, test_size=0.3) 22 | 23 | # --- SECTION 2 --- 24 | # Ensemble evaluation 25 | ensemble = BaggingClassifier(n_estimators=10, 26 | base_estimator=DecisionTreeClassifier(max_depth=8)) 27 | 28 | ensemble.fit(x_train, y_train) 29 | 30 | print('Bagging f1', metrics.f1_score(y_test, ensemble.predict(x_test))) 31 | print('Bagging recall', metrics.recall_score(y_test, ensemble.predict(x_test))) 32 | 33 | 34 | 35 | # --- SECTION 3 --- 36 | # Filter features according to their correlation to the target 37 | np.random.seed(123456) 38 | threshold = 0.1 39 | 40 | correlations = data.corr()['Class'].drop('Class') 41 | fs = list(correlations[(abs(correlations)>threshold)].index.values) 42 | fs.append('Class') 43 | data = data[fs] 44 | 45 | x_train, x_test, y_train, y_test = train_test_split( 46 | data.drop('Class', axis=1).values, data.Class.values, test_size=0.3) 47 | 48 | ensemble = BaggingClassifier(n_estimators=10, 49 | base_estimator=DecisionTreeClassifier(max_depth=8)) 50 | 51 | ensemble.fit(x_train, y_train) 52 | 53 | print('Bagging f1', metrics.f1_score(y_test, ensemble.predict(x_test))) 54 | print('Bagging 
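# Plain accuracy is a poor yardstick for the credit-card data used throughout
# this chapter: frauds are roughly 0.17% of the rows, so always predicting the
# legitimate class is about 99.8% accurate while catching no fraud at all,
# which is why the scripts above report F1 and recall instead. A tiny sketch,
# assuming y_test from any of those scripts:
import numpy as np
from sklearn import metrics
always_legit = np.zeros_like(y_test)
print('Accuracy of always-legitimate:', metrics.accuracy_score(y_test, always_legit))
print('Recall of always-legitimate:', metrics.recall_score(y_test, always_legit))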
recall', metrics.recall_score(y_test, ensemble.predict(x_test))) 55 | -------------------------------------------------------------------------------- /Chapter09/base.py: -------------------------------------------------------------------------------- 1 | # --- SECTION 1 --- 2 | # Libraries and data loading 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from sklearn.tree import DecisionTreeClassifier 7 | from sklearn.linear_model import LogisticRegression 8 | from sklearn.naive_bayes import GaussianNB 9 | from sklearn.model_selection import train_test_split 10 | from sklearn import metrics 11 | 12 | 13 | 14 | 15 | np.random.seed(123456) 16 | data = pd.read_csv('creditcard.csv') 17 | data.Time = (data.Time-data.Time.min())/data.Time.std() 18 | data.Amount = (data.Amount-data.Amount.mean())/data.Amount.std() 19 | 20 | # Train-Test slpit of 70%-30% 21 | x_train, x_test, y_train, y_test = train_test_split( 22 | data.drop('Class', axis=1).values, data.Class.values, test_size=0.3) 23 | 24 | # --- SECTION 2 --- 25 | # Base learners evaluation 26 | base_classifiers = [('DT', DecisionTreeClassifier(max_depth=6)), 27 | ('NB', GaussianNB()), 28 | ('LR', LogisticRegression())] 29 | 30 | for bc in base_classifiers: 31 | lr = bc[1] 32 | lr.fit(x_train, y_train) 33 | 34 | predictions = lr.predict(x_test) 35 | print(bc[0]+' f1', metrics.f1_score(y_test, predictions)) 36 | print(bc[0]+' recall', metrics.recall_score(y_test, predictions)) 37 | print(metrics.confusion_matrix(y_test, predictions)) 38 | 39 | # --- SECTION 3 --- 40 | # Filter features according to their correlation to the target 41 | np.random.seed(123456) 42 | threshold = 0.1 43 | 44 | correlations = data.corr()['Class'].drop('Class') 45 | fs = list(correlations[(abs(correlations)>threshold)].index.values) 46 | fs.append('Class') 47 | data = data[fs] 48 | 49 | x_train, x_test, y_train, y_test = train_test_split( 50 | data.drop('Class', axis=1).values, data.Class.values, test_size=0.3) 51 | 52 | for bc in base_classifiers: 53 | lr = bc[1] 54 | lr.fit(x_train, y_train) 55 | 56 | predictions = lr.predict(x_test) 57 | print(bc[0]+' f1', metrics.f1_score(y_test, predictions)) 58 | print(bc[0]+' recall', metrics.recall_score(y_test, predictions)) 59 | print(metrics.confusion_matrix(y_test, predictions)) -------------------------------------------------------------------------------- /Chapter09/dt_optimize.py: -------------------------------------------------------------------------------- 1 | # --- SECTION 1 --- 2 | # Libraries and data loading 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from sklearn.tree import DecisionTreeClassifier 8 | from sklearn.linear_model import LogisticRegression 9 | from sklearn.naive_bayes import GaussianNB 10 | from sklearn.model_selection import train_test_split 11 | from sklearn import metrics 12 | 13 | 14 | 15 | 16 | np.random.seed(123456) 17 | data = pd.read_csv('creditcard.csv') 18 | data.Time = (data.Time-data.Time.min())/data.Time.std() 19 | data.Amount = (data.Amount-data.Amount.mean())/data.Amount.std() 20 | 21 | # Train-Test slpit of 70%-30% 22 | x_train, x_test, y_train, y_test = train_test_split( 23 | data.drop('Class', axis=1).values, data.Class.values, test_size=0.3) 24 | 25 | # --- SECTION 2 --- 26 | # Base learners evaluation 27 | base_classifiers = [('DT', DecisionTreeClassifier(max_depth=6)), 28 | ('NB', GaussianNB()), 29 | ('LR', LogisticRegression())] 30 | 31 | raw_f1 = [] 32 | raw_recall = [] 33 | range_ = [x for x in range(3,12)] 34 | for max_d 
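# The correlation filter used in the scripts above keeps only the columns whose
# absolute Pearson correlation with Class exceeds the threshold. A short sketch
# that just reports which features survive, assuming data and threshold from
# base.py above, run before SECTION 3 reassigns data to the filtered frame:
correlations = data.corr()['Class'].drop('Class')
kept = correlations[abs(correlations) > threshold].index.tolist()
print('%d features kept:' % len(kept), kept)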
in range_: 35 | lr = DecisionTreeClassifier(max_depth=max_d) 36 | lr.fit(x_train, y_train) 37 | 38 | predictions = lr.predict(x_test) 39 | raw_f1.append(metrics.f1_score(y_test, predictions)) 40 | raw_recall.append(metrics.recall_score(y_test, predictions)) 41 | 42 | plt.plot(range_, raw_f1, label='Raw F1') 43 | plt.plot(range_, raw_recall, label='Raw Recall') 44 | print(raw_f1) 45 | print(raw_recall) 46 | # --- SECTION 3 --- 47 | # Filter features according to their correlation to the target 48 | np.random.seed(123456) 49 | threshold = 0.1 50 | 51 | correlations = data.corr()['Class'].drop('Class') 52 | fs = list(correlations[(abs(correlations)>threshold)].index.values) 53 | fs.append('Class') 54 | data = data[fs] 55 | 56 | x_train, x_test, y_train, y_test = train_test_split(data.drop('Class', axis=1).values, data.Class.values, test_size=0.3) 57 | 58 | filter_f1 = [] 59 | filter_recall = [] 60 | for max_d in range_: 61 | lr = DecisionTreeClassifier(max_depth=max_d) 62 | lr.fit(x_train, y_train) 63 | 64 | predictions = lr.predict(x_test) 65 | filter_f1.append(metrics.f1_score(y_test, predictions)) 66 | filter_recall.append(metrics.recall_score(y_test, predictions)) 67 | 68 | print(filter_f1) 69 | print(filter_recall) 70 | 71 | plt.plot(range_, filter_f1, label='Filtered F1') 72 | plt.plot(range_, filter_recall, label='Filtered Recall') -------------------------------------------------------------------------------- /Chapter09/exploratory.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import pandas as pd 3 | 4 | from sklearn.model_selection import train_test_split 5 | 6 | import warnings 7 | 8 | warnings.filterwarnings("ignore") 9 | 10 | data = pd.read_csv('creditcard.csv') 11 | 12 | data.Time = (data.Time-data.Time.min())/data.Time.std() 13 | data.Amount = (data.Amount-data.Amount.mean())/data.Amount.std() 14 | 15 | plt.figure() 16 | data.groupby('Class').V1.count().plot(kind='bar') 17 | plt.title('0-1 Class distribution') 18 | 19 | plt.figure() 20 | ax = data.Amount.hist(grid=False, bins=50) 21 | ax.set_yscale("log", nonposy='clip') 22 | plt.title('Amount') 23 | 24 | plt.figure() 25 | data.Time.hist(grid=False, bins=50) 26 | plt.title('Time') 27 | 28 | plt.figure() 29 | correlations = data.corr()['Class'].drop('Class') 30 | correlations.sort_values().plot(kind='bar') 31 | plt.title('Correlations to Class') 32 | 33 | 34 | 35 | 36 | 37 | frauds = data[data.Class == 1] 38 | non_frauds = data[data.Class == 0] 39 | 40 | frauds_no = len(frauds) 41 | 42 | balanced_data = pd.concat([frauds, non_frauds.sample(frauds_no)]) 43 | 44 | plt.figure() 45 | balanced_data.groupby('Class').V1.count().plot(kind='bar') 46 | plt.title('0-1 Class distribution (subsampled)') 47 | 48 | plt.figure() 49 | ax = balanced_data.Amount.hist(grid=False, bins=50) 50 | ax.set_yscale("log", nonposy='clip') 51 | plt.title('Amount (subsampled)') 52 | 53 | plt.figure() 54 | correlations = balanced_data.corr()['Class'].drop('Class') 55 | correlations.sort_values().plot(kind='bar') 56 | plt.title('Correlations to Class (subsampled)') 57 | 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /Chapter09/logistic_regression.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed May 15 19:20:55 2019 4 | 5 | @author: George Kyriakides 6 | ge.kyriakides@gmail.com 7 | """ 8 | import numpy as np 9 | import pandas as pd 10 | 11 | 
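# Note: this script fits a plain logistic regression twice: once on all features, and once on only the features whose absolute correlation with Class exceeds the 0.1 threshold (the 'Selected Features' section below).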
12 | from sklearn.linear_model import LogisticRegression 13 | from sklearn.model_selection import train_test_split 14 | from sklearn.utils import shuffle 15 | from sklearn import metrics 16 | 17 | 18 | np.random.seed(123456) 19 | data = pd.read_csv('creditcard.csv') 20 | data.Time = (data.Time-data.Time.min())/data.Time.std() 21 | data.Amount = (data.Amount-data.Amount.mean())/data.Amount.std() 22 | 23 | x_train, x_test, y_train, y_test = train_test_split(data.drop('Class', axis=1).values, data.Class.values, test_size=0.3) 24 | 25 | 26 | X, Y = shuffle(x_train, y_train) 27 | 28 | lr = LogisticRegression() 29 | lr.fit(X, Y) 30 | 31 | print('f1', metrics.f1_score(y_test, lr.predict(x_test))) 32 | print('recall', metrics.recall_score(y_test, lr.predict(x_test))) 33 | 34 | 35 | # ============================================================================= 36 | # Selected Features 37 | # ============================================================================= 38 | 39 | np.random.seed(123456) 40 | threshold = 0.1 41 | 42 | correlations = data.corr()['Class'].drop('Class') 43 | fs = list(correlations[(abs(correlations)>threshold)].index.values) 44 | fs.append('Class') 45 | data = data[fs] 46 | 47 | x_train, x_test, y_train, y_test = train_test_split(data.drop('Class', axis=1).values, data.Class.values, test_size=0.3) 48 | 49 | X, Y = shuffle(x_train, y_train) 50 | lr = LogisticRegression() 51 | lr.fit(X, Y) 52 | 53 | print('f1', metrics.f1_score(y_test, lr.predict(x_test))) 54 | print('recall', metrics.recall_score(y_test, lr.predict(x_test))) 55 | -------------------------------------------------------------------------------- /Chapter09/random_forest.py: -------------------------------------------------------------------------------- 1 | # --- SECTION 1 --- 2 | # Libraries and data loading 3 | import numpy as np 4 | import pandas as pd 5 | 6 | 7 | from sklearn.ensemble import RandomForestClassifier 8 | from sklearn.model_selection import train_test_split 9 | from sklearn.utils import shuffle 10 | from sklearn import metrics 11 | 12 | 13 | np.random.seed(123456) 14 | data = pd.read_csv('creditcard.csv') 15 | data.Time = (data.Time-data.Time.min())/data.Time.std() 16 | data.Amount = (data.Amount-data.Amount.mean())/data.Amount.std() 17 | 18 | np.random.seed(123456) 19 | data = pd.read_csv('creditcard.csv') 20 | data.Time = (data.Time-data.Time.min())/data.Time.std() 21 | data.Amount = (data.Amount-data.Amount.mean())/data.Amount.std() 22 | 23 | # Train-Test slpit of 70%-30% 24 | x_train, x_test, y_train, y_test = train_test_split( 25 | data.drop('Class', axis=1).values, data.Class.values, test_size=0.3) 26 | 27 | # --- SECTION 2 --- 28 | # Ensemble evaluation 29 | ensemble = RandomForestClassifier(criterion='entropy', n_jobs=4) 30 | 31 | ensemble.fit(x_train, y_train) 32 | 33 | print('RF f1', metrics.f1_score(y_test, ensemble.predict(x_test))) 34 | print('RF recall', metrics.recall_score(y_test, ensemble.predict(x_test))) 35 | 36 | 37 | 38 | # --- SECTION 3 --- 39 | # Filter features according to their correlation to the target 40 | np.random.seed(123456) 41 | threshold = 0.1 42 | 43 | correlations = data.corr()['Class'].drop('Class') 44 | fs = list(correlations[(abs(correlations)>threshold)].index.values) 45 | fs.append('Class') 46 | data = data[fs] 47 | 48 | x_train, x_test, y_train, y_test = train_test_split( 49 | data.drop('Class', axis=1).values, data.Class.values, test_size=0.3) 50 | 51 | ensemble = RandomForestClassifier(criterion='entropy', n_jobs=4) 52 | 53 | ensemble.fit(x_train, 
y_train) 54 | 55 | print('RF f1', metrics.f1_score(y_test, ensemble.predict(x_test))) 56 | print('RF recall', metrics.recall_score(y_test, ensemble.predict(x_test))) -------------------------------------------------------------------------------- /Chapter09/stacking.py: -------------------------------------------------------------------------------- 1 | # --- SECTION 1 --- 2 | # Libraries and data loading 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from stacking_classifier import Stacking 7 | from sklearn.tree import DecisionTreeClassifier 8 | from sklearn.linear_model import LogisticRegression 9 | from sklearn.naive_bayes import GaussianNB 10 | from sklearn.svm import LinearSVC 11 | from sklearn.model_selection import train_test_split 12 | from sklearn import metrics 13 | 14 | 15 | np.random.seed(123456) 16 | data = pd.read_csv('creditcard.csv') 17 | data.Time = (data.Time-data.Time.min())/data.Time.std() 18 | data.Amount = (data.Amount-data.Amount.mean())/data.Amount.std() 19 | 20 | # Train-Test slpit of 70%-30% 21 | x_train, x_test, y_train, y_test = train_test_split( 22 | data.drop('Class', axis=1).values, data.Class.values, test_size=0.3) 23 | 24 | # --- SECTION 2 --- 25 | # Ensemble evaluation 26 | base_classifiers = [DecisionTreeClassifier(max_depth=5), 27 | GaussianNB(), 28 | LogisticRegression(), 29 | DecisionTreeClassifier(max_depth=3), 30 | DecisionTreeClassifier(max_depth=8)] 31 | 32 | l1_classifiers = [DecisionTreeClassifier(max_depth=2), LinearSVC()] 33 | 34 | ensemble = Stacking(learner_levels=[base_classifiers, l1_classifiers, 35 | [LogisticRegression()]]) 36 | 37 | 38 | ensemble.fit(x_train, y_train) 39 | 40 | print('Stacking f1', metrics.f1_score(y_test, ensemble.predict(x_test))) 41 | print('Stacking recall', metrics.recall_score(y_test, ensemble.predict(x_test))) 42 | 43 | 44 | # --- SECTION 3 --- 45 | # Filter features according to their correlation to the target 46 | np.random.seed(123456) 47 | threshold = 0.1 48 | 49 | correlations = data.corr()['Class'].drop('Class') 50 | fs = list(correlations[(abs(correlations) > threshold)].index.values) 51 | fs.append('Class') 52 | data = data[fs] 53 | 54 | x_train, x_test, y_train, y_test = train_test_split( 55 | data.drop('Class', axis=1).values, data.Class.values, test_size=0.3) 56 | 57 | ensemble = Stacking(learner_levels=[base_classifiers, l1_classifiers, 58 | [LogisticRegression()]]) 59 | 60 | ensemble.fit(x_train, y_train) 61 | 62 | print('Stacking f1', metrics.f1_score(y_test, ensemble.predict(x_test))) 63 | print('Stacking recall', metrics.recall_score(y_test, ensemble.predict(x_test))) 64 | -------------------------------------------------------------------------------- /Chapter09/stacking_classifier.py: -------------------------------------------------------------------------------- 1 | from sklearn.base import BaseEstimator 2 | from copy import deepcopy 3 | from sklearn.model_selection import KFold 4 | 5 | import numpy as np 6 | 7 | class Stacking(BaseEstimator): 8 | 9 | # --- SECTION 2 --- 10 | # The constructor 11 | def __init__(self, learner_levels): 12 | # Create a list of sizes for each stacking level 13 | # And a list of deep copied learners 14 | self.level_sizes = [] 15 | self.learners = [] 16 | self.learner_levels = learner_levels 17 | for learning_level in self.learner_levels: 18 | 19 | self.level_sizes.append(len(learning_level)) 20 | level_learners = [] 21 | for learner in learning_level: 22 | level_learners.append(deepcopy(learner)) 23 | self.learners.append(level_learners) 24 | 25 | 26 | 27 | 
# --- SECTION 3 --- 28 | # The fit function. Creates training meta data for every level and trains 29 | # each level on the previous level's meta data 30 | def fit(self, x, y): 31 | # Create a list of training meta data, one for each stacking level 32 | # and another one for the targets. For the first level, the actual data 33 | # is used. 34 | meta_data = [x] 35 | meta_targets = [y] 36 | for i in range(len(self.learners)): 37 | level_size = self.level_sizes[i] 38 | 39 | # Create the meta data and target variables for this level 40 | data_z = np.zeros((level_size, len(x))) 41 | target_z = np.zeros(len(x)) 42 | 43 | train_x = meta_data[i] 44 | train_y = meta_targets[i] 45 | 46 | # Create the cross-validation folds 47 | KF = KFold(n_splits=3) 48 | meta_index = 0 49 | for train_indices, test_indices in KF.split(x): 50 | # Train each learner on the K-1 folds and create 51 | # meta data for the Kth fold 52 | for j in range(len(self.learners[i])): 53 | 54 | learner = self.learners[i][j] 55 | learner.fit(train_x[train_indices], train_y[train_indices]) 56 | predictions = learner.predict(train_x[test_indices]) 57 | 58 | data_z[j][meta_index:meta_index+len(test_indices)] = predictions 59 | 60 | target_z[meta_index:meta_index+len(test_indices)] = train_y[test_indices] 61 | meta_index += len(test_indices) 62 | 63 | # Add the data and targets to the meta data lists 64 | data_z = data_z.transpose() 65 | meta_data.append(data_z) 66 | meta_targets.append(target_z) 67 | 68 | 69 | # Train the learner on the whole previous meta data 70 | for learner in self.learners[i]: 71 | learner.fit(train_x, train_y) 72 | 73 | 74 | 75 | 76 | 77 | 78 | # --- SECTION 4 --- 79 | # The predict function. Creates meta data for the test data and returns 80 | # all of them. The actual predictions can be accessed with meta_data[-1] 81 | def predict(self, x): 82 | 83 | # Create a list of training meta data, one for each stacking level 84 | meta_data = [x] 85 | for i in range(len(self.learners)): 86 | level_size = self.level_sizes[i] 87 | 88 | data_z = np.zeros((level_size, len(x))) 89 | 90 | test_x = meta_data[i] 91 | 92 | # Create the cross-validation folds 93 | KF = KFold(n_splits=3) 94 | for train_indices, test_indices in KF.split(x): 95 | # Train each learner on the K-1 folds and create 96 | # meta data for the Kth fold 97 | for j in range(len(self.learners[i])): 98 | 99 | learner = self.learners[i][j] 100 | predictions = learner.predict(test_x) 101 | data_z[j] = predictions 102 | 103 | 104 | 105 | # Add the data and targets to the meta data lists 106 | data_z = data_z.transpose() 107 | meta_data.append(data_z) 108 | 109 | # Return the meta_data the final layer's prediction can be accessed 110 | # With meta_data[-1] 111 | return meta_data[-1] 112 | 113 | def predict_proba(self, x): 114 | 115 | # Create a list of training meta data, one for each stacking level 116 | meta_data = [x] 117 | for i in range(len(self.learners)-1): 118 | level_size = self.level_sizes[i] 119 | 120 | data_z = np.zeros((level_size, len(x))) 121 | 122 | test_x = meta_data[i] 123 | 124 | # Create the cross-validation folds 125 | KF = KFold(n_splits=5) 126 | for train_indices, test_indices in KF.split(x): 127 | # Train each learner on the K-1 folds and create 128 | # meta data for the Kth fold 129 | for j in range(len(self.learners[i])): 130 | 131 | learner = self.learners[i][j] 132 | predictions = learner.predict(test_x) 133 | data_z[j] = predictions 134 | 135 | 136 | 137 | # Add the data and targets to the meta data lists 138 | data_z = data_z.transpose() 
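# After the transpose, each row corresponds to a sample and each column to one base learner's predictions, i.e. the meta-features passed on to the next stacking level.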
139 | meta_data.append(data_z) 140 | 141 | learner = self.learners[-1][-1] 142 | return learner.predict_proba(meta_data[-1]) -------------------------------------------------------------------------------- /Chapter09/unrelated_presentation_phd.py: -------------------------------------------------------------------------------- 1 | 2 | percentage_f = lambda x: '%.2f %%'%x if x>2.5 else '' 3 | 4 | 5 | counts = publisher.groupby('Publisher').Publisher.count() 6 | sorted_vals = counts.sort_values(ascending=False) 7 | 8 | explode = [0.6 if x<10 else 0 for x in sorted_vals.values ] 9 | 10 | sorted_vals.plot.pie(autopct=percentage_f, explode=explode) -------------------------------------------------------------------------------- /Chapter09/voting.py: -------------------------------------------------------------------------------- 1 | # --- SECTION 1 --- 2 | # Libraries and data loading 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from sklearn.ensemble import VotingClassifier 7 | from sklearn.tree import DecisionTreeClassifier 8 | from sklearn.linear_model import LogisticRegression 9 | from sklearn.naive_bayes import GaussianNB 10 | from sklearn.model_selection import train_test_split 11 | from sklearn import metrics 12 | 13 | 14 | 15 | 16 | np.random.seed(123456) 17 | data = pd.read_csv('creditcard.csv') 18 | data.Time = (data.Time-data.Time.min())/data.Time.std() 19 | data.Amount = (data.Amount-data.Amount.mean())/data.Amount.std() 20 | 21 | # Train-Test slpit of 70%-30% 22 | x_train, x_test, y_train, y_test = train_test_split( 23 | data.drop('Class', axis=1).values, data.Class.values, test_size=0.3) 24 | 25 | # --- SECTION 2 --- 26 | # Ensemble evaluation 27 | base_classifiers = [('DT', DecisionTreeClassifier(max_depth=5)), 28 | ('NB', GaussianNB()), 29 | ('ensemble', LogisticRegression()), 30 | ('DT2', DecisionTreeClassifier(max_depth=3)), 31 | ('DT3', DecisionTreeClassifier(max_depth=8))] 32 | 33 | ensemble = VotingClassifier(base_classifiers) 34 | ensemble.fit(x_train, y_train) 35 | 36 | print('Voting f1', metrics.f1_score(y_test, ensemble.predict(x_test))) 37 | print('Voting recall', metrics.recall_score(y_test, ensemble.predict(x_test))) 38 | 39 | 40 | 41 | # --- SECTION 3 --- 42 | # Filter features according to their correlation to the target 43 | np.random.seed(123456) 44 | threshold = 0.1 45 | 46 | correlations = data.corr()['Class'].drop('Class') 47 | fs = list(correlations[(abs(correlations)>threshold)].index.values) 48 | fs.append('Class') 49 | data = data[fs] 50 | 51 | x_train, x_test, y_train, y_test = train_test_split( 52 | data.drop('Class', axis=1).values, data.Class.values, test_size=0.3) 53 | 54 | ensemble = VotingClassifier(base_classifiers) 55 | ensemble.fit(x_train, y_train) 56 | 57 | print('Voting f1', metrics.f1_score(y_test, ensemble.predict(x_test))) 58 | print('Voting recall', metrics.recall_score(y_test, ensemble.predict(x_test))) 59 | -------------------------------------------------------------------------------- /Chapter09/xgboosting.py: -------------------------------------------------------------------------------- 1 | # --- SECTION 1 --- 2 | # Libraries and data loading 3 | import numpy as np 4 | import pandas as pd 5 | 6 | 7 | from sklearn.model_selection import train_test_split 8 | from sklearn.utils import shuffle 9 | from sklearn import metrics 10 | 11 | from xgboost import XGBClassifier 12 | 13 | 14 | np.random.seed(123456) 15 | data = pd.read_csv('creditcard.csv') 16 | data.Time = (data.Time-data.Time.min())/data.Time.std() 17 | data.Amount = 
(data.Amount-data.Amount.mean())/data.Amount.std() 18 | 19 | # Train-Test slpit of 70%-30% 20 | x_train, x_test, y_train, y_test = train_test_split( 21 | data.drop('Class', axis=1).values, data.Class.values, test_size=0.3) 22 | 23 | # --- SECTION 2 --- 24 | # Ensemble evaluation 25 | ensemble = XGBClassifier(max_depth=5, n_jobs=4) 26 | 27 | ensemble.fit(x_train, y_train) 28 | 29 | print('XGB f1', metrics.f1_score(y_test, ensemble.predict(x_test))) 30 | print('XGB recall', metrics.recall_score(y_test, ensemble.predict(x_test))) 31 | 32 | 33 | 34 | # --- SECTION 3 --- 35 | # Filter features according to their correlation to the target 36 | np.random.seed(123456) 37 | threshold = 0.1 38 | 39 | correlations = data.corr()['Class'].drop('Class') 40 | fs = list(correlations[(abs(correlations)>threshold)].index.values) 41 | fs.append('Class') 42 | data = data[fs] 43 | 44 | x_train, x_test, y_train, y_test = train_test_split( 45 | data.drop('Class', axis=1).values, data.Class.values, test_size=0.3) 46 | 47 | ensemble = XGBClassifier(max_depth=5, n_jobs=4) 48 | 49 | ensemble.fit(x_train, y_train) 50 | 51 | print('XGB f1', metrics.f1_score(y_test, ensemble.predict(x_test))) 52 | print('XGB recall', metrics.recall_score(y_test, ensemble.predict(x_test))) 53 | -------------------------------------------------------------------------------- /Chapter10/bagging.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from simulator import simulate 5 | from sklearn import metrics 6 | from sklearn.ensemble import BaggingRegressor 7 | from sklearn.tree import DecisionTreeRegressor 8 | from sklearn.model_selection import train_test_split 9 | 10 | np.random.seed(123456) 11 | 12 | lr = BaggingRegressor(base_estimator=DecisionTreeRegressor(max_depth=1)) 13 | 14 | data = pd.read_csv('BTC-USD.csv') 15 | data = data.dropna() 16 | data.Date = pd.to_datetime(data.Date) 17 | data.set_index('Date', drop=True, inplace=True) 18 | diffs = (data.Close.diff()/data.Close).values[1:] 19 | 20 | diff_len = len(diffs) 21 | 22 | 23 | 24 | def create_x_data(lags=1): 25 | diff_data = np.zeros((diff_len, lags)) 26 | 27 | for lag in range(1, lags+1): 28 | this_data = diffs[:-lag] 29 | diff_data[lag:, lag-1] = this_data 30 | 31 | return diff_data 32 | 33 | x_data = create_x_data(lags=20)*100 34 | y_data = diffs*100 35 | 36 | # REPRODUCIBILITY 37 | x_data = np.around(x_data, decimals=8) 38 | y_data = np.around(y_data, decimals=8) 39 | 40 | # ============================================================================= 41 | # WALK FORWARD 42 | # ============================================================================= 43 | 44 | window = 150 45 | preds = np.zeros(diff_len-window) 46 | for i in range(diff_len-window-1): 47 | x_train = x_data[i:i+window, :] 48 | y_train = y_data[i:i+window] 49 | lr.fit(x_train, y_train) 50 | preds[i] = lr.predict(x_data[i+window+1, :].reshape(1, -1)) 51 | 52 | 53 | print('Percentages MSE: %.2f'%metrics.mean_squared_error(y_data[window:], preds)) 54 | simulate(data, preds) 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /Chapter10/boosting.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from simulator import simulate 5 | from sklearn import metrics 6 | from sklearn.model_selection import train_test_split 7 | 8 | from xgboost import XGBRegressor 9 | np.random.seed(123456) 10 | 11 
| 12 | lr = XGBRegressor(n_jobs=5, max_depth=2, n_estimators=10, reg_alpha=0.5) 13 | 14 | data = pd.read_csv('BTC-USD.csv') 15 | data = data.dropna() 16 | data.Date = pd.to_datetime(data.Date) 17 | data.set_index('Date', drop=True, inplace=True) 18 | diffs = (data.Close.diff()/data.Close).values[1:] 19 | 20 | diff_len = len(diffs) 21 | 22 | 23 | 24 | def create_x_data(lags=1): 25 | diff_data = np.zeros((diff_len, lags)) 26 | ma_data = np.zeros((diff_len, lags)) 27 | 28 | diff_ma = (data.Close.diff()/data.Close).rolling(15).mean().fillna(0).values[1:] 29 | for lag in range(1, lags+1): 30 | this_data = diffs[:-lag] 31 | diff_data[lag:, lag-1] = this_data 32 | 33 | this_data = diff_ma[:-lag] 34 | ma_data[lag:, lag-1] = this_data 35 | return np.concatenate((diff_data, ma_data), axis=1) 36 | 37 | 38 | x_data = create_x_data(lags=30)*100 39 | y_data = diffs*100 40 | 41 | # REPRODUCIBILITY 42 | x_data = np.around(x_data, decimals=8) 43 | y_data = np.around(y_data, decimals=8) 44 | 45 | # ============================================================================= 46 | # WALK FORWARD 47 | # ============================================================================= 48 | 49 | window = 150 50 | preds = np.zeros(diff_len-window) 51 | for i in range(diff_len-window-1): 52 | x_train = x_data[i:i+window, :] 53 | y_train = y_data[i:i+window] 54 | lr.fit(x_train, y_train) 55 | preds[i] = lr.predict(x_data[i+window+1, :].reshape(1, -1)) 56 | 57 | 58 | print('Percentages MSE: %.2f'%metrics.mean_squared_error(y_data[window:], preds)) 59 | simulate(data, preds) 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /Chapter10/exploratory.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import pandas as pd 4 | 5 | 6 | from statsmodels.graphics.tsaplots import plot_acf, plot_pacf 7 | 8 | # Read the data, parse the dates and set the dates as index 9 | data = pd.read_csv('BTC-USD.csv') 10 | data = data.dropna() 11 | data.Date = pd.to_datetime(data.Date) 12 | data.set_index('Date', drop=True, inplace=True) 13 | 14 | 15 | # ============================================================================= 16 | # ORIGINAL 17 | # ============================================================================= 18 | # Plot ACF-> Non-Stationary 19 | plot_acf(data.Close, lags=30) 20 | plt.xlabel('Date') 21 | plt.ylabel('Correlation') 22 | 23 | # ============================================================================= 24 | # Percentage Differences 25 | # ============================================================================= 26 | 27 | # Make two subplots 28 | fig, axes = plt.subplots(nrows=2, ncols=1) 29 | 30 | # Calculate the percentage differences 31 | diffs = data.Close.diff()/data.Close 32 | 33 | # Plot the rolling deviation 34 | diffs.rolling(30).std().plot(ax=axes[0]) 35 | plt.xlabel('Date') 36 | plt.ylabel('Std. 
Dev.') 37 | axes[0].title.set_text('Transformed Data Rolling Std.Dev.') 38 | 39 | diffs = diffs.dropna() 40 | 41 | # Plot ACF for percentage diffs 42 | plot_acf(diffs, lags=60, ax=axes[1]) 43 | plt.xlabel('Date') 44 | plt.ylabel('Correlation') 45 | 46 | # Plot the changes 47 | plt.figure() 48 | diffs.plot() 49 | plt.xlabel('Date') 50 | plt.ylabel('Change %') 51 | plt.title('Transformed Data') 52 | 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /Chapter10/random_forest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from simulator import simulate 5 | from sklearn import metrics 6 | from sklearn.ensemble import RandomForestRegressor 7 | from sklearn.model_selection import train_test_split 8 | 9 | np.random.seed(123456) 10 | lr = RandomForestRegressor(n_estimators=50, max_depth=2, n_jobs=5) 11 | 12 | data = pd.read_csv('BTC-USD.csv') 13 | data = data.dropna() 14 | data.Date = pd.to_datetime(data.Date) 15 | data.set_index('Date', drop=True, inplace=True) 16 | diffs = (data.Close.diff()/data.Close).values[1:] 17 | 18 | diff_len = len(diffs) 19 | 20 | 21 | 22 | def create_x_data(lags=1): 23 | diff_data = np.zeros((diff_len, lags)) 24 | ma_data = np.zeros((diff_len, lags)) 25 | 26 | diff_ma = (data.Close.diff()/data.Close).rolling(15).mean().fillna(0).values[1:] 27 | for lag in range(1, lags+1): 28 | this_data = diffs[:-lag] 29 | diff_data[lag:, lag-1] = this_data 30 | 31 | this_data = diff_ma[:-lag] 32 | ma_data[lag:, lag-1] = this_data 33 | return np.concatenate((diff_data, ma_data), axis=1) 34 | 35 | x_data = create_x_data(lags=30)*100 36 | y_data = diffs*100 37 | 38 | # REPRODUCIBILITY 39 | x_data = np.around(x_data, decimals=8) 40 | y_data = np.around(y_data, decimals=8) 41 | 42 | # ============================================================================= 43 | # WALK FORWARD 44 | # ============================================================================= 45 | 46 | window = 150 47 | preds = np.zeros(diff_len-window) 48 | for i in range(diff_len-window-1): 49 | x_train = x_data[i:i+window, :] 50 | y_train = y_data[i:i+window] 51 | lr.fit(x_train, y_train) 52 | preds[i] = lr.predict(x_data[i+window+1, :].reshape(1, -1)) 53 | 54 | 55 | print('Percentages MSE: %.2f'%metrics.mean_squared_error(y_data[window:], preds)) 56 | simulate(data, preds) 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /Chapter10/regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from simulator import simulate 5 | from sklearn import metrics 6 | from sklearn.linear_model import LinearRegression 7 | from sklearn.model_selection import train_test_split 8 | 9 | np.random.seed(123456) 10 | lr = LinearRegression() 11 | 12 | data = pd.read_csv('BTC-USD.csv') 13 | data = data.dropna() 14 | data.Date = pd.to_datetime(data.Date) 15 | data.set_index('Date', drop=True, inplace=True) 16 | diffs = (data.Close.diff()/data.Close).values[1:] 17 | 18 | diff_len = len(diffs) 19 | 20 | 21 | 22 | def create_x_data(lags=1): 23 | diff_data = np.zeros((diff_len, lags)) 24 | 25 | for lag in range(1, lags+1): 26 | this_data = diffs[:-lag] 27 | diff_data[lag:, lag-1] = this_data 28 | 29 | return diff_data 30 | 31 | # REPRODUCIBILITY 32 | x_data = create_x_data(lags=20)*100 33 | y_data = diffs*100 34 | 35 | 36 | x_data = np.around(x_data, decimals=8) 
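# y_data is rounded below as well, so that both inputs and targets stay identical across runs (the REPRODUCIBILITY note above).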
37 | y_data = np.around(y_data, decimals=8) 38 | 39 | # ============================================================================= 40 | # WALK FORWARD 41 | # ============================================================================= 42 | 43 | window = 150 44 | preds = np.zeros(diff_len-window) 45 | for i in range(diff_len-window-1): 46 | x_train = x_data[i:i+window, :] 47 | y_train = y_data[i:i+window] 48 | lr.fit(x_train, y_train) 49 | preds[i] = lr.predict(x_data[i+window+1, :].reshape(1, -1)) 50 | 51 | 52 | print('Percentages MSE: %.2f'%metrics.mean_squared_error(y_data[window:], preds)) 53 | simulate(data, preds) 54 | 55 | -------------------------------------------------------------------------------- /Chapter10/simulator.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import pandas as pd 4 | 5 | from sklearn import metrics 6 | 7 | 8 | def simulate(data, preds): 9 | # Constants and placeholders 10 | buy_threshold = 0.5 11 | stake = 100 12 | 13 | true, pred, balances = [], [], [] 14 | 15 | buy_price = 0 16 | buy_points, sell_points = [], [] 17 | balance = 0 18 | 19 | start_index = len(data)-len(preds)-1 20 | # Calculate predicted values 21 | for i in range(len(preds)): 22 | 23 | last_close = data.Close[i+start_index-1] 24 | current_close = data.Close[i+start_index] 25 | 26 | # Save predicted values and true values 27 | true.append(current_close) 28 | pred.append(last_close*(1+preds[i]/100)) 29 | 30 | 31 | # Buy/Sell according to signal 32 | if preds[i] > buy_threshold and buy_price == 0: 33 | buy_price = true[-1] 34 | buy_points.append(i) 35 | 36 | elif preds[i] < -buy_threshold and not buy_price == 0: 37 | profit = (current_close - buy_price) * stake/buy_price 38 | balance += profit 39 | buy_price = 0 40 | sell_points.append(i) 41 | 42 | balances.append(balance) 43 | 44 | 45 | true = np.array(true) 46 | pred = np.array(pred) 47 | 48 | # Create plots 49 | plt.figure() 50 | 51 | plt.subplot(2, 1, 1) 52 | plt.plot(true, label='True') 53 | plt.plot(pred, label='pred') 54 | plt.scatter(buy_points, true[buy_points]+500, marker='v', 55 | c='blue', s=5, zorder=10) 56 | plt.scatter(sell_points, true[sell_points]-500, marker='^' 57 | , c='red', s=5, zorder=10) 58 | plt.title('Trades') 59 | 60 | plt.subplot(2, 1, 2) 61 | plt.plot(balances) 62 | plt.title('Profit') 63 | print('MSE: %.2f'%metrics.mean_squared_error(true, pred)) 64 | balance_df = pd.DataFrame(balances) 65 | 66 | pct_returns = balance_df.diff()/stake 67 | pct_returns = pct_returns[pct_returns != 0].dropna() 68 | 69 | 70 | print('Sharpe: %.2f'%(np.mean(pct_returns)/np.std(pct_returns))) -------------------------------------------------------------------------------- /Chapter10/simulator_plain.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import pandas as pd 4 | 5 | from sklearn import metrics 6 | 7 | 8 | def simulate(data, preds): 9 | true, pred = [], [] 10 | 11 | start_index = len(data)-len(preds)-1 12 | for i in range(len(preds)): 13 | 14 | last_close = data.Close[i+start_index-1] 15 | current_close = data.Close[i+start_index] 16 | 17 | true.append(current_close) 18 | pred.append(last_close*(1+preds[i]/100)) 19 | 20 | 21 | 22 | 23 | 24 | true = np.array(true) 25 | pred = np.array(pred) 26 | 27 | plt.figure() 28 | 29 | plt.plot(true, label='True') 30 | plt.plot(pred, label='pred') 31 | 32 | print('MSE: 
%.2f'%metrics.mean_squared_error(true, pred)) 33 | -------------------------------------------------------------------------------- /Chapter10/stacking.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from simulator import simulate 5 | from sklearn import metrics 6 | from sklearn.neighbors import KNeighborsRegressor 7 | from sklearn.linear_model import LinearRegression 8 | from sklearn.svm import SVR 9 | from sklearn.model_selection import train_test_split 10 | from stacking_regressor import StackingRegressor 11 | 12 | np.random.seed(123456) 13 | 14 | 15 | lr = SVR() 16 | 17 | data = pd.read_csv('BTC-USD.csv') 18 | data = data.dropna() 19 | data.Date = pd.to_datetime(data.Date) 20 | data.set_index('Date', drop=True, inplace=True) 21 | diffs = (data.Close.diff()/data.Close).values[1:] 22 | 23 | diff_len = len(diffs) 24 | 25 | base_learners = [[SVR(), KNeighborsRegressor()], 26 | [LinearRegression()]] 27 | 28 | lr = StackingRegressor(base_learners) 29 | 30 | def create_x_data(lags=1): 31 | diff_data = np.zeros((diff_len, lags)) 32 | 33 | for lag in range(1, lags+1): 34 | this_data = diffs[:-lag] 35 | diff_data[lag:, lag-1] = this_data 36 | 37 | return diff_data 38 | 39 | x_data = create_x_data(lags=20)*100 40 | y_data = diffs*100 41 | 42 | # REPRODUCIBILITY 43 | x_data = np.around(x_data, decimals=8) 44 | y_data = np.around(y_data, decimals=8) 45 | 46 | # ============================================================================= 47 | # WALK FORWARD 48 | # ============================================================================= 49 | 50 | window = 150 51 | preds = np.zeros(diff_len-window) 52 | for i in range(diff_len-window-1): 53 | x_train = x_data[i:i+window, :] 54 | y_train = y_data[i:i+window] 55 | lr.fit(x_train, y_train) 56 | preds[i] = lr.predict(x_data[i+window+1, :].reshape(1, -1))[-1] 57 | 58 | 59 | print('Percentages MSE: %.2f'%metrics.mean_squared_error(y_data[window:], preds)) 60 | simulate(data, preds) 61 | 62 | -------------------------------------------------------------------------------- /Chapter10/stacking_regressor.py: -------------------------------------------------------------------------------- 1 | # --- SECTION 1 --- 2 | # Libraries 3 | import numpy as np 4 | 5 | from sklearn.model_selection import KFold 6 | from copy import deepcopy 7 | 8 | 9 | class StackingRegressor(): 10 | 11 | # --- SECTION 2 --- 12 | # The constructor 13 | def __init__(self, learners): 14 | # Create a list of sizes for each stacking level 15 | # And a list of deep copied learners 16 | self.level_sizes = [] 17 | self.learners = [] 18 | for learning_level in learners: 19 | 20 | self.level_sizes.append(len(learning_level)) 21 | level_learners = [] 22 | for learner in learning_level: 23 | level_learners.append(deepcopy(learner)) 24 | self.learners.append(level_learners) 25 | 26 | 27 | 28 | # --- SECTION 3 --- 29 | # The fit function. Creates training meta data for every level and trains 30 | # each level on the previous level's meta data 31 | def fit(self, x, y): 32 | # Create a list of training meta data, one for each stacking level 33 | # and another one for the targets. For the first level, the actual data 34 | # is used. 
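# For example, with learners=[[SVR(), KNeighborsRegressor()], [LinearRegression()]] (as in Chapter10/stacking.py), meta_data[1] becomes an (n_samples, 2) array of out-of-fold SVR and KNN predictions, on which the LinearRegression meta learner is then trained.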
35 | meta_data = [x] 36 | meta_targets = [y] 37 | for i in range(len(self.learners)): 38 | level_size = self.level_sizes[i] 39 | 40 | # Create the meta data and target variables for this level 41 | data_z = np.zeros((level_size, len(x))) 42 | target_z = np.zeros(len(x)) 43 | 44 | train_x = meta_data[i] 45 | train_y = meta_targets[i] 46 | 47 | # Create the cross-validation folds 48 | KF = KFold(n_splits=5) 49 | meta_index = 0 50 | for train_indices, test_indices in KF.split(x): 51 | # Train each learner on the K-1 folds and create 52 | # meta data for the Kth fold 53 | for j in range(len(self.learners[i])): 54 | 55 | learner = self.learners[i][j] 56 | learner.fit(train_x[train_indices], train_y[train_indices]) 57 | predictions = learner.predict(train_x[test_indices]) 58 | 59 | data_z[j][meta_index:meta_index+len(test_indices)] = predictions 60 | 61 | target_z[meta_index:meta_index+len(test_indices)] = train_y[test_indices] 62 | meta_index += len(test_indices) 63 | 64 | # Add the data and targets to the meta data lists 65 | data_z = data_z.transpose() 66 | meta_data.append(data_z) 67 | meta_targets.append(target_z) 68 | 69 | 70 | # Train the learner on the whole previous meta data 71 | for learner in self.learners[i]: 72 | learner.fit(train_x, train_y) 73 | 74 | 75 | 76 | 77 | 78 | 79 | # --- SECTION 4 --- 80 | # The predict function. Creates meta data for the test data and returns 81 | # all of them. The actual predictions can be accessed with meta_data[-1] 82 | def predict(self, x): 83 | 84 | # Create a list of training meta data, one for each stacking level 85 | meta_data = [x] 86 | for i in range(len(self.learners)): 87 | level_size = self.level_sizes[i] 88 | 89 | data_z = np.zeros((level_size, len(x))) 90 | 91 | test_x = meta_data[i] 92 | 93 | 94 | for j in range(len(self.learners[i])): 95 | 96 | learner = self.learners[i][j] 97 | predictions = learner.predict(test_x) 98 | data_z[j] = predictions 99 | 100 | 101 | 102 | # Add the data and targets to the meta data lists 103 | data_z = data_z.transpose() 104 | meta_data.append(data_z) 105 | 106 | # Return the meta_data the final layer's prediction can be accessed 107 | # With meta_data[-1] 108 | return meta_data 109 | 110 | -------------------------------------------------------------------------------- /Chapter10/voting.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from simulator import simulate 5 | from sklearn import metrics 6 | from sklearn.neighbors import KNeighborsRegressor 7 | from sklearn.linear_model import LinearRegression 8 | from sklearn.svm import SVR 9 | from voting_regressor import VotingRegressor 10 | 11 | np.random.seed(123456) 12 | 13 | 14 | lr = SVR() 15 | 16 | data = pd.read_csv('BTC-USD.csv') 17 | data = data.dropna() 18 | data.Date = pd.to_datetime(data.Date) 19 | data.set_index('Date', drop=True, inplace=True) 20 | diffs = (data.Close.diff()/data.Close).values[1:] 21 | 22 | diff_len = len(diffs) 23 | 24 | base_learners = [('SVR', SVR()), 25 | ('LR', LinearRegression()), 26 | ('KNN', KNeighborsRegressor())] 27 | 28 | lr = VotingRegressor(base_learners) 29 | def create_x_data(lags=1): 30 | diff_data = np.zeros((diff_len, lags)) 31 | ma_data = np.zeros((diff_len, lags)) 32 | 33 | diff_ma = (data.Close.diff()/data.Close).rolling(15).mean().fillna(0).values[1:] 34 | for lag in range(1, lags+1): 35 | this_data = diffs[:-lag] 36 | diff_data[lag:, lag-1] = this_data 37 | 38 | this_data = diff_ma[:-lag] 39 | ma_data[lag:, lag-1] = 
this_data 40 | return np.concatenate((diff_data, ma_data), axis=1) 41 | 42 | x_data = create_x_data(lags=20)*100 43 | y_data = diffs*100 44 | 45 | # REPRODUCIBILITY 46 | x_data = np.around(x_data, decimals=8) 47 | y_data = np.around(y_data, decimals=8) 48 | 49 | # ============================================================================= 50 | # WALK FORWARD 51 | # ============================================================================= 52 | 53 | window = 150 54 | preds = np.zeros(diff_len-window) 55 | for i in range(diff_len-window-1): 56 | x_train = x_data[i:i+window, :] 57 | y_train = y_data[i:i+window] 58 | lr.fit(x_train, y_train) 59 | preds[i] = lr.predict(x_data[i+window+1, :].reshape(1, -1)) 60 | 61 | 62 | print('Percentages MSE: %.2f'%metrics.mean_squared_error(y_data[window:], preds)) 63 | simulate(data, preds) 64 | -------------------------------------------------------------------------------- /Chapter10/voting_regressor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from copy import deepcopy 3 | 4 | class VotingRegressor(): 5 | 6 | # Accepts a list of (name, classifier) tuples 7 | def __init__(self, base_learners): 8 | self.base_learners = {} 9 | for name, learner in base_learners: 10 | self.base_learners[name] = deepcopy(learner) 11 | 12 | 13 | # Fits each individual base learner 14 | def fit(self, x_data, y_data): 15 | for name in self.base_learners: 16 | learner = self.base_learners[name] 17 | learner.fit(x_data, y_data) 18 | 19 | # Generates the predictions 20 | def predict(self, x_data): 21 | 22 | # Create the predictions matrix 23 | predictions = np.zeros((len(x_data), len(self.base_learners))) 24 | 25 | names = list(self.base_learners.keys()) 26 | 27 | # For each base learner 28 | for i in range(len(self.base_learners)): 29 | name = names[i] 30 | learner = self.base_learners[name] 31 | 32 | # Store the predictions in a column 33 | preds = learner.predict(x_data) 34 | predictions[:,i] = preds 35 | 36 | # Take the row-average 37 | predictions = np.mean(predictions, axis=1) 38 | return predictions -------------------------------------------------------------------------------- /Chapter11/base_learners_twitter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from sklearn import metrics 5 | from sklearn.ensemble import VotingClassifier 6 | from sklearn.feature_extraction.text import TfidfVectorizer 7 | from sklearn.linear_model import LogisticRegression, RidgeClassifier 8 | from sklearn.naive_bayes import MultinomialNB 9 | # Load the data 10 | data = pd.read_csv('sent140_preprocessed.csv') 11 | data = data.dropna() 12 | 13 | 14 | 15 | 16 | # Set the train and test sizes 17 | train_size = 10000 18 | test_start = 10000 19 | test_end = 100000 20 | 21 | 22 | 23 | def check_features_ngrams(features, n_grams, classifiers): 24 | 25 | print(features, n_grams) 26 | 27 | # Create the IDF feature extractor 28 | tf = TfidfVectorizer(max_features=features, ngram_range=n_grams, 29 | stop_words='english') 30 | 31 | # Create the IDF features 32 | tf.fit(data.text) 33 | transformed = tf.transform(data.text) 34 | 35 | np.random.seed(123456) 36 | 37 | def check_classifier(name, classifier): 38 | print('--'+name+'--') 39 | 40 | # Train the classifier 41 | x_data = transformed[:train_size].toarray() 42 | y_data = data.polarity[:train_size].values 43 | 44 | classifier.fit(x_data, y_data) 45 | i_s = metrics.accuracy_score(y_data, 
classifier.predict(x_data)) 46 | 47 | # Evaluate on the test set 48 | x_data = transformed[test_start:test_end].toarray() 49 | y_data = data.polarity[test_start:test_end].values 50 | oos = metrics.accuracy_score(y_data, classifier.predict(x_data)) 51 | 52 | # Expor the results 53 | with open("outs.txt","a") as f: 54 | f.write(str(features)+',') 55 | f.write(str(n_grams[-1])+',') 56 | f.write(name+',') 57 | f.write('%.4f'%i_s+',') 58 | f.write('%.4f'%oos+'\n') 59 | 60 | for name, classifier in classifiers: 61 | check_classifier(name, classifier) 62 | 63 | 64 | # Create csv header 65 | with open("outs.txt","a") as f: 66 | f.write('features,ngram_range,classifier,train_acc,test_acc') 67 | 68 | # Test all features and n-grams combinations 69 | for features in [500, 1000, 5000, 10000, 20000, 30000]: 70 | for n_grams in [(1, 1), (1, 2), (1, 3)]: 71 | 72 | # Create the ensemble 73 | voting = VotingClassifier([('LR', LogisticRegression()), 74 | ('NB', MultinomialNB()), 75 | ('Ridge', RidgeClassifier())]) 76 | 77 | # Create the named classifiers 78 | classifiers = [('LR', LogisticRegression()), 79 | ('NB', MultinomialNB()), 80 | ('Ridge', RidgeClassifier()), 81 | ('Voting', voting)] 82 | 83 | # Evaluate them 84 | check_features_ngrams(features, n_grams, classifiers) 85 | 86 | 87 | -------------------------------------------------------------------------------- /Chapter11/comparisons.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | 4 | 5 | 6 | res = pd.read_csv('outs_old.csv') 7 | 8 | i = 1 9 | for key, grp in res.groupby(['classifier']): 10 | plt.subplot(2, 2, i) 11 | i += 1 12 | plt.title(str(key)) 13 | for key2, grp2 in grp.groupby(['ngram_range']): 14 | plt.plot(grp2.features.values, grp2['test_acc'].values, label=str(key2)+'-gram') 15 | # plt.xscale('log') 16 | plt.legend() 17 | plt.xlabel('features') 18 | plt.ylabel('accuracy') 19 | plt.xscale('log') -------------------------------------------------------------------------------- /Chapter11/data_cleaning.py: -------------------------------------------------------------------------------- 1 | 2 | import pandas as pd 3 | import re 4 | from nltk.corpus import stopwords 5 | from nltk.stem import PorterStemmer 6 | from string import punctuation 7 | 8 | # Read the data and assign labels 9 | labels = ['polarity', 'id', 'date', 'query', 'user', 'text'] 10 | data = pd.read_csv("sent140.csv", names=labels) 11 | 12 | # Keep only text and polarity, change polarity to 0-1 13 | data = data[['text', 'polarity']] 14 | data.polarity.replace(4, 1, inplace=True) 15 | 16 | # Create a list of stopwords 17 | stops = stopwords.words("english") 18 | 19 | # Add stop variants without single quotes 20 | no_quotes = [] 21 | for word in stops: 22 | if "'" in word: 23 | no_quotes.append(re.sub(r'\'', '', word)) 24 | stops.extend(no_quotes) 25 | 26 | 27 | def clean_string(string): 28 | # Remove HTML entities 29 | tmp = re.sub(r'\&\w*;', '', string) 30 | # Remove @user 31 | tmp = re.sub(r'@(\w+)', '', tmp) 32 | # Remove links 33 | tmp = re.sub(r'(http|https|ftp)://[a-zA-Z0-9\\./]+', '', tmp) 34 | # Lowercase 35 | tmp = tmp.lower() 36 | # Remove Hashtags 37 | tmp = re.sub(r'#(\w+)', '', tmp) 38 | # Remove repeating chars 39 | tmp = re.sub(r'(.)\1{1,}', r'\1\1', tmp) 40 | # Remove anything that is not letters 41 | tmp = re.sub("[^a-zA-Z]", " ", tmp) 42 | # Remove anything that is less than two characters 43 | tmp = re.sub(r'\b\w{1,2}\b', '', tmp) 44 | # Remove multiple spaces 45 | tmp = 
re.sub(r'\s\s+', ' ', tmp) 46 | return tmp 47 | 48 | 49 | 50 | def preprocess(string): 51 | 52 | stemmer = PorterStemmer() 53 | # Remove any punctuation character 54 | removed_punc = ''.join([char for char in string if char not in punctuation]) 55 | 56 | cleaned = [] 57 | # Remove any stopword 58 | for word in removed_punc.split(' '): 59 | if word not in stops: 60 | cleaned.append(stemmer.stem(word.lower())) 61 | return ' '.join(cleaned) 62 | 63 | 64 | 65 | 66 | # Shuffle 67 | data = data.sample(frac=1).reset_index(drop=True) 68 | # Clean 69 | data.text = data.text.apply(clean_string) 70 | # Pre-process 71 | data.text = data.text.apply(preprocess) 72 | # Save to CSV 73 | data.to_csv('sent140_preprocessed.csv', index=False) -------------------------------------------------------------------------------- /Chapter11/exploratory.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import pandas as pd 3 | 4 | from collections import Counter 5 | 6 | # Read the data and assign labels 7 | labels = ['polarity', 'id', 'date', 'query', 'user', 'text'] 8 | data = pd.read_csv("sent140.csv", names=labels) 9 | 10 | # Plot polarities 11 | data.groupby('polarity').id.count().plot(kind='bar') 12 | 13 | # Get most frequent words 14 | data['words'] = data.text.str.split() 15 | 16 | words = [] 17 | # Get a list of all words 18 | for w in data.words: 19 | words.extend(w) 20 | 21 | # Get the frequencies and plot 22 | freqs = Counter(words).most_common(30) 23 | plt.plot(*zip(*freqs)) 24 | plt.xticks(rotation=80) 25 | plt.ylabel('Count') 26 | plt.title('30 most common words.') 27 | 28 | -------------------------------------------------------------------------------- /Chapter11/stream_sentiment.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pandas as pd 3 | 4 | from sklearn.ensemble import VotingClassifier 5 | from sklearn.feature_extraction.text import TfidfVectorizer 6 | from sklearn.linear_model import LogisticRegression, RidgeClassifier 7 | from sklearn.naive_bayes import MultinomialNB 8 | from tweepy import OAuthHandler, Stream, StreamListener 9 | 10 | # Please fill your API keys as strings 11 | consumer_key="" 12 | consumer_secret="" 13 | 14 | access_token="" 15 | access_token_secret="" 16 | 17 | 18 | 19 | # Load the data 20 | data = pd.read_csv('sent140_preprocessed.csv') 21 | data = data.dropna() 22 | # Replicate our voting classifier for 30.000 features and 1-3 n-grams 23 | train_size = 10000 24 | 25 | tf = TfidfVectorizer(max_features=30000, ngram_range=(1, 3), 26 | stop_words='english') 27 | tf.fit(data.text) 28 | transformed = tf.transform(data.text) 29 | 30 | x_data = transformed[:train_size].toarray() 31 | y_data = data.polarity[:train_size].values 32 | 33 | voting = VotingClassifier([('LR', LogisticRegression()), 34 | ('NB', MultinomialNB()), 35 | ('Ridge', RidgeClassifier())]) 36 | 37 | voting.fit(x_data, y_data) 38 | 39 | 40 | # Define the streaming classifier 41 | class StreamClassifier(StreamListener): 42 | 43 | def __init__(self, classifier, vectorizer, api=None): 44 | super().__init__(api) 45 | self.clf = classifier 46 | self.vec = vectorizer 47 | 48 | # What to do when a tweet arrives 49 | def on_data(self, data): 50 | # Create a json object 51 | json_format = json.loads(data) 52 | # Get the tweet's text 53 | text = json_format['text'] 54 | 55 | features = self.vec.transform([text]).toarray() 56 | print(text, self.clf.predict(features)) 57 | return True 58 | 
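# Each incoming tweet is vectorized with the TfidfVectorizer fitted on the training corpus above and classified by the voting ensemble; returning True keeps the stream listener connected.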
59 | # If an error occurs, print the status 60 | def on_error(self, status): 61 | print(status) 62 | 63 | # Create the classifier and authentication handlers 64 | classifier = StreamClassifier(classifier=voting, vectorizer=tf) 65 | auth = OAuthHandler(consumer_key, consumer_secret) 66 | auth.set_access_token(access_token, access_token_secret) 67 | 68 | # Listen for specific hashtags 69 | stream = Stream(auth, classifier) 70 | stream.filter(track=['basketball']) -------------------------------------------------------------------------------- /Chapter12/ensemble_fc_models.py: -------------------------------------------------------------------------------- 1 | from keras.layers import Input, Embedding, Flatten, Dot, Dense, Concatenate 2 | from keras.models import Model 3 | from sklearn import metrics 4 | from sklearn.model_selection import train_test_split 5 | from sklearn.linear_model import BayesianRidge 6 | 7 | 8 | import matplotlib.pyplot as plt 9 | import numpy as np 10 | import pandas as pd 11 | 12 | np.random.seed(123456) 13 | 14 | def get_data(): 15 | # Read the data and drop timestamp 16 | data = pd.read_csv('ratings.csv') 17 | data.drop('timestamp', axis=1, inplace=True) 18 | 19 | # Re-map the indices 20 | users = data.userId.unique() 21 | movies = data.movieId.unique() 22 | # Create maps from old to new indices 23 | moviemap={} 24 | for i in range(len(movies)): 25 | moviemap[movies[i]]=i 26 | usermap={} 27 | for i in range(len(users)): 28 | usermap[users[i]]=i 29 | 30 | # Change the indices 31 | data.movieId = data.movieId.apply(lambda x: moviemap[x]) 32 | data.userId = data.userId.apply(lambda x: usermap[x]) 33 | 34 | # Shuffle the data 35 | data = data.sample(frac=1.0).reset_index(drop=True) 36 | 37 | # Create a train/test split 38 | train, test = train_test_split(data, test_size=0.2) 39 | 40 | n_users = len(users) 41 | n_movies = len(movies) 42 | 43 | return train, test, n_users, n_movies 44 | 45 | 46 | train, test, n_users, n_movies = get_data() 47 | 48 | 49 | def create_model(n_features=5, train_model=True, load_weights=False): 50 | fts = n_features 51 | 52 | # Movie part. Input accepts the index as input 53 | # and passes it to the Embedding layer. Finally, 54 | # Flatten transforms Embedding's output to a 55 | # one-dimensional tensor. 56 | movie_in = Input(shape=[1], name="Movie") 57 | mov_embed = Embedding(n_movies, fts, name="Movie_Embed")(movie_in) 58 | flat_movie = Flatten(name="FlattenM")(mov_embed) 59 | 60 | # Repeat for the user. 
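# Each user index is mapped to its own dense vector of length fts by a separate Embedding table with n_users rows, exactly as in the movie branch above.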
61 | user_in = Input(shape=[1], name="User") 62 | user_inuser_embed = Embedding(n_users, fts, name="User_Embed")(user_in) 63 | flat_user = Flatten(name="FlattenU")(user_inuser_embed) 64 | 65 | # Concatenate the Embedding layers and feed them 66 | # to the Dense part of the network 67 | concat = Concatenate()([flat_movie, flat_user]) 68 | dense_1 = Dense(128)(concat) 69 | dense_2 = Dense(32)(dense_1) 70 | out = Dense(1)(dense_2) 71 | 72 | # Create and compile the model 73 | model = Model([user_in, movie_in], out) 74 | model.compile('adam', 'mean_squared_error') 75 | # Train the model 76 | model.fit([train.userId, train.movieId], train.rating, epochs=10, verbose=1) 77 | 78 | return model 79 | 80 | def predictions(model): 81 | preds = model.predict([test.userId, test.movieId]) 82 | return preds 83 | 84 | # Create base and meta learner 85 | model5 = create_model(5) 86 | model10 = create_model(10) 87 | model15 = create_model(15) 88 | meta_learner = BayesianRidge() 89 | 90 | # Predict on the test set 91 | preds5 = predictions(model5) 92 | preds10 = predictions(model10) 93 | preds15 = predictions(model15) 94 | # Create a single array with the predictions 95 | preds = np.stack([preds5, preds10, preds15], axis=-1).reshape(-1, 3) 96 | 97 | 98 | # Fit the meta learner on all but the last 1000 test samples 99 | meta_learner.fit(preds[:-1000], test.rating[:-1000]) 100 | 101 | # Evaluate the base learners and the meta learner on the last 102 | # 1000 test samples 103 | print('Base Learner 5 Features') 104 | print(metrics.mean_squared_error(test.rating[-1000:], preds5[-1000:])) 105 | print('Base Learner 10 Features') 106 | print(metrics.mean_squared_error(test.rating[-1000:], preds10[-1000:])) 107 | print('Base Learner 15 Features') 108 | print(metrics.mean_squared_error(test.rating[-1000:], preds15[-1000:])) 109 | print('Ensemble') 110 | print(metrics.mean_squared_error(test.rating[-1000:], meta_learner.predict(preds[-1000:]))) 111 | 112 | -------------------------------------------------------------------------------- /Chapter12/exploratory.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import pandas as pd 3 | 4 | data = pd.read_csv('ratings.csv') 5 | 6 | print(data.head()) 7 | data.drop('timestamp', axis=1, inplace=True) 8 | 9 | 10 | data.rating.hist(grid=False) 11 | plt.ylabel('Frequency') 12 | plt.ylabel('Rating') 13 | plt.title('Rating Distribution') 14 | 15 | data.describe() -------------------------------------------------------------------------------- /Chapter12/single_dense_model.py: -------------------------------------------------------------------------------- 1 | from keras.layers import Input, Embedding, Flatten, Dot, Dense, Concatenate 2 | from keras.models import Model 3 | from sklearn.model_selection import train_test_split 4 | from sklearn import metrics 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | np.random.seed(123456) 10 | 11 | 12 | def get_data(): 13 | # Read the data and drop timestamp 14 | data = pd.read_csv('ratings.csv') 15 | data.drop('timestamp', axis=1, inplace=True) 16 | 17 | # Re-map the indices 18 | users = data.userId.unique() 19 | movies = data.movieId.unique() 20 | # Create maps from old to new indices 21 | moviemap={} 22 | for i in range(len(movies)): 23 | moviemap[movies[i]]=i 24 | usermap={} 25 | for i in range(len(users)): 26 | usermap[users[i]]=i 27 | 28 | # Change the indices 29 | data.movieId = data.movieId.apply(lambda x: moviemap[x]) 30 | data.userId = 
data.userId.apply(lambda x: usermap[x]) 31 | 32 | # Shuffle the data 33 | data = data.sample(frac=1.0).reset_index(drop=True) 34 | 35 | # Create a train/test split 36 | train, test = train_test_split(data, test_size=0.2) 37 | 38 | n_users = len(users) 39 | n_movies = len(movies) 40 | 41 | return train, test, n_users, n_movies 42 | 43 | 44 | train, test, n_users, n_movies = get_data() 45 | 46 | fts = 5 47 | 48 | # Movie part. Input accepts the index as input 49 | # and passes it to the Embedding layer. Finally, 50 | # Flatten transforms Embedding's output to a 51 | # one-dimensional tensor. 52 | movie_in = Input(shape=[1], name="Movie") 53 | mov_embed = Embedding(n_movies, fts, name="Movie_Embed")(movie_in) 54 | flat_movie = Flatten(name="FlattenM")(mov_embed) 55 | 56 | # Repeat for the user. 57 | user_in = Input(shape=[1], name="User") 58 | user_inuser_embed = Embedding(n_users, fts, name="User_Embed")(user_in) 59 | flat_user = Flatten(name="FlattenU")(user_inuser_embed) 60 | 61 | # Concatenate the Embedding layers and feed them 62 | # to the Dense part of the network 63 | concat = Concatenate()([flat_movie, flat_user]) 64 | dense_1 = Dense(128)(concat) 65 | dense_2 = Dense(32)(dense_1) 66 | out = Dense(1)(dense_2) 67 | 68 | # Create and compile the model 69 | model = Model([user_in, movie_in], out) 70 | model.compile('adam', 'mean_squared_error') 71 | 72 | # Train the model on the train set 73 | model.fit([train.userId, train.movieId], train.rating, epochs=10, verbose=1) 74 | 75 | # Evaluate on the test set 76 | print(metrics.mean_squared_error(test.rating, 77 | model.predict([test.userId, test.movieId]))) 78 | -------------------------------------------------------------------------------- /Chapter12/single_dot_model.py: -------------------------------------------------------------------------------- 1 | from keras.layers import Input, Embedding, Flatten, Dot 2 | from keras.models import Model 3 | from sklearn.model_selection import train_test_split 4 | from sklearn import metrics 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | np.random.seed(123456) 10 | 11 | 12 | def get_data(): 13 | # Read the data and drop timestamp 14 | data = pd.read_csv('ratings.csv') 15 | data.drop('timestamp', axis=1, inplace=True) 16 | 17 | # Re-map the indices 18 | users = data.userId.unique() 19 | movies = data.movieId.unique() 20 | # Create maps from old to new indices 21 | moviemap={} 22 | for i in range(len(movies)): 23 | moviemap[movies[i]]=i 24 | usermap={} 25 | for i in range(len(users)): 26 | usermap[users[i]]=i 27 | 28 | # Change the indices 29 | data.movieId = data.movieId.apply(lambda x: moviemap[x]) 30 | data.userId = data.userId.apply(lambda x: usermap[x]) 31 | 32 | # Shuffle the data 33 | data = data.sample(frac=1.0).reset_index(drop=True) 34 | 35 | # Create a train/test split 36 | train, test = train_test_split(data, test_size=0.2) 37 | 38 | n_users = len(users) 39 | n_movies = len(movies) 40 | 41 | return train, test, n_users, n_movies 42 | 43 | 44 | train, test, n_users, n_movies = get_data() 45 | 46 | fts = 5 47 | 48 | # Movie part. Input accepts the index as input 49 | # and passes it to the Embedding layer. Finally, 50 | # Flatten transforms Embedding's output to a 51 | # one-dimensional tensor. 52 | movie_in = Input(shape=[1], name="Movie") 53 | mov_embed = Embedding(n_movies, fts, name="Movie_Embed")(movie_in) 54 | flat_movie = Flatten(name="FlattenM")(mov_embed) 55 | 56 | # Repeat for the user. 
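# Because both embeddings have length fts, the Dot layer below collapses each (user, movie) pair to a single predicted rating.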
57 | user_in = Input(shape=[1], name="User") 58 | user_inuser_embed = Embedding(n_users, fts, name="User_Embed")(user_in) 59 | flat_user = Flatten(name="FlattenU")(user_inuser_embed) 60 | 61 | # Calculate the dot-product of the two embeddings 62 | prod = Dot(name="Mult", axes=1)([flat_movie, flat_user]) 63 | 64 | # Create and compile the model 65 | model = Model([user_in, movie_in], prod) 66 | model.compile('adam', 'mean_squared_error') 67 | 68 | # Train the model on the train set 69 | model.fit([train.userId, train.movieId], train.rating, epochs=10, verbose=1) 70 | 71 | # Evaluate on the test set 72 | print(metrics.mean_squared_error(test.rating, 73 | model.predict([test.userId, test.movieId]))) 74 | -------------------------------------------------------------------------------- /Chapter13/clustering.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import pandas as pd 3 | from sklearn import cluster 4 | 5 | data = pd.read_csv('WHR.csv') 6 | regs = pd.read_csv('Regions.csv') 7 | 8 | recents = data[data.Year == 2017] 9 | recents = recents.dropna(axis=1, how="all") 10 | recents = recents.fillna(recents.median()) 11 | 12 | 13 | def find_region(country): 14 | return regs[regs['Country name']==country].Region.values[-1] 15 | 16 | def find_region_size(region): 17 | return regs.groupby('Region')['Country name'].count()[region] 18 | 19 | km = cluster.KMeans(3) 20 | fits = recents[['Log GDP per capita', 21 | 'Social support', 'Healthy life expectancy at birth', 22 | 'Freedom to make life choices', 'Generosity', 23 | 'Perceptions of corruption','Positive affect', 'Negative affect', 24 | 'Confidence in national government', 'Democratic Quality', 25 | 'Delivery Quality']].values 26 | preds = km.fit_predict(fits) 27 | recents['Cluster'] = preds 28 | 29 | 30 | grouped = recents.groupby('Cluster')['Country name'] 31 | for key, item in grouped: 32 | countries = grouped.get_group(key).values 33 | regions = {x: 0 for x in regs.Region.unique()} 34 | for country in countries: 35 | regions[find_region(country)] = regions[find_region(country)]+1 36 | print(key, countries, regions, "\n\n") 37 | x, y = [], [] 38 | for k in regions: 39 | x.append(k) 40 | y.append(regions[k]/find_region_size(k)) 41 | plt.figure() 42 | plt.bar(x, y) 43 | plt.xticks(rotation=90) 44 | 45 | 46 | 47 | 48 | recents = recents.dropna(axis=1, how="any") 49 | recents = recents.fillna(recents.median()) 50 | 51 | 52 | 53 | km = cluster.KMeans(10) 54 | preds = km.fit_predict(recents.drop(['Year', 'Country name'], axis=1).values) 55 | recents['Cluster'] = preds 56 | 57 | grouped = recents.groupby('Cluster')['Country name'] 58 | for key, item in grouped: 59 | countries = grouped.get_group(key).values 60 | regions = {x: 0 for x in regs.Region.unique()} 61 | for country in countries: 62 | regions[find_region(country)] = regions[find_region(country)]+1 63 | print(key, countries, regions, "\n\n") 64 | x, y = [], [] 65 | for k in regions: 66 | x.append(k) 67 | y.append(regions[k]/find_region_size(k)) 68 | plt.figure() 69 | plt.bar(x, y) 70 | plt.xticks(rotation=90) 71 | 72 | 73 | 74 | -------------------------------------------------------------------------------- /Chapter13/ensemble_cluster.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import openensembles as oe 4 | import pandas as pd 5 | 6 | 7 | from sklearn import metrics 8 | 9 | 10 | # Load the datasets 11 | data = pd.read_csv('WHR.csv') 12 | regs = 
pd.read_csv('Regions.csv') 13 | 14 | # Use the 2017 data and fill any NaNs 15 | recents = data[data.Year == 2017] 16 | recents = recents.dropna(axis=1, how="all") 17 | recents = recents.fillna(recents.median()) 18 | 19 | 20 | # Use only these specific features 21 | columns = ['Log GDP per capita', 22 | 'Social support', 'Healthy life expectancy at birth', 23 | 'Freedom to make life choices', 'Generosity', 24 | 'Perceptions of corruption','Positive affect', 'Negative affect', 25 | 'Confidence in national government', 'Democratic Quality', 26 | 'Delivery Quality'] 27 | 28 | # Create the data object 29 | cluster_data = oe.data(recents[columns], columns) 30 | 31 | 32 | np.random.seed(123456) 33 | results = {'K':[], 'size':[], 'silhouette': []} 34 | # Test different ensemble setups 35 | Ks = [2, 4, 6, 8, 10, 12, 14] 36 | sizes = [5, 10, 20, 50] 37 | for K in Ks: 38 | for ensemble_size in sizes: 39 | ensemble = oe.cluster(cluster_data) 40 | for i in range(ensemble_size): 41 | name = f'kmeans_{ensemble_size}_{i}' 42 | ensemble.cluster('parent', 'kmeans', name, K) 43 | 44 | preds = ensemble.finish_co_occ_linkage(threshold=0.5) 45 | print(f'K: {K}, size {ensemble_size}:', end=' ') 46 | silhouette = metrics.silhouette_score(recents[columns], 47 | preds.labels['co_occ_linkage']) 48 | print('%.2f' % silhouette) 49 | results['K'].append(K) 50 | results['size'].append(ensemble_size) 51 | results['silhouette'].append(silhouette) 52 | 53 | results_df = pd.DataFrame(results) 54 | cross = pd.crosstab(results_df.K, results_df['size'], 55 | results_df['silhouette'], aggfunc=lambda x: x) 56 | 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /Chapter13/ensemble_cluster_normalized.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import openensembles as oe 4 | import pandas as pd 5 | 6 | 7 | from sklearn import metrics 8 | 9 | 10 | # Load the datasets 11 | data = pd.read_csv('WHR.csv') 12 | regs = pd.read_csv('Regions.csv') 13 | 14 | # Use the 2017 data and fill any NaNs 15 | recents = data[data.Year == 2017] 16 | recents = recents.dropna(axis=1, how="all") 17 | recents = recents.fillna(recents.median()) 18 | 19 | 20 | # Use only these specific features 21 | columns = ['Log GDP per capita', 22 | 'Social support', 'Healthy life expectancy at birth', 23 | 'Freedom to make life choices', 'Generosity', 24 | 'Perceptions of corruption','Positive affect', 'Negative affect', 25 | 'Confidence in national government', 'Democratic Quality', 26 | 'Delivery Quality'] 27 | 28 | # Normalize the features by subtracting the mean 29 | # and dividing by the standard deviation 30 | normalized = recents[columns] 31 | normalized = normalized - normalized.mean() 32 | normalized = normalized / normalized.std() 33 | # Create the data object from the normalized features 34 | cluster_data = oe.data(normalized, columns) 35 | 36 | 37 | np.random.seed(123456) 38 | results = {'K':[], 'size':[], 'silhouette': []} 39 | # Test different ensemble setups 40 | Ks = [2, 4, 6, 8, 10, 12, 14] 41 | sizes = [5, 10, 20, 50] 42 | for K in Ks: 43 | for ensemble_size in sizes: 44 | ensemble = oe.cluster(cluster_data) 45 | for i in range(ensemble_size): 46 | name = f'kmeans_{ensemble_size}_{i}' 47 | ensemble.cluster('parent', 'kmeans', name, K) 48 | 49 | preds = ensemble.finish_co_occ_linkage(threshold=0.5) 50 | print(f'K: {K}, size {ensemble_size}:', end=' ') 51 | silhouette = metrics.silhouette_score(recents[columns], 52 | 
preds.labels['co_occ_linkage']) 53 | print('%.2f' % silhouette) 54 | results['K'].append(K) 55 | results['size'].append(ensemble_size) 56 | results['silhouette'].append(silhouette) 57 | 58 | results_df = pd.DataFrame(results) 59 | cross = pd.crosstab(results_df.K, results_df['size'], 60 | results_df['silhouette'], aggfunc=lambda x: x) 61 | 62 | -------------------------------------------------------------------------------- /Chapter13/ensemble_cluster_tsne.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import openensembles as oe 4 | import pandas as pd 5 | 6 | 7 | from sklearn import metrics 8 | from sklearn.manifold import t_sne 9 | 10 | 11 | # Load the datasets 12 | data = pd.read_csv('WHR.csv') 13 | regs = pd.read_csv('Regions.csv') 14 | 15 | # Use the 2017 data and fill any NaNs 16 | recents = data[data.Year == 2017] 17 | recents = recents.dropna(axis=1, how="all") 18 | recents = recents.fillna(recents.median()) 19 | 20 | 21 | # Use only these specific features 22 | columns = ['Log GDP per capita', 23 | 'Social support', 'Healthy life expectancy at birth', 24 | 'Freedom to make life choices', 'Generosity', 25 | 'Perceptions of corruption','Positive affect', 'Negative affect', 26 | 'Confidence in national government', 'Democratic Quality', 27 | 'Delivery Quality'] 28 | 29 | # Transform the data with TSNE 30 | tsne = t_sne.TSNE() 31 | transformed = pd.DataFrame(tsne.fit_transform(recents[columns])) 32 | # Create the data object 33 | cluster_data = oe.data(transformed, [0, 1]) 34 | 35 | 36 | np.random.seed(123456) 37 | results = {'K':[], 'size':[], 'silhouette': []} 38 | # Test different ensemble setups 39 | Ks = [2, 4, 6, 8, 10, 12, 14] 40 | sizes = [5, 10, 20, 50] 41 | for K in Ks: 42 | for ensemble_size in sizes: 43 | ensemble = oe.cluster(cluster_data) 44 | for i in range(ensemble_size): 45 | name = f'kmeans_{ensemble_size}_{i}' 46 | ensemble.cluster('parent', 'kmeans', name, K) 47 | 48 | preds = ensemble.finish_co_occ_linkage(threshold=0.5) 49 | print(f'K: {K}, size {ensemble_size}:', end=' ') 50 | silhouette = metrics.silhouette_score(recents[columns], 51 | preds.labels['co_occ_linkage']) 52 | print('%.2f' % silhouette) 53 | results['K'].append(K) 54 | results['size'].append(ensemble_size) 55 | results['silhouette'].append(silhouette) 56 | 57 | results_df = pd.DataFrame(results) 58 | cross = pd.crosstab(results_df.K, results_df['size'], 59 | results_df['silhouette'], aggfunc=lambda x: x) 60 | 61 | 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /Chapter13/exploratory.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import pandas as pd 3 | 4 | from matplotlib import cm 5 | 6 | 7 | data = pd.read_csv('WHR.csv') 8 | regs = pd.read_csv('Regions.csv') 9 | 10 | def find_region(country): 11 | if country in list(regs['Country name'].values): 12 | return regs[regs['Country name']==country].Region.values[-1] 13 | return 'None' 14 | 15 | recents = data[data.Year == 2018] 16 | recents = recents.dropna(axis=1, how="all") 17 | recents = recents.fillna(recents.median()) 18 | recents['Region'] = recents['Country name'].apply(lambda x: find_region(x)) 19 | 20 | 21 | 22 | 23 | cmap = cm.get_cmap('viridis') 24 | recents.groupby('Region')['Country name'].count().plot(kind='pie', labels=None, cmap=cmap, autopct='%1.0f%%', textprops={'color':"w"}) 25 | plt.ylabel('') 26 | 
plt.xticks() 27 | plt.legend(labels = recents.groupby('Region')['Country name'].count().index, bbox_to_anchor=(1, 1.05)) 28 | 29 | 30 | data[['Year', 'Life Ladder']].set_index('Year').boxplot(by='Year', grid=False) 31 | plt.suptitle("") 32 | plt.title('Life Ladder') 33 | plt.xlabel('Year') 34 | 35 | data.groupby('Year')['Life Ladder'].count().plot() 36 | plt.title('Countries per Year') 37 | plt.xlabel('Year') 38 | plt.ylabel('Countries') 39 | 40 | 41 | def create_scatter(col, nc, nr, index): 42 | plt.subplot(nc, nr, index) 43 | render = data.sample(frac=0.3) 44 | plt.scatter(render[col], render['Life Ladder']) 45 | plt.title(str(col)[:20]) 46 | 47 | i = 1 48 | for key in ['Log GDP per capita', 49 | 'Social support', 'Healthy life expectancy at birth', 50 | 'Freedom to make life choices', 'Generosity', 51 | 'Perceptions of corruption','Positive affect', 'Negative affect', 52 | 'Confidence in national government', 'Democratic Quality', 53 | 'Delivery Quality']: 54 | create_scatter(key, 4, 3, i) 55 | i += 1 56 | 57 | 58 | t = data[data['Year']==2005].copy() 59 | countries = list(t['Country name'].values) 60 | filtered = data[data['Country name'].isin(countries)] 61 | 62 | filtered[['Year', 'Life Ladder']].set_index('Year').boxplot(by='Year', grid=False) 63 | plt.suptitle("") 64 | plt.title('Life Ladder - Same Countries') 65 | plt.xlabel('Year') 66 | 67 | from sklearn.manifold import t_sne 68 | 69 | t = t_sne.TSNE() 70 | data = data.fillna(data.median()) 71 | transformed = t.fit_transform(data[['Log GDP per capita', 72 | 'Social support', 'Healthy life expectancy at birth', 73 | 'Freedom to make life choices', 'Generosity', 74 | 'Perceptions of corruption','Positive affect', 'Negative affect', 75 | 'Confidence in national government', 'Democratic Quality', 76 | 'Delivery Quality']].values) 77 | 78 | plt.scatter(transformed[:,0], transformed[:,1], c=data['Life Ladder'].values) 79 | 80 | regions = {x: 0 for x in regs.Region.unique()} 81 | i = 0 82 | for r in regions: 83 | regions[r] = i 84 | i += 1 85 | regions['None'] = i 86 | data['Region'] = data['Country name'].apply(lambda x: find_region(x)) 87 | plt.scatter(transformed[:,0], transformed[:,1], c=data['Region'].apply(lambda x: regions[x]).values) 88 | -------------------------------------------------------------------------------- /Chapter13/insights.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import openensembles as oe 4 | import pandas as pd 5 | 6 | from sklearn import metrics 7 | from sklearn.manifold import t_sne 8 | 9 | np.random.seed(123456) 10 | 11 | # Load the datasets 12 | data = pd.read_csv('WHR.csv') 13 | regs = pd.read_csv('Regions.csv') 14 | 15 | # Use the 2017 data and fill any NaNs 16 | recents = data[data.Year == 2017] 17 | recents = recents.dropna(axis=1, how="all") 18 | recents = recents.fillna(recents.median()) 19 | 20 | 21 | # Use only these specific features 22 | columns = ['Log GDP per capita', 23 | 'Social support', 'Healthy life expectancy at birth', 24 | 'Freedom to make life choices', 'Generosity', 25 | 'Perceptions of corruption','Positive affect', 'Negative affect', 26 | 'Confidence in national government', 'Democratic Quality', 27 | 'Delivery Quality'] 28 | 29 | 30 | # Transform the data with TSNE 31 | tsne = t_sne.TSNE() 32 | transformed = pd.DataFrame(tsne.fit_transform(recents[columns])) 33 | # Create the data object 34 | cluster_data = oe.data(transformed, [0, 1]) 35 | 36 | # Create the ensemble 37 | ensemble = oe.cluster(cluster_data) 38 | for i in range(20): 39 | name = 
f'kmeans({i}-tsne' 40 | ensemble.cluster('parent', 'kmeans', name, 10) 41 | 42 | # Create the cluster labels 43 | preds = ensemble.finish_co_occ_linkage(threshold=0.5) 44 | 45 | 46 | # Add Life Ladder to columns 47 | columns = ['Life Ladder', 'Log GDP per capita', 48 | 'Social support', 'Healthy life expectancy at birth', 49 | 'Freedom to make life choices', 'Generosity', 50 | 'Perceptions of corruption','Positive affect', 'Negative affect', 51 | 'Confidence in national government', 'Democratic Quality', 52 | 'Delivery Quality'] 53 | # Add the cluster to the dataframe and group by the cluster 54 | recents['Cluster'] = preds.labels['co_occ_linkage'] 55 | grouped = recents.groupby('Cluster') 56 | # Get the means 57 | means = grouped.mean()[columns] 58 | 59 | # Create barplots 60 | def create_bar(col, nc, nr, index): 61 | plt.subplot(nc, nr, index) 62 | values = means.sort_values('Life Ladder')[col] 63 | mn = min(values) * 0.98 64 | mx = max(values) * 1.02 65 | values.plot(kind='bar', ylim=[mn, mx]) 66 | plt.title(col[:18]) 67 | 68 | # Plot for each feature 69 | plt.figure(1) 70 | i = 1 71 | for col in columns: 72 | create_bar(col, 4, 3, i) 73 | i += 1 74 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # Hands-On Ensemble Learning with Python 5 | 6 | Hands-On Ensemble Learning with Python 7 | 8 | This is the code repository for [Hands-On Ensemble Learning with Python](https://www.packtpub.com/data/hands-on-ensemble-learning-with-python), published by Packt. 9 | 10 | **Build highly optimized ensemble machine learning models using scikit-learn and Keras** 11 | 12 | ## What is this book about? 13 | Ensembling is a technique for combining two or more similar or dissimilar machine learning algorithms to create a model that delivers superior predictive power. This book will demonstrate how you can use a variety of weak algorithms to make a strong predictive model. 14 | 15 | With its hands-on approach, you'll not only get up to speed on the basic theory, but also the application of various ensemble learning techniques. 
Using examples and real-world datasets, you'll be able to produce better machine learning models to solve supervised learning problems such as classification and regression. Later in the book, you'll go on to leverage ensemble learning techniques such as clustering to produce unsupervised machine learning models. As you progress, the chapters will cover different machine learning algorithms that are widely used in the practical world to make predictions and classifications. You'll even get to grips with using Python libraries such as scikit-learn and Keras to implement different ensemble models. 16 | 17 | By the end of this book, you will be well versed in ensemble learning and have the skills you need to understand which ensemble method is required for which problem, in order to successfully implement them in real-world scenarios. 18 | 19 | This book covers the following exciting features: 20 | * Implement ensemble methods to generate models with high accuracy 21 | * Overcome challenges such as bias and variance 22 | * Explore machine learning algorithms to evaluate model performance 23 | * Understand how to construct, evaluate, and apply ensemble models 24 | * Analyze tweets in real time using Twitter's streaming API 25 | * Use Keras to build an ensemble of neural networks for the MovieLens dataset 26 | 27 | If you feel this book is for you, get your [copy](https://www.amazon.com/dp/1789612853) today! 28 | 29 | https://www.packtpub.com/ 31 | 32 | ## Instructions and Navigations 33 | All of the code is organized into folders. For example, Chapter03. 34 | 35 | The code will look like the following: 36 | ``` 37 | # Accuracy of hard voting 38 | print('-'*30) 39 | print('Hard Voting:', accuracy_score(y_test, hard_predictions)) 40 | ``` 41 | 42 | **Following is what you need for this book:** 43 | This book is for data analysts, data scientists, machine learning engineers, and other professionals who are looking to generate advanced models using ensemble techniques. Some understanding of machine learning concepts, Python programming and AWS will be beneficial. 44 | 45 | With the following software and hardware list you can run all code files present in the book (Chapter 1-13). 46 | ### Software and Hardware List 47 | | Chapter | Software required | OS required | 48 | | -------- | ------------------------------------ | ----------------------------------- | 49 | | All | Python(Jupyter notebook) | Windows, Mac OS X, and Linux (Any) | 50 | 51 | We also provide a PDF file that has color images of the screenshots/diagrams used in this book. [Click here to download it](https://static.packt-cdn.com/downloads/9781789612851_ColorImages.pdf). 52 | 53 | ### Related products 54 | * Ensemble Machine Learning Cookbook [[Packt]](https://www.packtpub.com/big-data-and-business-intelligence/ensemble-machine-learning-cookbook) [[Amazon]](https://www.amazon.com/dp/1789136601) 55 | 56 | 57 | ## Get to Know the Author 58 | **George Kyriakides** is a Ph.D. researcher, studying distributed neural architecture search. His interests and experience include the automated generation and optimization of predictive models for a wide array of applications, such as image recognition, time series analysis, and financial applications. He holds an M.Sc. in computational methods and applications, and a B.Sc. in applied informatics, both from the University of Macedonia, Thessaloniki, Greece. 59 | 60 | **Konstantinos G. Margaritis** has been a teacher and researcher in computer science for more than 30 years. 
His research interests include parallel and distributed computing, as well as computational intelligence and machine learning. He holds an M.Eng. in electrical engineering (Aristotle University of Thessaloniki, Greece), as well as an M.Sc. and a Ph.D. in computer science (Loughborough University, UK). He is a professor at the Department of Applied Informatics, University of Macedonia, Thessaloniki, Greece. 61 | 62 | 63 | ### Suggestions and Feedback 64 | [Click here](https://docs.google.com/forms/d/e/1FAIpQLSdy7dATC6QmEL81FIUuymZ0Wy9vH1jHkvpY57OiMeKGqib_Ow/viewform) if you have any feedback or suggestions. 65 | 66 | 67 | ### Download a free PDF 68 | 69 | If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost.
Simply click on the link to claim your free PDF.
70 | https://packt.link/free-ebook/9781789612851
--------------------------------------------------------------------------------