├── README.md
└── src
    └── main.py

/README.md:
--------------------------------------------------------------------------------
Tutorial about Over- and Undersampling in Python
================================================

In this tutorial I use Python and the libraries

* scikit-learn
* imbalanced-learn (imported as `imblearn`)

to show you how to do proper over- and undersampling.

Check out http://www.coding-maniac.com for the video and further explanations.
--------------------------------------------------------------------------------
/src/main.py:
--------------------------------------------------------------------------------
from collections import Counter
from imblearn.datasets import fetch_datasets
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
import numpy as np


def print_results(headline, true_value, pred):
    print(headline)
    print("accuracy: {}".format(accuracy_score(true_value, pred)))
    print("precision: {}".format(precision_score(true_value, pred)))
    print("recall: {}".format(recall_score(true_value, pred)))
    print("f1: {}".format(f1_score(true_value, pred)))


# the classifier to use
classifier = RandomForestClassifier

data = fetch_datasets()['wine_quality']

# split the data into training and test set
X_train, X_test, y_train, y_test = train_test_split(data['data'], data['target'], random_state=2)


# build a baseline model without resampling
pipeline = make_pipeline(classifier(random_state=42))
model = pipeline.fit(X_train, y_train)
prediction = model.predict(X_test)

# build a model with SMOTE oversampling; inside the imblearn pipeline the
# resampling is applied to the training data only, never to the test data
smote_pipeline = make_pipeline_imb(SMOTE(random_state=4), classifier(random_state=42))
smote_model = smote_pipeline.fit(X_train, y_train)
smote_prediction = smote_model.predict(X_test)

# build a model with NearMiss undersampling (NearMiss is deterministic,
# so it takes no random_state)
nearmiss_pipeline = make_pipeline_imb(NearMiss(), classifier(random_state=42))
nearmiss_model = nearmiss_pipeline.fit(X_train, y_train)
nearmiss_prediction = nearmiss_model.predict(X_test)


# print the class distribution before and after resampling
# (fit_resample is the current imbalanced-learn API, formerly fit_sample)
print()
print("normal data distribution: {}".format(Counter(data['target'])))
X_smote, y_smote = SMOTE().fit_resample(data['data'], data['target'])
print("SMOTE data distribution: {}".format(Counter(y_smote)))
X_nearmiss, y_nearmiss = NearMiss().fit_resample(data['data'], data['target'])
print("NearMiss data distribution: {}".format(Counter(y_nearmiss)))

# classification reports
print(classification_report(y_test, prediction))
print(classification_report_imbalanced(y_test, smote_prediction))

print()
print('normal Pipeline Score {}'.format(pipeline.score(X_test, y_test)))
print('SMOTE Pipeline Score {}'.format(smote_pipeline.score(X_test, y_test)))
print('NearMiss Pipeline Score {}'.format(nearmiss_pipeline.score(X_test, y_test)))


print()
print_results("normal classification", y_test, prediction)
print()
print_results("SMOTE classification", y_test, smote_prediction)
print()
print_results("NearMiss classification", y_test, nearmiss_prediction)


from sklearn.model_selection import KFold

# cross-validation done right: resample inside the loop, on each training fold only
# (shuffle=True is required for random_state to take effect)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracy = []
precision = []
recall = []
f1 = []
auc = []
for train, test in kf.split(X_train, y_train):
    pipeline = make_pipeline_imb(SMOTE(), classifier(random_state=42))
    model = pipeline.fit(X_train[train], y_train[train])
    prediction = model.predict(X_train[test])

    accuracy.append(pipeline.score(X_train[test], y_train[test]))
    precision.append(precision_score(y_train[test], prediction))
    recall.append(recall_score(y_train[test], prediction))
    f1.append(f1_score(y_train[test], prediction))
    auc.append(roc_auc_score(y_train[test], prediction))

print()
print("done right mean of scores 5-fold:")
print("accuracy: {}".format(np.mean(accuracy)))
print("precision: {}".format(np.mean(precision)))
print("recall: {}".format(np.mean(recall)))
print("f1: {}".format(np.mean(f1)))
print("auc: {}".format(np.mean(auc)))
print()

# cross-validation done wrong: oversampling the whole training set before splitting
# leaks synthetic samples into the validation folds and inflates the scores
kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracy = []
precision = []
recall = []
f1 = []
X, y = SMOTE().fit_resample(X_train, y_train)
for train, test in kf.split(X, y):
    pipeline = make_pipeline(classifier(random_state=42))
    model = pipeline.fit(X[train], y[train])
    prediction = model.predict(X[test])

    accuracy.append(pipeline.score(X[test], y[test]))
    precision.append(precision_score(y[test], prediction))
    recall.append(recall_score(y[test], prediction))
    f1.append(f1_score(y[test], prediction))

print("done wrong mean of scores 5-fold:")
print("accuracy: {}".format(np.mean(accuracy)))
print("precision: {}".format(np.mean(precision)))
print("recall: {}".format(np.mean(recall)))
print("f1: {}".format(np.mean(f1)))
--------------------------------------------------------------------------------
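
As an optional follow-up, not part of the original script, the "done right" evaluation can also be written more compactly by letting scikit-learn's cross_validate drive the folds while the SMOTE step stays inside the imblearn pipeline, so resampling is still fitted on each training fold only. This is a minimal sketch that reuses X_train, y_train, classifier, SMOTE, make_pipeline_imb and np from main.py above; StratifiedKFold is chosen because it keeps the class proportions roughly equal in every fold, which matters on imbalanced data.

# Optional sketch (assumes the names defined in main.py are in scope)
from sklearn.model_selection import StratifiedKFold, cross_validate

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
smote_cv_pipeline = make_pipeline_imb(SMOTE(random_state=42), classifier(random_state=42))
scores = cross_validate(smote_cv_pipeline, X_train, y_train, cv=cv,
                        scoring=["accuracy", "precision", "recall", "f1", "roc_auc"])
for metric in ["accuracy", "precision", "recall", "f1", "roc_auc"]:
    print("{}: {}".format(metric, np.mean(scores["test_" + metric])))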