├── README.md
└── src
    └── main.py

/README.md:
--------------------------------------------------------------------------------
Tutorial about Over- and Undersampling in Python
================================================

In this tutorial I use Python and the libraries

* scikit-learn
* imbalanced-learn (imported as `imblearn`)

to show you how to do proper over- and undersampling.

Check out http://www.coding-maniac.com for the video and further explanations.
--------------------------------------------------------------------------------
/src/main.py:
--------------------------------------------------------------------------------
from collections import Counter
from imblearn.datasets import fetch_datasets
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
import numpy as np


def print_results(headline, true_value, pred):
    print(headline)
    print("accuracy: {}".format(accuracy_score(true_value, pred)))
    print("precision: {}".format(precision_score(true_value, pred)))
    print("recall: {}".format(recall_score(true_value, pred)))
    print("f1: {}".format(f1_score(true_value, pred)))


# the classifier to use
classifier = RandomForestClassifier

data = fetch_datasets()['wine_quality']

# split the data into training and test set
X_train, X_test, y_train, y_test = train_test_split(data['data'], data['target'], random_state=2)


# build a baseline model without resampling
pipeline = make_pipeline(classifier(random_state=42))
model = pipeline.fit(X_train, y_train)
prediction = model.predict(X_test)

# build a model with SMOTE oversampling; inside the imblearn pipeline the
# resampling is applied to the training data only, never to the test data
smote_pipeline = make_pipeline_imb(SMOTE(random_state=4), classifier(random_state=42))
smote_model = smote_pipeline.fit(X_train, y_train)
smote_prediction = smote_model.predict(X_test)

# build a model with NearMiss undersampling (NearMiss is deterministic,
# so it takes no random_state)
nearmiss_pipeline = make_pipeline_imb(NearMiss(), classifier(random_state=42))
nearmiss_model = nearmiss_pipeline.fit(X_train, y_train)
nearmiss_prediction = nearmiss_model.predict(X_test)


# print the class distribution before and after resampling
# (fit_resample is the current imbalanced-learn API, formerly fit_sample)
print()
print("normal data distribution: {}".format(Counter(data['target'])))
X_smote, y_smote = SMOTE().fit_resample(data['data'], data['target'])
print("SMOTE data distribution: {}".format(Counter(y_smote)))
X_nearmiss, y_nearmiss = NearMiss().fit_resample(data['data'], data['target'])
print("NearMiss data distribution: {}".format(Counter(y_nearmiss)))

# classification reports
print(classification_report(y_test, prediction))
print(classification_report_imbalanced(y_test, smote_prediction))

print()
print('normal Pipeline Score {}'.format(pipeline.score(X_test, y_test)))
print('SMOTE Pipeline Score {}'.format(smote_pipeline.score(X_test, y_test)))
print('NearMiss Pipeline Score {}'.format(nearmiss_pipeline.score(X_test, y_test)))


print()
print_results("normal classification", y_test, prediction)
print()
print_results("SMOTE classification", y_test, smote_prediction)
print()
print_results("NearMiss classification", y_test, nearmiss_prediction)


from sklearn.model_selection import KFold

# cross-validation done right: resample inside the loop, on each training fold only
# (shuffle=True is required for random_state to take effect)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracy = []
precision = []
recall = []
f1 = []
auc = []
for train, test in kf.split(X_train, y_train):
    pipeline = make_pipeline_imb(SMOTE(), classifier(random_state=42))
    model = pipeline.fit(X_train[train], y_train[train])
    prediction = model.predict(X_train[test])

    accuracy.append(pipeline.score(X_train[test], y_train[test]))
    precision.append(precision_score(y_train[test], prediction))
    recall.append(recall_score(y_train[test], prediction))
    f1.append(f1_score(y_train[test], prediction))
    auc.append(roc_auc_score(y_train[test], prediction))

print()
print("done right mean of scores 5-fold:")
print("accuracy: {}".format(np.mean(accuracy)))
print("precision: {}".format(np.mean(precision)))
print("recall: {}".format(np.mean(recall)))
print("f1: {}".format(np.mean(f1)))
print("auc: {}".format(np.mean(auc)))
print()

# cross-validation done wrong: oversampling the whole training set before splitting
# leaks synthetic samples into the validation folds and inflates the scores
kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracy = []
precision = []
recall = []
f1 = []
X, y = SMOTE().fit_resample(X_train, y_train)
for train, test in kf.split(X, y):
    pipeline = make_pipeline(classifier(random_state=42))
    model = pipeline.fit(X[train], y[train])
    prediction = model.predict(X[test])

    accuracy.append(pipeline.score(X[test], y[test]))
    precision.append(precision_score(y[test], prediction))
    recall.append(recall_score(y[test], prediction))
    f1.append(f1_score(y[test], prediction))

print("done wrong mean of scores 5-fold:")
print("accuracy: {}".format(np.mean(accuracy)))
print("precision: {}".format(np.mean(precision)))
print("recall: {}".format(np.mean(recall)))
print("f1: {}".format(np.mean(f1)))
--------------------------------------------------------------------------------
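
As an optional follow-up, not part of the original script, the "done right" evaluation can also be written more compactly by letting scikit-learn's cross_validate drive the folds while the SMOTE step stays inside the imblearn pipeline, so resampling is still fitted on each training fold only. This is a minimal sketch that reuses X_train, y_train, classifier, SMOTE, make_pipeline_imb and np from main.py above; StratifiedKFold is chosen because it keeps the class proportions roughly equal in every fold, which matters on imbalanced data.

# Optional sketch (assumes the names defined in main.py are in scope)
from sklearn.model_selection import StratifiedKFold, cross_validate

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
smote_cv_pipeline = make_pipeline_imb(SMOTE(random_state=42), classifier(random_state=42))
scores = cross_validate(smote_cv_pipeline, X_train, y_train, cv=cv,
                        scoring=["accuracy", "precision", "recall", "f1", "roc_auc"])
for metric in ["accuracy", "precision", "recall", "f1", "roc_auc"]:
    print("{}: {}".format(metric, np.mean(scores["test_" + metric])))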