├── .gitattributes
├── Classification
│   ├── .project
│   ├── .pydevproject
│   ├── .settings
│   │   └── org.eclipse.core.resources.prefs
│   └── src
│       └── imbalanced
│           ├── __init__.py
│           ├── comparison.py
│           ├── data
│           │   └── creditcard.csv
│           ├── draw_helper.py
│           ├── files
│           │   ├── alphas_and_gammas.csv
│           │   └── alphas_and_gammas_old.xlsx
│           ├── imgs
│           │   ├── baseline_cm.png
│           │   ├── bias_helped.png
│           │   ├── class_weights_cm.png
│           │   ├── focalloss_cm.png
│           │   ├── resampled_cm.png
│           │   ├── testing_roc_comparison.png
│           │   ├── training_roc_comparison.png
│           │   └── tune_params.jpg
│           ├── load_data_and_model.py
│           └── tune_params_for_focal_loss.py
└── README.md

--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
*.csv filter=lfs diff=lfs merge=lfs -text

--------------------------------------------------------------------------------
/Classification/.project:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
    <name>Classification</name>
    <comment></comment>
    <projects>
    </projects>
    <buildSpec>
        <buildCommand>
            <name>org.python.pydev.PyDevBuilder</name>
            <arguments>
            </arguments>
        </buildCommand>
    </buildSpec>
    <natures>
        <nature>org.python.pydev.pythonNature</nature>
    </natures>
</projectDescription>

--------------------------------------------------------------------------------
/Classification/.pydevproject:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?eclipse-pydev version="1.0"?><pydev_project>
<pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
<path>/${PROJECT_DIR_NAME}/src</path>
</pydev_pathproperty>
<pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 3.0</pydev_property>
<pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Default</pydev_property>
</pydev_project>

--------------------------------------------------------------------------------
/Classification/.settings/org.eclipse.core.resources.prefs:
--------------------------------------------------------------------------------
eclipse.preferences.version=1
encoding//src/imbalanced/comparison.py=utf-8
encoding//src/imbalanced/draw_helper.py=utf-8
encoding//src/imbalanced/load_data_and_model.py=utf-8
encoding//src/imbalanced/tune_params_for_focal_loss.py=utf-8

--------------------------------------------------------------------------------
/Classification/src/imbalanced/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qingyujean/Classification-on-imbalanced-data/ae6c774f7d22aec4fc371fcb091ef59ce1047161/Classification/src/imbalanced/__init__.py

--------------------------------------------------------------------------------
/Classification/src/imbalanced/comparison.py:
--------------------------------------------------------------------------------
#coding=utf-8
'''
Created on 2020-9-20

@author: Yoga
'''

from imbalanced.draw_helper import plot_loss_both_for_train_val, plot_roc, plot_cm
from imbalanced.load_data_and_model import neg, pos, make_model, \
    train_features, train_labels, BATCH_SIZE, EPOCHS, early_stopping, \
    val_features, val_labels, test_features, test_labels, METRICS, total, \
    bool_train_labels
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf


mpl.rcParams['figure.figsize'] = (12, 10)
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']



# Method 1: initialize the output bias with the class prior
initial_bias = np.log([pos/neg])
model = make_model(output_bias=initial_bias)


# The same model is retrained below with several different strategies, so save
# the initial weights once and restore them before each experiment.
initial_weights = model.get_weights()  # bias = np.log([pos/neg])
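
# (Editor's note, an addition to the original script) Why log(pos/neg): with a
# sigmoid output p = 1/(1+e^(-b)), a bias of b = log(pos/neg) gives
# p = pos/(pos+neg), so the untrained model already predicts the empirical
# positive rate instead of 0.5. A quick check that the two quantities agree:
print('prior from bias: {:.6f}  empirical positive rate: {:.6f}'.format(
    1./(1. + np.exp(-initial_bias[0])), pos/total))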


# As a reference, first train WITHOUT the prior bias
model.set_weights(initial_weights)
model.layers[-1].bias.assign([0.0])  # reset the output bias to 0
zero_bias_history = model.fit(
    train_features,
    train_labels,
    batch_size=BATCH_SIZE,
    epochs=20,
    validation_data=(val_features, val_labels),
    verbose=0)


# Then train WITH the prior bias
model.set_weights(initial_weights)  # bias = np.log([pos/neg])
careful_bias_history = model.fit(
    train_features,
    train_labels,
    batch_size=BATCH_SIZE,
    epochs=20,
    validation_data=(val_features, val_labels),
    verbose=0)




plot_loss_both_for_train_val(zero_bias_history, "Zero Bias", 0)
plot_loss_both_for_train_val(careful_bias_history, "Careful Bias", 1)
plt.savefig('./imgs/bias_helped.png')
plt.show()
# The figure shows that the prior bias helps training: the model no longer has
# to spend its first few epochs just learning the base rate.

# From here on, the model with the prior bias is the baseline against which
# the other imbalance-handling methods are compared.
train_predictions_baseline = model.predict(train_features, batch_size=BATCH_SIZE)
test_predictions_baseline = model.predict(test_features, batch_size=BATCH_SIZE)


baseline_results = model.evaluate(test_features, test_labels, batch_size=BATCH_SIZE, verbose=0)
for name, value in zip(model.metrics_names, baseline_results):
    print(name, ': ', value)
print()

plot_cm(test_labels, test_predictions_baseline)
plt.savefig('./imgs/baseline_cm.png')
plt.show()




# Method 2: class weighting
weight_for_0 = (1./neg)*total/2.
weight_for_1 = (1./pos)*total/2.

class_weight = {0: weight_for_0, 1: weight_for_1}

print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))
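
# (Editor's note, an addition) With these weights the two classes contribute
# equally to the total loss, since neg*weight_for_0 == pos*weight_for_1 ==
# total/2; the total/2 factor keeps the loss on roughly the same scale as the
# unweighted case, so the same learning rate remains reasonable.
assert abs(neg*weight_for_0 - pos*weight_for_1) < 1e-3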


model.set_weights(initial_weights)  # bias = np.log([pos/neg])

weighted_history = model.fit(
    train_features,
    train_labels,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=[early_stopping],
    validation_data=(val_features, val_labels),
    # The class weights go here
    class_weight=class_weight)

train_predictions_weighted = model.predict(train_features, batch_size=BATCH_SIZE)
test_predictions_weighted = model.predict(test_features, batch_size=BATCH_SIZE)

weighted_results = model.evaluate(test_features, test_labels, batch_size=BATCH_SIZE, verbose=0)

for name, value in zip(model.metrics_names, weighted_results):
    print(name, ': ', value)
print()

plot_cm(test_labels, test_predictions_weighted)  # plot the confusion matrix
plt.savefig('./imgs/class_weights_cm.png')
plt.show()




# Method 3: oversampling
pos_features = train_features[bool_train_labels]
neg_features = train_features[~bool_train_labels]

pos_labels = train_labels[bool_train_labels]
neg_labels = train_labels[~bool_train_labels]
print('positive examples num : {}'.format(len(pos_labels)))
print('negative examples num : {}'.format(len(neg_labels)))

# Resampling implemented with the tf.data API
BUFFER_SIZE = 100000

def make_ds(features, labels):
    ds = tf.data.Dataset.from_tensor_slices((features, labels))#.cache()
    ds = ds.shuffle(BUFFER_SIZE).repeat()
    return ds

pos_ds = make_ds(pos_features, pos_labels)
neg_ds = make_ds(neg_features, neg_labels)

for features, label in pos_ds.take(1):
    print("Features:\n", features.numpy())
    print()
    print("Label: ", label.numpy())

# Merge the two datasets, drawing from each with probability 0.5; the weights
# argument is what performs the resampling.
resampled_ds = tf.data.experimental.sample_from_datasets([pos_ds, neg_ds], weights=[0.5, 0.5])
resampled_ds = resampled_ds.batch(BATCH_SIZE).prefetch(2)

for features, label in resampled_ds.take(1):
    print(label.numpy().mean())  # fraction of positive labels in the batch; should now be close to 0.5


# The resampled stream is infinite, so define how many steps make up an epoch;
# here an epoch is the number of batches needed to see every negative example
# at least once (the stream is ~50% negative, hence 2*neg samples).
resampled_steps_per_epoch = np.ceil(2.0*neg/BATCH_SIZE)  # ceil rounds up to the nearest integer
print('resampled_steps_per_epoch:', resampled_steps_per_epoch)


# Note: oversampling enlarges the effective dataset, so each epoch naturally takes longer.

model.set_weights(initial_weights)  # bias = np.log([pos/neg])

# Reset the bias to zero, since this dataset is balanced.
output_layer = model.layers[-1]
output_layer.bias.assign([0])

val_ds = tf.data.Dataset.from_tensor_slices((val_features, val_labels)).cache()
val_ds = val_ds.batch(BATCH_SIZE).prefetch(2)  # prefetch 2 batches so the input pipeline overlaps with GPU compute


resampled_history = model.fit(
    resampled_ds,
    epochs=EPOCHS,
    steps_per_epoch=resampled_steps_per_epoch,
    callbacks=[early_stopping],
    validation_data=val_ds)



# Evaluation
train_predictions_resampled = model.predict(train_features, batch_size=BATCH_SIZE)
test_predictions_resampled = model.predict(test_features, batch_size=BATCH_SIZE)

resampled_results = model.evaluate(test_features, test_labels,
                                   batch_size=BATCH_SIZE, verbose=0)
for name, value in zip(model.metrics_names, resampled_results):
    print(name, ': ', value)
print()

plot_cm(test_labels, test_predictions_resampled)  # plot the confusion matrix
plt.savefig('./imgs/resampled_cm.png')
plt.show()




# Method 4: focal loss
# FL(pt) = -αt * (1-pt)^γ * log(pt), with pt = p and αt = α when y = 1, and
# pt = 1-p and αt = 1-α when y = -1 (or y = 0, depending on the label convention)
def focal_loss(alpha=0.5, gamma=1.5, epsilon=1e-6):
    print('*'*20, 'alpha={}, gamma={}'.format(alpha, gamma))
    def focal_loss_calc(y_true, y_probs):
        positive_pt = tf.where(tf.equal(y_true, 1), y_probs, tf.ones_like(y_probs))
        negative_pt = tf.where(tf.equal(y_true, 0), 1-y_probs, tf.ones_like(y_probs))

        loss = -alpha * tf.pow(1-positive_pt, gamma) * tf.math.log(tf.clip_by_value(positive_pt, epsilon, 1.)) - \
            (1-alpha) * tf.pow(1-negative_pt, gamma) * tf.math.log(tf.clip_by_value(negative_pt, epsilon, 1.))

        return tf.reduce_sum(loss)
    return focal_loss_calc
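
# (Editor's sanity check, an addition to the original script) With gamma=0 the
# modulating factor (1-pt)^gamma is 1, so focal loss with alpha=0.5 must equal
# half of the summed binary cross-entropy:
_y_true = tf.constant([[1.], [0.], [1.]])
_y_prob = tf.constant([[0.9], [0.2], [0.6]])
_fl = focal_loss(alpha=0.5, gamma=0.)(_y_true, _y_prob)
_bce = tf.reduce_sum(tf.keras.losses.binary_crossentropy(_y_true, _y_prob))
assert abs(_fl.numpy() - 0.5 * _bce.numpy()) < 1e-4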

best_alpha = 0.3
best_gamma = 2.
model = make_model(loss_func='focal_loss')  # non-None loss_func: make_model skips compiling
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss=focal_loss(alpha=best_alpha, gamma=best_gamma),
    metrics=METRICS,
    run_eagerly=True)  # run eagerly, which makes the custom loss easier to debug
model.set_weights(initial_weights)  # bias = np.log([pos/neg])


focalloss_history = model.fit(
    train_features,
    train_labels,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=[early_stopping],
    validation_data=(val_features, val_labels),
)


# Evaluation
train_predictions_focal = model.predict(train_features, batch_size=BATCH_SIZE)
test_predictions_focal = model.predict(test_features, batch_size=BATCH_SIZE)

focal_results = model.evaluate(test_features, test_labels, batch_size=BATCH_SIZE, verbose=0)

for name, value in zip(model.metrics_names, focal_results):
    print(name, ': ', value)
print()

plot_cm(test_labels, test_predictions_focal)  # plot the confusion matrix
plt.savefig('./imgs/focalloss_cm.png')
plt.show()


# Compare ROC curves
plot_roc("Train Baseline", train_labels, train_predictions_baseline, color=colors[0])
plot_roc("Train Weighted", train_labels, train_predictions_weighted, color=colors[1])
plot_roc("Train Resampled", train_labels, train_predictions_resampled, color=colors[2])
plot_roc("Train Focal_Loss", train_labels, train_predictions_focal, color=colors[3])
plt.legend(loc='lower right')
plt.savefig('./imgs/training_roc_comparison.png')
plt.show()


plot_roc("Test Baseline", test_labels, test_predictions_baseline, color=colors[0], linestyle='--')
plot_roc("Test Weighted", test_labels, test_predictions_weighted, color=colors[1], linestyle='--')
plot_roc("Test Resampled", test_labels, test_predictions_resampled, color=colors[2], linestyle='--')
plot_roc("Test Focal_Loss", test_labels, test_predictions_focal, color=colors[3], linestyle='--')
plt.legend(loc='lower right')
plt.savefig('./imgs/testing_roc_comparison.png')
plt.show()

--------------------------------------------------------------------------------
/Classification/src/imbalanced/data/creditcard.csv:
--------------------------------------------------------------------------------
version https://git-lfs.github.com/spec/v1
oid sha256:76274b691b16a6c49d3f159c883398e03ccd6d1ee12d9d8ee38f4b4b98551a89
size 150828752

--------------------------------------------------------------------------------
/Classification/src/imbalanced/draw_helper.py:
--------------------------------------------------------------------------------
#coding=utf-8
'''
Created on 2020-9-20

@author: Yoga
'''


import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

import sklearn
from sklearn.metrics import confusion_matrix

mpl.rcParams['figure.figsize'] = (12, 10)
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

def plot_loss_both_for_train_val(history, label, n):
    # Use a log scale to show the wide range of values.
    plt.semilogy(history.epoch, history.history['loss'],
                 color=colors[n], label='Train '+label)
    plt.semilogy(history.epoch, history.history['val_loss'],
                 color=colors[n], label='Val '+label, linestyle="--")
    plt.xlabel('Epoch')
    plt.ylabel('Loss')

    plt.legend()



def plot_loss(history, label, loss, n):
    # Use a log scale to show the wide range of values.
    plt.semilogy(history.epoch, history.history[loss],
                 color=colors[n], label=label)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')

    plt.legend()



def plot_cm(labels, predictions, p=0.5):
    cm = confusion_matrix(labels, predictions > p)
    plt.figure(figsize=(5,5))
    sns.heatmap(cm, annot=True, fmt="d")
    plt.title('Confusion matrix @{:.2f}'.format(p))
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')

    print('Legitimate Transactions Detected (True Negatives): ', cm[0][0])
    print('Legitimate Transactions Incorrectly Detected (False Positives): ', cm[0][1])
    print('Fraudulent Transactions Missed (False Negatives): ', cm[1][0])
    print('Fraudulent Transactions Detected (True Positives): ', cm[1][1])
    print('Total Fraudulent Transactions: ', np.sum(cm[1]))  # row sum: all actual fraud cases



def plot_roc(name, labels, predictions, **kwargs):
    fp, tp, _ = sklearn.metrics.roc_curve(labels, predictions)

    plt.plot(100*fp, 100*tp, label=name, linewidth=2, **kwargs)
    plt.xlabel('False positives [%]')
    plt.ylabel('True positives [%]')
    plt.xlim([-0.5,20])
    plt.ylim([80,100.5])
    plt.grid(True)
    ax = plt.gca()
    ax.set_aspect('equal')
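

# (Editor's addition, a small convenience sketch) The ROC curves above pair
# naturally with a scalar AUC summary; labels/predictions as in plot_roc.
def print_auc(name, labels, predictions):
    print('{} AUC: {:.4f}'.format(name, sklearn.metrics.roc_auc_score(labels, predictions)))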
--------------------------------------------------------------------------------
/Classification/src/imbalanced/files/alphas_and_gammas.csv:
--------------------------------------------------------------------------------
version https://git-lfs.github.com/spec/v1
oid sha256:cc1e683aa7d9cb036c10bce7765b1e42189e5ed7069063817f22e69c0f9b4cf2
size 3930

--------------------------------------------------------------------------------
/Classification/src/imbalanced/files/alphas_and_gammas_old.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qingyujean/Classification-on-imbalanced-data/ae6c774f7d22aec4fc371fcb091ef59ce1047161/Classification/src/imbalanced/files/alphas_and_gammas_old.xlsx

--------------------------------------------------------------------------------
/Classification/src/imbalanced/imgs/baseline_cm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qingyujean/Classification-on-imbalanced-data/ae6c774f7d22aec4fc371fcb091ef59ce1047161/Classification/src/imbalanced/imgs/baseline_cm.png

--------------------------------------------------------------------------------
/Classification/src/imbalanced/imgs/bias_helped.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qingyujean/Classification-on-imbalanced-data/ae6c774f7d22aec4fc371fcb091ef59ce1047161/Classification/src/imbalanced/imgs/bias_helped.png

--------------------------------------------------------------------------------
/Classification/src/imbalanced/imgs/class_weights_cm.png:
--------------------------------------------------------------------------------
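https://raw.githubusercontent.com/qingyujean/Classification-on-imbalanced-data/ae6c774f7d22aec4fc371fcb091ef59ce1047161/Classification/src/imbalanced/imgs/class_weights_cm.png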
--------------------------------------------------------------------------------
/Classification/src/imbalanced/imgs/focalloss_cm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qingyujean/Classification-on-imbalanced-data/ae6c774f7d22aec4fc371fcb091ef59ce1047161/Classification/src/imbalanced/imgs/focalloss_cm.png

--------------------------------------------------------------------------------
/Classification/src/imbalanced/imgs/resampled_cm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qingyujean/Classification-on-imbalanced-data/ae6c774f7d22aec4fc371fcb091ef59ce1047161/Classification/src/imbalanced/imgs/resampled_cm.png

--------------------------------------------------------------------------------
/Classification/src/imbalanced/imgs/testing_roc_comparison.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qingyujean/Classification-on-imbalanced-data/ae6c774f7d22aec4fc371fcb091ef59ce1047161/Classification/src/imbalanced/imgs/testing_roc_comparison.png

--------------------------------------------------------------------------------
/Classification/src/imbalanced/imgs/training_roc_comparison.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qingyujean/Classification-on-imbalanced-data/ae6c774f7d22aec4fc371fcb091ef59ce1047161/Classification/src/imbalanced/imgs/training_roc_comparison.png

--------------------------------------------------------------------------------
/Classification/src/imbalanced/imgs/tune_params.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qingyujean/Classification-on-imbalanced-data/ae6c774f7d22aec4fc371fcb091ef59ce1047161/Classification/src/imbalanced/imgs/tune_params.jpg

--------------------------------------------------------------------------------
/Classification/src/imbalanced/load_data_and_model.py:
--------------------------------------------------------------------------------
#coding=utf-8
'''
Created on 2020-9-20

@author: Yoga
'''
import tensorflow as tf
from tensorflow import keras

import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler




os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# Enable memory growth so GPU memory is allocated on demand
gpus = tf.config.experimental.list_physical_devices(device_type='GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices(device_type='GPU')
        print('************************** ', len(gpus), 'Physical GPUs, ', len(logical_gpus), 'Logical GPUs')
    except RuntimeError as e:
        print(e)



# Load the dataset
data_dir = './data/'

raw_df = pd.read_csv(data_dir + 'creditcard.csv')
print(raw_df.head())

neg, pos = np.bincount(raw_df['Class'])  # bincount(): count number of occurrences of each value in array of non-negative ints
total = neg + pos
print('Examples:\n Total: {}\n Positive: {} ({:.2f}% of total)\n'.format(
    total, pos, 100 * pos / total))




# Preprocessing: drop uninformative data and rescale wide-ranging values
# (simple feature engineering)
cleaned_df = raw_df.copy()

# You don't want the `Time` column.
cleaned_df.pop('Time')

# The `Amount` column covers a huge range. Convert to log-space.
eps = 0.001  # 0 => 0.1¢
cleaned_df['Log Ammount'] = np.log(cleaned_df.pop('Amount')+eps)




# Split the dataset
# Use a utility from sklearn to split and shuffle our dataset.
train_df, test_df = train_test_split(cleaned_df, test_size=0.2)
train_df, val_df = train_test_split(train_df, test_size=0.2)

# Form np arrays of labels and features.
train_labels = np.array(train_df.pop('Class'))
val_labels = np.array(val_df.pop('Class'))
test_labels = np.array(test_df.pop('Class'))

bool_train_labels = train_labels != 0

train_features = np.array(train_df)
val_features = np.array(val_df)
test_features = np.array(test_df)
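
# (Editor's note) These splits are neither stratified nor seeded: with
# positives at roughly 0.17% of the data (per the README), the number of fraud
# cases in each split varies from run to run. Passing stratify= the label
# column and a fixed random_state to train_test_split would make runs reproducible.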



# Standardize the features (fit the scaler on the training set only)
scaler = StandardScaler()
train_features = scaler.fit_transform(train_features)

val_features = scaler.transform(val_features)
test_features = scaler.transform(test_features)

train_features = np.clip(train_features, -5, 5)
val_features = np.clip(val_features, -5, 5)  # Why clip? The scaler was fit on the training set only, so transformed val/test values are not guaranteed to fall in the same near-zero range.
test_features = np.clip(test_features, -5, 5)


print('Training labels shape:', train_labels.shape)
print('Validation labels shape:', val_labels.shape)
print('Test labels shape:', test_labels.shape)

print('Training features shape:', train_features.shape)
print('Validation features shape:', val_features.shape)
print('Test features shape:', test_features.shape)




# Build the model
METRICS = [
    keras.metrics.TruePositives(name='tp'),
    keras.metrics.FalsePositives(name='fp'),
    keras.metrics.TrueNegatives(name='tn'),
    keras.metrics.FalseNegatives(name='fn'),
    keras.metrics.BinaryAccuracy(name='accuracy'),
    keras.metrics.Precision(name='precision'),
    keras.metrics.Recall(name='recall'),
    keras.metrics.AUC(name='auc'),
]

def make_model(metrics=METRICS, output_bias=None, loss_func=None):
    if output_bias is not None:
        output_bias = tf.keras.initializers.Constant(output_bias)
    model = keras.Sequential([
        keras.layers.Dense(16, activation='relu', input_shape=(train_features.shape[-1],)),
        keras.layers.Dropout(0.5),
        keras.layers.Dense(1, activation='sigmoid', bias_initializer=output_bias),
    ])

    if loss_func is None:
        model.compile(
            optimizer=keras.optimizers.Adam(learning_rate=1e-3),
            loss=keras.losses.BinaryCrossentropy(),
            metrics=metrics)

    return model
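
# (Editor's note) How make_model is used elsewhere: with the default
# loss_func=None it returns a model compiled with binary cross-entropy; any
# non-None value (e.g. 'focal_loss') skips compilation so the caller can
# compile with a custom loss, as comparison.py and tune_params_for_focal_loss.py do:
#     model = make_model(output_bias=np.log([pos/neg]), loss_func='focal_loss')
#     model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-3),
#                   loss=focal_loss(alpha, gamma), metrics=METRICS)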


EPOCHS = 100
BATCH_SIZE = 2048  # large enough that each batch is likely to contain at least some positive examples

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_auc',
    verbose=1,
    patience=10,
    mode='max',
    restore_best_weights=True)  # checks the monitored metric after each epoch, keeps the best weights seen so far, and restores them when patience runs out or training ends

--------------------------------------------------------------------------------
/Classification/src/imbalanced/tune_params_for_focal_loss.py:
--------------------------------------------------------------------------------
#coding=utf-8
'''
Created on 2020-9-20

@author: Yoga
'''

import numpy as np
import pandas as pd
import tensorflow as tf

from imbalanced.load_data_and_model import neg, pos, make_model, \
    train_features, train_labels, BATCH_SIZE, EPOCHS, early_stopping, \
    val_features, val_labels, test_features, test_labels, METRICS



# Formula: FL(pt) = -αt * (1-pt)^γ * log(pt), with pt = p and αt = α when y = 1,
# and pt = 1-p and αt = 1-α when y = -1 (or y = 0, depending on the label convention).
# Same implementation as in comparison.py, kept local so this script runs standalone.
def focal_loss(alpha=0.5, gamma=1.5, epsilon=1e-6):
    print('*'*20, 'alpha={}, gamma={}'.format(alpha, gamma))
    def focal_loss_calc(y_true, y_probs):
        positive_pt = tf.where(tf.equal(y_true, 1), y_probs, tf.ones_like(y_probs))
        negative_pt = tf.where(tf.equal(y_true, 0), 1-y_probs, tf.ones_like(y_probs))

        loss = -alpha * tf.pow(1-positive_pt, gamma) * tf.math.log(tf.clip_by_value(positive_pt, epsilon, 1.)) - \
            (1-alpha) * tf.pow(1-negative_pt, gamma) * tf.math.log(tf.clip_by_value(negative_pt, epsilon, 1.))

        return tf.reduce_sum(loss)
    return focal_loss_calc




alphas = np.arange(0.1, 0.41, 0.05)  # [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4]
gammas = np.arange(1., 4.1, 0.5)  # [1.0, 1.5, 2., 2.5, 3., 3.5, 4.]


initial_bias = np.log([pos/neg])
model = make_model(output_bias=initial_bias, loss_func='focal_loss')
initial_weights = model.get_weights()  # bias = np.log([pos/neg])

all_results = []

for i in range(len(alphas)):
    for j in range(len(gammas)):

        model.set_weights(initial_weights)  # re-initialize the model

        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
            loss=focal_loss(alpha=alphas[i], gamma=gammas[j]),
            metrics=METRICS,
            run_eagerly=True)  # run eagerly, which makes the custom loss easier to debug

        focalloss_history = model.fit(
            train_features,
            train_labels,
            batch_size=BATCH_SIZE,
            epochs=EPOCHS,
            callbacks=[early_stopping],
            validation_data=(val_features, val_labels)
        )

        # Evaluation
        focal_results = model.evaluate(test_features, test_labels, batch_size=BATCH_SIZE, verbose=0)

        focal_metric_res = {'alpha': alphas[i], 'gamma': gammas[j]}

        for name, value in zip(model.metrics_names, focal_results):
            print(name, ': ', value)
            focal_metric_res[name] = value
        print()

        all_results.append(focal_metric_res)


res_df = pd.DataFrame(all_results)
res_df.to_csv('./files/alphas_and_gammas.csv', sep=',', index=False, encoding='UTF-8')
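
# (Editor's sketch, an addition) One simple way to pick a best setting from
# the saved grid is by test AUC ('auc' is one of the METRICS names); note that
# the README's final choice also weighs the FN/FP counts:
best = res_df.sort_values('auc', ascending=False).iloc[0]
print('best by test AUC: alpha={}, gamma={}, auc={:.4f}'.format(
    best['alpha'], best['gamma'], best['auc']))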

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Classification-on-imbalanced-data

------

A binary classifier for extremely imbalanced data. It experiments with several remedies — `weighted classes`, `oversampling`, and **`focal loss`** — mainly to validate each method and compare their results, judged primarily by the **confusion matrix** and **AUC**. Finally, the focal loss hyperparameters are also briefly tuned.

------

### 1. Code structure

> 1. load_data_and_model.py loads the data and performs basic preprocessing (standardization, log-transforming one column, dropping uninformative columns — simple feature engineering). It then defines the evaluation metrics and the model-building function; training hyperparameters such as the number of epochs and the batch size are also set here. The model is a simple tf2 keras.Sequential network, used only to validate the different imbalance-handling approaches. The model and plotting code follow the official TensorFlow 2 tutorial: [https://tensorflow.google.cn/tutorials/structured_data/imbalanced_data](https://tensorflow.google.cn/tutorials/structured_data/imbalanced_data) ; I added the focal loss implementation and included it in the comparison.
> 2. draw_helper.py contains plotting helper functions;
> 3. comparison.py trains the model with each approach — `adding a prior bias`, `class weighting`, `oversampling`, and `focal loss` — evaluates them, and compares the results;
> 4. tune_params_for_focal_loss.py tunes the focal loss $\alpha$ and $\gamma$: a double loop over a chosen range for each parameter, essentially a grid search.

### 2. Environment

> * python 3.6.9
> * tensorflow 2.4.x
> * GPU: NVIDIA V100 (32G)


### 3. Results

![Comparison after adding the prior bias](./Classification/src/imbalanced/imgs/bias_helped.png)

The figure above shows that the prior bias helps training: the model's first few epochs no longer have to be spent learning the base rate.

The model with the prior bias is then used as the **baseline** against which the other imbalance-handling methods are validated.

![Training ROC comparison across methods](./Classification/src/imbalanced/imgs/training_roc_comparison.png)

![Test ROC comparison across methods](./Classification/src/imbalanced/imgs/testing_roc_comparison.png)

The figures show that every method improves on the baseline.

### 4. Tuning focal loss

Tuning uses tune_params_for_focal_loss.py to test a grid of $\alpha$ and $\gamma$ values. The best combination in the original paper is $\alpha=0.25$ and $\gamma=2.$; in my experiments the best params were $\alpha=0.3$ and $\gamma=2.$. Paper: https://arxiv.org/abs/1708.02002 The implementation being tuned is shown below.
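
For reference, this is the focal loss function defined in comparison.py and tune_params_for_focal_loss.py (its debug print omitted); the toy call at the end is only an editor's illustration of the modulating factor at work:

```python
import tensorflow as tf

def focal_loss(alpha=0.5, gamma=1.5, epsilon=1e-6):
    def focal_loss_calc(y_true, y_probs):
        # pt = p for positive examples and 1-p for negative ones; entries that
        # do not apply are set to 1 so they contribute zero loss.
        positive_pt = tf.where(tf.equal(y_true, 1), y_probs, tf.ones_like(y_probs))
        negative_pt = tf.where(tf.equal(y_true, 0), 1 - y_probs, tf.ones_like(y_probs))
        loss = -alpha * tf.pow(1 - positive_pt, gamma) * tf.math.log(tf.clip_by_value(positive_pt, epsilon, 1.)) \
               - (1 - alpha) * tf.pow(1 - negative_pt, gamma) * tf.math.log(tf.clip_by_value(negative_pt, epsilon, 1.))
        return tf.reduce_sum(loss)
    return focal_loss_calc

# Easy positives (p=0.95) are strongly down-weighted relative to hard ones (p=0.55):
loss_fn = focal_loss(alpha=0.3, gamma=2.)
y_true = tf.constant([[1.], [1.]])
print(loss_fn(y_true, tf.constant([[0.95], [0.95]])).numpy())  # tiny loss
print(loss_fn(y_true, tf.constant([[0.55], [0.55]])).numpy())  # much larger loss
```

The searched grid:
```python
alphas = np.arange(0.1, 0.41, 0.05)  # [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4]
gammas = np.arange(1., 4.1, 0.5)  # [1.0, 1.5, 2., 2.5, 3., 3.5, 4.]
```
![Focal loss tuning results](./Classification/src/imbalanced/imgs/tune_params.jpg)

Overall, $\alpha=0.3$ and $\gamma=2.$ give the best result, because FN, FP, and AUC matter most here. This is a credit-card fraud dataset, where a positive example is a fraudulent transaction: a false negative lets a fraudulent transaction through, causing serious losses, while too many false positives flag normal transactions as fraud, triggering verification and warning emails and hurting the customer experience. With data this imbalanced (positives are only 0.17% of the total), accuracy is no longer a usable metric; the ROC curve is a good one — if one ROC curve encloses another, the enclosing curve corresponds to the better-performing model.

For more on focal loss, see my blog post "Focal Loss: theory, implementation and validation (tensorflow2)": https://blog.csdn.net/u010366748/article/details/108697771
--------------------------------------------------------------------------------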