├── README.md ├── code ├── GBDT_LGB.py ├── GBDT_LGB.pyc ├── MAIN.py ├── XGB_LGB.py └── XGB_LGB.pyc ├── eda ├── boxplot │ ├── V1.png │ ├── V10.png │ ├── V11.png │ ├── V12.png │ ├── V13.png │ ├── V14.png │ ├── V15.png │ ├── V16.png │ ├── V17.png │ ├── V18.png │ ├── V19.png │ ├── V2.png │ ├── V20.png │ ├── V21.png │ ├── V22.png │ ├── V23.png │ ├── V24.png │ ├── V25.png │ ├── V26.png │ ├── V27.png │ ├── V28.png │ ├── V29.png │ ├── V3.png │ ├── V30.png │ ├── V4.png │ ├── V5.png │ ├── V6.png │ ├── V7.png │ ├── V8.png │ ├── V9.png │ └── V_Time.png ├── corr_matrix_new.xlsx ├── corr_plot │ ├── corr_plot.png │ ├── corr_plot_0.png │ └── corr_plot_rainbow.png ├── gbdt_feature_importance.txt └── xgb_feature_importance.txt ├── result └── BestResult │ └── pred_result_combine_two_set_233259_set0_147575_MMK_finished.csv └── 风险识别算法赛-项目说明.pdf /README.md: -------------------------------------------------------------------------------- 1 | # QLM-Tianchi 2 | 天池大数据竞赛 千里马大赛 风险识别与预测赛题 Top5 解决方案 3 | 能够帮助小白快速上手机器学习任务 4 | 欢迎邮件交流 davidkangyz@163.com 5 | -------------------------------------------------------------------------------- /code/GBDT_LGB.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # date: 2018 4 | # author: Kang Yan Zhe 5 | # desc: 千里马 风险识别算法竞赛 6 | 7 | import csv 8 | import pandas as pd 9 | import numpy as np 10 | from scipy import interp 11 | import matplotlib.pyplot as plt 12 | from itertools import cycle 13 | from sklearn.feature_selection import SelectFromModel 14 | from sklearn.model_selection import StratifiedKFold 15 | from sklearn.ensemble import GradientBoostingClassifier 16 | from sklearn.metrics import roc_curve, auc, f1_score 17 | from sklearn.externals import joblib 18 | from lightgbm import LGBMClassifier 19 | 20 | 21 | def gbdt_feature_selection(fe_name, matrix_x_temp, label_y, th): 22 | # SelectfromModel 23 | clf = GradientBoostingClassifier(n_estimators=50, random_state=100) 24 | clf.fit(matrix_x_temp, label_y) 25 | sfm = SelectFromModel(clf, prefit=True, threshold=th) 26 | matrix_x = sfm.transform(matrix_x_temp) 27 | 28 | # 打印出有多少特征重要性非零的特征 29 | feature_score_dict = {} 30 | for fn, s in zip(fe_name, clf.feature_importances_): 31 | feature_score_dict[fn] = s 32 | m = 0 33 | for k in feature_score_dict: 34 | if feature_score_dict[k] == 0.0: 35 | m += 1 36 | print 'number of not-zero features:' + str(len(feature_score_dict) - m) 37 | 38 | # 打印出特征重要性 39 | feature_score_dict_sorted = sorted(feature_score_dict.items(), 40 | key=lambda d: d[1], reverse=True) 41 | print 'feature_importance:' 42 | for ii in range(len(feature_score_dict_sorted)): 43 | print feature_score_dict_sorted[ii][0], feature_score_dict_sorted[ii][1] 44 | print '\n' 45 | 46 | f = open('../eda/gbdt_feature_importance.txt', 'w') 47 | f.write('Rank\tFeature Name\tFeature Importance\n') 48 | for i in range(len(feature_score_dict_sorted)): 49 | f.write(str(i) + '\t' + str(feature_score_dict_sorted[i][0]) + '\t' + str(feature_score_dict_sorted[i][1]) + '\n') 50 | f.close() 51 | 52 | # 打印具体使用了哪些字段 53 | how_long = matrix_x.shape[1] # matrix_x 是 特征选择后的 输入矩阵 54 | feature_used_dict_temp = feature_score_dict_sorted[:how_long] 55 | feature_used_name = [] 56 | for ii in range(len(feature_used_dict_temp)): 57 | feature_used_name.append(feature_used_dict_temp[ii][0]) 58 | print 'feature_chooesed:' 59 | for ii in range(len(feature_used_name)): 60 | print feature_used_name[ii] 61 | print '\n' 62 | 63 | f = open('../eda/gbdt_feature_chose.txt', 'w') 64 | f.write('Feature Chose Name :\n') 65 | for i in range(len(feature_used_name)): 66 | f.write(str(feature_used_name[i]) + '\n') 67 | f.close() 68 | 69 | # 找到未被使用的字段名 70 | feature_not_used_name = [] 71 | for i in range(len(fe_name)): 72 | if fe_name[i] not in feature_used_name: 73 | feature_not_used_name.append(fe_name[i]) 74 | 75 | # 生成一个染色体(诸如01011100这样的) 76 | chromosome_temp = '' 77 | feature_name_ivar = fe_name[:-1] 78 | for ii in range(len(feature_name_ivar)): 79 | if feature_name_ivar[ii] in feature_used_name: 80 | chromosome_temp += '1' 81 | else: 82 | chromosome_temp += '0' 83 | print 'Chromosome:' 84 | print chromosome_temp 85 | joblib.dump(chromosome_temp, '../config/chromosome.pkl') 86 | print '\n' 87 | return matrix_x, feature_not_used_name, len(feature_used_name) 88 | 89 | 90 | def data_test_feature_drop(data_test, feature_name_drop): 91 | # print feature_name_drop 92 | for col in feature_name_drop: 93 | data_test.drop(col, axis=1, inplace=True) 94 | print "data_test_shape:" 95 | print data_test.shape 96 | return data_test.as_matrix() 97 | 98 | 99 | def write_predict_results_to_csv(csv_name, uid, prob_list): 100 | 101 | csv_file = file(csv_name, 'wb') 102 | writer = csv.writer(csv_file) 103 | combined_list = [['ID', 'pred']] 104 | if len(uid) == len(prob_list): 105 | for i in range(len(uid)): 106 | combined_list.append([str(uid[i]), str(prob_list[i])]) 107 | writer.writerows(combined_list) 108 | csv_file.close() 109 | else: 110 | print 'no和pred的个数不一致' 111 | 112 | 113 | def gbdt_lgb_cv_modeling(): 114 | """ 115 | 116 | :return: 117 | """ 118 | 119 | '''Data input''' 120 | data_train = pd.read_csv('../data/train.csv', index_col='ID') 121 | data_predict = pd.read_csv('../data/pred.csv', index_col='ID') 122 | 123 | '''train 特征工程''' 124 | data_train_without_label = data_train.drop('Label', axis=1) 125 | # del data_train_without_label['V17'] 126 | # data_train_without_label['V14×V17'] = data_train_without_label['V14'] * data_train_without_label['V17'] 127 | # data_train_without_label['V14×V4'] = data_train_without_label['V14'] * data_train_without_label['V4'] 128 | # data_train_without_label['V14×V20'] = data_train_without_label['V14'] * data_train_without_label['V20'] 129 | # data_train_without_label['V14×V7'] = data_train_without_label['V14']*data_train_without_label['V7'] 130 | # data_train_without_label['V14×V10'] = data_train_without_label['V14'] * data_train_without_label['V10'] 131 | # 132 | # data_train_without_label['V17×V4'] = data_train_without_label['V17'] * data_train_without_label['V4'] 133 | # data_train_without_label['V17×V20'] = data_train_without_label['V17'] * data_train_without_label['V20'] 134 | # data_train_without_label['V17×V7'] = data_train_without_label['V17'] * data_train_without_label['V7'] 135 | # data_train_without_label['V17×V10'] = data_train_without_label['V17'] * data_train_without_label['V10'] 136 | # 137 | # data_train_without_label['V4×V20'] = data_train_without_label['V4'] * data_train_without_label['V20'] 138 | # data_train_without_label['V4×V7'] = data_train_without_label['V4'] * data_train_without_label['V7'] 139 | # data_train_without_label['V4×V10'] = data_train_without_label['V4'] * data_train_without_label['V10'] 140 | # 141 | # data_train_without_label['V20×V7'] = data_train_without_label['V20'] * data_train_without_label['V7'] 142 | # data_train_without_label['V20×V10'] = data_train_without_label['V20'] * data_train_without_label['V10'] 143 | # 144 | # data_train_without_label['V7×V10'] = data_train_without_label['V7'] * data_train_without_label['V10'] 145 | 146 | feature_name = list(data_train_without_label.columns.values) 147 | data_predict_user_id = list(data_predict.index.values) 148 | 149 | '''缺失值填充''' 150 | frames = [data_train_without_label, data_predict] 151 | data_all = pd.concat(frames) 152 | data_train_filled = data_train_without_label.fillna(value=data_all.median()) 153 | 154 | '''构造训练集和测试集''' 155 | x_temp = data_train_filled.iloc[:, :].as_matrix() # 自变量 156 | y = data_train.iloc[:, -1].as_matrix() # 因变量 157 | 158 | '''Feature selection''' 159 | X, dropped_feature_name, len_feature_choose = gbdt_feature_selection(feature_name, x_temp, y, '0.0005*mean') 160 | # 0.1*mean可以选出10个特征 161 | # 0.00001*mean可以选出14个特征 162 | 163 | '''处理 验证集 B_test''' 164 | # del data_predict['V17'] 165 | 166 | data_predict_filled = data_predict.fillna(value=data_all.median()) 167 | data_predict_filled_after_feature_selection = data_test_feature_drop(data_predict_filled, dropped_feature_name) 168 | 169 | '''Split train/test data sets''' 170 | cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0) # 分层抽样 cv的意思是cross-validation 171 | 172 | '''Choose a classification model''' 173 | parameter_n_estimators = 100 174 | classifier = LGBMClassifier(n_estimators=parameter_n_estimators, learning_rate=0.1) 175 | 176 | '''Model fit, predict and ROC''' 177 | colors = cycle(['cyan', 'indigo', 'seagreen', 'orange', 'blue']) 178 | lw = 2 179 | mean_f1 = 0.0 180 | mean_tpr = 0.0 181 | mean_fpr = np.linspace(0, 1, 500) 182 | i_of_roc = 0 183 | a = 0 184 | 185 | th = 0.3 186 | 187 | for (train_indice, test_indice), color in zip(cv.split(X, y), colors): 188 | a_model = classifier.fit(X[train_indice], y[train_indice]) 189 | 190 | # y_predict_label = a_model.predict(X[test_indice]) 191 | 192 | probas_ = a_model.predict_proba(X[test_indice]) 193 | 194 | fpr, tpr, thresholds = roc_curve(y[test_indice], probas_[:, 1]) 195 | 196 | a += 1 # 序号加1 197 | 198 | mean_tpr += interp(mean_fpr, fpr, tpr) 199 | mean_tpr[0] = 0.0 200 | 201 | roc_auc = auc(fpr, tpr) 202 | plt.plot(fpr, tpr, lw=lw, color=color, label='ROC fold %d (area = %0.4f)' % (i_of_roc, roc_auc)) 203 | i_of_roc += 1 204 | 205 | label_transformed = probas_[:, 1] 206 | for i in range(len(label_transformed)): 207 | if label_transformed[i] > th: 208 | label_transformed[i] = 1 209 | else: 210 | label_transformed[i] = 0 211 | lt = label_transformed.astype('int32') 212 | f1 = f1_score(y[test_indice], lt) 213 | mean_f1 += f1 # 0.7739 214 | 215 | plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k', label='Luck') 216 | 217 | mean_tpr /= cv.get_n_splits(X, y) 218 | mean_tpr[-1] = 1.0 219 | mean_auc = auc(mean_fpr, mean_tpr) 220 | print 'mean_auc=' + str(mean_auc) 221 | print 'mean_f1=' + str(mean_f1/5) 222 | plt.plot(mean_fpr, mean_tpr, color='g', linestyle='--', label='Mean ROC (area = %0.4f)' % mean_auc, lw=lw) 223 | plt.xlim([-0.01, 1.01]) 224 | plt.ylim([-0.01, 1.01]) 225 | plt.xlabel('False Positive Rate mean_f1:'+str(mean_f1)) 226 | plt.ylabel('True Positive Rate') 227 | 228 | plt.title('ROC_gbdt_' + str(len_feature_choose) + '_features_f1_' + str(mean_f1/5)) 229 | plt.legend(loc="lower right") 230 | plt.savefig('../result/pred_ROC_GL' + '_N_' + str(parameter_n_estimators) + '_features_' + str(len_feature_choose) + 231 | '_proba_to_label_using_th_' + str(th) + '.png') 232 | # plt.show() 233 | 234 | a_model = classifier.fit(X, y) 235 | 236 | # label_predict = a_model.predict(data_predict_filled_after_feature_selection) # 对B_test进行预测 237 | proba_predict = a_model.predict_proba(data_predict_filled_after_feature_selection) 238 | 239 | '''写入预测出概率的结果''' 240 | result_file_name = '../result/pred_result_GL_N_' + str(parameter_n_estimators) + '_features_' + str(len_feature_choose) + '_proba.csv' 241 | write_predict_results_to_csv(result_file_name, data_predict_user_id, proba_predict[:, 1].tolist()) 242 | 243 | '''写入要提交的结果''' 244 | label_transformed = proba_predict[:, 1] 245 | sum_of_1 = 0 246 | for i in range(len(label_transformed)): 247 | if label_transformed[i] > th: 248 | label_transformed[i] = 1 249 | sum_of_1 += 1 250 | else: 251 | label_transformed[i] = 0 252 | lt = label_transformed.astype('int32') 253 | result_file_name = '../result/pred_result_GL_N_' + str(parameter_n_estimators) + '_features_' + str(len_feature_choose) + \ 254 | '_proba_to_label_using_th_' + str(th) + '_' + str(sum_of_1) + '.csv' 255 | write_predict_results_to_csv(result_file_name, data_predict_user_id, lt.tolist()) 256 | 257 | -------------------------------------------------------------------------------- /code/GBDT_LGB.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/code/GBDT_LGB.pyc -------------------------------------------------------------------------------- /code/MAIN.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # date: 2018 4 | # author: Kang Yan Zhe 5 | # desc: 千里马 风险识别算法竞赛 6 | 7 | from XGB_LGB import xgb_lgb_cv_modeling 8 | from GBDT_LGB import gbdt_lgb_cv_modeling 9 | 10 | if __name__ == '__main__': 11 | 12 | xgb_lgb_cv_modeling() 13 | 14 | gbdt_lgb_cv_modeling() -------------------------------------------------------------------------------- /code/XGB_LGB.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # date: 2018 4 | # author: Kang Yan Zhe 5 | # desc: 千里马 风险识别算法竞赛 6 | 7 | import csv 8 | import pandas as pd 9 | import numpy as np 10 | from scipy import interp 11 | import matplotlib.pyplot as plt 12 | from itertools import cycle 13 | from sklearn.feature_selection import SelectFromModel 14 | from sklearn.model_selection import StratifiedKFold 15 | from sklearn.metrics import roc_curve, auc, f1_score 16 | from sklearn.externals import joblib 17 | from xgboost import XGBClassifier 18 | from lightgbm import LGBMClassifier 19 | 20 | 21 | def xgb_feature_selection(fe_name, matrix_x_temp, label_y, th): 22 | # SelectfromModel 23 | clf = XGBClassifier(n_estimators=50) 24 | clf.fit(matrix_x_temp, label_y) 25 | sfm = SelectFromModel(clf, prefit=True, threshold=th) 26 | matrix_x = sfm.transform(matrix_x_temp) 27 | 28 | # 打印出有多少特征重要性非零的特征 29 | feature_score_dict = {} 30 | for fn, s in zip(fe_name, clf.feature_importances_): 31 | feature_score_dict[fn] = s 32 | m = 0 33 | for k in feature_score_dict: 34 | if feature_score_dict[k] == 0.0: 35 | m += 1 36 | print 'number of not-zero features:' + str(len(feature_score_dict) - m) 37 | 38 | # 打印出特征重要性 39 | feature_score_dict_sorted = sorted(feature_score_dict.items(), 40 | key=lambda d: d[1], reverse=True) 41 | print 'xgb_feature_importance:' 42 | for ii in range(len(feature_score_dict_sorted)): 43 | print feature_score_dict_sorted[ii][0], feature_score_dict_sorted[ii][1] 44 | print '\n' 45 | 46 | f = open('../eda/xgb_feature_importance.txt', 'w') 47 | f.write('Rank\tFeature Name\tFeature Importance\n') 48 | for i in range(len(feature_score_dict_sorted)): 49 | f.write(str(i) + '\t' + str(feature_score_dict_sorted[i][0]) + '\t' + str(feature_score_dict_sorted[i][1]) + '\n') 50 | f.close() 51 | 52 | # 打印具体使用了哪些字段 53 | how_long = matrix_x.shape[1] # matrix_x 是 特征选择后的 输入矩阵 54 | feature_used_dict_temp = feature_score_dict_sorted[:how_long] 55 | feature_used_name = [] 56 | for ii in range(len(feature_used_dict_temp)): 57 | feature_used_name.append(feature_used_dict_temp[ii][0]) 58 | print 'feature_chooesed:' 59 | for ii in range(len(feature_used_name)): 60 | print feature_used_name[ii] 61 | print '\n' 62 | 63 | f = open('../eda/xgb_feature_chose.txt', 'w') 64 | f.write('Feature Chose Name :\n') 65 | for i in range(len(feature_used_name)): 66 | f.write(str(feature_used_name[i]) + '\n') 67 | f.close() 68 | 69 | # 找到未被使用的字段名 70 | feature_not_used_name = [] 71 | for i in range(len(fe_name)): 72 | if fe_name[i] not in feature_used_name: 73 | feature_not_used_name.append(fe_name[i]) 74 | 75 | # 生成一个染色体(诸如01011100这样的) 76 | chromosome_temp = '' 77 | feature_name_ivar = fe_name[:-1] 78 | for ii in range(len(feature_name_ivar)): 79 | if feature_name_ivar[ii] in feature_used_name: 80 | chromosome_temp += '1' 81 | else: 82 | chromosome_temp += '0' 83 | print 'Chromosome:' 84 | print chromosome_temp 85 | joblib.dump(chromosome_temp, '../config/chromosome.pkl') 86 | print '\n' 87 | return matrix_x, feature_not_used_name[:], len(feature_used_name) 88 | 89 | 90 | def data_test_feature_drop(data_test, feature_name_drop): 91 | # print feature_name_drop 92 | for col in feature_name_drop: 93 | data_test.drop(col, axis=1, inplace=True) 94 | print "data_test_shape:" 95 | print data_test.shape 96 | return data_test.as_matrix() 97 | 98 | 99 | def write_predict_results_to_csv(csv_name, uid, prob_list): 100 | 101 | csv_file = file(csv_name, 'wb') 102 | writer = csv.writer(csv_file) 103 | combined_list = [['ID', 'pred']] 104 | if len(uid) == len(prob_list): 105 | for i in range(len(uid)): 106 | combined_list.append([str(uid[i]), str(prob_list[i])]) 107 | writer.writerows(combined_list) 108 | csv_file.close() 109 | else: 110 | print 'no和pred的个数不一致' 111 | 112 | 113 | def xgb_lgb_cv_modeling(): 114 | """ 115 | 116 | :return: 117 | """ 118 | 119 | '''Data input''' 120 | data_train = pd.read_csv('../data/train.csv', index_col='ID') 121 | data_predict = pd.read_csv('../data/pred.csv', index_col='ID') 122 | 123 | '''train 特征工程''' 124 | data_train_without_label = data_train.drop('Label', axis=1) 125 | # del data_train_without_label['V17'] 126 | 127 | # data_train_without_label['V14×V17'] = data_train_without_label['V14'] * data_train_without_label['V17'] 128 | # data_train_without_label['V14×V4'] = data_train_without_label['V14'] * data_train_without_label['V4'] 129 | # data_train_without_label['V14×V20'] = data_train_without_label['V14'] * data_train_without_label['V20'] 130 | # data_train_without_label['V14×V7'] = data_train_without_label['V14']*data_train_without_label['V7'] 131 | # data_train_without_label['V14×V10'] = data_train_without_label['V14'] * data_train_without_label['V10'] 132 | # 133 | # data_train_without_label['V17×V4'] = data_train_without_label['V17'] * data_train_without_label['V4'] 134 | # data_train_without_label['V17×V20'] = data_train_without_label['V17'] * data_train_without_label['V20'] 135 | # data_train_without_label['V17×V7'] = data_train_without_label['V17'] * data_train_without_label['V7'] 136 | # data_train_without_label['V17×V10'] = data_train_without_label['V17'] * data_train_without_label['V10'] 137 | # 138 | # data_train_without_label['V4×V20'] = data_train_without_label['V4'] * data_train_without_label['V20'] 139 | # data_train_without_label['V4×V7'] = data_train_without_label['V4'] * data_train_without_label['V7'] 140 | # data_train_without_label['V4×V10'] = data_train_without_label['V4'] * data_train_without_label['V10'] 141 | # 142 | # data_train_without_label['V20×V7'] = data_train_without_label['V20'] * data_train_without_label['V7'] 143 | # data_train_without_label['V20×V10'] = data_train_without_label['V20'] * data_train_without_label['V10'] 144 | # 145 | # data_train_without_label['V7×V10'] = data_train_without_label['V7'] * data_train_without_label['V10'] 146 | 147 | # 无缺失值,不需要填补 148 | 149 | feature_name = list(data_train_without_label.columns.values) 150 | data_predict_user_id = list(data_predict.index.values) 151 | 152 | '''缺失值填充''' 153 | frames = [data_train_without_label, data_predict] 154 | data_all = pd.concat(frames) 155 | data_train_filled = data_train_without_label.fillna(value=data_all.median()) 156 | 157 | '''构造训练集和测试集''' 158 | x_temp = data_train_filled.iloc[:, :].as_matrix() # 自变量 159 | y = data_train.iloc[:, -1].as_matrix() # 因变量 160 | 161 | '''Feature selection''' 162 | X, dropped_feature_name, len_feature_choose = xgb_feature_selection(feature_name, x_temp, y, '0.1*mean') 163 | # 0.1*mean可以选出10个特征 164 | # 0.00001*mean可以选出14个特征 165 | 166 | '''处理 验证集 B_test''' 167 | # del data_predict['V17'] 168 | 169 | data_predict_filled = data_predict.fillna(value=data_all.median()) 170 | data_predict_filled_after_feature_selection = data_test_feature_drop(data_predict_filled, dropped_feature_name) 171 | 172 | '''Split train/test data sets''' 173 | cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0) # 分层抽样 cv的意思是cross-validation 174 | 175 | '''Choose a classification model''' 176 | parameter_n_estimators = 100 177 | classifier = LGBMClassifier(n_estimators=parameter_n_estimators, learning_rate=0.1) 178 | 179 | # param = { 180 | # 'max_depth': 6, 181 | # 'num_leaves': 64, 182 | # 'learning_rate': 0.03, 183 | # 'scale_pos_weight': 1, 184 | # 'num_threads': 40, 185 | # 'objective': 'binary', 186 | # 'bagging_fraction': 0.7, 187 | # 'bagging_freq': 1, 188 | # 'min_sum_hessian_in_leaf': 100 189 | # } 190 | # 191 | # param['is_unbalance'] = 'true' 192 | # param['metric'] = 'auc' 193 | 194 | # (1)num_leaves 195 | # 196 | # LightGBM使用的是leaf - wise的算法,因此在调节树的复杂程度时,使用的是num_leaves而不是max_depth。 197 | # 198 | # 大致换算关系:num_leaves = 2 ^ (max_depth) 199 | # 200 | # (2)样本分布非平衡数据集:可以param[‘is_unbalance’]=’true’ 201 | # 202 | # (3)Bagging参数:bagging_fraction + bagging_freq(必须同时设置)、feature_fraction 203 | # 204 | # (4)min_data_in_leaf、min_sum_hessian_in_leaf 205 | 206 | '''Model fit, predict and ROC''' 207 | colors = cycle(['cyan', 'indigo', 'seagreen', 'orange', 'blue']) 208 | lw = 2 209 | mean_f1 = 0.0 210 | mean_tpr = 0.0 211 | mean_fpr = np.linspace(0, 1, 500) 212 | i_of_roc = 0 213 | a = 0 214 | 215 | th = 0.5 216 | 217 | for (train_indice, test_indice), color in zip(cv.split(X, y), colors): 218 | a_model = classifier.fit(X[train_indice], y[train_indice]) 219 | 220 | # y_predict_label = a_model.predict(X[test_indice]) 221 | 222 | probas_ = a_model.predict_proba(X[test_indice]) 223 | 224 | fpr, tpr, thresholds = roc_curve(y[test_indice], probas_[:, 1]) 225 | 226 | a += 1 227 | 228 | mean_tpr += interp(mean_fpr, fpr, tpr) 229 | mean_tpr[0] = 0.0 230 | 231 | roc_auc = auc(fpr, tpr) 232 | plt.plot(fpr, tpr, lw=lw, color=color, label='ROC fold %d (area = %0.4f)' % (i_of_roc, roc_auc)) 233 | i_of_roc += 1 234 | 235 | label_transformed = probas_[:, 1] 236 | for i in range(len(label_transformed)): 237 | if label_transformed[i] > th: 238 | label_transformed[i] = 1 239 | else: 240 | label_transformed[i] = 0 241 | lt = label_transformed.astype('int32') 242 | f1 = f1_score(y[test_indice], lt) 243 | mean_f1 += f1 244 | 245 | plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k', label='Luck') 246 | 247 | mean_tpr /= cv.get_n_splits(X, y) 248 | mean_tpr[-1] = 1.0 249 | mean_auc = auc(mean_fpr, mean_tpr) 250 | print 'mean_auc=' + str(mean_auc) 251 | print 'mean_f1=' + str(mean_f1/5) 252 | plt.plot(mean_fpr, mean_tpr, color='g', linestyle='--', label='Mean ROC (area = %0.4f)' % mean_auc, lw=lw) 253 | plt.xlim([-0.01, 1.01]) 254 | plt.ylim([-0.01, 1.01]) 255 | plt.xlabel('False Positive Rate mean_f1:'+str(mean_f1)) 256 | plt.ylabel('True Positive Rate') 257 | 258 | plt.title('ROC_gbdt_' + str(len_feature_choose) + '_features_f1_' + str(mean_f1/5)) 259 | plt.legend(loc="lower right") 260 | plt.savefig('../result/pred_ROC_XL' + '_N_' + str(parameter_n_estimators) + '_features_' + str(len_feature_choose) + 261 | '_proba_to_label_using_th_' + str(th) + '.png') 262 | # plt.show() 263 | 264 | a_model = classifier.fit(X, y) 265 | 266 | proba_predict = a_model.predict_proba(data_predict_filled_after_feature_selection) 267 | 268 | '''proba result''' 269 | result_file_name = '../result/pred_result_XL_N_' + str(parameter_n_estimators) + '_features_' + str(len_feature_choose) + '_proba.csv' 270 | write_predict_results_to_csv(result_file_name, data_predict_user_id, proba_predict[:, 1].tolist()) 271 | 272 | '''写入要提交的结果''' 273 | label_transformed = proba_predict[:, 1] 274 | for i in range(len(label_transformed)): 275 | if label_transformed[i] > th: 276 | label_transformed[i] = 1 277 | else: 278 | label_transformed[i] = 0 279 | lt = label_transformed.astype('int32') 280 | result_file_name = '../result/pred_result_XL_N_' + str(parameter_n_estimators) + '_features_' + str(len_feature_choose) + \ 281 | '_proba_to_label_using_th_' + str(th) + '.csv' 282 | write_predict_results_to_csv(result_file_name, data_predict_user_id, lt.tolist()) 283 | 284 | -------------------------------------------------------------------------------- /code/XGB_LGB.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/code/XGB_LGB.pyc -------------------------------------------------------------------------------- /eda/boxplot/V1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V1.png -------------------------------------------------------------------------------- /eda/boxplot/V10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V10.png -------------------------------------------------------------------------------- /eda/boxplot/V11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V11.png -------------------------------------------------------------------------------- /eda/boxplot/V12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V12.png -------------------------------------------------------------------------------- /eda/boxplot/V13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V13.png -------------------------------------------------------------------------------- /eda/boxplot/V14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V14.png -------------------------------------------------------------------------------- /eda/boxplot/V15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V15.png -------------------------------------------------------------------------------- /eda/boxplot/V16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V16.png -------------------------------------------------------------------------------- /eda/boxplot/V17.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V17.png -------------------------------------------------------------------------------- /eda/boxplot/V18.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V18.png -------------------------------------------------------------------------------- /eda/boxplot/V19.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V19.png -------------------------------------------------------------------------------- /eda/boxplot/V2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V2.png -------------------------------------------------------------------------------- /eda/boxplot/V20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V20.png -------------------------------------------------------------------------------- /eda/boxplot/V21.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V21.png -------------------------------------------------------------------------------- /eda/boxplot/V22.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V22.png -------------------------------------------------------------------------------- /eda/boxplot/V23.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V23.png -------------------------------------------------------------------------------- /eda/boxplot/V24.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V24.png -------------------------------------------------------------------------------- /eda/boxplot/V25.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V25.png -------------------------------------------------------------------------------- /eda/boxplot/V26.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V26.png -------------------------------------------------------------------------------- /eda/boxplot/V27.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V27.png -------------------------------------------------------------------------------- /eda/boxplot/V28.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V28.png -------------------------------------------------------------------------------- /eda/boxplot/V29.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V29.png -------------------------------------------------------------------------------- /eda/boxplot/V3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V3.png -------------------------------------------------------------------------------- /eda/boxplot/V30.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V30.png -------------------------------------------------------------------------------- /eda/boxplot/V4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V4.png -------------------------------------------------------------------------------- /eda/boxplot/V5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V5.png -------------------------------------------------------------------------------- /eda/boxplot/V6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V6.png -------------------------------------------------------------------------------- /eda/boxplot/V7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V7.png -------------------------------------------------------------------------------- /eda/boxplot/V8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V8.png -------------------------------------------------------------------------------- /eda/boxplot/V9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V9.png -------------------------------------------------------------------------------- /eda/boxplot/V_Time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V_Time.png -------------------------------------------------------------------------------- /eda/corr_matrix_new.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/corr_matrix_new.xlsx -------------------------------------------------------------------------------- /eda/corr_plot/corr_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/corr_plot/corr_plot.png -------------------------------------------------------------------------------- /eda/corr_plot/corr_plot_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/corr_plot/corr_plot_0.png -------------------------------------------------------------------------------- /eda/corr_plot/corr_plot_rainbow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/corr_plot/corr_plot_rainbow.png -------------------------------------------------------------------------------- /eda/gbdt_feature_importance.txt: -------------------------------------------------------------------------------- 1 | Rank Feature Name Feature Importance 2 | 0 V17 0.715761411234 3 | 1 V27 0.0827720571735 4 | 2 V10 0.0811850721827 5 | 3 V14 0.0768377090785 6 | 4 V7 0.0116218223458 7 | 5 V2 0.00710842393001 8 | 6 V16 0.00562570885128 9 | 7 V11 0.00466191914899 10 | 8 V6 0.00439651734468 11 | 9 V9 0.00324933369048 12 | 10 V21 0.00287736630364 13 | 11 V8 0.00159270515461 14 | 12 V4 0.000705344432263 15 | 13 V18 0.000382178785853 16 | 14 V12 0.000247463477802 17 | 15 V25 0.000247365427351 18 | 16 V22 0.000202470118201 19 | 17 V24 0.000184439077434 20 | 18 V5 0.000129512221324 21 | 19 V26 6.88147257893e-05 22 | 20 V15 5.496971524e-05 23 | 21 V_Time 2.52304558213e-05 24 | 22 V20 2.25784388932e-05 25 | 23 V13 2.03872327681e-05 26 | 24 V19 1.29193264202e-05 27 | 25 V23 6.28012690464e-06 28 | 26 V29 0.0 29 | 27 V28 0.0 30 | 28 V1 0.0 31 | 29 V3 0.0 32 | 30 V30 0.0 33 | -------------------------------------------------------------------------------- /eda/xgb_feature_importance.txt: -------------------------------------------------------------------------------- 1 | Rank Feature Name Feature Importance Train异常值情况 异常值的Label 2 | 0 V14 0.22093 有一个略大异常值 3 | 1 V17 0.127907 异常值差不多 4 | 2 V4 0.119186 有一个略大异常值 5 | 3 V20 0.0901163 有一个过大异常值 6 | 4 V7 0.0843023 有一个过大异常值 7 | 5 V10 0.0726744 pred有一个过大异常值 8 | 6 V21 0.0348837 差不多 9 | 7 V26 0.0348837 差不多 10 | 8 V12 0.0290698 train有一个略大异常值 11 | 9 V23 0.0232558 train有三个略小异常值 12 | 10 V27 0.0232558 13 | 11 V5 0.0232558 train有一个过小异常值 14 | 12 V_Time 0.0203488 完全匹配 15 | 13 V16 0.0145349 train有一个过大异常值 16 | 14 V22 0.0116279 勉强差不多 17 | 15 V28 0.00872093 train有一个过大异常值 18 | 16 V1 0.00872093 差不多 19 | 17 V8 0.00872093 勉强算差不多 20 | 18 V9 0.00872093 pred 有一个过大异常值 21 | 19 V13 0.00872093 train有一个过大异常值 22 | 20 V2 0.00581395 差不多 23 | 21 V15 0.00581395 有一个过大异常值 24 | 以上是被选择的特征 25 | 26 | 27 | 28 | 22 V29 0.00290698 29 | 23 V3 0.00290698 30 | 24 V30 0.00290698 31 | 25 V18 0.00290698 32 | 26 V11 0.00290698 33 | 27 V25 0.0 34 | 28 V24 0.0 35 | 29 V6 0.0 36 | 30 V19 0.0 37 | -------------------------------------------------------------------------------- /风险识别算法赛-项目说明.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/风险识别算法赛-项目说明.pdf --------------------------------------------------------------------------------