├── README.md
├── code
    ├── GBDT_LGB.py
    ├── GBDT_LGB.pyc
    ├── MAIN.py
    ├── XGB_LGB.py
    └── XGB_LGB.pyc
├── eda
    ├── boxplot
    │   ├── V1.png
    │   ├── V10.png
    │   ├── V11.png
    │   ├── V12.png
    │   ├── V13.png
    │   ├── V14.png
    │   ├── V15.png
    │   ├── V16.png
    │   ├── V17.png
    │   ├── V18.png
    │   ├── V19.png
    │   ├── V2.png
    │   ├── V20.png
    │   ├── V21.png
    │   ├── V22.png
    │   ├── V23.png
    │   ├── V24.png
    │   ├── V25.png
    │   ├── V26.png
    │   ├── V27.png
    │   ├── V28.png
    │   ├── V29.png
    │   ├── V3.png
    │   ├── V30.png
    │   ├── V4.png
    │   ├── V5.png
    │   ├── V6.png
    │   ├── V7.png
    │   ├── V8.png
    │   ├── V9.png
    │   └── V_Time.png
    ├── corr_matrix_new.xlsx
    ├── corr_plot
    │   ├── corr_plot.png
    │   ├── corr_plot_0.png
    │   └── corr_plot_rainbow.png
    ├── gbdt_feature_importance.txt
    └── xgb_feature_importance.txt
├── result
    └── BestResult
    │   └── pred_result_combine_two_set_233259_set0_147575_MMK_finished.csv
└── 风险识别算法赛-项目说明.pdf


/README.md:
--------------------------------------------------------------------------------
1 | # QLM-Tianchi
2 | 天池大数据竞赛  千里马大赛  风险识别与预测赛题  Top5  解决方案  
3 | 能够帮助小白快速上手机器学习任务  
4 | 欢迎邮件交流 davidkangyz@163.com
5 | 


--------------------------------------------------------------------------------
/code/GBDT_LGB.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | #  -*- coding: utf-8 -*-
  3 | # date: 2018
  4 | # author: Kang Yan Zhe
  5 | # desc: 千里马 风险识别算法竞赛
  6 | 
  7 | import csv
  8 | import pandas as pd
  9 | import numpy as np
 10 | from scipy import interp
 11 | import matplotlib.pyplot as plt
 12 | from itertools import cycle
 13 | from sklearn.feature_selection import SelectFromModel
 14 | from sklearn.model_selection import StratifiedKFold
 15 | from sklearn.ensemble import GradientBoostingClassifier
 16 | from sklearn.metrics import roc_curve, auc, f1_score
 17 | from sklearn.externals import joblib
 18 | from lightgbm import LGBMClassifier
 19 | 
 20 | 
 21 | def gbdt_feature_selection(fe_name, matrix_x_temp, label_y, th):
 22 |     # SelectfromModel
 23 |     clf = GradientBoostingClassifier(n_estimators=50, random_state=100)
 24 |     clf.fit(matrix_x_temp, label_y)
 25 |     sfm = SelectFromModel(clf, prefit=True, threshold=th)
 26 |     matrix_x = sfm.transform(matrix_x_temp)
 27 | 
 28 |     # 打印出有多少特征重要性非零的特征
 29 |     feature_score_dict = {}
 30 |     for fn, s in zip(fe_name, clf.feature_importances_):
 31 |         feature_score_dict[fn] = s
 32 |     m = 0
 33 |     for k in feature_score_dict:
 34 |         if feature_score_dict[k] == 0.0:
 35 |             m += 1
 36 |     print 'number of not-zero features:' + str(len(feature_score_dict) - m)
 37 | 
 38 |     # 打印出特征重要性
 39 |     feature_score_dict_sorted = sorted(feature_score_dict.items(),
 40 |                                        key=lambda d: d[1], reverse=True)
 41 |     print 'feature_importance:'
 42 |     for ii in range(len(feature_score_dict_sorted)):
 43 |         print feature_score_dict_sorted[ii][0], feature_score_dict_sorted[ii][1]
 44 |     print '\n'
 45 | 
 46 |     f = open('../eda/gbdt_feature_importance.txt', 'w')
 47 |     f.write('Rank\tFeature Name\tFeature Importance\n')
 48 |     for i in range(len(feature_score_dict_sorted)):
 49 |         f.write(str(i) + '\t' + str(feature_score_dict_sorted[i][0]) + '\t' + str(feature_score_dict_sorted[i][1]) + '\n')
 50 |     f.close()
 51 | 
 52 |     # 打印具体使用了哪些字段
 53 |     how_long = matrix_x.shape[1]  # matrix_x 是 特征选择后的 输入矩阵
 54 |     feature_used_dict_temp = feature_score_dict_sorted[:how_long]
 55 |     feature_used_name = []
 56 |     for ii in range(len(feature_used_dict_temp)):
 57 |         feature_used_name.append(feature_used_dict_temp[ii][0])
 58 |     print 'feature_chooesed:'
 59 |     for ii in range(len(feature_used_name)):
 60 |         print feature_used_name[ii]
 61 |     print '\n'
 62 | 
 63 |     f = open('../eda/gbdt_feature_chose.txt', 'w')
 64 |     f.write('Feature Chose Name :\n')
 65 |     for i in range(len(feature_used_name)):
 66 |         f.write(str(feature_used_name[i]) + '\n')
 67 |     f.close()
 68 | 
 69 |     # 找到未被使用的字段名
 70 |     feature_not_used_name = []
 71 |     for i in range(len(fe_name)):
 72 |         if fe_name[i] not in feature_used_name:
 73 |             feature_not_used_name.append(fe_name[i])
 74 | 
 75 |     # 生成一个染色体（诸如01011100这样的）
 76 |     chromosome_temp = ''
 77 |     feature_name_ivar = fe_name[:-1]
 78 |     for ii in range(len(feature_name_ivar)):
 79 |         if feature_name_ivar[ii] in feature_used_name:
 80 |             chromosome_temp += '1'
 81 |         else:
 82 |             chromosome_temp += '0'
 83 |     print 'Chromosome:'
 84 |     print chromosome_temp
 85 |     joblib.dump(chromosome_temp, '../config/chromosome.pkl')
 86 |     print '\n'
 87 |     return matrix_x, feature_not_used_name, len(feature_used_name)
 88 | 
 89 | 
 90 | def data_test_feature_drop(data_test, feature_name_drop):
 91 |     # print feature_name_drop
 92 |     for col in feature_name_drop:
 93 |         data_test.drop(col, axis=1, inplace=True)
 94 |     print "data_test_shape:"
 95 |     print data_test.shape
 96 |     return data_test.as_matrix()
 97 | 
 98 | 
 99 | def write_predict_results_to_csv(csv_name, uid, prob_list):
100 | 
101 |     csv_file = file(csv_name, 'wb')
102 |     writer = csv.writer(csv_file)
103 |     combined_list = [['ID', 'pred']]
104 |     if len(uid) == len(prob_list):
105 |         for i in range(len(uid)):
106 |             combined_list.append([str(uid[i]), str(prob_list[i])])
107 |         writer.writerows(combined_list)
108 |         csv_file.close()
109 |     else:
110 |         print 'no和pred的个数不一致'
111 | 
112 | 
113 | def gbdt_lgb_cv_modeling():
114 |     """
115 | 
116 |     :return:
117 |     """
118 | 
119 |     '''Data input'''
120 |     data_train = pd.read_csv('../data/train.csv', index_col='ID')
121 |     data_predict = pd.read_csv('../data/pred.csv', index_col='ID')
122 | 
123 |     '''train 特征工程'''
124 |     data_train_without_label = data_train.drop('Label', axis=1)
125 |     # del data_train_without_label['V17']
126 |     # data_train_without_label['V14×V17'] = data_train_without_label['V14'] * data_train_without_label['V17']
127 |     # data_train_without_label['V14×V4'] = data_train_without_label['V14'] * data_train_without_label['V4']
128 |     # data_train_without_label['V14×V20'] = data_train_without_label['V14'] * data_train_without_label['V20']
129 |     # data_train_without_label['V14×V7'] = data_train_without_label['V14']*data_train_without_label['V7']
130 |     # data_train_without_label['V14×V10'] = data_train_without_label['V14'] * data_train_without_label['V10']
131 |     #
132 |     # data_train_without_label['V17×V4'] = data_train_without_label['V17'] * data_train_without_label['V4']
133 |     # data_train_without_label['V17×V20'] = data_train_without_label['V17'] * data_train_without_label['V20']
134 |     # data_train_without_label['V17×V7'] = data_train_without_label['V17'] * data_train_without_label['V7']
135 |     # data_train_without_label['V17×V10'] = data_train_without_label['V17'] * data_train_without_label['V10']
136 |     #
137 |     # data_train_without_label['V4×V20'] = data_train_without_label['V4'] * data_train_without_label['V20']
138 |     # data_train_without_label['V4×V7'] = data_train_without_label['V4'] * data_train_without_label['V7']
139 |     # data_train_without_label['V4×V10'] = data_train_without_label['V4'] * data_train_without_label['V10']
140 |     #
141 |     # data_train_without_label['V20×V7'] = data_train_without_label['V20'] * data_train_without_label['V7']
142 |     # data_train_without_label['V20×V10'] = data_train_without_label['V20'] * data_train_without_label['V10']
143 |     #
144 |     # data_train_without_label['V7×V10'] = data_train_without_label['V7'] * data_train_without_label['V10']
145 | 
146 |     feature_name = list(data_train_without_label.columns.values)
147 |     data_predict_user_id = list(data_predict.index.values)
148 | 
149 |     '''缺失值填充'''
150 |     frames = [data_train_without_label, data_predict]
151 |     data_all = pd.concat(frames)
152 |     data_train_filled = data_train_without_label.fillna(value=data_all.median())
153 | 
154 |     '''构造训练集和测试集'''
155 |     x_temp = data_train_filled.iloc[:, :].as_matrix()  # 自变量
156 |     y = data_train.iloc[:, -1].as_matrix()  # 因变量
157 | 
158 |     '''Feature selection'''
159 |     X, dropped_feature_name, len_feature_choose = gbdt_feature_selection(feature_name, x_temp, y, '0.0005*mean')
160 |     # 0.1*mean可以选出10个特征
161 |     # 0.00001*mean可以选出14个特征
162 | 
163 |     '''处理 验证集 B_test'''
164 |     # del data_predict['V17']
165 | 
166 |     data_predict_filled = data_predict.fillna(value=data_all.median())
167 |     data_predict_filled_after_feature_selection = data_test_feature_drop(data_predict_filled, dropped_feature_name)
168 | 
169 |     '''Split train/test data sets'''
170 |     cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)  # 分层抽样  cv的意思是cross-validation
171 | 
172 |     '''Choose a classification model'''
173 |     parameter_n_estimators = 100
174 |     classifier = LGBMClassifier(n_estimators=parameter_n_estimators, learning_rate=0.1)
175 | 
176 |     '''Model fit, predict and ROC'''
177 |     colors = cycle(['cyan', 'indigo', 'seagreen', 'orange', 'blue'])
178 |     lw = 2
179 |     mean_f1 = 0.0
180 |     mean_tpr = 0.0
181 |     mean_fpr = np.linspace(0, 1, 500)
182 |     i_of_roc = 0
183 |     a = 0
184 | 
185 |     th = 0.3
186 | 
187 |     for (train_indice, test_indice), color in zip(cv.split(X, y), colors):
188 |         a_model = classifier.fit(X[train_indice], y[train_indice])
189 | 
190 |         # y_predict_label = a_model.predict(X[test_indice])
191 | 
192 |         probas_ = a_model.predict_proba(X[test_indice])
193 | 
194 |         fpr, tpr, thresholds = roc_curve(y[test_indice], probas_[:, 1])
195 | 
196 |         a += 1  # 序号加1
197 | 
198 |         mean_tpr += interp(mean_fpr, fpr, tpr)
199 |         mean_tpr[0] = 0.0
200 | 
201 |         roc_auc = auc(fpr, tpr)
202 |         plt.plot(fpr, tpr, lw=lw, color=color, label='ROC fold %d (area = %0.4f)' % (i_of_roc, roc_auc))
203 |         i_of_roc += 1
204 | 
205 |         label_transformed = probas_[:, 1]
206 |         for i in range(len(label_transformed)):
207 |             if label_transformed[i] > th:
208 |                 label_transformed[i] = 1
209 |             else:
210 |                 label_transformed[i] = 0
211 |         lt = label_transformed.astype('int32')
212 |         f1 = f1_score(y[test_indice], lt)
213 |         mean_f1 += f1  # 0.7739
214 | 
215 |     plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k', label='Luck')
216 | 
217 |     mean_tpr /= cv.get_n_splits(X, y)
218 |     mean_tpr[-1] = 1.0
219 |     mean_auc = auc(mean_fpr, mean_tpr)
220 |     print 'mean_auc=' + str(mean_auc)
221 |     print 'mean_f1=' + str(mean_f1/5)
222 |     plt.plot(mean_fpr, mean_tpr, color='g', linestyle='--', label='Mean ROC (area = %0.4f)' % mean_auc, lw=lw)
223 |     plt.xlim([-0.01, 1.01])
224 |     plt.ylim([-0.01, 1.01])
225 |     plt.xlabel('False Positive Rate mean_f1:'+str(mean_f1))
226 |     plt.ylabel('True Positive Rate')
227 | 
228 |     plt.title('ROC_gbdt_' + str(len_feature_choose) + '_features_f1_' + str(mean_f1/5))
229 |     plt.legend(loc="lower right")
230 |     plt.savefig('../result/pred_ROC_GL' + '_N_' + str(parameter_n_estimators) + '_features_' + str(len_feature_choose) +
231 |                 '_proba_to_label_using_th_' + str(th) + '.png')
232 |     # plt.show()
233 | 
234 |     a_model = classifier.fit(X, y)
235 | 
236 |     # label_predict = a_model.predict(data_predict_filled_after_feature_selection)  # 对B_test进行预测
237 |     proba_predict = a_model.predict_proba(data_predict_filled_after_feature_selection)
238 | 
239 |     '''写入预测出概率的结果'''
240 |     result_file_name = '../result/pred_result_GL_N_' + str(parameter_n_estimators) + '_features_' + str(len_feature_choose) + '_proba.csv'
241 |     write_predict_results_to_csv(result_file_name, data_predict_user_id, proba_predict[:, 1].tolist())
242 | 
243 |     '''写入要提交的结果'''
244 |     label_transformed = proba_predict[:, 1]
245 |     sum_of_1 = 0
246 |     for i in range(len(label_transformed)):
247 |         if label_transformed[i] > th:
248 |             label_transformed[i] = 1
249 |             sum_of_1 += 1
250 |         else:
251 |             label_transformed[i] = 0
252 |     lt = label_transformed.astype('int32')
253 |     result_file_name = '../result/pred_result_GL_N_' + str(parameter_n_estimators) + '_features_' + str(len_feature_choose) + \
254 |                        '_proba_to_label_using_th_' + str(th) + '_' + str(sum_of_1) + '.csv'
255 |     write_predict_results_to_csv(result_file_name, data_predict_user_id, lt.tolist())
256 | 
257 | 


--------------------------------------------------------------------------------
/code/GBDT_LGB.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/code/GBDT_LGB.pyc


--------------------------------------------------------------------------------
/code/MAIN.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | #  -*- coding: utf-8 -*-
 3 | # date: 2018
 4 | # author: Kang Yan Zhe
 5 | # desc: 千里马 风险识别算法竞赛
 6 | 
 7 | from XGB_LGB import xgb_lgb_cv_modeling
 8 | from GBDT_LGB import gbdt_lgb_cv_modeling
 9 | 
10 | if __name__ == '__main__':
11 | 
12 |     xgb_lgb_cv_modeling()
13 | 
14 |     gbdt_lgb_cv_modeling()


--------------------------------------------------------------------------------
/code/XGB_LGB.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | #  -*- coding: utf-8 -*-
  3 | # date: 2018
  4 | # author: Kang Yan Zhe
  5 | # desc: 千里马 风险识别算法竞赛
  6 | 
  7 | import csv
  8 | import pandas as pd
  9 | import numpy as np
 10 | from scipy import interp
 11 | import matplotlib.pyplot as plt
 12 | from itertools import cycle
 13 | from sklearn.feature_selection import SelectFromModel
 14 | from sklearn.model_selection import StratifiedKFold
 15 | from sklearn.metrics import roc_curve, auc, f1_score
 16 | from sklearn.externals import joblib
 17 | from xgboost import XGBClassifier
 18 | from lightgbm import LGBMClassifier
 19 | 
 20 | 
 21 | def xgb_feature_selection(fe_name, matrix_x_temp, label_y, th):
 22 |     # SelectfromModel
 23 |     clf = XGBClassifier(n_estimators=50)
 24 |     clf.fit(matrix_x_temp, label_y)
 25 |     sfm = SelectFromModel(clf, prefit=True, threshold=th)
 26 |     matrix_x = sfm.transform(matrix_x_temp)
 27 | 
 28 |     # 打印出有多少特征重要性非零的特征
 29 |     feature_score_dict = {}
 30 |     for fn, s in zip(fe_name, clf.feature_importances_):
 31 |         feature_score_dict[fn] = s
 32 |     m = 0
 33 |     for k in feature_score_dict:
 34 |         if feature_score_dict[k] == 0.0:
 35 |             m += 1
 36 |     print 'number of not-zero features:' + str(len(feature_score_dict) - m)
 37 | 
 38 |     # 打印出特征重要性
 39 |     feature_score_dict_sorted = sorted(feature_score_dict.items(),
 40 |                                        key=lambda d: d[1], reverse=True)
 41 |     print 'xgb_feature_importance:'
 42 |     for ii in range(len(feature_score_dict_sorted)):
 43 |         print feature_score_dict_sorted[ii][0], feature_score_dict_sorted[ii][1]
 44 |     print '\n'
 45 | 
 46 |     f = open('../eda/xgb_feature_importance.txt', 'w')
 47 |     f.write('Rank\tFeature Name\tFeature Importance\n')
 48 |     for i in range(len(feature_score_dict_sorted)):
 49 |         f.write(str(i) + '\t' + str(feature_score_dict_sorted[i][0]) + '\t' + str(feature_score_dict_sorted[i][1]) + '\n')
 50 |     f.close()
 51 | 
 52 |     # 打印具体使用了哪些字段
 53 |     how_long = matrix_x.shape[1]  # matrix_x 是 特征选择后的 输入矩阵
 54 |     feature_used_dict_temp = feature_score_dict_sorted[:how_long]
 55 |     feature_used_name = []
 56 |     for ii in range(len(feature_used_dict_temp)):
 57 |         feature_used_name.append(feature_used_dict_temp[ii][0])
 58 |     print 'feature_chooesed:'
 59 |     for ii in range(len(feature_used_name)):
 60 |         print feature_used_name[ii]
 61 |     print '\n'
 62 | 
 63 |     f = open('../eda/xgb_feature_chose.txt', 'w')
 64 |     f.write('Feature Chose Name :\n')
 65 |     for i in range(len(feature_used_name)):
 66 |         f.write(str(feature_used_name[i]) + '\n')
 67 |     f.close()
 68 | 
 69 |     # 找到未被使用的字段名
 70 |     feature_not_used_name = []
 71 |     for i in range(len(fe_name)):
 72 |         if fe_name[i] not in feature_used_name:
 73 |             feature_not_used_name.append(fe_name[i])
 74 | 
 75 |     # 生成一个染色体（诸如01011100这样的）
 76 |     chromosome_temp = ''
 77 |     feature_name_ivar = fe_name[:-1]
 78 |     for ii in range(len(feature_name_ivar)):
 79 |         if feature_name_ivar[ii] in feature_used_name:
 80 |             chromosome_temp += '1'
 81 |         else:
 82 |             chromosome_temp += '0'
 83 |     print 'Chromosome:'
 84 |     print chromosome_temp
 85 |     joblib.dump(chromosome_temp, '../config/chromosome.pkl')
 86 |     print '\n'
 87 |     return matrix_x, feature_not_used_name[:], len(feature_used_name)
 88 | 
 89 | 
 90 | def data_test_feature_drop(data_test, feature_name_drop):
 91 |     # print feature_name_drop
 92 |     for col in feature_name_drop:
 93 |         data_test.drop(col, axis=1, inplace=True)
 94 |     print "data_test_shape:"
 95 |     print data_test.shape
 96 |     return data_test.as_matrix()
 97 | 
 98 | 
 99 | def write_predict_results_to_csv(csv_name, uid, prob_list):
100 | 
101 |     csv_file = file(csv_name, 'wb')
102 |     writer = csv.writer(csv_file)
103 |     combined_list = [['ID', 'pred']]
104 |     if len(uid) == len(prob_list):
105 |         for i in range(len(uid)):
106 |             combined_list.append([str(uid[i]), str(prob_list[i])])
107 |         writer.writerows(combined_list)
108 |         csv_file.close()
109 |     else:
110 |         print 'no和pred的个数不一致'
111 | 
112 | 
113 | def xgb_lgb_cv_modeling():
114 |     """
115 | 
116 |     :return:
117 |     """
118 | 
119 |     '''Data input'''
120 |     data_train = pd.read_csv('../data/train.csv', index_col='ID')
121 |     data_predict = pd.read_csv('../data/pred.csv', index_col='ID')
122 | 
123 |     '''train 特征工程'''
124 |     data_train_without_label = data_train.drop('Label', axis=1)
125 |     # del data_train_without_label['V17']
126 | 
127 |     # data_train_without_label['V14×V17'] = data_train_without_label['V14'] * data_train_without_label['V17']
128 |     # data_train_without_label['V14×V4'] = data_train_without_label['V14'] * data_train_without_label['V4']
129 |     # data_train_without_label['V14×V20'] = data_train_without_label['V14'] * data_train_without_label['V20']
130 |     # data_train_without_label['V14×V7'] = data_train_without_label['V14']*data_train_without_label['V7']
131 |     # data_train_without_label['V14×V10'] = data_train_without_label['V14'] * data_train_without_label['V10']
132 |     #
133 |     # data_train_without_label['V17×V4'] = data_train_without_label['V17'] * data_train_without_label['V4']
134 |     # data_train_without_label['V17×V20'] = data_train_without_label['V17'] * data_train_without_label['V20']
135 |     # data_train_without_label['V17×V7'] = data_train_without_label['V17'] * data_train_without_label['V7']
136 |     # data_train_without_label['V17×V10'] = data_train_without_label['V17'] * data_train_without_label['V10']
137 |     #
138 |     # data_train_without_label['V4×V20'] = data_train_without_label['V4'] * data_train_without_label['V20']
139 |     # data_train_without_label['V4×V7'] = data_train_without_label['V4'] * data_train_without_label['V7']
140 |     # data_train_without_label['V4×V10'] = data_train_without_label['V4'] * data_train_without_label['V10']
141 |     #
142 |     # data_train_without_label['V20×V7'] = data_train_without_label['V20'] * data_train_without_label['V7']
143 |     # data_train_without_label['V20×V10'] = data_train_without_label['V20'] * data_train_without_label['V10']
144 |     #
145 |     # data_train_without_label['V7×V10'] = data_train_without_label['V7'] * data_train_without_label['V10']
146 | 
147 |     # 无缺失值，不需要填补
148 | 
149 |     feature_name = list(data_train_without_label.columns.values)
150 |     data_predict_user_id = list(data_predict.index.values)
151 | 
152 |     '''缺失值填充'''
153 |     frames = [data_train_without_label, data_predict]
154 |     data_all = pd.concat(frames)
155 |     data_train_filled = data_train_without_label.fillna(value=data_all.median())
156 | 
157 |     '''构造训练集和测试集'''
158 |     x_temp = data_train_filled.iloc[:, :].as_matrix()  # 自变量
159 |     y = data_train.iloc[:, -1].as_matrix()  # 因变量
160 | 
161 |     '''Feature selection'''
162 |     X, dropped_feature_name, len_feature_choose = xgb_feature_selection(feature_name, x_temp, y, '0.1*mean')
163 |     # 0.1*mean可以选出10个特征
164 |     # 0.00001*mean可以选出14个特征
165 | 
166 |     '''处理 验证集 B_test'''
167 |     # del data_predict['V17']
168 | 
169 |     data_predict_filled = data_predict.fillna(value=data_all.median())
170 |     data_predict_filled_after_feature_selection = data_test_feature_drop(data_predict_filled, dropped_feature_name)
171 | 
172 |     '''Split train/test data sets'''
173 |     cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)  # 分层抽样  cv的意思是cross-validation
174 | 
175 |     '''Choose a classification model'''
176 |     parameter_n_estimators = 100
177 |     classifier = LGBMClassifier(n_estimators=parameter_n_estimators, learning_rate=0.1)
178 | 
179 |     # param = {
180 |     #     'max_depth': 6,
181 |     #     'num_leaves': 64,
182 |     #     'learning_rate': 0.03,
183 |     #     'scale_pos_weight': 1,
184 |     #     'num_threads': 40,
185 |     #     'objective': 'binary',
186 |     #     'bagging_fraction': 0.7,
187 |     #     'bagging_freq': 1,
188 |     #     'min_sum_hessian_in_leaf': 100
189 |     # }
190 |     #
191 |     # param['is_unbalance'] = 'true'
192 |     # param['metric'] = 'auc'
193 | 
194 |     # （1）num_leaves
195 |     #
196 |     # LightGBM使用的是leaf - wise的算法，因此在调节树的复杂程度时，使用的是num_leaves而不是max_depth。
197 |     #
198 |     # 大致换算关系：num_leaves = 2 ^ (max_depth)
199 |     #
200 |     # （2）样本分布非平衡数据集：可以param[‘is_unbalance’]=’true’
201 |     #
202 |     # （3）Bagging参数：bagging_fraction + bagging_freq（必须同时设置）、feature_fraction
203 |     #
204 |     # （4）min_data_in_leaf、min_sum_hessian_in_leaf
205 | 
206 |     '''Model fit, predict and ROC'''
207 |     colors = cycle(['cyan', 'indigo', 'seagreen', 'orange', 'blue'])
208 |     lw = 2
209 |     mean_f1 = 0.0
210 |     mean_tpr = 0.0
211 |     mean_fpr = np.linspace(0, 1, 500)
212 |     i_of_roc = 0
213 |     a = 0
214 | 
215 |     th = 0.5
216 | 
217 |     for (train_indice, test_indice), color in zip(cv.split(X, y), colors):
218 |         a_model = classifier.fit(X[train_indice], y[train_indice])
219 | 
220 |         # y_predict_label = a_model.predict(X[test_indice])
221 | 
222 |         probas_ = a_model.predict_proba(X[test_indice])
223 | 
224 |         fpr, tpr, thresholds = roc_curve(y[test_indice], probas_[:, 1])
225 | 
226 |         a += 1
227 | 
228 |         mean_tpr += interp(mean_fpr, fpr, tpr)
229 |         mean_tpr[0] = 0.0
230 | 
231 |         roc_auc = auc(fpr, tpr)
232 |         plt.plot(fpr, tpr, lw=lw, color=color, label='ROC fold %d (area = %0.4f)' % (i_of_roc, roc_auc))
233 |         i_of_roc += 1
234 | 
235 |         label_transformed = probas_[:, 1]
236 |         for i in range(len(label_transformed)):
237 |             if label_transformed[i] > th:
238 |                 label_transformed[i] = 1
239 |             else:
240 |                 label_transformed[i] = 0
241 |         lt = label_transformed.astype('int32')
242 |         f1 = f1_score(y[test_indice], lt)
243 |         mean_f1 += f1
244 | 
245 |     plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k', label='Luck')
246 | 
247 |     mean_tpr /= cv.get_n_splits(X, y)
248 |     mean_tpr[-1] = 1.0
249 |     mean_auc = auc(mean_fpr, mean_tpr)
250 |     print 'mean_auc=' + str(mean_auc)
251 |     print 'mean_f1=' + str(mean_f1/5)
252 |     plt.plot(mean_fpr, mean_tpr, color='g', linestyle='--', label='Mean ROC (area = %0.4f)' % mean_auc, lw=lw)
253 |     plt.xlim([-0.01, 1.01])
254 |     plt.ylim([-0.01, 1.01])
255 |     plt.xlabel('False Positive Rate mean_f1:'+str(mean_f1))
256 |     plt.ylabel('True Positive Rate')
257 | 
258 |     plt.title('ROC_gbdt_' + str(len_feature_choose) + '_features_f1_' + str(mean_f1/5))
259 |     plt.legend(loc="lower right")
260 |     plt.savefig('../result/pred_ROC_XL' + '_N_' + str(parameter_n_estimators) + '_features_' + str(len_feature_choose) +
261 |                 '_proba_to_label_using_th_' + str(th) + '.png')
262 |     # plt.show()
263 | 
264 |     a_model = classifier.fit(X, y)
265 | 
266 |     proba_predict = a_model.predict_proba(data_predict_filled_after_feature_selection)
267 | 
268 |     '''proba result'''
269 |     result_file_name = '../result/pred_result_XL_N_' + str(parameter_n_estimators) + '_features_' + str(len_feature_choose) + '_proba.csv'
270 |     write_predict_results_to_csv(result_file_name, data_predict_user_id, proba_predict[:, 1].tolist())
271 | 
272 |     '''写入要提交的结果'''
273 |     label_transformed = proba_predict[:, 1]
274 |     for i in range(len(label_transformed)):
275 |         if label_transformed[i] > th:
276 |             label_transformed[i] = 1
277 |         else:
278 |             label_transformed[i] = 0
279 |     lt = label_transformed.astype('int32')
280 |     result_file_name = '../result/pred_result_XL_N_' + str(parameter_n_estimators) + '_features_' + str(len_feature_choose) + \
281 |                        '_proba_to_label_using_th_' + str(th) + '.csv'
282 |     write_predict_results_to_csv(result_file_name, data_predict_user_id, lt.tolist())
283 | 
284 | 


--------------------------------------------------------------------------------
/code/XGB_LGB.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/code/XGB_LGB.pyc


--------------------------------------------------------------------------------
/eda/boxplot/V1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V1.png


--------------------------------------------------------------------------------
/eda/boxplot/V10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V10.png


--------------------------------------------------------------------------------
/eda/boxplot/V11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V11.png


--------------------------------------------------------------------------------
/eda/boxplot/V12.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V12.png


--------------------------------------------------------------------------------
/eda/boxplot/V13.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V13.png


--------------------------------------------------------------------------------
/eda/boxplot/V14.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V14.png


--------------------------------------------------------------------------------
/eda/boxplot/V15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V15.png


--------------------------------------------------------------------------------
/eda/boxplot/V16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V16.png


--------------------------------------------------------------------------------
/eda/boxplot/V17.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V17.png


--------------------------------------------------------------------------------
/eda/boxplot/V18.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V18.png


--------------------------------------------------------------------------------
/eda/boxplot/V19.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V19.png


--------------------------------------------------------------------------------
/eda/boxplot/V2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V2.png


--------------------------------------------------------------------------------
/eda/boxplot/V20.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V20.png


--------------------------------------------------------------------------------
/eda/boxplot/V21.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V21.png


--------------------------------------------------------------------------------
/eda/boxplot/V22.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V22.png


--------------------------------------------------------------------------------
/eda/boxplot/V23.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V23.png


--------------------------------------------------------------------------------
/eda/boxplot/V24.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V24.png


--------------------------------------------------------------------------------
/eda/boxplot/V25.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V25.png


--------------------------------------------------------------------------------
/eda/boxplot/V26.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V26.png


--------------------------------------------------------------------------------
/eda/boxplot/V27.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V27.png


--------------------------------------------------------------------------------
/eda/boxplot/V28.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V28.png


--------------------------------------------------------------------------------
/eda/boxplot/V29.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V29.png


--------------------------------------------------------------------------------
/eda/boxplot/V3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V3.png


--------------------------------------------------------------------------------
/eda/boxplot/V30.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V30.png


--------------------------------------------------------------------------------
/eda/boxplot/V4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V4.png


--------------------------------------------------------------------------------
/eda/boxplot/V5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V5.png


--------------------------------------------------------------------------------
/eda/boxplot/V6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V6.png


--------------------------------------------------------------------------------
/eda/boxplot/V7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V7.png


--------------------------------------------------------------------------------
/eda/boxplot/V8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V8.png


--------------------------------------------------------------------------------
/eda/boxplot/V9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V9.png


--------------------------------------------------------------------------------
/eda/boxplot/V_Time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/boxplot/V_Time.png


--------------------------------------------------------------------------------
/eda/corr_matrix_new.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/corr_matrix_new.xlsx


--------------------------------------------------------------------------------
/eda/corr_plot/corr_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/corr_plot/corr_plot.png


--------------------------------------------------------------------------------
/eda/corr_plot/corr_plot_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/corr_plot/corr_plot_0.png


--------------------------------------------------------------------------------
/eda/corr_plot/corr_plot_rainbow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/eda/corr_plot/corr_plot_rainbow.png


--------------------------------------------------------------------------------
/eda/gbdt_feature_importance.txt:
--------------------------------------------------------------------------------
 1 | Rank	Feature Name	Feature Importance
 2 | 0	V17	0.715761411234
 3 | 1	V27	0.0827720571735
 4 | 2	V10	0.0811850721827
 5 | 3	V14	0.0768377090785
 6 | 4	V7	0.0116218223458
 7 | 5	V2	0.00710842393001
 8 | 6	V16	0.00562570885128
 9 | 7	V11	0.00466191914899
10 | 8	V6	0.00439651734468
11 | 9	V9	0.00324933369048
12 | 10	V21	0.00287736630364
13 | 11	V8	0.00159270515461
14 | 12	V4	0.000705344432263
15 | 13	V18	0.000382178785853
16 | 14	V12	0.000247463477802
17 | 15	V25	0.000247365427351
18 | 16	V22	0.000202470118201
19 | 17	V24	0.000184439077434
20 | 18	V5	0.000129512221324
21 | 19	V26	6.88147257893e-05
22 | 20	V15	5.496971524e-05
23 | 21	V_Time	2.52304558213e-05
24 | 22	V20	2.25784388932e-05
25 | 23	V13	2.03872327681e-05
26 | 24	V19	1.29193264202e-05
27 | 25	V23	6.28012690464e-06
28 | 26	V29	0.0
29 | 27	V28	0.0
30 | 28	V1	0.0
31 | 29	V3	0.0
32 | 30	V30	0.0
33 | 


--------------------------------------------------------------------------------
/eda/xgb_feature_importance.txt:
--------------------------------------------------------------------------------
 1 | Rank	Feature Name	Feature Importance	Train异常值情况	异常值的Label
 2 | 0	V14	0.22093		有一个略大异常值
 3 | 1	V17	0.127907	异常值差不多
 4 | 2	V4	0.119186	有一个略大异常值
 5 | 3	V20	0.0901163	有一个过大异常值
 6 | 4	V7	0.0843023	有一个过大异常值
 7 | 5	V10	0.0726744	pred有一个过大异常值
 8 | 6	V21	0.0348837	差不多
 9 | 7	V26	0.0348837	差不多
10 | 8	V12	0.0290698	train有一个略大异常值
11 | 9	V23	0.0232558	train有三个略小异常值
12 | 10	V27	0.0232558	
13 | 11	V5	0.0232558	train有一个过小异常值
14 | 12	V_Time	0.0203488	完全匹配
15 | 13	V16	0.0145349	train有一个过大异常值
16 | 14	V22	0.0116279	勉强差不多
17 | 15	V28	0.00872093	train有一个过大异常值
18 | 16	V1	0.00872093	差不多
19 | 17	V8	0.00872093	勉强算差不多
20 | 18	V9	0.00872093	pred 有一个过大异常值
21 | 19	V13	0.00872093	train有一个过大异常值
22 | 20	V2	0.00581395	差不多
23 | 21	V15	0.00581395	有一个过大异常值
24 | 以上是被选择的特征
25 | 
26 | 
27 | 
28 | 22	V29	0.00290698
29 | 23	V3	0.00290698
30 | 24	V30	0.00290698
31 | 25	V18	0.00290698
32 | 26	V11	0.00290698
33 | 27	V25	0.0
34 | 28	V24	0.0
35 | 29	V6	0.0
36 | 30	V19	0.0
37 | 


--------------------------------------------------------------------------------
/风险识别算法赛-项目说明.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzkang/QLM-Tianchi/ba05e6df3f59b00b084236e0a899f1d56db2f87a/风险识别算法赛-项目说明.pdf


--------------------------------------------------------------------------------