├── README.md
├── XGBoost调参
│   ├── xgboost调参.py
│   └── 调参过程结果截图.docx
├── data.rar
├── 南衣-上海财经大学-基于适度贪婪调参策略下的xgboost集成模型.docx
├── 数据预处理
│   └── 数据选择.py
├── 泛化时的程序
│   ├── test_model1.py
│   ├── test_model2.py
│   ├── test_model3.py
│   ├── test_model4.py
│   ├── test_model5.py
│   ├── test_model6.py
│   ├── test_model7.py
│   ├── test_model8.py
│   └── 泛化时的集成学习.py
└── 训练时的程序
    ├── model1.py
    ├── model2.py
    ├── model3.py
    ├── model4.py
    ├── model5.py
    ├── model6.py
    ├── model7.py
    ├── model8.py
    └── 训练时的集成学习.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# -融360-
Code share for the "rejection inference" task of the 3rd Rong360 Tianji Intelligent Finance Algorithm Challenge -- 4th place in the semi-final
>>An XGBoost ensemble model built on a moderately greedy parameter-tuning strategy


The training set released for this task is fairly large: roughly 100,000 rows, each with 6,745 feature columns, which makes processing slow. The processing time on its own is manageable -- I can simply spend more time on it -- but a dataset this size also runs into hardware limits, so the data first has to be filtered down in scale.

--------------------------------------------------------------------------------
/XGBoost调参/xgboost调参.py:
--------------------------------------------------------------------------------
import pandas as pd
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV  # Performing grid search

df = pd.read_csv('G:\\ml360\\train\\1104.csv')
data0 = df.drop(['id', 'loan_dt', 'tag'], axis=1)

# Row filtering: keep only rows whose missing-value count is under the threshold
data = data0
X = data.isnull().sum(axis=1)  # X holds the missing-value count of each row
x = list()  # x collects the row positions that pass the missing-value threshold
for i in range(len(X)):
    if X.iloc[i] < 600:  # the larger the threshold, the more rows survive: 500 -> 1354, 600 -> 2456, 675 -> 3313, 700 -> 3636
        x.append(i)
data = data.iloc[x]

train = data
target = 'label'

def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param, xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds,
                          metrics='auc',
                          early_stopping_rounds=early_stopping_rounds,
                          show_stdv=False)
        alg.set_params(n_estimators=cvresult.shape[0])
    # Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain[target], eval_metric='auc')
    # Predict on the training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]
    # Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain[target].values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain[target], dtrain_predprob))

# Choose all predictors except the target column
predictors = [x for x in train.columns if x not in [target]]
xgb1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=6,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27)
modelfit(xgb1, train, predictors)
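# Note (illustrative): with early stopping, xgb.cv returns a pandas DataFrame
# holding one row per boosting round that was kept, so cvresult.shape[0] above
# becomes the tuned n_estimators; the cross-validated AUC at that round can be
# read off with:
#   print(cvresult['test-auc-mean'].iloc[-1])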

# First, tune max_depth and min_child_weight together
param_test1 = {
    'max_depth': [3, 4, 5, 6, 7, 8],
    'min_child_weight': [1, 2, 3, 4, 5, 6]
}
gsearch1 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=140,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27),
    param_grid=param_test1,
    scoring='roc_auc',
    n_jobs=4,
    cv=5)
gsearch1.fit(train[predictors], train[target])
print("gsearch1 mean test scores:", gsearch1.cv_results_['mean_test_score'],
      " gsearch1.best_params_:", gsearch1.best_params_,
      " gsearch1.best_score_:", gsearch1.best_score_)
# Best:        max_depth=3, min_child_weight=3, score 0.78699
# Second best: max_depth=7, min_child_weight=4, score 0.78245

# With max_depth=3, min_child_weight=3 fixed, tune gamma
param_test2 = {
    'gamma': [i / 10.0 for i in range(0, 5)]
}
gsearch2 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=200,
        max_depth=3,
        min_child_weight=3,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27),
    param_grid=param_test2,
    scoring='roc_auc',
    n_jobs=4,
    cv=5)
gsearch2.fit(train[predictors], train[target])
print("gsearch2 mean test scores:", gsearch2.cv_results_['mean_test_score'],
      " gsearch2.best_params_:", gsearch2.best_params_,
      " gsearch2.best_score_:", gsearch2.best_score_)
# Result -- best:   gamma=0.2
#        second best: gamma=0.4

# With max_depth=3, min_child_weight=3, gamma=0.2 fixed, tune subsample and colsample_bytree
param_test3 = {
    'subsample': [i / 10.0 for i in range(1, 10, 2)],
    'colsample_bytree': [i / 10.0 for i in range(1, 10, 2)]
}
gsearch3 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=200,
        max_depth=3,
        min_child_weight=3,
        gamma=0.2,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27),
    param_grid=param_test3,
    scoring='roc_auc',
    n_jobs=4,
    cv=5)
gsearch3.fit(train[predictors], train[target])
print("gsearch3 mean test scores:", gsearch3.cv_results_['mean_test_score'],
      " gsearch3.best_params_:", gsearch3.best_params_,
      " gsearch3.best_score_:", gsearch3.best_score_)
# Result: subsample=0.9, colsample_bytree=0.5

# Refine the search around the previous optimum
param_test4 = {
    'subsample': [i / 10.0 for i in range(8, 10)],
    'colsample_bytree': [i / 10.0 for i in range(4, 7)]
}
gsearch4 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=200,
        max_depth=3,
        min_child_weight=3,
        gamma=0.2,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27),
    param_grid=param_test4,
    scoring='roc_auc',
    n_jobs=4,
    cv=5)
gsearch4.fit(train[predictors], train[target])
print("gsearch4 mean test scores:", gsearch4.cv_results_['mean_test_score'],
      " gsearch4.best_params_:", gsearch4.best_params_,
      " gsearch4.best_score_:", gsearch4.best_score_)
# Result -- best:   subsample=0.9, colsample_bytree=0.8
#        second best: subsample=0.8, colsample_bytree=0.8
# Parameter group 1: max_depth=3, min_child_weight=3, gamma=0.2, subsample=0.9, colsample_bytree=0.8
# Parameter group 2: max_depth=3, min_child_weight=3, gamma=0.2, subsample=0.8, colsample_bytree=0.8
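# The "moderately greedy" idea behind the branching above: rather than committing
# to the single best setting at each stage, keep the top two candidates and keep
# searching from both. A minimal helper for pulling them out of a fitted
# GridSearchCV (illustrative; top_k_params is our name, not the repo's):
import numpy as np

def top_k_params(gsearch, k=2):
    results = gsearch.cv_results_
    order = np.argsort(results['mean_test_score'])[::-1][:k]
    return [(results['params'][i], results['mean_test_score'][i]) for i in order]

# e.g. top_k_params(gsearch4) returns the two (subsample, colsample_bytree)
# settings that seed parameter groups 1 and 2 above.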

# With max_depth=3, min_child_weight=3, gamma=0.4 fixed, tune subsample and colsample_bytree
param_test5 = {
    'subsample': [i / 10.0 for i in range(1, 10, 2)],
    'colsample_bytree': [i / 10.0 for i in range(1, 10, 2)]
}
gsearch5 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=200,
        max_depth=3,
        min_child_weight=3,
        gamma=0.4,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27),
    param_grid=param_test5,
    scoring='roc_auc',
    n_jobs=4,
    cv=5)
gsearch5.fit(train[predictors], train[target])
print("gsearch5 mean test scores:", gsearch5.cv_results_['mean_test_score'],
      " gsearch5.best_params_:", gsearch5.best_params_,
      " gsearch5.best_score_:", gsearch5.best_score_)
# Result -- best:   subsample=0.9, colsample_bytree=0.5
#        second best: subsample=0.95, colsample_bytree=0.5

# Refine
param_test6 = {
    'subsample': [i / 10.0 for i in range(8, 10)],
    'colsample_bytree': [i / 10.0 for i in range(4, 7)]
}
gsearch6 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=200,
        max_depth=3,
        min_child_weight=3,
        gamma=0.4,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27),
    param_grid=param_test6,
    scoring='roc_auc',
    n_jobs=4,
    cv=5)
gsearch6.fit(train[predictors], train[target])
print("gsearch6 mean test scores:", gsearch6.cv_results_['mean_test_score'],
      " gsearch6.best_params_:", gsearch6.best_params_,
      " gsearch6.best_score_:", gsearch6.best_score_)
# Parameter group 3: max_depth=3, min_child_weight=3, gamma=0.4, subsample=0.9, colsample_bytree=0.5
# Parameter group 4: max_depth=3, min_child_weight=3, gamma=0.4, subsample=0.95, colsample_bytree=0.5

# Back on the second branch: with max_depth=7, min_child_weight=4 fixed, tune gamma
param_test7 = {
    'gamma': [i / 10.0 for i in range(0, 5)]
}
gsearch7 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=200,
        max_depth=7,
        min_child_weight=4,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27),
    param_grid=param_test7,
    scoring='roc_auc',
    n_jobs=4,
    cv=5)
gsearch7.fit(train[predictors], train[target])
print("gsearch7 mean test scores:", gsearch7.cv_results_['mean_test_score'],
      " gsearch7.best_params_:", gsearch7.best_params_,
      " gsearch7.best_score_:", gsearch7.best_score_)
# Result -- best:   gamma=0
#        second best: gamma=0.1

# With max_depth=7, min_child_weight=4, gamma=0 fixed, tune subsample and colsample_bytree
param_test8 = {
    'subsample': [i / 10.0 for i in range(1, 10, 2)],
    'colsample_bytree': [i / 10.0 for i in range(1, 10, 2)]
}
gsearch8 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=177,
        max_depth=7,
        min_child_weight=4,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27),
    param_grid=param_test8,
    scoring='roc_auc',
    n_jobs=4,
    cv=5)
gsearch8.fit(train[predictors], train[target])
print("gsearch8 mean test scores:", gsearch8.cv_results_['mean_test_score'],
      " gsearch8.best_params_:", gsearch8.best_params_,
      " gsearch8.best_score_:", gsearch8.best_score_)
# Result: subsample=0.9, colsample_bytree=0.9

# Refine
param_test9 = {
    'subsample': [0.85, 0.9, 0.95],
    'colsample_bytree': [0.85, 0.9, 0.95]
}
gsearch9 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=177,
        max_depth=7,
        min_child_weight=4,
        gamma=0,
        subsample=0.9,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27),
    param_grid=param_test9,
    scoring='roc_auc',
    n_jobs=4,
    cv=5)
gsearch9.fit(train[predictors], train[target])
print("gsearch9 mean test scores:", gsearch9.cv_results_['mean_test_score'],
      " gsearch9.best_params_:", gsearch9.best_params_,
      " gsearch9.best_score_:", gsearch9.best_score_)
# Result -- best:   subsample=0.9, colsample_bytree=0.95
#        second best: subsample=0.95, colsample_bytree=0.95
# Parameter group 5: max_depth=7, min_child_weight=4, gamma=0, subsample=0.9, colsample_bytree=0.95
# Parameter group 6: max_depth=7, min_child_weight=4, gamma=0, subsample=0.95, colsample_bytree=0.95

# With max_depth=7, min_child_weight=4, gamma=0.1 fixed, tune subsample and colsample_bytree
param_test10 = {
    'subsample': [i / 10.0 for i in range(1, 10, 2)],
    'colsample_bytree': [i / 10.0 for i in range(1, 10, 2)]
}
gsearch10 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=177,
        max_depth=7,
        min_child_weight=4,
        gamma=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27),
    param_grid=param_test10,
    scoring='roc_auc',
    n_jobs=4,
    cv=5)
gsearch10.fit(train[predictors], train[target])
print("gsearch10 mean test scores:", gsearch10.cv_results_['mean_test_score'],
      " gsearch10.best_params_:", gsearch10.best_params_,
      " gsearch10.best_score_:", gsearch10.best_score_)
# Result: subsample=0.9, colsample_bytree=0.3

# Refine
param_test11 = {
    'subsample': [0.85, 0.9, 0.95],
    'colsample_bytree': [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4]
}
gsearch11 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=177,
        max_depth=7,
        min_child_weight=4,
        gamma=0.1,
        subsample=0.9,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27),
    param_grid=param_test11,
    scoring='roc_auc',
    n_jobs=4,
    cv=5)
gsearch11.fit(train[predictors], train[target])
print("gsearch11 mean test scores:", gsearch11.cv_results_['mean_test_score'],
      " gsearch11.best_params_:", gsearch11.best_params_,
      " gsearch11.best_score_:", gsearch11.best_score_)
# Result -- best:   subsample=0.9, colsample_bytree=0.15
#        second best: subsample=0.9, colsample_bytree=0.3
# Parameter group 7: max_depth=7, min_child_weight=4, gamma=0.1, subsample=0.9, colsample_bytree=0.15
# Parameter group 8: max_depth=7, min_child_weight=4, gamma=0.1, subsample=0.9, colsample_bytree=0.3


##################################################################################################
## In total, eight experimental parameter groups were obtained:

# Parameter group 1: max_depth=3, min_child_weight=3, gamma=0.2, subsample=0.9, colsample_bytree=0.8
# Parameter group 2: max_depth=3, min_child_weight=3, gamma=0.2, subsample=0.8, colsample_bytree=0.8
# Parameter group 3: max_depth=3, min_child_weight=3, gamma=0.4, subsample=0.9, colsample_bytree=0.5
# Parameter group 4: max_depth=3, min_child_weight=3, gamma=0.4, subsample=0.95, colsample_bytree=0.5
# Parameter group 5: max_depth=7, min_child_weight=4, gamma=0, subsample=0.9, colsample_bytree=0.95
# Parameter group 6: max_depth=7, min_child_weight=4, gamma=0, subsample=0.95, colsample_bytree=0.95
# Parameter group 7: max_depth=7, min_child_weight=4, gamma=0.1, subsample=0.9, colsample_bytree=0.15
# Parameter group 8: max_depth=7, min_child_weight=4, gamma=0.1, subsample=0.9, colsample_bytree=0.3

####################################################################################################


# Next, tune reg_alpha and reg_lambda
param_test12 = {
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100]
}
gsearch12 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=177,
        max_depth=3,
        min_child_weight=3,
        gamma=0.2,
        subsample=0.9,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27),
    param_grid=param_test12,
    scoring='roc_auc',
    n_jobs=4,
    cv=5)
gsearch12.fit(train[predictors], train[target])
print("gsearch12 mean test scores:", gsearch12.cv_results_['mean_test_score'],
      " gsearch12.best_params_:", gsearch12.best_params_,
      " gsearch12.best_score_:", gsearch12.best_score_)
# Best around reg_alpha=0.00001; keep refining

param_test13 = {
    'reg_alpha': [0, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3]
}
gsearch13 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.05,
        n_estimators=1000,
        max_depth=3,
        min_child_weight=3,
        gamma=0.2,
        subsample=0.9,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27),
    param_grid=param_test13,
    scoring='roc_auc',
    n_jobs=4,
    cv=5)
gsearch13.fit(train[predictors], train[target])
print("gsearch13 mean test scores:", gsearch13.cv_results_['mean_test_score'],
      " gsearch13.best_params_:", gsearch13.best_params_,
      " gsearch13.best_score_:", gsearch13.best_score_)
# Final choice: reg_alpha=1e-5


# Now tune reg_lambda
param_test14 = {
    'reg_lambda': [0, 1e-6, 1e-5, 1e-4, 1e-3]
}
gsearch14 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=177,
        max_depth=3,
        min_child_weight=3,
        gamma=0.2,
        reg_alpha=1e-5,
        subsample=0.9,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27),
    param_grid=param_test14,
    scoring='roc_auc',
    n_jobs=4,
    cv=5)
gsearch14.fit(train[predictors], train[target])
print("gsearch14 mean test scores:", gsearch14.cv_results_['mean_test_score'],
      " gsearch14.best_params_:", gsearch14.best_params_,
      " gsearch14.best_score_:", gsearch14.best_score_)
# Best in this range: reg_lambda=0.001

param_test15 = {
    'reg_lambda': [1, 10, 100]
}
gsearch15 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=177,
        max_depth=3,
        min_child_weight=3,
        gamma=0.2,
        reg_alpha=1e-5,
        subsample=0.9,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27),
    param_grid=param_test15,
    scoring='roc_auc',
    n_jobs=4,
    cv=5)
gsearch15.fit(train[predictors], train[target])
print("gsearch15 mean test scores:", gsearch15.cv_results_['mean_test_score'],
      " gsearch15.best_params_:", gsearch15.best_params_,
      " gsearch15.best_score_:", gsearch15.best_score_)
# Final choice: reg_lambda=1

# Final regularization parameters: reg_alpha=1e-5, reg_lambda=1
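# Note: the searches above run at learning_rate=0.1 with a few hundred trees so
# the grids stay cheap; the final models (训练时的程序/, 泛化时的程序/) drop
# learning_rate to 0.01 and raise num_boost_round into the thousands instead.
# E.g. parameter group 1 re-fit at the production learning rate (illustrative
# sketch; final1 is our name, and fitting 5000 trees on this data is slow):
final1 = XGBClassifier(learning_rate=0.01, n_estimators=5000,
                       max_depth=3, min_child_weight=3, gamma=0.2,
                       subsample=0.9, colsample_bytree=0.8,
                       reg_alpha=1e-5, reg_lambda=1,
                       objective='binary:logistic', nthread=4, seed=27)
# final1.fit(train[predictors], train[target])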
--------------------------------------------------------------------------------
/XGBoost调参/调参过程结果截图.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/woairong/Xgboost-integration-model-based-on-moderate-greedy-parametric-strategy/383ba81c86cb817670d9a6ee32c97496a7cf6bec/XGBoost调参/调参过程结果截图.docx
--------------------------------------------------------------------------------
/data.rar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/woairong/Xgboost-integration-model-based-on-moderate-greedy-parametric-strategy/383ba81c86cb817670d9a6ee32c97496a7cf6bec/data.rar
--------------------------------------------------------------------------------
/南衣-上海财经大学-基于适度贪婪调参策略下的xgboost集成模型.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/woairong/Xgboost-integration-model-based-on-moderate-greedy-parametric-strategy/383ba81c86cb817670d9a6ee32c97496a7cf6bec/南衣-上海财经大学-基于适度贪婪调参策略下的xgboost集成模型.docx
--------------------------------------------------------------------------------
/数据预处理/数据选择.py:
--------------------------------------------------------------------------------
import pandas as pd

# Load the five raw training-data parts
df1 = pd.read_csv('G:\\ml360\\train\\traindata1.txt', sep='\t')
df2 = pd.read_csv('G:\\ml360\\train\\traindata2.txt', sep='\t')
df3 = pd.read_csv('G:\\ml360\\train\\traindata3.txt', sep='\t')
df4 = pd.read_csv('G:\\ml360\\train\\traindata4.txt', sep='\t')
df5 = pd.read_csv('G:\\ml360\\train\\traindata5.txt', sep='\t')
# Write part 1 to 1104.csv, then append the first 13464 rows of part 2
df20 = df2.iloc[0:13464, :]
df1.to_csv('G:\\ml360\\train\\1104.csv', index=False, sep=',')
df20.to_csv('G:\\ml360\\train\\1104.csv', mode='a', index=False, header=False, sep=',')
df1104 = pd.read_csv('G:\\ml360\\train\\1104.csv')
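# Equivalent single-step construction of 1104.csv (illustrative alternative to
# the write-then-append above; train_df is our name, not the repo's):
train_df = pd.concat([df1, df2.iloc[0:13464]], ignore_index=True)
train_df.to_csv('G:\\ml360\\train\\1104.csv', index=False)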
--------------------------------------------------------------------------------
/泛化时的程序/test_model1.py:
--------------------------------------------------------------------------------
import pandas as pd
import xgboost as xgb

# Load the preprocessed training set; 1104.csv is the 33464 x 6749 table from
# preprocessing (33464 x 6746 after dropping loan_dt/id/tag below)
df = pd.read_csv('G:\\ml360\\train\\1104.csv')
df = df.drop(['loan_dt', 'id', 'tag'], axis=1)
train_x = df.iloc[:, 1:6746]
train_y = df.iloc[:, 0]

dtrain = xgb.DMatrix(train_x, label=train_y)

# Booster parameters:
params = {'booster': 'gbtree',
          'objective': 'binary:logistic',
          'eval_metric': 'auc',
          'max_depth': 3,
          'lambda': 1,
          'subsample': 0.9,
          'colsample_bytree': 0.8,
          'min_child_weight': 3,
          'alpha': 1e-5,
          'seed': 0,
          'nthread': 4,
          'verbosity': 0,
          'gamma': 0.2,
          'learning_rate': 0.01}
watchlist = [(dtrain, 'train')]
bst = xgb.train(params, dtrain, num_boost_round=4000, evals=watchlist)
bst.save_model('G:\\ml360\\train\\test\\test_model1')  # save the experimental model
--------------------------------------------------------------------------------
/泛化时的程序/test_model2.py:
--------------------------------------------------------------------------------
import pandas as pd
import xgboost as xgb

# Load the preprocessed training set; 1104.csv is the 33464 x 6749 table from
# preprocessing (33464 x 6746 after dropping loan_dt/id/tag below)
df = pd.read_csv('G:\\ml360\\train\\1104.csv')
df = df.drop(['loan_dt', 'id', 'tag'], axis=1)
train_x = df.iloc[:, 1:6746]
train_y = df.iloc[:, 0]

dtrain = xgb.DMatrix(train_x, label=train_y)

# Booster parameters:
params = {'booster': 'gbtree',
          'objective': 'binary:logistic',
          'eval_metric': 'auc',
          'max_depth': 3,
          'lambda': 1,
          'subsample': 0.8,
          'colsample_bytree': 0.8,
          'min_child_weight': 3,
          'alpha': 1e-5,
          'seed': 0,
          'nthread': 4,
          'verbosity': 0,
          'gamma': 0.2,
          'learning_rate': 0.01}
watchlist = [(dtrain, 'train')]
bst = xgb.train(params, dtrain, num_boost_round=5000, evals=watchlist)
bst.save_model('G:\\ml360\\train\\test\\test_model2')  # save the experimental model
--------------------------------------------------------------------------------
/泛化时的程序/test_model3.py:
--------------------------------------------------------------------------------
import pandas as pd
import xgboost as xgb

# Load the preprocessed training set; 1104.csv is the 33464 x 6749 table from
# preprocessing (33464 x 6746 after dropping loan_dt/id/tag below)
df = pd.read_csv('G:\\ml360\\train\\1104.csv')
df = df.drop(['loan_dt', 'id', 'tag'], axis=1)
train_x = df.iloc[:, 1:6746]
train_y = df.iloc[:, 0]

dtrain = xgb.DMatrix(train_x, label=train_y)

# Booster parameters:
params = {'booster': 'gbtree',
          'objective': 'binary:logistic',
          'eval_metric': 'auc',
          'max_depth': 3,
          'lambda': 1,
          'subsample': 0.9,
          'colsample_bytree': 0.5,
          'min_child_weight': 3,
          'alpha': 1e-5,
          'seed': 0,
          'nthread': 4,
          'verbosity': 0,
          'gamma': 0.4,
          'learning_rate': 0.01}
watchlist = [(dtrain, 'train')]
bst = xgb.train(params, dtrain, num_boost_round=6000, evals=watchlist)
bst.save_model('G:\\ml360\\train\\test\\test_model3')  # save the experimental model
--------------------------------------------------------------------------------
/泛化时的程序/test_model4.py:
--------------------------------------------------------------------------------
import pandas as pd
import xgboost as xgb

# Load the preprocessed training set; 1104.csv is the 33464 x 6749 table from
# preprocessing (33464 x 6746 after dropping loan_dt/id/tag below)
df = pd.read_csv('G:\\ml360\\train\\1104.csv')
df = df.drop(['loan_dt', 'id', 'tag'], axis=1)
train_x = df.iloc[:, 1:6746]
train_y = df.iloc[:, 0]

dtrain = xgb.DMatrix(train_x, label=train_y)

# Booster parameters:
params = {'booster': 'gbtree',
          'objective': 'binary:logistic',
          'eval_metric': 'auc',
          'max_depth': 3,
          'lambda': 1,
          'subsample': 0.95,
          'colsample_bytree': 0.95,
          'min_child_weight': 3,
          'alpha': 1e-5,
          'seed': 0,
          'nthread': 4,
          'verbosity': 0,
          'gamma': 0.4,
          'learning_rate': 0.01}
watchlist = [(dtrain, 'train')]
bst = xgb.train(params, dtrain, num_boost_round=4000, evals=watchlist)
bst.save_model('G:\\ml360\\train\\test\\test_model4')  # save the experimental model
--------------------------------------------------------------------------------
/泛化时的程序/test_model5.py:
--------------------------------------------------------------------------------
import pandas as pd
import xgboost as xgb

# Load the preprocessed training set; 1104.csv is the 33464 x 6749 table from
# preprocessing (33464 x 6746 after dropping loan_dt/id/tag below)
df = pd.read_csv('G:\\ml360\\train\\1104.csv')
df = df.drop(['loan_dt', 'id', 'tag'], axis=1)
train_x = df.iloc[:, 1:6746]
train_y = df.iloc[:, 0]

dtrain = xgb.DMatrix(train_x, label=train_y)

# Booster parameters:
params = {'booster': 'gbtree',
          'objective': 'binary:logistic',
          'eval_metric': 'auc',
          'max_depth': 7,
          'lambda': 1,
          'subsample': 0.9,
          'colsample_bytree': 0.95,
          'min_child_weight': 4,
          'alpha': 1e-5,
          'seed': 0,
          'nthread': 4,
          'verbosity': 0,
          'gamma': 0,
          'learning_rate': 0.01}
watchlist = [(dtrain, 'train')]
bst = xgb.train(params, dtrain, num_boost_round=1000, evals=watchlist)
bst.save_model('G:\\ml360\\train\\test\\test_model5')  # save the experimental model
--------------------------------------------------------------------------------
/泛化时的程序/test_model6.py:
--------------------------------------------------------------------------------
import pandas as pd
import xgboost as xgb

# Load the preprocessed training set; 1104.csv is the 33464 x 6749 table from
# preprocessing (33464 x 6746 after dropping loan_dt/id/tag below)
df = pd.read_csv('G:\\ml360\\train\\1104.csv')
df = df.drop(['loan_dt', 'id', 'tag'], axis=1)
train_x = df.iloc[:, 1:6746]
train_y = df.iloc[:, 0]

dtrain = xgb.DMatrix(train_x, label=train_y)

# Booster parameters:
params = {'booster': 'gbtree',
          'objective': 'binary:logistic',
          'eval_metric': 'auc',
          'max_depth': 7,
          'lambda': 1,
          'subsample': 0.95,
          'colsample_bytree': 0.95,
          'min_child_weight': 4,
          'alpha': 1e-5,
          'seed': 0,
          'nthread': 4,
          'verbosity': 0,
          'gamma': 0,
          'learning_rate': 0.01}
watchlist = [(dtrain, 'train')]
bst = xgb.train(params, dtrain, num_boost_round=6745, evals=watchlist)
bst.save_model('G:\\ml360\\train\\test\\test_model6')  # save the experimental model
--------------------------------------------------------------------------------
/泛化时的程序/test_model7.py:
--------------------------------------------------------------------------------
import pandas as pd
import xgboost as xgb

# Load the preprocessed training set; 1104.csv is the 33464 x 6749 table from
# preprocessing (33464 x 6746 after dropping loan_dt/id/tag below)
df = pd.read_csv('G:\\ml360\\train\\1104.csv')
df = df.drop(['loan_dt', 'id', 'tag'], axis=1)
train_x = df.iloc[:, 1:6746]
train_y = df.iloc[:, 0]

dtrain = xgb.DMatrix(train_x, label=train_y)

# Booster parameters:
params = {'booster': 'gbtree',
          'objective': 'binary:logistic',
          'eval_metric': 'auc',
          'max_depth': 7,
          'lambda': 1,
          'subsample': 0.9,
          'colsample_bytree': 0.3,
          'min_child_weight': 4,
          'alpha': 1e-5,
          'seed': 0,
          'nthread': 4,
          'verbosity': 0,
          'gamma': 0.1,
          'learning_rate': 0.01}
watchlist = [(dtrain, 'train')]
bst = xgb.train(params, dtrain, num_boost_round=6500, evals=watchlist)
bst.save_model('G:\\ml360\\train\\test\\test_model7')  # save the experimental model
--------------------------------------------------------------------------------
/泛化时的程序/test_model8.py:
--------------------------------------------------------------------------------
import pandas as pd
import xgboost as xgb

# Load the preprocessed training set; 1104.csv is the 33464 x 6749 table from
# preprocessing (33464 x 6746 after dropping loan_dt/id/tag below)
df = pd.read_csv('G:\\ml360\\train\\1104.csv')
df = df.drop(['loan_dt', 'id', 'tag'], axis=1)
train_x = df.iloc[:, 1:6746]
train_y = df.iloc[:, 0]

dtrain = xgb.DMatrix(train_x, label=train_y)

# Booster parameters:
params = {'booster': 'gbtree',
          'objective': 'binary:logistic',
          'eval_metric': 'auc',
          'max_depth': 7,
          'lambda': 1,
          'subsample': 0.95,
          'colsample_bytree': 0.4,
          'min_child_weight': 4,
          'alpha': 1e-5,
          'seed': 0,
          'nthread': 4,
          'verbosity': 0,
          'gamma': 0.1,
          'learning_rate': 0.01}
watchlist = [(dtrain, 'train')]
bst = xgb.train(params, dtrain, num_boost_round=5000, evals=watchlist)
bst.save_model('G:\\ml360\\train\\test\\test_model8')  # save the experimental model
--------------------------------------------------------------------------------
/泛化时的程序/泛化时的集成学习.py:
--------------------------------------------------------------------------------
import pandas as pd
import xgboost as xgb

# Load the test set released for the competition
test = pd.read_csv('G:\\ml360\\train\\test\\test(p).csv')  # test(p).csv is the released test-set data
id = test['id']
test = test.iloc[:, 2:6747]

dtest = xgb.DMatrix(test)

bst1 = xgb.Booster(model_file='G:/ml360/train/test/test_model1')
bst2 = xgb.Booster(model_file='G:/ml360/train/test/test_model2')
bst3 = xgb.Booster(model_file='G:/ml360/train/test/test_model3')
bst4 = xgb.Booster(model_file='G:/ml360/train/test/test_model4')
bst5 = xgb.Booster(model_file='G:/ml360/train/test/test_model5')
bst6 = xgb.Booster(model_file='G:/ml360/train/test/test_model6')
bst7 = xgb.Booster(model_file='G:/ml360/train/test/test_model7')
bst8 = xgb.Booster(model_file='G:/ml360/train/test/test_model8')

ypred1 = bst1.predict(dtest)
ypred2 = bst2.predict(dtest)
ypred3 = bst3.predict(dtest)
ypred4 = bst4.predict(dtest)
ypred5 = bst5.predict(dtest)
ypred6 = bst6.predict(dtest)
ypred7 = bst7.predict(dtest)
ypred8 = bst8.predict(dtest)

# Weighted blend of the eight base models
ypred = 0.296*ypred1 + 0.148*ypred2 + 0.148*ypred3 + 0.074*ypred4 + 0.148*ypred5 + 0.074*ypred6 + 0.074*ypred7 + 0.038*ypred8

# Save the submission
pd_ypred = pd.DataFrame(list(ypred), columns=['prob'])
result = pd.concat([id, pd_ypred], axis=1)
result.to_csv('G:\\ml360\\train\\test\\result.txt', index=False, sep=',')
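# The blend weights above sum to 1.0 (0.296 + 3*0.148 + 3*0.074 + 0.038), so the
# output stays a valid probability. A generic form of the same blend
# (illustrative; preds/weights/ypred_blend are our names):
import numpy as np
preds = [ypred1, ypred2, ypred3, ypred4, ypred5, ypred6, ypred7, ypred8]
weights = np.array([0.296, 0.148, 0.148, 0.074, 0.148, 0.074, 0.074, 0.038])
ypred_blend = np.average(np.vstack(preds), axis=0, weights=weights)
assert np.allclose(ypred_blend, ypred)  # same result as the hand-written sum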
--------------------------------------------------------------------------------
/训练时的程序/model1.py:
--------------------------------------------------------------------------------
import pandas as pd
from xgboost import plot_importance
import xgboost as xgb
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Load the preprocessed training set; 1104.csv is the 33464 x 6749 table from
# preprocessing
df = pd.read_csv('G:\\ml360\\train\\1104.csv')
data = df.iloc[:, 4:6749]
flag = df['label']
train_x, test_x, train_y, test_y = train_test_split(data, flag, test_size=0.3, random_state=0)

dtrain = xgb.DMatrix(train_x, label=train_y)
dtest = xgb.DMatrix(test_x)

# Booster parameters (parameter group 1):
params = {'booster': 'gbtree',
          'objective': 'binary:logistic',
          'eval_metric': 'auc',
          'max_depth': 3,
          'lambda': 1,
          'subsample': 0.9,
          'colsample_bytree': 0.8,
          'min_child_weight': 3,
          'alpha': 1e-5,
          'seed': 0,
          'nthread': 4,
          'verbosity': 0,
          'gamma': 0.2,
          'learning_rate': 0.01}
watchlist = [(dtrain, 'train')]
bst = xgb.train(params, dtrain, num_boost_round=5000, evals=watchlist)
bst.save_model('G:\\ml360\\train\\test\\model1')  # save the experimental model

ypred = bst.predict(dtest)
y_pred = (ypred >= 0.5) * 1

# Plot feature-importance scores
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 20, 800
plot_importance(bst)

# Report metrics and plot the ROC curve
from sklearn import metrics
print('Results under parameter set 1:')
print('AUC: %.4f' % metrics.roc_auc_score(test_y, ypred))
print('ACC: %.4f' % metrics.accuracy_score(test_y, y_pred))
print('Recall: %.4f' % metrics.recall_score(test_y, y_pred))
print('F1-score: %.4f' % metrics.f1_score(test_y, y_pred))
print('Precision: %.4f' % metrics.precision_score(test_y, y_pred))
print(metrics.confusion_matrix(test_y, y_pred))

fpr, tpr, threshold = roc_curve(test_y, ypred)  # compute true/false positive rates
roc_auc = auc(fpr, tpr)  # compute the AUC

plt.figure(figsize=(10, 10))
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.4f)' % roc_auc)  # FPR on the x-axis, TPR on the y-axis
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (parameter set 1)')
plt.legend(loc="lower right")
plt.show()
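# The eight training scripts (model1.py .. model8.py) are identical except for
# the tuned parameter group and num_boost_round, so they can be collapsed into
# one loop (illustrative sketch; BASE, PARAM_GROUPS and ROUNDS are our names):
BASE = {'booster': 'gbtree', 'objective': 'binary:logistic', 'eval_metric': 'auc',
        'lambda': 1, 'alpha': 1e-5, 'seed': 0, 'nthread': 4, 'verbosity': 0,
        'learning_rate': 0.01}
PARAM_GROUPS = [
    dict(max_depth=3, min_child_weight=3, gamma=0.2, subsample=0.9, colsample_bytree=0.8),
    dict(max_depth=3, min_child_weight=3, gamma=0.2, subsample=0.8, colsample_bytree=0.8),
    dict(max_depth=3, min_child_weight=3, gamma=0.4, subsample=0.9, colsample_bytree=0.5),
    dict(max_depth=3, min_child_weight=3, gamma=0.4, subsample=0.95, colsample_bytree=0.5),
    dict(max_depth=7, min_child_weight=4, gamma=0, subsample=0.9, colsample_bytree=0.95),
    dict(max_depth=7, min_child_weight=4, gamma=0, subsample=0.95, colsample_bytree=0.95),
    dict(max_depth=7, min_child_weight=4, gamma=0.1, subsample=0.9, colsample_bytree=0.15),
    dict(max_depth=7, min_child_weight=4, gamma=0.1, subsample=0.9, colsample_bytree=0.3),
]
ROUNDS = [5000, 5000, 5000, 5000, 1000, 1000, 1000, 2000]  # as used in model1..model8
for i, (grp, rounds) in enumerate(zip(PARAM_GROUPS, ROUNDS), 1):
    booster = xgb.train({**BASE, **grp}, dtrain, num_boost_round=rounds,
                        evals=[(dtrain, 'train')])
    booster.save_model('G:\\ml360\\train\\test\\model%d' % i)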
--------------------------------------------------------------------------------
/训练时的程序/model2.py:
--------------------------------------------------------------------------------
import pandas as pd
from xgboost import plot_importance
import xgboost as xgb
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Load the preprocessed training set; 1104.csv is the 33464 x 6749 table from
# preprocessing
df = pd.read_csv('G:\\ml360\\train\\1104.csv')
data = df.iloc[:, 4:6749]
flag = df['label']
train_x, test_x, train_y, test_y = train_test_split(data, flag, test_size=0.3, random_state=0)

dtrain = xgb.DMatrix(train_x, label=train_y)
dtest = xgb.DMatrix(test_x)

# Booster parameters (parameter group 2):
params = {'booster': 'gbtree',
          'objective': 'binary:logistic',
          'eval_metric': 'auc',
          'max_depth': 3,
          'lambda': 1,
          'subsample': 0.8,
          'colsample_bytree': 0.8,
          'min_child_weight': 3,
          'alpha': 1e-5,
          'seed': 0,
          'nthread': 4,
          'verbosity': 0,
          'gamma': 0.2,
          'learning_rate': 0.01}
watchlist = [(dtrain, 'train')]
bst = xgb.train(params, dtrain, num_boost_round=5000, evals=watchlist)
bst.save_model('G:\\ml360\\train\\test\\model2')  # save the experimental model

ypred = bst.predict(dtest)
y_pred = (ypred >= 0.5) * 1

# Plot feature-importance scores
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 20, 800
plot_importance(bst)

# Report metrics and plot the ROC curve
from sklearn import metrics
print('Results under parameter set 2:')
print('AUC: %.4f' % metrics.roc_auc_score(test_y, ypred))
print('ACC: %.4f' % metrics.accuracy_score(test_y, y_pred))
print('Recall: %.4f' % metrics.recall_score(test_y, y_pred))
print('F1-score: %.4f' % metrics.f1_score(test_y, y_pred))
print('Precision: %.4f' % metrics.precision_score(test_y, y_pred))
print(metrics.confusion_matrix(test_y, y_pred))

fpr, tpr, threshold = roc_curve(test_y, ypred)  # compute true/false positive rates
roc_auc = auc(fpr, tpr)  # compute the AUC

plt.figure(figsize=(10, 10))
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.4f)' % roc_auc)  # FPR on the x-axis, TPR on the y-axis
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (parameter set 2)')
plt.legend(loc="lower right")
plt.show()
--------------------------------------------------------------------------------
/训练时的程序/model3.py:
--------------------------------------------------------------------------------
import pandas as pd
from xgboost import plot_importance
import xgboost as xgb
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Load the preprocessed training set; 1104.csv is the 33464 x 6749 table from
# preprocessing
df = pd.read_csv('G:\\ml360\\train\\1104.csv')
data = df.iloc[:, 4:6749]
flag = df['label']
train_x, test_x, train_y, test_y = train_test_split(data, flag, test_size=0.3, random_state=0)

dtrain = xgb.DMatrix(train_x, label=train_y)
dtest = xgb.DMatrix(test_x)

# Booster parameters (parameter group 3):
params = {'booster': 'gbtree',
          'objective': 'binary:logistic',
          'eval_metric': 'auc',
          'max_depth': 3,
          'lambda': 1,
          'subsample': 0.9,
          'colsample_bytree': 0.5,
          'min_child_weight': 3,
          'alpha': 1e-5,
          'seed': 0,
          'nthread': 4,
          'verbosity': 0,
          'gamma': 0.4,
          'learning_rate': 0.01}
watchlist = [(dtrain, 'train')]
bst = xgb.train(params, dtrain, num_boost_round=5000, evals=watchlist)
bst.save_model('G:\\ml360\\train\\test\\model3')  # save the experimental model

ypred = bst.predict(dtest)
y_pred = (ypred >= 0.5) * 1

# Plot feature-importance scores
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 20, 800
plot_importance(bst)

# Report metrics and plot the ROC curve
from sklearn import metrics
print('Results under parameter set 3:')
print('AUC: %.4f' % metrics.roc_auc_score(test_y, ypred))
print('ACC: %.4f' % metrics.accuracy_score(test_y, y_pred))
print('Recall: %.4f' % metrics.recall_score(test_y, y_pred))
print('F1-score: %.4f' % metrics.f1_score(test_y, y_pred))
print('Precision: %.4f' % metrics.precision_score(test_y, y_pred))
print(metrics.confusion_matrix(test_y, y_pred))

fpr, tpr, threshold = roc_curve(test_y, ypred)  # compute true/false positive rates
roc_auc = auc(fpr, tpr)  # compute the AUC

plt.figure(figsize=(10, 10))
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.4f)' % roc_auc)  # FPR on the x-axis, TPR on the y-axis
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (parameter set 3)')
plt.legend(loc="lower right")
plt.show()
--------------------------------------------------------------------------------
/训练时的程序/model4.py:
--------------------------------------------------------------------------------
import pandas as pd
from xgboost import plot_importance
import xgboost as xgb
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Load the preprocessed training set; 1104.csv is the 33464 x 6749 table from
# preprocessing
df = pd.read_csv('G:\\ml360\\train\\1104.csv')
data = df.iloc[:, 4:6749]
flag = df['label']
train_x, test_x, train_y, test_y = train_test_split(data, flag, test_size=0.3, random_state=0)

dtrain = xgb.DMatrix(train_x, label=train_y)
dtest = xgb.DMatrix(test_x)

# Booster parameters (parameter group 4):
params = {'booster': 'gbtree',
          'objective': 'binary:logistic',
          'eval_metric': 'auc',
          'max_depth': 3,
          'lambda': 1,
          'subsample': 0.95,
          'colsample_bytree': 0.5,
          'min_child_weight': 3,
          'alpha': 1e-5,
          'seed': 0,
          'nthread': 4,
          'verbosity': 0,
          'gamma': 0.4,
          'learning_rate': 0.01}
watchlist = [(dtrain, 'train')]
bst = xgb.train(params, dtrain, num_boost_round=5000, evals=watchlist)
bst.save_model('G:\\ml360\\train\\test\\model4')  # save the experimental model

ypred = bst.predict(dtest)
y_pred = (ypred >= 0.5) * 1

# Plot feature-importance scores
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 20, 800
plot_importance(bst)

# Report metrics and plot the ROC curve
from sklearn import metrics
print('Results under parameter set 4:')
print('AUC: %.4f' % metrics.roc_auc_score(test_y, ypred))
print('ACC: %.4f' % metrics.accuracy_score(test_y, y_pred))
print('Recall: %.4f' % metrics.recall_score(test_y, y_pred))
print('F1-score: %.4f' % metrics.f1_score(test_y, y_pred))
print('Precision: %.4f' % metrics.precision_score(test_y, y_pred))
print(metrics.confusion_matrix(test_y, y_pred))

fpr, tpr, threshold = roc_curve(test_y, ypred)  # compute true/false positive rates
roc_auc = auc(fpr, tpr)  # compute the AUC

plt.figure(figsize=(10, 10))
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.4f)' % roc_auc)  # FPR on the x-axis, TPR on the y-axis
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (parameter set 4)')
plt.legend(loc="lower right")
plt.show()
--------------------------------------------------------------------------------
/训练时的程序/model5.py:
--------------------------------------------------------------------------------
import pandas as pd
from xgboost import plot_importance
import xgboost as xgb
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Load the preprocessed training set; 1104.csv is the 33464 x 6749 table from
# preprocessing
df = pd.read_csv('G:\\ml360\\train\\1104.csv')
data = df.iloc[:, 4:6749]
flag = df['label']
train_x, test_x, train_y, test_y = train_test_split(data, flag, test_size=0.3, random_state=0)

dtrain = xgb.DMatrix(train_x, label=train_y)
dtest = xgb.DMatrix(test_x)

# Booster parameters (parameter group 5):
params = {'booster': 'gbtree',
          'objective': 'binary:logistic',
          'eval_metric': 'auc',
          'max_depth': 7,
          'lambda': 1,
          'subsample': 0.9,
          'colsample_bytree': 0.95,
          'min_child_weight': 4,
          'alpha': 1e-5,
          'seed': 0,
          'nthread': 4,
          'verbosity': 0,
          'gamma': 0,
          'learning_rate': 0.01}  # 0.8319/0.8299--0.8302 at num_boost_round = 4000/7000
watchlist = [(dtrain, 'train')]
bst = xgb.train(params, dtrain, num_boost_round=1000, evals=watchlist)
bst.save_model('G:\\ml360\\train\\test\\model5')  # save the experimental model

ypred = bst.predict(dtest)
y_pred = (ypred >= 0.5) * 1

# Plot feature-importance scores
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 20, 800
plot_importance(bst)

# Report metrics and plot the ROC curve
from sklearn import metrics
print('Results under parameter set 5:')
print('AUC: %.4f' % metrics.roc_auc_score(test_y, ypred))
print('ACC: %.4f' % metrics.accuracy_score(test_y, y_pred))
print('Recall: %.4f' % metrics.recall_score(test_y, y_pred))
print('F1-score: %.4f' % metrics.f1_score(test_y, y_pred))
print('Precision: %.4f' % metrics.precision_score(test_y, y_pred))
print(metrics.confusion_matrix(test_y, y_pred))

fpr, tpr, threshold = roc_curve(test_y, ypred)  # compute true/false positive rates
roc_auc = auc(fpr, tpr)  # compute the AUC

plt.figure(figsize=(10, 10))
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.4f)' % roc_auc)  # FPR on the x-axis, TPR on the y-axis
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (parameter set 5)')
plt.legend(loc="lower right")
plt.show()
--------------------------------------------------------------------------------
/训练时的程序/model6.py:
--------------------------------------------------------------------------------
import pandas as pd
from xgboost import plot_importance
import xgboost as xgb
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Load the preprocessed training set; 1104.csv is the 33464 x 6749 table from
# preprocessing
df = pd.read_csv('G:\\ml360\\train\\1104.csv')
data = df.iloc[:, 4:6749]
flag = df['label']
train_x, test_x, train_y, test_y = train_test_split(data, flag, test_size=0.3, random_state=0)

dtrain = xgb.DMatrix(train_x, label=train_y)
dtest = xgb.DMatrix(test_x)

# Booster parameters (parameter group 6):
params = {'booster': 'gbtree',
          'objective': 'binary:logistic',
          'eval_metric': 'auc',
          'max_depth': 7,
          'lambda': 1,
          'subsample': 0.95,
          'colsample_bytree': 0.95,
          'min_child_weight': 4,
          'alpha': 1e-5,
          'seed': 0,
          'nthread': 4,
          'verbosity': 0,
          'gamma': 0,
          'learning_rate': 0.01}
watchlist = [(dtrain, 'train')]
bst = xgb.train(params, dtrain, num_boost_round=1000, evals=watchlist)
bst.save_model('G:\\ml360\\train\\test\\model6')  # save the experimental model

ypred = bst.predict(dtest)
y_pred = (ypred >= 0.5) * 1

# Plot feature-importance scores
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 20, 800
plot_importance(bst)

# Report metrics and plot the ROC curve
from sklearn import metrics
print('Results under parameter set 6:')
print('AUC: %.4f' % metrics.roc_auc_score(test_y, ypred))
print('ACC: %.4f' % metrics.accuracy_score(test_y, y_pred))
print('Recall: %.4f' % metrics.recall_score(test_y, y_pred))
print('F1-score: %.4f' % metrics.f1_score(test_y, y_pred))
print('Precision: %.4f' % metrics.precision_score(test_y, y_pred))
print(metrics.confusion_matrix(test_y, y_pred))

fpr, tpr, threshold = roc_curve(test_y, ypred)  # compute true/false positive rates
roc_auc = auc(fpr, tpr)  # compute the AUC

plt.figure(figsize=(10, 10))
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.4f)' % roc_auc)  # FPR on the x-axis, TPR on the y-axis
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (parameter set 6)')
plt.legend(loc="lower right")
plt.show()
--------------------------------------------------------------------------------
/训练时的程序/model7.py:
--------------------------------------------------------------------------------
import pandas as pd
from xgboost import plot_importance
import xgboost as xgb
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Load the preprocessed training set; 1104.csv is the 33464 x 6749 table from
# preprocessing
df = pd.read_csv('G:\\ml360\\train\\1104.csv')
data = df.iloc[:, 4:6749]
flag = df['label']
train_x, test_x, train_y, test_y = train_test_split(data, flag, test_size=0.3, random_state=0)

dtrain = xgb.DMatrix(train_x, label=train_y)
dtest = xgb.DMatrix(test_x)

# Booster parameters (parameter group 7):
params = {'booster': 'gbtree',
          'objective': 'binary:logistic',
          'eval_metric': 'auc',
          'max_depth': 7,
          'lambda': 1,
          'subsample': 0.9,
          'colsample_bytree': 0.15,
          'min_child_weight': 4,
          'alpha': 1e-5,
          'seed': 0,
          'nthread': 4,
          'verbosity': 0,
          'gamma': 0.1,
          'learning_rate': 0.01}
watchlist = [(dtrain, 'train')]
bst = xgb.train(params, dtrain, num_boost_round=1000, evals=watchlist)
bst.save_model('G:\\ml360\\train\\test\\model7')  # save the experimental model

ypred = bst.predict(dtest)
y_pred = (ypred >= 0.5) * 1

# Plot feature-importance scores
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 20, 800
plot_importance(bst)

# Report metrics and plot the ROC curve
from sklearn import metrics
print('Results under parameter set 7:')
print('AUC: %.4f' % metrics.roc_auc_score(test_y, ypred))
print('ACC: %.4f' % metrics.accuracy_score(test_y, y_pred))
print('Recall: %.4f' % metrics.recall_score(test_y, y_pred))
print('F1-score: %.4f' % metrics.f1_score(test_y, y_pred))
print('Precision: %.4f' % metrics.precision_score(test_y, y_pred))
print(metrics.confusion_matrix(test_y, y_pred))

fpr, tpr, threshold = roc_curve(test_y, ypred)  # compute true/false positive rates
roc_auc = auc(fpr, tpr)  # compute the AUC

plt.figure(figsize=(10, 10))
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.4f)' % roc_auc)  # FPR on the x-axis, TPR on the y-axis
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (parameter set 7)')
plt.legend(loc="lower right")
plt.show()
--------------------------------------------------------------------------------
/训练时的程序/model8.py:
--------------------------------------------------------------------------------
import pandas as pd
from xgboost import plot_importance
import xgboost as xgb
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Load the preprocessed training set; 1104.csv is the 33464 x 6749 table from
# preprocessing
df = pd.read_csv('G:\\ml360\\train\\1104.csv')
data = df.iloc[:, 4:6749]
flag = df['label']
train_x, test_x, train_y, test_y = train_test_split(data, flag, test_size=0.3, random_state=0)

dtrain = xgb.DMatrix(train_x, label=train_y)
dtest = xgb.DMatrix(test_x)

# Booster parameters (parameter group 8):
params = {'booster': 'gbtree',
          'objective': 'binary:logistic',
          'eval_metric': 'auc',
          'max_depth': 7,
          'lambda': 1,
          'subsample': 0.9,
          'colsample_bytree': 0.3,
          'min_child_weight': 4,
          'alpha': 1e-5,
          'seed': 0,
          'nthread': 4,
          'verbosity': 0,
          'gamma': 0.1,
          'learning_rate': 0.01}  # 0.8319/0.8299--0.8302 at num_boost_round = 4000/7000
watchlist = [(dtrain, 'train')]
bst = xgb.train(params, dtrain, num_boost_round=2000, evals=watchlist)
bst.save_model('G:\\ml360\\train\\test\\model8')  # save the experimental model

ypred = bst.predict(dtest)
y_pred = (ypred >= 0.5) * 1

# Plot feature-importance scores
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 20, 800
plot_importance(bst)

# Report metrics and plot the ROC curve
from sklearn import metrics
print('Results under parameter set 8:')
print('AUC: %.4f' % metrics.roc_auc_score(test_y, ypred))
print('ACC: %.4f' % metrics.accuracy_score(test_y, y_pred))
print('Recall: %.4f' % metrics.recall_score(test_y, y_pred))
print('F1-score: %.4f' % metrics.f1_score(test_y, y_pred))
print('Precision: %.4f' % metrics.precision_score(test_y, y_pred))
print(metrics.confusion_matrix(test_y, y_pred))

fpr, tpr, threshold = roc_curve(test_y, ypred)  # compute true/false positive rates
roc_auc = auc(fpr, tpr)  # compute the AUC

plt.figure(figsize=(10, 10))
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.4f)' % roc_auc)  # FPR on the x-axis, TPR on the y-axis
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (parameter set 8)')
plt.legend(loc="lower right")
plt.show()
--------------------------------------------------------------------------------
/训练时的程序/训练时的集成学习.py:
--------------------------------------------------------------------------------
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Load the preprocessed training set; 1104.csv is the 33464 x 6749 table from
# preprocessing
df = pd.read_csv('G:\\ml360\\train\\1104.csv')
data = df.iloc[:, 4:6749]
flag = df['label']
train_x, test_x, train_y, test_y = train_test_split(data, flag, test_size=0.3, random_state=0)

dtest = xgb.DMatrix(test_x)

bst1 = xgb.Booster(model_file='G:/ml360/train/test/model1')
bst2 = xgb.Booster(model_file='G:/ml360/train/test/model2')
bst3 = xgb.Booster(model_file='G:/ml360/train/test/model3')
bst4 = xgb.Booster(model_file='G:/ml360/train/test/model4')
bst5 = xgb.Booster(model_file='G:/ml360/train/test/model5')
bst6 = xgb.Booster(model_file='G:/ml360/train/test/model6')
bst7 = xgb.Booster(model_file='G:/ml360/train/test/model7')
bst8 = xgb.Booster(model_file='G:/ml360/train/test/model8')

ypred1 = bst1.predict(dtest)
ypred2 = bst2.predict(dtest)
ypred3 = bst3.predict(dtest)
ypred4 = bst4.predict(dtest)
ypred5 = bst5.predict(dtest)
ypred6 = bst6.predict(dtest)
ypred7 = bst7.predict(dtest)
ypred8 = bst8.predict(dtest)

# Weighted blend of the eight base models
ypred = 0.296*ypred1 + 0.148*ypred2 + 0.148*ypred3 + 0.074*ypred4 + 0.148*ypred5 + 0.074*ypred6 + 0.074*ypred7 + 0.038*ypred8
y_pred = (ypred >= 0.5) * 1

from sklearn import metrics
print('Results for the ensemble:')
print('AUC: %.4f' % metrics.roc_auc_score(test_y, ypred))
print('ACC: %.4f' % metrics.accuracy_score(test_y, y_pred))
print('Recall: %.4f' % metrics.recall_score(test_y, y_pred))
print('F1-score: %.4f' % metrics.f1_score(test_y, y_pred))
print('Precision: %.4f' % metrics.precision_score(test_y, y_pred))
print(metrics.confusion_matrix(test_y, y_pred))

fpr, tpr, threshold = roc_curve(test_y, ypred)  # compute true/false positive rates
roc_auc = auc(fpr, tpr)  # compute the AUC

plt.figure(figsize=(10, 10))
lw = 2
plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.4f)' % roc_auc)  # FPR on the x-axis, TPR on the y-axis
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ensemble)')
plt.legend(loc="lower right")
plt.show()
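# Side-by-side check (illustrative): AUC of each base model against the blend
# on the same held-out split:
for name, pred in [('model1', ypred1), ('model2', ypred2), ('model3', ypred3),
                   ('model4', ypred4), ('model5', ypred5), ('model6', ypred6),
                   ('model7', ypred7), ('model8', ypred8), ('blend', ypred)]:
    print('%s AUC: %.4f' % (name, metrics.roc_auc_score(test_y, pred)))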