├── README.md
├── data_input.ipynb
├── model_bagging_lightgbm.ipynb
├── feature_select.ipynb
└── single_lightgbm_model.ipynb
/README.md:
--------------------------------------------------------------------------------
1 | # 拍拍贷"魔镜杯"风控算法比赛实战
2 |
3 | * 比赛项目链接
4 | [“魔镜杯”风控算法大赛](https://www.kesci.com/home/competition/56cd5f02b89b5bd026cb39c9/content/0)
5 |
6 | * 知乎文章链接地址
7 | [风控模型实战--"魔镜杯"风控算法大赛](https://zhuanlan.zhihu.com/p/56864235)
8 |
9 | * 代码目录说明
10 |
11 | 1. data_input: 数据导入及合并
12 | 2. data_EDA_clean: 数据清洗
13 | 3. feature_processing: 特征工程
14 | 4. feature_select: 特征筛选
15 | 5. single_lightgbm_model:单模型--lightgbm
16 | 6. model_bagging_lightgbm:lightgbm的bagging模型
17 |
18 |
--------------------------------------------------------------------------------
/data_input.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import numpy as np \n",
10 | "import pandas as pd \n",
11 | "import warnings\n",
12 | "warnings.filterwarnings('ignore')\n",
13 | "import os \n",
14 | "os.chdir('C:/Users/Administrator/Desktop/魔镜杯数据')"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 3,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "f_train1 = pd.read_csv('first_train1.csv',encoding='gbk')\n",
24 | "f_train2 = pd.read_csv('first_train2.csv',encoding='gbk')\n",
25 | "f_train3 = pd.read_csv('first_train3.csv',encoding='gbk')\n",
26 | "f_test1 = pd.read_csv('first_test1.csv',encoding='gb18030')\n",
27 | "f_test2 = pd.read_csv('first_test2.csv',encoding='gbk')\n",
28 | "f_test3 = pd.read_csv('first_test3.csv',encoding='gbk')"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 4,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "# 训练集和测试集合并\n",
38 | "f_train1['sample_status'] = 'train'\n",
39 | "f_test1['sample_status'] = 'test'\n",
40 | "df1 = pd.concat([f_train1,f_test1],axis=0).reset_index(drop=True)\n",
41 | "df2 = pd.concat([f_train2,f_test2],axis=0).reset_index(drop=True)\n",
42 | "df3 = pd.concat([f_train3,f_test3],axis=0).reset_index(drop=True)"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 6,
48 | "metadata": {
49 | "scrolled": true
50 | },
51 | "outputs": [
52 | {
53 | "data": {
54 | "text/html": [
55 | "
\n",
56 | "\n",
69 | "
\n",
70 | " \n",
71 | " \n",
72 | " | \n",
73 | " Idx | \n",
74 | " UserInfo_1 | \n",
75 | " UserInfo_2 | \n",
76 | " UserInfo_3 | \n",
77 | " UserInfo_4 | \n",
78 | " WeblogInfo_1 | \n",
79 | " WeblogInfo_2 | \n",
80 | " WeblogInfo_3 | \n",
81 | " WeblogInfo_4 | \n",
82 | " WeblogInfo_5 | \n",
83 | " ... | \n",
84 | " SocialNetwork_11 | \n",
85 | " SocialNetwork_12 | \n",
86 | " SocialNetwork_13 | \n",
87 | " SocialNetwork_14 | \n",
88 | " SocialNetwork_15 | \n",
89 | " SocialNetwork_16 | \n",
90 | " SocialNetwork_17 | \n",
91 | " target | \n",
92 | " ListingInfo | \n",
93 | " sample_status | \n",
94 | "
\n",
95 | " \n",
96 | " \n",
97 | " \n",
98 | " | 0 | \n",
99 | " 10001 | \n",
100 | " 1.000 | \n",
101 | " 深圳 | \n",
102 | " 4.000 | \n",
103 | " 深圳 | \n",
104 | " nan | \n",
105 | " 1.000 | \n",
106 | " nan | \n",
107 | " 1.000 | \n",
108 | " 1.000 | \n",
109 | " ... | \n",
110 | " -1 | \n",
111 | " 0 | \n",
112 | " 0 | \n",
113 | " 0 | \n",
114 | " 0 | \n",
115 | " 0 | \n",
116 | " 1 | \n",
117 | " 0 | \n",
118 | " 2014-3-5 | \n",
119 | " train | \n",
120 | "
\n",
121 | " \n",
122 | " | 1 | \n",
123 | " 10002 | \n",
124 | " 1.000 | \n",
125 | " 温州 | \n",
126 | " 4.000 | \n",
127 | " 温州 | \n",
128 | " nan | \n",
129 | " 0.000 | \n",
130 | " nan | \n",
131 | " 1.000 | \n",
132 | " 1.000 | \n",
133 | " ... | \n",
134 | " -1 | \n",
135 | " 0 | \n",
136 | " 0 | \n",
137 | " 0 | \n",
138 | " 0 | \n",
139 | " 0 | \n",
140 | " 2 | \n",
141 | " 0 | \n",
142 | " 2014-2-26 | \n",
143 | " train | \n",
144 | "
\n",
145 | " \n",
146 | " | 2 | \n",
147 | " 10003 | \n",
148 | " 1.000 | \n",
149 | " 宜昌 | \n",
150 | " 3.000 | \n",
151 | " 宜昌 | \n",
152 | " nan | \n",
153 | " 0.000 | \n",
154 | " nan | \n",
155 | " 2.000 | \n",
156 | " 2.000 | \n",
157 | " ... | \n",
158 | " -1 | \n",
159 | " -1 | \n",
160 | " 1 | \n",
161 | " 0 | \n",
162 | " 0 | \n",
163 | " 0 | \n",
164 | " 0 | \n",
165 | " 0 | \n",
166 | " 2014-2-28 | \n",
167 | " train | \n",
168 | "
\n",
169 | " \n",
170 | " | 3 | \n",
171 | " 10006 | \n",
172 | " 4.000 | \n",
173 | " 南平 | \n",
174 | " 1.000 | \n",
175 | " 南平 | \n",
176 | " nan | \n",
177 | " nan | \n",
178 | " nan | \n",
179 | " nan | \n",
180 | " nan | \n",
181 | " ... | \n",
182 | " -1 | \n",
183 | " -1 | \n",
184 | " 0 | \n",
185 | " 0 | \n",
186 | " 0 | \n",
187 | " 0 | \n",
188 | " 0 | \n",
189 | " 0 | \n",
190 | " 2014-2-25 | \n",
191 | " train | \n",
192 | "
\n",
193 | " \n",
194 | " | 4 | \n",
195 | " 10007 | \n",
196 | " 5.000 | \n",
197 | " 辽阳 | \n",
198 | " 1.000 | \n",
199 | " 辽阳 | \n",
200 | " nan | \n",
201 | " 0.000 | \n",
202 | " nan | \n",
203 | " 1.000 | \n",
204 | " 1.000 | \n",
205 | " ... | \n",
206 | " -1 | \n",
207 | " -1 | \n",
208 | " 0 | \n",
209 | " 0 | \n",
210 | " 0 | \n",
211 | " 0 | \n",
212 | " 0 | \n",
213 | " 0 | \n",
214 | " 2014-2-27 | \n",
215 | " train | \n",
216 | "
\n",
217 | " \n",
218 | "
\n",
219 | "
5 rows × 229 columns
\n",
220 | "
"
221 | ],
222 | "text/plain": [
223 | " Idx UserInfo_1 UserInfo_2 UserInfo_3 UserInfo_4 WeblogInfo_1 \\\n",
224 | "0 10001 1.000 深圳 4.000 深圳 nan \n",
225 | "1 10002 1.000 温州 4.000 温州 nan \n",
226 | "2 10003 1.000 宜昌 3.000 宜昌 nan \n",
227 | "3 10006 4.000 南平 1.000 南平 nan \n",
228 | "4 10007 5.000 辽阳 1.000 辽阳 nan \n",
229 | "\n",
230 | " WeblogInfo_2 WeblogInfo_3 WeblogInfo_4 WeblogInfo_5 ... \\\n",
231 | "0 1.000 nan 1.000 1.000 ... \n",
232 | "1 0.000 nan 1.000 1.000 ... \n",
233 | "2 0.000 nan 2.000 2.000 ... \n",
234 | "3 nan nan nan nan ... \n",
235 | "4 0.000 nan 1.000 1.000 ... \n",
236 | "\n",
237 | " SocialNetwork_11 SocialNetwork_12 SocialNetwork_13 SocialNetwork_14 \\\n",
238 | "0 -1 0 0 0 \n",
239 | "1 -1 0 0 0 \n",
240 | "2 -1 -1 1 0 \n",
241 | "3 -1 -1 0 0 \n",
242 | "4 -1 -1 0 0 \n",
243 | "\n",
244 | " SocialNetwork_15 SocialNetwork_16 SocialNetwork_17 target ListingInfo \\\n",
245 | "0 0 0 1 0 2014-3-5 \n",
246 | "1 0 0 2 0 2014-2-26 \n",
247 | "2 0 0 0 0 2014-2-28 \n",
248 | "3 0 0 0 0 2014-2-25 \n",
249 | "4 0 0 0 0 2014-2-27 \n",
250 | "\n",
251 | " sample_status \n",
252 | "0 train \n",
253 | "1 train \n",
254 | "2 train \n",
255 | "3 train \n",
256 | "4 train \n",
257 | "\n",
258 | "[5 rows x 229 columns]"
259 | ]
260 | },
261 | "execution_count": 6,
262 | "metadata": {},
263 | "output_type": "execute_result"
264 | }
265 | ],
266 | "source": [
267 | "df1.head()"
268 | ]
269 | },
270 | {
271 | "cell_type": "code",
272 | "execution_count": 7,
273 | "metadata": {},
274 | "outputs": [],
275 | "source": [
276 | "# 保存数据至本地\n",
277 | "df1.to_csv('C:/Users/Administrator/Desktop/魔镜杯数据/data_input1.csv',encoding='gb18030',index=False)\n",
278 | "df2.to_csv('C:/Users/Administrator/Desktop/魔镜杯数据/data_input2.csv',encoding='gb18030',index=False)\n",
279 | "df3.to_csv('C:/Users/Administrator/Desktop/魔镜杯数据/data_input3.csv',encoding='gb18030',index=False)"
280 | ]
281 | }
282 | ],
283 | "metadata": {
284 | "kernelspec": {
285 | "display_name": "Python 3",
286 | "language": "python",
287 | "name": "python3"
288 | },
289 | "language_info": {
290 | "codemirror_mode": {
291 | "name": "ipython",
292 | "version": 3
293 | },
294 | "file_extension": ".py",
295 | "mimetype": "text/x-python",
296 | "name": "python",
297 | "nbconvert_exporter": "python",
298 | "pygments_lexer": "ipython3",
299 | "version": "3.7.0"
300 | },
301 | "toc": {
302 | "base_numbering": 1,
303 | "nav_menu": {},
304 | "number_sections": true,
305 | "sideBar": true,
306 | "skip_h1_title": false,
307 | "title_cell": "Table of Contents",
308 | "title_sidebar": "Contents",
309 | "toc_cell": false,
310 | "toc_position": {},
311 | "toc_section_display": true,
312 | "toc_window_display": false
313 | }
314 | },
315 | "nbformat": 4,
316 | "nbformat_minor": 2
317 | }
318 |
--------------------------------------------------------------------------------
/model_bagging_lightgbm.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "ExecuteTime": {
8 | "end_time": "2019-02-17T03:18:36.904108Z",
9 | "start_time": "2019-02-17T03:18:33.466960Z"
10 | },
11 | "scrolled": true
12 | },
13 | "outputs": [
14 | {
15 | "name": "stderr",
16 | "output_type": "stream",
17 | "text": [
18 | "C:\\Users\\Administrator\\Anaconda3\\envs\\py3\\lib\\site-packages\\sklearn\\utils\\__init__.py:4: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working\n",
19 | " from collections import Sequence\n"
20 | ]
21 | }
22 | ],
23 | "source": [
24 | "import numpy as np \n",
25 | "import pandas as pd\n",
26 | "import lightgbm as lgb\n",
27 | "import random\n",
28 | "from sklearn import metrics\n",
29 | "import warnings\n",
30 | "warnings.filterwarnings('ignore')\n",
31 | "import os \n",
32 | "os.chdir('C:/Users/Administrator/Desktop/魔镜杯数据')\n",
33 | "import score_card as sc"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 2,
39 | "metadata": {
40 | "ExecuteTime": {
41 | "end_time": "2019-02-17T03:18:42.561213Z",
42 | "start_time": "2019-02-17T03:18:36.904108Z"
43 | },
44 | "scrolled": true
45 | },
46 | "outputs": [
47 | {
48 | "data": {
49 | "text/plain": [
50 | "(49701, 161)"
51 | ]
52 | },
53 | "execution_count": 2,
54 | "metadata": {},
55 | "output_type": "execute_result"
56 | }
57 | ],
58 | "source": [
59 | "# Master数据\n",
60 | "df1 = pd.read_csv('feature_select_data1.csv',encoding='gb18030')\n",
61 | "df1.shape"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 3,
67 | "metadata": {
68 | "ExecuteTime": {
69 | "end_time": "2019-02-17T03:18:54.458570Z",
70 | "start_time": "2019-02-17T03:18:42.566214Z"
71 | }
72 | },
73 | "outputs": [
74 | {
75 | "data": {
76 | "text/plain": [
77 | "(49701, 124)"
78 | ]
79 | },
80 | "execution_count": 3,
81 | "metadata": {},
82 | "output_type": "execute_result"
83 | }
84 | ],
85 | "source": [
86 | "# 排序特征数据\n",
87 | "rank_df = pd.read_csv('rank_feature.csv',encoding='gbk')\n",
88 | "rank_df.shape"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": 4,
94 | "metadata": {
95 | "ExecuteTime": {
96 | "end_time": "2019-02-17T03:18:57.831734Z",
97 | "start_time": "2019-02-17T03:18:54.461570Z"
98 | }
99 | },
100 | "outputs": [
101 | {
102 | "data": {
103 | "text/plain": [
104 | "(49701, 51)"
105 | ]
106 | },
107 | "execution_count": 4,
108 | "metadata": {},
109 | "output_type": "execute_result"
110 | }
111 | ],
112 | "source": [
113 | "# periods衍生特征数据\n",
114 | "periods_df = pd.read_csv('periods_feature.csv',encoding='gbk')\n",
115 | "periods_df.shape"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": 5,
121 | "metadata": {
122 | "ExecuteTime": {
123 | "end_time": "2019-02-17T03:18:57.837734Z",
124 | "start_time": "2019-02-17T03:18:57.833734Z"
125 | }
126 | },
127 | "outputs": [],
128 | "source": [
129 | "# 原生特征(不含排序特征和periods衍生特征)\n",
130 | "feature1 = list(df1.columns)\n",
131 | "# 排序特征和periods衍生特征\n",
132 | "feature2 = list(rank_df.columns)+list(periods_df.columns)"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": 6,
138 | "metadata": {
139 | "ExecuteTime": {
140 | "end_time": "2019-02-17T03:18:57.963735Z",
141 | "start_time": "2019-02-17T03:18:57.841734Z"
142 | }
143 | },
144 | "outputs": [],
145 | "source": [
146 | "# 对feature2进行随机打乱顺序\n",
147 | "random.shuffle(feature2)"
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": 7,
153 | "metadata": {
154 | "ExecuteTime": {
155 | "end_time": "2019-02-17T03:18:58.129739Z",
156 | "start_time": "2019-02-17T03:18:57.964736Z"
157 | }
158 | },
159 | "outputs": [
160 | {
161 | "data": {
162 | "text/plain": [
163 | "(49701, 336)"
164 | ]
165 | },
166 | "execution_count": 7,
167 | "metadata": {},
168 | "output_type": "execute_result"
169 | }
170 | ],
171 | "source": [
172 | "# 合并数据集\n",
173 | "df = pd.concat([df1,rank_df,periods_df],axis=1)\n",
174 | "df.shape"
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": 8,
180 | "metadata": {
181 | "ExecuteTime": {
182 | "end_time": "2019-02-17T03:18:58.272747Z",
183 | "start_time": "2019-02-17T03:18:58.133739Z"
184 | }
185 | },
186 | "outputs": [],
187 | "source": [
188 | "# 保存用户id\n",
189 | "data_idx = df.Idx"
190 | ]
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": 9,
195 | "metadata": {
196 | "ExecuteTime": {
197 | "end_time": "2019-02-17T03:18:58.440757Z",
198 | "start_time": "2019-02-17T03:18:58.274747Z"
199 | }
200 | },
201 | "outputs": [],
202 | "source": [
203 | "# 定义lightgbm的bagging函数\n",
204 | "def bagging_lightgbm(feature_fraction,bagging_fraction,ramdom_seed,n_feature):\n",
205 | " \n",
206 | " select_fea = feature1+feature2[:n_feature]\n",
207 | " \n",
208 | " data = df.loc[:,select_fea]\n",
209 | " train_x = data[data.sample_status=='train'].drop(['sample_status','target','Idx'],axis=1)\n",
210 | " train_y = data[data.sample_status=='train']['target']\n",
211 | " test_x = data[data.sample_status=='test'].drop(['sample_status','target','Idx'],axis=1)\n",
212 | " test_y = data[data.sample_status=='test']['target']\n",
213 | " \n",
214 | " test_user_id = list(data[data.sample_status=='test']['Idx'])\n",
215 | " \n",
216 | " \n",
217 | " dtrain = lgb.Dataset(train_x,train_y)\n",
218 | " dtest = lgb.Dataset(test_x,test_y)\n",
219 | " \n",
220 | " params={\n",
221 | " 'boosting_type':'gbdt',\n",
222 | " 'metric':'auc',\n",
223 | " 'num_leaves':30,\n",
224 | " 'min_data_in_leaf':20,\n",
225 | " 'min_sum_hessian_in_leaf':0.001,\n",
226 | " 'bagging_fraction':bagging_fraction,\n",
227 | " 'feature_fraction':feature_fraction,\n",
228 | " 'learning_rate':0.005,\n",
229 | " }\n",
230 | " \n",
231 | " # 寻找最佳的迭代次数\n",
232 | " cv_result = lgb.cv(train_set=dtrain,\n",
233 | " early_stopping_rounds=10,\n",
234 | " num_boost_round=1000,\n",
235 | " nfold=5,\n",
236 | " metrics='auc',\n",
237 | " seed=0,\n",
238 | " params=params,\n",
239 | " stratified=True,\n",
240 | " shuffle=True)\n",
241 | " max_auc = max(cv_result['auc-mean'])\n",
242 | " num_round = len(cv_result['auc-mean'])\n",
243 | " \n",
244 | " model = lgb.train(train_set=dtrain,early_stopping_rounds=10,num_boost_round=num_round,valid_sets=dtest,params=params)\n",
245 | " \n",
246 | " model_pre = list(model.predict(test_x))\n",
247 | " result_df = pd.DataFrame({'Idx':test_user_id,\n",
248 | " 'score':model_pre})\n",
249 | " return result_df\n",
250 | "\n",
251 | "# 对随机种子,bagging_fraction,feature_fraction及特征数量进行随机扰动\n",
252 | "random_seed = list(range(2018))\n",
253 | "bagging_fraction = [i/1000.0 for i in range(500,1000)]\n",
254 | "feature_fraction = [i/1000.0 for i in range(500,1000)]\n",
255 | "n_feature = list(range(50,174,2))\n",
256 | "\n",
257 | "random.shuffle(random_seed)\n",
258 | "random.shuffle(bagging_fraction)\n",
259 | "random.shuffle(feature_fraction)\n",
260 | "random.shuffle(n_feature)"
261 | ]
262 | },
263 | {
264 | "cell_type": "code",
265 | "execution_count": null,
266 | "metadata": {
267 | "ExecuteTime": {
268 | "end_time": "2019-02-17T03:18:59.134796Z",
269 | "start_time": "2019-02-17T03:18:33.508Z"
270 | }
271 | },
272 | "outputs": [],
273 | "source": [
274 | "import time \n",
275 | "a= time.time()\n",
276 | "result_df_list=[]\n",
277 | "# 建立30个子模型,保存各个子模型输出的结果\n",
278 | "for i in range(30):\n",
279 | " result_df = bagging_lightgbm(feature_fraction=feature_fraction[i],\n",
280 | " n_feature=n_feature[i],\n",
281 | " ramdom_seed=random_seed[i],\n",
282 | " bagging_fraction=bagging_fraction[i])\n",
283 | " result_df_list.append(result_df)\n",
284 | "# 对30个子模型的结果average,得到bagging模型的最终结果\n",
285 | "prep_list = [list(x['prep']) for x in result_df_list]\n",
286 | "bagging_prep= list(np.sum(score_list,axis=0)/30)\n",
287 | "b = time.time()\n",
288 | "print('运行时间:{}'.format(round(b-a,0)))"
289 | ]
290 | },
291 | {
292 | "cell_type": "code",
293 | "execution_count": 103,
294 | "metadata": {
295 | "ExecuteTime": {
296 | "end_time": "2019-02-16T14:41:12.488155Z",
297 | "start_time": "2019-02-16T14:41:12.313145Z"
298 | }
299 | },
300 | "outputs": [
301 | {
302 | "data": {
303 | "image/png": "\n",
304 | "text/plain": [
305 | ""
306 | ]
307 | },
308 | "metadata": {
309 | "needs_background": "light"
310 | },
311 | "output_type": "display_data"
312 | }
313 | ],
314 | "source": [
315 | "# bagging模型的AUC\n",
316 | "test_y = list(df[df.sample_status=='test']['target'])\n",
317 | "sc.plot_roc(y_label=test_y,y_pred=ss)"
318 | ]
319 | }
320 | ],
321 | "metadata": {
322 | "kernelspec": {
323 | "display_name": "Python 3",
324 | "language": "python",
325 | "name": "python3"
326 | },
327 | "language_info": {
328 | "codemirror_mode": {
329 | "name": "ipython",
330 | "version": 3
331 | },
332 | "file_extension": ".py",
333 | "mimetype": "text/x-python",
334 | "name": "python",
335 | "nbconvert_exporter": "python",
336 | "pygments_lexer": "ipython3",
337 | "version": "3.7.0"
338 | },
339 | "toc": {
340 | "base_numbering": 1,
341 | "nav_menu": {},
342 | "number_sections": true,
343 | "sideBar": true,
344 | "skip_h1_title": false,
345 | "title_cell": "Table of Contents",
346 | "title_sidebar": "Contents",
347 | "toc_cell": false,
348 | "toc_position": {},
349 | "toc_section_display": true,
350 | "toc_window_display": false
351 | }
352 | },
353 | "nbformat": 4,
354 | "nbformat_minor": 2
355 | }
356 |
--------------------------------------------------------------------------------
/feature_select.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stderr",
10 | "output_type": "stream",
11 | "text": [
12 | "C:\\Users\\Administrator\\Anaconda3\\envs\\py3\\lib\\site-packages\\sklearn\\utils\\__init__.py:4: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working\n",
13 | " from collections import Sequence\n"
14 | ]
15 | }
16 | ],
17 | "source": [
18 | "import numpy as np \n",
19 | "import math \n",
20 | "import pandas as pd \n",
21 | "pd.set_option('display.float_format',lambda x:'%.3f' % x)\n",
22 | "import matplotlib.pyplot as plt \n",
23 | "plt.style.use('ggplot')\n",
24 | "%matplotlib inline\n",
25 | "import seaborn as sns \n",
26 | "sns.set_palette('muted')\n",
27 | "sns.set_style('darkgrid')\n",
28 | "import warnings\n",
29 | "warnings.filterwarnings('ignore')\n",
30 | "import os \n",
31 | "os.chdir('C:/Users/Administrator/Desktop/魔镜杯数据')\n",
32 | "import lightgbm as lgb \n",
33 | "from lightgbm import plot_importance"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 2,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "# 导入feature_processing处理过后的数据\n",
43 | "data = pd.read_csv('data1_process.csv',encoding='gb18030')\n",
44 | "periods_df = pd.read_csv('periods_feature.csv',encoding='gbk')\n",
45 | "rank_df = pd.read_csv('rank_feature.csv',encoding='gbk')\n",
46 | "update_info = pd.read_csv('update_feature.csv',encoding='gbk')\n",
47 | "log_df = pd.read_csv('log_info_feature.csv',encoding='gbk')"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": 3,
53 | "metadata": {},
54 | "outputs": [
55 | {
56 | "data": {
57 | "text/plain": [
58 | "(49701, 237)"
59 | ]
60 | },
61 | "execution_count": 3,
62 | "metadata": {},
63 | "output_type": "execute_result"
64 | }
65 | ],
66 | "source": [
67 | "# 合并衍生后的变量,data1不包含排序特征和periods衍生特征\n",
68 | "data1 = pd.merge(data,update_info,on='Idx',how='left')\n",
69 | "data1 = pd.merge(data1,log_df,on='Idx',how='left')\n",
70 | "data1.shape"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": 4,
76 | "metadata": {
77 | "scrolled": true
78 | },
79 | "outputs": [
80 | {
81 | "data": {
82 | "text/plain": [
83 | "(49701, 412)"
84 | ]
85 | },
86 | "execution_count": 4,
87 | "metadata": {},
88 | "output_type": "execute_result"
89 | }
90 | ],
91 | "source": [
92 | "# data2包含排序特征和periods衍生特征\n",
93 | "data2 = pd.concat([data1,rank_df,periods_df],axis=1)\n",
94 | "data2.shape"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": null,
100 | "metadata": {},
101 | "outputs": [],
102 | "source": [
103 | "data_idx = data.Idx\n",
104 | "df1 =data1.drop(['Idx'],axis=1)# 删除Idx\n",
105 | "# 测试集训练集的划分\n",
106 | "train_fea = np.array(df1[df1.sample_status=='train'].drop(['sample_status','target'],axis=1))\n",
107 | "test_fea = np.array(df1[df1.sample_status=='test'].drop(['sample_status','target'],axis=1))\n",
108 | "train_label = np.array(df1[df1.sample_status=='train']['target']).reshape(-1,1)\n",
109 | "test_label = np.array(df1[df1.sample_status=='test']['target']).reshape(-1,1)\n",
110 | "\n",
111 | "\n",
112 | "fea_names = list(df1.drop(['sample_status','target'],axis=1).columns)# 特征名字存成列表\n",
113 | "feature_importance_values = np.zeros(len(fea_names)) # \n",
114 | "\n",
115 | "# 训练10个lightgbm,并对10个模型输出的feature_importances_取平均\n",
116 | "for _ in range(10):\n",
117 | " model = lgb.LGBMClassifier(n_estimators=1000,learning_rate=0.05,n_jobs=-1,verbose = -1)\n",
118 | " model.fit(train_fea,train_label,eval_metric='auc',\n",
119 | " eval_set = [(test_fea, test_label)],\n",
120 | " early_stopping_rounds=100,verbose = -1)\n",
121 | " feature_importance_values += model.feature_importances_/10\n",
122 | "\n",
123 | "# 将feature_importance_values存成临时表\n",
124 | "fea_imp_df1 = pd.DataFrame({'feature':fea_names,\n",
125 | " 'fea_importance':feature_importance_values})\n",
126 | "fea_imp_df1 = fea_imp_df1.sort_values('fea_importance',ascending=False).reset_index(drop=True)\n",
127 | "fea_imp_df1['norm_importance'] = fea_imp_df1['fea_importance']/fea_imp_df1['fea_importance'].sum() # 特征重要性value的归一化\n",
128 | "fea_imp_df1['cum_importance'] = np.cumsum(fea_imp_df1['norm_importance'])# 特征重要性value的累加值\n",
129 | "fea_imp_df1.head()"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": 6,
135 | "metadata": {},
136 | "outputs": [
137 | {
138 | "data": {
139 | "image/png": "\n",
140 | "text/plain": [
141 | ""
142 | ]
143 | },
144 | "metadata": {
145 | "needs_background": "light"
146 | },
147 | "output_type": "display_data"
148 | }
149 | ],
150 | "source": [
151 | "# 特征重要性可视化\n",
152 | "plt.figure(figsize=(16,5))\n",
153 | "plt.rcParams['font.sans-serif']=['Microsoft YaHei']\n",
154 | "plt.subplot(1,2,1)\n",
155 | "plt.title('特征重要性')\n",
156 | "sns.barplot(data=fea_imp_df1.iloc[:10,:],x='norm_importance',y='feature')\n",
157 | "plt.subplot(1,2,2)\n",
158 | "plt.title('特征重要性累加图')\n",
159 | "plt.xlabel('特征个数')\n",
160 | "plt.ylabel('cum_importance')\n",
161 | "plt.plot(list(range(1, len(fea_names)+1)),fea_imp_df1['cum_importance'], 'r-')\n",
162 | "plt.show()"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": 7,
168 | "metadata": {},
169 | "outputs": [
170 | {
171 | "name": "stdout",
172 | "output_type": "stream",
173 | "text": [
174 | "特征重要性为0的变量个数为 :50\n",
175 | "['_userid', '_provinceid', 'SocialNetwork_17', '_nickname', '_orderid', '_otherwebshoptype', '_phonetype', 'is_sichuan_userinfo19', '_ppdaiaccount', 'china_telecom', 'is_jilin_userinfo7', '_relationshipid', '_workyears', '_webshopurl', '_residencetypeid', '_webshoptypeid', '_schoolname', '_secondemail', '_secondmobile', '_residenceyears', 'WeblogInfo_19_H', 'WeblogInfo_19_J', '_department', 'WeblogInfo_21_B', 'WeblogInfo_19_G', 'WeblogInfo_19_F', '_age', '_bussinessaddress', '_byuserid', 'WeblogInfo_19_E', 'is_weifang_UserInfo20', '_companysizeid', '_companytypeid', '_contactid', '_creationdate', 'WeblogInfo_27', '_idnumber', '_dormitoryphone', 'is_zibo_UserInfo8', '_flag_uctobcp', '_flag_uctopvr', '_gender', '_graduatedate', '_graduateschool', '_hasbusinesslicense', '_hasbuycar', '_hasppdaiaccount', '_hassborgjj', '_idaddress', 'is_weifang_UserInfo4']\n"
176 | ]
177 | }
178 | ],
179 | "source": [
180 | "# 剔除特征重要性为0的变量\n",
181 | "zero_imp_col = list(fea_imp_df1[fea_imp_df1.fea_importance==0].feature)\n",
182 | "fea_imp_df11 = fea_imp_df1[~(fea_imp_df1.feature.isin(zero_imp_col))]\n",
183 | "print('特征重要性为0的变量个数为 :{}'.format(len(zero_imp_col)))\n",
184 | "print(zero_imp_col)"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": 8,
190 | "metadata": {},
191 | "outputs": [
192 | {
193 | "name": "stdout",
194 | "output_type": "stream",
195 | "text": [
196 | "特征重要性比较弱的变量个数为:26\n",
197 | "['UserInfo_10', 'UserInfo_13', 'WeblogInfo_33', 'is_chengdu_UserInfo2', '_educationid', '_lastupdatedate', '_companyname', '_cityid', 'WeblogInfo_36', 'is_chengdu_UserInfo20', 'is_yantai_UserInfo2', 'is_tianjin_userinfo7', 'china_unicom', 'WeblogInfo_21_D', 'is_chengdu_UserInfo4', '_phone', '_position', '_regstepid', '_residenceaddress', '_residencephone', 'is_hunan_userinfo7', 'operator_unknown', 'WeblogInfo_21_A', 'WeblogInfo_21_C', 'is_sichuan_userinfo7', '_companyaddress']\n"
198 | ]
199 | }
200 | ],
201 | "source": [
202 | "# 剔除特征重要性比较弱的变量\n",
203 | "low_imp_col = list(fea_imp_df11[fea_imp_df11.cum_importance>=0.99].feature)\n",
204 | "print('特征重要性比较弱的变量个数为:{}'.format(len(low_imp_col)))\n",
205 | "print(low_imp_col)"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": 9,
211 | "metadata": {},
212 | "outputs": [
213 | {
214 | "data": {
215 | "text/plain": [
216 | "(49701, 160)"
217 | ]
218 | },
219 | "execution_count": 9,
220 | "metadata": {},
221 | "output_type": "execute_result"
222 | }
223 | ],
224 | "source": [
225 | "# 删除特征重要性为0和比较弱的特征\n",
226 | "drop_imp_col = zero_imp_col+low_imp_col\n",
227 | "mydf1 = df1.drop(drop_imp_col,axis=1)\n",
228 | "mydf1.shape"
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "execution_count": 10,
234 | "metadata": {},
235 | "outputs": [],
236 | "source": [
237 | "# 加上训练集测试集状态,保存数据\n",
238 | "sample_status = list(df1.sample_status)\n",
239 | "mydf1['sample_status'] = sample_status\n",
240 | "mydf1['Idx'] = data_idx\n",
241 | "mydf1.to_csv('../魔镜杯数据/feature_select_data1.csv',encoding='gb18030',index=False)"
242 | ]
243 | }
244 | ],
245 | "metadata": {
246 | "kernelspec": {
247 | "display_name": "Python 3",
248 | "language": "python",
249 | "name": "python3"
250 | },
251 | "language_info": {
252 | "codemirror_mode": {
253 | "name": "ipython",
254 | "version": 3
255 | },
256 | "file_extension": ".py",
257 | "mimetype": "text/x-python",
258 | "name": "python",
259 | "nbconvert_exporter": "python",
260 | "pygments_lexer": "ipython3",
261 | "version": "3.7.0"
262 | },
263 | "toc": {
264 | "base_numbering": 1,
265 | "nav_menu": {},
266 | "number_sections": true,
267 | "sideBar": true,
268 | "skip_h1_title": false,
269 | "title_cell": "Table of Contents",
270 | "title_sidebar": "Contents",
271 | "toc_cell": false,
272 | "toc_position": {},
273 | "toc_section_display": true,
274 | "toc_window_display": false
275 | }
276 | },
277 | "nbformat": 4,
278 | "nbformat_minor": 2
279 | }
280 |
--------------------------------------------------------------------------------
/single_lightgbm_model.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "ExecuteTime": {
8 | "end_time": "2019-02-17T02:57:42.938856Z",
9 | "start_time": "2019-02-17T02:57:18.450164Z"
10 | }
11 | },
12 | "outputs": [
13 | {
14 | "name": "stderr",
15 | "output_type": "stream",
16 | "text": [
17 | "C:\\Users\\Administrator\\Anaconda3\\envs\\py3\\lib\\site-packages\\sklearn\\utils\\__init__.py:4: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working\n",
18 | " from collections import Sequence\n",
19 | "C:\\Users\\Administrator\\Anaconda3\\envs\\py3\\lib\\site-packages\\sklearn\\ensemble\\weight_boosting.py:29: DeprecationWarning: numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release.\n",
20 | " from numpy.core.umath_tests import inner1d\n"
21 | ]
22 | }
23 | ],
24 | "source": [
25 | "import numpy as np \n",
26 | "import pandas as pd \n",
27 | "import matplotlib.pyplot as plt \n",
28 | "%matplotlib inline \n",
29 | "plt.style.use('ggplot')\n",
30 | "import seaborn as sns \n",
31 | "import os \n",
32 | "os.chdir('C:/Users/Administrator/Desktop/魔镜杯数据')\n",
33 | "import warnings \n",
34 | "warnings.filterwarnings('ignore')\n",
35 | "\n",
36 | "import lightgbm as lgb \n",
37 | "from lightgbm import plot_importance \n",
38 | "from sklearn.model_selection import GridSearchCV\n",
39 | "from sklearn.model_selection import train_test_split \n",
40 | "from sklearn import metrics\n",
41 | "from sklearn.model_selection import cross_val_score\n",
42 | "from sklearn.model_selection import StratifiedKFold\n",
43 | "\n",
44 | "import score_card as sc"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 2,
50 | "metadata": {
51 | "ExecuteTime": {
52 | "end_time": "2019-02-17T02:57:51.284077Z",
53 | "start_time": "2019-02-17T02:57:42.938856Z"
54 | }
55 | },
56 | "outputs": [
57 | {
58 | "data": {
59 | "text/html": [
60 | "\n",
61 | "\n",
74 | "
\n",
75 | " \n",
76 | " \n",
77 | " | \n",
78 | " UserInfo_1 | \n",
79 | " UserInfo_3 | \n",
80 | " WeblogInfo_2 | \n",
81 | " WeblogInfo_4 | \n",
82 | " WeblogInfo_5 | \n",
83 | " WeblogInfo_6 | \n",
84 | " WeblogInfo_7 | \n",
85 | " WeblogInfo_8 | \n",
86 | " WeblogInfo_15 | \n",
87 | " WeblogInfo_16 | \n",
88 | " ... | \n",
89 | " _mobilephone | \n",
90 | " _qq | \n",
91 | " _realname | \n",
92 | " _turnover | \n",
93 | " update_time_cnt | \n",
94 | " update_all_cnt | \n",
95 | " log_cnt | \n",
96 | " log_timespan | \n",
97 | " avg_log_timespan | \n",
98 | " Idx | \n",
99 | "
\n",
100 | " \n",
101 | " \n",
102 | " \n",
103 | " | 0 | \n",
104 | " 1.000 | \n",
105 | " 4.000 | \n",
106 | " 1.000 | \n",
107 | " 1.000 | \n",
108 | " 1.000 | \n",
109 | " 1.000 | \n",
110 | " 14.000 | \n",
111 | " 0.000 | \n",
112 | " 6.000 | \n",
113 | " 0.000 | \n",
114 | " ... | \n",
115 | " 1.000 | \n",
116 | " 1.000 | \n",
117 | " 0.000 | \n",
118 | " 0.000 | \n",
119 | " 1.000 | \n",
120 | " 11.000 | \n",
121 | " 19.000 | \n",
122 | " 1.000 | \n",
123 | " 0.632 | \n",
124 | " 10001.000 | \n",
125 | "
\n",
126 | " \n",
127 | " | 1 | \n",
128 | " 1.000 | \n",
129 | " 4.000 | \n",
130 | " 0.000 | \n",
131 | " 1.000 | \n",
132 | " 1.000 | \n",
133 | " 1.000 | \n",
134 | " 14.000 | \n",
135 | " 0.000 | \n",
136 | " 0.000 | \n",
137 | " 7.000 | \n",
138 | " ... | \n",
139 | " 2.000 | \n",
140 | " 1.000 | \n",
141 | " 1.000 | \n",
142 | " 0.000 | \n",
143 | " 3.000 | \n",
144 | " 21.000 | \n",
145 | " 24.000 | \n",
146 | " 1.000 | \n",
147 | " 10.375 | \n",
148 | " 10002.000 | \n",
149 | "
\n",
150 | " \n",
151 | " | 2 | \n",
152 | " 1.000 | \n",
153 | " 3.000 | \n",
154 | " 0.000 | \n",
155 | " 2.000 | \n",
156 | " 2.000 | \n",
157 | " 2.000 | \n",
158 | " 9.000 | \n",
159 | " 3.000 | \n",
160 | " 0.000 | \n",
161 | " 3.000 | \n",
162 | " ... | \n",
163 | " 1.000 | \n",
164 | " 1.000 | \n",
165 | " 0.000 | \n",
166 | " 0.000 | \n",
167 | " 1.000 | \n",
168 | " 10.000 | \n",
169 | " 14.000 | \n",
170 | " 1.000 | \n",
171 | " 0.500 | \n",
172 | " 10003.000 | \n",
173 | "
\n",
174 | " \n",
175 | " | 3 | \n",
176 | " 4.000 | \n",
177 | " 1.000 | \n",
178 | " nan | \n",
179 | " nan | \n",
180 | " nan | \n",
181 | " nan | \n",
182 | " 2.000 | \n",
183 | " 0.000 | \n",
184 | " 0.000 | \n",
185 | " 0.000 | \n",
186 | " ... | \n",
187 | " 1.000 | \n",
188 | " 1.000 | \n",
189 | " 0.000 | \n",
190 | " 0.000 | \n",
191 | " 1.000 | \n",
192 | " 10.000 | \n",
193 | " 7.000 | \n",
194 | " 5.000 | \n",
195 | " 0.000 | \n",
196 | " 10006.000 | \n",
197 | "
\n",
198 | " \n",
199 | " | 4 | \n",
200 | " 5.000 | \n",
201 | " 1.000 | \n",
202 | " 0.000 | \n",
203 | " 1.000 | \n",
204 | " 1.000 | \n",
205 | " 1.000 | \n",
206 | " 3.000 | \n",
207 | " 0.000 | \n",
208 | " 0.000 | \n",
209 | " 0.000 | \n",
210 | " ... | \n",
211 | " 1.000 | \n",
212 | " 1.000 | \n",
213 | " 0.000 | \n",
214 | " 0.000 | \n",
215 | " 2.000 | \n",
216 | " 10.000 | \n",
217 | " 5.000 | \n",
218 | " 0.000 | \n",
219 | " 1.400 | \n",
220 | " 10007.000 | \n",
221 | "
\n",
222 | " \n",
223 | "
\n",
224 | "
5 rows × 161 columns
\n",
225 | "
"
226 | ],
227 | "text/plain": [
228 | " UserInfo_1 UserInfo_3 WeblogInfo_2 WeblogInfo_4 WeblogInfo_5 \\\n",
229 | "0 1.000 4.000 1.000 1.000 1.000 \n",
230 | "1 1.000 4.000 0.000 1.000 1.000 \n",
231 | "2 1.000 3.000 0.000 2.000 2.000 \n",
232 | "3 4.000 1.000 nan nan nan \n",
233 | "4 5.000 1.000 0.000 1.000 1.000 \n",
234 | "\n",
235 | " WeblogInfo_6 WeblogInfo_7 WeblogInfo_8 WeblogInfo_15 WeblogInfo_16 \\\n",
236 | "0 1.000 14.000 0.000 6.000 0.000 \n",
237 | "1 1.000 14.000 0.000 0.000 7.000 \n",
238 | "2 2.000 9.000 3.000 0.000 3.000 \n",
239 | "3 nan 2.000 0.000 0.000 0.000 \n",
240 | "4 1.000 3.000 0.000 0.000 0.000 \n",
241 | "\n",
242 | " ... _mobilephone _qq _realname _turnover update_time_cnt \\\n",
243 | "0 ... 1.000 1.000 0.000 0.000 1.000 \n",
244 | "1 ... 2.000 1.000 1.000 0.000 3.000 \n",
245 | "2 ... 1.000 1.000 0.000 0.000 1.000 \n",
246 | "3 ... 1.000 1.000 0.000 0.000 1.000 \n",
247 | "4 ... 1.000 1.000 0.000 0.000 2.000 \n",
248 | "\n",
249 | " update_all_cnt log_cnt log_timespan avg_log_timespan Idx \n",
250 | "0 11.000 19.000 1.000 0.632 10001.000 \n",
251 | "1 21.000 24.000 1.000 10.375 10002.000 \n",
252 | "2 10.000 14.000 1.000 0.500 10003.000 \n",
253 | "3 10.000 7.000 5.000 0.000 10006.000 \n",
254 | "4 10.000 5.000 0.000 1.400 10007.000 \n",
255 | "\n",
256 | "[5 rows x 161 columns]"
257 | ]
258 | },
259 | "execution_count": 2,
260 | "metadata": {},
261 | "output_type": "execute_result"
262 | }
263 | ],
264 | "source": [
265 | "df = pd.read_csv('feature_select_data1.csv',encoding='gb18030')\n",
266 | "df.head()"
267 | ]
268 | },
269 | {
270 | "cell_type": "markdown",
271 | "metadata": {},
272 | "source": [
273 | "# 两种版本的lgb默认参数模型"
274 | ]
275 | },
276 | {
277 | "cell_type": "markdown",
278 | "metadata": {},
279 | "source": [
280 | "## sklearn版本"
281 | ]
282 | },
283 | {
284 | "cell_type": "code",
285 | "execution_count": 3,
286 | "metadata": {
287 | "ExecuteTime": {
288 | "end_time": "2019-02-17T02:57:51.599095Z",
289 | "start_time": "2019-02-17T02:57:51.295077Z"
290 | }
291 | },
292 | "outputs": [],
293 | "source": [
294 | "# 默认参数模型\n",
295 | "x_train = df[df.sample_status=='train'].drop(['Idx','sample_status','target'],axis=1)\n",
296 | "x_test = df[df.sample_status=='test'].drop(['Idx','sample_status','target'],axis=1)\n",
297 | "y_train = df[df.sample_status=='train']['target']\n",
298 | "y_test = df[df.sample_status=='test']['target']"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": 4,
304 | "metadata": {},
305 | "outputs": [
306 | {
307 | "name": "stdout",
308 | "output_type": "stream",
309 | "text": [
310 | "运行时间为6.0秒\n"
311 | ]
312 | }
313 | ],
314 | "source": [
315 | "import time\n",
316 | "start = time.time()\n",
317 | "lgb_sklearn = lgb.LGBMClassifier(random_state=0).fit(x_train,y_train)\n",
318 | "end = time.time()\n",
319 | "print('运行时间为{}秒'.format(round(end-start,0)))"
320 | ]
321 | },
322 | {
323 | "cell_type": "code",
324 | "execution_count": 5,
325 | "metadata": {},
326 | "outputs": [
327 | {
328 | "data": {
329 | "image/png": "\n",
330 | "text/plain": [
331 | ""
332 | ]
333 | },
334 | "metadata": {
335 | "needs_background": "light"
336 | },
337 | "output_type": "display_data"
338 | }
339 | ],
340 | "source": [
341 | "# 默认参数模型的AUC\n",
342 | "lgb_sklearn_pre = lgb_sklearn.predict_proba(x_test)[:,1]\n",
343 | "sc.plot_roc(y_test,lgb_sklearn_pre)"
344 | ]
345 | },
346 | {
347 | "cell_type": "code",
348 | "execution_count": 6,
349 | "metadata": {},
350 | "outputs": [
351 | {
352 | "data": {
353 | "text/plain": [
354 | "{'boosting_type': 'gbdt',\n",
355 | " 'class_weight': None,\n",
356 | " 'colsample_bytree': 1.0,\n",
357 | " 'importance_type': 'split',\n",
358 | " 'learning_rate': 0.1,\n",
359 | " 'max_depth': -1,\n",
360 | " 'min_child_samples': 20,\n",
361 | " 'min_child_weight': 0.001,\n",
362 | " 'min_split_gain': 0.0,\n",
363 | " 'n_estimators': 100,\n",
364 | " 'n_jobs': -1,\n",
365 | " 'num_leaves': 31,\n",
366 | " 'objective': None,\n",
367 | " 'random_state': 0,\n",
368 | " 'reg_alpha': 0.0,\n",
369 | " 'reg_lambda': 0.0,\n",
370 | " 'silent': True,\n",
371 | " 'subsample': 1.0,\n",
372 | " 'subsample_for_bin': 200000,\n",
373 | " 'subsample_freq': 0}"
374 | ]
375 | },
376 | "execution_count": 6,
377 | "metadata": {},
378 | "output_type": "execute_result"
379 | }
380 | ],
381 | "source": [
382 | "lgb_sklearn.get_params()"
383 | ]
384 | },
385 | {
386 | "cell_type": "markdown",
387 | "metadata": {},
388 | "source": [
389 | "## 原生版本 "
390 | ]
391 | },
392 | {
393 | "cell_type": "code",
394 | "execution_count": 14,
395 | "metadata": {
396 | "scrolled": true
397 | },
398 | "outputs": [
399 | {
400 | "name": "stdout",
401 | "output_type": "stream",
402 | "text": [
403 | "[1]\tvalid_0's auc: 0.675718\n",
404 | "Training until validation scores don't improve for 10 rounds.\n",
405 | "[2]\tvalid_0's auc: 0.683763\n",
406 | "[3]\tvalid_0's auc: 0.689045\n",
407 | "[4]\tvalid_0's auc: 0.688154\n",
408 | "[5]\tvalid_0's auc: 0.692355\n",
409 | "[6]\tvalid_0's auc: 0.692386\n",
410 | "[7]\tvalid_0's auc: 0.697619\n",
411 | "[8]\tvalid_0's auc: 0.699751\n",
412 | "[9]\tvalid_0's auc: 0.70051\n",
413 | "[10]\tvalid_0's auc: 0.702275\n",
414 | "[11]\tvalid_0's auc: 0.706518\n",
415 | "[12]\tvalid_0's auc: 0.70864\n",
416 | "[13]\tvalid_0's auc: 0.713132\n",
417 | "[14]\tvalid_0's auc: 0.715673\n",
418 | "[15]\tvalid_0's auc: 0.717739\n",
419 | "[16]\tvalid_0's auc: 0.719119\n",
420 | "[17]\tvalid_0's auc: 0.72115\n",
421 | "[18]\tvalid_0's auc: 0.723824\n",
422 | "[19]\tvalid_0's auc: 0.724232\n",
423 | "[20]\tvalid_0's auc: 0.726006\n",
424 | "[21]\tvalid_0's auc: 0.726508\n",
425 | "[22]\tvalid_0's auc: 0.726892\n",
426 | "[23]\tvalid_0's auc: 0.727921\n",
427 | "[24]\tvalid_0's auc: 0.729418\n",
428 | "[25]\tvalid_0's auc: 0.73087\n",
429 | "[26]\tvalid_0's auc: 0.732294\n",
430 | "[27]\tvalid_0's auc: 0.7336\n",
431 | "[28]\tvalid_0's auc: 0.734957\n",
432 | "[29]\tvalid_0's auc: 0.736162\n",
433 | "[30]\tvalid_0's auc: 0.737107\n",
434 | "[31]\tvalid_0's auc: 0.736938\n",
435 | "[32]\tvalid_0's auc: 0.73804\n",
436 | "[33]\tvalid_0's auc: 0.737969\n",
437 | "[34]\tvalid_0's auc: 0.738373\n",
438 | "[35]\tvalid_0's auc: 0.738153\n",
439 | "[36]\tvalid_0's auc: 0.739998\n",
440 | "[37]\tvalid_0's auc: 0.739689\n",
441 | "[38]\tvalid_0's auc: 0.740843\n",
442 | "[39]\tvalid_0's auc: 0.741177\n",
443 | "[40]\tvalid_0's auc: 0.741063\n",
444 | "[41]\tvalid_0's auc: 0.740791\n",
445 | "[42]\tvalid_0's auc: 0.741013\n",
446 | "[43]\tvalid_0's auc: 0.741408\n",
447 | "[44]\tvalid_0's auc: 0.741923\n",
448 | "[45]\tvalid_0's auc: 0.741994\n",
449 | "[46]\tvalid_0's auc: 0.74203\n",
450 | "[47]\tvalid_0's auc: 0.741826\n",
451 | "[48]\tvalid_0's auc: 0.741808\n",
452 | "[49]\tvalid_0's auc: 0.741153\n",
453 | "[50]\tvalid_0's auc: 0.740779\n",
454 | "[51]\tvalid_0's auc: 0.741177\n",
455 | "[52]\tvalid_0's auc: 0.741106\n",
456 | "[53]\tvalid_0's auc: 0.741315\n",
457 | "[54]\tvalid_0's auc: 0.740231\n",
458 | "[55]\tvalid_0's auc: 0.739891\n",
459 | "[56]\tvalid_0's auc: 0.740211\n",
460 | "Early stopping, best iteration is:\n",
461 | "[46]\tvalid_0's auc: 0.74203\n",
462 | "运行时间为6.0秒\n"
463 | ]
464 | }
465 | ],
466 | "source": [
467 | "# 原生的lightgbm\n",
468 | "lgb_train = lgb.Dataset(x_train,y_train)\n",
469 | "lgb_test = lgb.Dataset(x_test,y_test,reference=lgb_train)\n",
470 | "lgb_origi_params = {'boosting_type':'gbdt',\n",
471 | " 'max_depth':-1,\n",
472 | " 'num_leaves':31,\n",
473 | " 'bagging_fraction':1.0,\n",
474 | " 'feature_fraction':1.0,\n",
475 | " 'learning_rate':0.1,\n",
476 | " 'metric': 'auc'}\n",
477 | "start = time.time()\n",
478 | "lgb_origi = lgb.train(train_set=lgb_train,\n",
479 | " early_stopping_rounds=10,\n",
480 | " num_boost_round=400,\n",
481 | " params=lgb_origi_params,\n",
482 | " valid_sets=lgb_test)\n",
483 | "end = time.time()\n",
484 | "print('运行时间为{}秒'.format(round(end-start,0)))"
485 | ]
486 | },
487 | {
488 | "cell_type": "code",
489 | "execution_count": 13,
490 | "metadata": {
491 | "scrolled": false
492 | },
493 | "outputs": [
494 | {
495 | "data": {
496 | "image/png": "\n",
497 | "text/plain": [
498 | ""
499 | ]
500 | },
501 | "metadata": {
502 | "needs_background": "light"
503 | },
504 | "output_type": "display_data"
505 | }
506 | ],
507 | "source": [
508 | "# 原生的lightgbm的AUC\n",
509 | "lgb_origi_pre = lgb_origi.predict(x_test)\n",
510 | "sc.plot_roc(y_test,lgb_origi_pre)"
511 | ]
512 | },
513 | {
514 | "cell_type": "markdown",
515 | "metadata": {},
516 | "source": [
517 | "# 调参"
518 | ]
519 | },
520 | {
521 | "cell_type": "code",
522 | "execution_count": 18,
523 | "metadata": {},
524 | "outputs": [],
525 | "source": [
526 | "# 确定最大迭代次数,学习率设为0.1 \n",
527 | "base_parmas={'boosting_type':'gbdt',\n",
528 | " 'learning_rate':0.1,\n",
529 | " 'num_leaves':40,\n",
530 | " 'max_depth':-1,\n",
531 | " 'bagging_fraction':0.8,\n",
532 | " 'feature_fraction':0.8,\n",
533 | " 'lambda_l1':0,\n",
534 | " 'lambda_l2':0,\n",
535 | " 'min_data_in_leaf':20,\n",
536 | " 'min_sum_hessian_inleaf':0.001,\n",
537 | " 'metric':'auc'}\n",
538 | "cv_result = lgb.cv(train_set=lgb_train,\n",
539 | " num_boost_round=200,\n",
540 | " early_stopping_rounds=5,\n",
541 | " nfold=5,\n",
542 | " stratified=True,\n",
543 | " shuffle=True,\n",
544 | " params=base_parmas,\n",
545 | " metrics='auc',\n",
546 | " seed=0)"
547 | ]
548 | },
549 | {
550 | "cell_type": "code",
551 | "execution_count": 20,
552 | "metadata": {},
553 | "outputs": [
554 | {
555 | "name": "stdout",
556 | "output_type": "stream",
557 | "text": [
558 | "最大的迭代次数: 51\n",
559 | "交叉验证的AUC: 0.7271732572229754\n"
560 | ]
561 | }
562 | ],
563 | "source": [
564 | "print('最大的迭代次数: {}'.format(len(cv_result['auc-mean'])))\n",
565 | "print('交叉验证的AUC: {}'.format(max(cv_result['auc-mean'])))"
566 | ]
567 | },
568 | {
569 | "cell_type": "code",
570 | "execution_count": 24,
571 | "metadata": {},
572 | "outputs": [
573 | {
574 | "name": "stdout",
575 | "output_type": "stream",
576 | "text": [
577 | "运行时间为:109.0\n"
578 | ]
579 | }
580 | ],
581 | "source": [
582 | "# num_leaves ,步长设为5\n",
583 | "param_find1 = {'num_leaves':range(30,60,5)}\n",
584 | "cv_fold = StratifiedKFold(n_splits=5,random_state=0,shuffle=True)\n",
585 | "start = time.time()\n",
586 | "grid_search1 = GridSearchCV(estimator=lgb.LGBMClassifier(learning_rate=0.1,\n",
587 | " n_estimators = 51,\n",
588 | " max_depth=-1,\n",
589 | " min_child_weight=0.001,\n",
590 | " min_child_samples=20,\n",
591 | " subsample=0.8,\n",
592 | " colsample_bytree=0.8,\n",
593 | " reg_lambda=0,\n",
594 | " reg_alpha=0),\n",
595 | " cv = cv_fold,\n",
596 | " n_jobs=-1,\n",
597 | " param_grid = param_find1,\n",
598 | " scoring='roc_auc')\n",
599 | "grid_search1.fit(x_train,y_train)\n",
600 | "end = time.time()\n",
601 | "print('运行时间为:{}'.format(round(end-start,0)))"
602 | ]
603 | },
604 | {
605 | "cell_type": "code",
606 | "execution_count": 29,
607 | "metadata": {},
608 | "outputs": [
609 | {
610 | "name": "stdout",
611 | "output_type": "stream",
612 | "text": [
613 | "[mean: 0.73008, std: 0.01408, params: {'num_leaves': 30}, mean: 0.72994, std: 0.01638, params: {'num_leaves': 35}, mean: 0.72868, std: 0.01652, params: {'num_leaves': 40}, mean: 0.72776, std: 0.01038, params: {'num_leaves': 45}, mean: 0.72917, std: 0.01601, params: {'num_leaves': 50}, mean: 0.72519, std: 0.01338, params: {'num_leaves': 55}]\n",
614 | "\t\n",
615 | "{'num_leaves': 30}\n",
616 | "\t\n",
617 | "0.7300782078536177\n"
618 | ]
619 | }
620 | ],
621 | "source": [
622 | "print(grid_search1.grid_scores_)\n",
623 | "print('\\t')\n",
624 | "print(grid_search1.best_params_)\n",
625 | "print('\\t')\n",
626 | "print(grid_search1.best_score_)"
627 | ]
628 | },
629 | {
630 | "cell_type": "code",
631 | "execution_count": 30,
632 | "metadata": {},
633 | "outputs": [
634 | {
635 | "name": "stdout",
636 | "output_type": "stream",
637 | "text": [
638 | "[mean: 0.73327, std: 0.01248, params: {'num_leaves': 26}, mean: 0.73188, std: 0.01426, params: {'num_leaves': 28}, mean: 0.73355, std: 0.01589, params: {'num_leaves': 30}, mean: 0.73318, std: 0.01272, params: {'num_leaves': 32}]\n",
639 | "\t\n",
640 | "{'num_leaves': 30}\n",
641 | "\t\n",
642 | "0.733552244998121\n"
643 | ]
644 | }
645 | ],
646 | "source": [
647 | "# num_leaves,步长设为2 \n",
648 | "param_find2 = {'num_leaves':range(26,34,2)}\n",
649 | "grid_search2 = GridSearchCV(estimator=lgb.LGBMClassifier(estimator=51,\n",
650 | " learning_rate=0.1,\n",
651 | " min_child_weight=0.001,\n",
652 | " min_child_samples=20,\n",
653 | " subsample=0.8,\n",
654 | " colsample_bytree=0.8,\n",
655 | " reg_lambda=0,\n",
656 | " reg_alpha=0\n",
657 | " ),\n",
658 | " cv=cv_fold,\n",
659 | " n_jobs=-1,\n",
660 | " scoring='roc_auc',\n",
661 | " param_grid = param_find2)\n",
662 | "grid_search2.fit(x_train,y_train)\n",
663 | "print(grid_search2.grid_scores_)\n",
664 | "print('\\t')\n",
665 | "print(grid_search2.best_params_)\n",
666 | "print('\\t')\n",
667 | "print(grid_search2.best_score_)"
668 | ]
669 | },
670 | {
671 | "cell_type": "code",
672 | "execution_count": 36,
673 | "metadata": {},
674 | "outputs": [
675 | {
676 | "name": "stdout",
677 | "output_type": "stream",
678 | "text": [
679 | "运行时间:312.0 秒\n",
680 | "[mean: 0.73155, std: 0.01112, params: {'min_child_samples': 15, 'min_child_weight': 0.001}, mean: 0.73155, std: 0.01112, params: {'min_child_samples': 15, 'min_child_weight': 0.002}, mean: 0.73155, std: 0.01112, params: {'min_child_samples': 15, 'min_child_weight': 0.003}, mean: 0.73355, std: 0.01589, params: {'min_child_samples': 20, 'min_child_weight': 0.001}, mean: 0.73355, std: 0.01589, params: {'min_child_samples': 20, 'min_child_weight': 0.002}, mean: 0.73355, std: 0.01589, params: {'min_child_samples': 20, 'min_child_weight': 0.003}, mean: 0.73206, std: 0.01434, params: {'min_child_samples': 25, 'min_child_weight': 0.001}, mean: 0.73206, std: 0.01434, params: {'min_child_samples': 25, 'min_child_weight': 0.002}, mean: 0.73206, std: 0.01434, params: {'min_child_samples': 25, 'min_child_weight': 0.003}, mean: 0.73210, std: 0.01145, params: {'min_child_samples': 30, 'min_child_weight': 0.001}, mean: 0.73210, std: 0.01145, params: {'min_child_samples': 30, 'min_child_weight': 0.002}, mean: 0.73210, std: 0.01145, params: {'min_child_samples': 30, 'min_child_weight': 0.003}]\n",
681 | "\t\n",
682 | "{'min_child_samples': 20, 'min_child_weight': 0.001}\n",
683 | "\t\n",
684 | "0.733552244998121\n"
685 | ]
686 | }
687 | ],
688 | "source": [
689 | "# 确定num_leaves 为30 ,下面进行min_child_samples 和 min_child_weight的调参,设定步长为5\n",
690 | "param_find3 = {'min_child_samples':range(15,35,5),\n",
691 | " 'min_child_weight':[x/1000 for x in range(1,4,1)]}\n",
692 | "grid_search3 = GridSearchCV(estimator=lgb.LGBMClassifier(estimator=51,\n",
693 | " learning_rate=0.1,\n",
694 | " num_leaves=30,\n",
695 | " subsample=0.8,\n",
696 | " colsample_bytree=0.8,\n",
697 | " reg_lambda=0,\n",
698 | " reg_alpha=0\n",
699 | " ),\n",
700 | " cv=cv_fold,\n",
701 | " scoring='roc_auc',\n",
702 | " param_grid = param_find3,\n",
703 | " n_jobs=-1)\n",
704 | "start = time.time()\n",
705 | "grid_search3.fit(x_train,y_train)\n",
706 | "end = time.time()\n",
707 | "print('运行时间:{} 秒'.format(round(end-start,0)))\n",
708 | "print(grid_search3.grid_scores_)\n",
709 | "print('\\t')\n",
710 | "print(grid_search3.best_params_)\n",
711 | "print('\\t')\n",
712 | "print(grid_search3.best_score_)"
713 | ]
714 | },
715 | {
716 | "cell_type": "code",
717 | "execution_count": 37,
718 | "metadata": {},
719 | "outputs": [
720 | {
721 | "name": "stdout",
722 | "output_type": "stream",
723 | "text": [
724 | "运行时间:826.0 秒\n",
725 | "[mean: 0.73467, std: 0.01475, params: {'colsample_bytree': 0.5, 'subsample': 0.5}, mean: 0.73467, std: 0.01475, params: {'colsample_bytree': 0.5, 'subsample': 0.6}, mean: 0.73467, std: 0.01475, params: {'colsample_bytree': 0.5, 'subsample': 0.7}, mean: 0.73467, std: 0.01475, params: {'colsample_bytree': 0.5, 'subsample': 0.8}, mean: 0.73467, std: 0.01475, params: {'colsample_bytree': 0.5, 'subsample': 0.9}, mean: 0.73467, std: 0.01475, params: {'colsample_bytree': 0.5, 'subsample': 1.0}, mean: 0.73500, std: 0.01559, params: {'colsample_bytree': 0.6, 'subsample': 0.5}, mean: 0.73500, std: 0.01559, params: {'colsample_bytree': 0.6, 'subsample': 0.6}, mean: 0.73500, std: 0.01559, params: {'colsample_bytree': 0.6, 'subsample': 0.7}, mean: 0.73500, std: 0.01559, params: {'colsample_bytree': 0.6, 'subsample': 0.8}, mean: 0.73500, std: 0.01559, params: {'colsample_bytree': 0.6, 'subsample': 0.9}, mean: 0.73500, std: 0.01559, params: {'colsample_bytree': 0.6, 'subsample': 1.0}, mean: 0.73053, std: 0.01389, params: {'colsample_bytree': 0.7, 'subsample': 0.5}, mean: 0.73053, std: 0.01389, params: {'colsample_bytree': 0.7, 'subsample': 0.6}, mean: 0.73053, std: 0.01389, params: {'colsample_bytree': 0.7, 'subsample': 0.7}, mean: 0.73053, std: 0.01389, params: {'colsample_bytree': 0.7, 'subsample': 0.8}, mean: 0.73053, std: 0.01389, params: {'colsample_bytree': 0.7, 'subsample': 0.9}, mean: 0.73053, std: 0.01389, params: {'colsample_bytree': 0.7, 'subsample': 1.0}, mean: 0.73355, std: 0.01589, params: {'colsample_bytree': 0.8, 'subsample': 0.5}, mean: 0.73355, std: 0.01589, params: {'colsample_bytree': 0.8, 'subsample': 0.6}, mean: 0.73355, std: 0.01589, params: {'colsample_bytree': 0.8, 'subsample': 0.7}, mean: 0.73355, std: 0.01589, params: {'colsample_bytree': 0.8, 'subsample': 0.8}, mean: 0.73355, std: 0.01589, params: {'colsample_bytree': 0.8, 'subsample': 0.9}, mean: 0.73355, std: 0.01589, params: {'colsample_bytree': 0.8, 'subsample': 1.0}, mean: 0.73304, std: 0.01103, params: {'colsample_bytree': 0.9, 'subsample': 0.5}, mean: 0.73304, std: 0.01103, params: {'colsample_bytree': 0.9, 'subsample': 0.6}, mean: 0.73304, std: 0.01103, params: {'colsample_bytree': 0.9, 'subsample': 0.7}, mean: 0.73304, std: 0.01103, params: {'colsample_bytree': 0.9, 'subsample': 0.8}, mean: 0.73304, std: 0.01103, params: {'colsample_bytree': 0.9, 'subsample': 0.9}, mean: 0.73304, std: 0.01103, params: {'colsample_bytree': 0.9, 'subsample': 1.0}, mean: 0.73427, std: 0.01462, params: {'colsample_bytree': 1.0, 'subsample': 0.5}, mean: 0.73427, std: 0.01462, params: {'colsample_bytree': 1.0, 'subsample': 0.6}, mean: 0.73427, std: 0.01462, params: {'colsample_bytree': 1.0, 'subsample': 0.7}, mean: 0.73427, std: 0.01462, params: {'colsample_bytree': 1.0, 'subsample': 0.8}, mean: 0.73427, std: 0.01462, params: {'colsample_bytree': 1.0, 'subsample': 0.9}, mean: 0.73427, std: 0.01462, params: {'colsample_bytree': 1.0, 'subsample': 1.0}]\n",
726 | "\t\n",
727 | "{'colsample_bytree': 0.6, 'subsample': 0.5}\n",
728 | "\t\n",
729 | "0.7349957573843382\n"
730 | ]
731 | }
732 | ],
733 | "source": [
734 | "# 确定min_child_weight为0.001,min_child_samples为20,下面对subsample和colsample_bytree进行调参\n",
735 | "param_find4 = {'subsample':[x/10 for x in range(5,11,1)],\n",
736 | " 'colsample_bytree':[x/10 for x in range(5,11,1)]}\n",
737 | "grid_search4 = GridSearchCV(estimator=lgb.LGBMClassifier(estimator=51,\n",
738 | " learning_rate=0.1,\n",
739 | " min_child_samples=20,\n",
740 | " min_child_weight=0.001,\n",
741 | " num_leaves=30,\n",
742 | " subsample=0.8,\n",
743 | " colsample_bytree=0.8,\n",
744 | " reg_lambda=0,\n",
745 | " reg_alpha=0\n",
746 | " ),\n",
747 | " cv=cv_fold,\n",
748 | " scoring='roc_auc',\n",
749 | " param_grid = param_find4,\n",
750 | " n_jobs=-1)\n",
751 | "start = time.time()\n",
752 | "grid_search4.fit(x_train,y_train)\n",
753 | "end = time.time()\n",
754 | "print('运行时间:{} 秒'.format(round(end-start,0)))\n",
755 | "print(grid_search4.grid_scores_)\n",
756 | "print('\\t')\n",
757 | "print(grid_search4.best_params_)\n",
758 | "print('\\t')\n",
759 | "print(grid_search4.best_score_)"
760 | ]
761 | },
762 | {
763 | "cell_type": "code",
764 | "execution_count": 39,
765 | "metadata": {},
766 | "outputs": [
767 | {
768 | "name": "stdout",
769 | "output_type": "stream",
770 | "text": [
771 | "运行时间:692.0 秒\n",
772 | "[mean: 0.73386, std: 0.01566, params: {'reg_alpha': 0.001, 'reg_lambda': 0.001}, mean: 0.73284, std: 0.01099, params: {'reg_alpha': 0.001, 'reg_lambda': 0.01}, mean: 0.73024, std: 0.01294, params: {'reg_alpha': 0.001, 'reg_lambda': 0.03}, mean: 0.73565, std: 0.01237, params: {'reg_alpha': 0.001, 'reg_lambda': 0.08}, mean: 0.73300, std: 0.01580, params: {'reg_alpha': 0.001, 'reg_lambda': 0.1}, mean: 0.73713, std: 0.01489, params: {'reg_alpha': 0.001, 'reg_lambda': 0.3}, mean: 0.73173, std: 0.01727, params: {'reg_alpha': 0.01, 'reg_lambda': 0.001}, mean: 0.73586, std: 0.01282, params: {'reg_alpha': 0.01, 'reg_lambda': 0.01}, mean: 0.73424, std: 0.01136, params: {'reg_alpha': 0.01, 'reg_lambda': 0.03}, mean: 0.73601, std: 0.01579, params: {'reg_alpha': 0.01, 'reg_lambda': 0.08}, mean: 0.73688, std: 0.01218, params: {'reg_alpha': 0.01, 'reg_lambda': 0.1}, mean: 0.73459, std: 0.01598, params: {'reg_alpha': 0.01, 'reg_lambda': 0.3}, mean: 0.73395, std: 0.01492, params: {'reg_alpha': 0.03, 'reg_lambda': 0.001}, mean: 0.73688, std: 0.01137, params: {'reg_alpha': 0.03, 'reg_lambda': 0.01}, mean: 0.73430, std: 0.01592, params: {'reg_alpha': 0.03, 'reg_lambda': 0.03}, mean: 0.73501, std: 0.01268, params: {'reg_alpha': 0.03, 'reg_lambda': 0.08}, mean: 0.73462, std: 0.01437, params: {'reg_alpha': 0.03, 'reg_lambda': 0.1}, mean: 0.73890, std: 0.01465, params: {'reg_alpha': 0.03, 'reg_lambda': 0.3}, mean: 0.73408, std: 0.01293, params: {'reg_alpha': 0.08, 'reg_lambda': 0.001}, mean: 0.73217, std: 0.01456, params: {'reg_alpha': 0.08, 'reg_lambda': 0.01}, mean: 0.73468, std: 0.01092, params: {'reg_alpha': 0.08, 'reg_lambda': 0.03}, mean: 0.73542, std: 0.01050, params: {'reg_alpha': 0.08, 'reg_lambda': 0.08}, mean: 0.73603, std: 0.01564, params: {'reg_alpha': 0.08, 'reg_lambda': 0.1}, mean: 0.73706, std: 0.01759, params: {'reg_alpha': 0.08, 'reg_lambda': 0.3}, mean: 0.72988, std: 0.01310, params: {'reg_alpha': 0.1, 'reg_lambda': 0.001}, mean: 0.73350, std: 0.01248, params: {'reg_alpha': 0.1, 'reg_lambda': 0.01}, mean: 0.73526, std: 0.01280, params: {'reg_alpha': 0.1, 'reg_lambda': 0.03}, mean: 0.73386, std: 0.01461, params: {'reg_alpha': 0.1, 'reg_lambda': 0.08}, mean: 0.73635, std: 0.01596, params: {'reg_alpha': 0.1, 'reg_lambda': 0.1}, mean: 0.73542, std: 0.01512, params: {'reg_alpha': 0.1, 'reg_lambda': 0.3}, mean: 0.73620, std: 0.00951, params: {'reg_alpha': 0.3, 'reg_lambda': 0.001}, mean: 0.73713, std: 0.01541, params: {'reg_alpha': 0.3, 'reg_lambda': 0.01}, mean: 0.73943, std: 0.01238, params: {'reg_alpha': 0.3, 'reg_lambda': 0.03}, mean: 0.73593, std: 0.01351, params: {'reg_alpha': 0.3, 'reg_lambda': 0.08}, mean: 0.73402, std: 0.01277, params: {'reg_alpha': 0.3, 'reg_lambda': 0.1}, mean: 0.73655, std: 0.00920, params: {'reg_alpha': 0.3, 'reg_lambda': 0.3}]\n",
773 | "\t\n",
774 | "{'reg_alpha': 0.3, 'reg_lambda': 0.03}\n",
775 | "\t\n",
776 | "0.739431056578461\n"
777 | ]
778 | }
779 | ],
780 | "source": [
781 | "param_find5 = {'reg_lambda':[0.001,0.01,0.03,0.08,0.1,0.3],\n",
782 | " 'reg_alpha':[0.001,0.01,0.03,0.08,0.1,0.3]}\n",
783 | "grid_search5 = GridSearchCV(estimator=lgb.LGBMClassifier(estimator=51,\n",
784 | " learning_rate=0.1,\n",
785 | " min_child_samples=20,\n",
786 | " min_child_weight=0.001,\n",
787 | " num_leaves=30,\n",
788 | " subsample=0.5,\n",
789 | " colsample_bytree=0.6,\n",
790 | " ),\n",
791 | " cv=cv_fold,\n",
792 | " scoring='roc_auc',\n",
793 | " param_grid = param_find5,\n",
794 | " n_jobs=-1)\n",
795 | "start = time.time()\n",
796 | "grid_search5.fit(x_train,y_train)\n",
797 | "end = time.time()\n",
798 | "print('运行时间:{} 秒'.format(round(end-start,0)))\n",
799 | "print(grid_search5.grid_scores_)\n",
800 | "print('\\t')\n",
801 | "print(grid_search5.best_params_)\n",
802 | "print('\\t')\n",
803 | "print(grid_search5.best_score_)"
804 | ]
805 | },
806 | {
807 | "cell_type": "code",
808 | "execution_count": 44,
809 | "metadata": {},
810 | "outputs": [],
811 | "source": [
812 | "# 将最佳参数再次带入cv函数,设定学习率为0.005\n",
813 | "best_params = {\n",
814 | " 'boosting_type':'gbdt',\n",
815 | " 'learning_rate':0.005,\n",
816 | " 'num_leaves':30,\n",
817 | " 'max_depth':-1,\n",
818 | " 'bagging_fraction':0.5,\n",
819 | " 'feature_fraction':0.6,\n",
820 | " 'min_data_in_leaf':20,\n",
821 | " 'min_sum_hessian_in_leaf':0.001,\n",
822 | " 'lambda_l1':0.3,\n",
823 | " 'lambda_l2':0.03,\n",
824 | " 'metric':'auc'\n",
825 | "}\n",
826 | "\n",
827 | "best_cv = lgb.cv(train_set=lgb_train,\n",
828 | " early_stopping_rounds=5,\n",
829 | " num_boost_round=2000,\n",
830 | " nfold=5,\n",
831 | " params=best_params,\n",
832 | " metrics='auc',\n",
833 | " stratified=True,\n",
834 | " shuffle=True,\n",
835 | " seed=0)"
836 | ]
837 | },
838 | {
839 | "cell_type": "code",
840 | "execution_count": 50,
841 | "metadata": {
842 | "scrolled": true
843 | },
844 | "outputs": [
845 | {
846 | "name": "stdout",
847 | "output_type": "stream",
848 | "text": [
849 | "最佳参数的迭代次数: 889\n",
850 | "交叉验证的AUC: 0.7357671213094057\n"
851 | ]
852 | }
853 | ],
854 | "source": [
855 | "print('最佳参数的迭代次数: {}'.format(len(best_cv['auc-mean'])))\n",
856 | "print('交叉验证的AUC: {}'.format(max(best_cv['auc-mean'])))"
857 | ]
858 | },
859 | {
860 | "cell_type": "code",
861 | "execution_count": 61,
862 | "metadata": {},
863 | "outputs": [
864 | {
865 | "data": {
866 | "text/plain": [
867 | "LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.6,\n",
868 | " importance_type='split', learning_rate=0.005, max_depth=-1,\n",
869 | " min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,\n",
870 | " n_estimators=900, n_jobs=-1, num_leaves=30, objective=None,\n",
871 | " random_state=0, reg_alpha=0.3, reg_lambda=0.03, silent=True,\n",
872 | " subsample=0.5, subsample_for_bin=200000, subsample_freq=0)"
873 | ]
874 | },
875 | "execution_count": 61,
876 | "metadata": {},
877 | "output_type": "execute_result"
878 | }
879 | ],
880 | "source": [
881 | "lgb_single_model = lgb.LGBMClassifier(n_estimators=900,\n",
882 | " learning_rate=0.005,\n",
883 | " min_child_weight=0.001,\n",
884 | " min_child_samples = 20,\n",
885 | " subsample=0.5,\n",
886 | " colsample_bytree=0.6,\n",
887 | " num_leaves=30,\n",
888 | " max_depth=-1,\n",
889 | " reg_lambda=0.03,\n",
890 | " reg_alpha=0.3,\n",
891 | " random_state=0)\n",
892 | "lgb_single_model.fit(x_train,y_train)"
893 | ]
894 | },
895 | {
896 | "cell_type": "code",
897 | "execution_count": 64,
898 | "metadata": {},
899 | "outputs": [
900 | {
901 | "name": "stdout",
902 | "output_type": "stream",
903 | "text": [
904 | "lightgbm单模型的AUC:0.7535371506640257\n"
905 | ]
906 | },
907 | {
908 | "data": {
909 | "image/png": "\n",
910 | "text/plain": [
911 | ""
912 | ]
913 | },
914 | "metadata": {},
915 | "output_type": "display_data"
916 | }
917 | ],
918 | "source": [
919 | "pre = lgb_single_model.predict_proba(x_test)[:,1]\n",
920 | "print('lightgbm单模型的AUC:{}'.format(metrics.roc_auc_score(y_test,pre)))\n",
921 | "sc.plot_roc(y_test,pre)"
922 | ]
923 | }
924 | ],
925 | "metadata": {
926 | "kernelspec": {
927 | "display_name": "Python 3",
928 | "language": "python",
929 | "name": "python3"
930 | },
931 | "language_info": {
932 | "codemirror_mode": {
933 | "name": "ipython",
934 | "version": 3
935 | },
936 | "file_extension": ".py",
937 | "mimetype": "text/x-python",
938 | "name": "python",
939 | "nbconvert_exporter": "python",
940 | "pygments_lexer": "ipython3",
941 | "version": "3.7.0"
942 | },
943 | "toc": {
944 | "base_numbering": 1,
945 | "nav_menu": {},
946 | "number_sections": true,
947 | "sideBar": true,
948 | "skip_h1_title": false,
949 | "title_cell": "Table of Contents",
950 | "title_sidebar": "Contents",
951 | "toc_cell": false,
952 | "toc_position": {},
953 | "toc_section_display": true,
954 | "toc_window_display": false
955 | }
956 | },
957 | "nbformat": 4,
958 | "nbformat_minor": 2
959 | }
960 |
--------------------------------------------------------------------------------