├── README.md ├── data_input.ipynb ├── model_bagging_lightgbm.ipynb ├── feature_select.ipynb └── single_lightgbm_model.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # 拍拍贷"魔镜杯"风控算法比赛实战 2 | 3 | * 比赛项目链接 4 |
[“魔镜杯”风控算法大赛](https://www.kesci.com/home/competition/56cd5f02b89b5bd026cb39c9/content/0) 5 | 6 | * 知乎文章链接地址 7 |
[风控模型实战--"魔镜杯"风控算法大赛](https://zhuanlan.zhihu.com/p/56864235) 8 | 9 | * 代码目录说明 10 | 11 | 1. data_input: 数据导入及合并 12 | 2. data_EDA_clean: 数据清洗 13 | 3. feature_processing: 特征工程 14 | 4. feature_select: 特征筛选 15 | 5. single_lightgbm_model:单模型--lightgbm 16 | 6. model_bagging_lightgbm:lightgbm的bagging模型 17 | 18 | -------------------------------------------------------------------------------- /data_input.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np \n", 10 | "import pandas as pd \n", 11 | "import warnings\n", 12 | "warnings.filterwarnings('ignore')\n", 13 | "import os \n", 14 | "os.chdir('C:/Users/Administrator/Desktop/魔镜杯数据')" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 3, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "f_train1 = pd.read_csv('first_train1.csv',encoding='gbk')\n", 24 | "f_train2 = pd.read_csv('first_train2.csv',encoding='gbk')\n", 25 | "f_train3 = pd.read_csv('first_train3.csv',encoding='gbk')\n", 26 | "f_test1 = pd.read_csv('first_test1.csv',encoding='gb18030')\n", 27 | "f_test2 = pd.read_csv('first_test2.csv',encoding='gbk')\n", 28 | "f_test3 = pd.read_csv('first_test3.csv',encoding='gbk')" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 4, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "# 训练集和测试集合并\n", 38 | "f_train1['sample_status'] = 'train'\n", 39 | "f_test1['sample_status'] = 'test'\n", 40 | "df1 = pd.concat([f_train1,f_test1],axis=0).reset_index(drop=True)\n", 41 | "df2 = pd.concat([f_train2,f_test2],axis=0).reset_index(drop=True)\n", 42 | "df3 = pd.concat([f_train3,f_test3],axis=0).reset_index(drop=True)" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 6, 48 | "metadata": { 49 | "scrolled": true 50 | }, 51 | "outputs": [ 52 | { 53 | "data": { 54 
| "text/html": [ 55 | "
\n", 56 | "\n", 69 | "\n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | "
IdxUserInfo_1UserInfo_2UserInfo_3UserInfo_4WeblogInfo_1WeblogInfo_2WeblogInfo_3WeblogInfo_4WeblogInfo_5...SocialNetwork_11SocialNetwork_12SocialNetwork_13SocialNetwork_14SocialNetwork_15SocialNetwork_16SocialNetwork_17targetListingInfosample_status
0100011.000深圳4.000深圳nan1.000nan1.0001.000...-100000102014-3-5train
1100021.000温州4.000温州nan0.000nan1.0001.000...-100000202014-2-26train
2100031.000宜昌3.000宜昌nan0.000nan2.0002.000...-1-11000002014-2-28train
3100064.000南平1.000南平nannannannannan...-1-10000002014-2-25train
4100075.000辽阳1.000辽阳nan0.000nan1.0001.000...-1-10000002014-2-27train
\n", 219 | "

5 rows × 229 columns

\n", 220 | "
" 221 | ], 222 | "text/plain": [ 223 | " Idx UserInfo_1 UserInfo_2 UserInfo_3 UserInfo_4 WeblogInfo_1 \\\n", 224 | "0 10001 1.000 深圳 4.000 深圳 nan \n", 225 | "1 10002 1.000 温州 4.000 温州 nan \n", 226 | "2 10003 1.000 宜昌 3.000 宜昌 nan \n", 227 | "3 10006 4.000 南平 1.000 南平 nan \n", 228 | "4 10007 5.000 辽阳 1.000 辽阳 nan \n", 229 | "\n", 230 | " WeblogInfo_2 WeblogInfo_3 WeblogInfo_4 WeblogInfo_5 ... \\\n", 231 | "0 1.000 nan 1.000 1.000 ... \n", 232 | "1 0.000 nan 1.000 1.000 ... \n", 233 | "2 0.000 nan 2.000 2.000 ... \n", 234 | "3 nan nan nan nan ... \n", 235 | "4 0.000 nan 1.000 1.000 ... \n", 236 | "\n", 237 | " SocialNetwork_11 SocialNetwork_12 SocialNetwork_13 SocialNetwork_14 \\\n", 238 | "0 -1 0 0 0 \n", 239 | "1 -1 0 0 0 \n", 240 | "2 -1 -1 1 0 \n", 241 | "3 -1 -1 0 0 \n", 242 | "4 -1 -1 0 0 \n", 243 | "\n", 244 | " SocialNetwork_15 SocialNetwork_16 SocialNetwork_17 target ListingInfo \\\n", 245 | "0 0 0 1 0 2014-3-5 \n", 246 | "1 0 0 2 0 2014-2-26 \n", 247 | "2 0 0 0 0 2014-2-28 \n", 248 | "3 0 0 0 0 2014-2-25 \n", 249 | "4 0 0 0 0 2014-2-27 \n", 250 | "\n", 251 | " sample_status \n", 252 | "0 train \n", 253 | "1 train \n", 254 | "2 train \n", 255 | "3 train \n", 256 | "4 train \n", 257 | "\n", 258 | "[5 rows x 229 columns]" 259 | ] 260 | }, 261 | "execution_count": 6, 262 | "metadata": {}, 263 | "output_type": "execute_result" 264 | } 265 | ], 266 | "source": [ 267 | "df1.head()" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 7, 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "# 保存数据至本地\n", 277 | "df1.to_csv('C:/Users/Administrator/Desktop/魔镜杯数据/data_input1.csv',encoding='gb18030',index=False)\n", 278 | "df2.to_csv('C:/Users/Administrator/Desktop/魔镜杯数据/data_input2.csv',encoding='gb18030',index=False)\n", 279 | "df3.to_csv('C:/Users/Administrator/Desktop/魔镜杯数据/data_input3.csv',encoding='gb18030',index=False)" 280 | ] 281 | } 282 | ], 283 | "metadata": { 284 | "kernelspec": { 285 | "display_name": "Python 3", 286 | 
"language": "python", 287 | "name": "python3" 288 | }, 289 | "language_info": { 290 | "codemirror_mode": { 291 | "name": "ipython", 292 | "version": 3 293 | }, 294 | "file_extension": ".py", 295 | "mimetype": "text/x-python", 296 | "name": "python", 297 | "nbconvert_exporter": "python", 298 | "pygments_lexer": "ipython3", 299 | "version": "3.7.0" 300 | }, 301 | "toc": { 302 | "base_numbering": 1, 303 | "nav_menu": {}, 304 | "number_sections": true, 305 | "sideBar": true, 306 | "skip_h1_title": false, 307 | "title_cell": "Table of Contents", 308 | "title_sidebar": "Contents", 309 | "toc_cell": false, 310 | "toc_position": {}, 311 | "toc_section_display": true, 312 | "toc_window_display": false 313 | } 314 | }, 315 | "nbformat": 4, 316 | "nbformat_minor": 2 317 | } 318 | -------------------------------------------------------------------------------- /model_bagging_lightgbm.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "ExecuteTime": { 8 | "end_time": "2019-02-17T03:18:36.904108Z", 9 | "start_time": "2019-02-17T03:18:33.466960Z" 10 | }, 11 | "scrolled": true 12 | }, 13 | "outputs": [ 14 | { 15 | "name": "stderr", 16 | "output_type": "stream", 17 | "text": [ 18 | "C:\\Users\\Administrator\\Anaconda3\\envs\\py3\\lib\\site-packages\\sklearn\\utils\\__init__.py:4: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working\n", 19 | " from collections import Sequence\n" 20 | ] 21 | } 22 | ], 23 | "source": [ 24 | "import numpy as np \n", 25 | "import pandas as pd\n", 26 | "import lightgbm as lgb\n", 27 | "import random\n", 28 | "from sklearn import metrics\n", 29 | "import warnings\n", 30 | "warnings.filterwarnings('ignore')\n", 31 | "import os \n", 32 | "os.chdir('C:/Users/Administrator/Desktop/魔镜杯数据')\n", 33 | "import score_card as sc" 34 | ] 
35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": { 40 | "ExecuteTime": { 41 | "end_time": "2019-02-17T03:18:42.561213Z", 42 | "start_time": "2019-02-17T03:18:36.904108Z" 43 | }, 44 | "scrolled": true 45 | }, 46 | "outputs": [ 47 | { 48 | "data": { 49 | "text/plain": [ 50 | "(49701, 161)" 51 | ] 52 | }, 53 | "execution_count": 2, 54 | "metadata": {}, 55 | "output_type": "execute_result" 56 | } 57 | ], 58 | "source": [ 59 | "# Master数据\n", 60 | "df1 = pd.read_csv('feature_select_data1.csv',encoding='gb18030')\n", 61 | "df1.shape" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 3, 67 | "metadata": { 68 | "ExecuteTime": { 69 | "end_time": "2019-02-17T03:18:54.458570Z", 70 | "start_time": "2019-02-17T03:18:42.566214Z" 71 | } 72 | }, 73 | "outputs": [ 74 | { 75 | "data": { 76 | "text/plain": [ 77 | "(49701, 124)" 78 | ] 79 | }, 80 | "execution_count": 3, 81 | "metadata": {}, 82 | "output_type": "execute_result" 83 | } 84 | ], 85 | "source": [ 86 | "# 排序特征数据\n", 87 | "rank_df = pd.read_csv('rank_feature.csv',encoding='gbk')\n", 88 | "rank_df.shape" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 4, 94 | "metadata": { 95 | "ExecuteTime": { 96 | "end_time": "2019-02-17T03:18:57.831734Z", 97 | "start_time": "2019-02-17T03:18:54.461570Z" 98 | } 99 | }, 100 | "outputs": [ 101 | { 102 | "data": { 103 | "text/plain": [ 104 | "(49701, 51)" 105 | ] 106 | }, 107 | "execution_count": 4, 108 | "metadata": {}, 109 | "output_type": "execute_result" 110 | } 111 | ], 112 | "source": [ 113 | "# periods衍生特征数据\n", 114 | "periods_df = pd.read_csv('periods_feature.csv',encoding='gbk')\n", 115 | "periods_df.shape" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 5, 121 | "metadata": { 122 | "ExecuteTime": { 123 | "end_time": "2019-02-17T03:18:57.837734Z", 124 | "start_time": "2019-02-17T03:18:57.833734Z" 125 | } 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "# 
原生特征(不含排序特征和periods衍生特征)\n", 130 | "feature1 = list(df1.columns)\n", 131 | "# 排序特征和periods衍生特征\n", 132 | "feature2 = list(rank_df.columns)+list(periods_df.columns)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 6, 138 | "metadata": { 139 | "ExecuteTime": { 140 | "end_time": "2019-02-17T03:18:57.963735Z", 141 | "start_time": "2019-02-17T03:18:57.841734Z" 142 | } 143 | }, 144 | "outputs": [], 145 | "source": [ 146 | "# 对feature2进行随机打乱顺序\n", 147 | "random.shuffle(feature2)" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 7, 153 | "metadata": { 154 | "ExecuteTime": { 155 | "end_time": "2019-02-17T03:18:58.129739Z", 156 | "start_time": "2019-02-17T03:18:57.964736Z" 157 | } 158 | }, 159 | "outputs": [ 160 | { 161 | "data": { 162 | "text/plain": [ 163 | "(49701, 336)" 164 | ] 165 | }, 166 | "execution_count": 7, 167 | "metadata": {}, 168 | "output_type": "execute_result" 169 | } 170 | ], 171 | "source": [ 172 | "# 合并数据集\n", 173 | "df = pd.concat([df1,rank_df,periods_df],axis=1)\n", 174 | "df.shape" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 8, 180 | "metadata": { 181 | "ExecuteTime": { 182 | "end_time": "2019-02-17T03:18:58.272747Z", 183 | "start_time": "2019-02-17T03:18:58.133739Z" 184 | } 185 | }, 186 | "outputs": [], 187 | "source": [ 188 | "# 保存用户id\n", 189 | "data_idx = df.Idx" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 9, 195 | "metadata": { 196 | "ExecuteTime": { 197 | "end_time": "2019-02-17T03:18:58.440757Z", 198 | "start_time": "2019-02-17T03:18:58.274747Z" 199 | } 200 | }, 201 | "outputs": [], 202 | "source": [ 203 | "# 定义lightgbm的bagging函数\n", 204 | "def bagging_lightgbm(feature_fraction,bagging_fraction,ramdom_seed,n_feature):\n", 205 | " \n", 206 | " select_fea = feature1+feature2[:n_feature]\n", 207 | " \n", 208 | " data = df.loc[:,select_fea]\n", 209 | " train_x = 
data[data.sample_status=='train'].drop(['sample_status','target','Idx'],axis=1)\n", 210 | " train_y = data[data.sample_status=='train']['target']\n", 211 | " test_x = data[data.sample_status=='test'].drop(['sample_status','target','Idx'],axis=1)\n", 212 | " test_y = data[data.sample_status=='test']['target']\n", 213 | " \n", 214 | " test_user_id = list(data[data.sample_status=='test']['Idx'])\n", 215 | " \n", 216 | " \n", 217 | " dtrain = lgb.Dataset(train_x,train_y)\n", 218 | " dtest = lgb.Dataset(test_x,test_y)\n", 219 | " \n", 220 | " params={\n", 221 | " 'boosting_type':'gbdt',\n", 222 | " 'metric':'auc',\n", 223 | " 'num_leaves':30,\n", 224 | " 'min_data_in_leaf':20,\n", 225 | " 'min_sum_hessian_in_leaf':0.001,\n", 226 | " 'bagging_fraction':bagging_fraction,\n", 227 | " 'feature_fraction':feature_fraction,\n", 228 | " 'learning_rate':0.005,\n", 229 | " }\n", 230 | " \n", 231 | " # 寻找最佳的迭代次数\n", 232 | " cv_result = lgb.cv(train_set=dtrain,\n", 233 | " early_stopping_rounds=10,\n", 234 | " num_boost_round=1000,\n", 235 | " nfold=5,\n", 236 | " metrics='auc',\n", 237 | " seed=0,\n", 238 | " params=params,\n", 239 | " stratified=True,\n", 240 | " shuffle=True)\n", 241 | " max_auc = max(cv_result['auc-mean'])\n", 242 | " num_round = len(cv_result['auc-mean'])\n", 243 | " \n", 244 | " model = lgb.train(train_set=dtrain,early_stopping_rounds=10,num_boost_round=num_round,valid_sets=dtest,params=params)\n", 245 | " \n", 246 | " model_pre = list(model.predict(test_x))\n", 247 | " result_df = pd.DataFrame({'Idx':test_user_id,\n", 248 | " 'score':model_pre})\n", 249 | " return result_df\n", 250 | "\n", 251 | "# 对随机种子,bagging_fraction,feature_fraction及特征数量进行随机扰动\n", 252 | "random_seed = list(range(2018))\n", 253 | "bagging_fraction = [i/1000.0 for i in range(500,1000)]\n", 254 | "feature_fraction = [i/1000.0 for i in range(500,1000)]\n", 255 | "n_feature = list(range(50,174,2))\n", 256 | "\n", 257 | "random.shuffle(random_seed)\n", 258 | 
"random.shuffle(bagging_fraction)\n", 259 | "random.shuffle(feature_fraction)\n", 260 | "random.shuffle(n_feature)" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": { 267 | "ExecuteTime": { 268 | "end_time": "2019-02-17T03:18:59.134796Z", 269 | "start_time": "2019-02-17T03:18:33.508Z" 270 | } 271 | }, 272 | "outputs": [], 273 | "source": [ 274 | "import time \n", 275 | "a= time.time()\n", 276 | "result_df_list=[]\n", 277 | "# 建立30个子模型,保存各个子模型输出的结果\n", 278 | "for i in range(30):\n", 279 | " result_df = bagging_lightgbm(feature_fraction=feature_fraction[i],\n", 280 | " n_feature=n_feature[i],\n", 281 | " ramdom_seed=random_seed[i],\n", 282 | " bagging_fraction=bagging_fraction[i])\n", 283 | " result_df_list.append(result_df)\n", 284 | "# 对30个子模型的结果average,得到bagging模型的最终结果\n", 285 | "prep_list = [list(x['prep']) for x in result_df_list]\n", 286 | "bagging_prep= list(np.sum(score_list,axis=0)/30)\n", 287 | "b = time.time()\n", 288 | "print('运行时间:{}'.format(round(b-a,0)))" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": 103, 294 | "metadata": { 295 | "ExecuteTime": { 296 | "end_time": "2019-02-16T14:41:12.488155Z", 297 | "start_time": "2019-02-16T14:41:12.313145Z" 298 | } 299 | }, 300 | "outputs": [ 301 | { 302 | "data": { 303 | "image/png": "\n", 304 | "text/plain": [ 305 | "
" 306 | ] 307 | }, 308 | "metadata": { 309 | "needs_background": "light" 310 | }, 311 | "output_type": "display_data" 312 | } 313 | ], 314 | "source": [ 315 | "# bagging模型的AUC\n", 316 | "test_y = list(df[df.sample_status=='test']['target'])\n", 317 | "sc.plot_roc(y_label=test_y,y_pred=ss)" 318 | ] 319 | } 320 | ], 321 | "metadata": { 322 | "kernelspec": { 323 | "display_name": "Python 3", 324 | "language": "python", 325 | "name": "python3" 326 | }, 327 | "language_info": { 328 | "codemirror_mode": { 329 | "name": "ipython", 330 | "version": 3 331 | }, 332 | "file_extension": ".py", 333 | "mimetype": "text/x-python", 334 | "name": "python", 335 | "nbconvert_exporter": "python", 336 | "pygments_lexer": "ipython3", 337 | "version": "3.7.0" 338 | }, 339 | "toc": { 340 | "base_numbering": 1, 341 | "nav_menu": {}, 342 | "number_sections": true, 343 | "sideBar": true, 344 | "skip_h1_title": false, 345 | "title_cell": "Table of Contents", 346 | "title_sidebar": "Contents", 347 | "toc_cell": false, 348 | "toc_position": {}, 349 | "toc_section_display": true, 350 | "toc_window_display": false 351 | } 352 | }, 353 | "nbformat": 4, 354 | "nbformat_minor": 2 355 | } 356 | -------------------------------------------------------------------------------- /feature_select.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "C:\\Users\\Administrator\\Anaconda3\\envs\\py3\\lib\\site-packages\\sklearn\\utils\\__init__.py:4: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working\n", 13 | " from collections import Sequence\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import numpy as np \n", 19 | "import math \n", 20 | "import pandas as pd \n", 21 | 
"pd.set_option('display.float_format',lambda x:'%.3f' % x)\n", 22 | "import matplotlib.pyplot as plt \n", 23 | "plt.style.use('ggplot')\n", 24 | "%matplotlib inline\n", 25 | "import seaborn as sns \n", 26 | "sns.set_palette('muted')\n", 27 | "sns.set_style('darkgrid')\n", 28 | "import warnings\n", 29 | "warnings.filterwarnings('ignore')\n", 30 | "import os \n", 31 | "os.chdir('C:/Users/Administrator/Desktop/魔镜杯数据')\n", 32 | "import lightgbm as lgb \n", 33 | "from lightgbm import plot_importance" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "# 导入feature_processing处理过后的数据\n", 43 | "data = pd.read_csv('data1_process.csv',encoding='gb18030')\n", 44 | "periods_df = pd.read_csv('periods_feature.csv',encoding='gbk')\n", 45 | "rank_df = pd.read_csv('rank_feature.csv',encoding='gbk')\n", 46 | "update_info = pd.read_csv('update_feature.csv',encoding='gbk')\n", 47 | "log_df = pd.read_csv('log_info_feature.csv',encoding='gbk')" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 3, 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "data": { 57 | "text/plain": [ 58 | "(49701, 237)" 59 | ] 60 | }, 61 | "execution_count": 3, 62 | "metadata": {}, 63 | "output_type": "execute_result" 64 | } 65 | ], 66 | "source": [ 67 | "# 合并衍生后的变量,data1不包含排序特征和periods衍生特征\n", 68 | "data1 = pd.merge(data,update_info,on='Idx',how='left')\n", 69 | "data1 = pd.merge(data1,log_df,on='Idx',how='left')\n", 70 | "data1.shape" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 4, 76 | "metadata": { 77 | "scrolled": true 78 | }, 79 | "outputs": [ 80 | { 81 | "data": { 82 | "text/plain": [ 83 | "(49701, 412)" 84 | ] 85 | }, 86 | "execution_count": 4, 87 | "metadata": {}, 88 | "output_type": "execute_result" 89 | } 90 | ], 91 | "source": [ 92 | "# data2包含排序特征和periods衍生特征\n", 93 | "data2 = pd.concat([data1,rank_df,periods_df],axis=1)\n", 94 | "data2.shape" 95 | ] 96 | }, 97 | { 
98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "data_idx = data.Idx\n", 104 | "df1 =data1.drop(['Idx'],axis=1)# 删除Idx\n", 105 | "# 测试集训练集的划分\n", 106 | "train_fea = np.array(df1[df1.sample_status=='train'].drop(['sample_status','target'],axis=1))\n", 107 | "test_fea = np.array(df1[df1.sample_status=='test'].drop(['sample_status','target'],axis=1))\n", 108 | "train_label = np.array(df1[df1.sample_status=='train']['target']).reshape(-1,1)\n", 109 | "test_label = np.array(df1[df1.sample_status=='test']['target']).reshape(-1,1)\n", 110 | "\n", 111 | "\n", 112 | "fea_names = list(df1.drop(['sample_status','target'],axis=1).columns)# 特征名字存成列表\n", 113 | "feature_importance_values = np.zeros(len(fea_names)) # \n", 114 | "\n", 115 | "# 训练10个lightgbm,并对10个模型输出的feature_importances_取平均\n", 116 | "for _ in range(10):\n", 117 | " model = lgb.LGBMClassifier(n_estimators=1000,learning_rate=0.05,n_jobs=-1,verbose = -1)\n", 118 | " model.fit(train_fea,train_label,eval_metric='auc',\n", 119 | " eval_set = [(test_fea, test_label)],\n", 120 | " early_stopping_rounds=100,verbose = -1)\n", 121 | " feature_importance_values += model.feature_importances_/10\n", 122 | "\n", 123 | "# 将feature_importance_values存成临时表\n", 124 | "fea_imp_df1 = pd.DataFrame({'feature':fea_names,\n", 125 | " 'fea_importance':feature_importance_values})\n", 126 | "fea_imp_df1 = fea_imp_df1.sort_values('fea_importance',ascending=False).reset_index(drop=True)\n", 127 | "fea_imp_df1['norm_importance'] = fea_imp_df1['fea_importance']/fea_imp_df1['fea_importance'].sum() # 特征重要性value的归一化\n", 128 | "fea_imp_df1['cum_importance'] = np.cumsum(fea_imp_df1['norm_importance'])# 特征重要性value的累加值\n", 129 | "fea_imp_df1.head()" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 6, 135 | "metadata": {}, 136 | "outputs": [ 137 | { 138 | "data": { 139 | "image/png": "\n", 140 | "text/plain": [ 141 | "
" 142 | ] 143 | }, 144 | "metadata": { 145 | "needs_background": "light" 146 | }, 147 | "output_type": "display_data" 148 | } 149 | ], 150 | "source": [ 151 | "# 特征重要性可视化\n", 152 | "plt.figure(figsize=(16,5))\n", 153 | "plt.rcParams['font.sans-serif']=['Microsoft YaHei']\n", 154 | "plt.subplot(1,2,1)\n", 155 | "plt.title('特征重要性')\n", 156 | "sns.barplot(data=fea_imp_df1.iloc[:10,:],x='norm_importance',y='feature')\n", 157 | "plt.subplot(1,2,2)\n", 158 | "plt.title('特征重要性累加图')\n", 159 | "plt.xlabel('特征个数')\n", 160 | "plt.ylabel('cum_importance')\n", 161 | "plt.plot(list(range(1, len(fea_names)+1)),fea_imp_df1['cum_importance'], 'r-')\n", 162 | "plt.show()" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 7, 168 | "metadata": {}, 169 | "outputs": [ 170 | { 171 | "name": "stdout", 172 | "output_type": "stream", 173 | "text": [ 174 | "特征重要性为0的变量个数为 :50\n", 175 | "['_userid', '_provinceid', 'SocialNetwork_17', '_nickname', '_orderid', '_otherwebshoptype', '_phonetype', 'is_sichuan_userinfo19', '_ppdaiaccount', 'china_telecom', 'is_jilin_userinfo7', '_relationshipid', '_workyears', '_webshopurl', '_residencetypeid', '_webshoptypeid', '_schoolname', '_secondemail', '_secondmobile', '_residenceyears', 'WeblogInfo_19_H', 'WeblogInfo_19_J', '_department', 'WeblogInfo_21_B', 'WeblogInfo_19_G', 'WeblogInfo_19_F', '_age', '_bussinessaddress', '_byuserid', 'WeblogInfo_19_E', 'is_weifang_UserInfo20', '_companysizeid', '_companytypeid', '_contactid', '_creationdate', 'WeblogInfo_27', '_idnumber', '_dormitoryphone', 'is_zibo_UserInfo8', '_flag_uctobcp', '_flag_uctopvr', '_gender', '_graduatedate', '_graduateschool', '_hasbusinesslicense', '_hasbuycar', '_hasppdaiaccount', '_hassborgjj', '_idaddress', 'is_weifang_UserInfo4']\n" 176 | ] 177 | } 178 | ], 179 | "source": [ 180 | "# 剔除特征重要性为0的变量\n", 181 | "zero_imp_col = list(fea_imp_df1[fea_imp_df1.fea_importance==0].feature)\n", 182 | "fea_imp_df11 = fea_imp_df1[~(fea_imp_df1.feature.isin(zero_imp_col))]\n", 
183 | "print('特征重要性为0的变量个数为 :{}'.format(len(zero_imp_col)))\n", 184 | "print(zero_imp_col)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 8, 190 | "metadata": {}, 191 | "outputs": [ 192 | { 193 | "name": "stdout", 194 | "output_type": "stream", 195 | "text": [ 196 | "特征重要性比较弱的变量个数为:26\n", 197 | "['UserInfo_10', 'UserInfo_13', 'WeblogInfo_33', 'is_chengdu_UserInfo2', '_educationid', '_lastupdatedate', '_companyname', '_cityid', 'WeblogInfo_36', 'is_chengdu_UserInfo20', 'is_yantai_UserInfo2', 'is_tianjin_userinfo7', 'china_unicom', 'WeblogInfo_21_D', 'is_chengdu_UserInfo4', '_phone', '_position', '_regstepid', '_residenceaddress', '_residencephone', 'is_hunan_userinfo7', 'operator_unknown', 'WeblogInfo_21_A', 'WeblogInfo_21_C', 'is_sichuan_userinfo7', '_companyaddress']\n" 198 | ] 199 | } 200 | ], 201 | "source": [ 202 | "# 剔除特征重要性比较弱的变量\n", 203 | "low_imp_col = list(fea_imp_df11[fea_imp_df11.cum_importance>=0.99].feature)\n", 204 | "print('特征重要性比较弱的变量个数为:{}'.format(len(low_imp_col)))\n", 205 | "print(low_imp_col)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 9, 211 | "metadata": {}, 212 | "outputs": [ 213 | { 214 | "data": { 215 | "text/plain": [ 216 | "(49701, 160)" 217 | ] 218 | }, 219 | "execution_count": 9, 220 | "metadata": {}, 221 | "output_type": "execute_result" 222 | } 223 | ], 224 | "source": [ 225 | "# 删除特征重要性为0和比较弱的特征\n", 226 | "drop_imp_col = zero_imp_col+low_imp_col\n", 227 | "mydf1 = df1.drop(drop_imp_col,axis=1)\n", 228 | "mydf1.shape" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 10, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "# 加上训练集测试集状态,保存数据\n", 238 | "sample_status = list(df1.sample_status)\n", 239 | "mydf1['sample_status'] = sample_status\n", 240 | "mydf1['Idx'] = data_idx\n", 241 | "mydf1.to_csv('../魔镜杯数据/feature_select_data1.csv',encoding='gb18030',index=False)" 242 | ] 243 | } 244 | ], 245 | "metadata": { 246 | "kernelspec": 
{ 247 | "display_name": "Python 3", 248 | "language": "python", 249 | "name": "python3" 250 | }, 251 | "language_info": { 252 | "codemirror_mode": { 253 | "name": "ipython", 254 | "version": 3 255 | }, 256 | "file_extension": ".py", 257 | "mimetype": "text/x-python", 258 | "name": "python", 259 | "nbconvert_exporter": "python", 260 | "pygments_lexer": "ipython3", 261 | "version": "3.7.0" 262 | }, 263 | "toc": { 264 | "base_numbering": 1, 265 | "nav_menu": {}, 266 | "number_sections": true, 267 | "sideBar": true, 268 | "skip_h1_title": false, 269 | "title_cell": "Table of Contents", 270 | "title_sidebar": "Contents", 271 | "toc_cell": false, 272 | "toc_position": {}, 273 | "toc_section_display": true, 274 | "toc_window_display": false 275 | } 276 | }, 277 | "nbformat": 4, 278 | "nbformat_minor": 2 279 | } 280 | -------------------------------------------------------------------------------- /single_lightgbm_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "ExecuteTime": { 8 | "end_time": "2019-02-17T02:57:42.938856Z", 9 | "start_time": "2019-02-17T02:57:18.450164Z" 10 | } 11 | }, 12 | "outputs": [ 13 | { 14 | "name": "stderr", 15 | "output_type": "stream", 16 | "text": [ 17 | "C:\\Users\\Administrator\\Anaconda3\\envs\\py3\\lib\\site-packages\\sklearn\\utils\\__init__.py:4: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working\n", 18 | " from collections import Sequence\n", 19 | "C:\\Users\\Administrator\\Anaconda3\\envs\\py3\\lib\\site-packages\\sklearn\\ensemble\\weight_boosting.py:29: DeprecationWarning: numpy.core.umath_tests is an internal NumPy module and should not be imported. 
It will be removed in a future NumPy release.\n", 20 | " from numpy.core.umath_tests import inner1d\n" 21 | ] 22 | } 23 | ], 24 | "source": [ 25 | "import numpy as np \n", 26 | "import pandas as pd \n", 27 | "import matplotlib.pyplot as plt \n", 28 | "%matplotlib inline \n", 29 | "plt.style.use('ggplot')\n", 30 | "import seaborn as sns \n", 31 | "import os \n", 32 | "os.chdir('C:/Users/Administrator/Desktop/魔镜杯数据')\n", 33 | "import warnings \n", 34 | "warnings.filterwarnings('ignore')\n", 35 | "\n", 36 | "import lightgbm as lgb \n", 37 | "from lightgbm import plot_importance \n", 38 | "from sklearn.model_selection import GridSearchCV\n", 39 | "from sklearn.model_selection import train_test_split \n", 40 | "from sklearn import metrics\n", 41 | "from sklearn.model_selection import cross_val_score\n", 42 | "from sklearn.model_selection import StratifiedKFold\n", 43 | "\n", 44 | "import score_card as sc" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 2, 50 | "metadata": { 51 | "ExecuteTime": { 52 | "end_time": "2019-02-17T02:57:51.284077Z", 53 | "start_time": "2019-02-17T02:57:42.938856Z" 54 | } 55 | }, 56 | "outputs": [ 57 | { 58 | "data": { 59 | "text/html": [ 60 | "
\n", 61 | "\n", 74 | "\n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | "
UserInfo_1UserInfo_3WeblogInfo_2WeblogInfo_4WeblogInfo_5WeblogInfo_6WeblogInfo_7WeblogInfo_8WeblogInfo_15WeblogInfo_16..._mobilephone_qq_realname_turnoverupdate_time_cntupdate_all_cntlog_cntlog_timespanavg_log_timespanIdx
01.0004.0001.0001.0001.0001.00014.0000.0006.0000.000...1.0001.0000.0000.0001.00011.00019.0001.0000.63210001.000
11.0004.0000.0001.0001.0001.00014.0000.0000.0007.000...2.0001.0001.0000.0003.00021.00024.0001.00010.37510002.000
21.0003.0000.0002.0002.0002.0009.0003.0000.0003.000...1.0001.0000.0000.0001.00010.00014.0001.0000.50010003.000
34.0001.000nannannannan2.0000.0000.0000.000...1.0001.0000.0000.0001.00010.0007.0005.0000.00010006.000
45.0001.0000.0001.0001.0001.0003.0000.0000.0000.000...1.0001.0000.0000.0002.00010.0005.0000.0001.40010007.000
\n", 224 | "

5 rows × 161 columns

\n", 225 | "
" 226 | ], 227 | "text/plain": [ 228 | " UserInfo_1 UserInfo_3 WeblogInfo_2 WeblogInfo_4 WeblogInfo_5 \\\n", 229 | "0 1.000 4.000 1.000 1.000 1.000 \n", 230 | "1 1.000 4.000 0.000 1.000 1.000 \n", 231 | "2 1.000 3.000 0.000 2.000 2.000 \n", 232 | "3 4.000 1.000 nan nan nan \n", 233 | "4 5.000 1.000 0.000 1.000 1.000 \n", 234 | "\n", 235 | " WeblogInfo_6 WeblogInfo_7 WeblogInfo_8 WeblogInfo_15 WeblogInfo_16 \\\n", 236 | "0 1.000 14.000 0.000 6.000 0.000 \n", 237 | "1 1.000 14.000 0.000 0.000 7.000 \n", 238 | "2 2.000 9.000 3.000 0.000 3.000 \n", 239 | "3 nan 2.000 0.000 0.000 0.000 \n", 240 | "4 1.000 3.000 0.000 0.000 0.000 \n", 241 | "\n", 242 | " ... _mobilephone _qq _realname _turnover update_time_cnt \\\n", 243 | "0 ... 1.000 1.000 0.000 0.000 1.000 \n", 244 | "1 ... 2.000 1.000 1.000 0.000 3.000 \n", 245 | "2 ... 1.000 1.000 0.000 0.000 1.000 \n", 246 | "3 ... 1.000 1.000 0.000 0.000 1.000 \n", 247 | "4 ... 1.000 1.000 0.000 0.000 2.000 \n", 248 | "\n", 249 | " update_all_cnt log_cnt log_timespan avg_log_timespan Idx \n", 250 | "0 11.000 19.000 1.000 0.632 10001.000 \n", 251 | "1 21.000 24.000 1.000 10.375 10002.000 \n", 252 | "2 10.000 14.000 1.000 0.500 10003.000 \n", 253 | "3 10.000 7.000 5.000 0.000 10006.000 \n", 254 | "4 10.000 5.000 0.000 1.400 10007.000 \n", 255 | "\n", 256 | "[5 rows x 161 columns]" 257 | ] 258 | }, 259 | "execution_count": 2, 260 | "metadata": {}, 261 | "output_type": "execute_result" 262 | } 263 | ], 264 | "source": [ 265 | "df = pd.read_csv('feature_select_data1.csv',encoding='gb18030')\n", 266 | "df.head()" 267 | ] 268 | }, 269 | { 270 | "cell_type": "markdown", 271 | "metadata": {}, 272 | "source": [ 273 | "# 两种版本的lgb默认参数模型" 274 | ] 275 | }, 276 | { 277 | "cell_type": "markdown", 278 | "metadata": {}, 279 | "source": [ 280 | "## sklearn版本" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 3, 286 | "metadata": { 287 | "ExecuteTime": { 288 | "end_time": "2019-02-17T02:57:51.599095Z", 289 | "start_time": 
"2019-02-17T02:57:51.295077Z" 290 | } 291 | }, 292 | "outputs": [], 293 | "source": [ 294 | "# 默认参数模型\n", 295 | "x_train = df[df.sample_status=='train'].drop(['Idx','sample_status','target'],axis=1)\n", 296 | "x_test = df[df.sample_status=='test'].drop(['Idx','sample_status','target'],axis=1)\n", 297 | "y_train = df[df.sample_status=='train']['target']\n", 298 | "y_test = df[df.sample_status=='test']['target']" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 4, 304 | "metadata": {}, 305 | "outputs": [ 306 | { 307 | "name": "stdout", 308 | "output_type": "stream", 309 | "text": [ 310 | "运行时间为6.0秒\n" 311 | ] 312 | } 313 | ], 314 | "source": [ 315 | "import time\n", 316 | "start = time.time()\n", 317 | "lgb_sklearn = lgb.LGBMClassifier(random_state=0).fit(x_train,y_train)\n", 318 | "end = time.time()\n", 319 | "print('运行时间为{}秒'.format(round(end-start,0)))" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": 5, 325 | "metadata": {}, 326 | "outputs": [ 327 | { 328 | "data": { 329 | "image/png": "\n", 330 | "text/plain": [ 331 | "
" 332 | ] 333 | }, 334 | "metadata": { 335 | "needs_background": "light" 336 | }, 337 | "output_type": "display_data" 338 | } 339 | ], 340 | "source": [ 341 | "# 默认参数模型的AUC\n", 342 | "lgb_sklearn_pre = lgb_sklearn.predict_proba(x_test)[:,1]\n", 343 | "sc.plot_roc(y_test,lgb_sklearn_pre)" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": 6, 349 | "metadata": {}, 350 | "outputs": [ 351 | { 352 | "data": { 353 | "text/plain": [ 354 | "{'boosting_type': 'gbdt',\n", 355 | " 'class_weight': None,\n", 356 | " 'colsample_bytree': 1.0,\n", 357 | " 'importance_type': 'split',\n", 358 | " 'learning_rate': 0.1,\n", 359 | " 'max_depth': -1,\n", 360 | " 'min_child_samples': 20,\n", 361 | " 'min_child_weight': 0.001,\n", 362 | " 'min_split_gain': 0.0,\n", 363 | " 'n_estimators': 100,\n", 364 | " 'n_jobs': -1,\n", 365 | " 'num_leaves': 31,\n", 366 | " 'objective': None,\n", 367 | " 'random_state': 0,\n", 368 | " 'reg_alpha': 0.0,\n", 369 | " 'reg_lambda': 0.0,\n", 370 | " 'silent': True,\n", 371 | " 'subsample': 1.0,\n", 372 | " 'subsample_for_bin': 200000,\n", 373 | " 'subsample_freq': 0}" 374 | ] 375 | }, 376 | "execution_count": 6, 377 | "metadata": {}, 378 | "output_type": "execute_result" 379 | } 380 | ], 381 | "source": [ 382 | "lgb_sklearn.get_params()" 383 | ] 384 | }, 385 | { 386 | "cell_type": "markdown", 387 | "metadata": {}, 388 | "source": [ 389 | "## 原生版本 " 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 14, 395 | "metadata": { 396 | "scrolled": true 397 | }, 398 | "outputs": [ 399 | { 400 | "name": "stdout", 401 | "output_type": "stream", 402 | "text": [ 403 | "[1]\tvalid_0's auc: 0.675718\n", 404 | "Training until validation scores don't improve for 10 rounds.\n", 405 | "[2]\tvalid_0's auc: 0.683763\n", 406 | "[3]\tvalid_0's auc: 0.689045\n", 407 | "[4]\tvalid_0's auc: 0.688154\n", 408 | "[5]\tvalid_0's auc: 0.692355\n", 409 | "[6]\tvalid_0's auc: 0.692386\n", 410 | "[7]\tvalid_0's auc: 0.697619\n", 411 | 
"[8]\tvalid_0's auc: 0.699751\n", 412 | "[9]\tvalid_0's auc: 0.70051\n", 413 | "[10]\tvalid_0's auc: 0.702275\n", 414 | "[11]\tvalid_0's auc: 0.706518\n", 415 | "[12]\tvalid_0's auc: 0.70864\n", 416 | "[13]\tvalid_0's auc: 0.713132\n", 417 | "[14]\tvalid_0's auc: 0.715673\n", 418 | "[15]\tvalid_0's auc: 0.717739\n", 419 | "[16]\tvalid_0's auc: 0.719119\n", 420 | "[17]\tvalid_0's auc: 0.72115\n", 421 | "[18]\tvalid_0's auc: 0.723824\n", 422 | "[19]\tvalid_0's auc: 0.724232\n", 423 | "[20]\tvalid_0's auc: 0.726006\n", 424 | "[21]\tvalid_0's auc: 0.726508\n", 425 | "[22]\tvalid_0's auc: 0.726892\n", 426 | "[23]\tvalid_0's auc: 0.727921\n", 427 | "[24]\tvalid_0's auc: 0.729418\n", 428 | "[25]\tvalid_0's auc: 0.73087\n", 429 | "[26]\tvalid_0's auc: 0.732294\n", 430 | "[27]\tvalid_0's auc: 0.7336\n", 431 | "[28]\tvalid_0's auc: 0.734957\n", 432 | "[29]\tvalid_0's auc: 0.736162\n", 433 | "[30]\tvalid_0's auc: 0.737107\n", 434 | "[31]\tvalid_0's auc: 0.736938\n", 435 | "[32]\tvalid_0's auc: 0.73804\n", 436 | "[33]\tvalid_0's auc: 0.737969\n", 437 | "[34]\tvalid_0's auc: 0.738373\n", 438 | "[35]\tvalid_0's auc: 0.738153\n", 439 | "[36]\tvalid_0's auc: 0.739998\n", 440 | "[37]\tvalid_0's auc: 0.739689\n", 441 | "[38]\tvalid_0's auc: 0.740843\n", 442 | "[39]\tvalid_0's auc: 0.741177\n", 443 | "[40]\tvalid_0's auc: 0.741063\n", 444 | "[41]\tvalid_0's auc: 0.740791\n", 445 | "[42]\tvalid_0's auc: 0.741013\n", 446 | "[43]\tvalid_0's auc: 0.741408\n", 447 | "[44]\tvalid_0's auc: 0.741923\n", 448 | "[45]\tvalid_0's auc: 0.741994\n", 449 | "[46]\tvalid_0's auc: 0.74203\n", 450 | "[47]\tvalid_0's auc: 0.741826\n", 451 | "[48]\tvalid_0's auc: 0.741808\n", 452 | "[49]\tvalid_0's auc: 0.741153\n", 453 | "[50]\tvalid_0's auc: 0.740779\n", 454 | "[51]\tvalid_0's auc: 0.741177\n", 455 | "[52]\tvalid_0's auc: 0.741106\n", 456 | "[53]\tvalid_0's auc: 0.741315\n", 457 | "[54]\tvalid_0's auc: 0.740231\n", 458 | "[55]\tvalid_0's auc: 0.739891\n", 459 | "[56]\tvalid_0's auc: 0.740211\n", 460 | 
"Early stopping, best iteration is:\n", 461 | "[46]\tvalid_0's auc: 0.74203\n", 462 | "运行时间为6.0秒\n" 463 | ] 464 | } 465 | ], 466 | "source": [ 467 | "# 原生的lightgbm\n", 468 | "lgb_train = lgb.Dataset(x_train,y_train)\n", 469 | "lgb_test = lgb.Dataset(x_test,y_test,reference=lgb_train)\n", 470 | "lgb_origi_params = {'boosting_type':'gbdt',\n", 471 | " 'max_depth':-1,\n", 472 | " 'num_leaves':31,\n", 473 | " 'bagging_fraction':1.0,\n", 474 | " 'feature_fraction':1.0,\n", 475 | " 'learning_rate':0.1,\n", 476 | " 'metric': 'auc'}\n", 477 | "start = time.time()\n", 478 | "lgb_origi = lgb.train(train_set=lgb_train,\n", 479 | " early_stopping_rounds=10,\n", 480 | " num_boost_round=400,\n", 481 | " params=lgb_origi_params,\n", 482 | " valid_sets=lgb_test)\n", 483 | "end = time.time()\n", 484 | "print('运行时间为{}秒'.format(round(end-start,0)))" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": 13, 490 | "metadata": { 491 | "scrolled": false 492 | }, 493 | "outputs": [ 494 | { 495 | "data": { 496 | "image/png": "\n", 497 | "text/plain": [ 498 | "
" 499 | ] 500 | }, 501 | "metadata": { 502 | "needs_background": "light" 503 | }, 504 | "output_type": "display_data" 505 | } 506 | ], 507 | "source": [ 508 | "# 原生的lightgbm的AUC\n", 509 | "lgb_origi_pre = lgb_origi.predict(x_test)\n", 510 | "sc.plot_roc(y_test,lgb_origi_pre)" 511 | ] 512 | }, 513 | { 514 | "cell_type": "markdown", 515 | "metadata": {}, 516 | "source": [ 517 | "# 调参" 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": 18, 523 | "metadata": {}, 524 | "outputs": [], 525 | "source": [ 526 | "# 确定最大迭代次数,学习率设为0.1 \n", 527 | "base_parmas={'boosting_type':'gbdt',\n", 528 | " 'learning_rate':0.1,\n", 529 | " 'num_leaves':40,\n", 530 | " 'max_depth':-1,\n", 531 | " 'bagging_fraction':0.8,\n", 532 | " 'feature_fraction':0.8,\n", 533 | " 'lambda_l1':0,\n", 534 | " 'lambda_l2':0,\n", 535 | " 'min_data_in_leaf':20,\n", 536 | " 'min_sum_hessian_in_leaf':0.001,\n", 537 | " 'metric':'auc'}\n", 538 | "cv_result = lgb.cv(train_set=lgb_train,\n", 539 | " num_boost_round=200,\n", 540 | " early_stopping_rounds=5,\n", 541 | " nfold=5,\n", 542 | " stratified=True,\n", 543 | " shuffle=True,\n", 544 | " params=base_parmas,\n", 545 | " metrics='auc',\n", 546 | " seed=0)" 547 | ] 548 | }, 549 | { 550 | "cell_type": "code", 551 | "execution_count": 20, 552 | "metadata": {}, 553 | "outputs": [ 554 | { 555 | "name": "stdout", 556 | "output_type": "stream", 557 | "text": [ 558 | "最大的迭代次数: 51\n", 559 | "交叉验证的AUC: 0.7271732572229754\n" 560 | ] 561 | } 562 | ], 563 | "source": [ 564 | "print('最大的迭代次数: {}'.format(len(cv_result['auc-mean'])))\n", 565 | "print('交叉验证的AUC: {}'.format(max(cv_result['auc-mean'])))" 566 | ] 567 | }, 568 | { 569 | "cell_type": "code", 570 | "execution_count": 24, 571 | "metadata": {}, 572 | "outputs": [ 573 | { 574 | "name": "stdout", 575 | "output_type": "stream", 576 | "text": [ 577 | "运行时间为:109.0\n" 578 | ] 579 | } 580 | ], 581 | "source": [ 582 | "# num_leaves ,步长设为5\n", 583 | "param_find1 = {'num_leaves':range(30,60,5)}\n", 584 | 
"cv_fold = StratifiedKFold(n_splits=5,random_state=0,shuffle=True)\n", 585 | "start = time.time()\n", 586 | "grid_search1 = GridSearchCV(estimator=lgb.LGBMClassifier(learning_rate=0.1,\n", 587 | " n_estimators = 51,\n", 588 | " max_depth=-1,\n", 589 | " min_child_weight=0.001,\n", 590 | " min_child_samples=20,\n", 591 | " subsample=0.8,\n", 592 | " colsample_bytree=0.8,\n", 593 | " reg_lambda=0,\n", 594 | " reg_alpha=0),\n", 595 | " cv = cv_fold,\n", 596 | " n_jobs=-1,\n", 597 | " param_grid = param_find1,\n", 598 | " scoring='roc_auc')\n", 599 | "grid_search1.fit(x_train,y_train)\n", 600 | "end = time.time()\n", 601 | "print('运行时间为:{}'.format(round(end-start,0)))" 602 | ] 603 | }, 604 | { 605 | "cell_type": "code", 606 | "execution_count": 29, 607 | "metadata": {}, 608 | "outputs": [ 609 | { 610 | "name": "stdout", 611 | "output_type": "stream", 612 | "text": [ 613 | "[mean: 0.73008, std: 0.01408, params: {'num_leaves': 30}, mean: 0.72994, std: 0.01638, params: {'num_leaves': 35}, mean: 0.72868, std: 0.01652, params: {'num_leaves': 40}, mean: 0.72776, std: 0.01038, params: {'num_leaves': 45}, mean: 0.72917, std: 0.01601, params: {'num_leaves': 50}, mean: 0.72519, std: 0.01338, params: {'num_leaves': 55}]\n", 614 | "\t\n", 615 | "{'num_leaves': 30}\n", 616 | "\t\n", 617 | "0.7300782078536177\n" 618 | ] 619 | } 620 | ], 621 | "source": [ 622 | "print(grid_search1.grid_scores_)\n", 623 | "print('\\t')\n", 624 | "print(grid_search1.best_params_)\n", 625 | "print('\\t')\n", 626 | "print(grid_search1.best_score_)" 627 | ] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "execution_count": 30, 632 | "metadata": {}, 633 | "outputs": [ 634 | { 635 | "name": "stdout", 636 | "output_type": "stream", 637 | "text": [ 638 | "[mean: 0.73327, std: 0.01248, params: {'num_leaves': 26}, mean: 0.73188, std: 0.01426, params: {'num_leaves': 28}, mean: 0.73355, std: 0.01589, params: {'num_leaves': 30}, mean: 0.73318, std: 0.01272, params: {'num_leaves': 32}]\n", 639 | "\t\n", 640 | 
"{'num_leaves': 30}\n", 641 | "\t\n", 642 | "0.733552244998121\n" 643 | ] 644 | } 645 | ], 646 | "source": [ 647 | "# num_leaves,步长设为2 \n", 648 | "param_find2 = {'num_leaves':range(26,34,2)}\n", 649 | "grid_search2 = GridSearchCV(estimator=lgb.LGBMClassifier(n_estimators=51,\n", 650 | " learning_rate=0.1,\n", 651 | " min_child_weight=0.001,\n", 652 | " min_child_samples=20,\n", 653 | " subsample=0.8,\n", 654 | " colsample_bytree=0.8,\n", 655 | " reg_lambda=0,\n", 656 | " reg_alpha=0\n", 657 | " ),\n", 658 | " cv=cv_fold,\n", 659 | " n_jobs=-1,\n", 660 | " scoring='roc_auc',\n", 661 | " param_grid = param_find2)\n", 662 | "grid_search2.fit(x_train,y_train)\n", 663 | "print(grid_search2.grid_scores_)\n", 664 | "print('\\t')\n", 665 | "print(grid_search2.best_params_)\n", 666 | "print('\\t')\n", 667 | "print(grid_search2.best_score_)" 668 | ] 669 | }, 670 | { 671 | "cell_type": "code", 672 | "execution_count": 36, 673 | "metadata": {}, 674 | "outputs": [ 675 | { 676 | "name": "stdout", 677 | "output_type": "stream", 678 | "text": [ 679 | "运行时间:312.0 秒\n", 680 | "[mean: 0.73155, std: 0.01112, params: {'min_child_samples': 15, 'min_child_weight': 0.001}, mean: 0.73155, std: 0.01112, params: {'min_child_samples': 15, 'min_child_weight': 0.002}, mean: 0.73155, std: 0.01112, params: {'min_child_samples': 15, 'min_child_weight': 0.003}, mean: 0.73355, std: 0.01589, params: {'min_child_samples': 20, 'min_child_weight': 0.001}, mean: 0.73355, std: 0.01589, params: {'min_child_samples': 20, 'min_child_weight': 0.002}, mean: 0.73355, std: 0.01589, params: {'min_child_samples': 20, 'min_child_weight': 0.003}, mean: 0.73206, std: 0.01434, params: {'min_child_samples': 25, 'min_child_weight': 0.001}, mean: 0.73206, std: 0.01434, params: {'min_child_samples': 25, 'min_child_weight': 0.002}, mean: 0.73206, std: 0.01434, params: {'min_child_samples': 25, 'min_child_weight': 0.003}, mean: 0.73210, std: 0.01145, params: {'min_child_samples': 30, 'min_child_weight': 0.001}, mean: 0.73210, 
std: 0.01145, params: {'min_child_samples': 30, 'min_child_weight': 0.002}, mean: 0.73210, std: 0.01145, params: {'min_child_samples': 30, 'min_child_weight': 0.003}]\n", 681 | "\t\n", 682 | "{'min_child_samples': 20, 'min_child_weight': 0.001}\n", 683 | "\t\n", 684 | "0.733552244998121\n" 685 | ] 686 | } 687 | ], 688 | "source": [ 689 | "# 确定num_leaves 为30 ,下面进行min_child_samples 和 min_child_weight的调参,设定步长为5\n", 690 | "param_find3 = {'min_child_samples':range(15,35,5),\n", 691 | " 'min_child_weight':[x/1000 for x in range(1,4,1)]}\n", 692 | "grid_search3 = GridSearchCV(estimator=lgb.LGBMClassifier(n_estimators=51,\n", 693 | " learning_rate=0.1,\n", 694 | " num_leaves=30,\n", 695 | " subsample=0.8,\n", 696 | " colsample_bytree=0.8,\n", 697 | " reg_lambda=0,\n", 698 | " reg_alpha=0\n", 699 | " ),\n", 700 | " cv=cv_fold,\n", 701 | " scoring='roc_auc',\n", 702 | " param_grid = param_find3,\n", 703 | " n_jobs=-1)\n", 704 | "start = time.time()\n", 705 | "grid_search3.fit(x_train,y_train)\n", 706 | "end = time.time()\n", 707 | "print('运行时间:{} 秒'.format(round(end-start,0)))\n", 708 | "print(grid_search3.grid_scores_)\n", 709 | "print('\\t')\n", 710 | "print(grid_search3.best_params_)\n", 711 | "print('\\t')\n", 712 | "print(grid_search3.best_score_)" 713 | ] 714 | }, 715 | { 716 | "cell_type": "code", 717 | "execution_count": 37, 718 | "metadata": {}, 719 | "outputs": [ 720 | { 721 | "name": "stdout", 722 | "output_type": "stream", 723 | "text": [ 724 | "运行时间:826.0 秒\n", 725 | "[mean: 0.73467, std: 0.01475, params: {'colsample_bytree': 0.5, 'subsample': 0.5}, mean: 0.73467, std: 0.01475, params: {'colsample_bytree': 0.5, 'subsample': 0.6}, mean: 0.73467, std: 0.01475, params: {'colsample_bytree': 0.5, 'subsample': 0.7}, mean: 0.73467, std: 0.01475, params: {'colsample_bytree': 0.5, 'subsample': 0.8}, mean: 0.73467, std: 0.01475, params: {'colsample_bytree': 0.5, 'subsample': 0.9}, mean: 0.73467, std: 0.01475, params: {'colsample_bytree': 0.5, 'subsample': 1.0}, mean: 
0.73500, std: 0.01559, params: {'colsample_bytree': 0.6, 'subsample': 0.5}, mean: 0.73500, std: 0.01559, params: {'colsample_bytree': 0.6, 'subsample': 0.6}, mean: 0.73500, std: 0.01559, params: {'colsample_bytree': 0.6, 'subsample': 0.7}, mean: 0.73500, std: 0.01559, params: {'colsample_bytree': 0.6, 'subsample': 0.8}, mean: 0.73500, std: 0.01559, params: {'colsample_bytree': 0.6, 'subsample': 0.9}, mean: 0.73500, std: 0.01559, params: {'colsample_bytree': 0.6, 'subsample': 1.0}, mean: 0.73053, std: 0.01389, params: {'colsample_bytree': 0.7, 'subsample': 0.5}, mean: 0.73053, std: 0.01389, params: {'colsample_bytree': 0.7, 'subsample': 0.6}, mean: 0.73053, std: 0.01389, params: {'colsample_bytree': 0.7, 'subsample': 0.7}, mean: 0.73053, std: 0.01389, params: {'colsample_bytree': 0.7, 'subsample': 0.8}, mean: 0.73053, std: 0.01389, params: {'colsample_bytree': 0.7, 'subsample': 0.9}, mean: 0.73053, std: 0.01389, params: {'colsample_bytree': 0.7, 'subsample': 1.0}, mean: 0.73355, std: 0.01589, params: {'colsample_bytree': 0.8, 'subsample': 0.5}, mean: 0.73355, std: 0.01589, params: {'colsample_bytree': 0.8, 'subsample': 0.6}, mean: 0.73355, std: 0.01589, params: {'colsample_bytree': 0.8, 'subsample': 0.7}, mean: 0.73355, std: 0.01589, params: {'colsample_bytree': 0.8, 'subsample': 0.8}, mean: 0.73355, std: 0.01589, params: {'colsample_bytree': 0.8, 'subsample': 0.9}, mean: 0.73355, std: 0.01589, params: {'colsample_bytree': 0.8, 'subsample': 1.0}, mean: 0.73304, std: 0.01103, params: {'colsample_bytree': 0.9, 'subsample': 0.5}, mean: 0.73304, std: 0.01103, params: {'colsample_bytree': 0.9, 'subsample': 0.6}, mean: 0.73304, std: 0.01103, params: {'colsample_bytree': 0.9, 'subsample': 0.7}, mean: 0.73304, std: 0.01103, params: {'colsample_bytree': 0.9, 'subsample': 0.8}, mean: 0.73304, std: 0.01103, params: {'colsample_bytree': 0.9, 'subsample': 0.9}, mean: 0.73304, std: 0.01103, params: {'colsample_bytree': 0.9, 'subsample': 1.0}, mean: 0.73427, std: 0.01462, params: 
{'colsample_bytree': 1.0, 'subsample': 0.5}, mean: 0.73427, std: 0.01462, params: {'colsample_bytree': 1.0, 'subsample': 0.6}, mean: 0.73427, std: 0.01462, params: {'colsample_bytree': 1.0, 'subsample': 0.7}, mean: 0.73427, std: 0.01462, params: {'colsample_bytree': 1.0, 'subsample': 0.8}, mean: 0.73427, std: 0.01462, params: {'colsample_bytree': 1.0, 'subsample': 0.9}, mean: 0.73427, std: 0.01462, params: {'colsample_bytree': 1.0, 'subsample': 1.0}]\n", 726 | "\t\n", 727 | "{'colsample_bytree': 0.6, 'subsample': 0.5}\n", 728 | "\t\n", 729 | "0.7349957573843382\n" 730 | ] 731 | } 732 | ], 733 | "source": [ 734 | "# 确定min_child_weight为0.001,min_child_samples为20,下面对subsample和colsample_bytree进行调参\n", 735 | "param_find4 = {'subsample':[x/10 for x in range(5,11,1)],\n", 736 | " 'colsample_bytree':[x/10 for x in range(5,11,1)]}\n", 737 | "grid_search4 = GridSearchCV(estimator=lgb.LGBMClassifier(n_estimators=51,\n", 738 | " learning_rate=0.1,\n", 739 | " min_child_samples=20,\n", 740 | " min_child_weight=0.001,\n", 741 | " num_leaves=30,\n", 742 | " subsample=0.8,\n", 743 | " colsample_bytree=0.8,\n", 744 | " reg_lambda=0,\n", 745 | " reg_alpha=0\n", 746 | " ),\n", 747 | " cv=cv_fold,\n", 748 | " scoring='roc_auc',\n", 749 | " param_grid = param_find4,\n", 750 | " n_jobs=-1)\n", 751 | "start = time.time()\n", 752 | "grid_search4.fit(x_train,y_train)\n", 753 | "end = time.time()\n", 754 | "print('运行时间:{} 秒'.format(round(end-start,0)))\n", 755 | "print(grid_search4.grid_scores_)\n", 756 | "print('\\t')\n", 757 | "print(grid_search4.best_params_)\n", 758 | "print('\\t')\n", 759 | "print(grid_search4.best_score_)" 760 | ] 761 | }, 762 | { 763 | "cell_type": "code", 764 | "execution_count": 39, 765 | "metadata": {}, 766 | "outputs": [ 767 | { 768 | "name": "stdout", 769 | "output_type": "stream", 770 | "text": [ 771 | "运行时间:692.0 秒\n", 772 | "[mean: 0.73386, std: 0.01566, params: {'reg_alpha': 0.001, 'reg_lambda': 0.001}, mean: 0.73284, std: 0.01099, params: {'reg_alpha': 0.001, 
'reg_lambda': 0.01}, mean: 0.73024, std: 0.01294, params: {'reg_alpha': 0.001, 'reg_lambda': 0.03}, mean: 0.73565, std: 0.01237, params: {'reg_alpha': 0.001, 'reg_lambda': 0.08}, mean: 0.73300, std: 0.01580, params: {'reg_alpha': 0.001, 'reg_lambda': 0.1}, mean: 0.73713, std: 0.01489, params: {'reg_alpha': 0.001, 'reg_lambda': 0.3}, mean: 0.73173, std: 0.01727, params: {'reg_alpha': 0.01, 'reg_lambda': 0.001}, mean: 0.73586, std: 0.01282, params: {'reg_alpha': 0.01, 'reg_lambda': 0.01}, mean: 0.73424, std: 0.01136, params: {'reg_alpha': 0.01, 'reg_lambda': 0.03}, mean: 0.73601, std: 0.01579, params: {'reg_alpha': 0.01, 'reg_lambda': 0.08}, mean: 0.73688, std: 0.01218, params: {'reg_alpha': 0.01, 'reg_lambda': 0.1}, mean: 0.73459, std: 0.01598, params: {'reg_alpha': 0.01, 'reg_lambda': 0.3}, mean: 0.73395, std: 0.01492, params: {'reg_alpha': 0.03, 'reg_lambda': 0.001}, mean: 0.73688, std: 0.01137, params: {'reg_alpha': 0.03, 'reg_lambda': 0.01}, mean: 0.73430, std: 0.01592, params: {'reg_alpha': 0.03, 'reg_lambda': 0.03}, mean: 0.73501, std: 0.01268, params: {'reg_alpha': 0.03, 'reg_lambda': 0.08}, mean: 0.73462, std: 0.01437, params: {'reg_alpha': 0.03, 'reg_lambda': 0.1}, mean: 0.73890, std: 0.01465, params: {'reg_alpha': 0.03, 'reg_lambda': 0.3}, mean: 0.73408, std: 0.01293, params: {'reg_alpha': 0.08, 'reg_lambda': 0.001}, mean: 0.73217, std: 0.01456, params: {'reg_alpha': 0.08, 'reg_lambda': 0.01}, mean: 0.73468, std: 0.01092, params: {'reg_alpha': 0.08, 'reg_lambda': 0.03}, mean: 0.73542, std: 0.01050, params: {'reg_alpha': 0.08, 'reg_lambda': 0.08}, mean: 0.73603, std: 0.01564, params: {'reg_alpha': 0.08, 'reg_lambda': 0.1}, mean: 0.73706, std: 0.01759, params: {'reg_alpha': 0.08, 'reg_lambda': 0.3}, mean: 0.72988, std: 0.01310, params: {'reg_alpha': 0.1, 'reg_lambda': 0.001}, mean: 0.73350, std: 0.01248, params: {'reg_alpha': 0.1, 'reg_lambda': 0.01}, mean: 0.73526, std: 0.01280, params: {'reg_alpha': 0.1, 'reg_lambda': 0.03}, mean: 0.73386, std: 0.01461, 
params: {'reg_alpha': 0.1, 'reg_lambda': 0.08}, mean: 0.73635, std: 0.01596, params: {'reg_alpha': 0.1, 'reg_lambda': 0.1}, mean: 0.73542, std: 0.01512, params: {'reg_alpha': 0.1, 'reg_lambda': 0.3}, mean: 0.73620, std: 0.00951, params: {'reg_alpha': 0.3, 'reg_lambda': 0.001}, mean: 0.73713, std: 0.01541, params: {'reg_alpha': 0.3, 'reg_lambda': 0.01}, mean: 0.73943, std: 0.01238, params: {'reg_alpha': 0.3, 'reg_lambda': 0.03}, mean: 0.73593, std: 0.01351, params: {'reg_alpha': 0.3, 'reg_lambda': 0.08}, mean: 0.73402, std: 0.01277, params: {'reg_alpha': 0.3, 'reg_lambda': 0.1}, mean: 0.73655, std: 0.00920, params: {'reg_alpha': 0.3, 'reg_lambda': 0.3}]\n", 773 | "\t\n", 774 | "{'reg_alpha': 0.3, 'reg_lambda': 0.03}\n", 775 | "\t\n", 776 | "0.739431056578461\n" 777 | ] 778 | } 779 | ], 780 | "source": [ 781 | "param_find5 = {'reg_lambda':[0.001,0.01,0.03,0.08,0.1,0.3],\n", 782 | " 'reg_alpha':[0.001,0.01,0.03,0.08,0.1,0.3]}\n", 783 | "grid_search5 = GridSearchCV(estimator=lgb.LGBMClassifier(n_estimators=51,\n", 784 | " learning_rate=0.1,\n", 785 | " min_child_samples=20,\n", 786 | " min_child_weight=0.001,\n", 787 | " num_leaves=30,\n", 788 | " subsample=0.5,\n", 789 | " colsample_bytree=0.6,\n", 790 | " ),\n", 791 | " cv=cv_fold,\n", 792 | " scoring='roc_auc',\n", 793 | " param_grid = param_find5,\n", 794 | " n_jobs=-1)\n", 795 | "start = time.time()\n", 796 | "grid_search5.fit(x_train,y_train)\n", 797 | "end = time.time()\n", 798 | "print('运行时间:{} 秒'.format(round(end-start,0)))\n", 799 | "print(grid_search5.grid_scores_)\n", 800 | "print('\\t')\n", 801 | "print(grid_search5.best_params_)\n", 802 | "print('\\t')\n", 803 | "print(grid_search5.best_score_)" 804 | ] 805 | }, 806 | { 807 | "cell_type": "code", 808 | "execution_count": 44, 809 | "metadata": {}, 810 | "outputs": [], 811 | "source": [ 812 | "# 将最佳参数再次带入cv函数,设定学习率为0.005\n", 813 | "best_params = {\n", 814 | " 'boosting_type':'gbdt',\n", 815 | " 'learning_rate':0.005,\n", 816 | " 'num_leaves':30,\n", 817 | " 
'max_depth':-1,\n", 818 | " 'bagging_fraction':0.5,\n", 819 | " 'feature_fraction':0.6,\n", 820 | " 'min_data_in_leaf':20,\n", 821 | " 'min_sum_hessian_in_leaf':0.001,\n", 822 | " 'lambda_l1':0.3,\n", 823 | " 'lambda_l2':0.03,\n", 824 | " 'metric':'auc'\n", 825 | "}\n", 826 | "\n", 827 | "best_cv = lgb.cv(train_set=lgb_train,\n", 828 | " early_stopping_rounds=5,\n", 829 | " num_boost_round=2000,\n", 830 | " nfold=5,\n", 831 | " params=best_params,\n", 832 | " metrics='auc',\n", 833 | " stratified=True,\n", 834 | " shuffle=True,\n", 835 | " seed=0)" 836 | ] 837 | }, 838 | { 839 | "cell_type": "code", 840 | "execution_count": 50, 841 | "metadata": { 842 | "scrolled": true 843 | }, 844 | "outputs": [ 845 | { 846 | "name": "stdout", 847 | "output_type": "stream", 848 | "text": [ 849 | "最佳参数的迭代次数: 889\n", 850 | "交叉验证的AUC: 0.7357671213094057\n" 851 | ] 852 | } 853 | ], 854 | "source": [ 855 | "print('最佳参数的迭代次数: {}'.format(len(best_cv['auc-mean'])))\n", 856 | "print('交叉验证的AUC: {}'.format(max(best_cv['auc-mean'])))" 857 | ] 858 | }, 859 | { 860 | "cell_type": "code", 861 | "execution_count": 61, 862 | "metadata": {}, 863 | "outputs": [ 864 | { 865 | "data": { 866 | "text/plain": [ 867 | "LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.6,\n", 868 | " importance_type='split', learning_rate=0.005, max_depth=-1,\n", 869 | " min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,\n", 870 | " n_estimators=900, n_jobs=-1, num_leaves=30, objective=None,\n", 871 | " random_state=0, reg_alpha=0.3, reg_lambda=0.03, silent=True,\n", 872 | " subsample=0.5, subsample_for_bin=200000, subsample_freq=0)" 873 | ] 874 | }, 875 | "execution_count": 61, 876 | "metadata": {}, 877 | "output_type": "execute_result" 878 | } 879 | ], 880 | "source": [ 881 | "lgb_single_model = lgb.LGBMClassifier(n_estimators=900,\n", 882 | " learning_rate=0.005,\n", 883 | " min_child_weight=0.001,\n", 884 | " min_child_samples = 20,\n", 885 | " subsample=0.5,\n", 886 | " 
colsample_bytree=0.6,\n", 887 | " num_leaves=30,\n", 888 | " max_depth=-1,\n", 889 | " reg_lambda=0.03,\n", 890 | " reg_alpha=0.3,\n", 891 | " random_state=0)\n", 892 | "lgb_single_model.fit(x_train,y_train)" 893 | ] 894 | }, 895 | { 896 | "cell_type": "code", 897 | "execution_count": 64, 898 | "metadata": {}, 899 | "outputs": [ 900 | { 901 | "name": "stdout", 902 | "output_type": "stream", 903 | "text": [ 904 | "lightgbm单模型的AUC:0.7535371506640257\n" 905 | ] 906 | }, 907 | { 908 | "data": { 909 | "image/png": "\n", 910 | "text/plain": [ 911 | "
" 912 | ] 913 | }, 914 | "metadata": {}, 915 | "output_type": "display_data" 916 | } 917 | ], 918 | "source": [ 919 | "pre = lgb_single_model.predict_proba(x_test)[:,1]\n", 920 | "print('lightgbm单模型的AUC:{}'.format(metrics.roc_auc_score(y_test,pre)))\n", 921 | "sc.plot_roc(y_test,pre)" 922 | ] 923 | } 924 | ], 925 | "metadata": { 926 | "kernelspec": { 927 | "display_name": "Python 3", 928 | "language": "python", 929 | "name": "python3" 930 | }, 931 | "language_info": { 932 | "codemirror_mode": { 933 | "name": "ipython", 934 | "version": 3 935 | }, 936 | "file_extension": ".py", 937 | "mimetype": "text/x-python", 938 | "name": "python", 939 | "nbconvert_exporter": "python", 940 | "pygments_lexer": "ipython3", 941 | "version": "3.7.0" 942 | }, 943 | "toc": { 944 | "base_numbering": 1, 945 | "nav_menu": {}, 946 | "number_sections": true, 947 | "sideBar": true, 948 | "skip_h1_title": false, 949 | "title_cell": "Table of Contents", 950 | "title_sidebar": "Contents", 951 | "toc_cell": false, 952 | "toc_position": {}, 953 | "toc_section_display": true, 954 | "toc_window_display": false 955 | } 956 | }, 957 | "nbformat": 4, 958 | "nbformat_minor": 2 959 | } 960 | --------------------------------------------------------------------------------