# =============================================================================
# 2019 CMB FinTech camp, online round -- question 2 baseline
# (leaderboard A: 0.74151, leaderboard B: 0.74242)
#
# Task: from users' real historical income/expense (transaction) records,
# predict the probability that a target user clicks the "borrow money" card
# on the app home screen within one week (2019-03-07 .. 2019-03-13).
# Metric: AUC, rounded to 5 decimal places.
# ~210 features; feature extraction takes ~2 min, model training ~6 min.
# =============================================================================

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, KFold
import matplotlib.pyplot as plt
from sklearn import preprocessing
import warnings

# Notebook-style run: suppress pandas/sklearn deprecation chatter.
warnings.filterwarnings('ignore')

# Raw competition tables.
transaction_history = pd.read_csv('data/FT_Camp_2/sz_detail.csv')  # per-transaction detail
sz_id_inf = pd.read_csv('data/FT_Camp_2/trx_cod.csv')              # transaction code -> category
g2_cod_inf = pd.read_csv('data/FT_Camp_2/g2.csv')                  # g2 code reference table
cuts_inf = pd.read_csv('data/FT_Camp_2/cust_bas_inf.csv')          # customer base info (gender/age/AUM)
## --- Customer info: gender -> category codes; bucket age into ranges ---
cuts_inf['gender'] = cuts_inf['gender'].astype('category').cat.codes
# Age is stored as strings; '3' and '\N' are treated as missing -- TODO confirm
# '3' is a known sentinel in this dataset rather than a real age.
cuts_inf['age'] = cuts_inf.age.apply(lambda x: int(x) if x not in ['3', '\\N'] else np.nan)
cuts_inf['age_range'] = pd.cut(cuts_inf['age'], [0, 19, 29, 39, 49, 59, 100], labels=False).astype('category').cat.codes
# AUM snapshots on 02-27 / 03-06; '\N' means missing.
cuts_inf['aum227'] = cuts_inf.aum227.apply(lambda x: float(x) if x != '\\N' else np.nan)
cuts_inf['aum306'] = cuts_inf.aum306.apply(lambda x: float(x) if x != '\\N' else np.nan)

## --- Income/expense top-level category -> integer codes ---
sz_id_inf['cat1'] = sz_id_inf['cat1'].astype('category').cat.codes

## --- Label-encode sz_id / g2_cod on the FULL history so that the train and
## test feature tables share one consistent coding ---
transaction_history = transaction_history.sort_values(['id', 'prt_dt']).reset_index(drop=True)
transaction_history['g2_cod'] = transaction_history['g2_cod'].fillna('-1')
sz_le = preprocessing.LabelEncoder()
sz_le.fit(transaction_history.sz_id.values)
g2_le = preprocessing.LabelEncoder()
g2_le.fit(transaction_history.g2_cod.values)


def get_feature(end_time, data_name):
    """Build the per-user feature table from transactions up to ``end_time``.

    Parameters
    ----------
    end_time : str
        Inclusive upper bound ('YYYY-MM-DD') on ``prt_dt`` for the history used.
    data_name : str
        'train' -> users/labels from train.csv; anything else -> pred_users.csv.

    Returns
    -------
    pandas.DataFrame
        One row per user with ~210 feature columns (some as MultiIndex tuples
        from ``unstack``; they are flattened after this function is called).

    Note: all groupby aggregations use named aggregation
    (``.agg(new_name='func')``); the original dict-renaming form
    (``.agg({'new_name': 'func'})``) was deprecated in pandas 0.25 and raises
    SpecificationError in pandas >= 1.0. Output column names are unchanged.
    """
    if data_name == 'train':
        data = pd.read_csv('data/FT_Camp_2/train.csv')
    else:
        data = pd.read_csv('data/FT_Camp_2/pred_users.csv')

    hist = transaction_history[transaction_history['prt_dt'] <= end_time].reset_index(drop=True)

    ## Attach auxiliary categorical info to the transaction records.
    hist = pd.merge(hist, cuts_inf[['id', 'gender', 'age', 'age_range']], on=['id'], how='left')
    hist = pd.merge(hist, sz_id_inf[['sz_id', 'cat1']], on=['sz_id'], how='left')

    hist['sz_id'] = sz_le.transform(hist['sz_id'].values)
    hist['g2_cod'] = g2_le.transform(hist['g2_cod'].values)

    hist = hist[['id', 'sz_id', 'cat1', 'g2_cod', 'gender', 'age', 'age_range', 'rmb_amt', 'prt_dt']]

    ## User profile attributes.
    data = pd.merge(data, cuts_inf[['id', 'gender', 'age', 'age_range']], on=['id'], how='left')

    ## Most frequent sz_id / g2_cod per user.
    temp = hist.groupby(['id', 'sz_id'], as_index=False)['prt_dt'].count()\
        .sort_values(['id', 'prt_dt'], ascending=[True, False]).groupby('id').apply(lambda x: x['sz_id'].values[0])\
        .reset_index().rename(columns={0: 'sz_id'})
    data = pd.merge(data, temp, on=['id'], how='left')
    data['sz_id'] = data['sz_id'].astype('category').cat.codes

    temp = hist.groupby(['id', 'g2_cod'], as_index=False)['prt_dt'].count()\
        .sort_values(['id', 'prt_dt'], ascending=[True, False]).groupby('id').apply(lambda x: x['g2_cod'].values[0])\
        .reset_index().rename(columns={0: 'g2_cod'})
    data = pd.merge(data, temp, on=['id'], how='left')
    data['g2_cod'] = data['g2_cod'].astype('category').cat.codes

    ## Transaction count, distinct active days, mean transactions per day.
    temp = hist.groupby('id', as_index=False)['prt_dt'].agg(user_trans_count='count')
    data = pd.merge(data, temp, on=['id'], how='left')

    temp = hist.groupby('id', as_index=False)['prt_dt'].agg(user_trans_day_count='nunique')
    data = pd.merge(data, temp, on=['id'], how='left')

    data['user_trans_day_mean'] = data['user_trans_count'] / data['user_trans_day_count']

    ## Totals: income (rmb_amt >= 0) and expense (< 0), plus combinations and
    ## per-transaction / per-day averages.
    temp = hist[hist['rmb_amt'] >= 0].groupby('id', as_index=False)['rmb_amt'].agg(user_sr_sum='sum')
    data = pd.merge(data, temp, on=['id'], how='left')

    temp = hist[hist['rmb_amt'] < 0].groupby('id', as_index=False)['rmb_amt'].agg(user_zc_sum='sum')
    data = pd.merge(data, temp, on=['id'], how='left')

    data['user_sr_+_zc'] = data['user_sr_sum'] + data['user_zc_sum']
    data['user_sr_-_zc'] = data['user_sr_sum'] - data['user_zc_sum']

    for i in ['user_sr_sum', 'user_zc_sum', 'user_sr_+_zc', 'user_sr_-_zc']:
        data[i + '/trans'] = data[i] / data['user_trans_count']
        data[i + '/days'] = data[i] / data['user_trans_day_count']

    ## Mean / std of amounts: overall, income-only, expense-only.
    temp = hist.groupby('id', as_index=False)['rmb_amt'].agg(user_rmb_mean='mean', user_rmb_std='std')
    data = pd.merge(data, temp, on=['id'], how='left')

    temp = hist[hist['rmb_amt'] >= 0].groupby('id', as_index=False)['rmb_amt'].agg(user_sr_mean='mean', user_sr_std='std')
    data = pd.merge(data, temp, on=['id'], how='left')

    temp = hist[hist['rmb_amt'] < 0].groupby('id', as_index=False)['rmb_amt'].agg(user_zc_mean='mean', user_zc_std='std')
    data = pd.merge(data, temp, on=['id'], how='left')

    ## Counts and ratios of the three top-level cat1 types.
    temp = hist.groupby(['id', 'cat1'])['prt_dt'].count().unstack().reset_index()\
        .rename(columns={0: 'user_cat1_0_count', 1: 'user_cat1_1_count', 2: 'user_cat1_2_count'})
    data = pd.merge(data, temp, on=['id'], how='left')

    for i in ['user_cat1_0_count', 'user_cat1_1_count', 'user_cat1_2_count']:
        data[i.replace('count', 'ratio')] = data[i] / data['user_trans_count']

    ## First and last transaction date, encoded as ordinal category codes.
    temp = hist.groupby('id', as_index=False)['prt_dt'].agg(user_first_day='min', user_last_day='max')
    data = pd.merge(data, temp, on=['id'], how='left')

    data['user_first_day'] = data['user_first_day'].astype('category').cat.codes
    data['user_last_day'] = data['user_last_day'].astype('category').cat.codes

    ## Per-user share of records for each sz_id (wide ratio table).
    temp = hist.groupby(['id', 'sz_id'], as_index=False)['prt_dt'].count()
    temp = pd.merge(temp,
                    hist[['id', 'prt_dt']].groupby('id', as_index=False)['prt_dt'].agg(sz_id_ratio='count'),
                    on=['id'], how='left')
    temp['sz_id_ratio'] = temp['prt_dt'] / temp['sz_id_ratio']
    temp = pd.pivot_table(temp[['id', 'sz_id', 'sz_id_ratio']], index=['id', 'sz_id']).unstack().reset_index().fillna(0)
    data = pd.merge(data, temp, on=['id'], how='left')

    ## Per-user share of records for each g2_cod.
    temp = hist.groupby(['id', 'g2_cod'], as_index=False)['prt_dt'].count()
    temp = pd.merge(temp,
                    hist[['id', 'prt_dt']].groupby('id', as_index=False)['prt_dt'].agg(g2_cod_ratio='count'),
                    on=['id'], how='left')
    temp['g2_cod_ratio'] = temp['prt_dt'] / temp['g2_cod_ratio']
    temp = pd.pivot_table(temp[['id', 'g2_cod', 'g2_cod_ratio']], index=['id', 'g2_cod']).unstack().reset_index().fillna(0)
    data = pd.merge(data, temp, on=['id'], how='left')

    ## Latest balance snapshot: aum227 for train (history ends 02-27),
    ## aum306 for test (history ends 03-06).
    if data_name == 'train':
        data = pd.merge(data, cuts_inf[['id', 'aum227']], on=['id'], how='left').rename(columns={'aum227': 'aum_last'})
    else:
        data = pd.merge(data, cuts_inf[['id', 'aum306']], on=['id'], how='left').rename(columns={'aum306': 'aum_last'})

    ## Per-user amount sums for each sz_id and each g2_cod (wide tables).
    temp = hist.groupby(['id', 'sz_id'], as_index=False)['rmb_amt'].agg(sz_id_amt='sum')
    temp = pd.pivot_table(temp[['id', 'sz_id', 'sz_id_amt']], index=['id', 'sz_id']).unstack().reset_index().fillna(0)
    data = pd.merge(data, temp, on=['id'], how='left')

    temp = hist.groupby(['id', 'g2_cod'], as_index=False)['rmb_amt'].agg(g2_cod_amt='sum')
    temp = pd.pivot_table(temp[['id', 'g2_cod', 'g2_cod_amt']], index=['id', 'g2_cod']).unstack().reset_index().fillna(0)
    data = pd.merge(data, temp, on=['id'], how='left')

    return data


## Feature windows: train uses 01-01..02-27, test uses 01-01..03-06.
train = get_feature('2019-02-27', 'train')
test = get_feature('2019-03-06', 'test')

## Flatten MultiIndex column names produced by unstack:
## ('sz_id_ratio', 3) -> 'sz_id_ratio_3'.
rename_col = []
for i in train.columns:
    if isinstance(i, tuple):
        rename_col.append(i[0] + '_' + str(i[1]))
    else:
        rename_col.append(i)
train.columns = rename_col
## Flatten test's MultiIndex columns the same way as train's:
## ('sz_id_ratio', 3) -> 'sz_id_ratio_3'.
rename_col = []
for i in test.columns:
    if isinstance(i, tuple):
        rename_col.append(i[0] + '_' + str(i[1]))
    else:
        rename_col.append(i)
test.columns = rename_col

## Binary target: whether the user clicked the borrow card (click_w228).
y = train['click_w228'].values

## The 210 features the model uses.
## Selection recipe (done once, offline): train one xgb fold, keep features
## with importance > 10; the resulting list is hard-coded here.
col_lst = [i for i in train.columns if i in [
    'aum_last', 'age', 'user_cat1_1_ratio', 'user_trans_day_mean', 'sz_id_ratio_28', 'sz_id_amt_31', 'user_cat1_0_ratio',
    'user_sr_std', 'user_cat1_2_ratio', 'user_zc_std', 'sz_id_amt_53', 'user_sr_+_zc', 'user_cat1_2_count', 'user_sr_+_zc/trans',
    'user_first_day', 'g2_cod_amt_221', 'user_cat1_1_count', 'sz_id_ratio_31', 'sz_id_ratio_53', 'user_sr_+_zc/days',
    'sz_id_ratio_15', 'user_sr_mean', 'user_cat1_0_count', 'g2_cod_ratio_239', 'user_last_day', 'sz_id_amt_28',
    'g2_cod_ratio_221', 'sz_id_ratio_32', 'g2_cod_ratio_223', 'g2_cod_ratio_203', 'sz_id_ratio_39', 'sz_id_amt_15',
    'sz_id_ratio_42', 'sz_id_ratio_52', 'g2_cod_amt_265', 'user_zc_sum', 'user_rmb_mean', 'sz_id_amt_52', 'sz_id_amt_54',
    'sz_id_ratio_40', 'g2_cod_amt_223', 'g2_cod_ratio_201', 'user_sr_sum', 'user_rmb_std', 'sz_id_ratio_54', 'g2_cod_amt_201',
    'g2_cod_amt_243', 'g2_cod_ratio_243', 'sz_id_amt_32', 'g2_cod_amt_203', 'sz_id_ratio_45', 'sz_id_ratio_1',
    'user_trans_count', 'sz_id_amt_10', 'g2_cod_ratio_158', 'user_trans_day_count', 'sz_id_amt_42', 'sz_id_ratio_30',
    'g2_cod_ratio_117', 'sz_id_ratio_27', 'g2_cod_amt_306', 'g2_cod_ratio_265', 'user_sr_sum/days', 'sz_id_amt_45',
    'user_zc_sum/days', 'user_sr_-_zc/days', 'user_sr_sum/trans', 'sz_id_amt_40', 'user_zc_mean', 'sz_id_amt_33', 'sz_id_ratio_7',
    'sz_id_ratio_33', 'sz_id_amt_39', 'sz_id_ratio_16', 'g2_cod_ratio_304', 'sz_id_ratio_2', 'g2_cod_ratio_306',
    'g2_cod_ratio_266', 'user_sr_-_zc', 'g2_cod_ratio_300', 'g2_cod_ratio_303', 'sz_id_ratio_3', 'user_zc_sum/trans',
    'sz_id_amt_27', 'g2_cod_amt_253', 'g2_cod_amt_304', 'g2_cod_amt_117', 'sz_id_ratio_10', 'g2_cod_amt_300', 'gender',
    'g2_cod_ratio_119', 'g2_cod_amt_303', 'g2_cod_amt_112', 'sz_id_ratio_19', 'g2_cod_ratio_278', 'g2_cod_ratio_253',
    'user_sr_-_zc/trans', 'sz_id', 'g2_cod_ratio_345', 'g2_cod', 'g2_cod_ratio_129', 'sz_id_amt_7', 'sz_id_amt_24', 'sz_id_amt_30',
    'g2_cod_ratio_222', 'sz_id_amt_1', 'g2_cod_amt_340', 'g2_cod_ratio_31', 'sz_id_amt_19', 'sz_id_amt_3', 'sz_id_amt_41',
    'sz_id_amt_16', 'g2_cod_ratio_340', 'sz_id_ratio_29', 'g2_cod_ratio_185', 'sz_id_amt_46', 'g2_cod_amt_31',
    'g2_cod_amt_158', 'sz_id_amt_11', 'g2_cod_ratio_112', 'g2_cod_ratio_121', 'sz_id_amt_2', 'sz_id_ratio_41',
    'sz_id_ratio_24', 'sz_id_ratio_11', 'g2_cod_amt_346', 'sz_id_ratio_6', 'sz_id_ratio_46', 'g2_cod_amt_278',
    'g2_cod_amt_119', 'g2_cod_amt_345', 'g2_cod_amt_239', 'sz_id_amt_29', 'g2_cod_ratio_346', 'g2_cod_amt_121',
    'g2_cod_amt_157', 'sz_id_amt_37', 'sz_id_ratio_34', 'g2_cod_ratio_34', 'sz_id_amt_0', 'sz_id_ratio_26',
    'g2_cod_amt_222', 'g2_cod_amt_185', 'sz_id_ratio_17', 'g2_cod_amt_34', 'g2_cod_ratio_314', 'sz_id_amt_6',
    'g2_cod_ratio_187', 'sz_id_amt_12', 'sz_id_ratio_12', 'g2_cod_amt_268', 'g2_cod_amt_129', 'sz_id_ratio_36',
    'sz_id_amt_17', 'sz_id_ratio_37', 'g2_cod_ratio_268', 'age_range', 'g2_cod_ratio_283', 'g2_cod_amt_41', 'g2_cod_amt_269',
    'sz_id_ratio_23', 'sz_id_ratio_4', 'g2_cod_amt_187', 'g2_cod_ratio_262', 'g2_cod_ratio_293', 'g2_cod_ratio_128',
    'sz_id_ratio_0', 'g2_cod_ratio_148', 'g2_cod_amt_283', 'sz_id_amt_55', 'g2_cod_ratio_130', 'sz_id_amt_49',
    'g2_cod_amt_120', 'g2_cod_amt_293', 'g2_cod_amt_266', 'g2_cod_ratio_120', 'g2_cod_ratio_267', 'g2_cod_ratio_172',
    'g2_cod_amt_314', 'g2_cod_ratio_263', 'sz_id_amt_34', 'sz_id_amt_26', 'g2_cod_amt_267', 'g2_cod_ratio_174',
    'g2_cod_ratio_269', 'g2_cod_amt_134', 'sz_id_amt_4', 'sz_id_amt_36', 'sz_id_ratio_44', 'g2_cod_ratio_157',
    'g2_cod_amt_174', 'sz_id_ratio_25', 'sz_id_amt_25', 'sz_id_amt_8', 'sz_id_amt_9', 'g2_cod_ratio_292', 'g2_cod_ratio_349',
    'g2_cod_amt_263', 'g2_cod_amt_130', 'sz_id_amt_22', 'sz_id_ratio_9', 'g2_cod_amt_206', 'sz_id_ratio_22',
    'g2_cod_ratio_350', 'sz_id_ratio_51', 'sz_id_amt_51', 'g2_cod_ratio_41', 'sz_id_ratio_47', 'sz_id_ratio_5',
    'g2_cod_amt_172']]

## Training: stratified 10-fold split, but only fold 10 (k == 9) is actually
## trained -- a deliberate single-fold shortcut to keep the run cheap.
skf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
auc_list = []
sub_list = []

for k, (train_idx, test_idx) in enumerate(skf.split(y, y)):
    if k in [9]:
        print(k + 1)

        X_train, X_test = train[col_lst].iloc[train_idx], train[col_lst].iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        xgb_train = xgb.DMatrix(X_train, label=y_train)
        xgb_val = xgb.DMatrix(X_test, label=y_test)

        params = {
            'booster': 'gbtree',
            'objective': 'binary:logistic',
            'eval_metric': 'auc',
            'gamma': 0.1,
            'min_child_weight': 1.1,
            'learning_rate': 0.01,
            'max_depth': 5,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'colsample_bylevel': 0.8,
            'lambda': 10,
            'nthread': 6,
            # 'silent' and 'verbose_eval' removed from params: neither is a
            # valid booster parameter in current xgboost ('silent' was
            # replaced by 'verbosity'; verbose_eval is an xgb.train kwarg).
            'verbosity': 0,
        }

        evallist = [(xgb_train, 'train'), (xgb_val, 'eval')]
        gbm = xgb.train(params, xgb_train, 3000, evallist,
                        early_stopping_rounds=60, verbose_eval=50)

        auc_list.append(gbm.best_score)
        # iteration_range replaces ntree_limit (removed in xgboost >= 2.0):
        # predict with the trees up to the early-stopped best iteration.
        sub_list.append(gbm.predict(xgb.DMatrix(test[col_lst]),
                                    iteration_range=(0, gbm.best_iteration + 1)))

## Write the submission: one score per target user.
test['score'] = sub_list[0]
test[['id', 'score']].to_csv('ques2_210fea_cv1.csv', index=False)