# =============================================================================
# 2019 CMB FinTech camp, online round -- question 2 baseline
# (leaderboard A: 0.74151, leaderboard B: 0.74242)
#
# Task: from users' real historical income/expense (transaction) records,
# predict the probability that a target user clicks the "borrow money" card
# on the app home screen within one week (2019-03-07 .. 2019-03-13).
# Metric: AUC, rounded to 5 decimal places.
# ~210 features; feature extraction takes ~2 min, model training ~6 min.
# =============================================================================

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, KFold
import matplotlib.pyplot as plt
from sklearn import preprocessing
import warnings

# Notebook-style run: suppress pandas/sklearn deprecation chatter.
warnings.filterwarnings('ignore')

# Raw competition tables.
transaction_history = pd.read_csv('data/FT_Camp_2/sz_detail.csv')  # per-transaction detail
sz_id_inf = pd.read_csv('data/FT_Camp_2/trx_cod.csv')              # transaction code -> category
g2_cod_inf = pd.read_csv('data/FT_Camp_2/g2.csv')                  # g2 code reference table
cuts_inf = pd.read_csv('data/FT_Camp_2/cust_bas_inf.csv')          # customer base info (gender/age/AUM)
## --- Customer info: gender -> category codes; bucket age into ranges ---
cuts_inf['gender'] = cuts_inf['gender'].astype('category').cat.codes
# Age is stored as strings; '3' and '\N' are treated as missing -- TODO confirm
# '3' is a known sentinel in this dataset rather than a real age.
cuts_inf['age'] = cuts_inf.age.apply(lambda x: int(x) if x not in ['3', '\\N'] else np.nan)
cuts_inf['age_range'] = pd.cut(cuts_inf['age'], [0, 19, 29, 39, 49, 59, 100], labels=False).astype('category').cat.codes
# AUM snapshots on 02-27 / 03-06; '\N' means missing.
cuts_inf['aum227'] = cuts_inf.aum227.apply(lambda x: float(x) if x != '\\N' else np.nan)
cuts_inf['aum306'] = cuts_inf.aum306.apply(lambda x: float(x) if x != '\\N' else np.nan)

## --- Income/expense top-level category -> integer codes ---
sz_id_inf['cat1'] = sz_id_inf['cat1'].astype('category').cat.codes

## --- Label-encode sz_id / g2_cod on the FULL history so that the train and
## test feature tables share one consistent coding ---
transaction_history = transaction_history.sort_values(['id', 'prt_dt']).reset_index(drop=True)
transaction_history['g2_cod'] = transaction_history['g2_cod'].fillna('-1')
sz_le = preprocessing.LabelEncoder()
sz_le.fit(transaction_history.sz_id.values)
g2_le = preprocessing.LabelEncoder()
g2_le.fit(transaction_history.g2_cod.values)


def get_feature(end_time, data_name):
    """Build the per-user feature table from transactions up to ``end_time``.

    Parameters
    ----------
    end_time : str
        Inclusive upper bound ('YYYY-MM-DD') on ``prt_dt`` for the history used.
    data_name : str
        'train' -> users/labels from train.csv; anything else -> pred_users.csv.

    Returns
    -------
    pandas.DataFrame
        One row per user with ~210 feature columns (some as MultiIndex tuples
        from ``unstack``; they are flattened after this function is called).

    Note: all groupby aggregations use named aggregation
    (``.agg(new_name='func')``); the original dict-renaming form
    (``.agg({'new_name': 'func'})``) was deprecated in pandas 0.25 and raises
    SpecificationError in pandas >= 1.0. Output column names are unchanged.
    """
    if data_name == 'train':
        data = pd.read_csv('data/FT_Camp_2/train.csv')
    else:
        data = pd.read_csv('data/FT_Camp_2/pred_users.csv')

    hist = transaction_history[transaction_history['prt_dt'] <= end_time].reset_index(drop=True)

    ## Attach auxiliary categorical info to the transaction records.
    hist = pd.merge(hist, cuts_inf[['id', 'gender', 'age', 'age_range']], on=['id'], how='left')
    hist = pd.merge(hist, sz_id_inf[['sz_id', 'cat1']], on=['sz_id'], how='left')

    hist['sz_id'] = sz_le.transform(hist['sz_id'].values)
    hist['g2_cod'] = g2_le.transform(hist['g2_cod'].values)

    hist = hist[['id', 'sz_id', 'cat1', 'g2_cod', 'gender', 'age', 'age_range', 'rmb_amt', 'prt_dt']]

    ## User profile attributes.
    data = pd.merge(data, cuts_inf[['id', 'gender', 'age', 'age_range']], on=['id'], how='left')

    ## Most frequent sz_id / g2_cod per user.
    temp = hist.groupby(['id', 'sz_id'], as_index=False)['prt_dt'].count()\
        .sort_values(['id', 'prt_dt'], ascending=[True, False]).groupby('id').apply(lambda x: x['sz_id'].values[0])\
        .reset_index().rename(columns={0: 'sz_id'})
    data = pd.merge(data, temp, on=['id'], how='left')
    data['sz_id'] = data['sz_id'].astype('category').cat.codes

    temp = hist.groupby(['id', 'g2_cod'], as_index=False)['prt_dt'].count()\
        .sort_values(['id', 'prt_dt'], ascending=[True, False]).groupby('id').apply(lambda x: x['g2_cod'].values[0])\
        .reset_index().rename(columns={0: 'g2_cod'})
    data = pd.merge(data, temp, on=['id'], how='left')
    data['g2_cod'] = data['g2_cod'].astype('category').cat.codes

    ## Transaction count, distinct active days, mean transactions per day.
    temp = hist.groupby('id', as_index=False)['prt_dt'].agg(user_trans_count='count')
    data = pd.merge(data, temp, on=['id'], how='left')

    temp = hist.groupby('id', as_index=False)['prt_dt'].agg(user_trans_day_count='nunique')
    data = pd.merge(data, temp, on=['id'], how='left')

    data['user_trans_day_mean'] = data['user_trans_count'] / data['user_trans_day_count']

    ## Totals: income (rmb_amt >= 0) and expense (< 0), plus combinations and
    ## per-transaction / per-day averages.
    temp = hist[hist['rmb_amt'] >= 0].groupby('id', as_index=False)['rmb_amt'].agg(user_sr_sum='sum')
    data = pd.merge(data, temp, on=['id'], how='left')

    temp = hist[hist['rmb_amt'] < 0].groupby('id', as_index=False)['rmb_amt'].agg(user_zc_sum='sum')
    data = pd.merge(data, temp, on=['id'], how='left')

    data['user_sr_+_zc'] = data['user_sr_sum'] + data['user_zc_sum']
    data['user_sr_-_zc'] = data['user_sr_sum'] - data['user_zc_sum']

    for i in ['user_sr_sum', 'user_zc_sum', 'user_sr_+_zc', 'user_sr_-_zc']:
        data[i + '/trans'] = data[i] / data['user_trans_count']
        data[i + '/days'] = data[i] / data['user_trans_day_count']

    ## Mean / std of amounts: overall, income-only, expense-only.
    temp = hist.groupby('id', as_index=False)['rmb_amt'].agg(user_rmb_mean='mean', user_rmb_std='std')
    data = pd.merge(data, temp, on=['id'], how='left')

    temp = hist[hist['rmb_amt'] >= 0].groupby('id', as_index=False)['rmb_amt'].agg(user_sr_mean='mean', user_sr_std='std')
    data = pd.merge(data, temp, on=['id'], how='left')

    temp = hist[hist['rmb_amt'] < 0].groupby('id', as_index=False)['rmb_amt'].agg(user_zc_mean='mean', user_zc_std='std')
    data = pd.merge(data, temp, on=['id'], how='left')

    ## Counts and ratios of the three top-level cat1 types.
    temp = hist.groupby(['id', 'cat1'])['prt_dt'].count().unstack().reset_index()\
        .rename(columns={0: 'user_cat1_0_count', 1: 'user_cat1_1_count', 2: 'user_cat1_2_count'})
    data = pd.merge(data, temp, on=['id'], how='left')

    for i in ['user_cat1_0_count', 'user_cat1_1_count', 'user_cat1_2_count']:
        data[i.replace('count', 'ratio')] = data[i] / data['user_trans_count']

    ## First and last transaction date, encoded as ordinal category codes.
    temp = hist.groupby('id', as_index=False)['prt_dt'].agg(user_first_day='min', user_last_day='max')
    data = pd.merge(data, temp, on=['id'], how='left')

    data['user_first_day'] = data['user_first_day'].astype('category').cat.codes
    data['user_last_day'] = data['user_last_day'].astype('category').cat.codes

    ## Per-user share of records for each sz_id (wide ratio table).
    temp = hist.groupby(['id', 'sz_id'], as_index=False)['prt_dt'].count()
    temp = pd.merge(temp,
                    hist[['id', 'prt_dt']].groupby('id', as_index=False)['prt_dt'].agg(sz_id_ratio='count'),
                    on=['id'], how='left')
    temp['sz_id_ratio'] = temp['prt_dt'] / temp['sz_id_ratio']
    temp = pd.pivot_table(temp[['id', 'sz_id', 'sz_id_ratio']], index=['id', 'sz_id']).unstack().reset_index().fillna(0)
    data = pd.merge(data, temp, on=['id'], how='left')

    ## Per-user share of records for each g2_cod.
    temp = hist.groupby(['id', 'g2_cod'], as_index=False)['prt_dt'].count()
    temp = pd.merge(temp,
                    hist[['id', 'prt_dt']].groupby('id', as_index=False)['prt_dt'].agg(g2_cod_ratio='count'),
                    on=['id'], how='left')
    temp['g2_cod_ratio'] = temp['prt_dt'] / temp['g2_cod_ratio']
    temp = pd.pivot_table(temp[['id', 'g2_cod', 'g2_cod_ratio']], index=['id', 'g2_cod']).unstack().reset_index().fillna(0)
    data = pd.merge(data, temp, on=['id'], how='left')

    ## Latest balance snapshot: aum227 for train (history ends 02-27),
    ## aum306 for test (history ends 03-06).
    if data_name == 'train':
        data = pd.merge(data, cuts_inf[['id', 'aum227']], on=['id'], how='left').rename(columns={'aum227': 'aum_last'})
    else:
        data = pd.merge(data, cuts_inf[['id', 'aum306']], on=['id'], how='left').rename(columns={'aum306': 'aum_last'})

    ## Per-user amount sums for each sz_id and each g2_cod (wide tables).
    temp = hist.groupby(['id', 'sz_id'], as_index=False)['rmb_amt'].agg(sz_id_amt='sum')
    temp = pd.pivot_table(temp[['id', 'sz_id', 'sz_id_amt']], index=['id', 'sz_id']).unstack().reset_index().fillna(0)
    data = pd.merge(data, temp, on=['id'], how='left')

    temp = hist.groupby(['id', 'g2_cod'], as_index=False)['rmb_amt'].agg(g2_cod_amt='sum')
    temp = pd.pivot_table(temp[['id', 'g2_cod', 'g2_cod_amt']], index=['id', 'g2_cod']).unstack().reset_index().fillna(0)
    data = pd.merge(data, temp, on=['id'], how='left')

    return data


## Feature windows: train uses 01-01..02-27, test uses 01-01..03-06.
train = get_feature('2019-02-27', 'train')
test = get_feature('2019-03-06', 'test')

## Flatten MultiIndex column names produced by unstack:
## ('sz_id_ratio', 3) -> 'sz_id_ratio_3'.
rename_col = []
for i in train.columns:
    if isinstance(i, tuple):
        rename_col.append(i[0] + '_' + str(i[1]))
    else:
        rename_col.append(i)
train.columns = rename_col
## Flatten test's MultiIndex columns the same way as train's:
## ('sz_id_ratio', 3) -> 'sz_id_ratio_3'.
rename_col = []
for i in test.columns:
    if isinstance(i, tuple):
        rename_col.append(i[0] + '_' + str(i[1]))
    else:
        rename_col.append(i)
test.columns = rename_col

## Binary target: whether the user clicked the borrow card (click_w228).
y = train['click_w228'].values

## The 210 features the model uses.
## Selection recipe (done once, offline): train one xgb fold, keep features
## with importance > 10; the resulting list is hard-coded here.
col_lst = [i for i in train.columns if i in [
    'aum_last', 'age', 'user_cat1_1_ratio', 'user_trans_day_mean', 'sz_id_ratio_28', 'sz_id_amt_31', 'user_cat1_0_ratio',
    'user_sr_std', 'user_cat1_2_ratio', 'user_zc_std', 'sz_id_amt_53', 'user_sr_+_zc', 'user_cat1_2_count', 'user_sr_+_zc/trans',
    'user_first_day', 'g2_cod_amt_221', 'user_cat1_1_count', 'sz_id_ratio_31', 'sz_id_ratio_53', 'user_sr_+_zc/days',
    'sz_id_ratio_15', 'user_sr_mean', 'user_cat1_0_count', 'g2_cod_ratio_239', 'user_last_day', 'sz_id_amt_28',
    'g2_cod_ratio_221', 'sz_id_ratio_32', 'g2_cod_ratio_223', 'g2_cod_ratio_203', 'sz_id_ratio_39', 'sz_id_amt_15',
    'sz_id_ratio_42', 'sz_id_ratio_52', 'g2_cod_amt_265', 'user_zc_sum', 'user_rmb_mean', 'sz_id_amt_52', 'sz_id_amt_54',
    'sz_id_ratio_40', 'g2_cod_amt_223', 'g2_cod_ratio_201', 'user_sr_sum', 'user_rmb_std', 'sz_id_ratio_54', 'g2_cod_amt_201',
    'g2_cod_amt_243', 'g2_cod_ratio_243', 'sz_id_amt_32', 'g2_cod_amt_203', 'sz_id_ratio_45', 'sz_id_ratio_1',
    'user_trans_count', 'sz_id_amt_10', 'g2_cod_ratio_158', 'user_trans_day_count', 'sz_id_amt_42', 'sz_id_ratio_30',
    'g2_cod_ratio_117', 'sz_id_ratio_27', 'g2_cod_amt_306', 'g2_cod_ratio_265', 'user_sr_sum/days', 'sz_id_amt_45',
    'user_zc_sum/days', 'user_sr_-_zc/days', 'user_sr_sum/trans', 'sz_id_amt_40', 'user_zc_mean', 'sz_id_amt_33', 'sz_id_ratio_7',
    'sz_id_ratio_33', 'sz_id_amt_39', 'sz_id_ratio_16', 'g2_cod_ratio_304', 'sz_id_ratio_2', 'g2_cod_ratio_306',
    'g2_cod_ratio_266', 'user_sr_-_zc', 'g2_cod_ratio_300', 'g2_cod_ratio_303', 'sz_id_ratio_3', 'user_zc_sum/trans',
    'sz_id_amt_27', 'g2_cod_amt_253', 'g2_cod_amt_304', 'g2_cod_amt_117', 'sz_id_ratio_10', 'g2_cod_amt_300', 'gender',
    'g2_cod_ratio_119', 'g2_cod_amt_303', 'g2_cod_amt_112', 'sz_id_ratio_19', 'g2_cod_ratio_278', 'g2_cod_ratio_253',
    'user_sr_-_zc/trans', 'sz_id', 'g2_cod_ratio_345', 'g2_cod', 'g2_cod_ratio_129', 'sz_id_amt_7', 'sz_id_amt_24', 'sz_id_amt_30',
    'g2_cod_ratio_222', 'sz_id_amt_1', 'g2_cod_amt_340', 'g2_cod_ratio_31', 'sz_id_amt_19', 'sz_id_amt_3', 'sz_id_amt_41',
    'sz_id_amt_16', 'g2_cod_ratio_340', 'sz_id_ratio_29', 'g2_cod_ratio_185', 'sz_id_amt_46', 'g2_cod_amt_31',
    'g2_cod_amt_158', 'sz_id_amt_11', 'g2_cod_ratio_112', 'g2_cod_ratio_121', 'sz_id_amt_2', 'sz_id_ratio_41',
    'sz_id_ratio_24', 'sz_id_ratio_11', 'g2_cod_amt_346', 'sz_id_ratio_6', 'sz_id_ratio_46', 'g2_cod_amt_278',
    'g2_cod_amt_119', 'g2_cod_amt_345', 'g2_cod_amt_239', 'sz_id_amt_29', 'g2_cod_ratio_346', 'g2_cod_amt_121',
    'g2_cod_amt_157', 'sz_id_amt_37', 'sz_id_ratio_34', 'g2_cod_ratio_34', 'sz_id_amt_0', 'sz_id_ratio_26',
    'g2_cod_amt_222', 'g2_cod_amt_185', 'sz_id_ratio_17', 'g2_cod_amt_34', 'g2_cod_ratio_314', 'sz_id_amt_6',
    'g2_cod_ratio_187', 'sz_id_amt_12', 'sz_id_ratio_12', 'g2_cod_amt_268', 'g2_cod_amt_129', 'sz_id_ratio_36',
    'sz_id_amt_17', 'sz_id_ratio_37', 'g2_cod_ratio_268', 'age_range', 'g2_cod_ratio_283', 'g2_cod_amt_41', 'g2_cod_amt_269',
    'sz_id_ratio_23', 'sz_id_ratio_4', 'g2_cod_amt_187', 'g2_cod_ratio_262', 'g2_cod_ratio_293', 'g2_cod_ratio_128',
    'sz_id_ratio_0', 'g2_cod_ratio_148', 'g2_cod_amt_283', 'sz_id_amt_55', 'g2_cod_ratio_130', 'sz_id_amt_49',
    'g2_cod_amt_120', 'g2_cod_amt_293', 'g2_cod_amt_266', 'g2_cod_ratio_120', 'g2_cod_ratio_267', 'g2_cod_ratio_172',
    'g2_cod_amt_314', 'g2_cod_ratio_263', 'sz_id_amt_34', 'sz_id_amt_26', 'g2_cod_amt_267', 'g2_cod_ratio_174',
    'g2_cod_ratio_269', 'g2_cod_amt_134', 'sz_id_amt_4', 'sz_id_amt_36', 'sz_id_ratio_44', 'g2_cod_ratio_157',
    'g2_cod_amt_174', 'sz_id_ratio_25', 'sz_id_amt_25', 'sz_id_amt_8', 'sz_id_amt_9', 'g2_cod_ratio_292', 'g2_cod_ratio_349',
    'g2_cod_amt_263', 'g2_cod_amt_130', 'sz_id_amt_22', 'sz_id_ratio_9', 'g2_cod_amt_206', 'sz_id_ratio_22',
    'g2_cod_ratio_350', 'sz_id_ratio_51', 'sz_id_amt_51', 'g2_cod_ratio_41', 'sz_id_ratio_47', 'sz_id_ratio_5',
    'g2_cod_amt_172']]

## Training: stratified 10-fold split, but only fold 10 (k == 9) is actually
## trained -- a deliberate single-fold shortcut to keep the run cheap.
skf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
auc_list = []
sub_list = []

for k, (train_idx, test_idx) in enumerate(skf.split(y, y)):
    if k in [9]:
        print(k + 1)

        X_train, X_test = train[col_lst].iloc[train_idx], train[col_lst].iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        xgb_train = xgb.DMatrix(X_train, label=y_train)
        xgb_val = xgb.DMatrix(X_test, label=y_test)

        params = {
            'booster': 'gbtree',
            'objective': 'binary:logistic',
            'eval_metric': 'auc',
            'gamma': 0.1,
            'min_child_weight': 1.1,
            'learning_rate': 0.01,
            'max_depth': 5,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'colsample_bylevel': 0.8,
            'lambda': 10,
            'nthread': 6,
            # 'silent' and 'verbose_eval' removed from params: neither is a
            # valid booster parameter in current xgboost ('silent' was
            # replaced by 'verbosity'; verbose_eval is an xgb.train kwarg).
            'verbosity': 0,
        }

        evallist = [(xgb_train, 'train'), (xgb_val, 'eval')]
        gbm = xgb.train(params, xgb_train, 3000, evallist,
                        early_stopping_rounds=60, verbose_eval=50)

        auc_list.append(gbm.best_score)
        # iteration_range replaces ntree_limit (removed in xgboost >= 2.0):
        # predict with the trees up to the early-stopped best iteration.
        sub_list.append(gbm.predict(xgb.DMatrix(test[col_lst]),
                                    iteration_range=(0, gbm.best_iteration + 1)))

## Write the submission: one score per target user.
test['score'] = sub_list[0]
test[['id', 'score']].to_csv('ques2_210fea_cv1.csv', index=False)