├── 2020XMBank_baseline.ipynb
└── readme


/2020XMBank_baseline.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import pandas as pd\n",
 10 |     "import numpy as np\n",
 11 |     "import lightgbm as lgb\n",
 12 |     "import warnings\n",
 13 |     "from sklearn.metrics import cohen_kappa_score, confusion_matrix, classification_report\n",
 14 |     "from sklearn.model_selection import StratifiedKFold, KFold\n",
 15 |     "from sklearn.preprocessing import LabelEncoder\n",
 16 |     "import os\n",
 17 |     "warnings.filterwarnings('ignore')"
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "markdown",
 22 |    "metadata": {},
 23 |    "source": [
 24 |     "### 读取数据"
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "markdown",
 29 |    "metadata": {},
 30 |    "source": [
 31 |     "这里依次获取主办方提供的数据。为了后面做特征方便，增加了mon和season字段，并且当读取测试集时对mon和season进行了特别的处理，保证了测试集发生时间在训练集之后。"
 32 |    ]
 33 |   },
 34 |   {
 35 |    "cell_type": "code",
 36 |    "execution_count": 2,
 37 |    "metadata": {},
 38 |    "outputs": [],
 39 |    "source": [
 40 |     "y_Q3_3 = pd.read_csv('y_train_3/y_Q3_3.csv')\n",
 41 |     "y_Q4_3 = pd.read_csv('y_train_3/y_Q4_3.csv')"
 42 |    ]
 43 |   },
 44 |   {
 45 |    "cell_type": "code",
 46 |    "execution_count": 3,
 47 |    "metadata": {},
 48 |    "outputs": [
 49 |     {
 50 |      "name": "stdout",
 51 |      "output_type": "stream",
 52 |      "text": [
 53 |       "aum_m10.csv\n",
 54 |       "aum_m11.csv\n",
 55 |       "aum_m12.csv\n",
 56 |       "aum_m7.csv\n",
 57 |       "aum_m8.csv\n",
 58 |       "aum_m9.csv\n",
 59 |       "aum_m1.csv\n",
 60 |       "aum_m2.csv\n",
 61 |       "aum_m3.csv\n"
 62 |      ]
 63 |     }
 64 |    ],
 65 |    "source": [
 66 |     "aum_fils = os.listdir('x_train/aum_train/')+os.listdir('x_test/aum_test/')\n",
 67 |     "aum = []\n",
 68 |     "for f in aum_fils:\n",
 69 |     "    print(f)\n",
 70 |     "    mon = int((f.split('.')[0]).split('_')[-1].replace('m', ''))\n",
 71 |     "    if mon>=7:\n",
 72 |     "        tmp = pd.read_csv('x_train/aum_train/'+f)\n",
 73 |     "        tmp['mon'] = mon\n",
 74 |     "    else:\n",
 75 |     "        tmp = pd.read_csv('x_test/aum_test/'+f)\n",
 76 |     "        tmp['mon'] = mon+12\n",
 77 |     "    aum.append(tmp)\n",
 78 |     "aum = pd.concat(aum, axis=0, ignore_index=True)"
 79 |    ]
 80 |   },
 81 |   {
 82 |    "cell_type": "code",
 83 |    "execution_count": 4,
 84 |    "metadata": {},
 85 |    "outputs": [
 86 |     {
 87 |      "name": "stdout",
 88 |      "output_type": "stream",
 89 |      "text": [
 90 |       "behavior_m10.csv\n",
 91 |       "behavior_m11.csv\n",
 92 |       "behavior_m12.csv\n",
 93 |       "behavior_m7.csv\n",
 94 |       "behavior_m8.csv\n",
 95 |       "behavior_m9.csv\n",
 96 |       "behavior_m1.csv\n",
 97 |       "behavior_m2.csv\n",
 98 |       "behavior_m3.csv\n"
 99 |      ]
100 |     }
101 |    ],
102 |    "source": [
103 |     "behavior_fils = os.listdir('x_train/behavior_train/')+os.listdir('x_test/behavior_test/')\n",
104 |     "behavior = []\n",
105 |     "for f in behavior_fils:\n",
106 |     "    print(f)\n",
107 |     "    mon = int((f.split('.')[0]).split('_')[-1].replace('m', ''))\n",
108 |     "    if mon>=7:\n",
109 |     "        tmp = pd.read_csv('x_train/behavior_train/'+f)\n",
110 |     "        tmp['mon'] = mon\n",
111 |     "    else:\n",
112 |     "        tmp = pd.read_csv('x_test/behavior_test/'+f)\n",
113 |     "        tmp['mon'] = mon+12\n",
114 |     "    behavior.append(tmp)\n",
115 |     "behavior = pd.concat(behavior, axis=0, ignore_index=True)"
116 |    ]
117 |   },
118 |   {
119 |    "cell_type": "code",
120 |    "execution_count": 5,
121 |    "metadata": {},
122 |    "outputs": [
123 |     {
124 |      "name": "stdout",
125 |      "output_type": "stream",
126 |      "text": [
127 |       "big_event_Q3.csv\n",
128 |       "big_event_Q4.csv\n",
129 |       "big_event_Q1.csv\n"
130 |      ]
131 |     }
132 |    ],
133 |    "source": [
134 |     "event_fils = os.listdir('x_train/big_event_train/')+os.listdir('x_test/big_event_test/')\n",
135 |     "event = []\n",
136 |     "for f in event_fils:\n",
137 |     "    print(f)\n",
138 |     "    season = int((f.split('.')[0]).split('_')[-1].replace('Q', ''))\n",
139 |     "    if season>=3:\n",
140 |     "        tmp = pd.read_csv('x_train/big_event_train/'+f)\n",
141 |     "    else:\n",
142 |     "        tmp = pd.read_csv('x_test/big_event_test/'+f)\n",
143 |     "    tmp['season'] = season\n",
144 |     "    event.append(tmp)\n",
145 |     "event = pd.concat(event, axis=0, ignore_index=True)"
146 |    ]
147 |   },
148 |   {
149 |    "cell_type": "code",
150 |    "execution_count": 6,
151 |    "metadata": {},
152 |    "outputs": [
153 |     {
154 |      "name": "stdout",
155 |      "output_type": "stream",
156 |      "text": [
157 |       "cunkuan_m10.csv\n",
158 |       "cunkuan_m11.csv\n",
159 |       "cunkuan_m12.csv\n",
160 |       "cunkuan_m7.csv\n",
161 |       "cunkuan_m8.csv\n",
162 |       "cunkuan_m9.csv\n",
163 |       "cunkuan_m1.csv\n",
164 |       "cunkuan_m2.csv\n",
165 |       "cunkuan_m3.csv\n"
166 |      ]
167 |     }
168 |    ],
169 |    "source": [
170 |     "cunkuan_fils = os.listdir('x_train/cunkuan_train/')+os.listdir('x_test/cunkuan_test/')\n",
171 |     "cunkuan = []\n",
172 |     "for f in cunkuan_fils:\n",
173 |     "    print(f)\n",
174 |     "    mon = int((f.split('.')[0]).split('_')[-1].replace('m', ''))\n",
175 |     "    if mon>=7:\n",
176 |     "        tmp = pd.read_csv('x_train/cunkuan_train/'+f)\n",
177 |     "        tmp['mon'] = mon\n",
178 |     "    else:\n",
179 |     "        tmp = pd.read_csv('x_test/cunkuan_test/'+f)\n",
180 |     "        tmp['mon'] = mon+12\n",
181 |     "    cunkuan.append(tmp)\n",
182 |     "cunkuan = pd.concat(cunkuan, axis=0, ignore_index=True)"
183 |    ]
184 |   },
185 |   {
186 |    "cell_type": "code",
187 |    "execution_count": 7,
188 |    "metadata": {},
189 |    "outputs": [],
190 |    "source": [
191 |     "cust_avli_Q3 = pd.read_csv('x_train/cust_avli_Q3.csv')\n",
192 |     "cust_avli_Q4 = pd.read_csv('x_train/cust_avli_Q4.csv')\n",
193 |     "cust_info_Q3 = pd.read_csv('x_train/cust_info_Q3.csv')\n",
194 |     "cust_info_Q4 = pd.read_csv('x_train/cust_info_Q4.csv')\n",
195 |     "\n",
196 |     "cust_avli_Q1 = pd.read_csv('x_test/cust_avli_Q1.csv')\n",
197 |     "cust_info_Q1 = pd.read_csv('x_test/cust_info_Q1.csv')"
198 |    ]
199 |   },
200 |   {
201 |    "cell_type": "markdown",
202 |    "metadata": {},
203 |    "source": [
204 |     "### 特征工程"
205 |    ]
206 |   },
207 |   {
208 |    "cell_type": "code",
209 |    "execution_count": 8,
210 |    "metadata": {},
211 |    "outputs": [
212 |     {
213 |      "data": {
214 |       "text/plain": [
215 |        "((76170, 2), (76722, 1))"
216 |       ]
217 |      },
218 |      "execution_count": 8,
219 |      "metadata": {},
220 |      "output_type": "execute_result"
221 |     }
222 |    ],
223 |    "source": [
224 |     "train = y_Q4_3.copy()\n",
225 |     "test = cust_avli_Q1.copy()\n",
226 |     "train.shape, test.shape"
227 |    ]
228 |   },
229 |   {
230 |    "cell_type": "markdown",
231 |    "metadata": {},
232 |    "source": [
233 |     "第一组特征很自然的想到用户历史的label，例如在预测季度4的用户时，使用用户在季度3的label作为特征。可以简单看到这个特征的kappa值可以达到0.238+。"
234 |    ]
235 |   },
236 |   {
237 |    "cell_type": "code",
238 |    "execution_count": 11,
239 |    "metadata": {},
240 |    "outputs": [],
241 |    "source": [
242 |     "y_Q3_3 = y_Q3_3.rename(columns={'label': 'bef_label'})\n",
243 |     "train = train.merge(y_Q3_3, on=['cust_no'], how='left')\n",
244 |     "\n",
245 |     "y_Q4_3 = y_Q4_3.rename(columns={'label': 'bef_label'})\n",
246 |     "test = test.merge(y_Q4_3, on=['cust_no'], how='left')"
247 |    ]
248 |   },
249 |   {
250 |    "cell_type": "code",
251 |    "execution_count": 12,
252 |    "metadata": {},
253 |    "outputs": [
254 |     {
255 |      "data": {
256 |       "text/plain": [
257 |        "0.23896181609901146"
258 |       ]
259 |      },
260 |      "execution_count": 12,
261 |      "metadata": {},
262 |      "output_type": "execute_result"
263 |     }
264 |    ],
265 |    "source": [
266 |     "cohen_kappa_score((train['label']+1), (train['bef_label'].fillna(1)+1))"
267 |    ]
268 |   },
269 |   {
270 |    "cell_type": "markdown",
271 |    "metadata": {},
272 |    "source": [
273 |     "接下来可以拼接下用户的基础特征，这里我只是对一些类别变量做了LabelEncoder。"
274 |    ]
275 |   },
276 |   {
277 |    "cell_type": "code",
278 |    "execution_count": 10,
279 |    "metadata": {},
280 |    "outputs": [],
281 |    "source": [
282 |     "train = train.merge(cust_info_Q4, on=['cust_no'], how='left')\n",
283 |     "test = test.merge(cust_info_Q1, on=['cust_no'], how='left')"
284 |    ]
285 |   },
286 |   {
287 |    "cell_type": "code",
288 |    "execution_count": 11,
289 |    "metadata": {},
290 |    "outputs": [],
291 |    "source": [
292 |     "for col in [f for f in train.select_dtypes('object').columns if f not in ['label', 'cust_no']]:\n",
293 |     "    train[col].fillna('-1', inplace=True)\n",
294 |     "    test[col].fillna('-1', inplace=True)\n",
295 |     "    le = LabelEncoder()\n",
296 |     "    le.fit(pd.concat([train[[col]], test[[col]]], axis=0, ignore_index=True))\n",
297 |     "    train[col] = le.transform(train[col])\n",
298 |     "    test[col] = le.transform(test[col])"
299 |    ]
300 |   },
301 |   {
302 |    "cell_type": "code",
303 |    "execution_count": 12,
304 |    "metadata": {},
305 |    "outputs": [
306 |     {
307 |      "data": {
308 |       "text/plain": [
309 |        "((76170, 23), (76722, 22))"
310 |       ]
311 |      },
312 |      "execution_count": 12,
313 |      "metadata": {},
314 |      "output_type": "execute_result"
315 |     }
316 |    ],
317 |    "source": [
318 |     "train.shape, test.shape"
319 |    ]
320 |   },
321 |   {
322 |    "cell_type": "markdown",
323 |    "metadata": {},
324 |    "source": [
325 |     "这题最重要的应该是用户行为相关的数据，下面我们开始做一些简单的操作：\n",
326 |     "1. 用户当季度存款（cunkuan）的mean、max、min、std、sum、last的统计\n",
327 |     "2. 用户当季度最后一个月的aum数据\n",
328 |     "3. 用户当季度最后一个月的behavior数据\n",
329 |     "4. 用户当季度的event的特征，这里大多数都是时间，所以用该季度月末的后一天做时间差特征"
330 |    ]
331 |   },
332 |   {
333 |    "cell_type": "code",
334 |    "execution_count": 13,
335 |    "metadata": {},
336 |    "outputs": [],
337 |    "source": [
338 |     "cunkuan['C3'] = cunkuan['C1'] / cunkuan['C2']\n",
339 |     "cunkuan = cunkuan.sort_values(by=['cust_no', 'mon']).reset_index(drop=True)\n",
340 |     "\n",
341 |     "agg_stat = {'C1': ['mean', 'max', 'min', 'std', 'sum', 'last'],\n",
342 |     "            'C2': ['mean', 'sum', 'min', 'max', 'std', 'last'],\n",
343 |     "            'C3': ['mean', 'max', 'min', 'std', 'sum', 'last']}\n",
344 |     "group_df = cunkuan[(cunkuan['mon']<=12)&(cunkuan['mon']>=10)].groupby(['cust_no']).agg(agg_stat)\n",
345 |     "group_df.columns = [f[0]+'_'+f[1] for f in group_df.columns]\n",
346 |     "group_df.reset_index(inplace=True)\n",
347 |     "train = train.merge(group_df, on=['cust_no'], how='left')\n",
348 |     "\n",
349 |     "group_df = cunkuan[(cunkuan['mon']<=15)&(cunkuan['mon']>=13)].groupby(['cust_no']).agg(agg_stat)\n",
350 |     "group_df.columns = [f[0]+'_'+f[1] for f in group_df.columns]\n",
351 |     "group_df.reset_index(inplace=True)\n",
352 |     "test = test.merge(group_df, on=['cust_no'], how='left')"
353 |    ]
354 |   },
355 |   {
356 |    "cell_type": "code",
357 |    "execution_count": 14,
358 |    "metadata": {},
359 |    "outputs": [],
360 |    "source": [
361 |     "X_cols = [f for f in aum.columns if f.startswith('X')]\n",
362 |     "aum['X_sum'] = aum[X_cols].sum(axis=1)\n",
363 |     "aum['X_num'] = (aum[X_cols]>0).sum(axis=1)"
364 |    ]
365 |   },
366 |   {
367 |    "cell_type": "code",
368 |    "execution_count": 15,
369 |    "metadata": {},
370 |    "outputs": [],
371 |    "source": [
372 |     "X_cols = ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8']\n",
373 |     "tmp = aum[aum['mon']==12].copy()\n",
374 |     "del tmp['mon']\n",
375 |     "train = train.merge(tmp, on=['cust_no'], how='left')\n",
376 |     "\n",
377 |     "tmp = aum[aum['mon']==15].copy()\n",
378 |     "del tmp['mon']\n",
379 |     "test = test.merge(tmp, on=['cust_no'], how='left')"
380 |    ]
381 |   },
382 |   {
383 |    "cell_type": "code",
384 |    "execution_count": 16,
385 |    "metadata": {},
386 |    "outputs": [],
387 |    "source": [
388 |     "behavior['B5-B3'] = behavior['B5'] - behavior['B3']\n",
389 |     "tmp = behavior[behavior['mon']==12].copy()\n",
390 |     "del tmp['mon']\n",
391 |     "train = train.merge(tmp, on=['cust_no'], how='left')\n",
392 |     "\n",
393 |     "tmp = behavior[behavior['mon']==15].copy()\n",
394 |     "del tmp['mon']\n",
395 |     "test = test.merge(tmp, on=['cust_no'], how='left')"
396 |    ]
397 |   },
398 |   {
399 |    "cell_type": "code",
400 |    "execution_count": 17,
401 |    "metadata": {},
402 |    "outputs": [],
403 |    "source": [
404 |     "train['B6_gap'] = (pd.to_datetime('2020-01-01 00:00:00') - pd.to_datetime(train['B6'])).dt.total_seconds()\n",
405 |     "test['B6_gap'] = (pd.to_datetime('2020-04-01 00:00:00') - pd.to_datetime(test['B6'])).dt.total_seconds()"
406 |    ]
407 |   },
408 |   {
409 |    "cell_type": "code",
410 |    "execution_count": 18,
411 |    "metadata": {},
412 |    "outputs": [],
413 |    "source": [
414 |     "train['B6_hour'] = pd.to_datetime(train['B6']).dt.hour\n",
415 |     "test['B6_hour'] = pd.to_datetime(test['B6']).dt.hour"
416 |    ]
417 |   },
418 |   {
419 |    "cell_type": "code",
420 |    "execution_count": 19,
421 |    "metadata": {},
422 |    "outputs": [],
423 |    "source": [
424 |     "E_cols = [f for f in event.columns if f.startswith('E')]\n",
425 |     "event['event_num'] = len(E_cols) - event[E_cols].isnull().sum(axis=1)\n",
426 |     "\n",
427 |     "tmp = event[event['season']==4].copy()\n",
428 |     "del tmp['season']\n",
429 |     "train = train.merge(tmp, on=['cust_no'], how='left')\n",
430 |     "\n",
431 |     "tmp = event[event['season']==1].copy()\n",
432 |     "del tmp['season']\n",
433 |     "test = test.merge(tmp, on=['cust_no'], how='left')"
434 |    ]
435 |   },
436 |   {
437 |    "cell_type": "code",
438 |    "execution_count": 20,
439 |    "metadata": {},
440 |    "outputs": [],
441 |    "source": [
442 |     "for col in E_cols:\n",
443 |     "    if col not in ['E15', 'E17']:\n",
444 |     "        train[col] = (pd.to_datetime('2020-01-01 00:00:00') - pd.to_datetime(train[col])).dt.days\n",
445 |     "        test[col] = (pd.to_datetime('2020-04-01 00:00:00') - pd.to_datetime(test[col])).dt.days"
446 |    ]
447 |   },
448 |   {
449 |    "cell_type": "markdown",
450 |    "metadata": {},
451 |    "source": [
452 |     "### 模型训练"
453 |    ]
454 |   },
455 |   {
456 |    "cell_type": "markdown",
457 |    "metadata": {},
458 |    "source": [
459 |     "以上就构成了我们baseline的基础特征，下面开始训练模型。这里采用的是Lightgbm进行5折的多分类，早停直接使用kappa值。因为训练多分类时，目标值的最小值得是0，所以我们对原始label做+1的处理（记得提交的时候要改回来）。"
460 |    ]
461 |   },
462 |   {
463 |    "cell_type": "code",
464 |    "execution_count": 22,
465 |    "metadata": {},
466 |    "outputs": [],
467 |    "source": [
468 |     "def kappa(preds, train_data):\n",
469 |     "    y_true = train_data.label\n",
470 |     "    preds = np.argmax(preds.reshape(3, -1), axis=0)\n",
471 |     "    score = cohen_kappa_score(y_true, preds)\n",
472 |     "    return 'kappa', score, True\n",
473 |     "\n",
474 |     "def LGB_classfication_model(train, target, test, k):\n",
475 |     "    feats = [f for f in train.columns if f not in ['cust_no', 'label', 'I7', 'I9', 'B6']]\n",
476 |     "    print('Current num of features:', len(feats))\n",
477 |     "    folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=2020)\n",
478 |     "    oof_preds = np.zeros(train.shape[0])\n",
479 |     "    oof_probs = np.zeros((train.shape[0], 3))\n",
480 |     "    output_preds = []\n",
481 |     "    feature_importance_df = pd.DataFrame()\n",
482 |     "    offline_score = []\n",
483 |     "    for i, (train_index, test_index) in enumerate(folds.split(train, target)):\n",
484 |     "        train_y, test_y = target[train_index], target[test_index]\n",
485 |     "        train_X, test_X = train[feats].iloc[train_index, :], train[feats].iloc[test_index, :]\n",
486 |     "        dtrain = lgb.Dataset(train_X,\n",
487 |     "                             label=train_y,\n",
488 |     "                            )\n",
489 |     "        dval = lgb.Dataset(test_X,\n",
490 |     "                           label=test_y)\n",
491 |     "        parameters = {\n",
492 |     "            'learning_rate': 0.05,\n",
493 |     "            'boosting_type': 'gbdt',\n",
494 |     "            'objective': 'multiclass',\n",
495 |     "            'metric': 'None',\n",
496 |     "            'num_leaves': 63,\n",
497 |     "            'num_class': 3,\n",
498 |     "            'feature_fraction': 0.8,\n",
499 |     "            'bagging_fraction': 0.8,\n",
500 |     "            'min_data_in_leaf': 20,\n",
501 |     "            'verbose': -1,\n",
502 |     "            'nthread': 12\n",
503 |     "        }\n",
504 |     "        lgb_model = lgb.train(\n",
505 |     "            parameters,\n",
506 |     "            dtrain,\n",
507 |     "            num_boost_round=5000,\n",
508 |     "            valid_sets=[dval],\n",
509 |     "            early_stopping_rounds=100,\n",
510 |     "            verbose_eval=100,\n",
511 |     "            feval=kappa,\n",
512 |     "        )\n",
513 |     "        oof_probs[test_index] = lgb_model.predict(test_X[feats], num_iteration=lgb_model.best_iteration)\n",
514 |     "        oof_preds[test_index] = np.argmax(lgb_model.predict(test_X[feats], num_iteration=lgb_model.best_iteration), axis=1)\n",
515 |     "        offline_score.append(lgb_model.best_score['valid_0']['kappa'])\n",
516 |     "        output_preds.append(lgb_model.predict(test[feats], num_iteration=lgb_model.best_iteration))\n",
517 |     "        # feature importance\n",
518 |     "        fold_importance_df = pd.DataFrame()\n",
519 |     "        fold_importance_df[\"feature\"] = feats\n",
520 |     "        fold_importance_df[\"importance\"] = lgb_model.feature_importance(importance_type='gain')\n",
521 |     "        fold_importance_df[\"fold\"] = i + 1\n",
522 |     "        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)\n",
523 |     "    print('OOF-MEAN-KAPPA score:%.6f, OOF-STD:%.6f' % (np.mean(offline_score), np.std(offline_score)))\n",
524 |     "    print('feature importance:')\n",
525 |     "    print(feature_importance_df.groupby(['feature'])['importance'].mean().sort_values(ascending=False).head(15))\n",
526 |     "    print('confusion matrix:')\n",
527 |     "    print(confusion_matrix(target, oof_preds))\n",
528 |     "    print('classfication report:')\n",
529 |     "    print(classification_report(target, oof_preds))\n",
530 |     "\n",
531 |     "    return output_preds, oof_probs, np.mean(offline_score)"
532 |    ]
533 |   },
534 |   {
535 |    "cell_type": "code",
536 |    "execution_count": 23,
537 |    "metadata": {},
538 |    "outputs": [
539 |     {
540 |      "name": "stdout",
541 |      "output_type": "stream",
542 |      "text": [
543 |       "Current num of features: 75\n",
544 |       "Training until validation scores don't improve for 100 rounds\n",
545 |       "[100]\tvalid_0's kappa: 0.391214\n",
546 |       "[200]\tvalid_0's kappa: 0.407406\n",
547 |       "[300]\tvalid_0's kappa: 0.407399\n",
548 |       "Early stopping, best iteration is:\n",
549 |       "[284]\tvalid_0's kappa: 0.409634\n",
550 |       "Training until validation scores don't improve for 100 rounds\n",
551 |       "[100]\tvalid_0's kappa: 0.392955\n",
552 |       "[200]\tvalid_0's kappa: 0.407885\n",
553 |       "[300]\tvalid_0's kappa: 0.410009\n",
554 |       "[400]\tvalid_0's kappa: 0.412524\n",
555 |       "Early stopping, best iteration is:\n",
556 |       "[390]\tvalid_0's kappa: 0.414518\n",
557 |       "Training until validation scores don't improve for 100 rounds\n",
558 |       "[100]\tvalid_0's kappa: 0.391538\n",
559 |       "[200]\tvalid_0's kappa: 0.405639\n",
560 |       "[300]\tvalid_0's kappa: 0.40816\n",
561 |       "[400]\tvalid_0's kappa: 0.411267\n",
562 |       "[500]\tvalid_0's kappa: 0.412\n",
563 |       "[600]\tvalid_0's kappa: 0.412379\n",
564 |       "Early stopping, best iteration is:\n",
565 |       "[535]\tvalid_0's kappa: 0.414253\n",
566 |       "Training until validation scores don't improve for 100 rounds\n",
567 |       "[100]\tvalid_0's kappa: 0.391315\n",
568 |       "[200]\tvalid_0's kappa: 0.408037\n",
569 |       "[300]\tvalid_0's kappa: 0.409429\n",
570 |       "Early stopping, best iteration is:\n",
571 |       "[220]\tvalid_0's kappa: 0.410795\n",
572 |       "Training until validation scores don't improve for 100 rounds\n",
573 |       "[100]\tvalid_0's kappa: 0.392679\n",
574 |       "[200]\tvalid_0's kappa: 0.407817\n",
575 |       "[300]\tvalid_0's kappa: 0.4105\n",
576 |       "[400]\tvalid_0's kappa: 0.411369\n",
577 |       "[500]\tvalid_0's kappa: 0.413088\n",
578 |       "[600]\tvalid_0's kappa: 0.41554\n",
579 |       "[700]\tvalid_0's kappa: 0.416812\n",
580 |       "Early stopping, best iteration is:\n",
581 |       "[678]\tvalid_0's kappa: 0.419653\n",
582 |       "OOF-MEAN-KAPPA score:0.413771, OOF-STD:0.003503\n",
583 |       "feature importance:\n",
584 |       "feature\n",
585 |       "X_sum        81101.494288\n",
586 |       "B6_gap       35705.408467\n",
587 |       "bef_label    28336.175362\n",
588 |       "C1_std       25292.655775\n",
589 |       "C1_last      24126.071295\n",
590 |       "C2_last      22143.340695\n",
591 |       "C1_min       19387.615105\n",
592 |       "B7           15646.010900\n",
593 |       "C3_std       12826.683402\n",
594 |       "E16          12518.379958\n",
595 |       "B6_hour      11938.840487\n",
596 |       "E1           11709.834091\n",
597 |       "X3           11251.225545\n",
598 |       "E6           10824.601992\n",
599 |       "E18           9626.040079\n",
600 |       "Name: importance, dtype: float64\n",
601 |       "confusion matrix:\n",
602 |       "[[ 6201  1203  4183]\n",
603 |       " [ 1271  4121  9795]\n",
604 |       " [ 1459  2423 45514]]\n",
605 |       "classfication report:\n",
606 |       "              precision    recall  f1-score   support\n",
607 |       "\n",
608 |       "           0       0.69      0.54      0.60     11587\n",
609 |       "           1       0.53      0.27      0.36     15187\n",
610 |       "           2       0.77      0.92      0.84     49396\n",
611 |       "\n",
612 |       "    accuracy                           0.73     76170\n",
613 |       "   macro avg       0.66      0.58      0.60     76170\n",
614 |       "weighted avg       0.71      0.73      0.71     76170\n",
615 |       "\n"
616 |      ]
617 |     }
618 |    ],
619 |    "source": [
620 |     "target = train['label'] + 1\n",
621 |     "lgb_preds, lgb_oof, lgb_score = LGB_classfication_model(train, target, test, 5)"
622 |    ]
623 |   },
624 |   {
625 |    "cell_type": "markdown",
626 |    "metadata": {},
627 |    "source": [
628 |     "### 线上提交"
629 |    ]
630 |   },
631 |   {
632 |    "cell_type": "code",
633 |    "execution_count": 24,
634 |    "metadata": {},
635 |    "outputs": [
636 |     {
637 |      "data": {
638 |       "text/plain": [
639 |        " 1    0.756928\n",
640 |        "-1    0.123120\n",
641 |        " 0    0.119953\n",
642 |        "Name: label, dtype: float64"
643 |       ]
644 |      },
645 |      "execution_count": 24,
646 |      "metadata": {},
647 |      "output_type": "execute_result"
648 |     }
649 |    ],
650 |    "source": [
651 |     "sub_df = test[['cust_no']].copy()\n",
652 |     "sub_df['label'] = np.argmax(np.mean(lgb_preds, axis=0), axis=1) - 1\n",
653 |     "sub_df['label'].value_counts(normalize=True)"
654 |    ]
655 |   },
656 |   {
657 |    "cell_type": "code",
658 |    "execution_count": 25,
659 |    "metadata": {},
660 |    "outputs": [],
661 |    "source": [
662 |     "sub_df.to_csv('baseline_sub.csv', index=False)"
663 |    ]
664 |   },
665 |   {
666 |    "cell_type": "code",
667 |    "execution_count": 26,
668 |    "metadata": {},
669 |    "outputs": [
670 |     {
671 |      "data": {
672 |       "text/html": [
673 |        "<div>\n",
674 |        "<style scoped>\n",
675 |        "    .dataframe tbody tr th:only-of-type {\n",
676 |        "        vertical-align: middle;\n",
677 |        "    }\n",
678 |        "\n",
679 |        "    .dataframe tbody tr th {\n",
680 |        "        vertical-align: top;\n",
681 |        "    }\n",
682 |        "\n",
683 |        "    .dataframe thead th {\n",
684 |        "        text-align: right;\n",
685 |        "    }\n",
686 |        "</style>\n",
687 |        "<table border=\"1\" class=\"dataframe\">\n",
688 |        "  <thead>\n",
689 |        "    <tr style=\"text-align: right;\">\n",
690 |        "      <th></th>\n",
691 |        "      <th>cust_no</th>\n",
692 |        "      <th>label</th>\n",
693 |        "    </tr>\n",
694 |        "  </thead>\n",
695 |        "  <tbody>\n",
696 |        "    <tr>\n",
697 |        "      <td>0</td>\n",
698 |        "      <td>0x3b9b4615</td>\n",
699 |        "      <td>0</td>\n",
700 |        "    </tr>\n",
701 |        "    <tr>\n",
702 |        "      <td>1</td>\n",
703 |        "      <td>0x3b9ae61b</td>\n",
704 |        "      <td>1</td>\n",
705 |        "    </tr>\n",
706 |        "    <tr>\n",
707 |        "      <td>2</td>\n",
708 |        "      <td>0x3b9add69</td>\n",
709 |        "      <td>0</td>\n",
710 |        "    </tr>\n",
711 |        "    <tr>\n",
712 |        "      <td>3</td>\n",
713 |        "      <td>0x3b9b3601</td>\n",
714 |        "      <td>1</td>\n",
715 |        "    </tr>\n",
716 |        "    <tr>\n",
717 |        "      <td>4</td>\n",
718 |        "      <td>0x3b9b2599</td>\n",
719 |        "      <td>0</td>\n",
720 |        "    </tr>\n",
721 |        "  </tbody>\n",
722 |        "</table>\n",
723 |        "</div>"
724 |       ],
725 |       "text/plain": [
726 |        "      cust_no  label\n",
727 |        "0  0x3b9b4615      0\n",
728 |        "1  0x3b9ae61b      1\n",
729 |        "2  0x3b9add69      0\n",
730 |        "3  0x3b9b3601      1\n",
731 |        "4  0x3b9b2599      0"
732 |       ]
733 |      },
734 |      "execution_count": 26,
735 |      "metadata": {},
736 |      "output_type": "execute_result"
737 |     }
738 |    ],
739 |    "source": [
740 |     "sub_df.head()"
741 |    ]
742 |   }
743 |  ],
744 |  "metadata": {
745 |   "kernelspec": {
746 |    "display_name": "Python 2",
747 |    "language": "python",
748 |    "name": "python2"
749 |   },
750 |   "language_info": {
751 |    "codemirror_mode": {
752 |     "name": "ipython",
753 |     "version": 2
754 |    },
755 |    "file_extension": ".py",
756 |    "mimetype": "text/x-python",
757 |    "name": "python",
758 |    "nbconvert_exporter": "python",
759 |    "pygments_lexer": "ipython2",
760 |    "version": "2.7.16"
761 |   },
762 |   "toc": {
763 |    "base_numbering": 1,
764 |    "nav_menu": {},
765 |    "number_sections": true,
766 |    "sideBar": true,
767 |    "skip_h1_title": false,
768 |    "title_cell": "Table of Contents",
769 |    "title_sidebar": "Contents",
770 |    "toc_cell": false,
771 |    "toc_position": {},
772 |    "toc_section_display": true,
773 |    "toc_window_display": false
774 |   }
775 |  },
776 |  "nbformat": 4,
777 |  "nbformat_minor": 2
778 | }
779 | 


--------------------------------------------------------------------------------
/readme:
--------------------------------------------------------------------------------
1 | 2020厦门国际银行数创金融杯建模大赛Baseline，线上0.39+。
2 | 欢迎大家关注我们的公众号：OTTO Data Lab
3 | 


--------------------------------------------------------------------------------