├── 2020XMBank_baseline.ipynb └── readme /2020XMBank_baseline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "import lightgbm as lgb\n", 12 | "import warnings\n", 13 | "from sklearn.metrics import cohen_kappa_score, confusion_matrix, classification_report\n", 14 | "from sklearn.model_selection import StratifiedKFold, KFold\n", 15 | "from sklearn.preprocessing import LabelEncoder\n", 16 | "import os\n", 17 | "warnings.filterwarnings('ignore')" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "### 读取数据" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "这里依次获取主办方提供的数据。为了后面做特征方便,增加了mon和season字段,并且当读取测试集时对mon和season进行了特别的处理,保证了测试集发生时间在训练集之后。" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "y_Q3_3 = pd.read_csv('y_train_3/y_Q3_3.csv')\n", 41 | "y_Q4_3 = pd.read_csv('y_train_3/y_Q4_3.csv')" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 3, 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "aum_m10.csv\n", 54 | "aum_m11.csv\n", 55 | "aum_m12.csv\n", 56 | "aum_m7.csv\n", 57 | "aum_m8.csv\n", 58 | "aum_m9.csv\n", 59 | "aum_m1.csv\n", 60 | "aum_m2.csv\n", 61 | "aum_m3.csv\n" 62 | ] 63 | } 64 | ], 65 | "source": [ 66 | "aum_fils = os.listdir('x_train/aum_train/')+os.listdir('x_test/aum_test/')\n", 67 | "aum = []\n", 68 | "for f in aum_fils:\n", 69 | " print(f)\n", 70 | " mon = int((f.split('.')[0]).split('_')[-1].replace('m', ''))\n", 71 | " if mon>=7:\n", 72 | " tmp = pd.read_csv('x_train/aum_train/'+f)\n", 73 | " tmp['mon'] = mon\n", 74 | " else:\n", 75 | " tmp = pd.read_csv('x_test/aum_test/'+f)\n", 76 | " tmp['mon'] = mon+12\n", 77 | " aum.append(tmp)\n", 78 | "aum = pd.concat(aum, axis=0, ignore_index=True)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 4, 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "name": "stdout", 88 | "output_type": "stream", 89 | "text": [ 90 | "behavior_m10.csv\n", 91 | "behavior_m11.csv\n", 92 | "behavior_m12.csv\n", 93 | "behavior_m7.csv\n", 94 | "behavior_m8.csv\n", 95 | "behavior_m9.csv\n", 96 | "behavior_m1.csv\n", 97 | "behavior_m2.csv\n", 98 | "behavior_m3.csv\n" 99 | ] 100 | } 101 | ], 102 | "source": [ 103 | "behavior_fils = os.listdir('x_train/behavior_train/')+os.listdir('x_test/behavior_test/')\n", 104 | "behavior = []\n", 105 | "for f in behavior_fils:\n", 106 | " print(f)\n", 107 | " mon = int((f.split('.')[0]).split('_')[-1].replace('m', ''))\n", 108 | " if mon>=7:\n", 109 | " tmp = pd.read_csv('x_train/behavior_train/'+f)\n", 110 | " tmp['mon'] = mon\n", 111 | " else:\n", 112 | " tmp = pd.read_csv('x_test/behavior_test/'+f)\n", 113 | " tmp['mon'] = mon+12\n", 114 | " behavior.append(tmp)\n", 115 | "behavior = pd.concat(behavior, axis=0, ignore_index=True)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 5, 121 | "metadata": {}, 122 | "outputs": [ 123 | { 124 | "name": "stdout", 125 | "output_type": "stream", 126 | "text": [ 127 | "big_event_Q3.csv\n", 128 | "big_event_Q4.csv\n", 129 | "big_event_Q1.csv\n" 130 | ] 131 | } 132 | ], 133 | "source": [ 134 | "event_fils = os.listdir('x_train/big_event_train/')+os.listdir('x_test/big_event_test/')\n", 135 | "event = []\n", 136 | "for f in event_fils:\n", 137 | " print(f)\n", 138 | " season = int((f.split('.')[0]).split('_')[-1].replace('Q', ''))\n", 139 | " if season>=3:\n", 140 | " tmp = pd.read_csv('x_train/big_event_train/'+f)\n", 141 | " else:\n", 142 | " tmp = pd.read_csv('x_test/big_event_test/'+f)\n", 143 | " tmp['season'] = season\n", 144 | " event.append(tmp)\n", 145 | "event = pd.concat(event, axis=0, ignore_index=True)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 6, 151 | "metadata": {}, 152 | "outputs": [ 153 | { 154 | "name": "stdout", 155 | "output_type": "stream", 156 | "text": [ 157 | "cunkuan_m10.csv\n", 158 | "cunkuan_m11.csv\n", 159 | "cunkuan_m12.csv\n", 160 | "cunkuan_m7.csv\n", 161 | "cunkuan_m8.csv\n", 162 | "cunkuan_m9.csv\n", 163 | "cunkuan_m1.csv\n", 164 | "cunkuan_m2.csv\n", 165 | "cunkuan_m3.csv\n" 166 | ] 167 | } 168 | ], 169 | "source": [ 170 | "cunkuan_fils = os.listdir('x_train/cunkuan_train/')+os.listdir('x_test/cunkuan_test/')\n", 171 | "cunkuan = []\n", 172 | "for f in cunkuan_fils:\n", 173 | " print(f)\n", 174 | " mon = int((f.split('.')[0]).split('_')[-1].replace('m', ''))\n", 175 | " if mon>=7:\n", 176 | " tmp = pd.read_csv('x_train/cunkuan_train/'+f)\n", 177 | " tmp['mon'] = mon\n", 178 | " else:\n", 179 | " tmp = pd.read_csv('x_test/cunkuan_test/'+f)\n", 180 | " tmp['mon'] = mon+12\n", 181 | " cunkuan.append(tmp)\n", 182 | "cunkuan = pd.concat(cunkuan, axis=0, ignore_index=True)" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 7, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "cust_avli_Q3 = pd.read_csv('x_train/cust_avli_Q3.csv')\n", 192 | "cust_avli_Q4 = pd.read_csv('x_train/cust_avli_Q4.csv')\n", 193 | "cust_info_Q3 = pd.read_csv('x_train/cust_info_Q3.csv')\n", 194 | "cust_info_Q4 = pd.read_csv('x_train/cust_info_Q4.csv')\n", 195 | "\n", 196 | "cust_avli_Q1 = pd.read_csv('x_test/cust_avli_Q1.csv')\n", 197 | "cust_info_Q1 = pd.read_csv('x_test/cust_info_Q1.csv')" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "### 特征工程" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 8, 210 | "metadata": {}, 211 | "outputs": [ 212 | { 213 | "data": { 214 | "text/plain": [ 215 | "((76170, 2), (76722, 1))" 216 | ] 217 | }, 218 | "execution_count": 8, 219 | "metadata": {}, 220 | "output_type": "execute_result" 221 | } 222 | ], 223 | "source": [ 224 | "train = y_Q4_3.copy()\n", 225 | "test = cust_avli_Q1.copy()\n", 226 | "train.shape, test.shape" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "第一组特征很自然的想到用户历史的label,例如在预测季度4的用户时,使用用户在季度3的label作为特征。可以简单看到这个特征的kappa值可以达到0.238+。" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 11, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "y_Q3_3 = y_Q3_3.rename(columns={'label': 'bef_label'})\n", 243 | "train = train.merge(y_Q3_3, on=['cust_no'], how='left')\n", 244 | "\n", 245 | "y_Q4_3 = y_Q4_3.rename(columns={'label': 'bef_label'})\n", 246 | "test = test.merge(y_Q4_3, on=['cust_no'], how='left')" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 12, 252 | "metadata": {}, 253 | "outputs": [ 254 | { 255 | "data": { 256 | "text/plain": [ 257 | "0.23896181609901146" 258 | ] 259 | }, 260 | "execution_count": 12, 261 | "metadata": {}, 262 | "output_type": "execute_result" 263 | } 264 | ], 265 | "source": [ 266 | "cohen_kappa_score((train['label']+1), (train['bef_label'].fillna(1)+1))" 267 | ] 268 | }, 269 | { 270 | "cell_type": "markdown", 271 | "metadata": {}, 272 | "source": [ 273 | "接下来可以拼接下用户的基础特征,这里我只是对一些类别变量做了LabelEncoder。" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 10, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "train = train.merge(cust_info_Q4, on=['cust_no'], how='left')\n", 283 | "test = test.merge(cust_info_Q1, on=['cust_no'], how='left')" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 11, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "for col in [f for f in train.select_dtypes('object').columns if f not in ['label', 'cust_no']]:\n", 293 | " train[col].fillna('-1', inplace=True)\n", 294 | " test[col].fillna('-1', inplace=True)\n", 295 | " le = LabelEncoder()\n", 296 | " le.fit(pd.concat([train[[col]], test[[col]]], axis=0, ignore_index=True))\n", 297 | " train[col] = le.transform(train[col])\n", 298 | " test[col] = le.transform(test[col])" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 12, 304 | "metadata": {}, 305 | "outputs": [ 306 | { 307 | "data": { 308 | "text/plain": [ 309 | "((76170, 23), (76722, 22))" 310 | ] 311 | }, 312 | "execution_count": 12, 313 | "metadata": {}, 314 | "output_type": "execute_result" 315 | } 316 | ], 317 | "source": [ 318 | "train.shape, test.shape" 319 | ] 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | "metadata": {}, 324 | "source": [ 325 | "这题最重要的应该是用户行为相关的数据,下面我们开始做一些简单的操作:\n", 326 | "1. 用户当季度存款(cunkuan)的mean、max、min、std、sum、last的统计\n", 327 | "2. 用户当季度最后一个月的aum数据\n", 328 | "3. 用户当季度最后一个月的behavior数据\n", 329 | "4. 用户当季度的event的特征,这里大多数都是时间,所以用该季度月末的后一天做时间差特征" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 13, 335 | "metadata": {}, 336 | "outputs": [], 337 | "source": [ 338 | "cunkuan['C3'] = cunkuan['C1'] / cunkuan['C2']\n", 339 | "cunkuan = cunkuan.sort_values(by=['cust_no', 'mon']).reset_index(drop=True)\n", 340 | "\n", 341 | "agg_stat = {'C1': ['mean', 'max', 'min', 'std', 'sum', 'last'],\n", 342 | " 'C2': ['mean', 'sum', 'min', 'max', 'std', 'last'],\n", 343 | " 'C3': ['mean', 'max', 'min', 'std', 'sum', 'last']}\n", 344 | "group_df = cunkuan[(cunkuan['mon']<=12)&(cunkuan['mon']>=10)].groupby(['cust_no']).agg(agg_stat)\n", 345 | "group_df.columns = [f[0]+'_'+f[1] for f in group_df.columns]\n", 346 | "group_df.reset_index(inplace=True)\n", 347 | "train = train.merge(group_df, on=['cust_no'], how='left')\n", 348 | "\n", 349 | "group_df = cunkuan[(cunkuan['mon']<=15)&(cunkuan['mon']>=13)].groupby(['cust_no']).agg(agg_stat)\n", 350 | "group_df.columns = [f[0]+'_'+f[1] for f in group_df.columns]\n", 351 | "group_df.reset_index(inplace=True)\n", 352 | "test = test.merge(group_df, on=['cust_no'], how='left')" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": 14, 358 | "metadata": {}, 359 | "outputs": [], 360 | "source": [ 361 | "X_cols = [f for f in aum.columns if f.startswith('X')]\n", 362 | "aum['X_sum'] = aum[X_cols].sum(axis=1)\n", 363 | "aum['X_num'] = (aum[X_cols]>0).sum(axis=1)" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": 15, 369 | "metadata": {}, 370 | "outputs": [], 371 | "source": [ 372 | "X_cols = ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8']\n", 373 | "tmp = aum[aum['mon']==12].copy()\n", 374 | "del tmp['mon']\n", 375 | "train = train.merge(tmp, on=['cust_no'], how='left')\n", 376 | "\n", 377 | "tmp = aum[aum['mon']==15].copy()\n", 378 | "del tmp['mon']\n", 379 | "test = test.merge(tmp, on=['cust_no'], how='left')" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": 16, 385 | "metadata": {}, 386 | "outputs": [], 387 | "source": [ 388 | "behavior['B5-B3'] = behavior['B5'] - behavior['B3']\n", 389 | "tmp = behavior[behavior['mon']==12].copy()\n", 390 | "del tmp['mon']\n", 391 | "train = train.merge(tmp, on=['cust_no'], how='left')\n", 392 | "\n", 393 | "tmp = behavior[behavior['mon']==15].copy()\n", 394 | "del tmp['mon']\n", 395 | "test = test.merge(tmp, on=['cust_no'], how='left')" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 17, 401 | "metadata": {}, 402 | "outputs": [], 403 | "source": [ 404 | "train['B6_gap'] = (pd.to_datetime('2020-01-01 00:00:00') - pd.to_datetime(train['B6'])).dt.total_seconds()\n", 405 | "test['B6_gap'] = (pd.to_datetime('2020-04-01 00:00:00') - pd.to_datetime(test['B6'])).dt.total_seconds()" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": 18, 411 | "metadata": {}, 412 | "outputs": [], 413 | "source": [ 414 | "train['B6_hour'] = pd.to_datetime(train['B6']).dt.hour\n", 415 | "test['B6_hour'] = pd.to_datetime(test['B6']).dt.hour" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": 19, 421 | "metadata": {}, 422 | "outputs": [], 423 | "source": [ 424 | "E_cols = [f for f in event.columns if f.startswith('E')]\n", 425 | "event['event_num'] = len(E_cols) - event[E_cols].isnull().sum(axis=1)\n", 426 | "\n", 427 | "tmp = event[event['season']==4].copy()\n", 428 | "del tmp['season']\n", 429 | "train = train.merge(tmp, on=['cust_no'], how='left')\n", 430 | "\n", 431 | "tmp = event[event['season']==1].copy()\n", 432 | "del tmp['season']\n", 433 | "test = test.merge(tmp, on=['cust_no'], how='left')" 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": 20, 439 | "metadata": {}, 440 | "outputs": [], 441 | "source": [ 442 | "for col in E_cols:\n", 443 | " if col not in ['E15', 'E17']:\n", 444 | " train[col] = (pd.to_datetime('2020-01-01 00:00:00') - pd.to_datetime(train[col])).dt.days\n", 445 | " test[col] = (pd.to_datetime('2020-04-01 00:00:00') - pd.to_datetime(test[col])).dt.days" 446 | ] 447 | }, 448 | { 449 | "cell_type": "markdown", 450 | "metadata": {}, 451 | "source": [ 452 | "### 模型训练" 453 | ] 454 | }, 455 | { 456 | "cell_type": "markdown", 457 | "metadata": {}, 458 | "source": [ 459 | "以上就构成了我们baseline的基础特征,下面开始训练模型。这里采用的是Lightgbm进行5折的多分类,早停直接使用kappa值。因为训练多分类时,目标值的最小值得是0,所以我们对原始label做+1的处理(记得提交的时候要改回来)。" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": 22, 465 | "metadata": {}, 466 | "outputs": [], 467 | "source": [ 468 | "def kappa(preds, train_data):\n", 469 | " y_true = train_data.label\n", 470 | " preds = np.argmax(preds.reshape(3, -1), axis=0)\n", 471 | " score = cohen_kappa_score(y_true, preds)\n", 472 | " return 'kappa', score, True\n", 473 | "\n", 474 | "def LGB_classfication_model(train, target, test, k):\n", 475 | " feats = [f for f in train.columns if f not in ['cust_no', 'label', 'I7', 'I9', 'B6']]\n", 476 | " print('Current num of features:', len(feats))\n", 477 | " folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=2020)\n", 478 | " oof_preds = np.zeros(train.shape[0])\n", 479 | " oof_probs = np.zeros((train.shape[0], 3))\n", 480 | " output_preds = []\n", 481 | " feature_importance_df = pd.DataFrame()\n", 482 | " offline_score = []\n", 483 | " for i, (train_index, test_index) in enumerate(folds.split(train, target)):\n", 484 | " train_y, test_y = target[train_index], target[test_index]\n", 485 | " train_X, test_X = train[feats].iloc[train_index, :], train[feats].iloc[test_index, :]\n", 486 | " dtrain = lgb.Dataset(train_X,\n", 487 | " label=train_y,\n", 488 | " )\n", 489 | " dval = lgb.Dataset(test_X,\n", 490 | " label=test_y)\n", 491 | " parameters = {\n", 492 | " 'learning_rate': 0.05,\n", 493 | " 'boosting_type': 'gbdt',\n", 494 | " 'objective': 'multiclass',\n", 495 | " 'metric': 'None',\n", 496 | " 'num_leaves': 63,\n", 497 | " 'num_class': 3,\n", 498 | " 'feature_fraction': 0.8,\n", 499 | " 'bagging_fraction': 0.8,\n", 500 | " 'min_data_in_leaf': 20,\n", 501 | " 'verbose': -1,\n", 502 | " 'nthread': 12\n", 503 | " }\n", 504 | " lgb_model = lgb.train(\n", 505 | " parameters,\n", 506 | " dtrain,\n", 507 | " num_boost_round=5000,\n", 508 | " valid_sets=[dval],\n", 509 | " early_stopping_rounds=100,\n", 510 | " verbose_eval=100,\n", 511 | " feval=kappa,\n", 512 | " )\n", 513 | " oof_probs[test_index] = lgb_model.predict(test_X[feats], num_iteration=lgb_model.best_iteration)\n", 514 | " oof_preds[test_index] = np.argmax(lgb_model.predict(test_X[feats], num_iteration=lgb_model.best_iteration), axis=1)\n", 515 | " offline_score.append(lgb_model.best_score['valid_0']['kappa'])\n", 516 | " output_preds.append(lgb_model.predict(test[feats], num_iteration=lgb_model.best_iteration))\n", 517 | " # feature importance\n", 518 | " fold_importance_df = pd.DataFrame()\n", 519 | " fold_importance_df[\"feature\"] = feats\n", 520 | " fold_importance_df[\"importance\"] = lgb_model.feature_importance(importance_type='gain')\n", 521 | " fold_importance_df[\"fold\"] = i + 1\n", 522 | " feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)\n", 523 | " print('OOF-MEAN-KAPPA score:%.6f, OOF-STD:%.6f' % (np.mean(offline_score), np.std(offline_score)))\n", 524 | " print('feature importance:')\n", 525 | " print(feature_importance_df.groupby(['feature'])['importance'].mean().sort_values(ascending=False).head(15))\n", 526 | " print('confusion matrix:')\n", 527 | " print(confusion_matrix(target, oof_preds))\n", 528 | " print('classfication report:')\n", 529 | " print(classification_report(target, oof_preds))\n", 530 | "\n", 531 | " return output_preds, oof_probs, np.mean(offline_score)" 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": 23, 537 | "metadata": {}, 538 | "outputs": [ 539 | { 540 | "name": "stdout", 541 | "output_type": "stream", 542 | "text": [ 543 | "Current num of features: 75\n", 544 | "Training until validation scores don't improve for 100 rounds\n", 545 | "[100]\tvalid_0's kappa: 0.391214\n", 546 | "[200]\tvalid_0's kappa: 0.407406\n", 547 | "[300]\tvalid_0's kappa: 0.407399\n", 548 | "Early stopping, best iteration is:\n", 549 | "[284]\tvalid_0's kappa: 0.409634\n", 550 | "Training until validation scores don't improve for 100 rounds\n", 551 | "[100]\tvalid_0's kappa: 0.392955\n", 552 | "[200]\tvalid_0's kappa: 0.407885\n", 553 | "[300]\tvalid_0's kappa: 0.410009\n", 554 | "[400]\tvalid_0's kappa: 0.412524\n", 555 | "Early stopping, best iteration is:\n", 556 | "[390]\tvalid_0's kappa: 0.414518\n", 557 | "Training until validation scores don't improve for 100 rounds\n", 558 | "[100]\tvalid_0's kappa: 0.391538\n", 559 | "[200]\tvalid_0's kappa: 0.405639\n", 560 | "[300]\tvalid_0's kappa: 0.40816\n", 561 | "[400]\tvalid_0's kappa: 0.411267\n", 562 | "[500]\tvalid_0's kappa: 0.412\n", 563 | "[600]\tvalid_0's kappa: 0.412379\n", 564 | "Early stopping, best iteration is:\n", 565 | "[535]\tvalid_0's kappa: 0.414253\n", 566 | "Training until validation scores don't improve for 100 rounds\n", 567 | "[100]\tvalid_0's kappa: 0.391315\n", 568 | "[200]\tvalid_0's kappa: 0.408037\n", 569 | "[300]\tvalid_0's kappa: 0.409429\n", 570 | "Early stopping, best iteration is:\n", 571 | "[220]\tvalid_0's kappa: 0.410795\n", 572 | "Training until validation scores don't improve for 100 rounds\n", 573 | "[100]\tvalid_0's kappa: 0.392679\n", 574 | "[200]\tvalid_0's kappa: 0.407817\n", 575 | "[300]\tvalid_0's kappa: 0.4105\n", 576 | "[400]\tvalid_0's kappa: 0.411369\n", 577 | "[500]\tvalid_0's kappa: 0.413088\n", 578 | "[600]\tvalid_0's kappa: 0.41554\n", 579 | "[700]\tvalid_0's kappa: 0.416812\n", 580 | "Early stopping, best iteration is:\n", 581 | "[678]\tvalid_0's kappa: 0.419653\n", 582 | "OOF-MEAN-KAPPA score:0.413771, OOF-STD:0.003503\n", 583 | "feature importance:\n", 584 | "feature\n", 585 | "X_sum 81101.494288\n", 586 | "B6_gap 35705.408467\n", 587 | "bef_label 28336.175362\n", 588 | "C1_std 25292.655775\n", 589 | "C1_last 24126.071295\n", 590 | "C2_last 22143.340695\n", 591 | "C1_min 19387.615105\n", 592 | "B7 15646.010900\n", 593 | "C3_std 12826.683402\n", 594 | "E16 12518.379958\n", 595 | "B6_hour 11938.840487\n", 596 | "E1 11709.834091\n", 597 | "X3 11251.225545\n", 598 | "E6 10824.601992\n", 599 | "E18 9626.040079\n", 600 | "Name: importance, dtype: float64\n", 601 | "confusion matrix:\n", 602 | "[[ 6201 1203 4183]\n", 603 | " [ 1271 4121 9795]\n", 604 | " [ 1459 2423 45514]]\n", 605 | "classfication report:\n", 606 | " precision recall f1-score support\n", 607 | "\n", 608 | " 0 0.69 0.54 0.60 11587\n", 609 | " 1 0.53 0.27 0.36 15187\n", 610 | " 2 0.77 0.92 0.84 49396\n", 611 | "\n", 612 | " accuracy 0.73 76170\n", 613 | " macro avg 0.66 0.58 0.60 76170\n", 614 | "weighted avg 0.71 0.73 0.71 76170\n", 615 | "\n" 616 | ] 617 | } 618 | ], 619 | "source": [ 620 | "target = train['label'] + 1\n", 621 | "lgb_preds, lgb_oof, lgb_score = LGB_classfication_model(train, target, test, 5)" 622 | ] 623 | }, 624 | { 625 | "cell_type": "markdown", 626 | "metadata": {}, 627 | "source": [ 628 | "### 线上提交" 629 | ] 630 | }, 631 | { 632 | "cell_type": "code", 633 | "execution_count": 24, 634 | "metadata": {}, 635 | "outputs": [ 636 | { 637 | "data": { 638 | "text/plain": [ 639 | " 1 0.756928\n", 640 | "-1 0.123120\n", 641 | " 0 0.119953\n", 642 | "Name: label, dtype: float64" 643 | ] 644 | }, 645 | "execution_count": 24, 646 | "metadata": {}, 647 | "output_type": "execute_result" 648 | } 649 | ], 650 | "source": [ 651 | "sub_df = test[['cust_no']].copy()\n", 652 | "sub_df['label'] = np.argmax(np.mean(lgb_preds, axis=0), axis=1) - 1\n", 653 | "sub_df['label'].value_counts(normalize=True)" 654 | ] 655 | }, 656 | { 657 | "cell_type": "code", 658 | "execution_count": 25, 659 | "metadata": {}, 660 | "outputs": [], 661 | "source": [ 662 | "sub_df.to_csv('baseline_sub.csv', index=False)" 663 | ] 664 | }, 665 | { 666 | "cell_type": "code", 667 | "execution_count": 26, 668 | "metadata": {}, 669 | "outputs": [ 670 | { 671 | "data": { 672 | "text/html": [ 673 | "
\n", 691 | " | cust_no | \n", 692 | "label | \n", 693 | "
---|---|---|
0 | \n", 698 | "0x3b9b4615 | \n", 699 | "0 | \n", 700 | "
1 | \n", 703 | "0x3b9ae61b | \n", 704 | "1 | \n", 705 | "
2 | \n", 708 | "0x3b9add69 | \n", 709 | "0 | \n", 710 | "
3 | \n", 713 | "0x3b9b3601 | \n", 714 | "1 | \n", 715 | "
4 | \n", 718 | "0x3b9b2599 | \n", 719 | "0 | \n", 720 | "