├── README.md ├── code ├── .ipynb_checkpoints │ ├── Baseline-checkpoint.ipynb │ └── Baseline_bagging_version-checkpoint.ipynb ├── Baseline.ipynb └── Baseline_bagging_version.ipynb └── input ├── submit_example.csv ├── test_dataset.csv └── train_dataset.csv /README.md: -------------------------------------------------------------------------------- 1 | # Credit-Scoring-Regression by YourVenn@Kaggle 2 | - 消费者人群画像—信用智能评分比赛开源 3 | - 请把源数据放在 input/ 4 | - Baseline: 线上6379+,Baseline_bagging_version: 线上6388+ 5 | - 认为有用的朋友,方便的话求点赞~ 谢谢! 6 | -------------------------------------------------------------------------------- /code/.ipynb_checkpoints/Baseline-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Packages" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 14, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "#过年没时间做了,专心搞kaggle去了\n", 19 | "#kaggle玩家欢迎和我交流,ID是YourVenn" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 1, 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "name": "stderr", 29 | "output_type": "stream", 30 | "text": [ 31 | "/Users/Venn/anaconda/lib/python3.6/site-packages/lightgbm/__init__.py:46: UserWarning: Starting from version 2.2.1, the library file in distribution wheels for macOS is built by the Apple Clang (Xcode_8.3.1) compiler.\n", 32 | "This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.\n", 33 | "Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.\n", 34 | "You can install the OpenMP library by the following command: ``brew install libomp``.\n", 35 | " \"You can install the OpenMP library by the following command: ``brew install libomp``.\", UserWarning)\n" 36 | ] 37 | } 38 | ], 39 | "source": [ 40 | "import time\n", 41 | "import matplotlib.pyplot as plt\n", 42 | "import seaborn as sns\n", 43 | "import numpy as np\n", 44 | "import pandas as pd\n", 45 | "import lightgbm as lgb\n", 46 | "from sklearn.model_selection import StratifiedKFold\n", 47 | "from sklearn.preprocessing import LabelEncoder" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "### Input data" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 2, 60 | "metadata": { 61 | "collapsed": true 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "data_path = '../input/'\n", 66 | "train_data = pd.read_csv(data_path + 'train_dataset.csv')\n", 67 | "test_data = pd.read_csv(data_path + 'test_dataset.csv')\n", 68 | "sample_sub = pd.read_csv(data_path + 'submit_example.csv')" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "### Pre-processing" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 3, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "data": { 85 | "text/html": [ 86 | "
\n", 87 | "\n", 100 | "\n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | "
用户编码用户实名制是否通过核实用户年龄是否大学生客户是否黑名单客户是否4G不健康客户用户网龄(月)用户最近一次缴费距今时长(月)缴费用户最近一次缴费金额(元)用户近6个月平均消费值(元)...当月是否景点游览当月是否体育场馆消费当月网购类应用使用次数当月物流快递类应用使用次数当月金融理财类应用使用总次数当月视频播放类应用使用次数当月飞机类应用使用次数当月火车类应用使用次数当月旅游资讯类应用使用次数信用分
0a4651f98c82948b186bdcdc8108381b4144000186199.8163.86...117130274071450030664
\n", 154 | "

1 rows × 30 columns

\n", 155 | "
" 156 | ], 157 | "text/plain": [ 158 | " 用户编码 用户实名制是否通过核实 用户年龄 是否大学生客户 是否黑名单客户 \\\n", 159 | "0 a4651f98c82948b186bdcdc8108381b4 1 44 0 0 \n", 160 | "\n", 161 | " 是否4G不健康客户 用户网龄(月) 用户最近一次缴费距今时长(月) 缴费用户最近一次缴费金额(元) 用户近6个月平均消费值(元) ... \\\n", 162 | "0 0 186 1 99.8 163.86 ... \n", 163 | "\n", 164 | " 当月是否景点游览 当月是否体育场馆消费 当月网购类应用使用次数 当月物流快递类应用使用次数 当月金融理财类应用使用总次数 \\\n", 165 | "0 1 1 713 0 2740 \n", 166 | "\n", 167 | " 当月视频播放类应用使用次数 当月飞机类应用使用次数 当月火车类应用使用次数 当月旅游资讯类应用使用次数 信用分 \n", 168 | "0 7145 0 0 30 664 \n", 169 | "\n", 170 | "[1 rows x 30 columns]" 171 | ] 172 | }, 173 | "execution_count": 3, 174 | "metadata": {}, 175 | "output_type": "execute_result" 176 | } 177 | ], 178 | "source": [ 179 | "train_data.head(1)" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 4, 185 | "metadata": {}, 186 | "outputs": [ 187 | { 188 | "name": "stdout", 189 | "output_type": "stream", 190 | "text": [ 191 | "Index(['用户编码', '用户实名制是否通过核实', '用户年龄', '是否大学生客户', '是否黑名单客户', '是否4G不健康客户',\n", 192 | " '用户网龄(月)', '用户最近一次缴费距今时长(月)', '缴费用户最近一次缴费金额(元)', '用户近6个月平均消费值(元)',\n", 193 | " '用户账单当月总费用(元)', '用户当月账户余额(元)', '缴费用户当前是否欠费缴费', '用户话费敏感度', '当月通话交往圈人数',\n", 194 | " '是否经常逛商场的人', '近三个月月均商场出现次数', '当月是否逛过福州仓山万达', '当月是否到过福州山姆会员店', '当月是否看电影',\n", 195 | " '当月是否景点游览', '当月是否体育场馆消费', '当月网购类应用使用次数', '当月物流快递类应用使用次数',\n", 196 | " '当月金融理财类应用使用总次数', '当月视频播放类应用使用次数', '当月飞机类应用使用次数', '当月火车类应用使用次数',\n", 197 | " '当月旅游资讯类应用使用次数', '信用分'],\n", 198 | " dtype='object')\n" 199 | ] 200 | } 201 | ], 202 | "source": [ 203 | "#all chinese name- -\n", 204 | "#rename one by one\n", 205 | "print(train_data.columns)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 5, 211 | "metadata": { 212 | "collapsed": true 213 | }, 214 | "outputs": [], 215 | "source": [ 216 | "train_data.columns = ['uid','true_name_flag','age','uni_student_flag','blk_list_flag',\\\n", 217 | " '4g_unhealth_flag','net_age_till_now','top_up_month_diff','top_up_amount',\\\n", 218 | " 'recent_6month_avg_use','total_account_fee','curr_month_balance',\\\n", 219 | " 'curr_overdue_flag','cost_sensitivity','connect_num','freq_shopping_flag',\\\n", 220 | " 'recent_3month_shopping_count','wanda_flag','sam_flag','movie_flag',\\\n", 221 | " 'tour_flag','sport_flag','online_shopping_count','express_count',\\\n", 222 | " 'finance_app_count','video_app_count','flight_count','train_count',\\\n", 223 | " 'tour_app_count','score']\n", 224 | "test_data.columns = train_data.columns[:-1]" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": {}, 230 | "source": [ 231 | "### Feature Engineering" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 6, 237 | "metadata": {}, 238 | "outputs": [ 239 | { 240 | "name": "stderr", 241 | "output_type": "stream", 242 | "text": [ 243 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:6: SettingWithCopyWarning: \n", 244 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 245 | "\n", 246 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 247 | " \n" 248 | ] 249 | } 250 | ], 251 | "source": [ 252 | "#age and net_age_in_month ---> 入网时的年龄 --- useless\n", 253 | "#top up amount, 充值金额是整数,和小数,应该对应不同的充值途径?\n", 254 | "\n", 255 | "def produce_offline_feat(train_data):\n", 256 | " train_data['top_up_amount_offline'] = 0\n", 257 | " train_data['top_up_amount_offline'][(train_data['top_up_amount'] % 10 == 0)&\\\n", 258 | " train_data['top_up_amount'] != 0] = 1\n", 259 | " 
return train_data\n", 260 | "\n", 261 | "train_data = produce_offline_feat(train_data)\n", 262 | "test_data = produce_offline_feat(test_data)" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 7, 268 | "metadata": { 269 | "collapsed": true 270 | }, 271 | "outputs": [], 272 | "source": [ 273 | "def produce_fee_rate(train_data):\n", 274 | " #看importance,当月话费 和最近半年平均话费都很高,算一下当月/半年 -->稳定性\n", 275 | " train_data['current_fee_stability'] = \\\n", 276 | " train_data['total_account_fee']/(train_data['recent_6month_avg_use'] + 1)\n", 277 | " \n", 278 | " #当月话费/当月账户余额\n", 279 | " train_data['use_left_rate'] = \\\n", 280 | " train_data['total_account_fee']/(train_data['curr_month_balance'] + 1)\n", 281 | " return train_data\n", 282 | "\n", 283 | "train_data = produce_fee_rate(train_data)\n", 284 | "test_data = produce_fee_rate(test_data)" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "metadata": {}, 290 | "source": [ 291 | "### Training" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 8, 297 | "metadata": { 298 | "collapsed": true 299 | }, 300 | "outputs": [], 301 | "source": [ 302 | "def display_importances(feature_importance_df_):\n", 303 | " cols = feature_importance_df_[[\"feature\", \"importance\"]].groupby(\"feature\").mean().sort_values(by=\"importance\", ascending=False)[:40].index\n", 304 | " best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]\n", 305 | " plt.figure(figsize=(8, 10))\n", 306 | " sns.barplot(x=\"importance\", y=\"feature\", data=best_features.sort_values(by=\"importance\", ascending=False))\n", 307 | " plt.title('LightGBM Features (avg over folds)')\n", 308 | " plt.tight_layout()\n", 309 | " plt.show()" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 9, 315 | "metadata": { 316 | "collapsed": true 317 | }, 318 | "outputs": [], 319 | "source": [ 320 | "#para\n", 321 | "params = {\n", 322 | " 'learning_rate': 0.01,\n", 323 | " 'boosting_type': 'gbdt',\n", 324 | " 'objective': 'regression_l1',\n", 325 | " 'metric': 'mae',\n", 326 | " 'feature_fraction': 0.6,\n", 327 | " 'bagging_fraction': 0.8,\n", 328 | " 'bagging_freq': 2,\n", 329 | " 'num_leaves': 31,\n", 330 | " 'verbose': -1,\n", 331 | " 'max_depth': 5,\n", 332 | " 'lambda_l2': 5, 'lambda_l1': 0\n", 333 | "}" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 10, 339 | "metadata": { 340 | "collapsed": true 341 | }, 342 | "outputs": [], 343 | "source": [ 344 | "NFOLDS = 5\n", 345 | "train_label = train_data['score']\n", 346 | "kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=2019)\n", 347 | "kf = kfold.split(train_data, train_label)" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 11, 353 | "metadata": {}, 354 | "outputs": [ 355 | { 356 | "data": { 357 | "text/plain": [ 358 | "Index(['uid', 'true_name_flag', 'age', 'uni_student_flag', 'blk_list_flag',\n", 359 | " '4g_unhealth_flag', 'net_age_till_now', 'top_up_month_diff',\n", 360 | " 'top_up_amount', 'recent_6month_avg_use', 'total_account_fee',\n", 361 | " 'curr_month_balance', 'curr_overdue_flag', 'cost_sensitivity',\n", 362 | " 'connect_num', 'freq_shopping_flag', 'recent_3month_shopping_count',\n", 363 | " 'wanda_flag', 'sam_flag', 'movie_flag', 'tour_flag', 'sport_flag',\n", 364 | " 'online_shopping_count', 'express_count', 'finance_app_count',\n", 365 | " 'video_app_count', 'flight_count', 'train_count', 'tour_app_count',\n", 366 | " 'score', 
'top_up_amount_offline', 'current_fee_stability',\n", 367 | " 'use_left_rate'],\n", 368 | " dtype='object')" 369 | ] 370 | }, 371 | "execution_count": 11, 372 | "metadata": {}, 373 | "output_type": "execute_result" 374 | } 375 | ], 376 | "source": [ 377 | "train_data.columns" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": 12, 383 | "metadata": { 384 | "collapsed": true 385 | }, 386 | "outputs": [], 387 | "source": [ 388 | "train_data_use = train_data.drop(['uid','score','blk_list_flag'], axis=1)\n", 389 | "test_data_use = test_data.drop(['uid','blk_list_flag'], axis=1)" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 13, 395 | "metadata": {}, 396 | "outputs": [ 397 | { 398 | "name": "stderr", 399 | "output_type": "stream", 400 | "text": [ 401 | "/Users/Venn/anaconda/lib/python3.6/site-packages/sklearn/model_selection/_split.py:605: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of members in any class cannot be less than n_splits=5.\n", 402 | " % (min_groups, self.n_splits)), Warning)\n" 403 | ] 404 | }, 405 | { 406 | "name": "stdout", 407 | "output_type": "stream", 408 | "text": [ 409 | "fold: 0 training\n", 410 | "Training until validation scores don't improve for 50 rounds.\n", 411 | "Early stopping, best iteration is:\n", 412 | "[2834]\tvalid_0's l1: 14.7519\n", 413 | "fold: 1 training\n", 414 | "Training until validation scores don't improve for 50 rounds.\n", 415 | "Early stopping, best iteration is:\n", 416 | "[2780]\tvalid_0's l1: 14.6775\n", 417 | "fold: 2 training\n", 418 | "Training until validation scores don't improve for 50 rounds.\n", 419 | "Early stopping, best iteration is:\n", 420 | "[3745]\tvalid_0's l1: 14.728\n", 421 | "fold: 3 training\n", 422 | "Training until validation scores don't improve for 50 rounds.\n", 423 | "Early stopping, best iteration is:\n", 424 | "[3009]\tvalid_0's l1: 14.46\n", 425 | "fold: 4 training\n", 426 | "Training until validation scores don't improve for 50 rounds.\n", 427 | "Early stopping, best iteration is:\n", 428 | "[2544]\tvalid_0's l1: 14.7818\n", 429 | "cv score for valid is: 0.06377613710442855\n" 430 | ] 431 | } 432 | ], 433 | "source": [ 434 | "cv_pred = np.zeros(test_data.shape[0])\n", 435 | "valid_best_l2_all = 0\n", 436 | "\n", 437 | "feature_importance_df = pd.DataFrame()\n", 438 | "count = 0\n", 439 | "for i, (train_fold, validate) in enumerate(kf):\n", 440 | " print('fold: ',i, ' training')\n", 441 | " X_train, X_validate, label_train, label_validate = \\\n", 442 | " train_data_use.iloc[train_fold, :], train_data_use.iloc[validate, :], \\\n", 443 | " train_label[train_fold], train_label[validate]\n", 444 | " dtrain = lgb.Dataset(X_train, label_train)\n", 445 | " dvalid = lgb.Dataset(X_validate, label_validate, reference=dtrain)\n", 446 | " bst = lgb.train(params, dtrain, num_boost_round=10000, valid_sets=dvalid, verbose_eval=-1,early_stopping_rounds=50)\n", 447 | " cv_pred += bst.predict(test_data_use, num_iteration=bst.best_iteration)\n", 448 | " valid_best_l2_all += bst.best_score['valid_0']['l1']\n", 449 | "\n", 450 | " fold_importance_df = pd.DataFrame()\n", 451 | " fold_importance_df[\"feature\"] = list(X_train.columns)\n", 452 | " fold_importance_df[\"importance\"] = bst.feature_importance(importance_type='gain', iteration=bst.best_iteration)\n", 453 | " fold_importance_df[\"fold\"] = count + 1\n", 454 | " feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)\n", 455 | " count += 
1\n", 456 | "\n", 457 | "cv_pred /= NFOLDS\n", 458 | "valid_best_l2_all /= NFOLDS\n", 459 | "print('cv score for valid is: ', 1/(1+valid_best_l2_all))" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": 15, 465 | "metadata": { 466 | "collapsed": true 467 | }, 468 | "outputs": [], 469 | "source": [ 470 | "display_importances(feature_importance_df)" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": null, 476 | "metadata": { 477 | "collapsed": true 478 | }, 479 | "outputs": [], 480 | "source": [ 481 | "baseline\n", 482 | "cv score for valid is: 0.06342856152921912 --- 0.06339265000\n", 483 | " \n", 484 | "#充值金额是否为整数\n", 485 | "cv score for valid is: 0.06343660584697094\n", 486 | "#当月话费/半年话费\n", 487 | "cv score for valid is: 0.06349188259250227\n", 488 | "#当月话费/余额\n", 489 | "cv score for valid is: 0.06350638782547711\n", 490 | " \n", 491 | "#leaves 31\n", 492 | "cv score for valid is: 0.06354362406472286\n", 493 | "#remove l1, l2 = 5\n", 494 | "cv score for valid is: 0.06358730556250403\n", 495 | "#feature fraction 0.7\n", 496 | "cv score for valid is: 0.06361478051326884 --- 0.06355141000\n", 497 | "max_depth 5, objective l1\n", 498 | "cv score for valid is: 0.06367445081783887\n", 499 | "feature fraction 0.6\n", 500 | "cv score for valid is: 0.06377264215140695 --- 0.06379867000\n", 501 | "remove blk flag\n", 502 | "cv score for valid is: 0.06377613710442855" 503 | ] 504 | }, 505 | { 506 | "cell_type": "markdown", 507 | "metadata": {}, 508 | "source": [ 509 | "### Submit" 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": 39, 515 | "metadata": {}, 516 | "outputs": [ 517 | { 518 | "name": "stderr", 519 | "output_type": "stream", 520 | "text": [ 521 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n", 522 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 523 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 524 | "\n", 525 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 526 | " \n" 527 | ] 528 | } 529 | ], 530 | "source": [ 531 | "test_data_sub = test_data[['uid']]\n", 532 | "test_data_sub['score'] = cv_pred\n", 533 | "test_data_sub.columns = ['id','score']" 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": 40, 539 | "metadata": {}, 540 | "outputs": [ 541 | { 542 | "name": "stderr", 543 | "output_type": "stream", 544 | "text": [ 545 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n", 546 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 547 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 548 | "\n", 549 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 550 | " \"\"\"Entry point for launching an IPython kernel.\n" 551 | ] 552 | } 553 | ], 554 | "source": [ 555 | "test_data_sub['score'] = test_data_sub['score'].apply(lambda x: int(np.round(x)))" 556 | ] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": 41, 561 | "metadata": { 562 | "collapsed": true 563 | }, 564 | "outputs": [], 565 | "source": [ 566 | "test_data_sub.to_csv('../output/baseline_63776.csv', index=False)" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": null, 572 | "metadata": { 573 | "collapsed": true 574 | }, 575 | "outputs": [], 576 | 
"source": [] 577 | } 578 | ], 579 | "metadata": { 580 | "kernelspec": { 581 | "display_name": "Python 3", 582 | "language": "python", 583 | "name": "python3" 584 | }, 585 | "language_info": { 586 | "codemirror_mode": { 587 | "name": "ipython", 588 | "version": 3 589 | }, 590 | "file_extension": ".py", 591 | "mimetype": "text/x-python", 592 | "name": "python", 593 | "nbconvert_exporter": "python", 594 | "pygments_lexer": "ipython3", 595 | "version": "3.6.1" 596 | } 597 | }, 598 | "nbformat": 4, 599 | "nbformat_minor": 2 600 | } 601 | -------------------------------------------------------------------------------- /code/.ipynb_checkpoints/Baseline_bagging_version-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Packages" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "name": "stderr", 17 | "output_type": "stream", 18 | "text": [ 19 | "/Users/Venn/anaconda/lib/python3.6/site-packages/lightgbm/__init__.py:46: UserWarning: Starting from version 2.2.1, the library file in distribution wheels for macOS is built by the Apple Clang (Xcode_8.3.1) compiler.\n", 20 | "This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.\n", 21 | "Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.\n", 22 | "You can install the OpenMP library by the following command: ``brew install libomp``.\n", 23 | " \"You can install the OpenMP library by the following command: ``brew install libomp``.\", UserWarning)\n" 24 | ] 25 | } 26 | ], 27 | "source": [ 28 | "import time\n", 29 | "import matplotlib.pyplot as plt\n", 30 | "import seaborn as sns\n", 31 | "import numpy as np\n", 32 | "import pandas as pd\n", 33 | "import lightgbm as lgb\n", 34 | "from sklearn.model_selection import StratifiedKFold\n", 35 | "from sklearn.preprocessing import LabelEncoder" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "### Input data" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 2, 48 | "metadata": { 49 | "collapsed": true 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "data_path = '../input/'\n", 54 | "train_data = pd.read_csv(data_path + 'train_dataset.csv')\n", 55 | "test_data = pd.read_csv(data_path + 'test_dataset.csv')\n", 56 | "sample_sub = pd.read_csv(data_path + 'submit_example.csv')" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "### Pre-processing" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 3, 69 | "metadata": {}, 70 | "outputs": [ 71 | { 72 | "data": { 73 | "text/html": [ 74 | "
\n", 75 | "\n", 88 | "\n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | "
用户编码用户实名制是否通过核实用户年龄是否大学生客户是否黑名单客户是否4G不健康客户用户网龄(月)用户最近一次缴费距今时长(月)缴费用户最近一次缴费金额(元)用户近6个月平均消费值(元)...当月是否景点游览当月是否体育场馆消费当月网购类应用使用次数当月物流快递类应用使用次数当月金融理财类应用使用总次数当月视频播放类应用使用次数当月飞机类应用使用次数当月火车类应用使用次数当月旅游资讯类应用使用次数信用分
0a4651f98c82948b186bdcdc8108381b4144000186199.8163.86...117130274071450030664
\n", 142 | "

1 rows × 30 columns

\n", 143 | "
" 144 | ], 145 | "text/plain": [ 146 | " 用户编码 用户实名制是否通过核实 用户年龄 是否大学生客户 是否黑名单客户 \\\n", 147 | "0 a4651f98c82948b186bdcdc8108381b4 1 44 0 0 \n", 148 | "\n", 149 | " 是否4G不健康客户 用户网龄(月) 用户最近一次缴费距今时长(月) 缴费用户最近一次缴费金额(元) 用户近6个月平均消费值(元) ... \\\n", 150 | "0 0 186 1 99.8 163.86 ... \n", 151 | "\n", 152 | " 当月是否景点游览 当月是否体育场馆消费 当月网购类应用使用次数 当月物流快递类应用使用次数 当月金融理财类应用使用总次数 \\\n", 153 | "0 1 1 713 0 2740 \n", 154 | "\n", 155 | " 当月视频播放类应用使用次数 当月飞机类应用使用次数 当月火车类应用使用次数 当月旅游资讯类应用使用次数 信用分 \n", 156 | "0 7145 0 0 30 664 \n", 157 | "\n", 158 | "[1 rows x 30 columns]" 159 | ] 160 | }, 161 | "execution_count": 3, 162 | "metadata": {}, 163 | "output_type": "execute_result" 164 | } 165 | ], 166 | "source": [ 167 | "train_data.head(1)" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 4, 173 | "metadata": {}, 174 | "outputs": [ 175 | { 176 | "name": "stdout", 177 | "output_type": "stream", 178 | "text": [ 179 | "Index(['用户编码', '用户实名制是否通过核实', '用户年龄', '是否大学生客户', '是否黑名单客户', '是否4G不健康客户',\n", 180 | " '用户网龄(月)', '用户最近一次缴费距今时长(月)', '缴费用户最近一次缴费金额(元)', '用户近6个月平均消费值(元)',\n", 181 | " '用户账单当月总费用(元)', '用户当月账户余额(元)', '缴费用户当前是否欠费缴费', '用户话费敏感度', '当月通话交往圈人数',\n", 182 | " '是否经常逛商场的人', '近三个月月均商场出现次数', '当月是否逛过福州仓山万达', '当月是否到过福州山姆会员店', '当月是否看电影',\n", 183 | " '当月是否景点游览', '当月是否体育场馆消费', '当月网购类应用使用次数', '当月物流快递类应用使用次数',\n", 184 | " '当月金融理财类应用使用总次数', '当月视频播放类应用使用次数', '当月飞机类应用使用次数', '当月火车类应用使用次数',\n", 185 | " '当月旅游资讯类应用使用次数', '信用分'],\n", 186 | " dtype='object')\n" 187 | ] 188 | } 189 | ], 190 | "source": [ 191 | "#all chinese name- -\n", 192 | "#rename one by one\n", 193 | "print(train_data.columns)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 5, 199 | "metadata": { 200 | "collapsed": true 201 | }, 202 | "outputs": [], 203 | "source": [ 204 | "train_data.columns = ['uid','true_name_flag','age','uni_student_flag','blk_list_flag',\\\n", 205 | " '4g_unhealth_flag','net_age_till_now','top_up_month_diff','top_up_amount',\\\n", 206 | " 'recent_6month_avg_use','total_account_fee','curr_month_balance',\\\n", 207 | " 'curr_overdue_flag','cost_sensitivity','connect_num','freq_shopping_flag',\\\n", 208 | " 'recent_3month_shopping_count','wanda_flag','sam_flag','movie_flag',\\\n", 209 | " 'tour_flag','sport_flag','online_shopping_count','express_count',\\\n", 210 | " 'finance_app_count','video_app_count','flight_count','train_count',\\\n", 211 | " 'tour_app_count','score']\n", 212 | "test_data.columns = train_data.columns[:-1]" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": { 219 | "collapsed": true 220 | }, 221 | "outputs": [], 222 | "source": [] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "### Feature Engineering" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 6, 234 | "metadata": {}, 235 | "outputs": [ 236 | { 237 | "name": "stderr", 238 | "output_type": "stream", 239 | "text": [ 240 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:11: SettingWithCopyWarning: \n", 241 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 242 | "\n", 243 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 244 | " # This is added back by InteractiveShellApp.init_path()\n" 245 | ] 246 | } 247 | ], 248 | "source": [ 249 | "#age and net_age_in_month ---> 入网时的年龄 --- useless\n", 250 | "#先前余额,当前余额 + 当月话费 - 上次缴费 --- useless\n", 251 | "#充值金额/余额 --- useless\n", 252 | 
"#当月话费/最近充值金额 --- useless\n", 253 | "#六个月均值/充值金额 --- useless\n", 254 | "\n", 255 | "#top up amount, 充值金额是整数,和小数,应该对应不同的充值途径?\n", 256 | "\n", 257 | "def produce_offline_feat(train_data):\n", 258 | " train_data['top_up_amount_offline'] = 0\n", 259 | " train_data['top_up_amount_offline'][(train_data['top_up_amount'] % 10 == 0)&\\\n", 260 | " train_data['top_up_amount'] != 0] = 1\n", 261 | " return train_data\n", 262 | "\n", 263 | "train_data = produce_offline_feat(train_data)\n", 264 | "test_data = produce_offline_feat(test_data)" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 7, 270 | "metadata": { 271 | "collapsed": true 272 | }, 273 | "outputs": [], 274 | "source": [ 275 | "def produce_fee_rate(train_data):\n", 276 | " #看importance,当月话费 和最近半年平均话费都很高,算一下当月/半年 -->稳定性\n", 277 | " train_data['current_fee_stability'] = \\\n", 278 | " train_data['total_account_fee']/(train_data['recent_6month_avg_use'] + 1)\n", 279 | " \n", 280 | " #当月话费/当月账户余额\n", 281 | " train_data['use_left_rate'] = \\\n", 282 | " train_data['total_account_fee']/(train_data['curr_month_balance'] + 1)\n", 283 | " return train_data\n", 284 | "\n", 285 | "train_data = produce_fee_rate(train_data)\n", 286 | "test_data = produce_fee_rate(test_data)" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": {}, 292 | "source": [ 293 | "### Training" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 8, 299 | "metadata": { 300 | "collapsed": true 301 | }, 302 | "outputs": [], 303 | "source": [ 304 | "def display_importances(feature_importance_df_):\n", 305 | " cols = feature_importance_df_[[\"feature\", \"importance\"]].groupby(\"feature\").mean().sort_values(by=\"importance\", ascending=False)[:40].index\n", 306 | " best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]\n", 307 | " plt.figure(figsize=(8, 10))\n", 308 | " sns.barplot(x=\"importance\", y=\"feature\", data=best_features.sort_values(by=\"importance\", ascending=False))\n", 309 | " plt.title('LightGBM Features (avg over folds)')\n", 310 | " plt.tight_layout()\n", 311 | " plt.show()" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 9, 317 | "metadata": { 318 | "collapsed": true 319 | }, 320 | "outputs": [], 321 | "source": [ 322 | "#para\n", 323 | "params = {\n", 324 | " 'learning_rate': 0.01,\n", 325 | " 'boosting_type': 'gbdt',\n", 326 | " 'objective': 'regression_l1',\n", 327 | " 'metric': 'mae',\n", 328 | " 'feature_fraction': 0.6,\n", 329 | " 'bagging_fraction': 0.8,\n", 330 | " 'bagging_freq': 2,\n", 331 | " 'num_leaves': 31,\n", 332 | " 'verbose': -1,\n", 333 | " 'max_depth': 5,\n", 334 | " 'lambda_l2': 5, 'lambda_l1': 0,'nthread': 8\n", 335 | "}" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 10, 341 | "metadata": { 342 | "collapsed": true 343 | }, 344 | "outputs": [], 345 | "source": [ 346 | "#para\n", 347 | "params2 = {\n", 348 | " 'learning_rate': 0.01,\n", 349 | " 'boosting_type': 'gbdt',\n", 350 | " 'objective': 'regression_l2',\n", 351 | " 'metric': 'mae',\n", 352 | " 'feature_fraction': 0.6,\n", 353 | " 'bagging_fraction': 0.8,\n", 354 | " 'bagging_freq': 2,\n", 355 | " 'num_leaves': 31,\n", 356 | " 'verbose': -1,\n", 357 | " 'max_depth': 5,\n", 358 | " 'lambda_l2': 5, 'lambda_l1': 0,'nthread': 8,\n", 359 | " 'seed': 89\n", 360 | "}" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": 11, 366 | "metadata": {}, 367 | "outputs": [ 368 | { 369 | "name": "stderr", 370 | 
"output_type": "stream", 371 | "text": [ 372 | "/Users/Venn/anaconda/lib/python3.6/site-packages/sklearn/model_selection/_split.py:605: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of members in any class cannot be less than n_splits=5.\n", 373 | " % (min_groups, self.n_splits)), Warning)\n" 374 | ] 375 | }, 376 | { 377 | "name": "stdout", 378 | "output_type": "stream", 379 | "text": [ 380 | "fold: 0 training\n", 381 | "Training until validation scores don't improve for 50 rounds.\n", 382 | "Early stopping, best iteration is:\n", 383 | "[2433]\tvalid_0's l1: 14.7441\n", 384 | "fold: 1 training\n", 385 | "Training until validation scores don't improve for 50 rounds.\n", 386 | "Early stopping, best iteration is:\n", 387 | "[1876]\tvalid_0's l1: 14.8595\n", 388 | "fold: 2 training\n", 389 | "Training until validation scores don't improve for 50 rounds.\n", 390 | "Early stopping, best iteration is:\n", 391 | "[2459]\tvalid_0's l1: 14.7082\n", 392 | "fold: 3 training\n", 393 | "Training until validation scores don't improve for 50 rounds.\n", 394 | "Early stopping, best iteration is:\n", 395 | "[2468]\tvalid_0's l1: 14.6564\n", 396 | "fold: 4 training\n", 397 | "Training until validation scores don't improve for 50 rounds.\n", 398 | "Early stopping, best iteration is:\n", 399 | "[2599]\tvalid_0's l1: 14.5114\n", 400 | "fold: 0 training\n", 401 | "Training until validation scores don't improve for 50 rounds.\n", 402 | "Early stopping, best iteration is:\n", 403 | "[3313]\tvalid_0's l1: 14.743\n", 404 | "fold: 1 training\n", 405 | "Training until validation scores don't improve for 50 rounds.\n", 406 | "Early stopping, best iteration is:\n", 407 | "[2590]\tvalid_0's l1: 14.8562\n", 408 | "fold: 2 training\n", 409 | "Training until validation scores don't improve for 50 rounds.\n", 410 | "Early stopping, best iteration is:\n", 411 | "[2523]\tvalid_0's l1: 14.5752\n", 412 | "fold: 3 training\n", 413 | "Training until validation scores don't improve for 50 rounds.\n", 414 | "Early stopping, best iteration is:\n", 415 | "[3564]\tvalid_0's l1: 14.6125\n", 416 | "fold: 4 training\n", 417 | "Training until validation scores don't improve for 50 rounds.\n", 418 | "Early stopping, best iteration is:\n", 419 | "[1853]\tvalid_0's l1: 14.6333\n", 420 | "fold: 0 training\n", 421 | "Training until validation scores don't improve for 50 rounds.\n", 422 | "Early stopping, best iteration is:\n", 423 | "[2851]\tvalid_0's l1: 14.9587\n", 424 | "fold: 1 training\n", 425 | "Training until validation scores don't improve for 50 rounds.\n", 426 | "Early stopping, best iteration is:\n", 427 | "[1875]\tvalid_0's l1: 14.7808\n", 428 | "fold: 2 training\n", 429 | "Training until validation scores don't improve for 50 rounds.\n", 430 | "Early stopping, best iteration is:\n", 431 | "[2957]\tvalid_0's l1: 14.5525\n", 432 | "fold: 3 training\n", 433 | "Training until validation scores don't improve for 50 rounds.\n", 434 | "Early stopping, best iteration is:\n", 435 | "[2723]\tvalid_0's l1: 14.4804\n", 436 | "fold: 4 training\n", 437 | "Training until validation scores don't improve for 50 rounds.\n", 438 | "Early stopping, best iteration is:\n", 439 | "[3311]\tvalid_0's l1: 14.6854\n" 440 | ] 441 | } 442 | ], 443 | "source": [ 444 | "cv_pred_all = 0\n", 445 | "en_amount = 3\n", 446 | "for seed in range(en_amount):\n", 447 | " NFOLDS = 5\n", 448 | " train_label = train_data['score']\n", 449 | " kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=seed)\n", 450 
| " kf = kfold.split(train_data, train_label)\n", 451 | "\n", 452 | " train_data_use = train_data.drop(['uid','score','blk_list_flag'], axis=1)\n", 453 | " test_data_use = test_data.drop(['uid','blk_list_flag'], axis=1)\n", 454 | "\n", 455 | "\n", 456 | " cv_pred = np.zeros(test_data.shape[0])\n", 457 | " valid_best_l2_all = 0\n", 458 | "\n", 459 | " feature_importance_df = pd.DataFrame()\n", 460 | " count = 0\n", 461 | " for i, (train_fold, validate) in enumerate(kf):\n", 462 | " print('fold: ',i, ' training')\n", 463 | " X_train, X_validate, label_train, label_validate = \\\n", 464 | " train_data_use.iloc[train_fold, :], train_data_use.iloc[validate, :], \\\n", 465 | " train_label[train_fold], train_label[validate]\n", 466 | " dtrain = lgb.Dataset(X_train, label_train)\n", 467 | " dvalid = lgb.Dataset(X_validate, label_validate, reference=dtrain)\n", 468 | " bst = lgb.train(params, dtrain, num_boost_round=10000, valid_sets=dvalid, verbose_eval=-1,early_stopping_rounds=50)\n", 469 | " cv_pred += bst.predict(test_data_use, num_iteration=bst.best_iteration)\n", 470 | " valid_best_l2_all += bst.best_score['valid_0']['l1']\n", 471 | "\n", 472 | "# fold_importance_df = pd.DataFrame()\n", 473 | "# fold_importance_df[\"feature\"] = list(X_train.columns)\n", 474 | "# fold_importance_df[\"importance\"] = bst.feature_importance(importance_type='split', iteration=bst.best_iteration)\n", 475 | "# fold_importance_df[\"fold\"] = count + 1\n", 476 | "# feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)\n", 477 | " count += 1\n", 478 | "\n", 479 | " cv_pred /= NFOLDS\n", 480 | " valid_best_l2_all /= NFOLDS\n", 481 | " \n", 482 | " cv_pred_all += cv_pred\n", 483 | "cv_pred_all /= en_amount\n", 484 | " #print('cv score for valid is: ', 1/(1+valid_best_l2_all))" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": 12, 490 | "metadata": {}, 491 | "outputs": [ 492 | { 493 | "name": "stderr", 494 | "output_type": "stream", 495 | "text": [ 496 | "/Users/Venn/anaconda/lib/python3.6/site-packages/sklearn/model_selection/_split.py:605: Warning: The least populated class in y has only 1 members, which is too few. 
The minimum number of members in any class cannot be less than n_splits=5.\n", 497 | " % (min_groups, self.n_splits)), Warning)\n" 498 | ] 499 | }, 500 | { 501 | "name": "stdout", 502 | "output_type": "stream", 503 | "text": [ 504 | "fold: 0 training\n", 505 | "Training until validation scores don't improve for 50 rounds.\n", 506 | "Early stopping, best iteration is:\n", 507 | "[2457]\tvalid_0's l1: 14.7871\n", 508 | "fold: 1 training\n", 509 | "Training until validation scores don't improve for 50 rounds.\n", 510 | "Early stopping, best iteration is:\n", 511 | "[2365]\tvalid_0's l1: 14.6983\n", 512 | "fold: 2 training\n", 513 | "Training until validation scores don't improve for 50 rounds.\n", 514 | "Early stopping, best iteration is:\n", 515 | "[2082]\tvalid_0's l1: 14.7999\n", 516 | "fold: 3 training\n", 517 | "Training until validation scores don't improve for 50 rounds.\n", 518 | "Early stopping, best iteration is:\n", 519 | "[2266]\tvalid_0's l1: 14.483\n", 520 | "fold: 4 training\n", 521 | "Training until validation scores don't improve for 50 rounds.\n", 522 | "Early stopping, best iteration is:\n", 523 | "[2046]\tvalid_0's l1: 14.7681\n", 524 | "fold: 0 training\n", 525 | "Training until validation scores don't improve for 50 rounds.\n", 526 | "Early stopping, best iteration is:\n", 527 | "[2436]\tvalid_0's l1: 14.7728\n", 528 | "fold: 1 training\n", 529 | "Training until validation scores don't improve for 50 rounds.\n", 530 | "Early stopping, best iteration is:\n", 531 | "[2053]\tvalid_0's l1: 14.8066\n", 532 | "fold: 2 training\n", 533 | "Training until validation scores don't improve for 50 rounds.\n", 534 | "Early stopping, best iteration is:\n", 535 | "[2221]\tvalid_0's l1: 14.5464\n", 536 | "fold: 3 training\n", 537 | "Training until validation scores don't improve for 50 rounds.\n", 538 | "Early stopping, best iteration is:\n", 539 | "[2348]\tvalid_0's l1: 14.5198\n", 540 | "fold: 4 training\n", 541 | "Training until validation scores don't improve for 50 rounds.\n", 542 | "Early stopping, best iteration is:\n", 543 | "[2207]\tvalid_0's l1: 14.8169\n", 544 | "fold: 0 training\n", 545 | "Training until validation scores don't improve for 50 rounds.\n", 546 | "Early stopping, best iteration is:\n", 547 | "[2110]\tvalid_0's l1: 14.5323\n", 548 | "fold: 1 training\n", 549 | "Training until validation scores don't improve for 50 rounds.\n", 550 | "Early stopping, best iteration is:\n", 551 | "[2627]\tvalid_0's l1: 14.8493\n", 552 | "fold: 2 training\n", 553 | "Training until validation scores don't improve for 50 rounds.\n", 554 | "Early stopping, best iteration is:\n", 555 | "[2040]\tvalid_0's l1: 14.8335\n", 556 | "fold: 3 training\n", 557 | "Training until validation scores don't improve for 50 rounds.\n", 558 | "Early stopping, best iteration is:\n", 559 | "[2241]\tvalid_0's l1: 14.6379\n", 560 | "fold: 4 training\n", 561 | "Training until validation scores don't improve for 50 rounds.\n", 562 | "Early stopping, best iteration is:\n", 563 | "[2424]\tvalid_0's l1: 14.6794\n" 564 | ] 565 | } 566 | ], 567 | "source": [ 568 | "cv_pred_all2 = 0\n", 569 | "en_amount = 3\n", 570 | "for seed in range(en_amount):\n", 571 | " NFOLDS = 5\n", 572 | " train_label = train_data['score']\n", 573 | " kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=(seed + 2019))\n", 574 | " kf = kfold.split(train_data, train_label)\n", 575 | "\n", 576 | " train_data_use = train_data.drop(['uid','score','blk_list_flag'], axis=1)\n", 577 | " test_data_use = 
test_data.drop(['uid','blk_list_flag'], axis=1)\n", 578 | "\n", 579 | "\n", 580 | " cv_pred = np.zeros(test_data.shape[0])\n", 581 | " valid_best_l2_all = 0\n", 582 | "\n", 583 | " feature_importance_df = pd.DataFrame()\n", 584 | " count = 0\n", 585 | " for i, (train_fold, validate) in enumerate(kf):\n", 586 | " print('fold: ',i, ' training')\n", 587 | " X_train, X_validate, label_train, label_validate = \\\n", 588 | " train_data_use.iloc[train_fold, :], train_data_use.iloc[validate, :], \\\n", 589 | " train_label[train_fold], train_label[validate]\n", 590 | " dtrain = lgb.Dataset(X_train, label_train)\n", 591 | " dvalid = lgb.Dataset(X_validate, label_validate, reference=dtrain)\n", 592 | " bst = lgb.train(params2, dtrain, num_boost_round=10000, valid_sets=dvalid, verbose_eval=-1,early_stopping_rounds=50)\n", 593 | " cv_pred += bst.predict(test_data_use, num_iteration=bst.best_iteration)\n", 594 | " valid_best_l2_all += bst.best_score['valid_0']['l1']\n", 595 | "\n", 596 | "# fold_importance_df = pd.DataFrame()\n", 597 | "# fold_importance_df[\"feature\"] = list(X_train.columns)\n", 598 | "# fold_importance_df[\"importance\"] = bst.feature_importance(importance_type='split', iteration=bst.best_iteration)\n", 599 | "# fold_importance_df[\"fold\"] = count + 1\n", 600 | "# feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)\n", 601 | " count += 1\n", 602 | "\n", 603 | " cv_pred /= NFOLDS\n", 604 | " valid_best_l2_all /= NFOLDS\n", 605 | " \n", 606 | " cv_pred_all2 += cv_pred\n", 607 | " \n", 608 | "cv_pred_all2 /= en_amount\n", 609 | " #print('cv score for valid is: ', 1/(1+valid_best_l2_all))" 610 | ] 611 | }, 612 | { 613 | "cell_type": "code", 614 | "execution_count": 13, 615 | "metadata": { 616 | "collapsed": true 617 | }, 618 | "outputs": [], 619 | "source": [ 620 | "# display_importances(feature_importance_df)" 621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": null, 626 | "metadata": { 627 | "collapsed": true 628 | }, 629 | "outputs": [], 630 | "source": [ 631 | "baseline\n", 632 | "cv score for valid is: 0.06342856152921912 --- 0.06339265000\n", 633 | " \n", 634 | "#充值金额是否为整数\n", 635 | "cv score for valid is: 0.06343660584697094\n", 636 | "#当月话费/半年话费\n", 637 | "cv score for valid is: 0.06349188259250227\n", 638 | "#当月话费/余额\n", 639 | "cv score for valid is: 0.06350638782547711\n", 640 | " \n", 641 | "#leaves 31\n", 642 | "cv score for valid is: 0.06354362406472286\n", 643 | "#remove l1, l2 = 5\n", 644 | "cv score for valid is: 0.06358730556250403\n", 645 | "#feature fraction 0.7\n", 646 | "cv score for valid is: 0.06361478051326884 --- 0.06355141000\n", 647 | "max_depth 5, objective l1\n", 648 | "cv score for valid is: 0.06367445081783887\n", 649 | "feature fraction 0.6\n", 650 | "cv score for valid is: 0.06377264215140695 --- 0.06379867000\n", 651 | "10 fold\n", 652 | "cv score for valid is: 0.0637915578042461 --- 6378 --- useless\n", 653 | "remove blk list flag\n", 654 | "cv score for valid is: 0.06377613710442855" 655 | ] 656 | }, 657 | { 658 | "cell_type": "markdown", 659 | "metadata": {}, 660 | "source": [ 661 | "### Submit" 662 | ] 663 | }, 664 | { 665 | "cell_type": "code", 666 | "execution_count": 14, 667 | "metadata": {}, 668 | "outputs": [ 669 | { 670 | "name": "stderr", 671 | "output_type": "stream", 672 | "text": [ 673 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n", 674 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 675 | "Try 
using .loc[row_indexer,col_indexer] = value instead\n", 676 | "\n", 677 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 678 | " \n", 679 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: \n", 680 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 681 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 682 | "\n", 683 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 684 | " after removing the cwd from sys.path.\n", 685 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning: \n", 686 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 687 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 688 | "\n", 689 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 690 | " \"\"\"\n" 691 | ] 692 | } 693 | ], 694 | "source": [ 695 | "test_data_sub = test_data[['uid']]\n", 696 | "test_data_sub['score'] = (cv_pred_all2 + cv_pred_all)/2\n", 697 | "test_data_sub.columns = ['id','score']\n", 698 | "test_data_sub['score1'] = cv_pred_all\n", 699 | "test_data_sub['score2'] = cv_pred_all2" 700 | ] 701 | }, 702 | { 703 | "cell_type": "code", 704 | "execution_count": 17, 705 | "metadata": {}, 706 | "outputs": [ 707 | { 708 | "name": "stderr", 709 | "output_type": "stream", 710 | "text": [ 711 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n", 712 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 713 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 714 | "\n", 715 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 716 | " \"\"\"Entry point for launching an IPython kernel.\n" 717 | ] 718 | } 719 | ], 720 | "source": [ 721 | "test_data_sub['score'] = test_data_sub['score'].apply(lambda x: int(np.round(x)))" 722 | ] 723 | }, 724 | { 725 | "cell_type": "code", 726 | "execution_count": 18, 727 | "metadata": { 728 | "collapsed": true 729 | }, 730 | "outputs": [], 731 | "source": [ 732 | "test_data_sub[['id','score']].to_csv('../output/baseline_6377_mae_mse_mean_6bagging.csv', index=False)" 733 | ] 734 | }, 735 | { 736 | "cell_type": "code", 737 | "execution_count": 31, 738 | "metadata": {}, 739 | "outputs": [ 740 | { 741 | "data": { 742 | "text/plain": [ 743 | "617.8386873193765" 744 | ] 745 | }, 746 | "execution_count": 31, 747 | "metadata": {}, 748 | "output_type": "execute_result" 749 | } 750 | ], 751 | "source": [ 752 | "#mean is: 1/(0.00161593) - 1, --- 617.8386873193765\n", 753 | "#std is around: 1/(0.02869282) - 1, --- 33.851924627833725" 754 | ] 755 | }, 756 | { 757 | "cell_type": "code", 758 | "execution_count": null, 759 | "metadata": { 760 | "collapsed": true 761 | }, 762 | "outputs": [], 763 | "source": [] 764 | } 765 | ], 766 | "metadata": { 767 | "kernelspec": { 768 | "display_name": "Python 3", 769 | "language": "python", 770 | "name": "python3" 771 | }, 772 | "language_info": { 773 | "codemirror_mode": { 774 | "name": "ipython", 775 | "version": 3 776 | }, 777 | "file_extension": ".py", 778 | "mimetype": "text/x-python", 779 | "name": "python", 780 | "nbconvert_exporter": "python", 781 | "pygments_lexer": "ipython3", 782 | "version": "3.6.1" 
783 | } 784 | }, 785 | "nbformat": 4, 786 | "nbformat_minor": 2 787 | } 788 | -------------------------------------------------------------------------------- /code/Baseline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Packages" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 14, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "#过年没时间做了,专心搞kaggle去了\n", 19 | "#kaggle玩家欢迎和我交流,ID是YourVenn" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 1, 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "name": "stderr", 29 | "output_type": "stream", 30 | "text": [ 31 | "/Users/Venn/anaconda/lib/python3.6/site-packages/lightgbm/__init__.py:46: UserWarning: Starting from version 2.2.1, the library file in distribution wheels for macOS is built by the Apple Clang (Xcode_8.3.1) compiler.\n", 32 | "This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.\n", 33 | "Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.\n", 34 | "You can install the OpenMP library by the following command: ``brew install libomp``.\n", 35 | " \"You can install the OpenMP library by the following command: ``brew install libomp``.\", UserWarning)\n" 36 | ] 37 | } 38 | ], 39 | "source": [ 40 | "import time\n", 41 | "import matplotlib.pyplot as plt\n", 42 | "import seaborn as sns\n", 43 | "import numpy as np\n", 44 | "import pandas as pd\n", 45 | "import lightgbm as lgb\n", 46 | "from sklearn.model_selection import StratifiedKFold\n", 47 | "from sklearn.preprocessing import LabelEncoder" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "### Input data" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 2, 60 | "metadata": { 61 | "collapsed": true 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "data_path = '../input/'\n", 66 | "train_data = pd.read_csv(data_path + 'train_dataset.csv')\n", 67 | "test_data = pd.read_csv(data_path + 'test_dataset.csv')\n", 68 | "sample_sub = pd.read_csv(data_path + 'submit_example.csv')" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "### Pre-processing" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 3, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "data": { 85 | "text/html": [ 86 | "
\n", 87 | "\n", 100 | "\n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | "
用户编码用户实名制是否通过核实用户年龄是否大学生客户是否黑名单客户是否4G不健康客户用户网龄(月)用户最近一次缴费距今时长(月)缴费用户最近一次缴费金额(元)用户近6个月平均消费值(元)...当月是否景点游览当月是否体育场馆消费当月网购类应用使用次数当月物流快递类应用使用次数当月金融理财类应用使用总次数当月视频播放类应用使用次数当月飞机类应用使用次数当月火车类应用使用次数当月旅游资讯类应用使用次数信用分
0a4651f98c82948b186bdcdc8108381b4144000186199.8163.86...117130274071450030664
\n", 154 | "

1 rows × 30 columns

\n", 155 | "
" 156 | ], 157 | "text/plain": [ 158 | " 用户编码 用户实名制是否通过核实 用户年龄 是否大学生客户 是否黑名单客户 \\\n", 159 | "0 a4651f98c82948b186bdcdc8108381b4 1 44 0 0 \n", 160 | "\n", 161 | " 是否4G不健康客户 用户网龄(月) 用户最近一次缴费距今时长(月) 缴费用户最近一次缴费金额(元) 用户近6个月平均消费值(元) ... \\\n", 162 | "0 0 186 1 99.8 163.86 ... \n", 163 | "\n", 164 | " 当月是否景点游览 当月是否体育场馆消费 当月网购类应用使用次数 当月物流快递类应用使用次数 当月金融理财类应用使用总次数 \\\n", 165 | "0 1 1 713 0 2740 \n", 166 | "\n", 167 | " 当月视频播放类应用使用次数 当月飞机类应用使用次数 当月火车类应用使用次数 当月旅游资讯类应用使用次数 信用分 \n", 168 | "0 7145 0 0 30 664 \n", 169 | "\n", 170 | "[1 rows x 30 columns]" 171 | ] 172 | }, 173 | "execution_count": 3, 174 | "metadata": {}, 175 | "output_type": "execute_result" 176 | } 177 | ], 178 | "source": [ 179 | "train_data.head(1)" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 4, 185 | "metadata": {}, 186 | "outputs": [ 187 | { 188 | "name": "stdout", 189 | "output_type": "stream", 190 | "text": [ 191 | "Index(['用户编码', '用户实名制是否通过核实', '用户年龄', '是否大学生客户', '是否黑名单客户', '是否4G不健康客户',\n", 192 | " '用户网龄(月)', '用户最近一次缴费距今时长(月)', '缴费用户最近一次缴费金额(元)', '用户近6个月平均消费值(元)',\n", 193 | " '用户账单当月总费用(元)', '用户当月账户余额(元)', '缴费用户当前是否欠费缴费', '用户话费敏感度', '当月通话交往圈人数',\n", 194 | " '是否经常逛商场的人', '近三个月月均商场出现次数', '当月是否逛过福州仓山万达', '当月是否到过福州山姆会员店', '当月是否看电影',\n", 195 | " '当月是否景点游览', '当月是否体育场馆消费', '当月网购类应用使用次数', '当月物流快递类应用使用次数',\n", 196 | " '当月金融理财类应用使用总次数', '当月视频播放类应用使用次数', '当月飞机类应用使用次数', '当月火车类应用使用次数',\n", 197 | " '当月旅游资讯类应用使用次数', '信用分'],\n", 198 | " dtype='object')\n" 199 | ] 200 | } 201 | ], 202 | "source": [ 203 | "#all chinese name- -\n", 204 | "#rename one by one\n", 205 | "print(train_data.columns)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 5, 211 | "metadata": { 212 | "collapsed": true 213 | }, 214 | "outputs": [], 215 | "source": [ 216 | "train_data.columns = ['uid','true_name_flag','age','uni_student_flag','blk_list_flag',\\\n", 217 | " '4g_unhealth_flag','net_age_till_now','top_up_month_diff','top_up_amount',\\\n", 218 | " 'recent_6month_avg_use','total_account_fee','curr_month_balance',\\\n", 219 | " 'curr_overdue_flag','cost_sensitivity','connect_num','freq_shopping_flag',\\\n", 220 | " 'recent_3month_shopping_count','wanda_flag','sam_flag','movie_flag',\\\n", 221 | " 'tour_flag','sport_flag','online_shopping_count','express_count',\\\n", 222 | " 'finance_app_count','video_app_count','flight_count','train_count',\\\n", 223 | " 'tour_app_count','score']\n", 224 | "test_data.columns = train_data.columns[:-1]" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": {}, 230 | "source": [ 231 | "### Feature Engineering" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 6, 237 | "metadata": {}, 238 | "outputs": [ 239 | { 240 | "name": "stderr", 241 | "output_type": "stream", 242 | "text": [ 243 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:6: SettingWithCopyWarning: \n", 244 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 245 | "\n", 246 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 247 | " \n" 248 | ] 249 | } 250 | ], 251 | "source": [ 252 | "#age and net_age_in_month ---> 入网时的年龄 --- useless\n", 253 | "#top up amount, 充值金额是整数,和小数,应该对应不同的充值途径?\n", 254 | "\n", 255 | "def produce_offline_feat(train_data):\n", 256 | " train_data['top_up_amount_offline'] = 0\n", 257 | " train_data['top_up_amount_offline'][(train_data['top_up_amount'] % 10 == 0)&\\\n", 258 | " train_data['top_up_amount'] != 0] = 1\n", 259 | " 
return train_data\n", 260 | "\n", 261 | "train_data = produce_offline_feat(train_data)\n", 262 | "test_data = produce_offline_feat(test_data)" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 7, 268 | "metadata": { 269 | "collapsed": true 270 | }, 271 | "outputs": [], 272 | "source": [ 273 | "def produce_fee_rate(train_data):\n", 274 | " #看importance,当月话费 和最近半年平均话费都很高,算一下当月/半年 -->稳定性\n", 275 | " train_data['current_fee_stability'] = \\\n", 276 | " train_data['total_account_fee']/(train_data['recent_6month_avg_use'] + 1)\n", 277 | " \n", 278 | " #当月话费/当月账户余额\n", 279 | " train_data['use_left_rate'] = \\\n", 280 | " train_data['total_account_fee']/(train_data['curr_month_balance'] + 1)\n", 281 | " return train_data\n", 282 | "\n", 283 | "train_data = produce_fee_rate(train_data)\n", 284 | "test_data = produce_fee_rate(test_data)" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "metadata": {}, 290 | "source": [ 291 | "### Training" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 8, 297 | "metadata": { 298 | "collapsed": true 299 | }, 300 | "outputs": [], 301 | "source": [ 302 | "def display_importances(feature_importance_df_):\n", 303 | " cols = feature_importance_df_[[\"feature\", \"importance\"]].groupby(\"feature\").mean().sort_values(by=\"importance\", ascending=False)[:40].index\n", 304 | " best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]\n", 305 | " plt.figure(figsize=(8, 10))\n", 306 | " sns.barplot(x=\"importance\", y=\"feature\", data=best_features.sort_values(by=\"importance\", ascending=False))\n", 307 | " plt.title('LightGBM Features (avg over folds)')\n", 308 | " plt.tight_layout()\n", 309 | " plt.show()" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 9, 315 | "metadata": { 316 | "collapsed": true 317 | }, 318 | "outputs": [], 319 | "source": [ 320 | "#para\n", 321 | "params = {\n", 322 | " 'learning_rate': 0.01,\n", 323 | " 'boosting_type': 'gbdt',\n", 324 | " 'objective': 'regression_l1',\n", 325 | " 'metric': 'mae',\n", 326 | " 'feature_fraction': 0.6,\n", 327 | " 'bagging_fraction': 0.8,\n", 328 | " 'bagging_freq': 2,\n", 329 | " 'num_leaves': 31,\n", 330 | " 'verbose': -1,\n", 331 | " 'max_depth': 5,\n", 332 | " 'lambda_l2': 5, 'lambda_l1': 0\n", 333 | "}" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 10, 339 | "metadata": { 340 | "collapsed": true 341 | }, 342 | "outputs": [], 343 | "source": [ 344 | "NFOLDS = 5\n", 345 | "train_label = train_data['score']\n", 346 | "kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=2019)\n", 347 | "kf = kfold.split(train_data, train_label)" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 11, 353 | "metadata": {}, 354 | "outputs": [ 355 | { 356 | "data": { 357 | "text/plain": [ 358 | "Index(['uid', 'true_name_flag', 'age', 'uni_student_flag', 'blk_list_flag',\n", 359 | " '4g_unhealth_flag', 'net_age_till_now', 'top_up_month_diff',\n", 360 | " 'top_up_amount', 'recent_6month_avg_use', 'total_account_fee',\n", 361 | " 'curr_month_balance', 'curr_overdue_flag', 'cost_sensitivity',\n", 362 | " 'connect_num', 'freq_shopping_flag', 'recent_3month_shopping_count',\n", 363 | " 'wanda_flag', 'sam_flag', 'movie_flag', 'tour_flag', 'sport_flag',\n", 364 | " 'online_shopping_count', 'express_count', 'finance_app_count',\n", 365 | " 'video_app_count', 'flight_count', 'train_count', 'tour_app_count',\n", 366 | " 'score', 
'top_up_amount_offline', 'current_fee_stability',\n", 367 | " 'use_left_rate'],\n", 368 | " dtype='object')" 369 | ] 370 | }, 371 | "execution_count": 11, 372 | "metadata": {}, 373 | "output_type": "execute_result" 374 | } 375 | ], 376 | "source": [ 377 | "train_data.columns" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": 12, 383 | "metadata": { 384 | "collapsed": true 385 | }, 386 | "outputs": [], 387 | "source": [ 388 | "train_data_use = train_data.drop(['uid','score','blk_list_flag'], axis=1)\n", 389 | "test_data_use = test_data.drop(['uid','blk_list_flag'], axis=1)" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 13, 395 | "metadata": {}, 396 | "outputs": [ 397 | { 398 | "name": "stderr", 399 | "output_type": "stream", 400 | "text": [ 401 | "/Users/Venn/anaconda/lib/python3.6/site-packages/sklearn/model_selection/_split.py:605: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of members in any class cannot be less than n_splits=5.\n", 402 | " % (min_groups, self.n_splits)), Warning)\n" 403 | ] 404 | }, 405 | { 406 | "name": "stdout", 407 | "output_type": "stream", 408 | "text": [ 409 | "fold: 0 training\n", 410 | "Training until validation scores don't improve for 50 rounds.\n", 411 | "Early stopping, best iteration is:\n", 412 | "[2834]\tvalid_0's l1: 14.7519\n", 413 | "fold: 1 training\n", 414 | "Training until validation scores don't improve for 50 rounds.\n", 415 | "Early stopping, best iteration is:\n", 416 | "[2780]\tvalid_0's l1: 14.6775\n", 417 | "fold: 2 training\n", 418 | "Training until validation scores don't improve for 50 rounds.\n", 419 | "Early stopping, best iteration is:\n", 420 | "[3745]\tvalid_0's l1: 14.728\n", 421 | "fold: 3 training\n", 422 | "Training until validation scores don't improve for 50 rounds.\n", 423 | "Early stopping, best iteration is:\n", 424 | "[3009]\tvalid_0's l1: 14.46\n", 425 | "fold: 4 training\n", 426 | "Training until validation scores don't improve for 50 rounds.\n", 427 | "Early stopping, best iteration is:\n", 428 | "[2544]\tvalid_0's l1: 14.7818\n", 429 | "cv score for valid is: 0.06377613710442855\n" 430 | ] 431 | } 432 | ], 433 | "source": [ 434 | "cv_pred = np.zeros(test_data.shape[0])\n", 435 | "valid_best_l2_all = 0\n", 436 | "\n", 437 | "feature_importance_df = pd.DataFrame()\n", 438 | "count = 0\n", 439 | "for i, (train_fold, validate) in enumerate(kf):\n", 440 | " print('fold: ',i, ' training')\n", 441 | " X_train, X_validate, label_train, label_validate = \\\n", 442 | " train_data_use.iloc[train_fold, :], train_data_use.iloc[validate, :], \\\n", 443 | " train_label[train_fold], train_label[validate]\n", 444 | " dtrain = lgb.Dataset(X_train, label_train)\n", 445 | " dvalid = lgb.Dataset(X_validate, label_validate, reference=dtrain)\n", 446 | " bst = lgb.train(params, dtrain, num_boost_round=10000, valid_sets=dvalid, verbose_eval=-1,early_stopping_rounds=50)\n", 447 | " cv_pred += bst.predict(test_data_use, num_iteration=bst.best_iteration)\n", 448 | " valid_best_l2_all += bst.best_score['valid_0']['l1']\n", 449 | "\n", 450 | " fold_importance_df = pd.DataFrame()\n", 451 | " fold_importance_df[\"feature\"] = list(X_train.columns)\n", 452 | " fold_importance_df[\"importance\"] = bst.feature_importance(importance_type='gain', iteration=bst.best_iteration)\n", 453 | " fold_importance_df[\"fold\"] = count + 1\n", 454 | " feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)\n", 455 | " count += 
1\n", 456 | "\n", 457 | "cv_pred /= NFOLDS\n", 458 | "valid_best_l2_all /= NFOLDS\n", 459 | "print('cv score for valid is: ', 1/(1+valid_best_l2_all))" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": 15, 465 | "metadata": { 466 | "collapsed": true 467 | }, 468 | "outputs": [], 469 | "source": [ 470 | "display_importances(feature_importance_df)" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": null, 476 | "metadata": { 477 | "collapsed": true 478 | }, 479 | "outputs": [], 480 | "source": [ 481 | "baseline\n", 482 | "cv score for valid is: 0.06342856152921912 --- 0.06339265000\n", 483 | " \n", 484 | "#充值金额是否为整数\n", 485 | "cv score for valid is: 0.06343660584697094\n", 486 | "#当月话费/半年话费\n", 487 | "cv score for valid is: 0.06349188259250227\n", 488 | "#当月话费/余额\n", 489 | "cv score for valid is: 0.06350638782547711\n", 490 | " \n", 491 | "#leaves 31\n", 492 | "cv score for valid is: 0.06354362406472286\n", 493 | "#remove l1, l2 = 5\n", 494 | "cv score for valid is: 0.06358730556250403\n", 495 | "#feature fraction 0.7\n", 496 | "cv score for valid is: 0.06361478051326884 --- 0.06355141000\n", 497 | "max_depth 5, objective l1\n", 498 | "cv score for valid is: 0.06367445081783887\n", 499 | "feature fraction 0.6\n", 500 | "cv score for valid is: 0.06377264215140695 --- 0.06379867000\n", 501 | "remove blk flag\n", 502 | "cv score for valid is: 0.06377613710442855" 503 | ] 504 | }, 505 | { 506 | "cell_type": "markdown", 507 | "metadata": {}, 508 | "source": [ 509 | "### Submit" 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": 39, 515 | "metadata": {}, 516 | "outputs": [ 517 | { 518 | "name": "stderr", 519 | "output_type": "stream", 520 | "text": [ 521 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n", 522 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 523 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 524 | "\n", 525 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 526 | " \n" 527 | ] 528 | } 529 | ], 530 | "source": [ 531 | "test_data_sub = test_data[['uid']]\n", 532 | "test_data_sub['score'] = cv_pred\n", 533 | "test_data_sub.columns = ['id','score']" 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": 40, 539 | "metadata": {}, 540 | "outputs": [ 541 | { 542 | "name": "stderr", 543 | "output_type": "stream", 544 | "text": [ 545 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n", 546 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 547 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 548 | "\n", 549 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 550 | " \"\"\"Entry point for launching an IPython kernel.\n" 551 | ] 552 | } 553 | ], 554 | "source": [ 555 | "test_data_sub['score'] = test_data_sub['score'].apply(lambda x: int(np.round(x)))" 556 | ] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": 41, 561 | "metadata": { 562 | "collapsed": true 563 | }, 564 | "outputs": [], 565 | "source": [ 566 | "test_data_sub.to_csv('../output/baseline_63776.csv', index=False)" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": null, 572 | "metadata": { 573 | "collapsed": true 574 | }, 575 | "outputs": [], 576 | 
"source": [] 577 | } 578 | ], 579 | "metadata": { 580 | "kernelspec": { 581 | "display_name": "Python 3", 582 | "language": "python", 583 | "name": "python3" 584 | }, 585 | "language_info": { 586 | "codemirror_mode": { 587 | "name": "ipython", 588 | "version": 3 589 | }, 590 | "file_extension": ".py", 591 | "mimetype": "text/x-python", 592 | "name": "python", 593 | "nbconvert_exporter": "python", 594 | "pygments_lexer": "ipython3", 595 | "version": "3.6.1" 596 | } 597 | }, 598 | "nbformat": 4, 599 | "nbformat_minor": 2 600 | } 601 | -------------------------------------------------------------------------------- /code/Baseline_bagging_version.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Packages" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "#在local cv 6377的基础上,加上MSE优化,MAE & MAE各自用N个seed 最终加权平均" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 1, 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "name": "stderr", 28 | "output_type": "stream", 29 | "text": [ 30 | "/Users/Venn/anaconda/lib/python3.6/site-packages/lightgbm/__init__.py:46: UserWarning: Starting from version 2.2.1, the library file in distribution wheels for macOS is built by the Apple Clang (Xcode_8.3.1) compiler.\n", 31 | "This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.\n", 32 | "Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.\n", 33 | "You can install the OpenMP library by the following command: ``brew install libomp``.\n", 34 | " \"You can install the OpenMP library by the following command: ``brew install libomp``.\", UserWarning)\n" 35 | ] 36 | } 37 | ], 38 | "source": [ 39 | "import time\n", 40 | "import matplotlib.pyplot as plt\n", 41 | "import seaborn as sns\n", 42 | "import numpy as np\n", 43 | "import pandas as pd\n", 44 | "import lightgbm as lgb\n", 45 | "from sklearn.model_selection import StratifiedKFold\n", 46 | "from sklearn.preprocessing import LabelEncoder" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "### Input data" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 2, 59 | "metadata": { 60 | "collapsed": true 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "data_path = '../input/'\n", 65 | "train_data = pd.read_csv(data_path + 'train_dataset.csv')\n", 66 | "test_data = pd.read_csv(data_path + 'test_dataset.csv')\n", 67 | "sample_sub = pd.read_csv(data_path + 'submit_example.csv')" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "### Pre-processing" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 3, 80 | "metadata": {}, 81 | "outputs": [ 82 | { 83 | "data": { 84 | "text/html": [ 85 | "
\n", 86 | "\n", 99 | "\n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | "
用户编码用户实名制是否通过核实用户年龄是否大学生客户是否黑名单客户是否4G不健康客户用户网龄(月)用户最近一次缴费距今时长(月)缴费用户最近一次缴费金额(元)用户近6个月平均消费值(元)...当月是否景点游览当月是否体育场馆消费当月网购类应用使用次数当月物流快递类应用使用次数当月金融理财类应用使用总次数当月视频播放类应用使用次数当月飞机类应用使用次数当月火车类应用使用次数当月旅游资讯类应用使用次数信用分
0a4651f98c82948b186bdcdc8108381b4144000186199.8163.86...117130274071450030664
\n", 153 | "

1 rows × 30 columns

\n", 154 | "
" 155 | ], 156 | "text/plain": [ 157 | " 用户编码 用户实名制是否通过核实 用户年龄 是否大学生客户 是否黑名单客户 \\\n", 158 | "0 a4651f98c82948b186bdcdc8108381b4 1 44 0 0 \n", 159 | "\n", 160 | " 是否4G不健康客户 用户网龄(月) 用户最近一次缴费距今时长(月) 缴费用户最近一次缴费金额(元) 用户近6个月平均消费值(元) ... \\\n", 161 | "0 0 186 1 99.8 163.86 ... \n", 162 | "\n", 163 | " 当月是否景点游览 当月是否体育场馆消费 当月网购类应用使用次数 当月物流快递类应用使用次数 当月金融理财类应用使用总次数 \\\n", 164 | "0 1 1 713 0 2740 \n", 165 | "\n", 166 | " 当月视频播放类应用使用次数 当月飞机类应用使用次数 当月火车类应用使用次数 当月旅游资讯类应用使用次数 信用分 \n", 167 | "0 7145 0 0 30 664 \n", 168 | "\n", 169 | "[1 rows x 30 columns]" 170 | ] 171 | }, 172 | "execution_count": 3, 173 | "metadata": {}, 174 | "output_type": "execute_result" 175 | } 176 | ], 177 | "source": [ 178 | "train_data.head(1)" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 4, 184 | "metadata": {}, 185 | "outputs": [ 186 | { 187 | "name": "stdout", 188 | "output_type": "stream", 189 | "text": [ 190 | "Index(['用户编码', '用户实名制是否通过核实', '用户年龄', '是否大学生客户', '是否黑名单客户', '是否4G不健康客户',\n", 191 | " '用户网龄(月)', '用户最近一次缴费距今时长(月)', '缴费用户最近一次缴费金额(元)', '用户近6个月平均消费值(元)',\n", 192 | " '用户账单当月总费用(元)', '用户当月账户余额(元)', '缴费用户当前是否欠费缴费', '用户话费敏感度', '当月通话交往圈人数',\n", 193 | " '是否经常逛商场的人', '近三个月月均商场出现次数', '当月是否逛过福州仓山万达', '当月是否到过福州山姆会员店', '当月是否看电影',\n", 194 | " '当月是否景点游览', '当月是否体育场馆消费', '当月网购类应用使用次数', '当月物流快递类应用使用次数',\n", 195 | " '当月金融理财类应用使用总次数', '当月视频播放类应用使用次数', '当月飞机类应用使用次数', '当月火车类应用使用次数',\n", 196 | " '当月旅游资讯类应用使用次数', '信用分'],\n", 197 | " dtype='object')\n" 198 | ] 199 | } 200 | ], 201 | "source": [ 202 | "#all chinese name- -\n", 203 | "#rename one by one\n", 204 | "print(train_data.columns)" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 5, 210 | "metadata": { 211 | "collapsed": true 212 | }, 213 | "outputs": [], 214 | "source": [ 215 | "train_data.columns = ['uid','true_name_flag','age','uni_student_flag','blk_list_flag',\\\n", 216 | " '4g_unhealth_flag','net_age_till_now','top_up_month_diff','top_up_amount',\\\n", 217 | " 'recent_6month_avg_use','total_account_fee','curr_month_balance',\\\n", 218 | " 'curr_overdue_flag','cost_sensitivity','connect_num','freq_shopping_flag',\\\n", 219 | " 'recent_3month_shopping_count','wanda_flag','sam_flag','movie_flag',\\\n", 220 | " 'tour_flag','sport_flag','online_shopping_count','express_count',\\\n", 221 | " 'finance_app_count','video_app_count','flight_count','train_count',\\\n", 222 | " 'tour_app_count','score']\n", 223 | "test_data.columns = train_data.columns[:-1]" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": { 230 | "collapsed": true 231 | }, 232 | "outputs": [], 233 | "source": [] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "### Feature Engineering" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 6, 245 | "metadata": {}, 246 | "outputs": [ 247 | { 248 | "name": "stderr", 249 | "output_type": "stream", 250 | "text": [ 251 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:11: SettingWithCopyWarning: \n", 252 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 253 | "\n", 254 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 255 | " # This is added back by InteractiveShellApp.init_path()\n" 256 | ] 257 | } 258 | ], 259 | "source": [ 260 | "#age and net_age_in_month ---> 入网时的年龄 --- useless\n", 261 | "#先前余额,当前余额 + 当月话费 - 上次缴费 --- useless\n", 262 | "#充值金额/余额 --- useless\n", 263 | 
"#当月话费/最近充值金额 --- useless\n", 264 | "#六个月均值/充值金额 --- useless\n", 265 | "\n", 266 | "#top up amount, 充值金额是整数,和小数,应该对应不同的充值途径?\n", 267 | "\n", 268 | "def produce_offline_feat(train_data):\n", 269 | " train_data['top_up_amount_offline'] = 0\n", 270 | " train_data['top_up_amount_offline'][(train_data['top_up_amount'] % 10 == 0)&\\\n", 271 | " train_data['top_up_amount'] != 0] = 1\n", 272 | " return train_data\n", 273 | "\n", 274 | "train_data = produce_offline_feat(train_data)\n", 275 | "test_data = produce_offline_feat(test_data)" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 7, 281 | "metadata": { 282 | "collapsed": true 283 | }, 284 | "outputs": [], 285 | "source": [ 286 | "def produce_fee_rate(train_data):\n", 287 | " #看importance,当月话费 和最近半年平均话费都很高,算一下当月/半年 -->稳定性\n", 288 | " train_data['current_fee_stability'] = \\\n", 289 | " train_data['total_account_fee']/(train_data['recent_6month_avg_use'] + 1)\n", 290 | " \n", 291 | " #当月话费/当月账户余额\n", 292 | " train_data['use_left_rate'] = \\\n", 293 | " train_data['total_account_fee']/(train_data['curr_month_balance'] + 1)\n", 294 | " return train_data\n", 295 | "\n", 296 | "train_data = produce_fee_rate(train_data)\n", 297 | "test_data = produce_fee_rate(test_data)" 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "metadata": {}, 303 | "source": [ 304 | "### Training" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": 8, 310 | "metadata": { 311 | "collapsed": true 312 | }, 313 | "outputs": [], 314 | "source": [ 315 | "def display_importances(feature_importance_df_):\n", 316 | " cols = feature_importance_df_[[\"feature\", \"importance\"]].groupby(\"feature\").mean().sort_values(by=\"importance\", ascending=False)[:40].index\n", 317 | " best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]\n", 318 | " plt.figure(figsize=(8, 10))\n", 319 | " sns.barplot(x=\"importance\", y=\"feature\", data=best_features.sort_values(by=\"importance\", ascending=False))\n", 320 | " plt.title('LightGBM Features (avg over folds)')\n", 321 | " plt.tight_layout()\n", 322 | " plt.show()" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 9, 328 | "metadata": { 329 | "collapsed": true 330 | }, 331 | "outputs": [], 332 | "source": [ 333 | "#para\n", 334 | "params = {\n", 335 | " 'learning_rate': 0.01,\n", 336 | " 'boosting_type': 'gbdt',\n", 337 | " 'objective': 'regression_l1',\n", 338 | " 'metric': 'mae',\n", 339 | " 'feature_fraction': 0.6,\n", 340 | " 'bagging_fraction': 0.8,\n", 341 | " 'bagging_freq': 2,\n", 342 | " 'num_leaves': 31,\n", 343 | " 'verbose': -1,\n", 344 | " 'max_depth': 5,\n", 345 | " 'lambda_l2': 5, 'lambda_l1': 0,'nthread': 8\n", 346 | "}" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 10, 352 | "metadata": { 353 | "collapsed": true 354 | }, 355 | "outputs": [], 356 | "source": [ 357 | "#para\n", 358 | "params2 = {\n", 359 | " 'learning_rate': 0.01,\n", 360 | " 'boosting_type': 'gbdt',\n", 361 | " 'objective': 'regression_l2',\n", 362 | " 'metric': 'mae',\n", 363 | " 'feature_fraction': 0.6,\n", 364 | " 'bagging_fraction': 0.8,\n", 365 | " 'bagging_freq': 2,\n", 366 | " 'num_leaves': 31,\n", 367 | " 'verbose': -1,\n", 368 | " 'max_depth': 5,\n", 369 | " 'lambda_l2': 5, 'lambda_l1': 0,'nthread': 8,\n", 370 | " 'seed': 89\n", 371 | "}" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": 11, 377 | "metadata": {}, 378 | "outputs": [ 379 | { 380 | "name": "stderr", 381 | 
"output_type": "stream", 382 | "text": [ 383 | "/Users/Venn/anaconda/lib/python3.6/site-packages/sklearn/model_selection/_split.py:605: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of members in any class cannot be less than n_splits=5.\n", 384 | " % (min_groups, self.n_splits)), Warning)\n" 385 | ] 386 | }, 387 | { 388 | "name": "stdout", 389 | "output_type": "stream", 390 | "text": [ 391 | "fold: 0 training\n", 392 | "Training until validation scores don't improve for 50 rounds.\n", 393 | "Early stopping, best iteration is:\n", 394 | "[2433]\tvalid_0's l1: 14.7441\n", 395 | "fold: 1 training\n", 396 | "Training until validation scores don't improve for 50 rounds.\n", 397 | "Early stopping, best iteration is:\n", 398 | "[1876]\tvalid_0's l1: 14.8595\n", 399 | "fold: 2 training\n", 400 | "Training until validation scores don't improve for 50 rounds.\n", 401 | "Early stopping, best iteration is:\n", 402 | "[2459]\tvalid_0's l1: 14.7082\n", 403 | "fold: 3 training\n", 404 | "Training until validation scores don't improve for 50 rounds.\n", 405 | "Early stopping, best iteration is:\n", 406 | "[2468]\tvalid_0's l1: 14.6564\n", 407 | "fold: 4 training\n", 408 | "Training until validation scores don't improve for 50 rounds.\n", 409 | "Early stopping, best iteration is:\n", 410 | "[2599]\tvalid_0's l1: 14.5114\n", 411 | "fold: 0 training\n", 412 | "Training until validation scores don't improve for 50 rounds.\n", 413 | "Early stopping, best iteration is:\n", 414 | "[3313]\tvalid_0's l1: 14.743\n", 415 | "fold: 1 training\n", 416 | "Training until validation scores don't improve for 50 rounds.\n", 417 | "Early stopping, best iteration is:\n", 418 | "[2590]\tvalid_0's l1: 14.8562\n", 419 | "fold: 2 training\n", 420 | "Training until validation scores don't improve for 50 rounds.\n", 421 | "Early stopping, best iteration is:\n", 422 | "[2523]\tvalid_0's l1: 14.5752\n", 423 | "fold: 3 training\n", 424 | "Training until validation scores don't improve for 50 rounds.\n", 425 | "Early stopping, best iteration is:\n", 426 | "[3564]\tvalid_0's l1: 14.6125\n", 427 | "fold: 4 training\n", 428 | "Training until validation scores don't improve for 50 rounds.\n", 429 | "Early stopping, best iteration is:\n", 430 | "[1853]\tvalid_0's l1: 14.6333\n", 431 | "fold: 0 training\n", 432 | "Training until validation scores don't improve for 50 rounds.\n", 433 | "Early stopping, best iteration is:\n", 434 | "[2851]\tvalid_0's l1: 14.9587\n", 435 | "fold: 1 training\n", 436 | "Training until validation scores don't improve for 50 rounds.\n", 437 | "Early stopping, best iteration is:\n", 438 | "[1875]\tvalid_0's l1: 14.7808\n", 439 | "fold: 2 training\n", 440 | "Training until validation scores don't improve for 50 rounds.\n", 441 | "Early stopping, best iteration is:\n", 442 | "[2957]\tvalid_0's l1: 14.5525\n", 443 | "fold: 3 training\n", 444 | "Training until validation scores don't improve for 50 rounds.\n", 445 | "Early stopping, best iteration is:\n", 446 | "[2723]\tvalid_0's l1: 14.4804\n", 447 | "fold: 4 training\n", 448 | "Training until validation scores don't improve for 50 rounds.\n", 449 | "Early stopping, best iteration is:\n", 450 | "[3311]\tvalid_0's l1: 14.6854\n" 451 | ] 452 | } 453 | ], 454 | "source": [ 455 | "cv_pred_all = 0\n", 456 | "en_amount = 3\n", 457 | "for seed in range(en_amount):\n", 458 | " NFOLDS = 5\n", 459 | " train_label = train_data['score']\n", 460 | " kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=seed)\n", 461 
| " kf = kfold.split(train_data, train_label)\n", 462 | "\n", 463 | " train_data_use = train_data.drop(['uid','score','blk_list_flag'], axis=1)\n", 464 | " test_data_use = test_data.drop(['uid','blk_list_flag'], axis=1)\n", 465 | "\n", 466 | "\n", 467 | " cv_pred = np.zeros(test_data.shape[0])\n", 468 | " valid_best_l2_all = 0\n", 469 | "\n", 470 | " feature_importance_df = pd.DataFrame()\n", 471 | " count = 0\n", 472 | " for i, (train_fold, validate) in enumerate(kf):\n", 473 | " print('fold: ',i, ' training')\n", 474 | " X_train, X_validate, label_train, label_validate = \\\n", 475 | " train_data_use.iloc[train_fold, :], train_data_use.iloc[validate, :], \\\n", 476 | " train_label[train_fold], train_label[validate]\n", 477 | " dtrain = lgb.Dataset(X_train, label_train)\n", 478 | " dvalid = lgb.Dataset(X_validate, label_validate, reference=dtrain)\n", 479 | " bst = lgb.train(params, dtrain, num_boost_round=10000, valid_sets=dvalid, verbose_eval=-1,early_stopping_rounds=50)\n", 480 | " cv_pred += bst.predict(test_data_use, num_iteration=bst.best_iteration)\n", 481 | " valid_best_l2_all += bst.best_score['valid_0']['l1']\n", 482 | "\n", 483 | "# fold_importance_df = pd.DataFrame()\n", 484 | "# fold_importance_df[\"feature\"] = list(X_train.columns)\n", 485 | "# fold_importance_df[\"importance\"] = bst.feature_importance(importance_type='split', iteration=bst.best_iteration)\n", 486 | "# fold_importance_df[\"fold\"] = count + 1\n", 487 | "# feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)\n", 488 | " count += 1\n", 489 | "\n", 490 | " cv_pred /= NFOLDS\n", 491 | " valid_best_l2_all /= NFOLDS\n", 492 | " \n", 493 | " cv_pred_all += cv_pred\n", 494 | "cv_pred_all /= en_amount\n", 495 | " #print('cv score for valid is: ', 1/(1+valid_best_l2_all))" 496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": 12, 501 | "metadata": {}, 502 | "outputs": [ 503 | { 504 | "name": "stderr", 505 | "output_type": "stream", 506 | "text": [ 507 | "/Users/Venn/anaconda/lib/python3.6/site-packages/sklearn/model_selection/_split.py:605: Warning: The least populated class in y has only 1 members, which is too few. 
The minimum number of members in any class cannot be less than n_splits=5.\n", 508 | " % (min_groups, self.n_splits)), Warning)\n" 509 | ] 510 | }, 511 | { 512 | "name": "stdout", 513 | "output_type": "stream", 514 | "text": [ 515 | "fold: 0 training\n", 516 | "Training until validation scores don't improve for 50 rounds.\n", 517 | "Early stopping, best iteration is:\n", 518 | "[2457]\tvalid_0's l1: 14.7871\n", 519 | "fold: 1 training\n", 520 | "Training until validation scores don't improve for 50 rounds.\n", 521 | "Early stopping, best iteration is:\n", 522 | "[2365]\tvalid_0's l1: 14.6983\n", 523 | "fold: 2 training\n", 524 | "Training until validation scores don't improve for 50 rounds.\n", 525 | "Early stopping, best iteration is:\n", 526 | "[2082]\tvalid_0's l1: 14.7999\n", 527 | "fold: 3 training\n", 528 | "Training until validation scores don't improve for 50 rounds.\n", 529 | "Early stopping, best iteration is:\n", 530 | "[2266]\tvalid_0's l1: 14.483\n", 531 | "fold: 4 training\n", 532 | "Training until validation scores don't improve for 50 rounds.\n", 533 | "Early stopping, best iteration is:\n", 534 | "[2046]\tvalid_0's l1: 14.7681\n", 535 | "fold: 0 training\n", 536 | "Training until validation scores don't improve for 50 rounds.\n", 537 | "Early stopping, best iteration is:\n", 538 | "[2436]\tvalid_0's l1: 14.7728\n", 539 | "fold: 1 training\n", 540 | "Training until validation scores don't improve for 50 rounds.\n", 541 | "Early stopping, best iteration is:\n", 542 | "[2053]\tvalid_0's l1: 14.8066\n", 543 | "fold: 2 training\n", 544 | "Training until validation scores don't improve for 50 rounds.\n", 545 | "Early stopping, best iteration is:\n", 546 | "[2221]\tvalid_0's l1: 14.5464\n", 547 | "fold: 3 training\n", 548 | "Training until validation scores don't improve for 50 rounds.\n", 549 | "Early stopping, best iteration is:\n", 550 | "[2348]\tvalid_0's l1: 14.5198\n", 551 | "fold: 4 training\n", 552 | "Training until validation scores don't improve for 50 rounds.\n", 553 | "Early stopping, best iteration is:\n", 554 | "[2207]\tvalid_0's l1: 14.8169\n", 555 | "fold: 0 training\n", 556 | "Training until validation scores don't improve for 50 rounds.\n", 557 | "Early stopping, best iteration is:\n", 558 | "[2110]\tvalid_0's l1: 14.5323\n", 559 | "fold: 1 training\n", 560 | "Training until validation scores don't improve for 50 rounds.\n", 561 | "Early stopping, best iteration is:\n", 562 | "[2627]\tvalid_0's l1: 14.8493\n", 563 | "fold: 2 training\n", 564 | "Training until validation scores don't improve for 50 rounds.\n", 565 | "Early stopping, best iteration is:\n", 566 | "[2040]\tvalid_0's l1: 14.8335\n", 567 | "fold: 3 training\n", 568 | "Training until validation scores don't improve for 50 rounds.\n", 569 | "Early stopping, best iteration is:\n", 570 | "[2241]\tvalid_0's l1: 14.6379\n", 571 | "fold: 4 training\n", 572 | "Training until validation scores don't improve for 50 rounds.\n", 573 | "Early stopping, best iteration is:\n", 574 | "[2424]\tvalid_0's l1: 14.6794\n" 575 | ] 576 | } 577 | ], 578 | "source": [ 579 | "cv_pred_all2 = 0\n", 580 | "en_amount = 3\n", 581 | "for seed in range(en_amount):\n", 582 | " NFOLDS = 5\n", 583 | " train_label = train_data['score']\n", 584 | " kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=(seed + 2019))\n", 585 | " kf = kfold.split(train_data, train_label)\n", 586 | "\n", 587 | " train_data_use = train_data.drop(['uid','score','blk_list_flag'], axis=1)\n", 588 | " test_data_use = 
test_data.drop(['uid','blk_list_flag'], axis=1)\n", 589 | "\n", 590 | "\n", 591 | " cv_pred = np.zeros(test_data.shape[0])\n", 592 | " valid_best_l2_all = 0\n", 593 | "\n", 594 | " feature_importance_df = pd.DataFrame()\n", 595 | " count = 0\n", 596 | " for i, (train_fold, validate) in enumerate(kf):\n", 597 | " print('fold: ',i, ' training')\n", 598 | " X_train, X_validate, label_train, label_validate = \\\n", 599 | " train_data_use.iloc[train_fold, :], train_data_use.iloc[validate, :], \\\n", 600 | " train_label[train_fold], train_label[validate]\n", 601 | " dtrain = lgb.Dataset(X_train, label_train)\n", 602 | " dvalid = lgb.Dataset(X_validate, label_validate, reference=dtrain)\n", 603 | " bst = lgb.train(params2, dtrain, num_boost_round=10000, valid_sets=dvalid, verbose_eval=-1,early_stopping_rounds=50)\n", 604 | " cv_pred += bst.predict(test_data_use, num_iteration=bst.best_iteration)\n", 605 | " valid_best_l2_all += bst.best_score['valid_0']['l1']\n", 606 | "\n", 607 | "# fold_importance_df = pd.DataFrame()\n", 608 | "# fold_importance_df[\"feature\"] = list(X_train.columns)\n", 609 | "# fold_importance_df[\"importance\"] = bst.feature_importance(importance_type='split', iteration=bst.best_iteration)\n", 610 | "# fold_importance_df[\"fold\"] = count + 1\n", 611 | "# feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)\n", 612 | " count += 1\n", 613 | "\n", 614 | " cv_pred /= NFOLDS\n", 615 | " valid_best_l2_all /= NFOLDS\n", 616 | " \n", 617 | " cv_pred_all2 += cv_pred\n", 618 | " \n", 619 | "cv_pred_all2 /= en_amount\n", 620 | " #print('cv score for valid is: ', 1/(1+valid_best_l2_all))" 621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": 13, 626 | "metadata": { 627 | "collapsed": true 628 | }, 629 | "outputs": [], 630 | "source": [ 631 | "# display_importances(feature_importance_df)" 632 | ] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": null, 637 | "metadata": { 638 | "collapsed": true 639 | }, 640 | "outputs": [], 641 | "source": [ 642 | "baseline\n", 643 | "cv score for valid is: 0.06342856152921912 --- 0.06339265000\n", 644 | " \n", 645 | "#whether the top-up amount is an integer\n", 646 | "cv score for valid is: 0.06343660584697094\n", 647 | "#current-month fee / half-year average fee\n", 648 | "cv score for valid is: 0.06349188259250227\n", 649 | "#current-month fee / balance\n", 650 | "cv score for valid is: 0.06350638782547711\n", 651 | " \n", 652 | "#leaves 31\n", 653 | "cv score for valid is: 0.06354362406472286\n", 654 | "#remove l1, l2 = 5\n", 655 | "cv score for valid is: 0.06358730556250403\n", 656 | "#feature fraction 0.7\n", 657 | "cv score for valid is: 0.06361478051326884 --- 0.06355141000\n", 658 | "max_depth 5, objective l1\n", 659 | "cv score for valid is: 0.06367445081783887\n", 660 | "feature fraction 0.6\n", 661 | "cv score for valid is: 0.06377264215140695 --- 0.06379867000\n", 662 | "10 fold\n", 663 | "cv score for valid is: 0.0637915578042461 --- 6378 --- useless\n", 664 | "remove blk list flag\n", 665 | "cv score for valid is: 0.06377613710442855" 666 | ] 667 | }, 668 | { 669 | "cell_type": "markdown", 670 | "metadata": {}, 671 | "source": [ 672 | "### Submit" 673 | ] 674 | }, 675 | { 676 | "cell_type": "code", 677 | "execution_count": 14, 678 | "metadata": {}, 679 | "outputs": [ 680 | { 681 | "name": "stderr", 682 | "output_type": "stream", 683 | "text": [ 684 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n", 685 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 686 | "Try 
using .loc[row_indexer,col_indexer] = value instead\n", 687 | "\n", 688 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 689 | " \n", 690 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: \n", 691 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 692 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 693 | "\n", 694 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 695 | " after removing the cwd from sys.path.\n", 696 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning: \n", 697 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 698 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 699 | "\n", 700 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 701 | " \"\"\"\n" 702 | ] 703 | } 704 | ], 705 | "source": [ 706 | "test_data_sub = test_data[['uid']]\n", 707 | "test_data_sub['score'] = (cv_pred_all2 + cv_pred_all)/2\n", 708 | "test_data_sub.columns = ['id','score']\n", 709 | "test_data_sub['score1'] = cv_pred_all\n", 710 | "test_data_sub['score2'] = cv_pred_all2" 711 | ] 712 | }, 713 | { 714 | "cell_type": "code", 715 | "execution_count": 17, 716 | "metadata": {}, 717 | "outputs": [ 718 | { 719 | "name": "stderr", 720 | "output_type": "stream", 721 | "text": [ 722 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n", 723 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 724 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 725 | "\n", 726 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 727 | " \"\"\"Entry point for launching an IPython kernel.\n" 728 | ] 729 | } 730 | ], 731 | "source": [ 732 | "test_data_sub['score'] = test_data_sub['score'].apply(lambda x: int(np.round(x)))" 733 | ] 734 | }, 735 | { 736 | "cell_type": "code", 737 | "execution_count": 18, 738 | "metadata": { 739 | "collapsed": true 740 | }, 741 | "outputs": [], 742 | "source": [ 743 | "test_data_sub[['id','score']].to_csv('../output/baseline_6377_mae_mse_mean_6bagging.csv', index=False)" 744 | ] 745 | }, 746 | { 747 | "cell_type": "code", 748 | "execution_count": 31, 749 | "metadata": {}, 750 | "outputs": [ 751 | { 752 | "data": { 753 | "text/plain": [ 754 | "617.8386873193765" 755 | ] 756 | }, 757 | "execution_count": 31, 758 | "metadata": {}, 759 | "output_type": "execute_result" 760 | } 761 | ], 762 | "source": [ 763 | "#mean is: 1/(0.00161593) - 1, --- 617.8386873193765\n", 764 | "#std is around: 1/(0.02869282) - 1, --- 33.851924627833725" 765 | ] 766 | }, 767 | { 768 | "cell_type": "code", 769 | "execution_count": null, 770 | "metadata": { 771 | "collapsed": true 772 | }, 773 | "outputs": [], 774 | "source": [] 775 | } 776 | ], 777 | "metadata": { 778 | "kernelspec": { 779 | "display_name": "Python 3", 780 | "language": "python", 781 | "name": "python3" 782 | }, 783 | "language_info": { 784 | "codemirror_mode": { 785 | "name": "ipython", 786 | "version": 3 787 | }, 788 | "file_extension": ".py", 789 | "mimetype": "text/x-python", 790 | "name": "python", 791 | "nbconvert_exporter": "python", 792 | "pygments_lexer": "ipython3", 793 | "version": "3.6.1" 
794 | } 795 | }, 796 | "nbformat": 4, 797 | "nbformat_minor": 2 798 | } 799 | --------------------------------------------------------------------------------
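A note on the CV numbers recorded in both notebooks: the "cv score for valid is" values are computed as 1/(1 + mean fold MAE), so a local score around 0.0638 corresponds to a mean absolute error of roughly 14.7 credit-score points. A quick sanity check using the fold MAEs copied from the first notebook's early-stopping log:

```python
# Sanity check: the notebooks print the local CV score as 1 / (1 + mean fold MAE).
# The fold MAEs below are copied from the first notebook's recorded output.
fold_mae = [14.7519, 14.6775, 14.728, 14.46, 14.7818]
mean_mae = sum(fold_mae) / len(fold_mae)   # ~14.68
print(mean_mae, 1 / (1 + mean_mae))        # ~0.0638, consistent with the printed 0.0637761...
```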
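For readers who want the full bagging pipeline outside of a notebook, here is a condensed, standalone sketch of what `Baseline_bagging_version.ipynb` does: rename the columns, add the three engineered features, train seed-bagged 5-fold LightGBM models with MAE and MSE objectives, average the two bags 50/50, and write a submission. It is a sketch, not the original code: it assumes LightGBM >= 3.3 (callback-based early stopping instead of the `early_stopping_rounds`/`verbose_eval` arguments used in the notebooks), it uses plain `KFold` rather than the notebooks' `StratifiedKFold` on the integer score (which is what triggers the sklearn warning seen in the outputs), and the output filename `baseline_bagging_sketch.csv` is illustrative.

```python
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import KFold

# English column names used by the notebooks (train has 30 columns, test has 29).
COLUMNS = ['uid', 'true_name_flag', 'age', 'uni_student_flag', 'blk_list_flag',
           '4g_unhealth_flag', 'net_age_till_now', 'top_up_month_diff', 'top_up_amount',
           'recent_6month_avg_use', 'total_account_fee', 'curr_month_balance',
           'curr_overdue_flag', 'cost_sensitivity', 'connect_num', 'freq_shopping_flag',
           'recent_3month_shopping_count', 'wanda_flag', 'sam_flag', 'movie_flag',
           'tour_flag', 'sport_flag', 'online_shopping_count', 'express_count',
           'finance_app_count', 'video_app_count', 'flight_count', 'train_count',
           'tour_app_count', 'score']


def add_features(df):
    # Offline top-up flag: non-zero amounts that are exact multiples of 10.
    df['top_up_amount_offline'] = ((df['top_up_amount'] % 10 == 0)
                                   & (df['top_up_amount'] != 0)).astype(int)
    # Spending stability and fee/balance ratio (the +1 avoids division by zero).
    df['current_fee_stability'] = df['total_account_fee'] / (df['recent_6month_avg_use'] + 1)
    df['use_left_rate'] = df['total_account_fee'] / (df['curr_month_balance'] + 1)
    return df


def cv_predict(train_x, train_y, test_x, params, seed, n_folds=5):
    # One 5-fold run with early stopping; returns the fold-averaged test prediction.
    pred = np.zeros(len(test_x))
    for tr_idx, va_idx in KFold(n_splits=n_folds, shuffle=True, random_state=seed).split(train_x):
        dtrain = lgb.Dataset(train_x.iloc[tr_idx], train_y.iloc[tr_idx])
        dvalid = lgb.Dataset(train_x.iloc[va_idx], train_y.iloc[va_idx], reference=dtrain)
        bst = lgb.train(params, dtrain, num_boost_round=10000, valid_sets=[dvalid],
                        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)])
        pred += bst.predict(test_x, num_iteration=bst.best_iteration)
    return pred / n_folds


if __name__ == '__main__':
    train = pd.read_csv('../input/train_dataset.csv')
    test = pd.read_csv('../input/test_dataset.csv')
    train.columns = COLUMNS
    test.columns = COLUMNS[:-1]
    train, test = add_features(train), add_features(test)

    y = train['score']
    drop_cols = ['uid', 'blk_list_flag']           # dropped from the feature set, as in the notebooks
    x = train.drop(drop_cols + ['score'], axis=1)
    x_test = test.drop(drop_cols, axis=1)

    base = {'learning_rate': 0.01, 'boosting_type': 'gbdt', 'metric': 'mae',
            'feature_fraction': 0.6, 'bagging_fraction': 0.8, 'bagging_freq': 2,
            'num_leaves': 31, 'max_depth': 5, 'lambda_l1': 0, 'lambda_l2': 5, 'verbose': -1}
    mae_params = dict(base, objective='regression_l1')
    mse_params = dict(base, objective='regression_l2')

    # Bag three seeds per objective, then blend the MAE and MSE bags 50/50.
    mae_pred = np.mean([cv_predict(x, y, x_test, mae_params, s) for s in range(3)], axis=0)
    mse_pred = np.mean([cv_predict(x, y, x_test, mse_params, s + 2019) for s in range(3)], axis=0)

    sub = pd.DataFrame({'id': test['uid'],
                        'score': np.round((mae_pred + mse_pred) / 2).astype(int)})
    sub.to_csv('../output/baseline_bagging_sketch.csv', index=False)  # assumes ../output/ exists
```

As in the notebooks, `uid` and `blk_list_flag` are excluded from the features and the blended predictions are rounded to integers before writing the `id,score` submission file.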