├── FTRL.ipynb ├── README.md ├── blending.ipynb ├── features.ipynb └── train_lgb_xgb.py /FTRL.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "/home/ubuntu/anaconda3/lib/python3.6/site-packages/randomstate/__init__.py:66: RandomStateDeprecationWarning: \n", 13 | "**End-of-life notification**\n", 14 | "\n", 15 | "This library was designed to bring alternative generators to the NumPy \n", 16 | "infrastructure. It as been successful in advancing the conversation \n", 17 | "for a future implementation of a new random number API in NumPy which \n", 18 | "will allow new algorithms and/or generators. The next step\n", 19 | "in this process is to separate the basic (or core RNG) from the \n", 20 | "functions that transform random bits into useful random numbers.\n", 21 | "This has been implemented in a successor project **randomgen** \n", 22 | "available on GitHub\n", 23 | "\n", 24 | "https://github.com/bashtage/randomgen\n", 25 | "\n", 26 | "or PyPi\n", 27 | "\n", 28 | "https://pypi.org/project/randomstate/.\n", 29 | "\n", 30 | "randomgen has a slightly different API, so please see the randomgen documentation\n", 31 | "\n", 32 | "https://bashtage.github.io/randomgen.\n", 33 | "\n", 34 | " warnings.warn(DEPRECATION_MESSAGE, RandomStateDeprecationWarning)\n" 35 | ] 36 | } 37 | ], 38 | "source": [ 39 | "import wordbatch\n", 40 | "from wordbatch.extractors import WordHash\n", 41 | "from wordbatch.models import FM_FTRL\n", 42 | "from wordbatch.data_utils import *\n", 43 | "import threading\n", 44 | "import pandas as pd\n", 45 | "from sklearn.metrics import roc_auc_score\n", 46 | "import time\n", 47 | "import numpy as np\n", 48 | "import gc\n", 49 | "from contextlib import contextmanager" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | 
"execution_count": 2, 55 | "metadata": { 56 | "collapsed": true 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "count_combinations = [\n", 61 | " ['app'],\n", 62 | " ['ip'], # 3.03\n", 63 | " ['channel'],\n", 64 | " ['os'],\n", 65 | " ['ip', 'device'], # 9.88\n", 66 | " ['day', 'hour', 'app'], # 4.08\n", 67 | " ['app', 'channel'], # 2.8\n", 68 | " ['ip', 'day', 'hour'], # 0.52\n", 69 | " ['os', 'device'], # 0.44\n", 70 | " ['ip', 'os', 'day', 'hour'], # 0.41\n", 71 | " ['ip', 'device', 'day', 'hour'], # 0.31\n", 72 | " ['ip', 'app', 'os'] # 0.21\n", 73 | "]\n", 74 | "\n", 75 | "countUniq_combinations = [\n", 76 | " # [['app'],'ip'],\n", 77 | " # [['app', 'device', 'os', 'channel'], 'ip'],\n", 78 | " [['ip'], 'channel'], # 0.9\n", 79 | " [['ip'], 'app'], # 1.3\n", 80 | " [['ip'], 'os'] # 0.45\n", 81 | "]\n", 82 | "\n", 83 | "nextClick_combinations = [\n", 84 | " ['ip', 'os'],\n", 85 | " ['ip', 'app', 'device', 'os']\n", 86 | "]" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 3, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "@contextmanager\n", 96 | "def timer(name):\n", 97 | " t0 = time.time()\n", 98 | " yield\n", 99 | " print(f'[{name}] done in {time.time() - t0:.0f} s')\n", 100 | "\n", 101 | "\n", 102 | "import os, psutil\n", 103 | "\n", 104 | "\n", 105 | "def cpuStats():\n", 106 | " pid = os.getpid()\n", 107 | " py = psutil.Process(pid)\n", 108 | " memoryUse = py.memory_info()[0] / 2. 
** 30\n", 109 | " print('memory GB:', memoryUse)\n", 110 | "\n", 111 | "\n", 112 | "start_time = time.time()\n", 113 | "\n", 114 | "mean_auc = 0\n", 115 | "\n", 116 | "\n", 117 | "def fit_batch(clf, X, y, w): clf.partial_fit(X, y, sample_weight=w)\n", 118 | "\n", 119 | "\n", 120 | "def predict_batch(clf, X): return clf.predict(X)\n", 121 | "\n", 122 | "\n", 123 | "def evaluate_batch(clf, X, y, rcount):\n", 124 | " auc = roc_auc_score(y, predict_batch(clf, X))\n", 125 | " global mean_auc\n", 126 | " if mean_auc == 0:\n", 127 | " mean_auc = auc\n", 128 | " else:\n", 129 | " mean_auc = 0.2 * (mean_auc * 4 + auc)\n", 130 | " print(rcount, \"ROC AUC:\", auc, \"Running Mean:\", mean_auc)\n", 131 | " return auc\n", 132 | "\n", 133 | "\n", 134 | "def count_agg(df, group_cols):\n", 135 | " print('grouping features')\n", 136 | " for i, cols in enumerate(group_cols):\n", 137 | " col_name = \"_\".join(cols) + '_count'\n", 138 | " count = df.groupby(cols).size().reset_index(name=col_name)\n", 139 | " df = df.merge(count, on=cols, how='left')\n", 140 | " del count\n", 141 | " gc.collect()\n", 142 | " return df\n", 143 | "\n", 144 | "\n", 145 | "def count_uniq(df, group_uniq_cols):\n", 146 | " print('unique features')\n", 147 | " for i, cols in enumerate(group_uniq_cols):\n", 148 | " group_cols, uniq_col = cols[0], cols[1]\n", 149 | " col_name = \"_\".join(group_cols) + '_uniq_' + uniq_col + '_countUniq'\n", 150 | " tmp = df.groupby(group_cols)[uniq_col].nunique().reset_index(name=col_name)\n", 151 | " df = df.merge(tmp, on=group_cols, how='left')\n", 152 | " del tmp\n", 153 | " gc.collect()\n", 154 | " return df\n", 155 | "\n", 156 | "\n", 157 | "def next_click(df, group_cols):\n", 158 | " print('next click features')\n", 159 | " df['click_time'] = (df['click_time'].astype(np.int64) // 10 ** 9).astype(np.int32)\n", 160 | " for i, cols in enumerate(group_cols):\n", 161 | " col_name = \"_\".join(cols) + '_nextClick'\n", 162 | " df[col_name] = (df.groupby(cols).click_time.shift(-1) 
- df.click_time).astype(np.float32)\n", 163 | " gc.collect()\n", 164 | " return df\n", 165 | "\n", 166 | "\n", 167 | "def df2csr(wb, df, pick_hours=None):\n", 168 | " df.reset_index(drop=True, inplace=True)\n", 169 | " with timer(\"Adding counts\"):\n", 170 | " df['click_time'] = pd.to_datetime(df['click_time'])\n", 171 | " dt = df['click_time'].dt\n", 172 | " df['day'] = dt.day.astype('uint8')\n", 173 | " df['hour'] = dt.hour.astype('uint8')\n", 174 | " del (dt)\n", 175 | "\n", 176 | " df = count_agg(df, count_combinations)\n", 177 | " df = count_uniq(df, countUniq_combinations)\n", 178 | " df = next_click(df, nextClick_combinations)\n", 179 | "\n", 180 | " with timer(\"Log-binning features\"):\n", 181 | " for fea in ['app_count',\n", 182 | " 'ip_count',\n", 183 | " 'channel_count',\n", 184 | " 'os_count',\n", 185 | " 'ip_device_count',\n", 186 | " 'day_hour_app_count',\n", 187 | " 'app_channel_count',\n", 188 | " 'ip_day_hour_count',\n", 189 | " 'os_device_count',\n", 190 | " 'ip_os_day_hour_count',\n", 191 | " 'ip_device_day_hour_count',\n", 192 | " 'ip_app_os_count',\n", 193 | " 'ip_uniq_channel_countUniq',\n", 194 | " 'ip_uniq_app_countUniq',\n", 195 | " 'ip_uniq_os_countUniq',\n", 196 | " 'ip_os_nextClick',\n", 197 | " 'ip_app_device_os_nextClick'\n", 198 | " ]:\n", 199 | " df[fea] = np.log2(1 + df[fea].values).astype(int)\n", 200 | "\n", 201 | " with timer(\"Generating str_array\"):\n", 202 | " str_array = (\"I\" + df['ip'].astype(str) \\\n", 203 | " + \" A\" + df['app'].astype(str) \\\n", 204 | " + \" D\" + df['device'].astype(str) \\\n", 205 | " + \" O\" + df['os'].astype(str) \\\n", 206 | " + \" C\" + df['channel'].astype(str) \\\n", 207 | " + \" WD\" + df['day'].astype(str) \\\n", 208 | " + \" H\" + df['hour'].astype(str) \\\n", 209 | " + \" AXC\" + df['app'].astype(str) + \"_\" + df['channel'].astype(str) \\\n", 210 | " + \" OXC\" + df['os'].astype(str) + \"_\" + df['channel'].astype(str) \\\n", 211 | " + \" AXD\" + df['app'].astype(str) + \"_\" + 
df['device'].astype(str) \\\n", 212 | " + \" IXA\" + df['ip'].astype(str) + \"_\" + df['app'].astype(str) \\\n", 213 | " + \" AXO\" + df['app'].astype(str) + \"_\" + df['os'].astype(str) \\\n", 214 | " + \"AC\" + df['app_count'].astype(str) \\\n", 215 | " + \"IC\" + df['ip_count'].astype(str) \\\n", 216 | " + \"CC\" + df['channel_count'].astype(str) \\\n", 217 | " + \"OC\" + df['os_count'].astype(str) \\\n", 218 | " + \"IDC\" + df['ip_device_count'].astype(str) \\\n", 219 | " + \"DHAC\" + df['day_hour_app_count'].astype(str) \\\n", 220 | " + \"ACC\" + df['app_channel_count'].astype(str) \\\n", 221 | " + \"IDHC\" + df['ip_day_hour_count'].astype(str) \\\n", 222 | " + \"ODC\" + df['os_device_count'].astype(str) \\\n", 223 | " + \"IODHC\" + df['ip_os_day_hour_count'].astype(str) \\\n", 224 | " + \"IDDHC\" + df['ip_device_day_hour_count'].astype(str) \\\n", 225 | " + \"IAOC\" + df['ip_app_os_count'].astype(str) \\\n", 226 | " + \"IUC\" + df['ip_uniq_channel_countUniq'].astype(str) \\\n", 227 | " + \"IUA\" + df['ip_uniq_app_countUniq'].astype(str) \\\n", 228 | " + \"IUO\" + df['ip_uniq_os_countUniq'].astype(str) \\\n", 229 | " + \"ION\" + df['ip_os_nextClick'].astype(str) \\\n", 230 | " + \"IADON\" + df['ip_app_device_os_nextClick'].astype(str) \n", 231 | " ).values\n", 232 | " # cpuStats()\n", 233 | " if 'is_attributed' in df.columns:\n", 234 | " labels = df['is_attributed'].values\n", 235 | " weights = np.multiply([1.0 if x == 1 else 0.2 for x in df['is_attributed'].values],\n", 236 | " df['hour'].apply(lambda x: 1.0 if x in pick_hours else 0.5))\n", 237 | " else:\n", 238 | " labels = []\n", 239 | " weights = []\n", 240 | " return str_array, labels, weights" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 4, 246 | "metadata": { 247 | "collapsed": true 248 | }, 249 | "outputs": [], 250 | "source": [ 251 | "class ThreadWithReturnValue(threading.Thread):\n", 252 | " def __init__(self, group=None, target=None, name=None, args=(), kwargs=None, 
*, daemon=None):\n", 253 | " threading.Thread.__init__(self, group, target, name, args, kwargs, daemon=daemon)\n", 254 | " self._return = None\n", 255 | "\n", 256 | " def run(self):\n", 257 | " if self._target is not None:\n", 258 | " self._return = self._target(*self._args, **self._kwargs)\n", 259 | "\n", 260 | " def join(self):\n", 261 | " threading.Thread.join(self)\n", 262 | " return self._return" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 5, 268 | "metadata": { 269 | "collapsed": true 270 | }, 271 | "outputs": [], 272 | "source": [ 273 | "batchsize = 5000000\n", 274 | "D = 2 ** 20\n", 275 | "\n", 276 | "wb = wordbatch.WordBatch(None, extractor=(WordHash, {\"ngram_range\": (1, 1), \"analyzer\": \"word\",\n", 277 | " \"lowercase\": False, \"n_features\": D,\n", 278 | " \"norm\": None, \"binary\": True})\n", 279 | " , minibatch_size=batchsize // 80, procs=8, freeze=True, timeout=1800, verbose=0)\n", 280 | "clf = FM_FTRL(alpha=0.05, beta=0.1, L1=0.0, L2=0.0, D=D, alpha_fm=0.02, L2_fm=0.0, init_fm=0.01, weight_fm=1.0,\n", 281 | " D_fm=8, e_noise=0.0, iters=2, inv_link=\"sigmoid\", e_clip=1.0, threads=4, use_avx=1, verbose=0)\n" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "metadata": { 288 | "scrolled": true 289 | }, 290 | "outputs": [ 291 | { 292 | "name": "stdout", 293 | "output_type": "stream", 294 | "text": [ 295 | "grouping features\n", 296 | "unique features\n", 297 | "next click features\n", 298 | "[Adding counts] done in 40 s\n", 299 | "[Log-binning features] done in 2 s\n", 300 | "[Generating str_array] done in 145 s\n", 301 | "Training 5000000 227.62053513526917\n", 302 | "memory GB: 1.9478111267089844\n", 303 | "grouping features\n", 304 | "unique features\n", 305 | "next click features\n", 306 | "[Adding counts] done in 40 s\n", 307 | "[Log-binning features] done in 2 s\n", 308 | "[Generating str_array] done in 145 s\n", 309 | "Training 10000000 525.7184975147247\n", 310 | 
"memory GB: 2.3234786987304688\n", 311 | "10000000 ROC AUC: 0.9745651774599268 Running Mean: 0.9745651774599268\n", 312 | "grouping features\n", 313 | "unique features\n", 314 | "next click features\n", 315 | "[Adding counts] done in 41 s\n", 316 | "[Log-binning features] done in 2 s\n", 317 | "[Generating str_array] done in 148 s\n", 318 | "Training 15000000 851.9294579029083\n", 319 | "memory GB: 2.624431610107422\n", 320 | "grouping features\n", 321 | "unique features\n", 322 | "next click features\n", 323 | "[Adding counts] done in 40 s\n", 324 | "[Log-binning features] done in 2 s\n", 325 | "[Generating str_array] done in 144 s\n", 326 | "Training 20000000 1145.4692740440369\n", 327 | "memory GB: 2.211658477783203\n", 328 | "20000000 ROC AUC: 0.9683387927007022 Running Mean: 0.9733199005080819\n", 329 | "grouping features\n", 330 | "unique features\n", 331 | "next click features\n", 332 | "[Adding counts] done in 42 s\n", 333 | "[Log-binning features] done in 2 s\n", 334 | "[Generating str_array] done in 145 s\n", 335 | "Training 25000000 1466.6529309749603\n", 336 | "memory GB: 2.6913833618164062\n", 337 | "grouping features\n", 338 | "unique features\n", 339 | "next click features\n", 340 | "[Adding counts] done in 41 s\n", 341 | "[Log-binning features] done in 2 s\n", 342 | "[Generating str_array] done in 145 s\n", 343 | "Training 30000000 1764.8575196266174\n", 344 | "memory GB: 2.9753570556640625\n", 345 | "30000000 ROC AUC: 0.9733478534062981 Running Mean: 0.9733254910877251\n", 346 | "grouping features\n", 347 | "unique features\n", 348 | "next click features\n", 349 | "[Adding counts] done in 40 s\n", 350 | "[Log-binning features] done in 2 s\n", 351 | "[Generating str_array] done in 145 s\n", 352 | "Training 35000000 2089.1054067611694\n", 353 | "memory GB: 2.9623565673828125\n", 354 | "grouping features\n", 355 | "unique features\n", 356 | "next click features\n", 357 | "[Adding counts] done in 40 s\n", 358 | "[Log-binning features] done in 2 s\n", 
359 | "[Generating str_array] done in 146 s\n", 360 | "Training 40000000 2385.0382177829742\n", 361 | "memory GB: 2.8841781616210938\n", 362 | "40000000 ROC AUC: 0.9774640639463225 Running Mean: 0.9741532056594446\n", 363 | "grouping features\n", 364 | "unique features\n", 365 | "next click features\n", 366 | "[Adding counts] done in 41 s\n", 367 | "[Log-binning features] done in 2 s\n", 368 | "[Generating str_array] done in 146 s\n", 369 | "Training 45000000 2704.936663866043\n", 370 | "memory GB: 2.7643775939941406\n", 371 | "grouping features\n", 372 | "unique features\n", 373 | "next click features\n", 374 | "[Adding counts] done in 40 s\n", 375 | "[Log-binning features] done in 2 s\n", 376 | "[Generating str_array] done in 145 s\n", 377 | "Training 50000000 2998.2805643081665\n", 378 | "memory GB: 2.9196701049804688\n", 379 | "50000000 ROC AUC: 0.9801504367222452 Running Mean: 0.9753526518720048\n", 380 | "grouping features\n", 381 | "unique features\n", 382 | "next click features\n", 383 | "[Adding counts] done in 39 s\n", 384 | "[Log-binning features] done in 2 s\n", 385 | "[Generating str_array] done in 149 s\n", 386 | "Training 55000000 3323.360859155655\n", 387 | "memory GB: 3.0685882568359375\n", 388 | "grouping features\n", 389 | "unique features\n", 390 | "next click features\n", 391 | "[Adding counts] done in 39 s\n", 392 | "[Log-binning features] done in 2 s\n", 393 | "[Generating str_array] done in 149 s\n", 394 | "Training 60000000 3622.3480241298676\n", 395 | "memory GB: 2.3555679321289062\n", 396 | "60000000 ROC AUC: 0.9685419239799189 Running Mean: 0.9739905062935876\n", 397 | "grouping features\n", 398 | "unique features\n", 399 | "next click features\n", 400 | "[Adding counts] done in 40 s\n", 401 | "[Log-binning features] done in 2 s\n", 402 | "[Generating str_array] done in 144 s\n", 403 | "Training 65000000 3944.0199427604675\n", 404 | "memory GB: 3.0107765197753906\n", 405 | "grouping features\n", 406 | "unique features\n", 407 | "next 
click features\n", 408 | "[Adding counts] done in 40 s\n", 409 | "[Log-binning features] done in 2 s\n", 410 | "[Generating str_array] done in 144 s\n", 411 | "Training 70000000 4232.590351819992\n", 412 | "memory GB: 2.636463165283203\n", 413 | "70000000 ROC AUC: 0.9759715986873083 Running Mean: 0.9743867247723318\n", 414 | "grouping features\n", 415 | "unique features\n", 416 | "next click features\n", 417 | "[Adding counts] done in 40 s\n", 418 | "[Log-binning features] done in 2 s\n", 419 | "[Generating str_array] done in 144 s\n", 420 | "Training 75000000 4557.927364110947\n", 421 | "memory GB: 2.794708251953125\n", 422 | "grouping features\n", 423 | "unique features\n", 424 | "next click features\n", 425 | "[Adding counts] done in 41 s\n", 426 | "[Log-binning features] done in 2 s\n", 427 | "[Generating str_array] done in 144 s\n", 428 | "Training 80000000 4850.067337274551\n", 429 | "memory GB: 2.8362197875976562\n", 430 | "80000000 ROC AUC: 0.979075189084094 Running Mean: 0.9753244176346842\n", 431 | "grouping features\n", 432 | "unique features\n", 433 | "next click features\n", 434 | "[Adding counts] done in 40 s\n", 435 | "[Log-binning features] done in 2 s\n", 436 | "[Generating str_array] done in 144 s\n", 437 | "Training 85000000 5168.086678981781\n", 438 | "memory GB: 2.3478317260742188\n", 439 | "grouping features\n", 440 | "unique features\n", 441 | "next click features\n", 442 | "[Adding counts] done in 40 s\n", 443 | "[Log-binning features] done in 2 s\n", 444 | "[Generating str_array] done in 144 s\n", 445 | "Training 90000000 5461.925878763199\n", 446 | "memory GB: 2.7953529357910156\n", 447 | "90000000 ROC AUC: 0.9745685903117774 Running Mean: 0.975173252170103\n", 448 | "grouping features\n", 449 | "unique features\n", 450 | "next click features\n", 451 | "[Adding counts] done in 41 s\n", 452 | "[Log-binning features] done in 2 s\n", 453 | "[Generating str_array] done in 144 s\n", 454 | "Training 95000000 5783.882947683334\n", 455 | "memory 
GB: 3.18951416015625\n", 456 | "grouping features\n", 457 | "unique features\n", 458 | "next click features\n", 459 | "[Adding counts] done in 40 s\n", 460 | "[Log-binning features] done in 2 s\n", 461 | "[Generating str_array] done in 145 s\n", 462 | "Training 100000000 6080.251963376999\n", 463 | "memory GB: 3.5879440307617188\n", 464 | "100000000 ROC AUC: 0.9776985376051364 Running Mean: 0.9756783092571096\n", 465 | "grouping features\n", 466 | "unique features\n", 467 | "next click features\n", 468 | "[Adding counts] done in 40 s\n", 469 | "[Log-binning features] done in 2 s\n", 470 | "[Generating str_array] done in 145 s\n", 471 | "Training 105000000 6396.508868455887\n", 472 | "memory GB: 3.0739974975585938\n", 473 | "grouping features\n", 474 | "unique features\n", 475 | "next click features\n", 476 | "[Adding counts] done in 40 s\n", 477 | "[Log-binning features] done in 2 s\n", 478 | "[Generating str_array] done in 144 s\n", 479 | "Training 110000000 6689.3176164627075\n", 480 | "memory GB: 2.8143463134765625\n", 481 | "110000000 ROC AUC: 0.9797327962031683 Running Mean: 0.9764892066463213\n", 482 | "grouping features\n", 483 | "unique features\n", 484 | "next click features\n", 485 | "[Adding counts] done in 39 s\n", 486 | "[Log-binning features] done in 2 s\n", 487 | "[Generating str_array] done in 147 s\n", 488 | "Training 115000000 7010.823716640472\n", 489 | "memory GB: 2.646251678466797\n", 490 | "grouping features\n", 491 | "unique features\n", 492 | "next click features\n", 493 | "[Adding counts] done in 40 s\n", 494 | "[Log-binning features] done in 2 s\n", 495 | "[Generating str_array] done in 145 s\n", 496 | "Training 120000000 7303.225748062134\n", 497 | "memory GB: 2.391429901123047\n", 498 | "120000000 ROC AUC: 0.9710553820325588 Running Mean: 0.9754024417235689\n", 499 | "grouping features\n", 500 | "unique features\n", 501 | "next click features\n", 502 | "[Adding counts] done in 40 s\n", 503 | "[Log-binning features] done in 2 s\n", 504 | 
"[Generating str_array] done in 144 s\n", 505 | "Training 125000000 7623.8535215854645\n", 506 | "memory GB: 2.7701873779296875\n", 507 | "grouping features\n" 508 | ] 509 | }, 510 | { 511 | "name": "stderr", 512 | "output_type": "stream", 513 | "text": [ 514 | "/home/ubuntu/anaconda3/lib/python3.6/site-packages/pandas/core/reshape/merge.py:1457: RuntimeWarning: divide by zero encountered in long_scalars\n", 515 | " stride //= shape[i]\n" 516 | ] 517 | }, 518 | { 519 | "name": "stdout", 520 | "output_type": "stream", 521 | "text": [ 522 | "unique features\n", 523 | "next click features\n", 524 | "[Adding counts] done in 1 s\n", 525 | "[Log-binning features] done in 0 s\n", 526 | "[Generating str_array] done in 0 s\n" 527 | ] 528 | }, 529 | { 530 | "ename": "IndexError", 531 | "evalue": "list index out of range", 532 | "output_type": "error", 533 | "traceback": [ 534 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 535 | "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", 536 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[0;32mdel\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 27\u001b[0m \u001b[0mgc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcollect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 28\u001b[0;31m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mwb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtransform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstr_array\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 29\u001b[0m \u001b[0;32mdel\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mstr_array\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrcount\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;36m2\u001b[0m 
\u001b[0;34m*\u001b[0m \u001b[0mbatchsize\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 537 | "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/wordbatch/wordbatch.py\u001b[0m in \u001b[0;36mtransform\u001b[0;34m(self, texts, extractor, cache_features, input_split, reset, update)\u001b[0m\n\u001b[1;32m 248\u001b[0m \u001b[0mtexts\u001b[0m\u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprocess\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtexts\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput_split\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreset\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mreset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mupdate\u001b[0m\u001b[0;34m=\u001b[0m \u001b[0mupdate\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 249\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mextractor\u001b[0m\u001b[0;34m!=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 250\u001b[0;31m \u001b[0mtexts\u001b[0m\u001b[0;34m=\u001b[0m \u001b[0mextractor\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtransform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtexts\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput_split\u001b[0m\u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmerge_output\u001b[0m\u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 251\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcache_features\u001b[0m\u001b[0;34m!=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mextractor\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave_features\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcache_features\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtexts\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 252\u001b[0m \u001b[0;32mreturn\u001b[0m 
\u001b[0mtexts\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 538 | "\u001b[0;32mwordbatch/extractors/extractors.pyx\u001b[0m in \u001b[0;36mwordbatch.extractors.extractors.WordHash.transform\u001b[0;34m()\u001b[0m\n", 539 | "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/wordbatch/wordbatch.py\u001b[0m in \u001b[0;36mparallelize_batches\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 268\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 269\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mparallelize_batches\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 270\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbatcher\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparallelize_batches\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 271\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 272\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0msplit_batches\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 540 | "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/wordbatch/batcher.py\u001b[0m in \u001b[0;36mparallelize_batches\u001b[0;34m(self, task, data, args, method, timeout, rdd_col, input_split, merge_output, minibatch_size, procs)\u001b[0m\n\u001b[1;32m 136\u001b[0m \u001b[0;32mcontinue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 137\u001b[0m \u001b[0mattempt\u001b[0m\u001b[0;34m=\u001b[0m 
\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 138\u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0mmerge_output\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmerge_batches\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresults\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 139\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresults\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 140\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 541 | "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/wordbatch/batcher.py\u001b[0m in \u001b[0;36mmerge_batches\u001b[0;34m(self, data)\u001b[0m\n\u001b[1;32m 77\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 78\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mmerge_batches\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 79\u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mssp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcsr_matrix\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mssp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvstack\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 80\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mitem\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0msublist\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mitem\u001b[0m \u001b[0;32min\u001b[0m \u001b[0msublist\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 81\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 542 | 
"\u001b[0;31mIndexError\u001b[0m: list index out of range" 543 | ] 544 | } 545 | ], 546 | "source": [ 547 | "dtypes = {\n", 548 | " 'ip': 'uint32',\n", 549 | " 'app': 'uint16',\n", 550 | " 'device': 'uint16',\n", 551 | " 'os': 'uint16',\n", 552 | " 'channel': 'uint16',\n", 553 | " 'is_attributed': 'uint8',\n", 554 | "}\n", 555 | "\n", 556 | "p = None\n", 557 | "rcount = 0\n", 558 | "\n", 559 | "for df_c in pd.read_csv('data/train.csv',\n", 560 | " engine='c', chunksize=batchsize,\n", 561 | " skiprows=range(1, 9308569), sep=\",\", dtype=dtypes):\n", 562 | "\n", 563 | " rcount += batchsize\n", 564 | " if rcount == 130000000:\n", 565 | " df_c['click_time'] = pd.to_datetime(df_c['click_time'])\n", 566 | " df_c['day'] = df_c['click_time'].dt.day.astype('uint8')\n", 567 | " df_c = df_c[df_c['day'] == 8]\n", 568 | " str_array, labels, weights = df2csr(wb, df_c, pick_hours={4, 5, 10, 13, 14})\n", 569 | " del (df_c)\n", 570 | " if p != None:\n", 571 | " p.join()\n", 572 | " del (X)\n", 573 | " gc.collect()\n", 574 | " X = wb.transform(str_array)\n", 575 | " del (str_array)\n", 576 | " if rcount % (2 * batchsize) == 0:\n", 577 | " if p != None: p.join()\n", 578 | " p = threading.Thread(target=evaluate_batch, args=(clf, X, labels, rcount))\n", 579 | " p.start()\n", 580 | " print(\"Training\", rcount, time.time() - start_time)\n", 581 | " cpuStats()\n", 582 | " if p != None: p.join()\n", 583 | " p = threading.Thread(target=fit_batch, args=(clf, X, labels, weights))\n", 584 | " p.start()\n", 585 | " if rcount == 130000000:\n", 586 | " break" 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": null, 592 | "metadata": { 593 | "collapsed": true 594 | }, 595 | "outputs": [], 596 | "source": [ 597 | "if p != None:\n", 598 | " p.join()\n", 599 | "\n", 600 | "del (X)\n", 601 | "p = None\n", 602 | "click_ids = []\n", 603 | "test_preds = []\n", 604 | "rcount = 0\n", 605 | "for df_c in pd.read_csv('data/test.csv', engine='c', chunksize=batchsize,\n", 606 | " 
sep=\",\", dtype=dtypes):\n", 607 | " rcount += batchsize\n", 608 | " if rcount % (10 * batchsize) == 0:\n", 609 | " print(rcount)\n", 610 | " str_array, labels, weights = df2csr(wb, df_c)\n", 611 | " click_ids += df_c['click_id'].tolist()\n", 612 | " del (df_c)\n", 613 | " if p != None:\n", 614 | " test_preds += list(p.join())\n", 615 | " del (X)\n", 616 | " gc.collect()\n", 617 | " X = wb.transform(str_array)\n", 618 | " del (str_array)\n", 619 | " p = ThreadWithReturnValue(target=predict_batch, args=(clf, X))\n", 620 | " p.start()\n", 621 | "if p != None: test_preds += list(p.join())\n", 622 | "\n", 623 | "df_sub = pd.DataFrame({\"click_id\": click_ids, 'is_attributed': test_preds})\n", 624 | "df_sub.to_csv(\"wordbatch_fm_ftrl.csv\", index=False)" 625 | ] 626 | }, 627 | { 628 | "cell_type": "code", 629 | "execution_count": null, 630 | "metadata": { 631 | "collapsed": true 632 | }, 633 | "outputs": [], 634 | "source": [] 635 | } 636 | ], 637 | "metadata": { 638 | "kernelspec": { 639 | "display_name": "Python 3", 640 | "language": "python", 641 | "name": "python3" 642 | }, 643 | "language_info": { 644 | "codemirror_mode": { 645 | "name": "ipython", 646 | "version": 3 647 | }, 648 | "file_extension": ".py", 649 | "mimetype": "text/x-python", 650 | "name": "python", 651 | "nbconvert_exporter": "python", 652 | "pygments_lexer": "ipython3", 653 | "version": "3.6.3" 654 | } 655 | }, 656 | "nbformat": 4, 657 | "nbformat_minor": 2 658 | } 659 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Kaggle TalkingData AdTracking Fraud Detection Challenge 2 | 48th Solution, competition link: https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection 3 | 4 | 1. [features.ipynb](https://github.com/shawnau/talkingData/blob/master/features.ipynb): notebook version 5 | 2. 
[train_lgb_xgb.py](https://github.com/shawnau/talkingData/blob/master/train_lgb_xgb.py): script version, scores about 0.9824 on the private LB 6 | 3. blending.ipynb: blends historical models, which boosted my score by about 0.0002 7 | 4. FTRL.ipynb: haven't tried due to limited time 8 | 9 | **Running this code on the full training data needs 96GB RAM with 128GB swap** 10 | 11 | ## Some Solutions as a Reference 12 | 13 | 1. [3rd, NN model](https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/discussion/56262#latest-325349) 14 | 2. [4th](https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/discussion/56243#latest-325397) 15 | 3. [6th, strong single lightGBM](https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/discussion) 16 | 4. [9th](https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/discussion/56279#latest-325405) 17 | 5. [FFM trick](https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/discussion/56282) 18 | 19 | ## Train log 20 | Please see the [dashboard](https://github.com/shawnau/talkingData/projects/1) 21 | -------------------------------------------------------------------------------- /blending.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import numpy as np" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "xgb = pd.read_csv('submit_xgb_1161')" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 3, 29 | "metadata": { 30 | "collapsed": true 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "lgb = pd.read_csv('submit_lgb_875')" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 4, 40 | "metadata": { 41 | "collapsed": true 42 | }, 
43 | "outputs": [], 44 | "source": [ 45 | "models = { \n", 46 | " 'xgb' : {\n", 47 | " 'name':'xgboost06',\n", 48 | " 'score':98.06,\n", 49 | " 'df':xgb },\n", 50 | " 'lgb' : {\n", 51 | " 'name':'lightgbm11',\n", 52 | " 'score':98.11,\n", 53 | " 'df':lgb }, \n", 54 | " }" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 5, 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "name": "stdout", 64 | "output_type": "stream", 65 | "text": [ 66 | "Blending...\n", 67 | "\n" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "isa_lg = 0\n", 73 | "isa_hm = 0\n", 74 | "\n", 75 | "print(\"Blending...\\n\")\n", 76 | "for df in models.keys() : \n", 77 | " isa_lg += np.log(models[df]['df'].is_attributed)\n", 78 | " isa_hm += 1/(models[df]['df'].is_attributed)\n", 79 | "isa_lg = np.exp(isa_lg/len(models.keys()))\n", 80 | "isa_hm = len(models.keys())/isa_hm" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 6, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "name": "stdout", 90 | "output_type": "stream", 91 | "text": [ 92 | "xgb\n", 93 | "\n", 94 | " click_id is_attributed\n", 95 | "0 0 0.139849\n", 96 | "1 1 0.010011\n", 97 | "2 2 0.001622\n", 98 | "3 3 0.020445\n", 99 | "4 4 0.012320\n", 100 | "5 5 0.003596\n", 101 | "6 6 0.021500\n", 102 | "7 7 0.119462\n", 103 | "8 9 0.160837\n", 104 | "9 8 0.002143\n", 105 | "lgb\n", 106 | "\n", 107 | " click_id is_attributed\n", 108 | "0 0 0.115162\n", 109 | "1 1 0.012690\n", 110 | "2 2 0.001412\n", 111 | "3 3 0.033062\n", 112 | "4 4 0.012521\n", 113 | "5 5 0.003612\n", 114 | "6 6 0.031346\n", 115 | "7 7 0.172713\n", 116 | "8 9 0.137332\n", 117 | "9 8 0.003644\n", 118 | "Isa log\n", 119 | "\n", 120 | "0 0.126907\n", 121 | "1 0.011272\n", 122 | "2 0.001513\n", 123 | "3 0.025999\n", 124 | "4 0.012420\n", 125 | "5 0.003604\n", 126 | "6 0.025960\n", 127 | "7 0.143641\n", 128 | "8 0.148620\n", 129 | "9 0.002795\n", 130 | "Name: is_attributed, dtype: float64\n", 131 | "\n", 132 | "Isa harmo\n", 133 | "\n", 
134 | "0 0.126311\n", 135 | "1 0.011193\n", 136 | "2 0.001509\n", 137 | "3 0.025266\n", 138 | "4 0.012420\n", 139 | "5 0.003604\n", 140 | "6 0.025506\n", 141 | "7 0.141235\n", 142 | "8 0.148158\n", 143 | "9 0.002699\n", 144 | "Name: is_attributed, dtype: float64\n" 145 | ] 146 | } 147 | ], 148 | "source": [ 149 | "print(\"xgb\\n\")\n", 150 | "print(xgb[:10])\n", 151 | "print(\"lgb\\n\")\n", 152 | "print(lgb[:10])\n", 153 | "\n", 154 | "print(\"Isa log\\n\")\n", 155 | "print(isa_lg[:10])\n", 156 | "print()\n", 157 | "print(\"Isa harmo\\n\")\n", 158 | "print(isa_hm[:10])" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 7, 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "data": { 168 | "text/html": [ 169 | "
\n", 170 | "\n", 183 | "\n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | "
click_idis_attributed
000.126907
110.011272
220.001513
330.025999
440.012420
\n", 219 | "
" 220 | ], 221 | "text/plain": [ 222 | " click_id is_attributed\n", 223 | "0 0 0.126907\n", 224 | "1 1 0.011272\n", 225 | "2 2 0.001513\n", 226 | "3 3 0.025999\n", 227 | "4 4 0.012420" 228 | ] 229 | }, 230 | "execution_count": 7, 231 | "metadata": {}, 232 | "output_type": "execute_result" 233 | } 234 | ], 235 | "source": [ 236 | "sub_log = pd.DataFrame()\n", 237 | "sub_log['click_id'] = xgb['click_id']\n", 238 | "sub_log['is_attributed'] = isa_lg\n", 239 | "sub_log.head()" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 8, 245 | "metadata": {}, 246 | "outputs": [ 247 | { 248 | "data": { 249 | "text/html": [ 250 | "
\n", 251 | "\n", 264 | "\n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | "
click_idis_attributed
000.126311
110.011193
220.001509
330.025266
440.012420
\n", 300 | "
" 301 | ], 302 | "text/plain": [ 303 | " click_id is_attributed\n", 304 | "0 0 0.126311\n", 305 | "1 1 0.011193\n", 306 | "2 2 0.001509\n", 307 | "3 3 0.025266\n", 308 | "4 4 0.012420" 309 | ] 310 | }, 311 | "execution_count": 8, 312 | "metadata": {}, 313 | "output_type": "execute_result" 314 | } 315 | ], 316 | "source": [ 317 | "sub_hm = pd.DataFrame()\n", 318 | "sub_hm['click_id'] = xgb['click_id']\n", 319 | "sub_hm['is_attributed'] = isa_hm\n", 320 | "sub_hm.head()" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 9, 326 | "metadata": {}, 327 | "outputs": [ 328 | { 329 | "name": "stdout", 330 | "output_type": "stream", 331 | "text": [ 332 | "Writing...\n", 333 | "Done!\n" 334 | ] 335 | } 336 | ], 337 | "source": [ 338 | "print(\"Writing...\")\n", 339 | "sub_log.to_csv('submission_xgb06_lgb11_log.gz', index=False, float_format='%.9f', compression='gzip')\n", 340 | "sub_hm.to_csv('submission_xgb06_lgb11_hm.gz', index=False, float_format='%.9f', compression='gzip')\n", 341 | "print('Done!')" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": null, 347 | "metadata": { 348 | "collapsed": true 349 | }, 350 | "outputs": [], 351 | "source": [] 352 | } 353 | ], 354 | "metadata": { 355 | "kernelspec": { 356 | "display_name": "Python 3", 357 | "language": "python", 358 | "name": "python3" 359 | }, 360 | "language_info": { 361 | "codemirror_mode": { 362 | "name": "ipython", 363 | "version": 3 364 | }, 365 | "file_extension": ".py", 366 | "mimetype": "text/x-python", 367 | "name": "python", 368 | "nbconvert_exporter": "python", 369 | "pygments_lexer": "ipython3", 370 | "version": "3.6.3" 371 | } 372 | }, 373 | "nbformat": 4, 374 | "nbformat_minor": 2 375 | } 376 | -------------------------------------------------------------------------------- /features.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 8, 6 | 
"metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import os\n", 12 | "import gc\n", 13 | "import time\n", 14 | "from time import gmtime, strftime\n", 15 | "\n", 16 | "from sklearn.model_selection import train_test_split\n", 17 | "import lightgbm as lgb\n", 18 | "import xgboost as xgb\n", 19 | "import numpy as np\n", 20 | "import pandas as pd\n", 21 | "%matplotlib inline" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "# Features" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 18, 34 | "metadata": { 35 | "collapsed": true 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "def group_label(df, group_cols):\n", 40 | " for i, cols in enumerate(group_cols):\n", 41 | " col_name = \"_\".join(group_cols)\n", 42 | " print(i, col_name)\n", 43 | " group_idx = df.drop_duplicates(cols)[cols].reset_index()\n", 44 | " group_idx.rename(columns={'index':col_name}, inplace=True)\n", 45 | " df = df.merge( group_idx, on=cols, how='left' )\n", 46 | " del group_idx\n", 47 | " gc.collect()\n", 48 | " return df\n", 49 | " \n", 50 | "def count_agg(df, group_cols):\n", 51 | " for i, cols in enumerate(group_cols):\n", 52 | " col_name = \"_\".join(cols)+'_count'\n", 53 | " print(i, col_name)\n", 54 | " count = df.groupby(cols).size().reset_index(name=col_name)\n", 55 | " df = df.merge(count, on=cols, how='left')\n", 56 | " del count\n", 57 | " gc.collect()\n", 58 | " return df\n", 59 | "\n", 60 | "def count_cum(df, group_cols):\n", 61 | " for i, cols in enumerate(group_cols):\n", 62 | " col_name = \"_\".join(cols)+'_countAccum'\n", 63 | " print(i, col_name)\n", 64 | " df[col_name] = df.groupby(cols).cumcount()\n", 65 | " gc.collect()\n", 66 | " return df\n", 67 | "\n", 68 | "def count_uniq(df, group_uniq_cols):\n", 69 | " for i, cols in enumerate(group_uniq_cols):\n", 70 | " group_cols, uniq_col = cols[0], cols[1]\n", 71 | " col_name = 
\"_\".join(group_cols)+'_uniq_'+uniq_col+'_countUniq'\n", 72 | " print(i, col_name)\n", 73 | " tmp = df.groupby(group_cols)[uniq_col].nunique().reset_index(name=col_name)\n", 74 | " df = df.merge(tmp, on=group_cols, how='left')\n", 75 | " del tmp\n", 76 | " gc.collect()\n", 77 | " return df\n", 78 | "\n", 79 | "def next_click(df, group_cols):\n", 80 | " for i, cols in enumerate(group_cols):\n", 81 | " col_name = \"_\".join(cols)+'_nextClick'\n", 82 | " print(i, col_name)\n", 83 | " df[col_name] = (df.groupby(cols).click_time.shift(-1) - df.click_time).astype(np.float32)\n", 84 | " gc.collect()\n", 85 | " return df\n", 86 | "\n", 87 | "def frequence(df, group_cols):\n", 88 | " for i, cols in enumerate(group_cols):\n", 89 | " col_name = \"_\".join(cols)+'_nextClick'\n", 90 | " print(i, col_name)\n", 91 | " clickFreq = df.groupby(cols)[col_name].mean().dropna().reset_index(name=(\"_\".join(cols)+'_clickFreq'))\n", 92 | " df = df.merge(clickFreq, on=cols, how='left')\n", 93 | " del clickFreq\n", 94 | " gc.collect()\n", 95 | " return df" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 19, 101 | "metadata": { 102 | "collapsed": true 103 | }, 104 | "outputs": [], 105 | "source": [ 106 | "def generate_features(df):\n", 107 | " print('generating time features...')\n", 108 | " df['day'] = df['click_time'].dt.day.astype('uint8')\n", 109 | " df['hour'] = df['click_time'].dt.hour.astype('uint8')\n", 110 | " df['in_test_hh'] = (3 - 2 * df['hour'].isin([4, 5, 9, 10, 13, 14]) # most frequent\n", 111 | " - 1 * df['hour'].isin([6, 11, 15])).astype('uint8') # least frequent\n", 112 | " print('done')\n", 113 | " gc.collect()\n", 114 | " \n", 115 | " group_combinations = [\n", 116 | " #['app', 'device'],\n", 117 | " #['app', 'channel']\n", 118 | " ]\n", 119 | " \n", 120 | " count_combinations = [\n", 121 | " ['app'],\n", 122 | " ['ip'], # 3.03\n", 123 | " ['channel'],\n", 124 | " ['os'],\n", 125 | " ['ip', 'device'], # 9.88\n", 126 | " ['day', 'hour', 'app'], # 
4.08\n", 127 | " ['app', 'channel'], # 2.8\n", 128 | " ['ip', 'day', 'in_test_hh'], # 1.74\n", 129 | " ['ip', 'day', 'hour'], # 0.52\n", 130 | " ['os', 'device'], # 0.44\n", 131 | " ['ip', 'os', 'day', 'hour'], # 0.41\n", 132 | " ['ip', 'device', 'day', 'hour'], # 0.31\n", 133 | " ['ip', 'app', 'os'] # 0.21\n", 134 | " ]\n", 135 | " \n", 136 | " countUniq_combinations = [\n", 137 | " #[['app'],'ip'],\n", 138 | " #[['app', 'device', 'os', 'channel'], 'ip'],\n", 139 | " [['ip'], 'channel'], # 0.9\n", 140 | " [['ip'], 'app'], # 1.3\n", 141 | " [['ip'], 'os'] # 0.45\n", 142 | " ]\n", 143 | " \n", 144 | " nextClick_combinations = [\n", 145 | " ['ip', 'os'],\n", 146 | " ['ip', 'device', 'os'],\n", 147 | " ['ip', 'app', 'device', 'os'],\n", 148 | " ['ip', 'app', 'device', 'os', 'channel']\n", 149 | " ]\n", 150 | " \n", 151 | " freq_combinations = [\n", 152 | " #['ip', 'app', 'device', 'os']\n", 153 | " ]\n", 154 | " \n", 155 | " accum_combinations = [\n", 156 | " #['app'],\n", 157 | " ['ip'] # 3.03\n", 158 | " #['day', 'hour', 'app']\n", 159 | " ]\n", 160 | " \n", 161 | " \n", 162 | " df = group_label(df, group_combinations)\n", 163 | " df = count_agg(df, count_combinations)\n", 164 | " df = count_cum(df, accum_combinations)\n", 165 | " df = count_uniq(df, countUniq_combinations)\n", 166 | " df['click_time'] = (df['click_time'].astype(np.int64) // 10 ** 9).astype(np.int32)\n", 167 | " df = next_click(df, nextClick_combinations)\n", 168 | " df = frequence(df, freq_combinations)\n", 169 | " \n", 170 | " df.drop(['ip', 'click_time', 'day', 'in_test_hh'], axis=1, inplace=True)\n", 171 | " gc.collect()\n", 172 | " print(df.info())\n", 173 | " return df" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "# Load Data" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 20, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "dtype = {\n", 190 | " 'ip' :'uint32',\n", 191 | " 'app' 
:'uint16',\n", 192 | " 'device': 'uint16',\n", 193 | " 'os' :'uint16',\n", 194 | " 'channel': 'uint16',\n", 195 | " 'is_attributed': 'uint8',\n", 196 | " 'click_id': 'uint32',\n", 197 | "}\n", 198 | "\n", 199 | "# train: (184903890, 7)\n", 200 | "# test: (18790469, 7)\n", 201 | "train_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed']\n", 202 | "train_df = pd.read_csv('data/train.csv', dtype=dtype, usecols=train_cols, parse_dates=['click_time'])\n", 203 | "\n", 204 | "test_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'click_id']\n", 205 | "# using test_supplement \n", 206 | "test_df = pd.read_csv('data/test_supplement.csv', dtype=dtype, usecols=test_cols, parse_dates=['click_time'])" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 21, 212 | "metadata": { 213 | "collapsed": true 214 | }, 215 | "outputs": [], 216 | "source": [ 217 | "# combine train and test data\n", 218 | "common_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time']\n", 219 | "all_df = pd.concat([train_df[common_cols], test_df[common_cols]])" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 22, 225 | "metadata": { 226 | "scrolled": true 227 | }, 228 | "outputs": [ 229 | { 230 | "name": "stdout", 231 | "output_type": "stream", 232 | "text": [ 233 | "generating time features...\n", 234 | "done\n", 235 | "0 app_count\n", 236 | "1 ip_count\n", 237 | "2 channel_count\n", 238 | "3 os_count\n", 239 | "4 ip_device_count\n", 240 | "5 day_hour_app_count\n", 241 | "6 app_channel_count\n", 242 | "7 ip_day_in_test_hh_count\n", 243 | "8 ip_day_hour_count\n", 244 | "9 os_device_count\n", 245 | "10 ip_os_day_hour_count\n", 246 | "11 ip_device_day_hour_count\n", 247 | "12 ip_app_os_count\n", 248 | "0 ip_countAccum\n", 249 | "0 ip_uniq_channel_countUniq\n", 250 | "1 ip_uniq_app_countUniq\n", 251 | "2 ip_uniq_os_countUniq\n", 252 | "0 ip_os_nextClick\n", 253 | "1 ip_device_os_nextClick\n", 254 | "2 
ip_app_device_os_nextClick\n", 255 | "3 ip_app_device_os_channel_nextClick\n", 256 | "\n", 257 | "Int64Index: 200000 entries, 0 to 199999\n", 258 | "Data columns (total 26 columns):\n", 259 | "app 200000 non-null uint16\n", 260 | "device 200000 non-null uint16\n", 261 | "os 200000 non-null uint16\n", 262 | "channel 200000 non-null uint16\n", 263 | "hour 200000 non-null uint8\n", 264 | "app_count 200000 non-null int64\n", 265 | "ip_count 200000 non-null int64\n", 266 | "channel_count 200000 non-null int64\n", 267 | "os_count 200000 non-null int64\n", 268 | "ip_device_count 200000 non-null int64\n", 269 | "day_hour_app_count 200000 non-null int64\n", 270 | "app_channel_count 200000 non-null int64\n", 271 | "ip_day_in_test_hh_count 200000 non-null int64\n", 272 | "ip_day_hour_count 200000 non-null int64\n", 273 | "os_device_count 200000 non-null int64\n", 274 | "ip_os_day_hour_count 200000 non-null int64\n", 275 | "ip_device_day_hour_count 200000 non-null int64\n", 276 | "ip_app_os_count 200000 non-null int64\n", 277 | "ip_countAccum 200000 non-null int64\n", 278 | "ip_uniq_channel_countUniq 200000 non-null int64\n", 279 | "ip_uniq_app_countUniq 200000 non-null int64\n", 280 | "ip_uniq_os_countUniq 200000 non-null int64\n", 281 | "ip_os_nextClick 151558 non-null float32\n", 282 | "ip_device_os_nextClick 150231 non-null float32\n", 283 | "ip_app_device_os_nextClick 63406 non-null float32\n", 284 | "ip_app_device_os_channel_nextClick 42736 non-null float32\n", 285 | "dtypes: float32(4), int64(17), uint16(4), uint8(1)\n", 286 | "memory usage: 32.2 MB\n", 287 | "None\n" 288 | ] 289 | } 290 | ], 291 | "source": [ 292 | "# generate data\n", 293 | "all_df = generate_features(all_df)" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 10, 299 | "metadata": {}, 300 | "outputs": [ 301 | { 302 | "data": { 303 | "text/plain": [ 304 | "24" 305 | ] 306 | }, 307 | "execution_count": 10, 308 | "metadata": {}, 309 | "output_type": "execute_result" 310 | } 311 
| ], 312 | "source": [ 313 | "# split train/test features from concated data\n", 314 | "train_features = all_df.iloc[:train_df.shape[0]]\n", 315 | "test_features = all_df.iloc[train_df.shape[0]:]\n", 316 | "gc.collect()" 317 | ] 318 | }, 319 | { 320 | "cell_type": "markdown", 321 | "metadata": {}, 322 | "source": [ 323 | "# Train LightGBM Model" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 11, 329 | "metadata": { 330 | "collapsed": true 331 | }, 332 | "outputs": [], 333 | "source": [ 334 | "metrics = 'auc'\n", 335 | "lgb_params = {\n", 336 | " 'boosting_type': 'gbdt',\n", 337 | " 'objective': 'binary',\n", 338 | " 'metric': metrics,\n", 339 | " 'learning_rate': 0.1,\n", 340 | " 'num_leaves': 7,\n", 341 | " 'max_depth': 4,\n", 342 | " 'min_child_samples': 100,\n", 343 | " 'max_bin': 100,\n", 344 | " 'subsample': 0.7,\n", 345 | " 'subsample_freq': 1,\n", 346 | " 'colsample_bytree': 0.7,\n", 347 | " 'min_child_weight': 0,\n", 348 | " 'min_split_gain': 0,\n", 349 | " 'nthread': 24,\n", 350 | " 'verbose': 1,\n", 351 | " 'scale_pos_weight': 200\n", 352 | "}\n", 353 | "\n", 354 | "target = 'is_attributed'\n", 355 | "features = [col for col in train_features.columns if col not in ['level_0', 'index', 'is_attributed']]\n", 356 | "category = ['app', 'device', 'os', 'channel', 'hour']" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": 12, 362 | "metadata": {}, 363 | "outputs": [ 364 | { 365 | "name": "stdout", 366 | "output_type": "stream", 367 | "text": [ 368 | "Train size: 179903890\n", 369 | "Valid size: 5000000\n" 370 | ] 371 | }, 372 | { 373 | "data": { 374 | "text/plain": [ 375 | "12" 376 | ] 377 | }, 378 | "execution_count": 12, 379 | "metadata": {}, 380 | "output_type": "execute_result" 381 | } 382 | ], 383 | "source": [ 384 | "# train valid split\n", 385 | "labels = train_df.is_attributed.values\n", 386 | "train_features, valid_features = train_test_split(train_features, test_size=5000000, shuffle=False)\n", 
387 | "train_labels, valid_labels = train_test_split(labels, test_size=5000000, shuffle=False)\n", 388 | "print('Train size:', len(train_features))\n", 389 | "print('Valid size:', len(valid_features))\n", 390 | "gc.collect()" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": 13, 396 | "metadata": { 397 | "collapsed": true 398 | }, 399 | "outputs": [], 400 | "source": [ 401 | "# convert data into dataset. Warning: Memory Peak\n", 402 | "xgtrain = lgb.Dataset(train_features[features].values, \n", 403 | " label=train_labels,\n", 404 | " feature_name=features,\n", 405 | " categorical_feature=category)" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": 14, 411 | "metadata": { 412 | "collapsed": true 413 | }, 414 | "outputs": [], 415 | "source": [ 416 | "xgvalid = lgb.Dataset(valid_features[features].values, \n", 417 | " label=valid_labels,\n", 418 | " feature_name=features,\n", 419 | " categorical_feature=category)" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": 15, 425 | "metadata": { 426 | "scrolled": true 427 | }, 428 | "outputs": [ 429 | { 430 | "name": "stdout", 431 | "output_type": "stream", 432 | "text": [ 433 | "Training...\n" 434 | ] 435 | }, 436 | { 437 | "name": "stderr", 438 | "output_type": "stream", 439 | "text": [ 440 | "/home/ubuntu/anaconda3/lib/python3.6/site-packages/lightgbm/basic.py:1036: UserWarning: Using categorical_feature in Dataset.\n", 441 | " warnings.warn('Using categorical_feature in Dataset.')\n", 442 | "/home/ubuntu/anaconda3/lib/python3.6/site-packages/lightgbm/basic.py:681: UserWarning: categorical_feature in param dict is overrided.\n", 443 | " warnings.warn('categorical_feature in param dict is overrided.')\n" 444 | ] 445 | }, 446 | { 447 | "name": "stdout", 448 | "output_type": "stream", 449 | "text": [ 450 | "[1]\tvalid's auc: 0.966352\n", 451 | "Training until validation scores don't improve for 50 rounds.\n", 452 | "[2]\tvalid's auc: 0.965255\n", 
453 | "[3]\tvalid's auc: 0.965736\n", 454 | "[4]\tvalid's auc: 0.967262\n", 455 | "[5]\tvalid's auc: 0.968275\n", 456 | "[6]\tvalid's auc: 0.968514\n", 457 | "[7]\tvalid's auc: 0.969005\n", 458 | "[8]\tvalid's auc: 0.969135\n", 459 | "[9]\tvalid's auc: 0.969544\n", 460 | "[10]\tvalid's auc: 0.969897\n", 461 | "[11]\tvalid's auc: 0.971303\n", 462 | "[12]\tvalid's auc: 0.970808\n", 463 | "[13]\tvalid's auc: 0.972213\n", 464 | "[14]\tvalid's auc: 0.972248\n", 465 | "[15]\tvalid's auc: 0.972585\n", 466 | "[16]\tvalid's auc: 0.973142\n", 467 | "[17]\tvalid's auc: 0.972927\n", 468 | "[18]\tvalid's auc: 0.973638\n", 469 | "[19]\tvalid's auc: 0.973712\n", 470 | "[20]\tvalid's auc: 0.973566\n", 471 | "[21]\tvalid's auc: 0.97354\n", 472 | "[22]\tvalid's auc: 0.974188\n", 473 | "[23]\tvalid's auc: 0.974211\n", 474 | "[24]\tvalid's auc: 0.974127\n", 475 | "[25]\tvalid's auc: 0.975213\n", 476 | "[26]\tvalid's auc: 0.975339\n", 477 | "[27]\tvalid's auc: 0.975524\n", 478 | "[28]\tvalid's auc: 0.975912\n", 479 | "[29]\tvalid's auc: 0.976048\n", 480 | "[30]\tvalid's auc: 0.976783\n", 481 | "[31]\tvalid's auc: 0.977161\n", 482 | "[32]\tvalid's auc: 0.977056\n", 483 | "[33]\tvalid's auc: 0.977343\n", 484 | "[34]\tvalid's auc: 0.977964\n", 485 | "[35]\tvalid's auc: 0.978315\n", 486 | "[36]\tvalid's auc: 0.978804\n", 487 | "[37]\tvalid's auc: 0.979182\n", 488 | "[38]\tvalid's auc: 0.979507\n", 489 | "[39]\tvalid's auc: 0.979803\n", 490 | "[40]\tvalid's auc: 0.979992\n", 491 | "[41]\tvalid's auc: 0.98026\n", 492 | "[42]\tvalid's auc: 0.98059\n", 493 | "[43]\tvalid's auc: 0.980902\n", 494 | "[44]\tvalid's auc: 0.981156\n", 495 | "[45]\tvalid's auc: 0.981405\n", 496 | "[46]\tvalid's auc: 0.981478\n", 497 | "[47]\tvalid's auc: 0.98176\n", 498 | "[48]\tvalid's auc: 0.981869\n", 499 | "[49]\tvalid's auc: 0.982101\n", 500 | "[50]\tvalid's auc: 0.982318\n", 501 | "[51]\tvalid's auc: 0.982542\n", 502 | "[52]\tvalid's auc: 0.982618\n", 503 | "[53]\tvalid's auc: 0.982809\n", 504 | "[54]\tvalid's 
auc: 0.983017\n", 505 | "[55]\tvalid's auc: 0.983152\n", 506 | "[56]\tvalid's auc: 0.983427\n", 507 | "[57]\tvalid's auc: 0.983456\n", 508 | "[58]\tvalid's auc: 0.983586\n", 509 | "[59]\tvalid's auc: 0.983818\n", 510 | "[60]\tvalid's auc: 0.983957\n", 511 | "[61]\tvalid's auc: 0.984085\n", 512 | "[62]\tvalid's auc: 0.984128\n", 513 | "[63]\tvalid's auc: 0.984178\n", 514 | "[64]\tvalid's auc: 0.984309\n", 515 | "[65]\tvalid's auc: 0.984344\n", 516 | "[66]\tvalid's auc: 0.984351\n", 517 | "[67]\tvalid's auc: 0.98457\n", 518 | "[68]\tvalid's auc: 0.984685\n", 519 | "[69]\tvalid's auc: 0.984789\n", 520 | "[70]\tvalid's auc: 0.984903\n", 521 | "[71]\tvalid's auc: 0.984958\n", 522 | "[72]\tvalid's auc: 0.984998\n", 523 | "[73]\tvalid's auc: 0.985041\n", 524 | "[74]\tvalid's auc: 0.985125\n", 525 | "[75]\tvalid's auc: 0.985219\n", 526 | "[76]\tvalid's auc: 0.985292\n", 527 | "[77]\tvalid's auc: 0.985257\n", 528 | "[78]\tvalid's auc: 0.985318\n", 529 | "[79]\tvalid's auc: 0.985476\n", 530 | "[80]\tvalid's auc: 0.985545\n", 531 | "[81]\tvalid's auc: 0.985609\n", 532 | "[82]\tvalid's auc: 0.985609\n", 533 | "[83]\tvalid's auc: 0.985659\n", 534 | "[84]\tvalid's auc: 0.985732\n", 535 | "[85]\tvalid's auc: 0.985866\n", 536 | "[86]\tvalid's auc: 0.985924\n", 537 | "[87]\tvalid's auc: 0.985957\n", 538 | "[88]\tvalid's auc: 0.985964\n", 539 | "[89]\tvalid's auc: 0.986025\n", 540 | "[90]\tvalid's auc: 0.986087\n", 541 | "[91]\tvalid's auc: 0.986072\n", 542 | "[92]\tvalid's auc: 0.986112\n", 543 | "[93]\tvalid's auc: 0.986156\n", 544 | "[94]\tvalid's auc: 0.986164\n", 545 | "[95]\tvalid's auc: 0.986194\n", 546 | "[96]\tvalid's auc: 0.986236\n", 547 | "[97]\tvalid's auc: 0.986265\n", 548 | "[98]\tvalid's auc: 0.986319\n", 549 | "[99]\tvalid's auc: 0.986319\n", 550 | "[100]\tvalid's auc: 0.986367\n", 551 | "[101]\tvalid's auc: 0.986404\n", 552 | "[102]\tvalid's auc: 0.986425\n", 553 | "[103]\tvalid's auc: 0.986505\n", 554 | "[104]\tvalid's auc: 0.986561\n", 555 | "[105]\tvalid's auc: 
0.986595\n", 556 | "[106]\tvalid's auc: 0.986616\n", 557 | "[107]\tvalid's auc: 0.986659\n", 558 | "[108]\tvalid's auc: 0.98678\n", 559 | "[109]\tvalid's auc: 0.986888\n", 560 | "[110]\tvalid's auc: 0.986913\n", 561 | "[111]\tvalid's auc: 0.986951\n", 562 | "[112]\tvalid's auc: 0.986974\n", 563 | "[113]\tvalid's auc: 0.987014\n", 564 | "[114]\tvalid's auc: 0.987038\n", 565 | "[115]\tvalid's auc: 0.987057\n", 566 | "[116]\tvalid's auc: 0.987072\n", 567 | "[117]\tvalid's auc: 0.98708\n", 568 | "[118]\tvalid's auc: 0.987092\n", 569 | "[119]\tvalid's auc: 0.987133\n", 570 | "[120]\tvalid's auc: 0.987133\n", 571 | "[121]\tvalid's auc: 0.987116\n", 572 | "[122]\tvalid's auc: 0.987127\n", 573 | "[123]\tvalid's auc: 0.987155\n", 574 | "[124]\tvalid's auc: 0.987184\n", 575 | "[125]\tvalid's auc: 0.987213\n", 576 | "[126]\tvalid's auc: 0.987238\n", 577 | "[127]\tvalid's auc: 0.987232\n", 578 | "[128]\tvalid's auc: 0.987244\n", 579 | "[129]\tvalid's auc: 0.987276\n", 580 | "[130]\tvalid's auc: 0.987301\n", 581 | "[131]\tvalid's auc: 0.987324\n", 582 | "[132]\tvalid's auc: 0.987332\n", 583 | "[133]\tvalid's auc: 0.987415\n", 584 | "[134]\tvalid's auc: 0.98743\n", 585 | "[135]\tvalid's auc: 0.987462\n", 586 | "[136]\tvalid's auc: 0.987479\n", 587 | "[137]\tvalid's auc: 0.987485\n", 588 | "[138]\tvalid's auc: 0.987551\n", 589 | "[139]\tvalid's auc: 0.98758\n", 590 | "[140]\tvalid's auc: 0.987604\n", 591 | "[141]\tvalid's auc: 0.98763\n", 592 | "[142]\tvalid's auc: 0.987645\n", 593 | "[143]\tvalid's auc: 0.98765\n", 594 | "[144]\tvalid's auc: 0.98767\n", 595 | "[145]\tvalid's auc: 0.987672\n", 596 | "[146]\tvalid's auc: 0.987688\n", 597 | "[147]\tvalid's auc: 0.987712\n", 598 | "[148]\tvalid's auc: 0.987729\n", 599 | "[149]\tvalid's auc: 0.987749\n", 600 | "[150]\tvalid's auc: 0.98776\n", 601 | "[151]\tvalid's auc: 0.987767\n", 602 | "[152]\tvalid's auc: 0.987771\n", 603 | "[153]\tvalid's auc: 0.987785\n", 604 | "[154]\tvalid's auc: 0.987769\n", 605 | "[155]\tvalid's auc: 
0.987781\n", 606 | "[156]\tvalid's auc: 0.987792\n", 607 | "[157]\tvalid's auc: 0.987846\n", 608 | "[158]\tvalid's auc: 0.987844\n", 609 | "[159]\tvalid's auc: 0.987852\n", 610 | "[160]\tvalid's auc: 0.987859\n", 611 | "[161]\tvalid's auc: 0.987874\n", 612 | "[162]\tvalid's auc: 0.987889\n", 613 | "[163]\tvalid's auc: 0.98795\n", 614 | "[164]\tvalid's auc: 0.987965\n", 615 | "[165]\tvalid's auc: 0.987981\n", 616 | "[166]\tvalid's auc: 0.987998\n", 617 | "[167]\tvalid's auc: 0.987999\n", 618 | "[168]\tvalid's auc: 0.988003\n", 619 | "[169]\tvalid's auc: 0.988009\n", 620 | "[170]\tvalid's auc: 0.988005\n", 621 | "[171]\tvalid's auc: 0.988019\n", 622 | "[172]\tvalid's auc: 0.988046\n", 623 | "[173]\tvalid's auc: 0.988057\n", 624 | "[174]\tvalid's auc: 0.988113\n", 625 | "[175]\tvalid's auc: 0.988111\n", 626 | "[176]\tvalid's auc: 0.98811\n", 627 | "[177]\tvalid's auc: 0.988127\n", 628 | "[178]\tvalid's auc: 0.988139\n", 629 | "[179]\tvalid's auc: 0.988148\n", 630 | "[180]\tvalid's auc: 0.988165\n", 631 | "[181]\tvalid's auc: 0.988184\n", 632 | "[182]\tvalid's auc: 0.988197\n", 633 | "[183]\tvalid's auc: 0.98821\n", 634 | "[184]\tvalid's auc: 0.988226\n", 635 | "[185]\tvalid's auc: 0.988241\n", 636 | "[186]\tvalid's auc: 0.988251\n", 637 | "[187]\tvalid's auc: 0.988265\n", 638 | "[188]\tvalid's auc: 0.988306\n", 639 | "[189]\tvalid's auc: 0.988307\n", 640 | "[190]\tvalid's auc: 0.988315\n", 641 | "[191]\tvalid's auc: 0.988353\n", 642 | "[192]\tvalid's auc: 0.988363\n", 643 | "[193]\tvalid's auc: 0.988357\n", 644 | "[194]\tvalid's auc: 0.988368\n", 645 | "[195]\tvalid's auc: 0.98837\n", 646 | "[196]\tvalid's auc: 0.988389\n", 647 | "[197]\tvalid's auc: 0.988387\n", 648 | "[198]\tvalid's auc: 0.988412\n", 649 | "[199]\tvalid's auc: 0.988422\n", 650 | "[200]\tvalid's auc: 0.988441\n", 651 | "[201]\tvalid's auc: 0.988445\n", 652 | "[202]\tvalid's auc: 0.988474\n", 653 | "[203]\tvalid's auc: 0.988483\n", 654 | "[204]\tvalid's auc: 0.988504\n", 655 | "[205]\tvalid's auc: 
0.988567\n", 656 | "[206]\tvalid's auc: 0.988577\n", 657 | "[207]\tvalid's auc: 0.98858\n", 658 | "[208]\tvalid's auc: 0.9886\n", 659 | "[209]\tvalid's auc: 0.988613\n", 660 | "[210]\tvalid's auc: 0.98862\n", 661 | "[211]\tvalid's auc: 0.988629\n", 662 | "[212]\tvalid's auc: 0.988633\n", 663 | "[213]\tvalid's auc: 0.988645\n", 664 | "[214]\tvalid's auc: 0.988648\n", 665 | "[215]\tvalid's auc: 0.988653\n", 666 | "[216]\tvalid's auc: 0.988656\n", 667 | "[217]\tvalid's auc: 0.988656\n", 668 | "[218]\tvalid's auc: 0.988648\n", 669 | "[219]\tvalid's auc: 0.988646\n", 670 | "[220]\tvalid's auc: 0.988649\n", 671 | "[221]\tvalid's auc: 0.988661\n", 672 | "[222]\tvalid's auc: 0.988665\n", 673 | "[223]\tvalid's auc: 0.988666\n", 674 | "[224]\tvalid's auc: 0.988673\n", 675 | "[225]\tvalid's auc: 0.988683\n", 676 | "[226]\tvalid's auc: 0.988691\n", 677 | "[227]\tvalid's auc: 0.988696\n", 678 | "[228]\tvalid's auc: 0.988706\n", 679 | "[229]\tvalid's auc: 0.988704\n", 680 | "[230]\tvalid's auc: 0.988712\n", 681 | "[231]\tvalid's auc: 0.98872\n", 682 | "[232]\tvalid's auc: 0.988717\n", 683 | "[233]\tvalid's auc: 0.98872\n", 684 | "[234]\tvalid's auc: 0.988716\n", 685 | "[235]\tvalid's auc: 0.988719\n", 686 | "[236]\tvalid's auc: 0.988727\n", 687 | "[237]\tvalid's auc: 0.988725\n", 688 | "[238]\tvalid's auc: 0.98873\n", 689 | "[239]\tvalid's auc: 0.988745\n", 690 | "[240]\tvalid's auc: 0.988748\n", 691 | "[241]\tvalid's auc: 0.988762\n", 692 | "[242]\tvalid's auc: 0.988763\n", 693 | "[243]\tvalid's auc: 0.988771\n", 694 | "[244]\tvalid's auc: 0.988776\n", 695 | "[245]\tvalid's auc: 0.988796\n", 696 | "[246]\tvalid's auc: 0.988795\n", 697 | "[247]\tvalid's auc: 0.988801\n", 698 | "[248]\tvalid's auc: 0.9888\n", 699 | "[249]\tvalid's auc: 0.988802\n", 700 | "[250]\tvalid's auc: 0.988817\n", 701 | "[251]\tvalid's auc: 0.988814\n", 702 | "[252]\tvalid's auc: 0.988821\n", 703 | "[253]\tvalid's auc: 0.988824\n", 704 | "[254]\tvalid's auc: 0.988821\n", 705 | "[255]\tvalid's auc: 
0.988825\n", 706 | "[256]\tvalid's auc: 0.988829\n", 707 | "[257]\tvalid's auc: 0.988874\n", 708 | "[258]\tvalid's auc: 0.988883\n", 709 | "[259]\tvalid's auc: 0.988887\n", 710 | "[260]\tvalid's auc: 0.988892\n", 711 | "[261]\tvalid's auc: 0.988899\n", 712 | "[262]\tvalid's auc: 0.988903\n", 713 | "[263]\tvalid's auc: 0.988906\n", 714 | "[264]\tvalid's auc: 0.988912\n", 715 | "[265]\tvalid's auc: 0.988917\n", 716 | "[266]\tvalid's auc: 0.988911\n", 717 | "[267]\tvalid's auc: 0.988918\n", 718 | "[268]\tvalid's auc: 0.988915\n", 719 | "[269]\tvalid's auc: 0.988929\n", 720 | "[270]\tvalid's auc: 0.988941\n", 721 | "[271]\tvalid's auc: 0.988954\n", 722 | "[272]\tvalid's auc: 0.988946\n", 723 | "[273]\tvalid's auc: 0.98895\n", 724 | "[274]\tvalid's auc: 0.988953\n", 725 | "[275]\tvalid's auc: 0.988954\n", 726 | "[276]\tvalid's auc: 0.988956\n", 727 | "[277]\tvalid's auc: 0.988977\n", 728 | "[278]\tvalid's auc: 0.988982\n", 729 | "[279]\tvalid's auc: 0.988984\n", 730 | "[280]\tvalid's auc: 0.988991\n", 731 | "[281]\tvalid's auc: 0.989004\n", 732 | "[282]\tvalid's auc: 0.989008\n", 733 | "[283]\tvalid's auc: 0.989012\n", 734 | "[284]\tvalid's auc: 0.989013\n", 735 | "[285]\tvalid's auc: 0.989021\n", 736 | "[286]\tvalid's auc: 0.989025\n", 737 | "[287]\tvalid's auc: 0.989029\n", 738 | "[288]\tvalid's auc: 0.989031\n", 739 | "[289]\tvalid's auc: 0.989034\n", 740 | "[290]\tvalid's auc: 0.989038\n", 741 | "[291]\tvalid's auc: 0.989056\n", 742 | "[292]\tvalid's auc: 0.989055\n", 743 | "[293]\tvalid's auc: 0.989049\n", 744 | "[294]\tvalid's auc: 0.989053\n", 745 | "[295]\tvalid's auc: 0.989058\n", 746 | "[296]\tvalid's auc: 0.989077\n" 747 | ] 748 | }, 749 | { 750 | "name": "stdout", 751 | "output_type": "stream", 752 | "text": [ 753 | "[297]\tvalid's auc: 0.989073\n", 754 | "[298]\tvalid's auc: 0.989077\n", 755 | "[299]\tvalid's auc: 0.989073\n", 756 | "[300]\tvalid's auc: 0.989086\n", 757 | "[301]\tvalid's auc: 0.989088\n", 758 | "[302]\tvalid's auc: 0.989092\n", 759 | 
"[303]\tvalid's auc: 0.989092\n", 760 | "[304]\tvalid's auc: 0.989096\n", 761 | "[305]\tvalid's auc: 0.989102\n", 762 | "[306]\tvalid's auc: 0.989102\n", 763 | "[307]\tvalid's auc: 0.989111\n", 764 | "[308]\tvalid's auc: 0.989113\n", 765 | "[309]\tvalid's auc: 0.989125\n", 766 | "[310]\tvalid's auc: 0.989129\n", 767 | "[311]\tvalid's auc: 0.98913\n", 768 | "[312]\tvalid's auc: 0.989132\n", 769 | "[313]\tvalid's auc: 0.989128\n", 770 | "[314]\tvalid's auc: 0.989138\n", 771 | "[315]\tvalid's auc: 0.989141\n", 772 | "[316]\tvalid's auc: 0.989145\n", 773 | "[317]\tvalid's auc: 0.98915\n", 774 | "[318]\tvalid's auc: 0.989162\n", 775 | "[319]\tvalid's auc: 0.989167\n", 776 | "[320]\tvalid's auc: 0.989169\n", 777 | "[321]\tvalid's auc: 0.989171\n", 778 | "[322]\tvalid's auc: 0.989168\n", 779 | "[323]\tvalid's auc: 0.989171\n", 780 | "[324]\tvalid's auc: 0.989172\n", 781 | "[325]\tvalid's auc: 0.989175\n", 782 | "[326]\tvalid's auc: 0.989175\n", 783 | "[327]\tvalid's auc: 0.989182\n", 784 | "[328]\tvalid's auc: 0.989182\n", 785 | "[329]\tvalid's auc: 0.989193\n", 786 | "[330]\tvalid's auc: 0.989196\n", 787 | "[331]\tvalid's auc: 0.989205\n", 788 | "[332]\tvalid's auc: 0.98921\n", 789 | "[333]\tvalid's auc: 0.98921\n", 790 | "[334]\tvalid's auc: 0.989211\n", 791 | "[335]\tvalid's auc: 0.989212\n", 792 | "[336]\tvalid's auc: 0.989213\n", 793 | "[337]\tvalid's auc: 0.989215\n", 794 | "[338]\tvalid's auc: 0.989219\n", 795 | "[339]\tvalid's auc: 0.989223\n", 796 | "[340]\tvalid's auc: 0.989228\n", 797 | "[341]\tvalid's auc: 0.989233\n", 798 | "[342]\tvalid's auc: 0.989234\n", 799 | "[343]\tvalid's auc: 0.989236\n", 800 | "[344]\tvalid's auc: 0.989236\n", 801 | "[345]\tvalid's auc: 0.989236\n", 802 | "[346]\tvalid's auc: 0.989239\n", 803 | "[347]\tvalid's auc: 0.989239\n", 804 | "[348]\tvalid's auc: 0.989239\n", 805 | "[349]\tvalid's auc: 0.989242\n", 806 | "[350]\tvalid's auc: 0.98925\n", 807 | "[351]\tvalid's auc: 0.989253\n", 808 | "[352]\tvalid's auc: 0.98926\n", 809 | 
"[353]\tvalid's auc: 0.989282\n", 810 | "[354]\tvalid's auc: 0.989296\n", 811 | "[355]\tvalid's auc: 0.989302\n", 812 | "[356]\tvalid's auc: 0.989303\n", 813 | "[357]\tvalid's auc: 0.989309\n", 814 | "[358]\tvalid's auc: 0.989309\n", 815 | "[359]\tvalid's auc: 0.989306\n", 816 | "[360]\tvalid's auc: 0.989309\n", 817 | "[361]\tvalid's auc: 0.989311\n", 818 | "[362]\tvalid's auc: 0.989319\n", 819 | "[363]\tvalid's auc: 0.989313\n", 820 | "[364]\tvalid's auc: 0.989315\n", 821 | "[365]\tvalid's auc: 0.989312\n", 822 | "[366]\tvalid's auc: 0.989314\n", 823 | "[367]\tvalid's auc: 0.989314\n", 824 | "[368]\tvalid's auc: 0.989319\n", 825 | "[369]\tvalid's auc: 0.989317\n", 826 | "[370]\tvalid's auc: 0.98932\n", 827 | "[371]\tvalid's auc: 0.989321\n", 828 | "[372]\tvalid's auc: 0.989324\n", 829 | "[373]\tvalid's auc: 0.989316\n", 830 | "[374]\tvalid's auc: 0.989318\n", 831 | "[375]\tvalid's auc: 0.98932\n", 832 | "[376]\tvalid's auc: 0.989318\n", 833 | "[377]\tvalid's auc: 0.989318\n", 834 | "[378]\tvalid's auc: 0.98932\n", 835 | "[379]\tvalid's auc: 0.989321\n", 836 | "[380]\tvalid's auc: 0.989326\n", 837 | "[381]\tvalid's auc: 0.989328\n", 838 | "[382]\tvalid's auc: 0.989327\n", 839 | "[383]\tvalid's auc: 0.989331\n", 840 | "[384]\tvalid's auc: 0.989332\n", 841 | "[385]\tvalid's auc: 0.989333\n", 842 | "[386]\tvalid's auc: 0.989334\n", 843 | "[387]\tvalid's auc: 0.989337\n", 844 | "[388]\tvalid's auc: 0.989341\n", 845 | "[389]\tvalid's auc: 0.989342\n", 846 | "[390]\tvalid's auc: 0.989345\n", 847 | "[391]\tvalid's auc: 0.989346\n", 848 | "[392]\tvalid's auc: 0.98934\n", 849 | "[393]\tvalid's auc: 0.989348\n", 850 | "[394]\tvalid's auc: 0.989351\n", 851 | "[395]\tvalid's auc: 0.98935\n", 852 | "[396]\tvalid's auc: 0.989355\n", 853 | "[397]\tvalid's auc: 0.98937\n", 854 | "[398]\tvalid's auc: 0.989376\n", 855 | "[399]\tvalid's auc: 0.989376\n", 856 | "[400]\tvalid's auc: 0.989377\n", 857 | "[401]\tvalid's auc: 0.989379\n", 858 | "[402]\tvalid's auc: 0.989379\n", 859 | 
"[403]\tvalid's auc: 0.989376\n", 860 | "[404]\tvalid's auc: 0.989376\n", 861 | "[405]\tvalid's auc: 0.989375\n", 862 | "[406]\tvalid's auc: 0.989375\n", 863 | "[407]\tvalid's auc: 0.989378\n", 864 | "[408]\tvalid's auc: 0.989385\n", 865 | "[409]\tvalid's auc: 0.98939\n", 866 | "[410]\tvalid's auc: 0.989392\n", 867 | "[411]\tvalid's auc: 0.989389\n", 868 | "[412]\tvalid's auc: 0.989387\n", 869 | "[413]\tvalid's auc: 0.989391\n", 870 | "[414]\tvalid's auc: 0.989396\n", 871 | "[415]\tvalid's auc: 0.989394\n", 872 | "[416]\tvalid's auc: 0.989397\n", 873 | "[417]\tvalid's auc: 0.989401\n", 874 | "[418]\tvalid's auc: 0.989403\n", 875 | "[419]\tvalid's auc: 0.989407\n", 876 | "[420]\tvalid's auc: 0.989407\n", 877 | "[421]\tvalid's auc: 0.989407\n", 878 | "[422]\tvalid's auc: 0.98941\n", 879 | "[423]\tvalid's auc: 0.989412\n", 880 | "[424]\tvalid's auc: 0.98941\n", 881 | "[425]\tvalid's auc: 0.989407\n", 882 | "[426]\tvalid's auc: 0.989409\n", 883 | "[427]\tvalid's auc: 0.989423\n", 884 | "[428]\tvalid's auc: 0.989424\n", 885 | "[429]\tvalid's auc: 0.989427\n", 886 | "[430]\tvalid's auc: 0.989431\n", 887 | "[431]\tvalid's auc: 0.989428\n", 888 | "[432]\tvalid's auc: 0.989427\n", 889 | "[433]\tvalid's auc: 0.989434\n", 890 | "[434]\tvalid's auc: 0.989436\n", 891 | "[435]\tvalid's auc: 0.989439\n", 892 | "[436]\tvalid's auc: 0.989435\n", 893 | "[437]\tvalid's auc: 0.989436\n", 894 | "[438]\tvalid's auc: 0.989434\n", 895 | "[439]\tvalid's auc: 0.989434\n", 896 | "[440]\tvalid's auc: 0.989437\n", 897 | "[441]\tvalid's auc: 0.989438\n", 898 | "[442]\tvalid's auc: 0.989443\n", 899 | "[443]\tvalid's auc: 0.989446\n", 900 | "[444]\tvalid's auc: 0.989448\n", 901 | "[445]\tvalid's auc: 0.989448\n", 902 | "[446]\tvalid's auc: 0.989447\n", 903 | "[447]\tvalid's auc: 0.989451\n", 904 | "[448]\tvalid's auc: 0.989451\n", 905 | "[449]\tvalid's auc: 0.989452\n", 906 | "[450]\tvalid's auc: 0.989453\n", 907 | "[451]\tvalid's auc: 0.989455\n", 908 | "[452]\tvalid's auc: 0.989455\n", 909 | 
"[453]\tvalid's auc: 0.989448\n", 910 | "[454]\tvalid's auc: 0.989447\n", 911 | "[455]\tvalid's auc: 0.989443\n", 912 | "[456]\tvalid's auc: 0.989446\n", 913 | "[457]\tvalid's auc: 0.989448\n", 914 | "[458]\tvalid's auc: 0.98945\n", 915 | "[459]\tvalid's auc: 0.989451\n", 916 | "[460]\tvalid's auc: 0.989456\n", 917 | "[461]\tvalid's auc: 0.989459\n", 918 | "[462]\tvalid's auc: 0.989454\n", 919 | "[463]\tvalid's auc: 0.989454\n", 920 | "[464]\tvalid's auc: 0.989454\n", 921 | "[465]\tvalid's auc: 0.989458\n", 922 | "[466]\tvalid's auc: 0.989456\n", 923 | "[467]\tvalid's auc: 0.989455\n", 924 | "[468]\tvalid's auc: 0.989459\n", 925 | "[469]\tvalid's auc: 0.989458\n", 926 | "[470]\tvalid's auc: 0.989461\n", 927 | "[471]\tvalid's auc: 0.98946\n", 928 | "[472]\tvalid's auc: 0.98946\n", 929 | "[473]\tvalid's auc: 0.989459\n", 930 | "[474]\tvalid's auc: 0.989461\n", 931 | "[475]\tvalid's auc: 0.989461\n", 932 | "[476]\tvalid's auc: 0.989459\n", 933 | "[477]\tvalid's auc: 0.98946\n", 934 | "[478]\tvalid's auc: 0.989461\n", 935 | "[479]\tvalid's auc: 0.989464\n", 936 | "[480]\tvalid's auc: 0.98946\n", 937 | "[481]\tvalid's auc: 0.989464\n", 938 | "[482]\tvalid's auc: 0.989464\n", 939 | "[483]\tvalid's auc: 0.989466\n", 940 | "[484]\tvalid's auc: 0.989467\n", 941 | "[485]\tvalid's auc: 0.989468\n", 942 | "[486]\tvalid's auc: 0.989467\n", 943 | "[487]\tvalid's auc: 0.989468\n", 944 | "[488]\tvalid's auc: 0.989468\n", 945 | "[489]\tvalid's auc: 0.989469\n", 946 | "[490]\tvalid's auc: 0.989472\n", 947 | "[491]\tvalid's auc: 0.989476\n", 948 | "[492]\tvalid's auc: 0.989476\n", 949 | "[493]\tvalid's auc: 0.989477\n", 950 | "[494]\tvalid's auc: 0.989476\n", 951 | "[495]\tvalid's auc: 0.989478\n", 952 | "[496]\tvalid's auc: 0.989524\n", 953 | "[497]\tvalid's auc: 0.989527\n", 954 | "[498]\tvalid's auc: 0.989528\n", 955 | "[499]\tvalid's auc: 0.989533\n", 956 | "[500]\tvalid's auc: 0.989533\n", 957 | "[501]\tvalid's auc: 0.989533\n", 958 | "[502]\tvalid's auc: 0.989552\n", 959 | 
"[503]\tvalid's auc: 0.989555\n", 960 | "[504]\tvalid's auc: 0.989556\n", 961 | "[505]\tvalid's auc: 0.98956\n", 962 | "[506]\tvalid's auc: 0.989562\n", 963 | "[507]\tvalid's auc: 0.98956\n", 964 | "[508]\tvalid's auc: 0.989561\n", 965 | "[509]\tvalid's auc: 0.989569\n", 966 | "[510]\tvalid's auc: 0.989567\n", 967 | "[511]\tvalid's auc: 0.989567\n", 968 | "[512]\tvalid's auc: 0.989567\n", 969 | "[513]\tvalid's auc: 0.989564\n", 970 | "[514]\tvalid's auc: 0.989565\n", 971 | "[515]\tvalid's auc: 0.989566\n", 972 | "[516]\tvalid's auc: 0.989568\n", 973 | "[517]\tvalid's auc: 0.989569\n", 974 | "[518]\tvalid's auc: 0.989569\n", 975 | "[519]\tvalid's auc: 0.989572\n", 976 | "[520]\tvalid's auc: 0.989575\n", 977 | "[521]\tvalid's auc: 0.989581\n", 978 | "[522]\tvalid's auc: 0.98958\n", 979 | "[523]\tvalid's auc: 0.989584\n", 980 | "[524]\tvalid's auc: 0.989591\n", 981 | "[525]\tvalid's auc: 0.989594\n", 982 | "[526]\tvalid's auc: 0.989595\n", 983 | "[527]\tvalid's auc: 0.989598\n", 984 | "[528]\tvalid's auc: 0.989598\n", 985 | "[529]\tvalid's auc: 0.989599\n", 986 | "[530]\tvalid's auc: 0.989601\n", 987 | "[531]\tvalid's auc: 0.9896\n", 988 | "[532]\tvalid's auc: 0.989602\n", 989 | "[533]\tvalid's auc: 0.989602\n", 990 | "[534]\tvalid's auc: 0.989603\n", 991 | "[535]\tvalid's auc: 0.989604\n", 992 | "[536]\tvalid's auc: 0.989603\n", 993 | "[537]\tvalid's auc: 0.989603\n", 994 | "[538]\tvalid's auc: 0.989604\n", 995 | "[539]\tvalid's auc: 0.989605\n", 996 | "[540]\tvalid's auc: 0.989605\n", 997 | "[541]\tvalid's auc: 0.989601\n", 998 | "[542]\tvalid's auc: 0.989604\n", 999 | "[543]\tvalid's auc: 0.989609\n", 1000 | "[544]\tvalid's auc: 0.989613\n", 1001 | "[545]\tvalid's auc: 0.989615\n", 1002 | "[546]\tvalid's auc: 0.989616\n", 1003 | "[547]\tvalid's auc: 0.989616\n", 1004 | "[548]\tvalid's auc: 0.989614\n", 1005 | "[549]\tvalid's auc: 0.989612\n", 1006 | "[550]\tvalid's auc: 0.989612\n", 1007 | "[551]\tvalid's auc: 0.989611\n", 1008 | "[552]\tvalid's auc: 0.989614\n", 
1009 | "[553]\tvalid's auc: 0.989614\n", 1010 | "[554]\tvalid's auc: 0.989613\n", 1011 | "[555]\tvalid's auc: 0.989614\n", 1012 | "[556]\tvalid's auc: 0.98962\n", 1013 | "[557]\tvalid's auc: 0.989616\n", 1014 | "[558]\tvalid's auc: 0.989616\n", 1015 | "[559]\tvalid's auc: 0.989618\n", 1016 | "[560]\tvalid's auc: 0.989619\n", 1017 | "[561]\tvalid's auc: 0.989616\n", 1018 | "[562]\tvalid's auc: 0.989619\n", 1019 | "[563]\tvalid's auc: 0.989619\n", 1020 | "[564]\tvalid's auc: 0.98962\n", 1021 | "[565]\tvalid's auc: 0.989619\n", 1022 | "[566]\tvalid's auc: 0.989621\n", 1023 | "[567]\tvalid's auc: 0.989621\n", 1024 | "[568]\tvalid's auc: 0.989622\n", 1025 | "[569]\tvalid's auc: 0.989625\n", 1026 | "[570]\tvalid's auc: 0.989623\n", 1027 | "[571]\tvalid's auc: 0.989624\n", 1028 | "[572]\tvalid's auc: 0.989624\n", 1029 | "[573]\tvalid's auc: 0.989626\n", 1030 | "[574]\tvalid's auc: 0.989625\n", 1031 | "[575]\tvalid's auc: 0.989628\n", 1032 | "[576]\tvalid's auc: 0.989629\n", 1033 | "[577]\tvalid's auc: 0.98963\n", 1034 | "[578]\tvalid's auc: 0.989628\n", 1035 | "[579]\tvalid's auc: 0.989629\n", 1036 | "[580]\tvalid's auc: 0.989629\n", 1037 | "[581]\tvalid's auc: 0.989626\n", 1038 | "[582]\tvalid's auc: 0.989633\n", 1039 | "[583]\tvalid's auc: 0.989632\n", 1040 | "[584]\tvalid's auc: 0.989632\n", 1041 | "[585]\tvalid's auc: 0.989631\n", 1042 | "[586]\tvalid's auc: 0.989632\n", 1043 | "[587]\tvalid's auc: 0.989634\n", 1044 | "[588]\tvalid's auc: 0.989636\n", 1045 | "[589]\tvalid's auc: 0.989638\n", 1046 | "[590]\tvalid's auc: 0.989639\n" 1047 | ] 1048 | }, 1049 | { 1050 | "name": "stdout", 1051 | "output_type": "stream", 1052 | "text": [ 1053 | "[591]\tvalid's auc: 0.989638\n", 1054 | "[592]\tvalid's auc: 0.989641\n", 1055 | "[593]\tvalid's auc: 0.989639\n", 1056 | "[594]\tvalid's auc: 0.989639\n", 1057 | "[595]\tvalid's auc: 0.98964\n", 1058 | "[596]\tvalid's auc: 0.989638\n", 1059 | "[597]\tvalid's auc: 0.989647\n", 1060 | "[598]\tvalid's auc: 0.989646\n", 1061 | 
"[599]\tvalid's auc: 0.989645\n", 1062 | "[600]\tvalid's auc: 0.989646\n", 1063 | "[601]\tvalid's auc: 0.989649\n", 1064 | "[602]\tvalid's auc: 0.989652\n", 1065 | "[603]\tvalid's auc: 0.989653\n", 1066 | "[604]\tvalid's auc: 0.98965\n", 1067 | "[605]\tvalid's auc: 0.989655\n", 1068 | "[606]\tvalid's auc: 0.989659\n", 1069 | "[607]\tvalid's auc: 0.989662\n", 1070 | "[608]\tvalid's auc: 0.989662\n", 1071 | "[609]\tvalid's auc: 0.989662\n", 1072 | "[610]\tvalid's auc: 0.989663\n", 1073 | "[611]\tvalid's auc: 0.989662\n", 1074 | "[612]\tvalid's auc: 0.989663\n", 1075 | "[613]\tvalid's auc: 0.989663\n", 1076 | "[614]\tvalid's auc: 0.989661\n", 1077 | "[615]\tvalid's auc: 0.989661\n", 1078 | "[616]\tvalid's auc: 0.989662\n", 1079 | "[617]\tvalid's auc: 0.989663\n", 1080 | "[618]\tvalid's auc: 0.989666\n", 1081 | "[619]\tvalid's auc: 0.989665\n", 1082 | "[620]\tvalid's auc: 0.989668\n", 1083 | "[621]\tvalid's auc: 0.989667\n", 1084 | "[622]\tvalid's auc: 0.989664\n", 1085 | "[623]\tvalid's auc: 0.989668\n", 1086 | "[624]\tvalid's auc: 0.989669\n", 1087 | "[625]\tvalid's auc: 0.989669\n", 1088 | "[626]\tvalid's auc: 0.989672\n", 1089 | "[627]\tvalid's auc: 0.989674\n", 1090 | "[628]\tvalid's auc: 0.989676\n", 1091 | "[629]\tvalid's auc: 0.989675\n", 1092 | "[630]\tvalid's auc: 0.989677\n", 1093 | "[631]\tvalid's auc: 0.989677\n", 1094 | "[632]\tvalid's auc: 0.989676\n", 1095 | "[633]\tvalid's auc: 0.989676\n", 1096 | "[634]\tvalid's auc: 0.989671\n", 1097 | "[635]\tvalid's auc: 0.989672\n", 1098 | "[636]\tvalid's auc: 0.989671\n", 1099 | "[637]\tvalid's auc: 0.989673\n", 1100 | "[638]\tvalid's auc: 0.989674\n", 1101 | "[639]\tvalid's auc: 0.989675\n", 1102 | "[640]\tvalid's auc: 0.989674\n", 1103 | "[641]\tvalid's auc: 0.989677\n", 1104 | "[642]\tvalid's auc: 0.989678\n", 1105 | "[643]\tvalid's auc: 0.989679\n", 1106 | "[644]\tvalid's auc: 0.989678\n", 1107 | "[645]\tvalid's auc: 0.989678\n", 1108 | "[646]\tvalid's auc: 0.989681\n", 1109 | "[647]\tvalid's auc: 
0.989682\n", 1110 | "[648]\tvalid's auc: 0.989683\n", 1111 | "[649]\tvalid's auc: 0.989685\n", 1112 | "[650]\tvalid's auc: 0.989687\n", 1113 | "[651]\tvalid's auc: 0.989688\n", 1114 | "[652]\tvalid's auc: 0.989689\n", 1115 | "[653]\tvalid's auc: 0.989689\n", 1116 | "[654]\tvalid's auc: 0.989689\n", 1117 | "[655]\tvalid's auc: 0.989691\n", 1118 | "[656]\tvalid's auc: 0.989692\n", 1119 | "[657]\tvalid's auc: 0.989691\n", 1120 | "[658]\tvalid's auc: 0.98969\n", 1121 | "[659]\tvalid's auc: 0.989687\n", 1122 | "[660]\tvalid's auc: 0.989685\n", 1123 | "[661]\tvalid's auc: 0.989685\n", 1124 | "[662]\tvalid's auc: 0.989688\n", 1125 | "[663]\tvalid's auc: 0.989688\n", 1126 | "[664]\tvalid's auc: 0.989686\n", 1127 | "[665]\tvalid's auc: 0.989686\n", 1128 | "[666]\tvalid's auc: 0.989686\n", 1129 | "[667]\tvalid's auc: 0.989685\n", 1130 | "[668]\tvalid's auc: 0.989687\n", 1131 | "[669]\tvalid's auc: 0.989687\n", 1132 | "[670]\tvalid's auc: 0.989685\n", 1133 | "[671]\tvalid's auc: 0.989686\n", 1134 | "[672]\tvalid's auc: 0.989686\n", 1135 | "[673]\tvalid's auc: 0.989687\n", 1136 | "[674]\tvalid's auc: 0.989698\n", 1137 | "[675]\tvalid's auc: 0.989699\n", 1138 | "[676]\tvalid's auc: 0.989699\n", 1139 | "[677]\tvalid's auc: 0.9897\n", 1140 | "[678]\tvalid's auc: 0.989702\n", 1141 | "[679]\tvalid's auc: 0.989704\n", 1142 | "[680]\tvalid's auc: 0.989708\n", 1143 | "[681]\tvalid's auc: 0.989711\n", 1144 | "[682]\tvalid's auc: 0.989711\n", 1145 | "[683]\tvalid's auc: 0.989714\n", 1146 | "[684]\tvalid's auc: 0.989715\n", 1147 | "[685]\tvalid's auc: 0.989715\n", 1148 | "[686]\tvalid's auc: 0.989723\n", 1149 | "[687]\tvalid's auc: 0.989725\n", 1150 | "[688]\tvalid's auc: 0.989727\n", 1151 | "[689]\tvalid's auc: 0.989729\n", 1152 | "[690]\tvalid's auc: 0.989723\n", 1153 | "[691]\tvalid's auc: 0.989723\n", 1154 | "[692]\tvalid's auc: 0.989725\n", 1155 | "[693]\tvalid's auc: 0.989724\n", 1156 | "[694]\tvalid's auc: 0.989725\n", 1157 | "[695]\tvalid's auc: 0.989725\n", 1158 | 
"[696]\tvalid's auc: 0.989729\n", 1159 | "[697]\tvalid's auc: 0.98973\n", 1160 | "[698]\tvalid's auc: 0.989731\n", 1161 | "[699]\tvalid's auc: 0.989733\n", 1162 | "[700]\tvalid's auc: 0.989733\n", 1163 | "[701]\tvalid's auc: 0.989735\n", 1164 | "[702]\tvalid's auc: 0.989735\n", 1165 | "[703]\tvalid's auc: 0.989736\n", 1166 | "[704]\tvalid's auc: 0.989738\n", 1167 | "[705]\tvalid's auc: 0.989739\n", 1168 | "[706]\tvalid's auc: 0.989737\n", 1169 | "[707]\tvalid's auc: 0.989736\n", 1170 | "[708]\tvalid's auc: 0.989735\n", 1171 | "[709]\tvalid's auc: 0.989735\n", 1172 | "[710]\tvalid's auc: 0.989736\n", 1173 | "[711]\tvalid's auc: 0.989737\n", 1174 | "[712]\tvalid's auc: 0.989738\n", 1175 | "[713]\tvalid's auc: 0.989732\n", 1176 | "[714]\tvalid's auc: 0.989733\n", 1177 | "[715]\tvalid's auc: 0.989735\n", 1178 | "[716]\tvalid's auc: 0.989736\n", 1179 | "[717]\tvalid's auc: 0.989736\n", 1180 | "[718]\tvalid's auc: 0.989737\n", 1181 | "[719]\tvalid's auc: 0.989736\n", 1182 | "[720]\tvalid's auc: 0.989737\n", 1183 | "[721]\tvalid's auc: 0.989735\n", 1184 | "[722]\tvalid's auc: 0.989749\n", 1185 | "[723]\tvalid's auc: 0.989749\n", 1186 | "[724]\tvalid's auc: 0.989749\n", 1187 | "[725]\tvalid's auc: 0.98975\n", 1188 | "[726]\tvalid's auc: 0.989754\n", 1189 | "[727]\tvalid's auc: 0.989752\n", 1190 | "[728]\tvalid's auc: 0.989751\n", 1191 | "[729]\tvalid's auc: 0.989749\n", 1192 | "[730]\tvalid's auc: 0.98975\n", 1193 | "[731]\tvalid's auc: 0.989765\n", 1194 | "[732]\tvalid's auc: 0.989766\n", 1195 | "[733]\tvalid's auc: 0.989767\n", 1196 | "[734]\tvalid's auc: 0.989768\n", 1197 | "[735]\tvalid's auc: 0.98977\n", 1198 | "[736]\tvalid's auc: 0.989768\n", 1199 | "[737]\tvalid's auc: 0.989762\n", 1200 | "[738]\tvalid's auc: 0.989762\n", 1201 | "[739]\tvalid's auc: 0.989763\n", 1202 | "[740]\tvalid's auc: 0.989765\n", 1203 | "[741]\tvalid's auc: 0.989764\n", 1204 | "[742]\tvalid's auc: 0.989764\n", 1205 | "[743]\tvalid's auc: 0.989765\n", 1206 | "[744]\tvalid's auc: 0.989764\n", 
1207 | "[745]\tvalid's auc: 0.989764\n", 1208 | "[746]\tvalid's auc: 0.989765\n", 1209 | "[747]\tvalid's auc: 0.989766\n", 1210 | "[748]\tvalid's auc: 0.989766\n", 1211 | "[749]\tvalid's auc: 0.989767\n", 1212 | "[750]\tvalid's auc: 0.989769\n", 1213 | "[751]\tvalid's auc: 0.989772\n", 1214 | "[752]\tvalid's auc: 0.989771\n", 1215 | "[753]\tvalid's auc: 0.98977\n", 1216 | "[754]\tvalid's auc: 0.98977\n", 1217 | "[755]\tvalid's auc: 0.989769\n", 1218 | "[756]\tvalid's auc: 0.98977\n", 1219 | "[757]\tvalid's auc: 0.989772\n", 1220 | "[758]\tvalid's auc: 0.989774\n", 1221 | "[759]\tvalid's auc: 0.989776\n", 1222 | "[760]\tvalid's auc: 0.989778\n", 1223 | "[761]\tvalid's auc: 0.98978\n", 1224 | "[762]\tvalid's auc: 0.989777\n", 1225 | "[763]\tvalid's auc: 0.989777\n", 1226 | "[764]\tvalid's auc: 0.989778\n", 1227 | "[765]\tvalid's auc: 0.989778\n", 1228 | "[766]\tvalid's auc: 0.98978\n", 1229 | "[767]\tvalid's auc: 0.989781\n", 1230 | "[768]\tvalid's auc: 0.989785\n", 1231 | "[769]\tvalid's auc: 0.989787\n", 1232 | "[770]\tvalid's auc: 0.989789\n", 1233 | "[771]\tvalid's auc: 0.989788\n", 1234 | "[772]\tvalid's auc: 0.98979\n", 1235 | "[773]\tvalid's auc: 0.989788\n", 1236 | "[774]\tvalid's auc: 0.989792\n", 1237 | "[775]\tvalid's auc: 0.989793\n", 1238 | "[776]\tvalid's auc: 0.989792\n", 1239 | "[777]\tvalid's auc: 0.989792\n", 1240 | "[778]\tvalid's auc: 0.989792\n", 1241 | "[779]\tvalid's auc: 0.989792\n", 1242 | "[780]\tvalid's auc: 0.989791\n", 1243 | "[781]\tvalid's auc: 0.989792\n", 1244 | "[782]\tvalid's auc: 0.989793\n", 1245 | "[783]\tvalid's auc: 0.989794\n", 1246 | "[784]\tvalid's auc: 0.989793\n", 1247 | "[785]\tvalid's auc: 0.989793\n", 1248 | "[786]\tvalid's auc: 0.989794\n", 1249 | "[787]\tvalid's auc: 0.989796\n", 1250 | "[788]\tvalid's auc: 0.989796\n", 1251 | "[789]\tvalid's auc: 0.989795\n", 1252 | "[790]\tvalid's auc: 0.989796\n", 1253 | "[791]\tvalid's auc: 0.989799\n", 1254 | "[792]\tvalid's auc: 0.9898\n", 1255 | "[793]\tvalid's auc: 
0.989801\n", 1256 | "[794]\tvalid's auc: 0.989802\n", 1257 | "[795]\tvalid's auc: 0.989804\n", 1258 | "[796]\tvalid's auc: 0.989804\n", 1259 | "[797]\tvalid's auc: 0.989805\n", 1260 | "[798]\tvalid's auc: 0.989801\n", 1261 | "[799]\tvalid's auc: 0.989802\n", 1262 | "[800]\tvalid's auc: 0.989802\n", 1263 | "[801]\tvalid's auc: 0.989806\n", 1264 | "[802]\tvalid's auc: 0.98981\n", 1265 | "[803]\tvalid's auc: 0.989808\n", 1266 | "[804]\tvalid's auc: 0.989808\n", 1267 | "[805]\tvalid's auc: 0.989808\n", 1268 | "[806]\tvalid's auc: 0.989807\n", 1269 | "[807]\tvalid's auc: 0.989808\n", 1270 | "[808]\tvalid's auc: 0.989809\n", 1271 | "[809]\tvalid's auc: 0.989809\n", 1272 | "[810]\tvalid's auc: 0.989811\n", 1273 | "[811]\tvalid's auc: 0.989812\n", 1274 | "[812]\tvalid's auc: 0.989812\n", 1275 | "[813]\tvalid's auc: 0.989811\n", 1276 | "[814]\tvalid's auc: 0.989813\n", 1277 | "[815]\tvalid's auc: 0.989813\n", 1278 | "[816]\tvalid's auc: 0.989812\n", 1279 | "[817]\tvalid's auc: 0.989814\n", 1280 | "[818]\tvalid's auc: 0.989815\n", 1281 | "[819]\tvalid's auc: 0.989815\n", 1282 | "[820]\tvalid's auc: 0.989814\n", 1283 | "[821]\tvalid's auc: 0.989815\n", 1284 | "[822]\tvalid's auc: 0.989815\n", 1285 | "[823]\tvalid's auc: 0.989816\n", 1286 | "[824]\tvalid's auc: 0.989815\n", 1287 | "[825]\tvalid's auc: 0.989815\n", 1288 | "[826]\tvalid's auc: 0.989815\n", 1289 | "[827]\tvalid's auc: 0.989814\n", 1290 | "[828]\tvalid's auc: 0.989813\n", 1291 | "[829]\tvalid's auc: 0.989812\n", 1292 | "[830]\tvalid's auc: 0.989812\n", 1293 | "[831]\tvalid's auc: 0.989809\n", 1294 | "[832]\tvalid's auc: 0.98981\n", 1295 | "[833]\tvalid's auc: 0.989811\n", 1296 | "[834]\tvalid's auc: 0.989812\n", 1297 | "[835]\tvalid's auc: 0.989812\n", 1298 | "[836]\tvalid's auc: 0.989812\n", 1299 | "[837]\tvalid's auc: 0.989813\n", 1300 | "[838]\tvalid's auc: 0.989811\n", 1301 | "[839]\tvalid's auc: 0.989811\n", 1302 | "[840]\tvalid's auc: 0.989812\n", 1303 | "[841]\tvalid's auc: 0.989812\n", 1304 | 
"[842]\tvalid's auc: 0.989811\n", 1305 | "[843]\tvalid's auc: 0.989811\n", 1306 | "[844]\tvalid's auc: 0.989811\n", 1307 | "[845]\tvalid's auc: 0.989812\n", 1308 | "[846]\tvalid's auc: 0.989811\n", 1309 | "[847]\tvalid's auc: 0.989809\n", 1310 | "[848]\tvalid's auc: 0.989807\n", 1311 | "[849]\tvalid's auc: 0.989807\n", 1312 | "[850]\tvalid's auc: 0.989807\n", 1313 | "[851]\tvalid's auc: 0.989811\n", 1314 | "[852]\tvalid's auc: 0.989813\n", 1315 | "[853]\tvalid's auc: 0.989814\n", 1316 | "[854]\tvalid's auc: 0.989813\n", 1317 | "[855]\tvalid's auc: 0.989813\n", 1318 | "[856]\tvalid's auc: 0.989814\n", 1319 | "[857]\tvalid's auc: 0.989816\n", 1320 | "[858]\tvalid's auc: 0.989816\n", 1321 | "[859]\tvalid's auc: 0.989817\n", 1322 | "[860]\tvalid's auc: 0.989818\n", 1323 | "[861]\tvalid's auc: 0.98982\n", 1324 | "[862]\tvalid's auc: 0.989818\n", 1325 | "[863]\tvalid's auc: 0.989822\n", 1326 | "[864]\tvalid's auc: 0.989829\n", 1327 | "[865]\tvalid's auc: 0.989827\n", 1328 | "[866]\tvalid's auc: 0.989827\n", 1329 | "[867]\tvalid's auc: 0.989834\n", 1330 | "[868]\tvalid's auc: 0.989837\n", 1331 | "[869]\tvalid's auc: 0.989839\n", 1332 | "[870]\tvalid's auc: 0.989841\n", 1333 | "[871]\tvalid's auc: 0.989842\n", 1334 | "[872]\tvalid's auc: 0.989849\n", 1335 | "[873]\tvalid's auc: 0.989849\n", 1336 | "[874]\tvalid's auc: 0.989852\n", 1337 | "[875]\tvalid's auc: 0.989852\n", 1338 | "[876]\tvalid's auc: 0.989848\n", 1339 | "[877]\tvalid's auc: 0.989848\n", 1340 | "[878]\tvalid's auc: 0.98985\n", 1341 | "[879]\tvalid's auc: 0.989849\n", 1342 | "[880]\tvalid's auc: 0.98985\n", 1343 | "[881]\tvalid's auc: 0.98985\n", 1344 | "[882]\tvalid's auc: 0.98985\n", 1345 | "[883]\tvalid's auc: 0.98985\n", 1346 | "[884]\tvalid's auc: 0.989849\n" 1347 | ] 1348 | }, 1349 | { 1350 | "name": "stdout", 1351 | "output_type": "stream", 1352 | "text": [ 1353 | "[885]\tvalid's auc: 0.989849\n", 1354 | "[886]\tvalid's auc: 0.989848\n", 1355 | "[887]\tvalid's auc: 0.989852\n", 1356 | "[888]\tvalid's 
auc: 0.989852\n", 1357 | "[889]\tvalid's auc: 0.989847\n", 1358 | "[890]\tvalid's auc: 0.989848\n", 1359 | "[891]\tvalid's auc: 0.989848\n", 1360 | "[892]\tvalid's auc: 0.989849\n", 1361 | "[893]\tvalid's auc: 0.989848\n", 1362 | "[894]\tvalid's auc: 0.989847\n", 1363 | "[895]\tvalid's auc: 0.989849\n", 1364 | "[896]\tvalid's auc: 0.98985\n", 1365 | "[897]\tvalid's auc: 0.98985\n", 1366 | "[898]\tvalid's auc: 0.989845\n", 1367 | "[899]\tvalid's auc: 0.989845\n", 1368 | "[900]\tvalid's auc: 0.989844\n", 1369 | "[901]\tvalid's auc: 0.989843\n", 1370 | "[902]\tvalid's auc: 0.989841\n", 1371 | "[903]\tvalid's auc: 0.989836\n", 1372 | "[904]\tvalid's auc: 0.989837\n", 1373 | "[905]\tvalid's auc: 0.989838\n", 1374 | "[906]\tvalid's auc: 0.989836\n", 1375 | "[907]\tvalid's auc: 0.989837\n", 1376 | "[908]\tvalid's auc: 0.989834\n", 1377 | "[909]\tvalid's auc: 0.989834\n", 1378 | "[910]\tvalid's auc: 0.989831\n", 1379 | "[911]\tvalid's auc: 0.989831\n", 1380 | "[912]\tvalid's auc: 0.989832\n", 1381 | "[913]\tvalid's auc: 0.989832\n", 1382 | "[914]\tvalid's auc: 0.989834\n", 1383 | "[915]\tvalid's auc: 0.989835\n", 1384 | "[916]\tvalid's auc: 0.989836\n", 1385 | "[917]\tvalid's auc: 0.989837\n", 1386 | "[918]\tvalid's auc: 0.989838\n", 1387 | "[919]\tvalid's auc: 0.989841\n", 1388 | "[920]\tvalid's auc: 0.989845\n", 1389 | "[921]\tvalid's auc: 0.989842\n", 1390 | "[922]\tvalid's auc: 0.989843\n", 1391 | "[923]\tvalid's auc: 0.989844\n", 1392 | "[924]\tvalid's auc: 0.989844\n", 1393 | "[925]\tvalid's auc: 0.989844\n", 1394 | "Early stopping, best iteration is:\n", 1395 | "[875]\tvalid's auc: 0.989852\n", 1396 | "\n", 1397 | "Model Info:\n", 1398 | "n_estimators: 875\n", 1399 | "auc: 0.989852182323\n", 1400 | " feature gain split\n", 1401 | "0 app 58.738362 812\n", 1402 | "7 app_count 15.740246 69\n", 1403 | "25 ip_app_device_os_nextClick 5.387018 438\n", 1404 | "3 channel 3.926541 1032\n", 1405 | "8 ip_count 1.766059 115\n", 1406 | "2 os 1.756788 670\n", 1407 | "12 
day_hour_app_count 1.657810 89\n", 1408 | "20 ip_uniq_channel_countUniq 1.600081 128\n", 1409 | "13 app_channel_count 1.574485 79\n", 1410 | "22 ip_uniq_os_countUniq 1.344734 199\n", 1411 | "21 ip_uniq_app_countUniq 1.339017 114\n", 1412 | "14 ip_day_in_test_hh_count 0.752816 49\n", 1413 | "4 hour 0.659677 475\n", 1414 | "11 ip_device_count 0.611573 129\n", 1415 | "23 ip_os_nextClick 0.547943 46\n", 1416 | "15 ip_day_hour_count 0.433315 97\n", 1417 | "1 device 0.427597 71\n", 1418 | "16 os_device_count 0.402650 44\n", 1419 | "17 ip_os_day_hour_count 0.302766 96\n", 1420 | "19 ip_app_os_count 0.264235 101\n", 1421 | "26 ip_app_device_os_channel_nextClick 0.222124 136\n", 1422 | "9 channel_count 0.143971 38\n", 1423 | "10 os_count 0.132563 24\n", 1424 | "18 ip_device_day_hour_count 0.125536 44\n", 1425 | "27 ip_countAccum 0.071134 47\n", 1426 | "24 ip_device_os_nextClick 0.042230 47\n", 1427 | "5 app_device 0.016130 31\n", 1428 | "6 app_channel 0.012600 30\n", 1429 | "model saved as model-2018-05-05-19-53-33\n" 1430 | ] 1431 | } 1432 | ], 1433 | "source": [ 1434 | "print('Training...')\n", 1435 | "evals_results = {}\n", 1436 | "model = lgb.train(lgb_params,\n", 1437 | " xgtrain,\n", 1438 | " valid_sets=[xgvalid],\n", 1439 | " valid_names=['valid'],\n", 1440 | " evals_result=evals_results,\n", 1441 | " num_boost_round=5000,\n", 1442 | " early_stopping_rounds=100,\n", 1443 | " verbose_eval=1,\n", 1444 | " feval=None)\n", 1445 | "n_estimators = model.best_iteration\n", 1446 | "\n", 1447 | "print('\\nModel Info:')\n", 1448 | "print('n_estimators:', n_estimators)\n", 1449 | "print(metrics + ':', evals_results['valid'][metrics][n_estimators - 1])\n", 1450 | "\n", 1451 | "gain = model.feature_importance('gain')\n", 1452 | "ft = pd.DataFrame({'feature': model.feature_name(), 'split': model.feature_importance('split'),\n", 1453 | " 'gain': 100 * gain / gain.sum()}).sort_values('gain', ascending=False)\n", 1454 | "ft.to_csv('feature_importance_ref.csv', index=False)\n", 1455 | 
"print(ft)\n", 1456 | "\n", 1457 | "model_name = 'model-%s' % strftime(\"%Y-%m-%d-%H-%M-%S\", gmtime())\n", 1458 | "model.save_model(model_name)\n", 1459 | "print('model saved as %s' % model_name)" 1460 | ] 1461 | }, 1462 | { 1463 | "cell_type": "markdown", 1464 | "metadata": {}, 1465 | "source": [ 1466 | "# LGB Prediction" 1467 | ] 1468 | }, 1469 | { 1470 | "cell_type": "code", 1471 | "execution_count": 16, 1472 | "metadata": {}, 1473 | "outputs": [ 1474 | { 1475 | "name": "stdout", 1476 | "output_type": "stream", 1477 | "text": [ 1478 | "Predicting...\n" 1479 | ] 1480 | } 1481 | ], 1482 | "source": [ 1483 | "print('Predicting...')\n", 1484 | "test_df['is_attributed'] = model.predict(test_features[features], num_iteration=model.best_iteration)" 1485 | ] 1486 | }, 1487 | { 1488 | "cell_type": "code", 1489 | "execution_count": 17, 1490 | "metadata": { 1491 | "scrolled": true 1492 | }, 1493 | "outputs": [ 1494 | { 1495 | "name": "stdout", 1496 | "output_type": "stream", 1497 | "text": [ 1498 | "loading test\n" 1499 | ] 1500 | } 1501 | ], 1502 | "source": [ 1503 | "print('loading test')\n", 1504 | "test = pd.read_csv('data/test.csv', dtype=dtype, usecols=test_cols, parse_dates=['click_time'])" 1505 | ] 1506 | }, 1507 | { 1508 | "cell_type": "code", 1509 | "execution_count": 18, 1510 | "metadata": {}, 1511 | "outputs": [ 1512 | { 1513 | "name": "stdout", 1514 | "output_type": "stream", 1515 | "text": [ 1516 | "merging test_supplement to test\n", 1517 | "Writing the submission data into a csv file...\n", 1518 | "All done...\n" 1519 | ] 1520 | } 1521 | ], 1522 | "source": [ 1523 | "print('merging test_supplement to test')\n", 1524 | "join_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time']\n", 1525 | "all_cols = join_cols + ['is_attributed']\n", 1526 | "\n", 1527 | "test = test.merge(test_df[all_cols], how='left', on=join_cols)\n", 1528 | "test = test.drop_duplicates(subset=['click_id'])\n", 1529 | "\n", 1530 | "print(\"Writing the submission data into a csv 
file...\")\n", 1531 | "test[['click_id', 'is_attributed']].to_csv('submit_lgb_875.gz', index=False, float_format='%.9f', compression='gzip')\n", 1532 | "print(\"All done...\")" 1533 | ] 1534 | }, 1535 | { 1536 | "cell_type": "code", 1537 | "execution_count": null, 1538 | "metadata": { 1539 | "collapsed": true 1540 | }, 1541 | "outputs": [], 1542 | "source": [ 1543 | "del test\n", 1544 | "gc.collect()" 1545 | ] 1546 | }, 1547 | { 1548 | "cell_type": "markdown", 1549 | "metadata": {}, 1550 | "source": [ 1551 | "# Train XGBoost" 1552 | ] 1553 | }, 1554 | { 1555 | "cell_type": "code", 1556 | "execution_count": 24, 1557 | "metadata": { 1558 | "collapsed": true 1559 | }, 1560 | "outputs": [], 1561 | "source": [ 1562 | "xgb_params = {'eta': 0.1,\n", 1563 | " 'tree_method': \"hist\",\n", 1564 | " 'grow_policy': \"lossguide\",\n", 1565 | " 'max_leaves': 1400, \n", 1566 | " 'max_depth': 4, \n", 1567 | " 'subsample': 0.9, \n", 1568 | " 'colsample_bytree': 0.7, \n", 1569 | " 'colsample_bylevel':0.7,\n", 1570 | " 'min_child_weight':0,\n", 1571 | " 'alpha':0,\n", 1572 | " 'objective': 'binary:logistic', \n", 1573 | " 'eval_metric': 'auc',\n", 1574 | " 'nthread':24,\n", 1575 | " 'random_state': 42,\n", 1576 | " 'scale_pos_weight':200,\n", 1577 | " 'silent': True}" 1578 | ] 1579 | }, 1580 | { 1581 | "cell_type": "code", 1582 | "execution_count": null, 1583 | "metadata": { 1584 | "collapsed": true 1585 | }, 1586 | "outputs": [], 1587 | "source": [ 1588 | "# train valid split\n", 1589 | "labels = train_df.is_attributed.values\n", 1590 | "train_features, valid_features = train_test_split(train_features, train_size=.95, shuffle=False)\n", 1591 | "train_labels, valid_labels = train_test_split(labels, train_size=.95, shuffle=False)\n", 1592 | "print('Train size:', len(train_features))\n", 1593 | "print('Valid size:', len(valid_features))\n", 1594 | "gc.collect()" 1595 | ] 1596 | }, 1597 | { 1598 | "cell_type": "code", 1599 | "execution_count": 25, 1600 | "metadata": { 1601 | 
"collapsed": true 1602 | }, 1603 | "outputs": [], 1604 | "source": [ 1605 | "dtrain = xgb.DMatrix(train_features, train_labels)\n", 1606 | "dvalid = xgb.DMatrix(valid_features, valid_labels)\n", 1607 | "watchlist = [(dvalid, 'valid')]" 1608 | ] 1609 | }, 1610 | { 1611 | "cell_type": "code", 1612 | "execution_count": null, 1613 | "metadata": { 1614 | "scrolled": true 1615 | }, 1616 | "outputs": [ 1617 | { 1618 | "name": "stdout", 1619 | "output_type": "stream", 1620 | "text": [ 1621 | "[0]\tvalid-auc:0.964034\n", 1622 | "Will train until valid-auc hasn't improved in 50 rounds.\n", 1623 | "[5]\tvalid-auc:0.971544\n", 1624 | "[10]\tvalid-auc:0.972256\n", 1625 | "[15]\tvalid-auc:0.973251\n", 1626 | "[20]\tvalid-auc:0.974015\n", 1627 | "[25]\tvalid-auc:0.975026\n", 1628 | "[30]\tvalid-auc:0.976547\n", 1629 | "[35]\tvalid-auc:0.97817\n", 1630 | "[40]\tvalid-auc:0.979116\n", 1631 | "[45]\tvalid-auc:0.980232\n", 1632 | "[50]\tvalid-auc:0.981004\n", 1633 | "[55]\tvalid-auc:0.981727\n", 1634 | "[60]\tvalid-auc:0.982595\n", 1635 | "[65]\tvalid-auc:0.98306\n", 1636 | "[70]\tvalid-auc:0.983537\n", 1637 | "[75]\tvalid-auc:0.984043\n", 1638 | "[80]\tvalid-auc:0.984369\n", 1639 | "[85]\tvalid-auc:0.984825\n", 1640 | "[90]\tvalid-auc:0.985054\n", 1641 | "[95]\tvalid-auc:0.985282\n", 1642 | "[100]\tvalid-auc:0.985492\n", 1643 | "[105]\tvalid-auc:0.985699\n", 1644 | "[110]\tvalid-auc:0.985984\n", 1645 | "[115]\tvalid-auc:0.986107\n", 1646 | "[120]\tvalid-auc:0.986308\n", 1647 | "[125]\tvalid-auc:0.986371\n", 1648 | "[130]\tvalid-auc:0.98647\n", 1649 | "[135]\tvalid-auc:0.986554\n", 1650 | "[140]\tvalid-auc:0.986716\n", 1651 | "[145]\tvalid-auc:0.98681\n", 1652 | "[150]\tvalid-auc:0.986853\n", 1653 | "[155]\tvalid-auc:0.986941\n", 1654 | "[160]\tvalid-auc:0.987068\n", 1655 | "[165]\tvalid-auc:0.987109\n", 1656 | "[170]\tvalid-auc:0.987232\n", 1657 | "[175]\tvalid-auc:0.987289\n", 1658 | "[180]\tvalid-auc:0.9876\n", 1659 | "[185]\tvalid-auc:0.987754\n", 1660 | 
"[190]\tvalid-auc:0.987801\n", 1661 | "[195]\tvalid-auc:0.98797\n", 1662 | "[200]\tvalid-auc:0.988013\n", 1663 | "[205]\tvalid-auc:0.988067\n", 1664 | "[210]\tvalid-auc:0.98813\n", 1665 | "[215]\tvalid-auc:0.988169\n", 1666 | "[220]\tvalid-auc:0.988231\n", 1667 | "[225]\tvalid-auc:0.988339\n", 1668 | "[230]\tvalid-auc:0.988396\n", 1669 | "[235]\tvalid-auc:0.988434\n", 1670 | "[240]\tvalid-auc:0.98848\n", 1671 | "[245]\tvalid-auc:0.988509\n", 1672 | "[250]\tvalid-auc:0.988575\n", 1673 | "[255]\tvalid-auc:0.988634\n", 1674 | "[260]\tvalid-auc:0.98866\n", 1675 | "[265]\tvalid-auc:0.988681\n", 1676 | "[270]\tvalid-auc:0.988727\n", 1677 | "[275]\tvalid-auc:0.988742\n", 1678 | "[280]\tvalid-auc:0.988773\n", 1679 | "[285]\tvalid-auc:0.988784\n", 1680 | "[290]\tvalid-auc:0.988805\n", 1681 | "[295]\tvalid-auc:0.988831\n", 1682 | "[300]\tvalid-auc:0.988856\n", 1683 | "[305]\tvalid-auc:0.988902\n", 1684 | "[310]\tvalid-auc:0.988966\n", 1685 | "[315]\tvalid-auc:0.98899\n", 1686 | "[320]\tvalid-auc:0.989023\n", 1687 | "[325]\tvalid-auc:0.989018\n", 1688 | "[330]\tvalid-auc:0.989048\n", 1689 | "[335]\tvalid-auc:0.989047\n", 1690 | "[340]\tvalid-auc:0.989062\n", 1691 | "[345]\tvalid-auc:0.989077\n", 1692 | "[350]\tvalid-auc:0.989141\n", 1693 | "[355]\tvalid-auc:0.989155\n", 1694 | "[360]\tvalid-auc:0.989179\n", 1695 | "[365]\tvalid-auc:0.989191\n", 1696 | "[370]\tvalid-auc:0.989191\n", 1697 | "[375]\tvalid-auc:0.989204\n", 1698 | "[380]\tvalid-auc:0.989201\n", 1699 | "[385]\tvalid-auc:0.989224\n", 1700 | "[390]\tvalid-auc:0.989252\n", 1701 | "[395]\tvalid-auc:0.989252\n", 1702 | "[400]\tvalid-auc:0.989256\n", 1703 | "[405]\tvalid-auc:0.989275\n", 1704 | "[410]\tvalid-auc:0.989283\n", 1705 | "[415]\tvalid-auc:0.989291\n", 1706 | "[420]\tvalid-auc:0.989301\n", 1707 | "[425]\tvalid-auc:0.989303\n", 1708 | "[430]\tvalid-auc:0.989359\n", 1709 | "[435]\tvalid-auc:0.989365\n", 1710 | "[440]\tvalid-auc:0.989371\n", 1711 | "[445]\tvalid-auc:0.989384\n", 1712 | 
"[450]\tvalid-auc:0.989394\n", 1713 | "[455]\tvalid-auc:0.989407\n", 1714 | "[460]\tvalid-auc:0.989418\n", 1715 | "[465]\tvalid-auc:0.989425\n", 1716 | "[470]\tvalid-auc:0.989453\n", 1717 | "[475]\tvalid-auc:0.989458\n", 1718 | "[480]\tvalid-auc:0.989458\n", 1719 | "[485]\tvalid-auc:0.989464\n", 1720 | "[490]\tvalid-auc:0.989474\n", 1721 | "[495]\tvalid-auc:0.989494\n", 1722 | "[500]\tvalid-auc:0.989534\n", 1723 | "[505]\tvalid-auc:0.989537\n", 1724 | "[510]\tvalid-auc:0.989546\n", 1725 | "[515]\tvalid-auc:0.989543\n", 1726 | "[520]\tvalid-auc:0.98957\n", 1727 | "[525]\tvalid-auc:0.989573\n", 1728 | "[530]\tvalid-auc:0.989584\n", 1729 | "[535]\tvalid-auc:0.989582\n", 1730 | "[540]\tvalid-auc:0.98959\n", 1731 | "[545]\tvalid-auc:0.989588\n", 1732 | "[550]\tvalid-auc:0.98959\n", 1733 | "[555]\tvalid-auc:0.989603\n", 1734 | "[560]\tvalid-auc:0.989604\n", 1735 | "[565]\tvalid-auc:0.98961\n", 1736 | "[570]\tvalid-auc:0.989612\n", 1737 | "[575]\tvalid-auc:0.989624\n", 1738 | "[580]\tvalid-auc:0.989628\n", 1739 | "[585]\tvalid-auc:0.989634\n", 1740 | "[590]\tvalid-auc:0.98964\n", 1741 | "[595]\tvalid-auc:0.989641\n", 1742 | "[600]\tvalid-auc:0.989651\n", 1743 | "[605]\tvalid-auc:0.989653\n", 1744 | "[610]\tvalid-auc:0.98967\n", 1745 | "[615]\tvalid-auc:0.989668\n", 1746 | "[620]\tvalid-auc:0.989681\n", 1747 | "[625]\tvalid-auc:0.989694\n", 1748 | "[630]\tvalid-auc:0.989698\n", 1749 | "[635]\tvalid-auc:0.989716\n", 1750 | "[640]\tvalid-auc:0.989722\n", 1751 | "[645]\tvalid-auc:0.98972\n", 1752 | "[650]\tvalid-auc:0.989733\n", 1753 | "[655]\tvalid-auc:0.989737\n", 1754 | "[660]\tvalid-auc:0.989746\n", 1755 | "[665]\tvalid-auc:0.989743\n", 1756 | "[670]\tvalid-auc:0.989739\n", 1757 | "[675]\tvalid-auc:0.989739\n", 1758 | "[680]\tvalid-auc:0.989743\n", 1759 | "[685]\tvalid-auc:0.989753\n", 1760 | "[690]\tvalid-auc:0.989755\n", 1761 | "[695]\tvalid-auc:0.989755\n", 1762 | "[700]\tvalid-auc:0.989761\n", 1763 | "[705]\tvalid-auc:0.989768\n", 1764 | "[710]\tvalid-auc:0.98977\n", 
1765 | "[715]\tvalid-auc:0.98977\n", 1766 | "[720]\tvalid-auc:0.989767\n", 1767 | "[725]\tvalid-auc:0.989771\n", 1768 | "[730]\tvalid-auc:0.989778\n", 1769 | "[735]\tvalid-auc:0.989776\n", 1770 | "[740]\tvalid-auc:0.989783\n", 1771 | "[745]\tvalid-auc:0.989774\n", 1772 | "[750]\tvalid-auc:0.989778\n", 1773 | "[755]\tvalid-auc:0.989769\n", 1774 | "[760]\tvalid-auc:0.98977\n", 1775 | "[765]\tvalid-auc:0.989771\n", 1776 | "[770]\tvalid-auc:0.989779\n", 1777 | "[775]\tvalid-auc:0.989788\n", 1778 | "[780]\tvalid-auc:0.989789\n", 1779 | "[785]\tvalid-auc:0.98979\n", 1780 | "[790]\tvalid-auc:0.989796\n", 1781 | "[795]\tvalid-auc:0.989805\n", 1782 | "[800]\tvalid-auc:0.989799\n", 1783 | "[805]\tvalid-auc:0.989801\n", 1784 | "[810]\tvalid-auc:0.989802\n", 1785 | "[815]\tvalid-auc:0.98981\n", 1786 | "[820]\tvalid-auc:0.989822\n", 1787 | "[825]\tvalid-auc:0.989823\n", 1788 | "[830]\tvalid-auc:0.989824\n", 1789 | "[835]\tvalid-auc:0.989819\n", 1790 | "[840]\tvalid-auc:0.989821\n", 1791 | "[845]\tvalid-auc:0.989826\n", 1792 | "[850]\tvalid-auc:0.989828\n", 1793 | "[855]\tvalid-auc:0.989827\n", 1794 | "[860]\tvalid-auc:0.989827\n", 1795 | "[865]\tvalid-auc:0.989829\n", 1796 | "[870]\tvalid-auc:0.989831\n", 1797 | "[875]\tvalid-auc:0.989836\n", 1798 | "[880]\tvalid-auc:0.989832\n", 1799 | "[885]\tvalid-auc:0.989837\n", 1800 | "[890]\tvalid-auc:0.989839\n", 1801 | "[895]\tvalid-auc:0.989852\n", 1802 | "[900]\tvalid-auc:0.98985\n", 1803 | "[905]\tvalid-auc:0.989845\n", 1804 | "[910]\tvalid-auc:0.989855\n", 1805 | "[915]\tvalid-auc:0.989853\n", 1806 | "[920]\tvalid-auc:0.989847\n", 1807 | "[925]\tvalid-auc:0.989872\n", 1808 | "[930]\tvalid-auc:0.989868\n", 1809 | "[935]\tvalid-auc:0.989888\n", 1810 | "[940]\tvalid-auc:0.989888\n", 1811 | "[945]\tvalid-auc:0.989897\n", 1812 | "[950]\tvalid-auc:0.9899\n" 1813 | ] 1814 | } 1815 | ], 1816 | "source": [ 1817 | "xgb_model = xgb.train(xgb_params, \n", 1818 | " dtrain, \n", 1819 | " num_boost_round=2000, \n", 1820 | " evals=watchlist, \n", 
1821 | " maximize=True, \n", 1822 | " early_stopping_rounds = 50, \n", 1823 | " verbose_eval=5)" 1824 | ] 1825 | }, 1826 | { 1827 | "cell_type": "code", 1828 | "execution_count": null, 1829 | "metadata": { 1830 | "collapsed": true 1831 | }, 1832 | "outputs": [], 1833 | "source": [ 1834 | "xgb.plot_importance(xgb_model)" 1835 | ] 1836 | }, 1837 | { 1838 | "cell_type": "code", 1839 | "execution_count": null, 1840 | "metadata": { 1841 | "collapsed": true 1842 | }, 1843 | "outputs": [], 1844 | "source": [ 1845 | "import operator\n", 1846 | "sorted(xgb_model.get_fscore().items(), key=operator.itemgetter(1), reverse=True)" 1847 | ] 1848 | }, 1849 | { 1850 | "cell_type": "markdown", 1851 | "metadata": {}, 1852 | "source": [ 1853 | "# XGB Prediction" 1854 | ] 1855 | }, 1856 | { 1857 | "cell_type": "code", 1858 | "execution_count": null, 1859 | "metadata": { 1860 | "collapsed": true 1861 | }, 1862 | "outputs": [], 1863 | "source": [ 1864 | "model_name = 'xgb-model-%s' % strftime(\"%Y-%m-%d-%H-%M-%S\", gmtime())\n", 1865 | "xgb_model.save_model(model_name)\n", 1866 | "print('model saved as %s' % model_name)" 1867 | ] 1868 | }, 1869 | { 1870 | "cell_type": "code", 1871 | "execution_count": null, 1872 | "metadata": { 1873 | "collapsed": true 1874 | }, 1875 | "outputs": [], 1876 | "source": [ 1877 | "dtest = xgb.DMatrix(test_features)" 1878 | ] 1879 | }, 1880 | { 1881 | "cell_type": "code", 1882 | "execution_count": null, 1883 | "metadata": { 1884 | "collapsed": true 1885 | }, 1886 | "outputs": [], 1887 | "source": [ 1888 | "print('Predicting...')\n", 1889 | "test_df['is_attributed'] = xgb_model.predict(dtest, ntree_limit=xgb_model.best_ntree_limit)" 1890 | ] 1891 | }, 1892 | { 1893 | "cell_type": "code", 1894 | "execution_count": null, 1895 | "metadata": { 1896 | "collapsed": true, 1897 | "scrolled": true 1898 | }, 1899 | "outputs": [], 1900 | "source": [ 1901 | "print('loading test')\n", 1902 | "test = pd.read_csv('data/test.csv', dtype=dtype, usecols=test_cols, 
parse_dates=['click_time'])" 1903 | ] 1904 | }, 1905 | { 1906 | "cell_type": "code", 1907 | "execution_count": null, 1908 | "metadata": { 1909 | "collapsed": true 1910 | }, 1911 | "outputs": [], 1912 | "source": [ 1913 | "print('merging test_supplement to test')\n", 1914 | "join_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time']\n", 1915 | "all_cols = join_cols + ['is_attributed']\n", 1916 | "\n", 1917 | "test = test.merge(test_df[all_cols], how='left', on=join_cols)\n", 1918 | "test = test.drop_duplicates(subset=['click_id'])\n", 1919 | "\n", 1920 | "print(\"Writing the submission data into a csv file...\")\n", 1921 | "test[['click_id', 'is_attributed']].to_csv('submit_xgb_%s.gz'%xgb_model.best_ntree_limit, index=False, float_format='%.9f', compression='gzip')\n", 1922 | "print(\"All done...\")" 1923 | ] 1924 | }, 1925 | { 1926 | "cell_type": "code", 1927 | "execution_count": null, 1928 | "metadata": { 1929 | "collapsed": true 1930 | }, 1931 | "outputs": [], 1932 | "source": [] 1933 | } 1934 | ], 1935 | "metadata": { 1936 | "kernelspec": { 1937 | "display_name": "Python 3", 1938 | "language": "python", 1939 | "name": "python3" 1940 | }, 1941 | "language_info": { 1942 | "codemirror_mode": { 1943 | "name": "ipython", 1944 | "version": 3 1945 | }, 1946 | "file_extension": ".py", 1947 | "mimetype": "text/x-python", 1948 | "name": "python", 1949 | "nbconvert_exporter": "python", 1950 | "pygments_lexer": "ipython3", 1951 | "version": "3.6.3" 1952 | } 1953 | }, 1954 | "nbformat": 4, 1955 | "nbformat_minor": 2 1956 | } 1957 | -------------------------------------------------------------------------------- /train_lgb_xgb.py: -------------------------------------------------------------------------------- 1 | import gc 2 | from time import gmtime, strftime 3 | 4 | from sklearn.model_selection import train_test_split 5 | import lightgbm as lgb 6 | import xgboost as xgb 7 | import numpy as np 8 | import pandas as pd 9 | 10 | 11 | def group_label(df, group_cols): 12 
def group_label(df, group_cols):
    """Attach an integer group label for every column combination.

    For each combination the index of the first row in which that combination
    occurs is merged back onto every row of the group.  The new column is
    named after the combination (its column names joined with ``_``), so a
    combination should contain at least two columns; otherwise the label name
    would equal the source column name.

    Args:
        df: Frame containing every referenced column.  Assumes the default,
            unnamed index so that ``reset_index`` produces an ``index`` column.
        group_cols: List of column-name lists, one per label to create.

    Returns:
        The frame with one label column per combination (``df`` itself when
        ``group_cols`` is empty, otherwise a merged copy).
    """
    for i, cols in enumerate(group_cols):
        # Join the *current* combination; joining the outer list of lists
        # (as the code used to do) raises TypeError.
        col_name = "_".join(cols)
        print(i, col_name)
        group_idx = df.drop_duplicates(cols)[cols].reset_index()
        group_idx.rename(columns={'index': col_name}, inplace=True)
        df = df.merge(group_idx, on=cols, how='left')
        # Release the temporary before the next (potentially large) merge.
        del group_idx
        gc.collect()
    return df


def count_agg(df, group_cols):
    """Add a ``<cols>_count`` column holding the size of each row's group.

    Args:
        df: Frame containing every referenced column.
        group_cols: List of column-name lists to group by.

    Returns:
        The frame with one count column per combination (a merged copy when
        ``group_cols`` is non-empty).
    """
    for i, cols in enumerate(group_cols):
        col_name = "_".join(cols) + '_count'
        print(i, col_name)
        count = df.groupby(cols).size().reset_index(name=col_name)
        df = df.merge(count, on=cols, how='left')
        del count
        gc.collect()
    return df


def count_cum(df, group_cols):
    """Add a ``<cols>_countAccum`` column with each row's position in its group.

    The value is the zero-based running count from ``groupby(...).cumcount()``.
    The column is assigned on ``df`` in place, and ``df`` is also returned.
    """
    for i, cols in enumerate(group_cols):
        col_name = "_".join(cols) + '_countAccum'
        print(i, col_name)
        df[col_name] = df.groupby(cols).cumcount()
        gc.collect()
    return df


def count_uniq(df, group_uniq_cols):
    """Add a distinct-value count of one column within each group.

    Args:
        df: Frame containing every referenced column.
        group_uniq_cols: List of ``[group_columns, counted_column]`` pairs.

    Returns:
        The frame with one ``<group>_uniq_<col>_countUniq`` column per pair
        (a merged copy when ``group_uniq_cols`` is non-empty).
    """
    for i, cols in enumerate(group_uniq_cols):
        group_cols, uniq_col = cols[0], cols[1]
        col_name = "_".join(group_cols) + '_uniq_' + uniq_col + '_countUniq'
        print(i, col_name)
        tmp = df.groupby(group_cols)[uniq_col].nunique().reset_index(name=col_name)
        df = df.merge(tmp, on=group_cols, how='left')
        del tmp
        gc.collect()
    return df


def next_click(df, group_cols):
    """Add ``<cols>_nextClick``: the gap to the following row of the same group.

    The gap is ``click_time`` of the next row in the group (in the frame's
    current row order) minus the row's own ``click_time``, stored as float32.
    The last row of each group has no successor and gets NaN.  ``click_time``
    must already be numeric.  The column is assigned on ``df`` in place.
    """
    for i, cols in enumerate(group_cols):
        col_name = "_".join(cols) + '_nextClick'
        print(i, col_name)
        following = df.groupby(cols)['click_time'].shift(-1)
        df[col_name] = (following - df['click_time']).astype(np.float32)
        gc.collect()
    return df


def frequence(df, group_cols):
    """Add ``<cols>_clickFreq``: the group mean of ``<cols>_nextClick``.

    Requires ``next_click`` to have been run for the same combinations.
    Groups whose mean is NaN are dropped before the merge, so their rows end
    up with NaN in the new column.
    """
    for i, cols in enumerate(group_cols):
        col_name = "_".join(cols) + '_nextClick'
        print(i, col_name)
        clickFreq = df.groupby(cols)[col_name].mean().dropna().reset_index(name=("_".join(cols) + '_clickFreq'))
        df = df.merge(clickFreq, on=cols, how='left')
        del clickFreq
        gc.collect()
    return df
def generate_features(df):
    """Build the model feature frame from raw click records.

    Adds time features (``day``, ``hour``, ``in_test_hh``), then the count,
    cumulative-count, unique-count and next-click features configured below,
    and finally drops the helper columns ``ip``, ``click_time``, ``day`` and
    ``in_test_hh``.

    Args:
        df: Frame with ``ip``, ``app``, ``device``, ``os``, ``channel`` and a
            datetime ``click_time`` column.  The time columns are added to this
            frame in place before the merging helpers create new frames.

    Returns:
        The feature frame.
    """
    print('generating time features...')
    df['day'] = df['click_time'].dt.day.astype('uint8')
    df['hour'] = df['click_time'].dt.hour.astype('uint8')
    # Hour bucket: 1 for the first hour list, 2 for the second, 3 otherwise.
    df['in_test_hh'] = (3
                        - 2 * df['hour'].isin([4, 5, 9, 10, 13, 14])  # most frequent
                        - 1 * df['hour'].isin([6, 11, 15])).astype('uint8')  # least frequent
    print('done')
    gc.collect()

    # NOTE(review): the trailing numbers are the original author's annotations,
    # presumably feature-importance scores -- unverified.
    group_combinations = [
        # ['app', 'device'],
        # ['app', 'channel']
    ]

    count_combinations = [
        ['app'],
        ['ip'],  # 3.03
        ['channel'],
        ['os'],
        ['ip', 'device'],  # 9.88
        ['day', 'hour', 'app'],  # 4.08
        ['app', 'channel'],  # 2.8
        ['ip', 'day', 'in_test_hh'],  # 1.74
        ['ip', 'day', 'hour'],  # 0.52
        ['os', 'device'],  # 0.44
        ['ip', 'os', 'day', 'hour'],  # 0.41
        ['ip', 'device', 'day', 'hour'],  # 0.31
        ['ip', 'app', 'os']  # 0.21
    ]

    countUniq_combinations = [
        # [['app'],'ip'],
        # [['app', 'device', 'os', 'channel'], 'ip'],
        [['ip'], 'channel'],  # 0.9
        [['ip'], 'app'],  # 1.3
        [['ip'], 'os']  # 0.45
    ]

    nextClick_combinations = [
        ['ip', 'os'],
        ['ip', 'device', 'os'],
        ['ip', 'app', 'device', 'os'],
        ['ip', 'app', 'device', 'os', 'channel']
    ]

    freq_combinations = [
        # ['ip', 'app', 'device', 'os']
    ]

    accum_combinations = [
        # ['app'],
        ['ip']  # 3.03
        # ['day', 'hour', 'app']
    ]

    df = group_label(df, group_combinations)
    df = count_agg(df, count_combinations)
    df = count_cum(df, accum_combinations)
    df = count_uniq(df, countUniq_combinations)
    # next_click subtracts click times, so convert the timestamps to whole
    # seconds first (assumes nanosecond-resolution datetimes).
    df['click_time'] = (df['click_time'].astype(np.int64) // 10 ** 9).astype(np.int32)
    df = next_click(df, nextClick_combinations)
    df = frequence(df, freq_combinations)

    df.drop(['ip', 'click_time', 'day', 'in_test_hh'], axis=1, inplace=True)
    gc.collect()
    # DataFrame.info() writes the summary itself and returns None; wrapping it
    # in print() only appended a stray "None" line to the log.
    df.info()
    return df
# Load data
# Narrow unsigned dtypes for the id-like columns read from the CSV files.
dtype = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
    click_id='uint32',
)

# Columns shared by the train and test files.
common_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time']

print('loading train.csv')
# train: (184903890, 7)
# test: (18790469, 7)
train_cols = common_cols + ['is_attributed']
train_df = pd.read_csv(
    'data/train.csv',
    dtype=dtype,
    usecols=train_cols,
    parse_dates=['click_time'],
)

print('loading test.csv')
test_cols = common_cols + ['click_id']
# using test_supplement
test_df = pd.read_csv(
    'data/test_supplement.csv',
    dtype=dtype,
    usecols=test_cols,
    parse_dates=['click_time'],
)

# combine train and test data so the features are computed over both
all_df = pd.concat([train_df[common_cols], test_df[common_cols]])

# generate data
all_df = generate_features(all_df)

# split train/test features from concated data: train rows come first
n_train = len(train_df)
train_features = all_df.iloc[:n_train]
test_features = all_df.iloc[n_train:]
gc.collect()


########################### train LGB ###########################
lgb_params = dict(
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    learning_rate=0.08,
    num_leaves=8,
    max_depth=4,
    min_child_samples=100,
    max_bin=100,
    subsample=0.7,
    subsample_freq=1,
    colsample_bytree=0.7,
    min_child_weight=0,
    min_split_gain=0,
    nthread=24,
    verbose=1,
    scale_pos_weight=200,
)

target = 'is_attributed'
# Every generated column is a feature except bookkeeping/target columns.
features = [
    col for col in train_features.columns
    if col not in ('level_0', 'index', 'is_attributed')
]
category = ['app', 'device', 'os', 'channel', 'hour']
def _write_submission(predictions, out_path):
    """Map supplement predictions onto test.csv and write a gzipped CSV.

    Stores ``predictions`` in ``test_df['is_attributed']``, left-joins them
    onto the rows of ``data/test.csv`` by the click attributes, keeps one row
    per ``click_id`` and writes ``click_id, is_attributed`` to ``out_path``.
    """
    test_df['is_attributed'] = predictions

    print('loading test')
    test = pd.read_csv('data/test.csv', dtype=dtype, usecols=test_cols, parse_dates=['click_time'])

    print('merging test_supplement to test')
    join_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time']
    all_cols = join_cols + ['is_attributed']

    merged = test.merge(test_df[all_cols], how='left', on=join_cols)
    # Several supplement rows can share the same join key; keep one per click.
    merged = merged.drop_duplicates(subset=['click_id'])

    print("Writing the submission data into a csv file...")
    merged[['click_id', 'is_attributed']].to_csv(out_path, index=False, float_format='%.9f', compression='gzip')
    print("All done...")


# train valid split
# shuffle=False keeps the row order, so the last rows form the validation set.
labels = train_df.is_attributed.values
train_features, valid_features = train_test_split(train_features, test_size=5000000, shuffle=False)
train_labels, valid_labels = train_test_split(labels, test_size=5000000, shuffle=False)
print('Train size:', len(train_features))
print('Valid size:', len(valid_features))
gc.collect()

# convert data into dataset. Warning: Memory Peak
print('converting xgtrain...')
lgb_train_set = lgb.Dataset(
    train_features[features].values,
    label=train_labels,
    feature_name=features,
    categorical_feature=category,
)

print('converting xgvalid...')
lgb_valid_set = lgb.Dataset(
    valid_features[features].values,
    label=valid_labels,
    feature_name=features,
    categorical_feature=category,
)

print('Training...')
evals_results = {}
model = lgb.train(
    lgb_params,
    lgb_train_set,
    valid_sets=[lgb_valid_set],
    valid_names=['valid'],
    evals_result=evals_results,
    num_boost_round=5000,
    early_stopping_rounds=100,
    verbose_eval=1,
    feval=None,
)
n_estimators = model.best_iteration

print('\nModel Info:')
print('n_estimators:', n_estimators)
# best_iteration is 1-based, the recorded metric history is 0-based.
print('auc' + ':', evals_results['valid']['auc'][n_estimators - 1])

# Feature importance report: split counts plus gain as a percentage of total.
gain = model.feature_importance('gain')
ft = pd.DataFrame({
    'feature': model.feature_name(),
    'split': model.feature_importance('split'),
    'gain': 100 * gain / gain.sum(),
}).sort_values('gain', ascending=False)
ft.to_csv('feature_importance_ref.csv', index=False)
print(ft)

model_name = 'model-%s' % strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model.save_model(model_name)
print('model saved as %s' % model_name)

print('Predicting...')
lgb_predictions = model.predict(test_features[features], num_iteration=model.best_iteration)
_write_submission(lgb_predictions, 'submit_lgb_%s.gz'%(model.best_iteration))
gc.collect()

########################### train XGB ###########################
xgb_params = {
    'eta': 0.08,
    'tree_method': "hist",
    'grow_policy': "lossguide",
    'max_leaves': 1400,
    'max_depth': 4,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'colsample_bylevel': 0.7,
    'min_child_weight': 0,
    'alpha': 0,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'nthread': 24,
    'random_state': 42,
    'scale_pos_weight': 200,
    'silent': True,
}

print('converting dtrain...')
dtrain = xgb.DMatrix(train_features, train_labels)
dvalid = xgb.DMatrix(valid_features, valid_labels)
watchlist = [(dvalid, 'valid')]

xgb_model = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=5000,
    evals=watchlist,
    maximize=True,
    early_stopping_rounds=100,
    verbose_eval=5,
)

model_name = 'xgb-model-%s' % strftime("%Y-%m-%d-%H-%M-%S", gmtime())
xgb_model.save_model(model_name)
print('model saved as %s' % model_name)

dtest = xgb.DMatrix(test_features)
print('Predicting...')
xgb_predictions = xgb_model.predict(dtest, ntree_limit=xgb_model.best_ntree_limit)
_write_submission(
    xgb_predictions,
    'submit_xgb_%s_%s.gz' % (xgb_model.best_ntree_limit, xgb_model.best_score),
)