├── requirements.txt ├── README.md ├── LICENSE.md ├── Zimnat-lgb_best_score.ipynb └── Zimnat_insurance_best_multy_overall.ipynb /requirements.txt: -------------------------------------------------------------------------------- 1 | pd.__version__==1.1.0 2 | np.__version__==1.18.5 3 | lightgbm.__version__==3.0.0 4 | sklearn.__version__==0.23.1 -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Zimnat_Insurance_top-12 2 | My solution, ranked #12 on the private leaderboard. Score=0.0260809843625832 3 | 4 | 5 | 1. Run Zimnat-lgb_best_score.ipynb. It contains LightGBM and XGBoost models with some preprocessing. 6 | 2. Run Zimnat_insurance_cat_target+multy.ipynb. It contains the same preprocessing plus some additional preprocessing, and two models: first, CatBoost for multiclass; second, CatBoost for binary classification with target values. 7 | 3. Run Zimnat_insurance_best_multy_overall.ipynb. It contains the same preprocessing and a new CatBoost model. At the end of the code you can see the blending of all models and some postprocessing based on statistics. 
8 | 9 | 10 | 11 | P.S: bro, you are the best 12 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Roman Zaev 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /Zimnat-lgb_best_score.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Download libraries and data" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0", 15 | "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a" 16 | }, 17 | "outputs": [], 18 | "source": [ 19 | "import pandas as pd\n", 20 | "import numpy as np\n", 21 | "import copy\n", 22 | "from itertools import combinations\n", 23 | "\n", 24 | "from lightgbm import LGBMClassifier\n", 25 | "from xgboost import XGBClassifier\n", 26 | "\n", 27 | "from sklearn.metrics import log_loss\n", 28 | "from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold\n", 29 | "from sklearn.preprocessing import LabelEncoder\n", 30 | "\n", 31 | "from tqdm import tqdm, tqdm_notebook\n", 32 | "\n", 33 | "pd.set_option('display.max_columns', 100)\n", 34 | "\n", 35 | "import warnings\n", 36 | "warnings.filterwarnings(\"ignore\")\n", 37 | "import time" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 171, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "train = pd.read_csv('Train.csv')\n", 47 | "test = pd.read_csv('Test.csv')\n", 48 | "sub = pd.read_csv('SampleSubmission.csv')" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "# Checking the data" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 172, 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "data": { 65 | "text/html": [ 66 | "
\n", 67 | "\n", 80 | "\n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | "
IDjoin_datesexmarital_statusbirth_yearbranch_codeoccupation_codeoccupation_category_codeP5DARIBP8NN17POT66FJGYSRSOP4RVSZPYUQLJR9N2MWAHXOBSTQFM3XK6QOQBOLJWFNJZ9DJ9JWGHYXECY3
04WKQSBB1/2/2019FM19871X1H2A7IT4MS000000010000001000000
1CP5S02H1/6/2019FM1981UAOD2A7IT4MS000000010000001000000
\n", 182 | "
" 183 | ], 184 | "text/plain": [ 185 | " ID join_date sex marital_status birth_year branch_code \\\n", 186 | "0 4WKQSBB 1/2/2019 F M 1987 1X1H \n", 187 | "1 CP5S02H 1/6/2019 F M 1981 UAOD \n", 188 | "\n", 189 | " occupation_code occupation_category_code P5DA RIBP 8NN1 7POT 66FJ \\\n", 190 | "0 2A7I T4MS 0 0 0 0 0 \n", 191 | "1 2A7I T4MS 0 0 0 0 0 \n", 192 | "\n", 193 | " GYSR SOP4 RVSZ PYUQ LJR9 N2MW AHXO BSTQ FM3X K6QO QBOL JWFN \\\n", 194 | "0 0 0 1 0 0 0 0 0 0 1 0 0 \n", 195 | "1 0 0 1 0 0 0 0 0 0 1 0 0 \n", 196 | "\n", 197 | " JZ9D J9JW GHYX ECY3 \n", 198 | "0 0 0 0 0 \n", 199 | "1 0 0 0 0 " 200 | ] 201 | }, 202 | "execution_count": 172, 203 | "metadata": {}, 204 | "output_type": "execute_result" 205 | } 206 | ], 207 | "source": [ 208 | "train.head(2)" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 173, 214 | "metadata": {}, 215 | "outputs": [ 216 | { 217 | "data": { 218 | "text/html": [ 219 | "
\n", 220 | "\n", 233 | "\n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | "
IDjoin_datesexmarital_statusbirth_yearbranch_codeoccupation_codeoccupation_category_codeP5DARIBP8NN17POT66FJGYSRSOP4RVSZPYUQLJR9N2MWAHXOBSTQFM3XK6QOQBOLJWFNJZ9DJ9JWGHYXECY3
0F86J5PC1/12/2018MM198494KCDZRV90QI000000010000000000000
1H6141K31/10/2019MM19961X1HJ9SY90QI000000010000001000000
\n", 335 | "
" 336 | ], 337 | "text/plain": [ 338 | " ID join_date sex marital_status birth_year branch_code \\\n", 339 | "0 F86J5PC 1/12/2018 M M 1984 94KC \n", 340 | "1 H6141K3 1/10/2019 M M 1996 1X1H \n", 341 | "\n", 342 | " occupation_code occupation_category_code P5DA RIBP 8NN1 7POT 66FJ \\\n", 343 | "0 DZRV 90QI 0 0 0 0 0 \n", 344 | "1 J9SY 90QI 0 0 0 0 0 \n", 345 | "\n", 346 | " GYSR SOP4 RVSZ PYUQ LJR9 N2MW AHXO BSTQ FM3X K6QO QBOL JWFN \\\n", 347 | "0 0 0 1 0 0 0 0 0 0 0 0 0 \n", 348 | "1 0 0 1 0 0 0 0 0 0 1 0 0 \n", 349 | "\n", 350 | " JZ9D J9JW GHYX ECY3 \n", 351 | "0 0 0 0 0 \n", 352 | "1 0 0 0 0 " 353 | ] 354 | }, 355 | "execution_count": 173, 356 | "metadata": {}, 357 | "output_type": "execute_result" 358 | } 359 | ], 360 | "source": [ 361 | "test.head(2)" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": 174, 367 | "metadata": {}, 368 | "outputs": [ 369 | { 370 | "data": { 371 | "text/html": [ 372 | "
\n", 373 | "\n", 386 | "\n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | "
ID X PCODELabel
0F86J5PC X P5DA0
1F86J5PC X RIBP0
\n", 407 | "
" 408 | ], 409 | "text/plain": [ 410 | " ID X PCODE Label\n", 411 | "0 F86J5PC X P5DA 0\n", 412 | "1 F86J5PC X RIBP 0" 413 | ] 414 | }, 415 | "execution_count": 174, 416 | "metadata": {}, 417 | "output_type": "execute_result" 418 | } 419 | ], 420 | "source": [ 421 | "sub.head(2)" 422 | ] 423 | }, 424 | { 425 | "cell_type": "markdown", 426 | "metadata": {}, 427 | "source": [ 428 | "# Replacing non common occupation codes to occupation category code" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": 175, 434 | "metadata": {}, 435 | "outputs": [], 436 | "source": [ 437 | "replace_train = list(\n", 438 | " set(train['occupation_code'].unique().tolist()) -\n", 439 | " set(test['occupation_code']))\n", 440 | "replace_test = list(\n", 441 | " set(test['occupation_code'].unique().tolist()) -\n", 442 | " set(train['occupation_code']))\n", 443 | "\n", 444 | "train['occupation_code'] = train['occupation_code'].replace(\n", 445 | " replace_train, np.nan)\n", 446 | "test['occupation_code'] = test['occupation_code'].replace(replace_test, np.nan)\n", 447 | "train['occupation_code'].fillna(train['occupation_category_code'],\n", 448 | " inplace=True)\n", 449 | "test['occupation_code'].fillna(test['occupation_category_code'], inplace=True)" 450 | ] 451 | }, 452 | { 453 | "cell_type": "markdown", 454 | "metadata": {}, 455 | "source": [ 456 | "# Getting right format" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": 176, 462 | "metadata": {}, 463 | "outputs": [], 464 | "source": [ 465 | "#Make spliting train clients info. 
Trying to reproduce the situation with test\n", 466 | "#\n", 467 | "X_train = []\n", 468 | "X_train_columns = train.columns[:-1]\n", 469 | "client_index = 0\n", 470 | "\n", 471 | "for line in tqdm_notebook(train.values):\n", 472 | "\n", 473 | " info = line[:8]\n", 474 | " info_products = line[8:-1]\n", 475 | " indexes = [k for k, i in enumerate(info_products) if i == 1]\n", 476 | "\n", 477 | " for i in indexes:\n", 478 | "\n", 479 | " client_index += 1\n", 480 | "\n", 481 | " for k in range(len(info_products)):\n", 482 | "\n", 483 | " if k == i:\n", 484 | "\n", 485 | " info_products_transformed = list(copy.copy(info_products))\n", 486 | " info_products_transformed[i] = 0\n", 487 | "\n", 488 | " X_train.append(\n", 489 | " list(info) + info_products_transformed +\n", 490 | " [X_train_columns[8 + k]] + [client_index])\n", 491 | "\n", 492 | "X_train = pd.DataFrame(X_train)\n", 493 | "X_train.columns = [\n", 494 | " 'ID', 'join_date', 'sex', 'marital_status', 'birth_year', 'branch_code',\n", 495 | " 'occupation_code', 'occupation_category_code', 'P5DA', 'RIBP', '8NN1',\n", 496 | " '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ', 'LJR9', 'N2MW', 'AHXO',\n", 497 | " 'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D', 'J9JW', 'GHYX', 'ECY3',\n", 498 | " 'target', 'ID2'\n", 499 | "]\n", 500 | "train = X_train.copy()" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": 177, 506 | "metadata": {}, 507 | "outputs": [], 508 | "source": [ 509 | "#Make info about true values in data of predictions\n", 510 | "#\n", 511 | "X_test = []\n", 512 | "true_values = []\n", 513 | "client_index = 0\n", 514 | "for line in tqdm_notebook(test.values):\n", 515 | "\n", 516 | " client_index += 1\n", 517 | "\n", 518 | " info = line[:8]\n", 519 | " info_products = line[8:-1]\n", 520 | " indexes = [k for k, i in enumerate(info_products) if i == 1]\n", 521 | "\n", 522 | " X_test.append(list(info) + list(info_products) + [client_index])\n", 523 | "\n", 524 | " for true in 
test.columns[8:][indexes]:\n", 525 | " true_values.append(line[0] + ' X ' + true)\n", 526 | "\n", 527 | "X_test = pd.DataFrame(X_test)\n", 528 | "X_test.columns = [\n", 529 | " 'ID', 'join_date', 'sex', 'marital_status', 'birth_year', 'branch_code',\n", 530 | " 'occupation_code', 'occupation_category_code', 'P5DA', 'RIBP', '8NN1',\n", 531 | " '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ', 'LJR9', 'N2MW', 'AHXO',\n", 532 | " 'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D', 'J9JW', 'GHYX', 'ECY3',\n", 533 | " 'ID2'\n", 534 | "]\n", 535 | "test = X_test.copy()" 536 | ] 537 | }, 538 | { 539 | "cell_type": "code", 540 | "execution_count": 178, 541 | "metadata": {}, 542 | "outputs": [], 543 | "source": [ 544 | "train['marital_status'] = train['marital_status'].replace(['f'], ['F'])" 545 | ] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "execution_count": 179, 550 | "metadata": {}, 551 | "outputs": [], 552 | "source": [ 553 | "df = train.append(test)" 554 | ] 555 | }, 556 | { 557 | "cell_type": "markdown", 558 | "metadata": {}, 559 | "source": [ 560 | "# Feature Engineering" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": 180, 566 | "metadata": {}, 567 | "outputs": [], 568 | "source": [ 569 | "def create_date_featues(df):\n", 570 | "\n", 571 | " df['Join_Year'] = pd.to_datetime(df['join_date']).dt.year\n", 572 | "\n", 573 | " df['Join_Month'] = pd.to_datetime(df['join_date']).dt.month\n", 574 | "\n", 575 | " df['Join_Day'] = pd.to_datetime(df['join_date']).dt.day\n", 576 | "\n", 577 | " df['DayOfyear'] = pd.to_datetime(df['join_date']).dt.dayofyear\n", 578 | "\n", 579 | " return df" 580 | ] 581 | }, 582 | { 583 | "cell_type": "code", 584 | "execution_count": 181, 585 | "metadata": {}, 586 | "outputs": [], 587 | "source": [ 588 | "df = create_date_featues(df)" 589 | ] 590 | }, 591 | { 592 | "cell_type": "code", 593 | "execution_count": 182, 594 | "metadata": {}, 595 | "outputs": [], 596 | "source": [ 597 | "df['birth_year_bin'] = 
pd.cut(df['birth_year'], bins=5)" 598 | ] 599 | }, 600 | { 601 | "cell_type": "code", 602 | "execution_count": 183, 603 | "metadata": {}, 604 | "outputs": [ 605 | { 606 | "name": "stdout", 607 | "output_type": "stream", 608 | "text": [ 609 | "Wall time: 116 ms\n" 610 | ] 611 | } 612 | ], 613 | "source": [ 614 | "%%time\n", 615 | "columns = [\n", 616 | " 'P5DA', 'RIBP', '8NN1', '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ',\n", 617 | " 'LJR9', 'N2MW', 'AHXO', 'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D',\n", 618 | " 'J9JW', 'GHYX', 'ECY3'\n", 619 | "]\n", 620 | "for col in columns:\n", 621 | " df[col + '_' + 'sum'] = df.groupby('branch_code')[col].transform(sum)" 622 | ] 623 | }, 624 | { 625 | "cell_type": "code", 626 | "execution_count": 184, 627 | "metadata": {}, 628 | "outputs": [], 629 | "source": [ 630 | "for col in columns:\n", 631 | " df[col + '_' +\n", 632 | " 'Join_year_sum'] = df.groupby('Join_Year')[col].transform(sum)" 633 | ] 634 | }, 635 | { 636 | "cell_type": "code", 637 | "execution_count": 189, 638 | "metadata": {}, 639 | "outputs": [], 640 | "source": [ 641 | "df['join_date'] = pd.to_datetime(df['join_date'])\n", 642 | "for col in columns:\n", 643 | " df['from_arise_col_' +\n", 644 | " col] = (df['join_date'] -\n", 645 | " df.loc[df[col] == 1, 'join_date'].min()).dt.days" 646 | ] 647 | }, 648 | { 649 | "cell_type": "code", 650 | "execution_count": 194, 651 | "metadata": {}, 652 | "outputs": [], 653 | "source": [ 654 | "df['Number_of_Insurance_Bought'] = df.iloc[:, 8:29].sum(axis=1)\n", 655 | "\n", 656 | "\n", 657 | "def mapper(df):\n", 658 | " if df['Number_of_Insurance_Bought'] == 1:\n", 659 | " return 'One'\n", 660 | " elif (df['Number_of_Insurance_Bought'] >\n", 661 | " 1) & (df['Number_of_Insurance_Bought'] < 5):\n", 662 | " return 'Medium'\n", 663 | " elif (df['Number_of_Insurance_Bought'] >\n", 664 | " 4) & (df['Number_of_Insurance_Bought'] < 8):\n", 665 | " return 'High'\n", 666 | " else:\n", 667 | " return 'Too High'\n", 668 | "\n", 669 
| "\n", 670 | "df['Insurance_Count'] = df.apply(lambda df: mapper(df), axis=1)\n", 671 | "del df['Number_of_Insurance_Bought']" 672 | ] 673 | }, 674 | { 675 | "cell_type": "code", 676 | "execution_count": 195, 677 | "metadata": {}, 678 | "outputs": [], 679 | "source": [ 680 | "df['branch_start_year'] = df.groupby('branch_code')['Join_Year'].transform('min')\n", 681 | "df['branch_since'] = 2020 - df['branch_start_year']\n", 682 | "del df['branch_start_year']" 683 | ] 684 | }, 685 | { 686 | "cell_type": "code", 687 | "execution_count": 196, 688 | "metadata": {}, 689 | "outputs": [], 690 | "source": [ 691 | "df['Unique_customers_per_branch'] = df.groupby('branch_code')['ID'].transform('nunique')\n", 692 | "df['Unique_Insurance_per_branch'] = df.groupby('branch_code')['target'].transform('nunique')\n", 693 | "\n", 694 | "df['Unique_year_per_branch'] = df.groupby('branch_code')['Join_Year'].transform('nunique')\n", 695 | "df['Unique_month_per_branch'] = df.groupby('branch_code')['Join_Month'].transform('nunique')\n", 696 | "df['Unique_branch_per_year'] = df.groupby('Join_Year')['branch_code'].transform('nunique')" 697 | ] 698 | }, 699 | { 700 | "cell_type": "code", 701 | "execution_count": 197, 702 | "metadata": {}, 703 | "outputs": [], 704 | "source": [ 705 | "df['Age'] = df['Join_Year'] - df['birth_year']\n", 706 | "df['Average_Age_per_branch'] = df.groupby('branch_code')['Age'].transform('mean')\n", 707 | "\n", 708 | "df['Average_Age_per_occupation'] = df.groupby('occupation_code')['Age'].transform('mean')\n", 709 | "\n", 710 | "for col in columns:\n", 711 | " df[col + '_' + 'meanAge'] = df.groupby(col)['Age'].transform('mean')\n", 712 | "\n", 713 | "del df['Age']" 714 | ] 715 | }, 716 | { 717 | "cell_type": "code", 718 | "execution_count": 198, 719 | "metadata": {}, 720 | "outputs": [], 721 | "source": [ 722 | "df.reset_index(drop=True, inplace=True)" 723 | ] 724 | }, 725 | { 726 | "cell_type": "code", 727 | "execution_count": 199, 728 | "metadata": {}, 729 | 
"outputs": [], 730 | "source": [ 731 | "names_products = [\n", 732 | " 'P5DA', 'RIBP', '8NN1', '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ',\n", 733 | " 'LJR9', 'N2MW', 'AHXO', 'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D',\n", 734 | " 'J9JW', 'GHYX', 'ECY3'\n", 735 | "]" 736 | ] 737 | }, 738 | { 739 | "cell_type": "code", 740 | "execution_count": 200, 741 | "metadata": {}, 742 | "outputs": [ 743 | { 744 | "data": { 745 | "application/vnd.jupyter.widget-view+json": { 746 | "model_id": "fb3a491c747e40fba6e9220d679c2d94", 747 | "version_major": 2, 748 | "version_minor": 0 749 | }, 750 | "text/plain": [ 751 | "HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))" 752 | ] 753 | }, 754 | "metadata": {}, 755 | "output_type": "display_data" 756 | }, 757 | { 758 | "name": "stdout", 759 | "output_type": "stream", 760 | "text": [ 761 | "\n" 762 | ] 763 | } 764 | ], 765 | "source": [ 766 | "#Add glue togethered targets\n", 767 | "#\n", 768 | "for i, row in tqdm_notebook(df.iterrows()):\n", 769 | " res = []\n", 770 | " for c in names_products:\n", 771 | " if row[c] == 1:\n", 772 | " res.append(c)\n", 773 | " df.loc[df.index == i, 'product_comb'] = '_'.join(sorted(res))" 774 | ] 775 | }, 776 | { 777 | "cell_type": "code", 778 | "execution_count": 201, 779 | "metadata": {}, 780 | "outputs": [ 781 | { 782 | "data": { 783 | "text/html": [ 784 | "
\n", 785 | "\n", 798 | "\n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " 
\n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 
1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | " \n", 1126 | " \n", 1127 | " \n", 1128 | " \n", 1129 | " \n", 1130 | " \n", 1131 | " \n", 1132 | " \n", 1133 | " \n", 1134 | " \n", 1135 | " \n", 1136 | " \n", 1137 | " \n", 1138 | " \n", 1139 | " \n", 1140 | " \n", 1141 | " \n", 1142 | " \n", 1143 | " \n", 1144 | " \n", 1145 | " \n", 1146 | " \n", 1147 | " \n", 1148 | " \n", 1149 | " \n", 1150 | " \n", 1151 | " \n", 1152 | " \n", 1153 | " \n", 1154 | " \n", 1155 | " \n", 1156 | " \n", 1157 | " \n", 1158 | " \n", 1159 | " \n", 1160 | " \n", 1161 | " \n", 1162 | " \n", 1163 | " \n", 1164 | " \n", 1165 | " \n", 1166 | " \n", 1167 | " \n", 1168 | " \n", 1169 | " \n", 1170 | " \n", 1171 | " \n", 1172 | " \n", 1173 | " \n", 1174 | " \n", 1175 | " \n", 1176 | " \n", 1177 | " \n", 1178 | " \n", 1179 | " \n", 1180 | " \n", 1181 | " \n", 1182 | " \n", 1183 | " \n", 1184 | " \n", 1185 | " \n", 1186 | " \n", 1187 | " \n", 1188 | " \n", 1189 | " \n", 1190 | " \n", 1191 | " \n", 1192 | " \n", 1193 | " \n", 1194 | " \n", 1195 | " \n", 1196 | " \n", 1197 | " \n", 1198 | " \n", 1199 | " \n", 1200 | " \n", 1201 | " \n", 1202 | " \n", 1203 | " \n", 1204 | " \n", 1205 | " \n", 1206 | " \n", 1207 | " \n", 1208 | " \n", 1209 | " \n", 1210 | " \n", 1211 | " \n", 1212 | " \n", 1213 | " \n", 1214 | " \n", 1215 | " \n", 1216 | " \n", 1217 | " \n", 1218 | " \n", 1219 | " \n", 1220 | " \n", 1221 | " \n", 1222 | " \n", 1223 | " \n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " 
\n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | " \n", 1249 | " \n", 1250 | " \n", 1251 | " \n", 1252 | " \n", 1253 | " \n", 1254 | " \n", 1255 | " \n", 1256 | " \n", 1257 | " \n", 1258 | " \n", 1259 | " \n", 1260 | " \n", 1261 | " \n", 1262 | " \n", 1263 | " \n", 1264 | " \n", 1265 | " \n", 1266 | " \n", 1267 | " \n", 1268 | " \n", 1269 | " \n", 1270 | " \n", 1271 | " \n", 1272 | " \n", 1273 | " \n", 1274 | " \n", 1275 | " \n", 1276 | " \n", 1277 | " \n", 1278 | " \n", 1279 | " \n", 1280 | " \n", 1281 | " \n", 1282 | " \n", 1283 | " \n", 1284 | " \n", 1285 | " \n", 1286 | " \n", 1287 | " \n", 1288 | " \n", 1289 | " \n", 1290 | " \n", 1291 | " \n", 1292 | " \n", 1293 | " \n", 1294 | " \n", 1295 | " \n", 1296 | " \n", 1297 | " \n", 1298 | " \n", 1299 | " \n", 1300 | " \n", 1301 | " \n", 1302 | " \n", 1303 | " \n", 1304 | " \n", 1305 | " \n", 1306 | " \n", 1307 | " \n", 1308 | " \n", 1309 | " \n", 1310 | " \n", 1311 | " \n", 1312 | " \n", 1313 | " \n", 1314 | " \n", 1315 | " \n", 1316 | " \n", 1317 | " \n", 1318 | " \n", 1319 | " \n", 1320 | " \n", 1321 | " \n", 1322 | " \n", 1323 | " \n", 1324 | " \n", 1325 | " \n", 1326 | " \n", 1327 | " \n", 1328 | " \n", 1329 | " \n", 1330 | " \n", 1331 | " \n", 1332 | " \n", 1333 | " \n", 1334 | " \n", 1335 | " \n", 1336 | " \n", 1337 | " \n", 1338 | " \n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | " \n", 1349 | " \n", 1350 | " \n", 1351 | " \n", 1352 | " \n", 1353 | " \n", 1354 | " \n", 1355 | " \n", 1356 | " \n", 1357 | " \n", 1358 | " \n", 1359 | " \n", 1360 | " \n", 1361 | " \n", 1362 | " \n", 1363 | " \n", 1364 | " \n", 1365 | " \n", 1366 | " \n", 1367 | " \n", 1368 | " \n", 1369 | " \n", 1370 | " \n", 1371 | " \n", 1372 | " \n", 1373 | " \n", 1374 | " \n", 1375 | " \n", 1376 | " \n", 1377 | " \n", 1378 | " \n", 1379 | " \n", 1380 | " \n", 1381 | " \n", 1382 | 
" \n", 1383 | " \n", 1384 | " \n", 1385 | " \n", 1386 | " \n", 1387 | " \n", 1388 | " \n", 1389 | " \n", 1390 | " \n", 1391 | " \n", 1392 | " \n", 1393 | " \n", 1394 | " \n", 1395 | " \n", 1396 | " \n", 1397 | " \n", 1398 | " \n", 1399 | " \n", 1400 | " \n", 1401 | " \n", 1402 | " \n", 1403 | " \n", 1404 | " \n", 1405 | " \n", 1406 | " \n", 1407 | " \n", 1408 | " \n", 1409 | " \n", 1410 | " \n", 1411 | " \n", 1412 | " \n", 1413 | " \n", 1414 | " \n", 1415 | " \n", 1416 | " \n", 1417 | " \n", 1418 | " \n", 1419 | " \n", 1420 | " \n", 1421 | " \n", 1422 | " \n", 1423 | " \n", 1424 | " \n", 1425 | " \n", 1426 | " \n", 1427 | "
IDjoin_datesexmarital_statusbirth_yearbranch_codeoccupation_codeoccupation_category_codeP5DARIBP8NN17POT66FJGYSRSOP4RVSZPYUQLJR9N2MWAHXOBSTQFM3XK6QOQBOLJWFNJZ9DJ9JWGHYXECY3targetID2Join_YearJoin_MonthJoin_DayDayOfyearbirth_year_binP5DA_sumRIBP_sum8NN1_sum7POT_sum66FJ_sumGYSR_sumSOP4_sumRVSZ_sumPYUQ_sumLJR9_sumN2MW_sumAHXO_sumBSTQ_sumFM3X_sum...from_arise_col_8NN1from_arise_col_7POTfrom_arise_col_66FJfrom_arise_col_GYSRfrom_arise_col_SOP4from_arise_col_RVSZfrom_arise_col_PYUQfrom_arise_col_LJR9from_arise_col_N2MWfrom_arise_col_AHXOfrom_arise_col_BSTQfrom_arise_col_FM3Xfrom_arise_col_K6QOfrom_arise_col_QBOLfrom_arise_col_JWFNfrom_arise_col_JZ9Dfrom_arise_col_J9JWfrom_arise_col_GHYXfrom_arise_col_ECY3Insurance_Countbranch_sinceUnique_customers_per_branchUnique_Insurance_per_branchUnique_year_per_branchUnique_month_per_branchUnique_branch_per_yearAverage_Age_per_branchAverage_Age_per_occupationP5DA_meanAgeRIBP_meanAge8NN1_meanAge7POT_meanAge66FJ_meanAgeGYSR_meanAgeSOP4_meanAgeRVSZ_meanAgePYUQ_meanAgeLJR9_meanAgeN2MW_meanAgeAHXO_meanAgeBSTQ_meanAgeFM3X_meanAgeK6QO_meanAgeQBOL_meanAgeJWFN_meanAgeJZ9D_meanAgeJ9JW_meanAgeGHYX_meanAgeECY3_meanAgeproduct_comb
04WKQSBB2019-01-02FM19871X1H2A7IT4MS000000000000001000000RVSZ12019.01.02.02.0(1979.4, 1995.2]014051001598804080...3284.03284.03282.02191.03282.03284.03284.03284.02919.03278.03277.02554.03282.03284.02912.03278.03278.03277.03281.0One2.01738143114.037.53258838.03514438.41234838.35014838.41139838.4109638.40563238.41407938.42929738.3612138.39629138.41940338.36586438.38269538.41380438.41301338.38430237.67371438.407438.42908938.43148538.40846638.360615K6QO
14WKQSBB2019-01-02FM19871X1H2A7IT4MS000000010000000000000K6QO22019.01.02.02.0(1979.4, 1995.2]014051001598804080...3284.03284.03282.02191.03282.03284.03284.03284.02919.03278.03277.02554.03282.03284.02912.03278.03278.03277.03281.0One2.01738143114.037.53258838.03514438.41234838.35014838.41139838.4109638.40563238.41407938.42929738.4699538.39629138.41940338.36586438.38269538.41380438.41301338.43615937.67371438.407438.42908938.43148538.40846638.360615RVSZ
2CP5S02H2019-01-06FM1981UAOD2A7IT4MS000000000000001000000RVSZ32019.01.06.06.0(1979.4, 1995.2]0546136018419136044258370...3288.03288.03286.02195.03286.03288.03288.03288.02923.03282.03281.02558.03286.03288.02916.03282.03282.03281.03285.0One9.047141810114.037.96199338.03514438.41234838.35014838.41139838.4109638.40563238.41407938.42929738.3612138.39629138.41940338.36586438.38269538.41380438.41301338.38430237.67371438.407438.42908938.43148538.40846638.360615K6QO
3CP5S02H2019-01-06FM1981UAOD2A7IT4MS000000010000000000000K6QO42019.01.06.06.0(1979.4, 1995.2]0546136018419136044258370...3288.03288.03286.02195.03286.03288.03288.03288.02923.03282.03281.02558.03286.03288.02916.03282.03282.03281.03285.0One9.047141810114.037.96199338.03514438.41234838.35014838.41139838.4109638.40563238.41407938.42929738.4699538.39629138.41940338.36586438.38269538.41380438.41301338.43615937.67371438.407438.42908938.43148538.40846638.360615RVSZ
42YKDILJ2013-01-06MU1991748LQZYX90QI000000010000000000001SOP452013.01.06.06.0(1979.4, 1995.2]72031144259366241610711114225416138391133...1097.01097.01095.04.01095.01097.01097.01097.0732.01091.01090.0367.01095.01097.0725.01091.01091.01090.01094.0Medium10.010919211167.038.56082731.42367338.41234838.35014838.41139838.4109638.40563238.41407938.42929738.4699538.39629138.41940338.36586438.38269538.41380438.41301338.43615937.67371438.407438.42908938.43148538.40846640.115056ECY3_RVSZ
\n", 1428 | "

5 rows × 130 columns

\n", 1429 | "
" 1430 | ], 1431 | "text/plain": [ 1432 | " ID join_date sex marital_status birth_year branch_code \\\n", 1433 | "0 4WKQSBB 2019-01-02 F M 1987 1X1H \n", 1434 | "1 4WKQSBB 2019-01-02 F M 1987 1X1H \n", 1435 | "2 CP5S02H 2019-01-06 F M 1981 UAOD \n", 1436 | "3 CP5S02H 2019-01-06 F M 1981 UAOD \n", 1437 | "4 2YKDILJ 2013-01-06 M U 1991 748L \n", 1438 | "\n", 1439 | " occupation_code occupation_category_code P5DA RIBP 8NN1 7POT 66FJ \\\n", 1440 | "0 2A7I T4MS 0 0 0 0 0 \n", 1441 | "1 2A7I T4MS 0 0 0 0 0 \n", 1442 | "2 2A7I T4MS 0 0 0 0 0 \n", 1443 | "3 2A7I T4MS 0 0 0 0 0 \n", 1444 | "4 QZYX 90QI 0 0 0 0 0 \n", 1445 | "\n", 1446 | " GYSR SOP4 RVSZ PYUQ LJR9 N2MW AHXO BSTQ FM3X K6QO QBOL JWFN \\\n", 1447 | "0 0 0 0 0 0 0 0 0 0 1 0 0 \n", 1448 | "1 0 0 1 0 0 0 0 0 0 0 0 0 \n", 1449 | "2 0 0 0 0 0 0 0 0 0 1 0 0 \n", 1450 | "3 0 0 1 0 0 0 0 0 0 0 0 0 \n", 1451 | "4 0 0 1 0 0 0 0 0 0 0 0 0 \n", 1452 | "\n", 1453 | " JZ9D J9JW GHYX ECY3 target ID2 Join_Year Join_Month Join_Day \\\n", 1454 | "0 0 0 0 0 RVSZ 1 2019.0 1.0 2.0 \n", 1455 | "1 0 0 0 0 K6QO 2 2019.0 1.0 2.0 \n", 1456 | "2 0 0 0 0 RVSZ 3 2019.0 1.0 6.0 \n", 1457 | "3 0 0 0 0 K6QO 4 2019.0 1.0 6.0 \n", 1458 | "4 0 0 0 1 SOP4 5 2013.0 1.0 6.0 \n", 1459 | "\n", 1460 | " DayOfyear birth_year_bin P5DA_sum RIBP_sum 8NN1_sum 7POT_sum \\\n", 1461 | "0 2.0 (1979.4, 1995.2] 0 14 0 5 \n", 1462 | "1 2.0 (1979.4, 1995.2] 0 14 0 5 \n", 1463 | "2 6.0 (1979.4, 1995.2] 0 54 6 13 \n", 1464 | "3 6.0 (1979.4, 1995.2] 0 54 6 13 \n", 1465 | "4 6.0 (1979.4, 1995.2] 7 2031 144 259 \n", 1466 | "\n", 1467 | " 66FJ_sum GYSR_sum SOP4_sum RVSZ_sum PYUQ_sum LJR9_sum N2MW_sum \\\n", 1468 | "0 1 0 0 1598 8 0 4 \n", 1469 | "1 1 0 0 1598 8 0 4 \n", 1470 | "2 6 0 18 4191 360 44 25 \n", 1471 | "3 6 0 18 4191 360 44 25 \n", 1472 | "4 366 2 416 10711 1142 254 161 \n", 1473 | "\n", 1474 | " AHXO_sum BSTQ_sum FM3X_sum ... from_arise_col_8NN1 \\\n", 1475 | "0 0 8 0 ... 3284.0 \n", 1476 | "1 0 8 0 ... 3284.0 \n", 1477 | "2 8 37 0 ... 
3288.0 \n", 1478 | "3 8 37 0 ... 3288.0 \n", 1479 | "4 38 391 133 ... 1097.0 \n", 1480 | "\n", 1481 | " from_arise_col_7POT from_arise_col_66FJ from_arise_col_GYSR \\\n", 1482 | "0 3284.0 3282.0 2191.0 \n", 1483 | "1 3284.0 3282.0 2191.0 \n", 1484 | "2 3288.0 3286.0 2195.0 \n", 1485 | "3 3288.0 3286.0 2195.0 \n", 1486 | "4 1097.0 1095.0 4.0 \n", 1487 | "\n", 1488 | " from_arise_col_SOP4 from_arise_col_RVSZ from_arise_col_PYUQ \\\n", 1489 | "0 3282.0 3284.0 3284.0 \n", 1490 | "1 3282.0 3284.0 3284.0 \n", 1491 | "2 3286.0 3288.0 3288.0 \n", 1492 | "3 3286.0 3288.0 3288.0 \n", 1493 | "4 1095.0 1097.0 1097.0 \n", 1494 | "\n", 1495 | " from_arise_col_LJR9 from_arise_col_N2MW from_arise_col_AHXO \\\n", 1496 | "0 3284.0 2919.0 3278.0 \n", 1497 | "1 3284.0 2919.0 3278.0 \n", 1498 | "2 3288.0 2923.0 3282.0 \n", 1499 | "3 3288.0 2923.0 3282.0 \n", 1500 | "4 1097.0 732.0 1091.0 \n", 1501 | "\n", 1502 | " from_arise_col_BSTQ from_arise_col_FM3X from_arise_col_K6QO \\\n", 1503 | "0 3277.0 2554.0 3282.0 \n", 1504 | "1 3277.0 2554.0 3282.0 \n", 1505 | "2 3281.0 2558.0 3286.0 \n", 1506 | "3 3281.0 2558.0 3286.0 \n", 1507 | "4 1090.0 367.0 1095.0 \n", 1508 | "\n", 1509 | " from_arise_col_QBOL from_arise_col_JWFN from_arise_col_JZ9D \\\n", 1510 | "0 3284.0 2912.0 3278.0 \n", 1511 | "1 3284.0 2912.0 3278.0 \n", 1512 | "2 3288.0 2916.0 3282.0 \n", 1513 | "3 3288.0 2916.0 3282.0 \n", 1514 | "4 1097.0 725.0 1091.0 \n", 1515 | "\n", 1516 | " from_arise_col_J9JW from_arise_col_GHYX from_arise_col_ECY3 \\\n", 1517 | "0 3278.0 3277.0 3281.0 \n", 1518 | "1 3278.0 3277.0 3281.0 \n", 1519 | "2 3282.0 3281.0 3285.0 \n", 1520 | "3 3282.0 3281.0 3285.0 \n", 1521 | "4 1091.0 1090.0 1094.0 \n", 1522 | "\n", 1523 | " Insurance_Count branch_since Unique_customers_per_branch \\\n", 1524 | "0 One 2.0 1738 \n", 1525 | "1 One 2.0 1738 \n", 1526 | "2 One 9.0 4714 \n", 1527 | "3 One 9.0 4714 \n", 1528 | "4 Medium 10.0 10919 \n", 1529 | "\n", 1530 | " Unique_Insurance_per_branch Unique_year_per_branch 
\\\n", 1531 | "0 14 3 \n", 1532 | "1 14 3 \n", 1533 | "2 18 10 \n", 1534 | "3 18 10 \n", 1535 | "4 21 11 \n", 1536 | "\n", 1537 | " Unique_month_per_branch Unique_branch_per_year Average_Age_per_branch \\\n", 1538 | "0 1 14.0 37.532588 \n", 1539 | "1 1 14.0 37.532588 \n", 1540 | "2 1 14.0 37.961993 \n", 1541 | "3 1 14.0 37.961993 \n", 1542 | "4 6 7.0 38.560827 \n", 1543 | "\n", 1544 | " Average_Age_per_occupation P5DA_meanAge RIBP_meanAge 8NN1_meanAge \\\n", 1545 | "0 38.035144 38.412348 38.350148 38.411398 \n", 1546 | "1 38.035144 38.412348 38.350148 38.411398 \n", 1547 | "2 38.035144 38.412348 38.350148 38.411398 \n", 1548 | "3 38.035144 38.412348 38.350148 38.411398 \n", 1549 | "4 31.423673 38.412348 38.350148 38.411398 \n", 1550 | "\n", 1551 | " 7POT_meanAge 66FJ_meanAge GYSR_meanAge SOP4_meanAge RVSZ_meanAge \\\n", 1552 | "0 38.41096 38.405632 38.414079 38.429297 38.36121 \n", 1553 | "1 38.41096 38.405632 38.414079 38.429297 38.46995 \n", 1554 | "2 38.41096 38.405632 38.414079 38.429297 38.36121 \n", 1555 | "3 38.41096 38.405632 38.414079 38.429297 38.46995 \n", 1556 | "4 38.41096 38.405632 38.414079 38.429297 38.46995 \n", 1557 | "\n", 1558 | " PYUQ_meanAge LJR9_meanAge N2MW_meanAge AHXO_meanAge BSTQ_meanAge \\\n", 1559 | "0 38.396291 38.419403 38.365864 38.382695 38.413804 \n", 1560 | "1 38.396291 38.419403 38.365864 38.382695 38.413804 \n", 1561 | "2 38.396291 38.419403 38.365864 38.382695 38.413804 \n", 1562 | "3 38.396291 38.419403 38.365864 38.382695 38.413804 \n", 1563 | "4 38.396291 38.419403 38.365864 38.382695 38.413804 \n", 1564 | "\n", 1565 | " FM3X_meanAge K6QO_meanAge QBOL_meanAge JWFN_meanAge JZ9D_meanAge \\\n", 1566 | "0 38.413013 38.384302 37.673714 38.4074 38.429089 \n", 1567 | "1 38.413013 38.436159 37.673714 38.4074 38.429089 \n", 1568 | "2 38.413013 38.384302 37.673714 38.4074 38.429089 \n", 1569 | "3 38.413013 38.436159 37.673714 38.4074 38.429089 \n", 1570 | "4 38.413013 38.436159 37.673714 38.4074 38.429089 \n", 1571 | "\n", 1572 | " 
J9JW_meanAge GHYX_meanAge ECY3_meanAge product_comb \n", 1573 | "0 38.431485 38.408466 38.360615 K6QO \n", 1574 | "1 38.431485 38.408466 38.360615 RVSZ \n", 1575 | "2 38.431485 38.408466 38.360615 K6QO \n", 1576 | "3 38.431485 38.408466 38.360615 RVSZ \n", 1577 | "4 38.431485 38.408466 40.115056 ECY3_RVSZ \n", 1578 | "\n", 1579 | "[5 rows x 130 columns]" 1580 | ] 1581 | }, 1582 | "execution_count": 201, 1583 | "metadata": {}, 1584 | "output_type": "execute_result" 1585 | } 1586 | ], 1587 | "source": [ 1588 | "df.head()" 1589 | ] 1590 | }, 1591 | { 1592 | "cell_type": "markdown", 1593 | "metadata": {}, 1594 | "source": [ 1595 | "## Interaction Feature" 1596 | ] 1597 | }, 1598 | { 1599 | "cell_type": "code", 1600 | "execution_count": 202, 1601 | "metadata": {}, 1602 | "outputs": [], 1603 | "source": [ 1604 | "df['Join_Year'] = df['Join_Year'].astype(str)\n", 1605 | "df['birth_year_bin'] = df['birth_year_bin'].astype(str)" 1606 | ] 1607 | }, 1608 | { 1609 | "cell_type": "code", 1610 | "execution_count": 203, 1611 | "metadata": {}, 1612 | "outputs": [], 1613 | "source": [ 1614 | "df['bc_oc'] = df['branch_code'] + '_' + df['occupation_code']\n", 1615 | "df['bc_occ'] = df['branch_code'] + '_' + df['occupation_category_code']" 1616 | ] 1617 | }, 1618 | { 1619 | "cell_type": "markdown", 1620 | "metadata": {}, 1621 | "source": [ 1622 | "## Label Encoding" 1623 | ] 1624 | }, 1625 | { 1626 | "cell_type": "code", 1627 | "execution_count": 204, 1628 | "metadata": {}, 1629 | "outputs": [], 1630 | "source": [ 1631 | "from sklearn.preprocessing import LabelEncoder\n", 1632 | "le = LabelEncoder()\n", 1633 | "for col in [\n", 1634 | " 'product_comb', 'Insurance_Count', 'sex', 'marital_status',\n", 1635 | " 'branch_code', 'occupation_category_code', 'occupation_code'\n", 1636 | "]:\n", 1637 | " df[col] = le.fit_transform(df[col])" 1638 | ] 1639 | }, 1640 | { 1641 | "cell_type": "markdown", 1642 | "metadata": {}, 1643 | "source": [ 1644 | "## Frequency Encoding" 1645 | ] 1646 | }, 
1647 | { 1648 | "cell_type": "code", 1649 | "execution_count": 205, 1650 | "metadata": {}, 1651 | "outputs": [], 1652 | "source": [ 1653 | "fe_pol = (df.groupby('product_comb').size()) / len(df)\n", 1654 | "df['product_comb_fe'] = df['product_comb'].apply(lambda x: fe_pol[x])" 1655 | ] 1656 | }, 1657 | { 1658 | "cell_type": "code", 1659 | "execution_count": 206, 1660 | "metadata": {}, 1661 | "outputs": [], 1662 | "source": [ 1663 | "fe_pol = (df.groupby('bc_occ').size()) / len(df)\n", 1664 | "df['bc_occ'] = df['bc_occ'].apply(lambda x: fe_pol[x])" 1665 | ] 1666 | }, 1667 | { 1668 | "cell_type": "code", 1669 | "execution_count": 207, 1670 | "metadata": {}, 1671 | "outputs": [], 1672 | "source": [ 1673 | "fe_pol = (df.groupby('bc_oc').size()) / len(df)\n", 1674 | "df['bc_oc'] = df['bc_oc'].apply(lambda x: fe_pol[x])" 1675 | ] 1676 | }, 1677 | { 1678 | "cell_type": "code", 1679 | "execution_count": 208, 1680 | "metadata": {}, 1681 | "outputs": [], 1682 | "source": [ 1683 | "fe_pol = (df.groupby('birth_year_bin').size()) / len(df)\n", 1684 | "df['birth_year_bin'] = df['birth_year_bin'].apply(lambda x: fe_pol[x])\n", 1685 | "df['birth_year_bin'] = df['birth_year_bin'].astype(float)" 1686 | ] 1687 | }, 1688 | { 1689 | "cell_type": "code", 1690 | "execution_count": 209, 1691 | "metadata": {}, 1692 | "outputs": [], 1693 | "source": [ 1694 | "fe_pol = (df.groupby('occupation_code').size()) / len(df)\n", 1695 | "df['occupation_code_fe'] = df['occupation_code'].apply(lambda x: fe_pol[x])" 1696 | ] 1697 | }, 1698 | { 1699 | "cell_type": "code", 1700 | "execution_count": 210, 1701 | "metadata": {}, 1702 | "outputs": [], 1703 | "source": [ 1704 | "fe_pol = (df.groupby('occupation_category_code').size()) / len(df)\n", 1705 | "df['occupation_category_code'] = df['occupation_category_code'].apply(lambda x: fe_pol[x])" 1706 | ] 1707 | }, 1708 | { 1709 | "cell_type": "code", 1710 | "execution_count": 211, 1711 | "metadata": {}, 1712 | "outputs": [], 1713 | "source": [ 1714 | "fe_pol 
= (df.groupby('sex').size()) / len(df)\n", 1715 | "df['sex_fe'] = df['sex'].apply(lambda x: fe_pol[x])" 1716 | ] 1717 | }, 1718 | { 1719 | "cell_type": "code", 1720 | "execution_count": 212, 1721 | "metadata": {}, 1722 | "outputs": [], 1723 | "source": [ 1724 | "fe_pol = (df.groupby('Insurance_Count').size()) / len(df)\n", 1725 | "df['Insurance_Count_fe'] = df['Insurance_Count'].apply(lambda x: fe_pol[x])" 1726 | ] 1727 | }, 1728 | { 1729 | "cell_type": "code", 1730 | "execution_count": 213, 1731 | "metadata": {}, 1732 | "outputs": [], 1733 | "source": [ 1734 | "df['Join_Year'] = df['Join_Year'].astype(float)" 1735 | ] 1736 | }, 1737 | { 1738 | "cell_type": "code", 1739 | "execution_count": 214, 1740 | "metadata": {}, 1741 | "outputs": [], 1742 | "source": [ 1743 | "#Relationship between targets\n", 1744 | "#\n", 1745 | "for col in columns:\n", 1746 | " for cols in columns:\n", 1747 | " if col != cols:\n", 1748 | " df[col + '_' + cols] = df.groupby(col)[cols].transform(sum)" 1749 | ] 1750 | }, 1751 | { 1752 | "cell_type": "code", 1753 | "execution_count": 215, 1754 | "metadata": {}, 1755 | "outputs": [], 1756 | "source": [ 1757 | "df['num_freq'] = df.groupby('product_comb_fe')['ID'].transform('count')" 1758 | ] 1759 | }, 1760 | { 1761 | "cell_type": "markdown", 1762 | "metadata": {}, 1763 | "source": [ 1764 | "## Getting back train and test" 1765 | ] 1766 | }, 1767 | { 1768 | "cell_type": "code", 1769 | "execution_count": 239, 1770 | "metadata": {}, 1771 | "outputs": [], 1772 | "source": [ 1773 | "train = df[:train.shape[0]]\n", 1774 | "test = df[-test.shape[0]:]" 1775 | ] 1776 | }, 1777 | { 1778 | "cell_type": "code", 1779 | "execution_count": 48, 1780 | "metadata": {}, 1781 | "outputs": [ 1782 | { 1783 | "data": { 1784 | "text/plain": [ 1785 | "(66353, 21)" 1786 | ] 1787 | }, 1788 | "execution_count": 48, 1789 | "metadata": {}, 1790 | "output_type": "execute_result" 1791 | } 1792 | ], 1793 | "source": [ 1794 | "len(train),train['target'].nunique()" 1795 | ] 1796 
| }, 1797 | { 1798 | "cell_type": "markdown", 1799 | "metadata": {}, 1800 | "source": [ 1801 | "## Removing records if target count is less than 3" 1802 | ] 1803 | }, 1804 | { 1805 | "cell_type": "code", 1806 | "execution_count": 240, 1807 | "metadata": {}, 1808 | "outputs": [], 1809 | "source": [ 1810 | "train['target_count'] = train.groupby(['branch_code',\n", 1811 | " 'target'])['target'].transform('count')\n", 1812 | "train = train[train['target_count'] > 2]\n", 1813 | "del train['target_count']" 1814 | ] 1815 | }, 1816 | { 1817 | "cell_type": "code", 1818 | "execution_count": 54, 1819 | "metadata": {}, 1820 | "outputs": [ 1821 | { 1822 | "data": { 1823 | "text/plain": [ 1824 | "(66290, 20)" 1825 | ] 1826 | }, 1827 | "execution_count": 54, 1828 | "metadata": {}, 1829 | "output_type": "execute_result" 1830 | } 1831 | ], 1832 | "source": [ 1833 | "len(train), train['target'].nunique()" 1834 | ] 1835 | }, 1836 | { 1837 | "cell_type": "markdown", 1838 | "metadata": {}, 1839 | "source": [ 1840 | "## Label Encoding Target" 1841 | ] 1842 | }, 1843 | { 1844 | "cell_type": "code", 1845 | "execution_count": 241, 1846 | "metadata": {}, 1847 | "outputs": [], 1848 | "source": [ 1849 | "te = LabelEncoder()\n", 1850 | "train['target'] = te.fit_transform(train['target'])" 1851 | ] 1852 | }, 1853 | { 1854 | "cell_type": "markdown", 1855 | "metadata": {}, 1856 | "source": [ 1857 | "# StratifiedKFold" 1858 | ] 1859 | }, 1860 | { 1861 | "cell_type": "code", 1862 | "execution_count": 60, 1863 | "metadata": {}, 1864 | "outputs": [ 1865 | { 1866 | "name": "stdout", 1867 | "output_type": "stream", 1868 | "text": [ 1869 | "Training until validation scores don't improve for 20 rounds\n", 1870 | "[200]\ttraining's multi_logloss: 0.425229\tvalid_1's multi_logloss: 0.442599\n", 1871 | "[400]\ttraining's multi_logloss: 0.342772\tvalid_1's multi_logloss: 0.377401\n", 1872 | "[600]\ttraining's multi_logloss: 0.312031\tvalid_1's multi_logloss: 0.363581\n", 1873 | "[800]\ttraining's 
multi_logloss: 0.291081\tvalid_1's multi_logloss: 0.358438\n", 1874 | "[1000]\ttraining's multi_logloss: 0.27363\tvalid_1's multi_logloss: 0.355791\n", 1875 | "[1200]\ttraining's multi_logloss: 0.258673\tvalid_1's multi_logloss: 0.354585\n", 1876 | "Early stopping, best iteration is:\n", 1877 | "[1366]\ttraining's multi_logloss: 0.247805\tvalid_1's multi_logloss: 0.354216\n", 1878 | "1 err_lgm: 0.35421631499025946\n", 1879 | "[LightGBM] [Warning] min_data_in_leaf is set=40, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=40\n", 1880 | "[LightGBM] [Warning] bagging_fraction is set=0.9, subsample=1.0 will be ignored. Current value: bagging_fraction=0.9\n", 1881 | "[LightGBM] [Warning] bagging_freq is set=2, subsample_freq=0 will be ignored. Current value: bagging_freq=2\n", 1882 | "Training until validation scores don't improve for 20 rounds\n", 1883 | "[200]\ttraining's multi_logloss: 0.418809\tvalid_1's multi_logloss: 0.455144\n", 1884 | "[400]\ttraining's multi_logloss: 0.33489\tvalid_1's multi_logloss: 0.393109\n", 1885 | "[600]\ttraining's multi_logloss: 0.3046\tvalid_1's multi_logloss: 0.381151\n", 1886 | "[800]\ttraining's multi_logloss: 0.283089\tvalid_1's multi_logloss: 0.377217\n", 1887 | "[1000]\ttraining's multi_logloss: 0.265824\tvalid_1's multi_logloss: 0.375818\n", 1888 | "Early stopping, best iteration is:\n", 1889 | "[1073]\ttraining's multi_logloss: 0.260279\tvalid_1's multi_logloss: 0.375593\n", 1890 | "1 err_lgm: 0.375593067922245\n", 1891 | "[LightGBM] [Warning] min_data_in_leaf is set=40, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=40\n", 1892 | "[LightGBM] [Warning] bagging_fraction is set=0.9, subsample=1.0 will be ignored. Current value: bagging_fraction=0.9\n", 1893 | "[LightGBM] [Warning] bagging_freq is set=2, subsample_freq=0 will be ignored. 
Current value: bagging_freq=2\n", 1894 | "Training until validation scores don't improve for 20 rounds\n", 1895 | "[200]\ttraining's multi_logloss: 0.420918\tvalid_1's multi_logloss: 0.451048\n", 1896 | "[400]\ttraining's multi_logloss: 0.339036\tvalid_1's multi_logloss: 0.386804\n", 1897 | "[600]\ttraining's multi_logloss: 0.308682\tvalid_1's multi_logloss: 0.373044\n", 1898 | "[800]\ttraining's multi_logloss: 0.28747\tvalid_1's multi_logloss: 0.367751\n", 1899 | "[1000]\ttraining's multi_logloss: 0.269769\tvalid_1's multi_logloss: 0.365496\n", 1900 | "[1200]\ttraining's multi_logloss: 0.254697\tvalid_1's multi_logloss: 0.36462\n", 1901 | "Early stopping, best iteration is:\n", 1902 | "[1261]\ttraining's multi_logloss: 0.250606\tvalid_1's multi_logloss: 0.36449\n", 1903 | "1 err_lgm: 0.3644898381125798\n" 1904 | ] 1905 | }, 1906 | { 1907 | "data": { 1908 | "text/plain": [ 1909 | "0.36476640700836144" 1910 | ] 1911 | }, 1912 | "execution_count": 60, 1913 | "metadata": {}, 1914 | "output_type": "execute_result" 1915 | } 1916 | ], 1917 | "source": [ 1918 | "#LGB model\n", 1919 | "#\n", 1920 | "err = []\n", 1921 | "y_pred_tot_lgb = 0\n", 1922 | "\n", 1923 | "fold = StratifiedKFold(n_splits=3, shuffle=True, random_state=1997)\n", 1924 | "i = 1\n", 1925 | "x = train.drop(columns={'join_date', 'ID', 'ID2', 'target'})\n", 1926 | "y = train[['target']]\n", 1927 | "ID = test['ID']\n", 1928 | "testing = test.drop(columns={'join_date', 'ID', 'ID2', 'target'})\n", 1929 | "for train_index, test_index in fold.split(x, y):\n", 1930 | " x_train, x_val = x.iloc[train_index], x.iloc[test_index]\n", 1931 | " y_train, y_val = y.iloc[train_index], y.iloc[test_index]\n", 1932 | " m = LGBMClassifier(n_estimators=10000,\n", 1933 | " n_jobs=-1,\n", 1934 | " random_state=69,\n", 1935 | " learning_rate=0.01,\n", 1936 | " max_depth=5,\n", 1937 | " num_leaves=128,\n", 1938 | " colsample_bytree=0.5,\n", 1939 | " colsample_bynode=0.5,\n", 1940 | " min_data_in_leaf=40,\n", 1941 | " 
bagging_freq=2,\n", 1942 | " bagging_fraction=0.9,\n", 1943 | " reg_alpha=0.5,\n", 1944 | " reg_lambda=1)\n", 1945 | " m.fit(x_train,\n", 1946 | " y_train,\n", 1947 | " eval_set=[(x_train, y_train), (x_val, y_val)],\n", 1948 | " early_stopping_rounds=20,\n", 1949 | " eval_metric='multi_logloss',\n", 1950 | " verbose=200)\n", 1951 | " pred_y = m.predict_proba(x_val)\n", 1952 | " print(i, \" err_lgm: \", log_loss(y_val, pred_y))\n", 1953 | " err.append(log_loss(y_val, pred_y))\n", 1954 | " pred_test = m.predict_proba(testing)\n", 1955 | " y_pred_tot_lgb += pred_test\n", 1956 | "y_pred_tot_lgb = y_pred_tot_lgb / 3\n", 1957 | "(err[0] + err[1] + err[2]) / 3" 1958 | ] 1959 | }, 1960 | { 1961 | "cell_type": "code", 1962 | "execution_count": 62, 1963 | "metadata": {}, 1964 | "outputs": [ 1965 | { 1966 | "name": "stdout", 1967 | "output_type": "stream", 1968 | "text": [ 1969 | "[0]\tvalidation_0-mlogloss:2.23928\tvalidation_1-mlogloss:2.24216\n", 1970 | "Multiple eval metrics have been passed: 'validation_1-mlogloss' will be used for early stopping.\n", 1971 | "\n", 1972 | "Will train until validation_1-mlogloss hasn't improved in 20 rounds.\n", 1973 | "[20]\tvalidation_0-mlogloss:0.55627\tvalidation_1-mlogloss:0.58586\n", 1974 | "[40]\tvalidation_0-mlogloss:0.36262\tvalidation_1-mlogloss:0.41120\n", 1975 | "[60]\tvalidation_0-mlogloss:0.30980\tvalidation_1-mlogloss:0.37351\n", 1976 | "[80]\tvalidation_0-mlogloss:0.28610\tvalidation_1-mlogloss:0.36247\n", 1977 | "[100]\tvalidation_0-mlogloss:0.27054\tvalidation_1-mlogloss:0.35819\n", 1978 | "[120]\tvalidation_0-mlogloss:0.25927\tvalidation_1-mlogloss:0.35662\n", 1979 | "[140]\tvalidation_0-mlogloss:0.25005\tvalidation_1-mlogloss:0.35568\n", 1980 | "[160]\tvalidation_0-mlogloss:0.24265\tvalidation_1-mlogloss:0.35481\n", 1981 | "[180]\tvalidation_0-mlogloss:0.23612\tvalidation_1-mlogloss:0.35453\n", 1982 | "[200]\tvalidation_0-mlogloss:0.23081\tvalidation_1-mlogloss:0.35423\n", 1983 | 
"[220]\tvalidation_0-mlogloss:0.22614\tvalidation_1-mlogloss:0.35409\n", 1984 | "[240]\tvalidation_0-mlogloss:0.22208\tvalidation_1-mlogloss:0.35406\n", 1985 | "[260]\tvalidation_0-mlogloss:0.21838\tvalidation_1-mlogloss:0.35401\n", 1986 | "[280]\tvalidation_0-mlogloss:0.21490\tvalidation_1-mlogloss:0.35399\n", 1987 | "[300]\tvalidation_0-mlogloss:0.21182\tvalidation_1-mlogloss:0.35393\n", 1988 | "Stopping. Best iteration:\n", 1989 | "[295]\tvalidation_0-mlogloss:0.21251\tvalidation_1-mlogloss:0.35388\n", 1990 | "\n", 1991 | "1 err_lgm: 0.35387566261402437\n", 1992 | "[0]\tvalidation_0-mlogloss:2.23749\tvalidation_1-mlogloss:2.24493\n", 1993 | "Multiple eval metrics have been passed: 'validation_1-mlogloss' will be used for early stopping.\n", 1994 | "\n", 1995 | "Will train until validation_1-mlogloss hasn't improved in 20 rounds.\n", 1996 | "[20]\tvalidation_0-mlogloss:0.55083\tvalidation_1-mlogloss:0.59409\n", 1997 | "[40]\tvalidation_0-mlogloss:0.35619\tvalidation_1-mlogloss:0.42096\n", 1998 | "[60]\tvalidation_0-mlogloss:0.30289\tvalidation_1-mlogloss:0.38610\n", 1999 | "[80]\tvalidation_0-mlogloss:0.27864\tvalidation_1-mlogloss:0.37726\n", 2000 | "[100]\tvalidation_0-mlogloss:0.26256\tvalidation_1-mlogloss:0.37437\n", 2001 | "[120]\tvalidation_0-mlogloss:0.25081\tvalidation_1-mlogloss:0.37333\n", 2002 | "[140]\tvalidation_0-mlogloss:0.24170\tvalidation_1-mlogloss:0.37294\n", 2003 | "[160]\tvalidation_0-mlogloss:0.23452\tvalidation_1-mlogloss:0.37292\n", 2004 | "Stopping. 
Best iteration:\n", 2005 | "[156]\tvalidation_0-mlogloss:0.23581\tvalidation_1-mlogloss:0.37285\n", 2006 | "\n", 2007 | "1 err_lgm: 0.3728546684019317\n", 2008 | "[0]\tvalidation_0-mlogloss:2.23681\tvalidation_1-mlogloss:2.24268\n", 2009 | "Multiple eval metrics have been passed: 'validation_1-mlogloss' will be used for early stopping.\n", 2010 | "\n", 2011 | "Will train until validation_1-mlogloss hasn't improved in 20 rounds.\n", 2012 | "[20]\tvalidation_0-mlogloss:0.55195\tvalidation_1-mlogloss:0.59279\n", 2013 | "[40]\tvalidation_0-mlogloss:0.35853\tvalidation_1-mlogloss:0.41876\n", 2014 | "[60]\tvalidation_0-mlogloss:0.30635\tvalidation_1-mlogloss:0.38258\n", 2015 | "[80]\tvalidation_0-mlogloss:0.28186\tvalidation_1-mlogloss:0.37189\n", 2016 | "[100]\tvalidation_0-mlogloss:0.26577\tvalidation_1-mlogloss:0.36806\n", 2017 | "[120]\tvalidation_0-mlogloss:0.25558\tvalidation_1-mlogloss:0.36647\n", 2018 | "[140]\tvalidation_0-mlogloss:0.24680\tvalidation_1-mlogloss:0.36596\n", 2019 | "[160]\tvalidation_0-mlogloss:0.23923\tvalidation_1-mlogloss:0.36540\n", 2020 | "[180]\tvalidation_0-mlogloss:0.23289\tvalidation_1-mlogloss:0.36518\n", 2021 | "[200]\tvalidation_0-mlogloss:0.22699\tvalidation_1-mlogloss:0.36497\n", 2022 | "[220]\tvalidation_0-mlogloss:0.22182\tvalidation_1-mlogloss:0.36473\n", 2023 | "[240]\tvalidation_0-mlogloss:0.21746\tvalidation_1-mlogloss:0.36472\n", 2024 | "[260]\tvalidation_0-mlogloss:0.21355\tvalidation_1-mlogloss:0.36466\n", 2025 | "Stopping. 
Best iteration:\n", 2026 | "[251]\tvalidation_0-mlogloss:0.21522\tvalidation_1-mlogloss:0.36453\n", 2027 | "\n", 2028 | "1 err_lgm: 0.3645286154121357\n" 2029 | ] 2030 | }, 2031 | { 2032 | "data": { 2033 | "text/plain": [ 2034 | "0.36375298214269725" 2035 | ] 2036 | }, 2037 | "execution_count": 62, 2038 | "metadata": {}, 2039 | "output_type": "execute_result" 2040 | } 2041 | ], 2042 | "source": [ 2043 | "#XGB model\n", 2044 | "#\n", 2045 | "err = []\n", 2046 | "y_pred_tot_xgb = 0\n", 2047 | "\n", 2048 | "fold = StratifiedKFold(n_splits=3, shuffle=True, random_state=1997)\n", 2049 | "i = 1\n", 2050 | "\n", 2051 | "for train_index, test_index in fold.split(x, y):\n", 2052 | " x_train, x_val = x.iloc[train_index], x.iloc[test_index]\n", 2053 | " y_train, y_val = y.iloc[train_index], y.iloc[test_index]\n", 2054 | " m = XGBClassifier(\n", 2055 | " n_estimators=10000,\n", 2056 | " eta=0.1,\n", 2057 | " n_jobs=-1,\n", 2058 | " random_state=69,\n", 2059 | " reg_alpha=0.5, #reg_lambda=1.2 \n", 2060 | " colsample_bytree=0.8,\n", 2061 | " colsample_bylevel=0.8,\n", 2062 | " colsample_bynode=0.8,\n", 2063 | " subsample=0.9,\n", 2064 | " gamma=1.5,\n", 2065 | " max_depth=7)\n", 2066 | " m.fit(x_train,\n", 2067 | " y_train,\n", 2068 | " eval_set=[(x_train, y_train), (x_val, y_val)],\n", 2069 | " early_stopping_rounds=20,\n", 2070 | " eval_metric='mlogloss',\n", 2071 | " verbose=20)\n", 2072 | " pred_y = m.predict_proba(x_val)\n", 2073 | " print(i, \" err_lgm: \", log_loss(y_val, pred_y))\n", 2074 | " err.append(log_loss(y_val, pred_y))\n", 2075 | " pred_test = m.predict_proba(testing)\n", 2076 | " y_pred_tot_xgb += pred_test\n", 2077 | "y_pred_tot_xgb = y_pred_tot_xgb / 3\n", 2078 | "(err[0] + err[1] + err[2]) / 3" 2079 | ] 2080 | }, 2081 | { 2082 | "cell_type": "markdown", 2083 | "metadata": {}, 2084 | "source": [ 2085 | "# Averaging submission" 2086 | ] 2087 | }, 2088 | { 2089 | "cell_type": "code", 2090 | "execution_count": 66, 2091 | "metadata": {}, 2092 | "outputs": [], 
2093 | "source": [ 2094 | "pred = y_pred_tot_lgb * 0.6 + y_pred_tot_xgb * 0.4\n", 2095 | "y_test = pd.DataFrame(pred)\n", 2096 | "y_test.columns = te.inverse_transform(y_test.columns)" 2097 | ] 2098 | }, 2099 | { 2100 | "cell_type": "code", 2101 | "execution_count": 69, 2102 | "metadata": {}, 2103 | "outputs": [ 2104 | { 2105 | "data": { 2106 | "application/vnd.jupyter.widget-view+json": { 2107 | "model_id": "fdc2159a005045a98a782bbdd871d84b", 2108 | "version_major": 2, 2109 | "version_minor": 0 2110 | }, 2111 | "text/plain": [ 2112 | "HBox(children=(FloatProgress(value=0.0, max=200000.0), HTML(value='')))" 2113 | ] 2114 | }, 2115 | "metadata": {}, 2116 | "output_type": "display_data" 2117 | }, 2118 | { 2119 | "name": "stdout", 2120 | "output_type": "stream", 2121 | "text": [ 2122 | "\n", 2123 | "Wall time: 37 s\n" 2124 | ] 2125 | }, 2126 | { 2127 | "data": { 2128 | "text/html": [ 2129 | "
\n", 2130 | "\n", 2143 | "\n", 2144 | " \n", 2145 | " \n", 2146 | " \n", 2147 | " \n", 2148 | " \n", 2149 | " \n", 2150 | " \n", 2151 | " \n", 2152 | " \n", 2153 | " \n", 2154 | " \n", 2155 | " \n", 2156 | " \n", 2157 | " \n", 2158 | " \n", 2159 | " \n", 2160 | " \n", 2161 | " \n", 2162 | " \n", 2163 | " \n", 2164 | " \n", 2165 | " \n", 2166 | " \n", 2167 | " \n", 2168 | " \n", 2169 | " \n", 2170 | " \n", 2171 | " \n", 2172 | " \n", 2173 | " \n", 2174 | " \n", 2175 | " \n", 2176 | " \n", 2177 | " \n", 2178 | "
ID X PCODELabel
0F86J5PC X 66FJ0.000077
1F86J5PC X 7POT0.000107
2F86J5PC X 8NN10.000014
3F86J5PC X AHXO0.000102
4F86J5PC X BSTQ0.000021
\n", 2179 | "
" 2180 | ], 2181 | "text/plain": [ 2182 | " ID X PCODE Label\n", 2183 | "0 F86J5PC X 66FJ 0.000077\n", 2184 | "1 F86J5PC X 7POT 0.000107\n", 2185 | "2 F86J5PC X 8NN1 0.000014\n", 2186 | "3 F86J5PC X AHXO 0.000102\n", 2187 | "4 F86J5PC X BSTQ 0.000021" 2188 | ] 2189 | }, 2190 | "execution_count": 69, 2191 | "metadata": {}, 2192 | "output_type": "execute_result" 2193 | } 2194 | ], 2195 | "source": [ 2196 | "%%time\n", 2197 | "answer_mass = []\n", 2198 | "for i in range(test.shape[0]):\n", 2199 | " test['ID'] = ID\n", 2200 | " id = test['ID'].iloc[i]\n", 2201 | " for c in y_test.columns:\n", 2202 | " answer_mass.append([id + ' X ' + c, y_test[c].iloc[i]])\n", 2203 | "\n", 2204 | "df_answer = pd.DataFrame(answer_mass)\n", 2205 | "df_answer.columns = ['ID X PCODE', 'Label']\n", 2206 | "for i in tqdm_notebook(range(df_answer.shape[0])):\n", 2207 | " if df_answer['ID X PCODE'].iloc[i] in true_values:\n", 2208 | " df_answer['Label'].iloc[i] = 1.0\n", 2209 | "df_answer.head()" 2210 | ] 2211 | }, 2212 | { 2213 | "cell_type": "code", 2214 | "execution_count": 70, 2215 | "metadata": {}, 2216 | "outputs": [], 2217 | "source": [ 2218 | "sub1 = df_answer[['ID X PCODE', 'Label']]\n", 2219 | "sub1.reset_index(drop=True, inplace=True)" 2220 | ] 2221 | }, 2222 | { 2223 | "cell_type": "code", 2224 | "execution_count": 71, 2225 | "metadata": {}, 2226 | "outputs": [], 2227 | "source": [ 2228 | "sub = pd.read_csv('SampleSubmission.csv')" 2229 | ] 2230 | }, 2231 | { 2232 | "cell_type": "code", 2233 | "execution_count": 72, 2234 | "metadata": {}, 2235 | "outputs": [], 2236 | "source": [ 2237 | "sub.sort_values(by=['ID X PCODE'], inplace=True)\n", 2238 | "sub1.sort_values(by=['ID X PCODE'], inplace=True)" 2239 | ] 2240 | }, 2241 | { 2242 | "cell_type": "code", 2243 | "execution_count": 73, 2244 | "metadata": {}, 2245 | "outputs": [], 2246 | "source": [ 2247 | "actual = sub1\n", 2248 | "findl = actual['ID X PCODE'].values\n", 2249 | "replacel = actual['Label'].values\n", 2250 | 
"sub.loc[sub['ID X PCODE'].isin(findl), ['Label']] = replacel" 2251 | ] 2252 | }, 2253 | { 2254 | "cell_type": "code", 2255 | "execution_count": 81, 2256 | "metadata": {}, 2257 | "outputs": [], 2258 | "source": [ 2259 | "#Make submission\n", 2260 | "sub.to_csv('submiss.csv',index=False)" 2261 | ] 2262 | }, 2263 | { 2264 | "cell_type": "markdown", 2265 | "metadata": {}, 2266 | "source": [ 2267 | "Open Zimnat_insurance_cat_target+multy.ipynb" 2268 | ] 2269 | } 2270 | ], 2271 | "metadata": { 2272 | "kernelspec": { 2273 | "display_name": "Python 3", 2274 | "language": "python", 2275 | "name": "python3" 2276 | }, 2277 | "language_info": { 2278 | "codemirror_mode": { 2279 | "name": "ipython", 2280 | "version": 3 2281 | }, 2282 | "file_extension": ".py", 2283 | "mimetype": "text/x-python", 2284 | "name": "python", 2285 | "nbconvert_exporter": "python", 2286 | "pygments_lexer": "ipython3", 2287 | "version": "3.8.3" 2288 | }, 2289 | "toc": { 2290 | "base_numbering": 1, 2291 | "nav_menu": {}, 2292 | "number_sections": true, 2293 | "sideBar": true, 2294 | "skip_h1_title": false, 2295 | "title_cell": "Table of Contents", 2296 | "title_sidebar": "Contents", 2297 | "toc_cell": false, 2298 | "toc_position": {}, 2299 | "toc_section_display": true, 2300 | "toc_window_display": false 2301 | } 2302 | }, 2303 | "nbformat": 4, 2304 | "nbformat_minor": 4 2305 | } 2306 | -------------------------------------------------------------------------------- /Zimnat_insurance_best_multy_overall.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab_type": "text", 7 | "id": "82PpxfmqglcE" 8 | }, 9 | "source": [ 10 | "# Download libraries and data" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "#Download CatBoost\n", 20 | "#\n", 21 | "!pip install catboost==0.23.2" 22 | ] 23 | }, 24 | { 
25 | "cell_type": "code", 26 | "execution_count": 1, 27 | "metadata": { 28 | "ExecuteTime": { 29 | "end_time": "2020-08-07T19:29:03.461280Z", 30 | "start_time": "2020-08-07T19:29:03.186558Z" 31 | }, 32 | "colab": {}, 33 | "colab_type": "code", 34 | "id": "3arOHSJwCtNq" 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "#Import libraries\n", 39 | "#\n", 40 | "import pandas as pd, os, gc\n", 41 | "import numpy as np\n", 42 | "import math\n", 43 | "import copy\n", 44 | "from itertools import combinations\n", 45 | "\n", 46 | "import matplotlib.pyplot as plt\n", 47 | "\n", 48 | "from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler\n", 49 | "from sklearn.metrics import roc_curve, auc, log_loss\n", 50 | "\n", 51 | "from tqdm import tqdm, tqdm_notebook\n", 52 | "\n", 53 | "from sklearn.model_selection import GroupShuffleSplit, StratifiedKFold, train_test_split, GroupKFold\n", 54 | "from catboost import CatBoostClassifier\n", 55 | "from xgboost import XGBClassifier\n", 56 | "\n", 57 | "%matplotlib inline\n", 58 | "import warnings\n", 59 | "warnings.filterwarnings('ignore')" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 3, 65 | "metadata": { 66 | "ExecuteTime": { 67 | "end_time": "2020-08-07T19:29:03.664112Z", 68 | "start_time": "2020-08-07T19:29:03.501157Z" 69 | }, 70 | "colab": {}, 71 | "colab_type": "code", 72 | "id": "s3ZCXtzKDiDe" 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "#Download data\n", 77 | "#\n", 78 | "train = pd.read_csv('Train.csv')\n", 79 | "test = pd.read_csv('Test.csv')\n", 80 | "sub = pd.read_csv('SampleSubmission.csv')" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 4, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "data": { 90 | "text/html": [ 91 | "
\n", 92 | "\n", 105 | "\n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | "
IDjoin_datesexmarital_statusbirth_yearbranch_codeoccupation_codeoccupation_category_codeP5DARIBP...AHXOBSTQFM3XK6QOQBOLJWFNJZ9DJ9JWGHYXECY3
04WKQSBB1/2/2019FM19871X1H2A7IT4MS00...0001000000
1CP5S02H1/6/2019FM1981UAOD2A7IT4MS00...0001000000
22YKDILJ1/6/2013MU1991748LQZYX90QI00...0000000001
32S9E81J1/8/2019MM19901X1HBP0956SI00...0001000000
4BHDYVFT1/8/2019MM1990748LNO3LT4MS00...0000001100
\n", 255 | "

5 rows × 29 columns

\n", 256 | "
" 257 | ], 258 | "text/plain": [ 259 | " ID join_date sex marital_status birth_year branch_code \\\n", 260 | "0 4WKQSBB 1/2/2019 F M 1987 1X1H \n", 261 | "1 CP5S02H 1/6/2019 F M 1981 UAOD \n", 262 | "2 2YKDILJ 1/6/2013 M U 1991 748L \n", 263 | "3 2S9E81J 1/8/2019 M M 1990 1X1H \n", 264 | "4 BHDYVFT 1/8/2019 M M 1990 748L \n", 265 | "\n", 266 | " occupation_code occupation_category_code P5DA RIBP ... AHXO BSTQ FM3X \\\n", 267 | "0 2A7I T4MS 0 0 ... 0 0 0 \n", 268 | "1 2A7I T4MS 0 0 ... 0 0 0 \n", 269 | "2 QZYX 90QI 0 0 ... 0 0 0 \n", 270 | "3 BP09 56SI 0 0 ... 0 0 0 \n", 271 | "4 NO3L T4MS 0 0 ... 0 0 0 \n", 272 | "\n", 273 | " K6QO QBOL JWFN JZ9D J9JW GHYX ECY3 \n", 274 | "0 1 0 0 0 0 0 0 \n", 275 | "1 1 0 0 0 0 0 0 \n", 276 | "2 0 0 0 0 0 0 1 \n", 277 | "3 1 0 0 0 0 0 0 \n", 278 | "4 0 0 0 1 1 0 0 \n", 279 | "\n", 280 | "[5 rows x 29 columns]" 281 | ] 282 | }, 283 | "execution_count": 4, 284 | "metadata": {}, 285 | "output_type": "execute_result" 286 | } 287 | ], 288 | "source": [ 289 | "train.head()" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": 5, 295 | "metadata": {}, 296 | "outputs": [ 297 | { 298 | "data": { 299 | "text/html": [ 300 | "
\n", 301 | "\n", 314 | "\n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | "
IDjoin_datesexmarital_statusbirth_yearbranch_codeoccupation_codeoccupation_category_codeP5DARIBP...AHXOBSTQFM3XK6QOQBOLJWFNJZ9DJ9JWGHYXECY3
0F86J5PC1/12/2018MM198494KCDZRV90QI00...0000000000
1H6141K31/10/2019MM19961X1HJ9SY90QI00...0001000000
2RBAYUXZ1/1/2020FW1968UAOD2A7IT4MS00...0001000000
3KCBILBQ1/2/2019MM198994KC2A7IT4MS00...0000000000
4LSEC1ZJ1/2/2020FM1982UAOD0KIDT4MS00...0000001000
\n", 464 | "

5 rows × 29 columns

\n", 465 | "
" 466 | ], 467 | "text/plain": [ 468 | " ID join_date sex marital_status birth_year branch_code \\\n", 469 | "0 F86J5PC 1/12/2018 M M 1984 94KC \n", 470 | "1 H6141K3 1/10/2019 M M 1996 1X1H \n", 471 | "2 RBAYUXZ 1/1/2020 F W 1968 UAOD \n", 472 | "3 KCBILBQ 1/2/2019 M M 1989 94KC \n", 473 | "4 LSEC1ZJ 1/2/2020 F M 1982 UAOD \n", 474 | "\n", 475 | " occupation_code occupation_category_code P5DA RIBP ... AHXO BSTQ FM3X \\\n", 476 | "0 DZRV 90QI 0 0 ... 0 0 0 \n", 477 | "1 J9SY 90QI 0 0 ... 0 0 0 \n", 478 | "2 2A7I T4MS 0 0 ... 0 0 0 \n", 479 | "3 2A7I T4MS 0 0 ... 0 0 0 \n", 480 | "4 0KID T4MS 0 0 ... 0 0 0 \n", 481 | "\n", 482 | " K6QO QBOL JWFN JZ9D J9JW GHYX ECY3 \n", 483 | "0 0 0 0 0 0 0 0 \n", 484 | "1 1 0 0 0 0 0 0 \n", 485 | "2 1 0 0 0 0 0 0 \n", 486 | "3 0 0 0 0 0 0 0 \n", 487 | "4 0 0 0 1 0 0 0 \n", 488 | "\n", 489 | "[5 rows x 29 columns]" 490 | ] 491 | }, 492 | "execution_count": 5, 493 | "metadata": {}, 494 | "output_type": "execute_result" 495 | } 496 | ], 497 | "source": [ 498 | "test.head()" 499 | ] 500 | }, 501 | { 502 | "cell_type": "markdown", 503 | "metadata": {}, 504 | "source": [ 505 | "# Data preparing" 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": 6, 511 | "metadata": {}, 512 | "outputs": [], 513 | "source": [ 514 | "replace_train=list(set(train['occupation_code'].unique().tolist())-set(test['occupation_code']))\n", 515 | "replace_test=list(set(test['occupation_code'].unique().tolist())-set(train['occupation_code']))\n", 516 | "\n", 517 | "train['occupation_code']=train['occupation_code'].replace(replace_train,np.nan)\n", 518 | "test['occupation_code']=test['occupation_code'].replace(replace_test,np.nan)\n", 519 | "train['occupation_code'].fillna(train['occupation_category_code'],inplace=True)\n", 520 | "test['occupation_code'].fillna(test['occupation_category_code'],inplace=True)" 521 | ] 522 | }, 523 | { 524 | "cell_type": "code", 525 | "execution_count": 7, 526 | "metadata": {}, 527 | "outputs": [], 528 | 
"source": [ 529 | "#Adding amount of purchased products for each client(for test without 1 missing)\n", 530 | "#\n", 531 | "train['sum'] = train.iloc[:, 8:].T.sum()\n", 532 | "\n", 533 | "test['sum'] = test.iloc[:, 8:].T.sum()+1" 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": 8, 539 | "metadata": {}, 540 | "outputs": [], 541 | "source": [ 542 | "train.loc[train.marital_status == 'f', 'marital_status'] = 'F'" 543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": 9, 548 | "metadata": {}, 549 | "outputs": [], 550 | "source": [ 551 | "#Renaming features to prevent any repeating\n", 552 | "#\n", 553 | "train['sex'] += '_sex'\n", 554 | "train['marital_status'] += '_marital_status'\n", 555 | "train['branch_code'] += '_branch_code'\n", 556 | "train['occupation_code'] += '_occupation_code'\n", 557 | "train['occupation_category_code'] += '_occupation_category_code'\n", 558 | "test['sex'] += '_sex'\n", 559 | "test['marital_status'] += '_marital_status'\n", 560 | "test['branch_code'] += '_branch_code'\n", 561 | "test['occupation_code'] += '_occupation_code'\n", 562 | "test['occupation_category_code'] += '_occupation_category_code'" 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": 10, 568 | "metadata": {}, 569 | "outputs": [], 570 | "source": [ 571 | "names_products = [\n", 572 | " 'P5DA', 'RIBP', '8NN1', '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ',\n", 573 | " 'LJR9', 'N2MW', 'AHXO', 'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D',\n", 574 | " 'J9JW', 'GHYX', 'ECY3'\n", 575 | "]" 576 | ] 577 | }, 578 | { 579 | "cell_type": "code", 580 | "execution_count": 16, 581 | "metadata": { 582 | "ExecuteTime": { 583 | "end_time": "2020-08-07T19:29:05.039157Z", 584 | "start_time": "2020-08-07T19:29:04.237473Z" 585 | }, 586 | "colab": {}, 587 | "colab_type": "code", 588 | "id": "oxjj_QAfEyZw" 589 | }, 590 | "outputs": [ 591 | { 592 | "data": { 593 | "application/vnd.jupyter.widget-view+json": { 594 | 
"model_id": "6d7164fbcd5143bda3ad980cd86f12b6", 595 | "version_major": 2, 596 | "version_minor": 0 597 | }, 598 | "text/plain": [ 599 | "HBox(children=(FloatProgress(value=0.0, max=29132.0), HTML(value='')))" 600 | ] 601 | }, 602 | "metadata": {}, 603 | "output_type": "display_data" 604 | }, 605 | { 606 | "name": "stdout", 607 | "output_type": "stream", 608 | "text": [ 609 | "\n" 610 | ] 611 | } 612 | ], 613 | "source": [ 614 | "#Make spliting train clients info. Trying to reproduce the situation with test\n", 615 | "#\n", 616 | "X_train = []\n", 617 | "X_train_columns = train.columns[:-1]\n", 618 | "df_train_true = []\n", 619 | "client_index = 0\n", 620 | "\n", 621 | "for line in tqdm_notebook(train.values):\n", 622 | "\n", 623 | " info = line[:8]\n", 624 | " info_products = line[8:-1]\n", 625 | " indexes = [k for k, i in enumerate(info_products) if i == 1]\n", 626 | "\n", 627 | " for i in indexes:\n", 628 | "\n", 629 | " client_index += 1\n", 630 | "\n", 631 | " for k in range(len(info_products)):\n", 632 | "\n", 633 | " if k == i:\n", 634 | "\n", 635 | " info_products_transformed = list(copy.copy(info_products))\n", 636 | " df_train_true.append(info_products)\n", 637 | " info_products_transformed[i] = 0\n", 638 | "\n", 639 | " X_train.append(\n", 640 | " list(info) + info_products_transformed +\n", 641 | " [X_train_columns[8 + k]] + [client_index])\n", 642 | "\n", 643 | "X_train = pd.DataFrame(X_train)\n", 644 | "df_train_true = pd.DataFrame(df_train_true)\n", 645 | "df_train_true.columns = [\n", 646 | " 'P5DA', 'RIBP', '8NN1', '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ',\n", 647 | " 'LJR9', 'N2MW', 'AHXO', 'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D',\n", 648 | " 'J9JW', 'GHYX', 'ECY3'\n", 649 | "]\n", 650 | "X_train.columns = [\n", 651 | " 'ID', 'join_date', 'sex', 'marital_status', 'birth_year', 'branch_code',\n", 652 | " 'occupation_code', 'occupation_category_code', 'P5DA', 'RIBP', '8NN1',\n", 653 | " '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ', 
'LJR9', 'N2MW', 'AHXO',\n", 654 | " 'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D', 'J9JW', 'GHYX', 'ECY3',\n", 655 | " 'product_pred', 'ID2'\n", 656 | "]" 657 | ] 658 | }, 659 | { 660 | "cell_type": "code", 661 | "execution_count": 17, 662 | "metadata": { 663 | "ExecuteTime": { 664 | "end_time": "2020-08-07T19:29:10.768064Z", 665 | "start_time": "2020-08-07T19:29:10.494815Z" 666 | }, 667 | "colab": {}, 668 | "colab_type": "code", 669 | "id": "URdSMgJeOnLE" 670 | }, 671 | "outputs": [ 672 | { 673 | "data": { 674 | "application/vnd.jupyter.widget-view+json": { 675 | "model_id": "4597b1fe75884ba6ba095cbddad5b809", 676 | "version_major": 2, 677 | "version_minor": 0 678 | }, 679 | "text/plain": [ 680 | "HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))" 681 | ] 682 | }, 683 | "metadata": {}, 684 | "output_type": "display_data" 685 | }, 686 | { 687 | "name": "stdout", 688 | "output_type": "stream", 689 | "text": [ 690 | "\n" 691 | ] 692 | } 693 | ], 694 | "source": [ 695 | "#Make info about true values in data of predictions\n", 696 | "#\n", 697 | "X_test = []\n", 698 | "true_values = []\n", 699 | "client_index = 0\n", 700 | "for line in tqdm_notebook(test.values):\n", 701 | "\n", 702 | " client_index += 1\n", 703 | "\n", 704 | " info = line[:8]\n", 705 | " info_products = line[8:-1]\n", 706 | " indexes = [k for k, i in enumerate(info_products) if i == 1]\n", 707 | "\n", 708 | " X_test.append(list(info) + list(info_products) + [client_index])\n", 709 | "\n", 710 | " for true in test.columns[8:][indexes]:\n", 711 | " true_values.append(line[0] + ' X ' + true)\n", 712 | "\n", 713 | "X_test = pd.DataFrame(X_test)\n", 714 | "X_test.columns = [\n", 715 | " 'ID', 'join_date', 'sex', 'marital_status', 'birth_year', 'branch_code',\n", 716 | " 'occupation_code', 'occupation_category_code', 'P5DA', 'RIBP', '8NN1',\n", 717 | " '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ', 'LJR9', 'N2MW', 'AHXO',\n", 718 | " 'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D', 
'J9JW', 'GHYX', 'ECY3',\n", 719 | " 'ID2'\n", 720 | "]" 721 | ] 722 | }, 723 | { 724 | "cell_type": "code", 725 | "execution_count": 18, 726 | "metadata": { 727 | "ExecuteTime": { 728 | "end_time": "2020-08-07T19:29:11.136106Z", 729 | "start_time": "2020-08-07T19:29:11.129621Z" 730 | } 731 | }, 732 | "outputs": [ 733 | { 734 | "data": { 735 | "text/plain": [ 736 | "((29132, 30), (66353, 31))" 737 | ] 738 | }, 739 | "execution_count": 18, 740 | "metadata": {}, 741 | "output_type": "execute_result" 742 | } 743 | ], 744 | "source": [ 745 | "#Checking shapes\n", 746 | "#\n", 747 | "train.shape, X_train.shape" 748 | ] 749 | }, 750 | { 751 | "cell_type": "code", 752 | "execution_count": 19, 753 | "metadata": { 754 | "ExecuteTime": { 755 | "end_time": "2020-08-07T19:29:07.853673Z", 756 | "start_time": "2020-08-07T19:29:07.785086Z" 757 | } 758 | }, 759 | "outputs": [ 760 | { 761 | "data": { 762 | "text/html": [ 763 | "
\n", 764 | "\n", 777 | "\n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | "
IDjoin_datesexmarital_statusbirth_yearbranch_codeoccupation_codeoccupation_category_codeP5DARIBP...FM3XK6QOQBOLJWFNJZ9DJ9JWGHYXECY3product_predID2
04WKQSBB1/2/2019F_sexM_marital_status19871X1H_branch_code2A7I_occupation_codeT4MS_occupation_category_code00...01000000RVSZ1
14WKQSBB1/2/2019F_sexM_marital_status19871X1H_branch_code2A7I_occupation_codeT4MS_occupation_category_code00...00000000K6QO2
\n", 855 | "

2 rows × 31 columns

\n", 856 | "
" 857 | ], 858 | "text/plain": [ 859 | " ID join_date sex marital_status birth_year branch_code \\\n", 860 | "0 4WKQSBB 1/2/2019 F_sex M_marital_status 1987 1X1H_branch_code \n", 861 | "1 4WKQSBB 1/2/2019 F_sex M_marital_status 1987 1X1H_branch_code \n", 862 | "\n", 863 | " occupation_code occupation_category_code P5DA RIBP ... FM3X \\\n", 864 | "0 2A7I_occupation_code T4MS_occupation_category_code 0 0 ... 0 \n", 865 | "1 2A7I_occupation_code T4MS_occupation_category_code 0 0 ... 0 \n", 866 | "\n", 867 | " K6QO QBOL JWFN JZ9D J9JW GHYX ECY3 product_pred ID2 \n", 868 | "0 1 0 0 0 0 0 0 RVSZ 1 \n", 869 | "1 0 0 0 0 0 0 0 K6QO 2 \n", 870 | "\n", 871 | "[2 rows x 31 columns]" 872 | ] 873 | }, 874 | "execution_count": 19, 875 | "metadata": {}, 876 | "output_type": "execute_result" 877 | } 878 | ], 879 | "source": [ 880 | "#Look of train data after alters\n", 881 | "#\n", 882 | "X_train.head(2)" 883 | ] 884 | }, 885 | { 886 | "cell_type": "code", 887 | "execution_count": 20, 888 | "metadata": { 889 | "ExecuteTime": { 890 | "end_time": "2020-08-07T19:29:12.081590Z", 891 | "start_time": "2020-08-07T19:29:12.053903Z" 892 | } 893 | }, 894 | "outputs": [ 895 | { 896 | "data": { 897 | "text/html": [ 898 | "
\n", 899 | "\n", 912 | "\n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | "
IDjoin_datesexmarital_statusbirth_yearbranch_codeoccupation_codeoccupation_category_codeP5DARIBP...BSTQFM3XK6QOQBOLJWFNJZ9DJ9JWGHYXECY3ID2
0F86J5PC1/12/2018M_sexM_marital_status198494KC_branch_codeDZRV_occupation_code90QI_occupation_category_code00...0000000001
1H6141K31/10/2019M_sexM_marital_status19961X1H_branch_codeJ9SY_occupation_code90QI_occupation_category_code00...0010000002
\n", 990 | "

2 rows × 30 columns

\n", 991 | "
" 992 | ], 993 | "text/plain": [ 994 | " ID join_date sex marital_status birth_year branch_code \\\n", 995 | "0 F86J5PC 1/12/2018 M_sex M_marital_status 1984 94KC_branch_code \n", 996 | "1 H6141K3 1/10/2019 M_sex M_marital_status 1996 1X1H_branch_code \n", 997 | "\n", 998 | " occupation_code occupation_category_code P5DA RIBP ... BSTQ \\\n", 999 | "0 DZRV_occupation_code 90QI_occupation_category_code 0 0 ... 0 \n", 1000 | "1 J9SY_occupation_code 90QI_occupation_category_code 0 0 ... 0 \n", 1001 | "\n", 1002 | " FM3X K6QO QBOL JWFN JZ9D J9JW GHYX ECY3 ID2 \n", 1003 | "0 0 0 0 0 0 0 0 0 1 \n", 1004 | "1 0 1 0 0 0 0 0 0 2 \n", 1005 | "\n", 1006 | "[2 rows x 30 columns]" 1007 | ] 1008 | }, 1009 | "execution_count": 20, 1010 | "metadata": {}, 1011 | "output_type": "execute_result" 1012 | } 1013 | ], 1014 | "source": [ 1015 | "#Look of test data after alters\n", 1016 | "#\n", 1017 | "X_test.head(2)" 1018 | ] 1019 | }, 1020 | { 1021 | "cell_type": "code", 1022 | "execution_count": 21, 1023 | "metadata": {}, 1024 | "outputs": [ 1025 | { 1026 | "data": { 1027 | "text/html": [ 1028 | "
\n", 1029 | "\n", 1042 | "\n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | "
P5DARIBP8NN17POT66FJGYSRSOP4RVSZPYUQLJR9...AHXOBSTQFM3XK6QOQBOLJWFNJZ9DJ9JWGHYXECY3
00000000100...0001000000
10000000100...0001000000
\n", 1120 | "

2 rows × 21 columns

\n", 1121 | "
" 1122 | ], 1123 | "text/plain": [ 1124 | " P5DA RIBP 8NN1 7POT 66FJ GYSR SOP4 RVSZ PYUQ LJR9 ... AHXO \\\n", 1125 | "0 0 0 0 0 0 0 0 1 0 0 ... 0 \n", 1126 | "1 0 0 0 0 0 0 0 1 0 0 ... 0 \n", 1127 | "\n", 1128 | " BSTQ FM3X K6QO QBOL JWFN JZ9D J9JW GHYX ECY3 \n", 1129 | "0 0 0 1 0 0 0 0 0 0 \n", 1130 | "1 0 0 1 0 0 0 0 0 0 \n", 1131 | "\n", 1132 | "[2 rows x 21 columns]" 1133 | ] 1134 | }, 1135 | "execution_count": 21, 1136 | "metadata": {}, 1137 | "output_type": "execute_result" 1138 | } 1139 | ], 1140 | "source": [ 1141 | "#It is true values for train data\n", 1142 | "#\n", 1143 | "df_train_true.head(2)" 1144 | ] 1145 | }, 1146 | { 1147 | "cell_type": "markdown", 1148 | "metadata": { 1149 | "colab_type": "text", 1150 | "id": "s1KcI9I6g1de" 1151 | }, 1152 | "source": [ 1153 | "# Reshaping data" 1154 | ] 1155 | }, 1156 | { 1157 | "cell_type": "code", 1158 | "execution_count": 22, 1159 | "metadata": { 1160 | "ExecuteTime": { 1161 | "end_time": "2020-08-07T19:29:13.305572Z", 1162 | "start_time": "2020-08-07T19:29:13.262176Z" 1163 | }, 1164 | "code_folding": [], 1165 | "colab": {}, 1166 | "colab_type": "code", 1167 | "id": "ttUKNdnAczTd" 1168 | }, 1169 | "outputs": [], 1170 | "source": [ 1171 | "#Make data with reshape\n", 1172 | "#\n", 1173 | "features_train = []\n", 1174 | "features_test = []\n", 1175 | "columns = []\n", 1176 | "\n", 1177 | "append_features = [\n", 1178 | " 'P5DA', 'RIBP', '8NN1', '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ',\n", 1179 | " 'LJR9', 'N2MW', 'AHXO', 'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D',\n", 1180 | " 'J9JW', 'GHYX', 'ECY3', 'ID', 'ID2', 'join_date', 'sex', 'marital_status',\n", 1181 | " 'branch_code', 'occupation_code', 'occupation_category_code', 'birth_year'\n", 1182 | "]\n", 1183 | "for f in append_features:\n", 1184 | "\n", 1185 | " features_train.append(X_train[f].values.reshape(-1, 1))\n", 1186 | " features_test.append(X_test[f].values.reshape(-1, 1))\n", 1187 | "\n", 1188 | " columns.append(np.array([f]))\n", 1189 | 
"\n", 1190 | "y_train = X_train[['product_pred']]" 1191 | ] 1192 | }, 1193 | { 1194 | "cell_type": "code", 1195 | "execution_count": 23, 1196 | "metadata": { 1197 | "ExecuteTime": { 1198 | "end_time": "2020-08-07T19:29:13.988566Z", 1199 | "start_time": "2020-08-07T19:29:13.870945Z" 1200 | }, 1201 | "colab": {}, 1202 | "colab_type": "code", 1203 | "id": "kWCK5LrfkPp-" 1204 | }, 1205 | "outputs": [], 1206 | "source": [ 1207 | "features_train = np.concatenate(features_train, axis=1)\n", 1208 | "features_test = np.concatenate(features_test, axis=1)\n", 1209 | "columns = np.concatenate(np.array(columns))\n", 1210 | "\n", 1211 | "X_train = pd.DataFrame(features_train)\n", 1212 | "X_train.columns = columns\n", 1213 | "\n", 1214 | "X_test = pd.DataFrame(features_test)\n", 1215 | "X_test.columns = columns" 1216 | ] 1217 | }, 1218 | { 1219 | "cell_type": "markdown", 1220 | "metadata": { 1221 | "colab_type": "text", 1222 | "id": "YVUKKo3llI0y" 1223 | }, 1224 | "source": [ 1225 | "# Add new features" 1226 | ] 1227 | }, 1228 | { 1229 | "cell_type": "code", 1230 | "execution_count": 24, 1231 | "metadata": { 1232 | "ExecuteTime": { 1233 | "end_time": "2020-08-07T19:29:17.239570Z", 1234 | "start_time": "2020-08-07T19:29:16.140241Z" 1235 | }, 1236 | "colab": {}, 1237 | "colab_type": "code", 1238 | "id": "NhfA7yullH76" 1239 | }, 1240 | "outputs": [], 1241 | "source": [ 1242 | "#Turn the join date into features: year, month, day, day of week, day of year of join; add the age of clients at joining\n", 1243 | "#\n", 1244 | "for df in [X_train, X_test]:\n", 1245 | " df['join_date'] = pd.to_datetime(df.join_date, format='%d/%m/%Y')\n", 1246 | "\n", 1247 | " # Days elapsed since 2010-01-01; pd.Timestamp is used because the pd.datetime alias is deprecated\n df['from_begin'] = (df.join_date - pd.Timestamp(2010, 1, 1)).dt.days\n", 1248 | "\n", 1249 | " df['join_day'] = df['join_date'].dt.day\n", 1250 | " df['join_month'] = df['join_date'].dt.month\n", 1251 | " df['join_year'] = df['join_date'].dt.year\n", 1252 | " df['dayofweek'] = df['join_date'].dt.weekday\n", 1253 | " df['day_of_year'] = 
df['join_date'].dt.dayofyear\n", 1254 | "\n", 1255 | " df['age'] = (df['join_year'] - df['birth_year']).astype(float)" 1256 | ] 1257 | }, 1258 | { 1259 | "cell_type": "code", 1260 | "execution_count": 25, 1261 | "metadata": { 1262 | "ExecuteTime": { 1263 | "end_time": "2020-08-07T19:31:04.854098Z", 1264 | "start_time": "2020-08-07T19:31:04.805496Z" 1265 | }, 1266 | "code_folding": [] 1267 | }, 1268 | "outputs": [], 1269 | "source": [ 1270 | "#Concatenating train and test data; index labels are kept as-is, so train and test labels overlap\n", 1271 | "#\n", 1272 | "common = pd.concat([X_train, X_test])" 1273 | ] 1274 | }, 1275 | { 1276 | "cell_type": "code", 1277 | "execution_count": 26, 1278 | "metadata": {}, 1279 | "outputs": [], 1280 | "source": [ 1281 | "# Years from each branch's earliest join year to the current year (NOTE: depends on the date the notebook is run)\ncommon['branch_start']=pd.Timestamp.now().year-common.groupby('branch_code')['join_year'].transform('min')" 1282 | ] 1283 | }, 1284 | { 1285 | "cell_type": "code", 1286 | "execution_count": 27, 1287 | "metadata": {}, 1288 | "outputs": [], 1289 | "source": [ 1290 | "def transform(df, row):\n", 1291 | "    \"\"\"Add column row[0] to df: row[2] aggregated with row[3] within groups of row[1].\"\"\"\n    # Group the frame that was passed in rather than the global `common`.\n    df[row[0]] = df.groupby(row[1])[row[2]].transform(row[3])" 1292 | ] 1293 | }, 1294 | { 1295 | "cell_type": "code", 1296 | "execution_count": 28, 1297 | "metadata": {}, 1298 | "outputs": [], 1299 | "source": [ 1300 | "row_features = [['nuniq_people', 'branch_code', 'ID', 'nunique'],\n", 1301 | " ['nuniq_branch_in_year', 'join_year', 'branch_code', 'nunique'], \n", 1302 | " ['nuniq_year', 'branch_code', 'join_year', 'nunique'], \n", 1303 | " ['nuniq_month', 'branch_code', 'join_month', 'nunique'], \n", 1304 | " ['mean_age_in_branch', 'branch_code', 'age', 'mean'],\n", 1305 | " ['std_age_in_branch', 'branch_code', 'age', 'std'],\n", 1306 | " ['median_age_in_branch', 'branch_code', 'age', 'median'],\n", 1307 | " ['mean_age_in_occupation', 'occupation_code', 'age', 'mean'],\n", 1308 | " ['std_age_in_occupation', 'occupation_code', 'age', 'std'],\n", 1309 | " ['median_age_in_occupation', 'occupation_code', 'age', 'median']]\n", 1310 | "for row in row_features:\n", 1311 | " 
transform(common,row)" 1312 | ] 1313 | }, 1314 | { 1315 | "cell_type": "code", 1316 | "execution_count": 29, 1317 | "metadata": {}, 1318 | "outputs": [], 1319 | "source": [ 1320 | "common['birth_year_binary']= pd.cut(common['birth_year'], bins=5)\n", 1321 | "\n", 1322 | "common['branch_ocupation']=common['branch_code']+'_'+common['occupation_code']\n", 1323 | "common['branch_ocupcode']=common['branch_code']+'_'+common['occupation_category_code']" 1324 | ] 1325 | }, 1326 | { 1327 | "cell_type": "code", 1328 | "execution_count": 30, 1329 | "metadata": {}, 1330 | "outputs": [], 1331 | "source": [ 1332 | "common['Number_of_Insurance_Bought']=common.iloc[:, :21].sum(axis=1)\n", 1333 | "\n", 1334 | "def mapper(common):\n", 1335 | " if common['Number_of_Insurance_Bought']==1:\n", 1336 | " return 'One'\n", 1337 | " elif (common['Number_of_Insurance_Bought']>1) & (common['Number_of_Insurance_Bought']<5):\n", 1338 | " return 'Medium'\n", 1339 | " elif (common['Number_of_Insurance_Bought']>4 )& (common['Number_of_Insurance_Bought']<8):\n", 1340 | " return 'High' \n", 1341 | " else:\n", 1342 | " return 'Too High' \n", 1343 | "common['Insurance_Count']=common.apply(lambda common:mapper(common) ,axis = 1)\n", 1344 | "del common['Number_of_Insurance_Bought']" 1345 | ] 1346 | }, 1347 | { 1348 | "cell_type": "code", 1349 | "execution_count": 31, 1350 | "metadata": {}, 1351 | "outputs": [], 1352 | "source": [ 1353 | "for name in [\n", 1354 | " 'sex', 'marital_status', 'occupation_code', 'occupation_category_code',\n", 1355 | " 'birth_year_binary', 'branch_ocupation', 'branch_ocupcode', 'Insurance_Count'\n", 1356 | "]:\n", 1357 | " freq = (common.groupby(name).size()) / len(common)\n", 1358 | " common[name + '_freq'] = common[name].apply(lambda x: freq[x])\n", 1359 | " common[name + '_freq'] = common[name + '_freq'].astype(float)" 1360 | ] 1361 | }, 1362 | { 1363 | "cell_type": "code", 1364 | "execution_count": 32, 1365 | "metadata": {}, 1366 | "outputs": [], 1367 | "source": [ 1368 
| "le_ins = LabelEncoder()\n", 1369 | "common['Insurance_Count'] = le_ins.fit_transform(common['Insurance_Count'])" 1370 | ] 1371 | }, 1372 | { 1373 | "cell_type": "code", 1374 | "execution_count": 33, 1375 | "metadata": {}, 1376 | "outputs": [ 1377 | { 1378 | "data": { 1379 | "application/vnd.jupyter.widget-view+json": { 1380 | "model_id": "814282d1539a46fe838138df39f56a12", 1381 | "version_major": 2, 1382 | "version_minor": 0 1383 | }, 1384 | "text/plain": [ 1385 | "HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))" 1386 | ] 1387 | }, 1388 | "metadata": {}, 1389 | "output_type": "display_data" 1390 | }, 1391 | { 1392 | "name": "stdout", 1393 | "output_type": "stream", 1394 | "text": [ 1395 | "\n" 1396 | ] 1397 | } 1398 | ], 1399 | "source": [ 1400 | "# Key every row by the sorted names of the products it holds, e.g. 'K6QO_RVSZ'.\n", 1401 | "# Rows are processed by position: common keeps the overlapping index labels of train and test,\n# so writing through `common.index == i` would let a test row overwrite the train row with the same label.\n", 1402 | "product_flags = common[names_products].values == 1\n", 1403 | "common['product_comb'] = [\n", 1404 | "    '_'.join(sorted(p for p, owned in zip(names_products, flags) if owned))\n", 1405 | "    for flags in tqdm_notebook(product_flags)\n]\n", 1406 | "common['product_comb'] = le_ins.fit_transform(common['product_comb'])" 1407 | ] 1408 | }, 1409 | { 1410 | "cell_type": "code", 1411 | "execution_count": 34, 1412 | "metadata": {}, 1413 | "outputs": [ 1414 | { 1415 | "data": { 1416 | "application/vnd.jupyter.widget-view+json": { 1417 | "model_id": "e2b5ae2003cc4d2396effe9199d79d3a", 1418 | "version_major": 2, 1419 | "version_minor": 0 1420 | }, 1421 | "text/plain": [ 1422 | "HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))" 1423 | ] 1424 | }, 1425 | "metadata": {}, 1426 | "output_type": "display_data" 1427 | }, 1428 | { 1429 | "name": "stdout", 1430 | "output_type": "stream", 1431 | "text": [ 1432 | "\n" 1433 | ] 1434 | } 1435 | ], 1436 | "source": [ 1437 | "for col in tqdm_notebook(names_products):\n", 1438 | " for cols in names_products:\n", 1439 | " if col!=cols:\n", 1440 | " common[col+'_'+cols]=common.groupby(col)[cols].transform(sum)" 1441 | ] 1442 | }, 
1443 | { 1444 | "cell_type": "code", 1445 | "execution_count": 35, 1446 | "metadata": {}, 1447 | "outputs": [], 1448 | "source": [ 1449 | "common.drop(\n", 1450 | " columns=['birth_year_binary', 'branch_ocupation', 'branch_ocupcode'],\n", 1451 | " inplace=True)" 1452 | ] 1453 | }, 1454 | { 1455 | "cell_type": "code", 1456 | "execution_count": 36, 1457 | "metadata": { 1458 | "scrolled": true 1459 | }, 1460 | "outputs": [ 1461 | { 1462 | "data": { 1463 | "application/vnd.jupyter.widget-view+json": { 1464 | "model_id": "a1a0bab252f64d8992e09fd5c35fe805", 1465 | "version_major": 2, 1466 | "version_minor": 0 1467 | }, 1468 | "text/plain": [ 1469 | "HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))" 1470 | ] 1471 | }, 1472 | "metadata": {}, 1473 | "output_type": "display_data" 1474 | }, 1475 | { 1476 | "name": "stdout", 1477 | "output_type": "stream", 1478 | "text": [ 1479 | "\n" 1480 | ] 1481 | } 1482 | ], 1483 | "source": [ 1484 | "#Approximate counting of days after open branches and after first buy of each product\n", 1485 | "#\n", 1486 | "for code in tqdm_notebook(common.branch_code.unique()):\n", 1487 | " common.loc[common.branch_code == code, 'from_arise_branch'] = \\\n", 1488 | " common.loc[common.branch_code == code, 'from_begin'] - common.loc[common.branch_code == code, 'from_begin'].min()\n", 1489 | " for product in names_products:\n", 1490 | " common.loc[common.branch_code == code, 'from_arise_product_'+product+'_in_branch'] = \\\n", 1491 | " common.loc[common.branch_code == code, 'from_begin'] - common.loc[(common.branch_code == code)&(common[product]==1), 'from_begin'].min()" 1492 | ] 1493 | }, 1494 | { 1495 | "cell_type": "code", 1496 | "execution_count": 37, 1497 | "metadata": { 1498 | "code_folding": [] 1499 | }, 1500 | "outputs": [ 1501 | { 1502 | "data": { 1503 | "application/vnd.jupyter.widget-view+json": { 1504 | "model_id": "5956fe24f4ff4ed19eac512f2988de2f", 1505 | "version_major": 2, 1506 | "version_minor": 0 1507 | }, 1508 | 
"text/plain": [ 1509 | "HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))" 1510 | ] 1511 | }, 1512 | "metadata": {}, 1513 | "output_type": "display_data" 1514 | }, 1515 | { 1516 | "name": "stdout", 1517 | "output_type": "stream", 1518 | "text": [ 1519 | "\n" 1520 | ] 1521 | } 1522 | ], 1523 | "source": [ 1524 | "for product in tqdm_notebook(names_products):\n", 1525 | " common['from_arise_product_'+product] = (common['join_date'] - common.loc[common[product] == 1, 'join_date'].min()).dt.days\n", 1526 | " common[product+'_'+'sum_in_branch']=common.groupby('branch_code')[product].transform(sum)\n", 1527 | " common[product+'_'+'_age_mean']=common.groupby(product)['age'].transform('mean')\n", 1528 | " common[product+'_'+'_age_std']=common.groupby(product)['age'].transform('std')\n", 1529 | " common[product+'_'+'_age_median']=common.groupby(product)['age'].transform('median')\n", 1530 | " common[product+'_'+'_sum_join_year']=common.groupby('join_year')[product].transform(sum)" 1531 | ] 1532 | }, 1533 | { 1534 | "cell_type": "code", 1535 | "execution_count": 41, 1536 | "metadata": {}, 1537 | "outputs": [], 1538 | "source": [ 1539 | "#Splitting the concatenated data back into train and test\n", 1540 | "#\n", 1541 | "X_train = common[:66353]\n", 1542 | "X_test = common[66353:]" 1543 | ] 1544 | }, 1545 | { 1546 | "cell_type": "markdown", 1547 | "metadata": { 1548 | "colab_type": "text", 1549 | "id": "GDTy7qyulLoP" 1550 | }, 1551 | "source": [ 1552 | "# Encoding" 1553 | ] 1554 | }, 1555 | { 1556 | "cell_type": "code", 1557 | "execution_count": 46, 1558 | "metadata": {}, 1559 | "outputs": [], 1560 | "source": [ 1561 | "#Encoding the target values, which are the names of the missing products\n", 1562 | "#\n", 1563 | "le = LabelEncoder()\n", 1564 | "le.fit(y_train.iloc[:, 0])\n", 1565 | "\n", 1566 | "y_train = pd.DataFrame(le.transform(y_train.iloc[:, 0]))\n", 1567 | "y_train.columns = ['target']" 1568 | ] 1569 | }, 1570 | { 1571 | "cell_type": "code", 1572 | "execution_count": 
48, 1573 | "metadata": { 1574 | "code_folding": [] 1575 | }, 1576 | "outputs": [], 1577 | "source": [ 1578 | "#Merging in the amount of purchased products ('sum' column)\n", 1579 | "#\n", 1580 | "X_train = X_train.merge(train[['ID', 'sum']])\n", 1581 | "X_test = X_test.merge(test[['ID', 'sum']])" 1582 | ] 1583 | }, 1584 | { 1585 | "cell_type": "code", 1586 | "execution_count": 49, 1587 | "metadata": { 1588 | "code_folding": [] 1589 | }, 1590 | "outputs": [], 1591 | "source": [ 1592 | "#Casting features to string type so they can be used as cat_features\n", 1593 | "#\n", 1594 | "for df in [X_train, X_test]:\n", 1595 | " df['dayofweek_cat'] = df['dayofweek'].astype(str)\n", 1596 | " df['from_begin_cat'] = df['from_begin'].astype(str)\n", 1597 | " df['birth_year'] = df['birth_year'].astype(str)\n", 1598 | " df['join_year_cat'] = df['join_year'].astype(str)\n", 1599 | " df['sum_cat'] = df['sum'].astype(str)\n", 1600 | " df['day_of_year_cat'] = df['day_of_year'].astype(str)" 1601 | ] 1602 | }, 1603 | { 1604 | "cell_type": "markdown", 1605 | "metadata": { 1606 | "colab_type": "text", 1607 | "id": "qn9zHq0iqhA3" 1608 | }, 1609 | "source": [ 1610 | "# Model" 1611 | ] 1612 | }, 1613 | { 1614 | "cell_type": "markdown", 1615 | "metadata": {}, 1616 | "source": [ 1617 | "## Model main" 1618 | ] 1619 | }, 1620 | { 1621 | "cell_type": "code", 1622 | "execution_count": 50, 1623 | "metadata": {}, 1624 | "outputs": [], 1625 | "source": [ 1626 | "#Names of cat_features\n", 1627 | "#\n", 1628 | "cat_features = [\n", 1629 | " 'sex',\n", 1630 | " 'marital_status',\n", 1631 | " 'branch_code',\n", 1632 | " 'occupation_category_code',\n", 1633 | " 'occupation_code',\n", 1634 | " 'dayofweek_cat',\n", 1635 | " 'from_begin_cat',\n", 1636 | " 'sum_cat',\n", 1637 | " 'birth_year',\n", 1638 | " 'join_year_cat'\n", 1639 | "]" 1640 | ] 1641 | }, 1642 | { 1643 | "cell_type": "code", 1644 | "execution_count": 51, 1645 | "metadata": { 1646 | "code_folding": [], 1647 | "scrolled": true 1648 | }, 1649 | "outputs": [ 
1650 | { 1651 | "name": "stdout", 1652 | "output_type": "stream", 1653 | "text": [ 1654 | "Fold 1\n", 1655 | "0:\tlearn: 2.9445576\ttest: 2.9428904\tbest: 2.9428904 (0)\ttotal: 77.7ms\tremaining: 19m 26s\n", 1656 | "100:\tlearn: 0.9699479\ttest: 0.9618732\tbest: 0.9618732 (100)\ttotal: 5.8s\tremaining: 14m 15s\n", 1657 | "200:\tlearn: 0.6838025\ttest: 0.6798409\tbest: 0.6798409 (200)\ttotal: 11.4s\tremaining: 13m 58s\n", 1658 | "300:\tlearn: 0.5793816\ttest: 0.5790330\tbest: 0.5790330 (300)\ttotal: 16.8s\tremaining: 13m 41s\n", 1659 | "400:\tlearn: 0.5248620\ttest: 0.5276316\tbest: 0.5276316 (400)\ttotal: 22s\tremaining: 13m 21s\n", 1660 | "500:\tlearn: 0.4913727\ttest: 0.4965360\tbest: 0.4965360 (500)\ttotal: 27.3s\tremaining: 13m 11s\n", 1661 | "600:\tlearn: 0.4687746\ttest: 0.4759556\tbest: 0.4759556 (600)\ttotal: 32.6s\tremaining: 13m 1s\n", 1662 | "700:\tlearn: 0.4517229\ttest: 0.4605643\tbest: 0.4605643 (700)\ttotal: 37.9s\tremaining: 12m 53s\n", 1663 | "800:\tlearn: 0.4381602\ttest: 0.4486232\tbest: 0.4486232 (800)\ttotal: 43.2s\tremaining: 12m 45s\n", 1664 | "900:\tlearn: 0.4274216\ttest: 0.4394789\tbest: 0.4394789 (900)\ttotal: 48.4s\tremaining: 12m 37s\n", 1665 | "1000:\tlearn: 0.4178731\ttest: 0.4313687\tbest: 0.4313687 (1000)\ttotal: 53.6s\tremaining: 12m 29s\n", 1666 | "1100:\tlearn: 0.4100848\ttest: 0.4250238\tbest: 0.4250238 (1100)\ttotal: 58.9s\tremaining: 12m 23s\n", 1667 | "1200:\tlearn: 0.4027702\ttest: 0.4192038\tbest: 0.4192038 (1200)\ttotal: 1m 4s\tremaining: 12m 17s\n", 1668 | "1300:\tlearn: 0.3964511\ttest: 0.4144097\tbest: 0.4144097 (1300)\ttotal: 1m 9s\tremaining: 12m 11s\n", 1669 | "1400:\tlearn: 0.3907707\ttest: 0.4101905\tbest: 0.4101905 (1400)\ttotal: 1m 14s\tremaining: 12m 5s\n", 1670 | "1500:\tlearn: 0.3856775\ttest: 0.4065659\tbest: 0.4065659 (1500)\ttotal: 1m 19s\tremaining: 11m 59s\n", 1671 | "1600:\tlearn: 0.3810719\ttest: 0.4034603\tbest: 0.4034603 (1600)\ttotal: 1m 25s\tremaining: 11m 53s\n", 1672 | "1700:\tlearn: 
0.3768876\ttest: 0.4007674\tbest: 0.4007674 (1700)\ttotal: 1m 30s\tremaining: 11m 47s\n", 1673 | "1800:\tlearn: 0.3732356\ttest: 0.3986237\tbest: 0.3986237 (1800)\ttotal: 1m 35s\tremaining: 11m 41s\n", 1674 | "1900:\tlearn: 0.3697762\ttest: 0.3965820\tbest: 0.3965820 (1900)\ttotal: 1m 40s\tremaining: 11m 34s\n", 1675 | "2000:\tlearn: 0.3666192\ttest: 0.3947294\tbest: 0.3947294 (2000)\ttotal: 1m 45s\tremaining: 11m 28s\n", 1676 | "2100:\tlearn: 0.3636188\ttest: 0.3931982\tbest: 0.3931982 (2100)\ttotal: 1m 51s\tremaining: 11m 21s\n", 1677 | "2200:\tlearn: 0.3609654\ttest: 0.3918206\tbest: 0.3918206 (2200)\ttotal: 1m 56s\tremaining: 11m 15s\n", 1678 | "2300:\tlearn: 0.3584496\ttest: 0.3905929\tbest: 0.3905929 (2300)\ttotal: 2m 1s\tremaining: 11m 9s\n", 1679 | "2400:\tlearn: 0.3560133\ttest: 0.3894057\tbest: 0.3894057 (2400)\ttotal: 2m 6s\tremaining: 11m 2s\n", 1680 | "2500:\tlearn: 0.3538295\ttest: 0.3882776\tbest: 0.3882776 (2500)\ttotal: 2m 11s\tremaining: 10m 56s\n", 1681 | "2600:\tlearn: 0.3518477\ttest: 0.3873623\tbest: 0.3873623 (2600)\ttotal: 2m 16s\tremaining: 10m 49s\n", 1682 | "2700:\tlearn: 0.3497690\ttest: 0.3863694\tbest: 0.3863694 (2700)\ttotal: 2m 21s\tremaining: 10m 43s\n", 1683 | "2800:\tlearn: 0.3478093\ttest: 0.3854819\tbest: 0.3854819 (2800)\ttotal: 2m 26s\tremaining: 10m 37s\n", 1684 | "2900:\tlearn: 0.3460032\ttest: 0.3848065\tbest: 0.3848065 (2900)\ttotal: 2m 31s\tremaining: 10m 31s\n", 1685 | "3000:\tlearn: 0.3442535\ttest: 0.3841118\tbest: 0.3841080 (2999)\ttotal: 2m 36s\tremaining: 10m 25s\n", 1686 | "3100:\tlearn: 0.3425588\ttest: 0.3834379\tbest: 0.3834379 (3100)\ttotal: 2m 41s\tremaining: 10m 19s\n", 1687 | "3200:\tlearn: 0.3410767\ttest: 0.3829064\tbest: 0.3829029 (3198)\ttotal: 2m 46s\tremaining: 10m 13s\n", 1688 | "3300:\tlearn: 0.3395174\ttest: 0.3823415\tbest: 0.3823415 (3300)\ttotal: 2m 51s\tremaining: 10m 6s\n", 1689 | "3400:\tlearn: 0.3379953\ttest: 0.3817752\tbest: 0.3817748 (3399)\ttotal: 2m 56s\tremaining: 10m 1s\n", 1690 | 
"3500:\tlearn: 0.3365688\ttest: 0.3813758\tbest: 0.3813758 (3500)\ttotal: 3m 1s\tremaining: 9m 55s\n", 1691 | "3600:\tlearn: 0.3352206\ttest: 0.3809488\tbest: 0.3809488 (3600)\ttotal: 3m 6s\tremaining: 9m 49s\n", 1692 | "3700:\tlearn: 0.3339196\ttest: 0.3804761\tbest: 0.3804761 (3700)\ttotal: 3m 11s\tremaining: 9m 43s\n", 1693 | "3800:\tlearn: 0.3326430\ttest: 0.3801036\tbest: 0.3801036 (3800)\ttotal: 3m 15s\tremaining: 9m 37s\n", 1694 | "3900:\tlearn: 0.3313024\ttest: 0.3797104\tbest: 0.3797104 (3900)\ttotal: 3m 20s\tremaining: 9m 31s\n", 1695 | "4000:\tlearn: 0.3301287\ttest: 0.3793236\tbest: 0.3793236 (4000)\ttotal: 3m 25s\tremaining: 9m 25s\n", 1696 | "4100:\tlearn: 0.3288722\ttest: 0.3789769\tbest: 0.3789769 (4100)\ttotal: 3m 30s\tremaining: 9m 19s\n", 1697 | "4200:\tlearn: 0.3275755\ttest: 0.3786657\tbest: 0.3786657 (4200)\ttotal: 3m 35s\tremaining: 9m 14s\n", 1698 | "4300:\tlearn: 0.3263775\ttest: 0.3783198\tbest: 0.3783198 (4300)\ttotal: 3m 40s\tremaining: 9m 8s\n", 1699 | "4400:\tlearn: 0.3252476\ttest: 0.3780543\tbest: 0.3780543 (4400)\ttotal: 3m 45s\tremaining: 9m 2s\n", 1700 | "4500:\tlearn: 0.3241109\ttest: 0.3777310\tbest: 0.3777310 (4500)\ttotal: 3m 50s\tremaining: 8m 56s\n", 1701 | "4600:\tlearn: 0.3229418\ttest: 0.3775090\tbest: 0.3775086 (4599)\ttotal: 3m 55s\tremaining: 8m 51s\n", 1702 | "4700:\tlearn: 0.3218174\ttest: 0.3772874\tbest: 0.3772870 (4699)\ttotal: 4m\tremaining: 8m 45s\n", 1703 | "4800:\tlearn: 0.3207294\ttest: 0.3770379\tbest: 0.3770379 (4800)\ttotal: 4m 4s\tremaining: 8m 40s\n", 1704 | "4900:\tlearn: 0.3196369\ttest: 0.3767789\tbest: 0.3767789 (4900)\ttotal: 4m 9s\tremaining: 8m 34s\n", 1705 | "5000:\tlearn: 0.3185608\ttest: 0.3765598\tbest: 0.3765598 (5000)\ttotal: 4m 14s\tremaining: 8m 29s\n", 1706 | "5100:\tlearn: 0.3174650\ttest: 0.3762987\tbest: 0.3762987 (5100)\ttotal: 4m 19s\tremaining: 8m 23s\n", 1707 | "5200:\tlearn: 0.3164677\ttest: 0.3760948\tbest: 0.3760948 (5200)\ttotal: 4m 24s\tremaining: 8m 18s\n", 1708 | 
"5300:\tlearn: 0.3155009\ttest: 0.3759666\tbest: 0.3759542 (5289)\ttotal: 4m 29s\tremaining: 8m 13s\n", 1709 | "5400:\tlearn: 0.3145589\ttest: 0.3758027\tbest: 0.3758027 (5400)\ttotal: 4m 34s\tremaining: 8m 7s\n", 1710 | "5500:\tlearn: 0.3136438\ttest: 0.3756387\tbest: 0.3756387 (5500)\ttotal: 4m 39s\tremaining: 8m 2s\n", 1711 | "5600:\tlearn: 0.3126727\ttest: 0.3754576\tbest: 0.3754576 (5600)\ttotal: 4m 44s\tremaining: 7m 56s\n", 1712 | "5700:\tlearn: 0.3117880\ttest: 0.3752937\tbest: 0.3752937 (5700)\ttotal: 4m 48s\tremaining: 7m 51s\n", 1713 | "5800:\tlearn: 0.3107715\ttest: 0.3751184\tbest: 0.3751184 (5800)\ttotal: 4m 53s\tremaining: 7m 46s\n", 1714 | "5900:\tlearn: 0.3099772\ttest: 0.3749611\tbest: 0.3749544 (5897)\ttotal: 4m 58s\tremaining: 7m 40s\n", 1715 | "6000:\tlearn: 0.3090775\ttest: 0.3748101\tbest: 0.3748101 (6000)\ttotal: 5m 3s\tremaining: 7m 35s\n", 1716 | "6100:\tlearn: 0.3082600\ttest: 0.3746978\tbest: 0.3746978 (6100)\ttotal: 5m 8s\tremaining: 7m 30s\n", 1717 | "6200:\tlearn: 0.3073831\ttest: 0.3746079\tbest: 0.3746074 (6199)\ttotal: 5m 13s\tremaining: 7m 24s\n", 1718 | "6300:\tlearn: 0.3065163\ttest: 0.3744541\tbest: 0.3744541 (6300)\ttotal: 5m 18s\tremaining: 7m 19s\n", 1719 | "6400:\tlearn: 0.3056747\ttest: 0.3743266\tbest: 0.3743266 (6400)\ttotal: 5m 23s\tremaining: 7m 14s\n", 1720 | "6500:\tlearn: 0.3048217\ttest: 0.3741856\tbest: 0.3741856 (6500)\ttotal: 5m 28s\tremaining: 7m 8s\n", 1721 | "6600:\tlearn: 0.3039494\ttest: 0.3740515\tbest: 0.3740515 (6600)\ttotal: 5m 33s\tremaining: 7m 3s\n", 1722 | "6700:\tlearn: 0.3031394\ttest: 0.3739326\tbest: 0.3739326 (6700)\ttotal: 5m 38s\tremaining: 6m 58s\n", 1723 | "6800:\tlearn: 0.3023120\ttest: 0.3738246\tbest: 0.3738200 (6792)\ttotal: 5m 42s\tremaining: 6m 53s\n", 1724 | "6900:\tlearn: 0.3014222\ttest: 0.3737430\tbest: 0.3737430 (6900)\ttotal: 5m 47s\tremaining: 6m 48s\n", 1725 | "7000:\tlearn: 0.3006848\ttest: 0.3736238\tbest: 0.3736229 (6999)\ttotal: 5m 52s\tremaining: 6m 43s\n", 1726 | 
"7100:\tlearn: 0.2999227\ttest: 0.3735453\tbest: 0.3735441 (7095)\ttotal: 5m 57s\tremaining: 6m 37s\n", 1727 | "7200:\tlearn: 0.2990862\ttest: 0.3734714\tbest: 0.3734714 (7200)\ttotal: 6m 2s\tremaining: 6m 32s\n", 1728 | "7300:\tlearn: 0.2982340\ttest: 0.3733713\tbest: 0.3733706 (7296)\ttotal: 6m 7s\tremaining: 6m 27s\n", 1729 | "7400:\tlearn: 0.2974178\ttest: 0.3732681\tbest: 0.3732632 (7393)\ttotal: 6m 12s\tremaining: 6m 22s\n", 1730 | "7500:\tlearn: 0.2965426\ttest: 0.3731930\tbest: 0.3731913 (7499)\ttotal: 6m 17s\tremaining: 6m 17s\n", 1731 | "7600:\tlearn: 0.2957518\ttest: 0.3731020\tbest: 0.3731008 (7597)\ttotal: 6m 22s\tremaining: 6m 12s\n", 1732 | "7700:\tlearn: 0.2949449\ttest: 0.3730386\tbest: 0.3730376 (7695)\ttotal: 6m 27s\tremaining: 6m 7s\n", 1733 | "7800:\tlearn: 0.2941542\ttest: 0.3730005\tbest: 0.3729990 (7765)\ttotal: 6m 32s\tremaining: 6m 2s\n", 1734 | "7900:\tlearn: 0.2934190\ttest: 0.3729462\tbest: 0.3729462 (7900)\ttotal: 6m 37s\tremaining: 5m 56s\n", 1735 | "8000:\tlearn: 0.2926701\ttest: 0.3729496\tbest: 0.3729383 (7953)\ttotal: 6m 42s\tremaining: 5m 51s\n", 1736 | "8100:\tlearn: 0.2919350\ttest: 0.3729189\tbest: 0.3729189 (8100)\ttotal: 6m 47s\tremaining: 5m 46s\n", 1737 | "8200:\tlearn: 0.2911466\ttest: 0.3728935\tbest: 0.3728867 (8195)\ttotal: 6m 52s\tremaining: 5m 41s\n", 1738 | "8300:\tlearn: 0.2904501\ttest: 0.3728195\tbest: 0.3728104 (8281)\ttotal: 6m 57s\tremaining: 5m 36s\n", 1739 | "8400:\tlearn: 0.2897403\ttest: 0.3727076\tbest: 0.3727076 (8400)\ttotal: 7m 2s\tremaining: 5m 31s\n", 1740 | "8500:\tlearn: 0.2889573\ttest: 0.3726123\tbest: 0.3726123 (8500)\ttotal: 7m 7s\tremaining: 5m 26s\n", 1741 | "8600:\tlearn: 0.2882222\ttest: 0.3725166\tbest: 0.3725107 (8592)\ttotal: 7m 11s\tremaining: 5m 21s\n", 1742 | "8700:\tlearn: 0.2874155\ttest: 0.3724603\tbest: 0.3724588 (8696)\ttotal: 7m 16s\tremaining: 5m 16s\n" 1743 | ] 1744 | }, 1745 | { 1746 | "name": "stdout", 1747 | "output_type": "stream", 1748 | "text": [ 1749 | "8800:\tlearn: 
0.2866961\ttest: 0.3724647\tbest: 0.3724497 (8715)\ttotal: 7m 21s\tremaining: 5m 11s\n", 1750 | "8900:\tlearn: 0.2859461\ttest: 0.3724325\tbest: 0.3724298 (8897)\ttotal: 7m 26s\tremaining: 5m 6s\n", 1751 | "9000:\tlearn: 0.2852204\ttest: 0.3723815\tbest: 0.3723774 (8996)\ttotal: 7m 31s\tremaining: 5m 1s\n", 1752 | "9100:\tlearn: 0.2845444\ttest: 0.3723280\tbest: 0.3723280 (9100)\ttotal: 7m 36s\tremaining: 4m 56s\n", 1753 | "9200:\tlearn: 0.2837712\ttest: 0.3722565\tbest: 0.3722565 (9200)\ttotal: 7m 41s\tremaining: 4m 50s\n", 1754 | "9300:\tlearn: 0.2830607\ttest: 0.3722009\tbest: 0.3722009 (9300)\ttotal: 7m 46s\tremaining: 4m 45s\n", 1755 | "9400:\tlearn: 0.2823363\ttest: 0.3721711\tbest: 0.3721694 (9358)\ttotal: 7m 51s\tremaining: 4m 40s\n", 1756 | "9500:\tlearn: 0.2816317\ttest: 0.3721397\tbest: 0.3721264 (9494)\ttotal: 7m 56s\tremaining: 4m 35s\n", 1757 | "9600:\tlearn: 0.2809993\ttest: 0.3721344\tbest: 0.3721242 (9538)\ttotal: 8m 1s\tremaining: 4m 30s\n", 1758 | "9700:\tlearn: 0.2803175\ttest: 0.3721418\tbest: 0.3721211 (9633)\ttotal: 8m 6s\tremaining: 4m 25s\n", 1759 | "9800:\tlearn: 0.2796034\ttest: 0.3721264\tbest: 0.3721145 (9724)\ttotal: 8m 11s\tremaining: 4m 20s\n", 1760 | "9900:\tlearn: 0.2788351\ttest: 0.3721505\tbest: 0.3721145 (9724)\ttotal: 8m 16s\tremaining: 4m 15s\n", 1761 | "bestTest = 0.3721145043\n", 1762 | "bestIteration = 9724\n", 1763 | "Shrink model to first 9725 iterations.\n", 1764 | "Fold 2\n", 1765 | "0:\tlearn: 2.9443681\ttest: 2.9434765\tbest: 2.9434765 (0)\ttotal: 61ms\tremaining: 15m 14s\n", 1766 | "100:\tlearn: 0.9694982\ttest: 0.9623182\tbest: 0.9623182 (100)\ttotal: 5.77s\tremaining: 14m 10s\n", 1767 | "200:\tlearn: 0.6842900\ttest: 0.6783291\tbest: 0.6783291 (200)\ttotal: 11.5s\tremaining: 14m 4s\n", 1768 | "300:\tlearn: 0.5802920\ttest: 0.5734318\tbest: 0.5734318 (300)\ttotal: 17s\tremaining: 13m 48s\n", 1769 | "400:\tlearn: 0.5268177\ttest: 0.5189816\tbest: 0.5189816 (400)\ttotal: 22.2s\tremaining: 13m 29s\n", 1770 | 
"500:\tlearn: 0.4928743\ttest: 0.4865865\tbest: 0.4865865 (500)\ttotal: 27.7s\tremaining: 13m 20s\n", 1771 | "600:\tlearn: 0.4699431\ttest: 0.4655205\tbest: 0.4655205 (600)\ttotal: 33s\tremaining: 13m 11s\n", 1772 | "700:\tlearn: 0.4532967\ttest: 0.4507689\tbest: 0.4507689 (700)\ttotal: 38.3s\tremaining: 13m 2s\n", 1773 | "800:\tlearn: 0.4395780\ttest: 0.4393647\tbest: 0.4393647 (800)\ttotal: 43.8s\tremaining: 12m 55s\n", 1774 | "900:\tlearn: 0.4282105\ttest: 0.4301656\tbest: 0.4301656 (900)\ttotal: 49.2s\tremaining: 12m 49s\n", 1775 | "1000:\tlearn: 0.4188862\ttest: 0.4229607\tbest: 0.4229607 (1000)\ttotal: 54.5s\tremaining: 12m 42s\n", 1776 | "1100:\tlearn: 0.4108868\ttest: 0.4169308\tbest: 0.4169308 (1100)\ttotal: 59.8s\tremaining: 12m 34s\n", 1777 | "1200:\tlearn: 0.4035234\ttest: 0.4117374\tbest: 0.4117374 (1200)\ttotal: 1m 5s\tremaining: 12m 27s\n", 1778 | "1300:\tlearn: 0.3969228\ttest: 0.4073188\tbest: 0.4073188 (1300)\ttotal: 1m 10s\tremaining: 12m 21s\n", 1779 | "1400:\tlearn: 0.3913798\ttest: 0.4038983\tbest: 0.4038983 (1400)\ttotal: 1m 15s\tremaining: 12m 14s\n", 1780 | "1500:\tlearn: 0.3866217\ttest: 0.4009698\tbest: 0.4009698 (1500)\ttotal: 1m 20s\tremaining: 12m 7s\n", 1781 | "1600:\tlearn: 0.3821708\ttest: 0.3983890\tbest: 0.3983890 (1600)\ttotal: 1m 26s\tremaining: 12m\n", 1782 | "1700:\tlearn: 0.3781741\ttest: 0.3960898\tbest: 0.3960898 (1700)\ttotal: 1m 31s\tremaining: 11m 53s\n", 1783 | "1800:\tlearn: 0.3742509\ttest: 0.3940105\tbest: 0.3940105 (1800)\ttotal: 1m 36s\tremaining: 11m 47s\n", 1784 | "1900:\tlearn: 0.3709104\ttest: 0.3922476\tbest: 0.3922476 (1900)\ttotal: 1m 41s\tremaining: 11m 40s\n", 1785 | "2000:\tlearn: 0.3678909\ttest: 0.3907634\tbest: 0.3907634 (2000)\ttotal: 1m 46s\tremaining: 11m 32s\n", 1786 | "2100:\tlearn: 0.3650593\ttest: 0.3893629\tbest: 0.3893629 (2100)\ttotal: 1m 51s\tremaining: 11m 26s\n", 1787 | "2200:\tlearn: 0.3623068\ttest: 0.3880129\tbest: 0.3880129 (2200)\ttotal: 1m 56s\tremaining: 11m 19s\n", 1788 | 
"2300:\tlearn: 0.3599148\ttest: 0.3868767\tbest: 0.3868767 (2300)\ttotal: 2m 1s\tremaining: 11m 12s\n", 1789 | "2400:\tlearn: 0.3577155\ttest: 0.3859346\tbest: 0.3859346 (2400)\ttotal: 2m 6s\tremaining: 11m 5s\n", 1790 | "2500:\tlearn: 0.3553781\ttest: 0.3849259\tbest: 0.3849259 (2500)\ttotal: 2m 11s\tremaining: 10m 59s\n", 1791 | "2600:\tlearn: 0.3533105\ttest: 0.3840945\tbest: 0.3840945 (2600)\ttotal: 2m 17s\tremaining: 10m 53s\n", 1792 | "2700:\tlearn: 0.3513950\ttest: 0.3833142\tbest: 0.3833142 (2700)\ttotal: 2m 22s\tremaining: 10m 46s\n", 1793 | "2800:\tlearn: 0.3496242\ttest: 0.3826326\tbest: 0.3826326 (2800)\ttotal: 2m 26s\tremaining: 10m 39s\n", 1794 | "2900:\tlearn: 0.3478921\ttest: 0.3819524\tbest: 0.3819506 (2898)\ttotal: 2m 31s\tremaining: 10m 33s\n", 1795 | "3000:\tlearn: 0.3462332\ttest: 0.3813759\tbest: 0.3813759 (3000)\ttotal: 2m 36s\tremaining: 10m 27s\n", 1796 | "3100:\tlearn: 0.3445562\ttest: 0.3807373\tbest: 0.3807373 (3100)\ttotal: 2m 41s\tremaining: 10m 21s\n", 1797 | "3200:\tlearn: 0.3429904\ttest: 0.3801625\tbest: 0.3801625 (3200)\ttotal: 2m 46s\tremaining: 10m 14s\n", 1798 | "3300:\tlearn: 0.3414159\ttest: 0.3795433\tbest: 0.3795433 (3300)\ttotal: 2m 51s\tremaining: 10m 8s\n", 1799 | "3400:\tlearn: 0.3400266\ttest: 0.3791237\tbest: 0.3791181 (3398)\ttotal: 2m 56s\tremaining: 10m 2s\n", 1800 | "3500:\tlearn: 0.3386484\ttest: 0.3786599\tbest: 0.3786599 (3500)\ttotal: 3m 1s\tremaining: 9m 56s\n", 1801 | "3600:\tlearn: 0.3374751\ttest: 0.3782948\tbest: 0.3782948 (3600)\ttotal: 3m 6s\tremaining: 9m 49s\n", 1802 | "3700:\tlearn: 0.3361114\ttest: 0.3778727\tbest: 0.3778727 (3700)\ttotal: 3m 11s\tremaining: 9m 44s\n", 1803 | "3800:\tlearn: 0.3348361\ttest: 0.3774884\tbest: 0.3774855 (3799)\ttotal: 3m 16s\tremaining: 9m 37s\n", 1804 | "3900:\tlearn: 0.3336625\ttest: 0.3771435\tbest: 0.3771435 (3900)\ttotal: 3m 21s\tremaining: 9m 31s\n", 1805 | "4000:\tlearn: 0.3325035\ttest: 0.3767501\tbest: 0.3767501 (4000)\ttotal: 3m 25s\tremaining: 9m 25s\n", 
1806 | "4100:\tlearn: 0.3313293\ttest: 0.3764553\tbest: 0.3764553 (4100)\ttotal: 3m 30s\tremaining: 9m 20s\n", 1807 | "4200:\tlearn: 0.3299892\ttest: 0.3760755\tbest: 0.3760755 (4200)\ttotal: 3m 35s\tremaining: 9m 14s\n", 1808 | "4300:\tlearn: 0.3287937\ttest: 0.3757468\tbest: 0.3757460 (4299)\ttotal: 3m 40s\tremaining: 9m 8s\n", 1809 | "4400:\tlearn: 0.3276672\ttest: 0.3754384\tbest: 0.3754382 (4396)\ttotal: 3m 45s\tremaining: 9m 2s\n", 1810 | "4500:\tlearn: 0.3266092\ttest: 0.3751657\tbest: 0.3751657 (4500)\ttotal: 3m 50s\tremaining: 8m 57s\n", 1811 | "4600:\tlearn: 0.3254817\ttest: 0.3748788\tbest: 0.3748761 (4599)\ttotal: 3m 55s\tremaining: 8m 51s\n", 1812 | "4700:\tlearn: 0.3243820\ttest: 0.3746932\tbest: 0.3746932 (4700)\ttotal: 4m\tremaining: 8m 46s\n", 1813 | "4800:\tlearn: 0.3233590\ttest: 0.3744619\tbest: 0.3744619 (4800)\ttotal: 4m 4s\tremaining: 8m 40s\n", 1814 | "4900:\tlearn: 0.3224071\ttest: 0.3742858\tbest: 0.3742853 (4899)\ttotal: 4m 9s\tremaining: 8m 34s\n", 1815 | "5000:\tlearn: 0.3213104\ttest: 0.3740207\tbest: 0.3740207 (5000)\ttotal: 4m 14s\tremaining: 8m 29s\n", 1816 | "5100:\tlearn: 0.3202703\ttest: 0.3738921\tbest: 0.3738919 (5097)\ttotal: 4m 19s\tremaining: 8m 23s\n", 1817 | "5200:\tlearn: 0.3192828\ttest: 0.3736484\tbest: 0.3736484 (5200)\ttotal: 4m 24s\tremaining: 8m 18s\n", 1818 | "5300:\tlearn: 0.3182278\ttest: 0.3734363\tbest: 0.3734353 (5297)\ttotal: 4m 29s\tremaining: 8m 12s\n", 1819 | "5400:\tlearn: 0.3171807\ttest: 0.3732738\tbest: 0.3732724 (5390)\ttotal: 4m 34s\tremaining: 8m 7s\n", 1820 | "5500:\tlearn: 0.3162452\ttest: 0.3731329\tbest: 0.3731325 (5494)\ttotal: 4m 38s\tremaining: 8m 1s\n", 1821 | "5600:\tlearn: 0.3153347\ttest: 0.3730231\tbest: 0.3730224 (5597)\ttotal: 4m 43s\tremaining: 7m 56s\n", 1822 | "5700:\tlearn: 0.3142878\ttest: 0.3728379\tbest: 0.3728379 (5700)\ttotal: 4m 48s\tremaining: 7m 51s\n", 1823 | "5800:\tlearn: 0.3133186\ttest: 0.3726643\tbest: 0.3726610 (5789)\ttotal: 4m 53s\tremaining: 7m 45s\n", 1824 | 
"5900:\tlearn: 0.3123658\ttest: 0.3724978\tbest: 0.3724978 (5900)\ttotal: 4m 58s\tremaining: 7m 40s\n", 1825 | "6000:\tlearn: 0.3114211\ttest: 0.3723352\tbest: 0.3723309 (5995)\ttotal: 5m 3s\tremaining: 7m 34s\n", 1826 | "6100:\tlearn: 0.3105598\ttest: 0.3721834\tbest: 0.3721793 (6098)\ttotal: 5m 8s\tremaining: 7m 29s\n", 1827 | "6200:\tlearn: 0.3096669\ttest: 0.3720013\tbest: 0.3720013 (6200)\ttotal: 5m 13s\tremaining: 7m 24s\n", 1828 | "6300:\tlearn: 0.3087257\ttest: 0.3718906\tbest: 0.3718861 (6294)\ttotal: 5m 18s\tremaining: 7m 19s\n", 1829 | "6400:\tlearn: 0.3078272\ttest: 0.3717507\tbest: 0.3717507 (6400)\ttotal: 5m 22s\tremaining: 7m 13s\n", 1830 | "6500:\tlearn: 0.3069387\ttest: 0.3716658\tbest: 0.3716632 (6499)\ttotal: 5m 27s\tremaining: 7m 8s\n", 1831 | "6600:\tlearn: 0.3061115\ttest: 0.3715509\tbest: 0.3715509 (6600)\ttotal: 5m 32s\tremaining: 7m 3s\n", 1832 | "6700:\tlearn: 0.3052217\ttest: 0.3713980\tbest: 0.3713980 (6700)\ttotal: 5m 37s\tremaining: 6m 58s\n", 1833 | "6800:\tlearn: 0.3043711\ttest: 0.3713464\tbest: 0.3713464 (6800)\ttotal: 5m 42s\tremaining: 6m 52s\n", 1834 | "6900:\tlearn: 0.3035667\ttest: 0.3712873\tbest: 0.3712718 (6869)\ttotal: 5m 47s\tremaining: 6m 47s\n", 1835 | "7000:\tlearn: 0.3026999\ttest: 0.3712250\tbest: 0.3712218 (6998)\ttotal: 5m 52s\tremaining: 6m 42s\n", 1836 | "7100:\tlearn: 0.3018933\ttest: 0.3711512\tbest: 0.3711507 (7087)\ttotal: 5m 57s\tremaining: 6m 37s\n", 1837 | "7200:\tlearn: 0.3010557\ttest: 0.3710529\tbest: 0.3710529 (7200)\ttotal: 6m 2s\tremaining: 6m 32s\n", 1838 | "7300:\tlearn: 0.3002674\ttest: 0.3709802\tbest: 0.3709802 (7300)\ttotal: 6m 7s\tremaining: 6m 27s\n", 1839 | "7400:\tlearn: 0.2994336\ttest: 0.3708955\tbest: 0.3708939 (7399)\ttotal: 6m 11s\tremaining: 6m 21s\n" 1840 | ] 1841 | }, 1842 | { 1843 | "name": "stdout", 1844 | "output_type": "stream", 1845 | "text": [ 1846 | "7500:\tlearn: 0.2986146\ttest: 0.3708438\tbest: 0.3708386 (7471)\ttotal: 6m 16s\tremaining: 6m 16s\n", 1847 | "7600:\tlearn: 
0.2978320\ttest: 0.3707964\tbest: 0.3707964 (7600)\ttotal: 6m 21s\tremaining: 6m 11s\n", 1848 | "7700:\tlearn: 0.2971115\ttest: 0.3707068\tbest: 0.3707068 (7700)\ttotal: 6m 26s\tremaining: 6m 6s\n", 1849 | "7800:\tlearn: 0.2962688\ttest: 0.3706367\tbest: 0.3706354 (7799)\ttotal: 6m 31s\tremaining: 6m 1s\n", 1850 | "7900:\tlearn: 0.2954515\ttest: 0.3706102\tbest: 0.3705969 (7888)\ttotal: 6m 36s\tremaining: 5m 56s\n", 1851 | "8000:\tlearn: 0.2946311\ttest: 0.3705748\tbest: 0.3705715 (7997)\ttotal: 6m 41s\tremaining: 5m 51s\n", 1852 | "8100:\tlearn: 0.2938061\ttest: 0.3705295\tbest: 0.3705261 (8087)\ttotal: 6m 46s\tremaining: 5m 46s\n", 1853 | "8200:\tlearn: 0.2930550\ttest: 0.3704819\tbest: 0.3704692 (8189)\ttotal: 6m 51s\tremaining: 5m 41s\n", 1854 | "8300:\tlearn: 0.2922324\ttest: 0.3704177\tbest: 0.3704177 (8300)\ttotal: 6m 56s\tremaining: 5m 35s\n", 1855 | "8400:\tlearn: 0.2914426\ttest: 0.3703328\tbest: 0.3703322 (8395)\ttotal: 7m 1s\tremaining: 5m 30s\n", 1856 | "8500:\tlearn: 0.2906545\ttest: 0.3702983\tbest: 0.3702943 (8489)\ttotal: 7m 6s\tremaining: 5m 25s\n", 1857 | "8600:\tlearn: 0.2898853\ttest: 0.3702520\tbest: 0.3702520 (8600)\ttotal: 7m 11s\tremaining: 5m 20s\n", 1858 | "8700:\tlearn: 0.2891316\ttest: 0.3702383\tbest: 0.3702377 (8697)\ttotal: 7m 16s\tremaining: 5m 15s\n", 1859 | "8800:\tlearn: 0.2883580\ttest: 0.3701953\tbest: 0.3701936 (8789)\ttotal: 7m 21s\tremaining: 5m 10s\n", 1860 | "8900:\tlearn: 0.2876060\ttest: 0.3702348\tbest: 0.3701914 (8824)\ttotal: 7m 25s\tremaining: 5m 5s\n", 1861 | "9000:\tlearn: 0.2869364\ttest: 0.3702237\tbest: 0.3701914 (8824)\ttotal: 7m 30s\tremaining: 5m\n", 1862 | "bestTest = 0.370191368\n", 1863 | "bestIteration = 8824\n", 1864 | "Shrink model to first 8825 iterations.\n", 1865 | "Fold 3\n", 1866 | "0:\tlearn: 2.9437895\ttest: 2.9439809\tbest: 2.9439809 (0)\ttotal: 64.5ms\tremaining: 16m 7s\n", 1867 | "100:\tlearn: 0.9666337\ttest: 0.9737121\tbest: 0.9737121 (100)\ttotal: 5.76s\tremaining: 14m 10s\n", 1868 | 
"200:\tlearn: 0.6822880\ttest: 0.6907236\tbest: 0.6907236 (200)\ttotal: 11.6s\tremaining: 14m 15s\n", 1869 | "300:\tlearn: 0.5780329\ttest: 0.5866115\tbest: 0.5866115 (300)\ttotal: 17.3s\tremaining: 14m 7s\n", 1870 | "400:\tlearn: 0.5249610\ttest: 0.5340124\tbest: 0.5340124 (400)\ttotal: 22.7s\tremaining: 13m 46s\n", 1871 | "500:\tlearn: 0.4903229\ttest: 0.5009206\tbest: 0.5009206 (500)\ttotal: 28.2s\tremaining: 13m 37s\n", 1872 | "600:\tlearn: 0.4671883\ttest: 0.4791335\tbest: 0.4791335 (600)\ttotal: 33.8s\tremaining: 13m 28s\n", 1873 | "700:\tlearn: 0.4507938\ttest: 0.4639248\tbest: 0.4639248 (700)\ttotal: 39.2s\tremaining: 13m 20s\n", 1874 | "800:\tlearn: 0.4375317\ttest: 0.4518411\tbest: 0.4518411 (800)\ttotal: 44.7s\tremaining: 13m 12s\n", 1875 | "900:\tlearn: 0.4269307\ttest: 0.4425353\tbest: 0.4425353 (900)\ttotal: 50.2s\tremaining: 13m 5s\n", 1876 | "1000:\tlearn: 0.4181567\ttest: 0.4349669\tbest: 0.4349669 (1000)\ttotal: 55.6s\tremaining: 12m 57s\n", 1877 | "1100:\tlearn: 0.4105997\ttest: 0.4286424\tbest: 0.4286424 (1100)\ttotal: 1m 1s\tremaining: 12m 50s\n", 1878 | "1200:\tlearn: 0.4033486\ttest: 0.4228499\tbest: 0.4228499 (1200)\ttotal: 1m 6s\tremaining: 12m 41s\n", 1879 | "1300:\tlearn: 0.3969627\ttest: 0.4179884\tbest: 0.4179884 (1300)\ttotal: 1m 11s\tremaining: 12m 33s\n", 1880 | "1400:\tlearn: 0.3912228\ttest: 0.4138862\tbest: 0.4138862 (1400)\ttotal: 1m 17s\tremaining: 12m 28s\n", 1881 | "1500:\tlearn: 0.3859352\ttest: 0.4100195\tbest: 0.4100195 (1500)\ttotal: 1m 22s\tremaining: 12m 21s\n", 1882 | "1600:\tlearn: 0.3815999\ttest: 0.4070405\tbest: 0.4070405 (1600)\ttotal: 1m 27s\tremaining: 12m 13s\n", 1883 | "1700:\tlearn: 0.3776680\ttest: 0.4044705\tbest: 0.4044705 (1700)\ttotal: 1m 32s\tremaining: 12m 5s\n", 1884 | "1800:\tlearn: 0.3740485\ttest: 0.4020943\tbest: 0.4020943 (1800)\ttotal: 1m 37s\tremaining: 11m 57s\n", 1885 | "1900:\tlearn: 0.3708849\ttest: 0.4001783\tbest: 0.4001783 (1900)\ttotal: 1m 42s\tremaining: 11m 49s\n", 1886 | 
"2000:\tlearn: 0.3679143\ttest: 0.3983328\tbest: 0.3983328 (2000)\ttotal: 1m 48s\tremaining: 11m 41s\n", 1887 | "2100:\tlearn: 0.3652782\ttest: 0.3967781\tbest: 0.3967781 (2100)\ttotal: 1m 53s\tremaining: 11m 34s\n", 1888 | "2200:\tlearn: 0.3627526\ttest: 0.3953468\tbest: 0.3953468 (2200)\ttotal: 1m 58s\tremaining: 11m 26s\n", 1889 | "2300:\tlearn: 0.3603579\ttest: 0.3938987\tbest: 0.3938987 (2300)\ttotal: 2m 2s\tremaining: 11m 18s\n", 1890 | "2400:\tlearn: 0.3581742\ttest: 0.3926778\tbest: 0.3926778 (2400)\ttotal: 2m 7s\tremaining: 11m 11s\n", 1891 | "2500:\tlearn: 0.3562758\ttest: 0.3916742\tbest: 0.3916742 (2500)\ttotal: 2m 12s\tremaining: 11m 3s\n", 1892 | "2600:\tlearn: 0.3543680\ttest: 0.3907456\tbest: 0.3907456 (2600)\ttotal: 2m 17s\tremaining: 10m 56s\n", 1893 | "2700:\tlearn: 0.3528549\ttest: 0.3900831\tbest: 0.3900831 (2700)\ttotal: 2m 22s\tremaining: 10m 49s\n", 1894 | "2800:\tlearn: 0.3512045\ttest: 0.3892952\tbest: 0.3892952 (2800)\ttotal: 2m 27s\tremaining: 10m 44s\n", 1895 | "2900:\tlearn: 0.3498431\ttest: 0.3886999\tbest: 0.3886999 (2900)\ttotal: 2m 32s\tremaining: 10m 37s\n", 1896 | "3000:\tlearn: 0.3481895\ttest: 0.3879855\tbest: 0.3879855 (3000)\ttotal: 2m 37s\tremaining: 10m 31s\n", 1897 | "3100:\tlearn: 0.3466946\ttest: 0.3873528\tbest: 0.3873528 (3100)\ttotal: 2m 43s\tremaining: 10m 25s\n", 1898 | "3200:\tlearn: 0.3451060\ttest: 0.3866692\tbest: 0.3866692 (3200)\ttotal: 2m 47s\tremaining: 10m 19s\n", 1899 | "3300:\tlearn: 0.3438039\ttest: 0.3862147\tbest: 0.3862147 (3300)\ttotal: 2m 52s\tremaining: 10m 12s\n", 1900 | "3400:\tlearn: 0.3423666\ttest: 0.3856366\tbest: 0.3856366 (3400)\ttotal: 2m 57s\tremaining: 10m 7s\n", 1901 | "3500:\tlearn: 0.3410860\ttest: 0.3851325\tbest: 0.3851325 (3500)\ttotal: 3m 2s\tremaining: 10m\n", 1902 | "3600:\tlearn: 0.3398489\ttest: 0.3846819\tbest: 0.3846819 (3600)\ttotal: 3m 7s\tremaining: 9m 54s\n", 1903 | "3700:\tlearn: 0.3385640\ttest: 0.3841184\tbest: 0.3841184 (3700)\ttotal: 3m 12s\tremaining: 9m 49s\n", 
1904 | "3800:\tlearn: 0.3373945\ttest: 0.3837013\tbest: 0.3837013 (3800)\ttotal: 3m 17s\tremaining: 9m 42s\n", 1905 | "3900:\tlearn: 0.3362733\ttest: 0.3832844\tbest: 0.3832844 (3900)\ttotal: 3m 22s\tremaining: 9m 36s\n", 1906 | "4000:\tlearn: 0.3351111\ttest: 0.3828893\tbest: 0.3828883 (3999)\ttotal: 3m 27s\tremaining: 9m 30s\n", 1907 | "4100:\tlearn: 0.3339635\ttest: 0.3825139\tbest: 0.3825139 (4100)\ttotal: 3m 32s\tremaining: 9m 24s\n", 1908 | "4200:\tlearn: 0.3328368\ttest: 0.3821965\tbest: 0.3821947 (4198)\ttotal: 3m 37s\tremaining: 9m 18s\n", 1909 | "4300:\tlearn: 0.3317781\ttest: 0.3818511\tbest: 0.3818511 (4300)\ttotal: 3m 41s\tremaining: 9m 12s\n", 1910 | "4400:\tlearn: 0.3307150\ttest: 0.3815458\tbest: 0.3815458 (4400)\ttotal: 3m 46s\tremaining: 9m 6s\n", 1911 | "4500:\tlearn: 0.3297377\ttest: 0.3813009\tbest: 0.3813009 (4500)\ttotal: 3m 51s\tremaining: 9m\n", 1912 | "4600:\tlearn: 0.3286289\ttest: 0.3809880\tbest: 0.3809880 (4600)\ttotal: 3m 56s\tremaining: 8m 54s\n", 1913 | "4700:\tlearn: 0.3275877\ttest: 0.3806923\tbest: 0.3806863 (4696)\ttotal: 4m 1s\tremaining: 8m 48s\n", 1914 | "4800:\tlearn: 0.3265512\ttest: 0.3803794\tbest: 0.3803779 (4798)\ttotal: 4m 6s\tremaining: 8m 42s\n", 1915 | "4900:\tlearn: 0.3256019\ttest: 0.3801802\tbest: 0.3801802 (4900)\ttotal: 4m 11s\tremaining: 8m 37s\n", 1916 | "5000:\tlearn: 0.3246328\ttest: 0.3798789\tbest: 0.3798789 (5000)\ttotal: 4m 15s\tremaining: 8m 31s\n", 1917 | "5100:\tlearn: 0.3237276\ttest: 0.3796492\tbest: 0.3796492 (5100)\ttotal: 4m 20s\tremaining: 8m 25s\n", 1918 | "5200:\tlearn: 0.3228442\ttest: 0.3794166\tbest: 0.3794166 (5200)\ttotal: 4m 25s\tremaining: 8m 20s\n", 1919 | "5300:\tlearn: 0.3219078\ttest: 0.3791499\tbest: 0.3791499 (5300)\ttotal: 4m 30s\tremaining: 8m 14s\n", 1920 | "5400:\tlearn: 0.3209660\ttest: 0.3789839\tbest: 0.3789788 (5391)\ttotal: 4m 35s\tremaining: 8m 9s\n", 1921 | "5500:\tlearn: 0.3200584\ttest: 0.3788319\tbest: 0.3788319 (5500)\ttotal: 4m 40s\tremaining: 8m 3s\n", 1922 | 
"5600:\tlearn: 0.3191029\ttest: 0.3785811\tbest: 0.3785811 (5600)\ttotal: 4m 44s\tremaining: 7m 58s\n", 1923 | "5700:\tlearn: 0.3182321\ttest: 0.3784223\tbest: 0.3784223 (5698)\ttotal: 4m 49s\tremaining: 7m 52s\n", 1924 | "5800:\tlearn: 0.3173450\ttest: 0.3782003\tbest: 0.3782003 (5800)\ttotal: 4m 54s\tremaining: 7m 47s\n", 1925 | "5900:\tlearn: 0.3164450\ttest: 0.3780575\tbest: 0.3780575 (5900)\ttotal: 4m 59s\tremaining: 7m 41s\n", 1926 | "6000:\tlearn: 0.3155760\ttest: 0.3778265\tbest: 0.3778217 (5998)\ttotal: 5m 4s\tremaining: 7m 36s\n", 1927 | "6100:\tlearn: 0.3147287\ttest: 0.3776652\tbest: 0.3776587 (6098)\ttotal: 5m 9s\tremaining: 7m 31s\n", 1928 | "6200:\tlearn: 0.3138946\ttest: 0.3775094\tbest: 0.3775094 (6200)\ttotal: 5m 14s\tremaining: 7m 25s\n", 1929 | "6300:\tlearn: 0.3130381\ttest: 0.3773897\tbest: 0.3773864 (6299)\ttotal: 5m 18s\tremaining: 7m 20s\n", 1930 | "6400:\tlearn: 0.3122034\ttest: 0.3772815\tbest: 0.3772779 (6390)\ttotal: 5m 23s\tremaining: 7m 14s\n", 1931 | "6500:\tlearn: 0.3113488\ttest: 0.3771375\tbest: 0.3771361 (6490)\ttotal: 5m 28s\tremaining: 7m 9s\n", 1932 | "6600:\tlearn: 0.3104638\ttest: 0.3769675\tbest: 0.3769675 (6599)\ttotal: 5m 33s\tremaining: 7m 4s\n", 1933 | "6700:\tlearn: 0.3096569\ttest: 0.3768008\tbest: 0.3768002 (6698)\ttotal: 5m 38s\tremaining: 6m 58s\n", 1934 | "6800:\tlearn: 0.3089298\ttest: 0.3767119\tbest: 0.3767119 (6800)\ttotal: 5m 43s\tremaining: 6m 53s\n", 1935 | "6900:\tlearn: 0.3081403\ttest: 0.3765886\tbest: 0.3765880 (6898)\ttotal: 5m 47s\tremaining: 6m 48s\n", 1936 | "7000:\tlearn: 0.3074183\ttest: 0.3764605\tbest: 0.3764600 (6997)\ttotal: 5m 52s\tremaining: 6m 43s\n" 1937 | ] 1938 | }, 1939 | { 1940 | "name": "stdout", 1941 | "output_type": "stream", 1942 | "text": [ 1943 | "7100:\tlearn: 0.3066156\ttest: 0.3763372\tbest: 0.3763372 (7100)\ttotal: 5m 57s\tremaining: 6m 37s\n", 1944 | "7200:\tlearn: 0.3058966\ttest: 0.3762786\tbest: 0.3762786 (7200)\ttotal: 6m 2s\tremaining: 6m 32s\n", 1945 | "7300:\tlearn: 
0.3051038\ttest: 0.3761495\tbest: 0.3761495 (7300)\ttotal: 6m 7s\tremaining: 6m 27s\n", 1946 | "7400:\tlearn: 0.3043295\ttest: 0.3760555\tbest: 0.3760538 (7399)\ttotal: 6m 12s\tremaining: 6m 22s\n", 1947 | "7500:\tlearn: 0.3035415\ttest: 0.3759647\tbest: 0.3759647 (7500)\ttotal: 6m 18s\tremaining: 6m 18s\n", 1948 | "7600:\tlearn: 0.3027972\ttest: 0.3758383\tbest: 0.3758347 (7596)\ttotal: 6m 23s\tremaining: 6m 13s\n", 1949 | "7700:\tlearn: 0.3019187\ttest: 0.3756992\tbest: 0.3756992 (7700)\ttotal: 6m 28s\tremaining: 6m 8s\n", 1950 | "7800:\tlearn: 0.3011553\ttest: 0.3755568\tbest: 0.3755561 (7799)\ttotal: 6m 33s\tremaining: 6m 2s\n", 1951 | "7900:\tlearn: 0.3003440\ttest: 0.3754536\tbest: 0.3754521 (7898)\ttotal: 6m 38s\tremaining: 5m 57s\n", 1952 | "8000:\tlearn: 0.2995573\ttest: 0.3753286\tbest: 0.3753271 (7998)\ttotal: 6m 43s\tremaining: 5m 52s\n", 1953 | "8100:\tlearn: 0.2987896\ttest: 0.3752060\tbest: 0.3752060 (8100)\ttotal: 6m 47s\tremaining: 5m 47s\n", 1954 | "8200:\tlearn: 0.2980186\ttest: 0.3751054\tbest: 0.3751047 (8181)\ttotal: 6m 52s\tremaining: 5m 42s\n", 1955 | "8300:\tlearn: 0.2972517\ttest: 0.3750124\tbest: 0.3750084 (8298)\ttotal: 6m 57s\tremaining: 5m 37s\n", 1956 | "8400:\tlearn: 0.2965652\ttest: 0.3749325\tbest: 0.3749325 (8400)\ttotal: 7m 2s\tremaining: 5m 31s\n", 1957 | "8500:\tlearn: 0.2958502\ttest: 0.3748635\tbest: 0.3748634 (8499)\ttotal: 7m 7s\tremaining: 5m 26s\n", 1958 | "8600:\tlearn: 0.2950783\ttest: 0.3747830\tbest: 0.3747830 (8600)\ttotal: 7m 12s\tremaining: 5m 21s\n", 1959 | "8700:\tlearn: 0.2943076\ttest: 0.3747100\tbest: 0.3747099 (8697)\ttotal: 7m 17s\tremaining: 5m 16s\n", 1960 | "8800:\tlearn: 0.2935542\ttest: 0.3746214\tbest: 0.3746214 (8800)\ttotal: 7m 22s\tremaining: 5m 11s\n", 1961 | "8900:\tlearn: 0.2928097\ttest: 0.3745509\tbest: 0.3745509 (8900)\ttotal: 7m 26s\tremaining: 5m 6s\n", 1962 | "9000:\tlearn: 0.2921266\ttest: 0.3744730\tbest: 0.3744730 (9000)\ttotal: 7m 31s\tremaining: 5m 1s\n", 1963 | "9100:\tlearn: 
0.2914218\ttest: 0.3743784\tbest: 0.3743755 (9086)\ttotal: 7m 36s\tremaining: 4m 55s\n", 1964 | "9200:\tlearn: 0.2906955\ttest: 0.3743307\tbest: 0.3743295 (9199)\ttotal: 7m 41s\tremaining: 4m 50s\n", 1965 | "9300:\tlearn: 0.2899799\ttest: 0.3742830\tbest: 0.3742819 (9291)\ttotal: 7m 46s\tremaining: 4m 45s\n", 1966 | "9400:\tlearn: 0.2892462\ttest: 0.3742510\tbest: 0.3742505 (9399)\ttotal: 7m 51s\tremaining: 4m 40s\n", 1967 | "9500:\tlearn: 0.2884749\ttest: 0.3741921\tbest: 0.3741838 (9495)\ttotal: 7m 56s\tremaining: 4m 35s\n", 1968 | "9600:\tlearn: 0.2877788\ttest: 0.3741160\tbest: 0.3741160 (9600)\ttotal: 8m\tremaining: 4m 30s\n", 1969 | "9700:\tlearn: 0.2871115\ttest: 0.3741111\tbest: 0.3741040 (9657)\ttotal: 8m 5s\tremaining: 4m 25s\n", 1970 | "9800:\tlearn: 0.2863541\ttest: 0.3740104\tbest: 0.3740085 (9799)\ttotal: 8m 10s\tremaining: 4m 20s\n", 1971 | "9900:\tlearn: 0.2856196\ttest: 0.3739835\tbest: 0.3739822 (9899)\ttotal: 8m 15s\tremaining: 4m 15s\n", 1972 | "10000:\tlearn: 0.2849435\ttest: 0.3739559\tbest: 0.3739547 (9998)\ttotal: 8m 20s\tremaining: 4m 10s\n", 1973 | "10100:\tlearn: 0.2842390\ttest: 0.3739441\tbest: 0.3739186 (10084)\ttotal: 8m 25s\tremaining: 4m 5s\n", 1974 | "10200:\tlearn: 0.2834556\ttest: 0.3739202\tbest: 0.3739152 (10196)\ttotal: 8m 30s\tremaining: 4m\n", 1975 | "10300:\tlearn: 0.2827621\ttest: 0.3738768\tbest: 0.3738734 (10298)\ttotal: 8m 35s\tremaining: 3m 54s\n", 1976 | "10400:\tlearn: 0.2821135\ttest: 0.3738436\tbest: 0.3738386 (10386)\ttotal: 8m 39s\tremaining: 3m 49s\n", 1977 | "10500:\tlearn: 0.2813812\ttest: 0.3737976\tbest: 0.3737973 (10499)\ttotal: 8m 44s\tremaining: 3m 44s\n", 1978 | "10600:\tlearn: 0.2806919\ttest: 0.3737563\tbest: 0.3737528 (10598)\ttotal: 8m 49s\tremaining: 3m 39s\n", 1979 | "10700:\tlearn: 0.2799841\ttest: 0.3737778\tbest: 0.3737528 (10598)\ttotal: 8m 54s\tremaining: 3m 34s\n", 1980 | "10800:\tlearn: 0.2793340\ttest: 0.3737218\tbest: 0.3737211 (10799)\ttotal: 8m 59s\tremaining: 3m 29s\n", 1981 | 
"10900:\tlearn: 0.2786476\ttest: 0.3736935\tbest: 0.3736915 (10897)\ttotal: 9m 4s\tremaining: 3m 24s\n", 1982 | "11000:\tlearn: 0.2780168\ttest: 0.3736679\tbest: 0.3736679 (11000)\ttotal: 9m 9s\tremaining: 3m 19s\n", 1983 | "11100:\tlearn: 0.2773362\ttest: 0.3736590\tbest: 0.3736512 (11010)\ttotal: 9m 14s\tremaining: 3m 14s\n", 1984 | "11200:\tlearn: 0.2767015\ttest: 0.3736327\tbest: 0.3736327 (11200)\ttotal: 9m 19s\tremaining: 3m 9s\n", 1985 | "11300:\tlearn: 0.2760368\ttest: 0.3735720\tbest: 0.3735720 (11300)\ttotal: 9m 24s\tremaining: 3m 4s\n", 1986 | "11400:\tlearn: 0.2753389\ttest: 0.3735426\tbest: 0.3735282 (11386)\ttotal: 9m 30s\tremaining: 2m 59s\n", 1987 | "11500:\tlearn: 0.2747149\ttest: 0.3735154\tbest: 0.3735116 (11468)\ttotal: 9m 35s\tremaining: 2m 54s\n", 1988 | "11600:\tlearn: 0.2740372\ttest: 0.3735139\tbest: 0.3735049 (11595)\ttotal: 9m 40s\tremaining: 2m 49s\n", 1989 | "11700:\tlearn: 0.2733493\ttest: 0.3734869\tbest: 0.3734767 (11687)\ttotal: 9m 44s\tremaining: 2m 44s\n", 1990 | "11800:\tlearn: 0.2726567\ttest: 0.3734749\tbest: 0.3734634 (11773)\ttotal: 9m 49s\tremaining: 2m 39s\n", 1991 | "11900:\tlearn: 0.2719464\ttest: 0.3734793\tbest: 0.3734602 (11826)\ttotal: 9m 54s\tremaining: 2m 34s\n", 1992 | "12000:\tlearn: 0.2712719\ttest: 0.3734943\tbest: 0.3734503 (11921)\ttotal: 9m 59s\tremaining: 2m 29s\n", 1993 | "12100:\tlearn: 0.2705960\ttest: 0.3734967\tbest: 0.3734503 (11921)\ttotal: 10m 4s\tremaining: 2m 24s\n", 1994 | "bestTest = 0.3734503482\n", 1995 | "bestIteration = 11921\n", 1996 | "Shrink model to first 11922 iterations.\n", 1997 | "Fold 4\n", 1998 | "0:\tlearn: 2.9441145\ttest: 2.9428512\tbest: 2.9428512 (0)\ttotal: 63.7ms\tremaining: 15m 55s\n", 1999 | "100:\tlearn: 0.9713663\ttest: 0.9542919\tbest: 0.9542919 (100)\ttotal: 5.82s\tremaining: 14m 19s\n", 2000 | "200:\tlearn: 0.6868976\ttest: 0.6694787\tbest: 0.6694787 (200)\ttotal: 11.6s\tremaining: 14m 11s\n", 2001 | "300:\tlearn: 0.5816151\ttest: 0.5668115\tbest: 0.5668115 
(300)\ttotal: 17.1s\tremaining: 13m 55s\n", 2002 | "400:\tlearn: 0.5277489\ttest: 0.5168683\tbest: 0.5168683 (400)\ttotal: 22.4s\tremaining: 13m 34s\n", 2003 | "500:\tlearn: 0.4930415\ttest: 0.4849697\tbest: 0.4849697 (500)\ttotal: 27.8s\tremaining: 13m 24s\n", 2004 | "600:\tlearn: 0.4702242\ttest: 0.4644594\tbest: 0.4644594 (600)\ttotal: 33.3s\tremaining: 13m 16s\n", 2005 | "700:\tlearn: 0.4531113\ttest: 0.4495191\tbest: 0.4495191 (700)\ttotal: 38.6s\tremaining: 13m 7s\n", 2006 | "800:\tlearn: 0.4396474\ttest: 0.4380675\tbest: 0.4380675 (800)\ttotal: 44s\tremaining: 12m 59s\n", 2007 | "900:\tlearn: 0.4285257\ttest: 0.4289758\tbest: 0.4289758 (900)\ttotal: 49.3s\tremaining: 12m 52s\n", 2008 | "1000:\tlearn: 0.4196144\ttest: 0.4218432\tbest: 0.4218432 (1000)\ttotal: 54.6s\tremaining: 12m 43s\n", 2009 | "1100:\tlearn: 0.4113955\ttest: 0.4157216\tbest: 0.4157216 (1100)\ttotal: 1m\tremaining: 12m 37s\n", 2010 | "1200:\tlearn: 0.4042766\ttest: 0.4105635\tbest: 0.4105635 (1200)\ttotal: 1m 5s\tremaining: 12m 32s\n", 2011 | "1300:\tlearn: 0.3978758\ttest: 0.4063071\tbest: 0.4063071 (1300)\ttotal: 1m 10s\tremaining: 12m 26s\n", 2012 | "1400:\tlearn: 0.3920750\ttest: 0.4025975\tbest: 0.4025975 (1400)\ttotal: 1m 16s\tremaining: 12m 19s\n", 2013 | "1500:\tlearn: 0.3869891\ttest: 0.3994454\tbest: 0.3994454 (1500)\ttotal: 1m 21s\tremaining: 12m 13s\n", 2014 | "1600:\tlearn: 0.3826395\ttest: 0.3967433\tbest: 0.3967433 (1600)\ttotal: 1m 26s\tremaining: 12m 6s\n", 2015 | "1700:\tlearn: 0.3786445\ttest: 0.3945701\tbest: 0.3945701 (1700)\ttotal: 1m 32s\tremaining: 11m 59s\n", 2016 | "1800:\tlearn: 0.3750828\ttest: 0.3924128\tbest: 0.3924128 (1800)\ttotal: 1m 37s\tremaining: 11m 52s\n", 2017 | "1900:\tlearn: 0.3713657\ttest: 0.3902837\tbest: 0.3902837 (1900)\ttotal: 1m 42s\tremaining: 11m 45s\n", 2018 | "2000:\tlearn: 0.3681296\ttest: 0.3884967\tbest: 0.3884967 (2000)\ttotal: 1m 47s\tremaining: 11m 39s\n", 2019 | "2100:\tlearn: 0.3653758\ttest: 0.3871591\tbest: 0.3871591 
(2100)\ttotal: 1m 52s\tremaining: 11m 32s\n", 2020 | "2200:\tlearn: 0.3626846\ttest: 0.3858827\tbest: 0.3858827 (2200)\ttotal: 1m 57s\tremaining: 11m 25s\n", 2021 | "2300:\tlearn: 0.3603069\ttest: 0.3847996\tbest: 0.3847979 (2299)\ttotal: 2m 2s\tremaining: 11m 17s\n", 2022 | "2400:\tlearn: 0.3581784\ttest: 0.3838213\tbest: 0.3838213 (2400)\ttotal: 2m 7s\tremaining: 11m 10s\n", 2023 | "2500:\tlearn: 0.3561337\ttest: 0.3829761\tbest: 0.3829761 (2500)\ttotal: 2m 12s\tremaining: 11m 3s\n", 2024 | "2600:\tlearn: 0.3541508\ttest: 0.3821290\tbest: 0.3821290 (2600)\ttotal: 2m 17s\tremaining: 10m 56s\n", 2025 | "2700:\tlearn: 0.3523220\ttest: 0.3813719\tbest: 0.3813719 (2700)\ttotal: 2m 22s\tremaining: 10m 49s\n", 2026 | "2800:\tlearn: 0.3505456\ttest: 0.3806785\tbest: 0.3806785 (2800)\ttotal: 2m 27s\tremaining: 10m 42s\n", 2027 | "2900:\tlearn: 0.3487332\ttest: 0.3800245\tbest: 0.3800245 (2900)\ttotal: 2m 32s\tremaining: 10m 36s\n", 2028 | "3000:\tlearn: 0.3471686\ttest: 0.3794431\tbest: 0.3794427 (2999)\ttotal: 2m 37s\tremaining: 10m 30s\n", 2029 | "3100:\tlearn: 0.3456715\ttest: 0.3789680\tbest: 0.3789680 (3100)\ttotal: 2m 42s\tremaining: 10m 23s\n", 2030 | "3200:\tlearn: 0.3442029\ttest: 0.3785018\tbest: 0.3785018 (3200)\ttotal: 2m 47s\tremaining: 10m 17s\n", 2031 | "3300:\tlearn: 0.3428733\ttest: 0.3780782\tbest: 0.3780762 (3299)\ttotal: 2m 52s\tremaining: 10m 11s\n", 2032 | "3400:\tlearn: 0.3415839\ttest: 0.3776550\tbest: 0.3776550 (3400)\ttotal: 2m 57s\tremaining: 10m 4s\n", 2033 | "3500:\tlearn: 0.3404091\ttest: 0.3773172\tbest: 0.3773172 (3500)\ttotal: 3m 2s\tremaining: 9m 58s\n" 2034 | ] 2035 | }, 2036 | { 2037 | "name": "stdout", 2038 | "output_type": "stream", 2039 | "text": [ 2040 | "3600:\tlearn: 0.3392124\ttest: 0.3769761\tbest: 0.3769761 (3600)\ttotal: 3m 7s\tremaining: 9m 52s\n", 2041 | "3700:\tlearn: 0.3379441\ttest: 0.3765148\tbest: 0.3765148 (3700)\ttotal: 3m 11s\tremaining: 9m 45s\n", 2042 | "3800:\tlearn: 0.3366381\ttest: 0.3761183\tbest: 0.3761183 
(3800)\ttotal: 3m 16s\tremaining: 9m 39s\n", 2043 | "3900:\tlearn: 0.3353981\ttest: 0.3757377\tbest: 0.3757377 (3900)\ttotal: 3m 21s\tremaining: 9m 33s\n", 2044 | "4000:\tlearn: 0.3341714\ttest: 0.3754291\tbest: 0.3754283 (3997)\ttotal: 3m 26s\tremaining: 9m 27s\n", 2045 | "4100:\tlearn: 0.3329611\ttest: 0.3751036\tbest: 0.3751036 (4100)\ttotal: 3m 31s\tremaining: 9m 22s\n", 2046 | "4200:\tlearn: 0.3319368\ttest: 0.3748113\tbest: 0.3748090 (4197)\ttotal: 3m 36s\tremaining: 9m 16s\n", 2047 | "4300:\tlearn: 0.3308520\ttest: 0.3745480\tbest: 0.3745480 (4300)\ttotal: 3m 41s\tremaining: 9m 10s\n", 2048 | "4400:\tlearn: 0.3297650\ttest: 0.3743114\tbest: 0.3743106 (4399)\ttotal: 3m 46s\tremaining: 9m 4s\n", 2049 | "4500:\tlearn: 0.3286908\ttest: 0.3740695\tbest: 0.3740695 (4500)\ttotal: 3m 51s\tremaining: 8m 58s\n", 2050 | "4600:\tlearn: 0.3276579\ttest: 0.3738413\tbest: 0.3738413 (4600)\ttotal: 3m 55s\tremaining: 8m 53s\n", 2051 | "4700:\tlearn: 0.3266524\ttest: 0.3736030\tbest: 0.3736030 (4700)\ttotal: 4m\tremaining: 8m 47s\n", 2052 | "4800:\tlearn: 0.3255082\ttest: 0.3733491\tbest: 0.3733491 (4800)\ttotal: 4m 5s\tremaining: 8m 41s\n", 2053 | "4900:\tlearn: 0.3245424\ttest: 0.3731565\tbest: 0.3731565 (4900)\ttotal: 4m 10s\tremaining: 8m 35s\n", 2054 | "5000:\tlearn: 0.3235796\ttest: 0.3729578\tbest: 0.3729578 (5000)\ttotal: 4m 15s\tremaining: 8m 30s\n", 2055 | "5100:\tlearn: 0.3226926\ttest: 0.3727767\tbest: 0.3727767 (5100)\ttotal: 4m 20s\tremaining: 8m 24s\n", 2056 | "5200:\tlearn: 0.3218391\ttest: 0.3726883\tbest: 0.3726883 (5200)\ttotal: 4m 24s\tremaining: 8m 19s\n", 2057 | "5300:\tlearn: 0.3209391\ttest: 0.3725385\tbest: 0.3725376 (5299)\ttotal: 4m 29s\tremaining: 8m 13s\n", 2058 | "5400:\tlearn: 0.3199756\ttest: 0.3723152\tbest: 0.3723142 (5397)\ttotal: 4m 34s\tremaining: 8m 8s\n", 2059 | "5500:\tlearn: 0.3190305\ttest: 0.3721521\tbest: 0.3721521 (5500)\ttotal: 4m 39s\tremaining: 8m 2s\n", 2060 | "5600:\tlearn: 0.3180663\ttest: 0.3719317\tbest: 0.3719317 
(5600)\ttotal: 4m 44s\tremaining: 7m 57s\n", 2061 | "5700:\tlearn: 0.3172743\ttest: 0.3718330\tbest: 0.3718330 (5700)\ttotal: 4m 49s\tremaining: 7m 51s\n", 2062 | "5800:\tlearn: 0.3163399\ttest: 0.3716938\tbest: 0.3716938 (5800)\ttotal: 4m 53s\tremaining: 7m 46s\n", 2063 | "5900:\tlearn: 0.3153997\ttest: 0.3714888\tbest: 0.3714886 (5899)\ttotal: 4m 58s\tremaining: 7m 40s\n", 2064 | "6000:\tlearn: 0.3145143\ttest: 0.3713565\tbest: 0.3713443 (5991)\ttotal: 5m 3s\tremaining: 7m 35s\n", 2065 | "6100:\tlearn: 0.3135688\ttest: 0.3711795\tbest: 0.3711768 (6099)\ttotal: 5m 8s\tremaining: 7m 29s\n", 2066 | "6200:\tlearn: 0.3127407\ttest: 0.3710886\tbest: 0.3710886 (6200)\ttotal: 5m 13s\tremaining: 7m 24s\n", 2067 | "6300:\tlearn: 0.3118820\ttest: 0.3709710\tbest: 0.3709710 (6300)\ttotal: 5m 18s\tremaining: 7m 19s\n", 2068 | "6400:\tlearn: 0.3110969\ttest: 0.3708528\tbest: 0.3708528 (6400)\ttotal: 5m 22s\tremaining: 7m 13s\n", 2069 | "6500:\tlearn: 0.3102707\ttest: 0.3707461\tbest: 0.3707448 (6498)\ttotal: 5m 27s\tremaining: 7m 8s\n", 2070 | "6600:\tlearn: 0.3094866\ttest: 0.3706898\tbest: 0.3706898 (6600)\ttotal: 5m 32s\tremaining: 7m 3s\n", 2071 | "6700:\tlearn: 0.3086424\ttest: 0.3706255\tbest: 0.3706218 (6690)\ttotal: 5m 37s\tremaining: 6m 57s\n", 2072 | "6800:\tlearn: 0.3078532\ttest: 0.3704983\tbest: 0.3704947 (6798)\ttotal: 5m 42s\tremaining: 6m 52s\n", 2073 | "6900:\tlearn: 0.3069886\ttest: 0.3703896\tbest: 0.3703828 (6892)\ttotal: 5m 47s\tremaining: 6m 47s\n", 2074 | "7000:\tlearn: 0.3062019\ttest: 0.3702855\tbest: 0.3702855 (7000)\ttotal: 5m 52s\tremaining: 6m 42s\n", 2075 | "7100:\tlearn: 0.3054384\ttest: 0.3702080\tbest: 0.3702076 (7098)\ttotal: 5m 56s\tremaining: 6m 37s\n", 2076 | "7200:\tlearn: 0.3046275\ttest: 0.3701393\tbest: 0.3701393 (7200)\ttotal: 6m 1s\tremaining: 6m 31s\n", 2077 | "7300:\tlearn: 0.3039194\ttest: 0.3700597\tbest: 0.3700597 (7300)\ttotal: 6m 6s\tremaining: 6m 26s\n", 2078 | "7400:\tlearn: 0.3030846\ttest: 0.3700079\tbest: 0.3699990 
(7392)\ttotal: 6m 11s\tremaining: 6m 21s\n", 2079 | "7500:\tlearn: 0.3022570\ttest: 0.3698628\tbest: 0.3698628 (7500)\ttotal: 6m 16s\tremaining: 6m 16s\n", 2080 | "7600:\tlearn: 0.3014671\ttest: 0.3697234\tbest: 0.3697209 (7597)\ttotal: 6m 21s\tremaining: 6m 11s\n", 2081 | "7700:\tlearn: 0.3007301\ttest: 0.3696408\tbest: 0.3696349 (7695)\ttotal: 6m 26s\tremaining: 6m 6s\n", 2082 | "7800:\tlearn: 0.2999650\ttest: 0.3696314\tbest: 0.3696284 (7799)\ttotal: 6m 31s\tremaining: 6m 1s\n", 2083 | "7900:\tlearn: 0.2992056\ttest: 0.3696034\tbest: 0.3696034 (7900)\ttotal: 6m 36s\tremaining: 5m 56s\n", 2084 | "8000:\tlearn: 0.2983549\ttest: 0.3694847\tbest: 0.3694847 (8000)\ttotal: 6m 41s\tremaining: 5m 51s\n", 2085 | "8100:\tlearn: 0.2975575\ttest: 0.3693905\tbest: 0.3693832 (8094)\ttotal: 6m 46s\tremaining: 5m 46s\n", 2086 | "8200:\tlearn: 0.2968142\ttest: 0.3693207\tbest: 0.3693207 (8200)\ttotal: 6m 51s\tremaining: 5m 41s\n", 2087 | "8300:\tlearn: 0.2960877\ttest: 0.3692626\tbest: 0.3692597 (8294)\ttotal: 6m 56s\tremaining: 5m 36s\n", 2088 | "8400:\tlearn: 0.2953211\ttest: 0.3692038\tbest: 0.3691988 (8398)\ttotal: 7m 1s\tremaining: 5m 31s\n", 2089 | "8500:\tlearn: 0.2945980\ttest: 0.3691641\tbest: 0.3691545 (8476)\ttotal: 7m 7s\tremaining: 5m 26s\n", 2090 | "8600:\tlearn: 0.2938322\ttest: 0.3691234\tbest: 0.3691212 (8596)\ttotal: 7m 12s\tremaining: 5m 21s\n", 2091 | "8700:\tlearn: 0.2930071\ttest: 0.3690673\tbest: 0.3690580 (8658)\ttotal: 7m 17s\tremaining: 5m 16s\n", 2092 | "8800:\tlearn: 0.2922303\ttest: 0.3690234\tbest: 0.3690234 (8800)\ttotal: 7m 22s\tremaining: 5m 11s\n", 2093 | "8900:\tlearn: 0.2914938\ttest: 0.3689460\tbest: 0.3689460 (8900)\ttotal: 7m 27s\tremaining: 5m 6s\n", 2094 | "9000:\tlearn: 0.2907710\ttest: 0.3688827\tbest: 0.3688799 (8997)\ttotal: 7m 31s\tremaining: 5m 1s\n", 2095 | "9100:\tlearn: 0.2900229\ttest: 0.3688352\tbest: 0.3688352 (9100)\ttotal: 7m 36s\tremaining: 4m 56s\n", 2096 | "9200:\tlearn: 0.2892442\ttest: 0.3687413\tbest: 0.3687413 
(9200)\ttotal: 7m 41s\tremaining: 4m 50s\n", 2097 | "9300:\tlearn: 0.2885062\ttest: 0.3687262\tbest: 0.3687207 (9269)\ttotal: 7m 46s\tremaining: 4m 45s\n", 2098 | "9400:\tlearn: 0.2877793\ttest: 0.3686913\tbest: 0.3686891 (9397)\ttotal: 7m 51s\tremaining: 4m 40s\n", 2099 | "9500:\tlearn: 0.2869538\ttest: 0.3685994\tbest: 0.3685987 (9498)\ttotal: 7m 56s\tremaining: 4m 35s\n", 2100 | "9600:\tlearn: 0.2861795\ttest: 0.3685772\tbest: 0.3685734 (9592)\ttotal: 8m 1s\tremaining: 4m 30s\n", 2101 | "9700:\tlearn: 0.2854078\ttest: 0.3685484\tbest: 0.3685396 (9696)\ttotal: 8m 6s\tremaining: 4m 25s\n", 2102 | "9800:\tlearn: 0.2846896\ttest: 0.3685243\tbest: 0.3685205 (9782)\ttotal: 8m 11s\tremaining: 4m 20s\n", 2103 | "9900:\tlearn: 0.2838710\ttest: 0.3684569\tbest: 0.3684540 (9895)\ttotal: 8m 16s\tremaining: 4m 15s\n", 2104 | "10000:\tlearn: 0.2831418\ttest: 0.3684233\tbest: 0.3684208 (9993)\ttotal: 8m 21s\tremaining: 4m 10s\n", 2105 | "10100:\tlearn: 0.2825218\ttest: 0.3684175\tbest: 0.3684037 (10082)\ttotal: 8m 26s\tremaining: 4m 5s\n", 2106 | "10200:\tlearn: 0.2817390\ttest: 0.3683958\tbest: 0.3683908 (10183)\ttotal: 8m 31s\tremaining: 4m\n", 2107 | "10300:\tlearn: 0.2809499\ttest: 0.3683933\tbest: 0.3683905 (10212)\ttotal: 8m 36s\tremaining: 3m 55s\n", 2108 | "10400:\tlearn: 0.2802800\ttest: 0.3683796\tbest: 0.3683796 (10400)\ttotal: 8m 41s\tremaining: 3m 50s\n", 2109 | "10500:\tlearn: 0.2795632\ttest: 0.3683903\tbest: 0.3683783 (10401)\ttotal: 8m 46s\tremaining: 3m 45s\n", 2110 | "10600:\tlearn: 0.2788819\ttest: 0.3683868\tbest: 0.3683642 (10527)\ttotal: 8m 51s\tremaining: 3m 40s\n", 2111 | "10700:\tlearn: 0.2781030\ttest: 0.3683809\tbest: 0.3683576 (10677)\ttotal: 8m 56s\tremaining: 3m 35s\n", 2112 | "10800:\tlearn: 0.2774174\ttest: 0.3683718\tbest: 0.3683576 (10677)\ttotal: 9m 1s\tremaining: 3m 30s\n", 2113 | "10900:\tlearn: 0.2767312\ttest: 0.3683335\tbest: 0.3683310 (10898)\ttotal: 9m 7s\tremaining: 3m 25s\n", 2114 | "11000:\tlearn: 0.2760596\ttest: 0.3682948\tbest: 
0.3682884 (10996)\ttotal: 9m 12s\tremaining: 3m 20s\n", 2115 | "11100:\tlearn: 0.2753065\ttest: 0.3682629\tbest: 0.3682595 (11086)\ttotal: 9m 17s\tremaining: 3m 15s\n", 2116 | "11200:\tlearn: 0.2746245\ttest: 0.3682473\tbest: 0.3682406 (11171)\ttotal: 9m 22s\tremaining: 3m 10s\n", 2117 | "11300:\tlearn: 0.2739585\ttest: 0.3682471\tbest: 0.3682365 (11217)\ttotal: 9m 27s\tremaining: 3m 5s\n", 2118 | "11400:\tlearn: 0.2732829\ttest: 0.3682331\tbest: 0.3682325 (11386)\ttotal: 9m 33s\tremaining: 3m\n", 2119 | "11500:\tlearn: 0.2725566\ttest: 0.3682264\tbest: 0.3682215 (11496)\ttotal: 9m 38s\tremaining: 2m 55s\n", 2120 | "11600:\tlearn: 0.2719067\ttest: 0.3682170\tbest: 0.3682170 (11600)\ttotal: 9m 43s\tremaining: 2m 50s\n", 2121 | "11700:\tlearn: 0.2712049\ttest: 0.3682375\tbest: 0.3682127 (11627)\ttotal: 9m 48s\tremaining: 2m 45s\n", 2122 | "11800:\tlearn: 0.2704886\ttest: 0.3681876\tbest: 0.3681869 (11798)\ttotal: 9m 54s\tremaining: 2m 41s\n", 2123 | "11900:\tlearn: 0.2697195\ttest: 0.3681579\tbest: 0.3681394 (11858)\ttotal: 10m\tremaining: 2m 36s\n", 2124 | "12000:\tlearn: 0.2690434\ttest: 0.3681903\tbest: 0.3681394 (11858)\ttotal: 10m 7s\tremaining: 2m 31s\n", 2125 | "bestTest = 0.3681394479\n", 2126 | "bestIteration = 11858\n", 2127 | "Shrink model to first 11859 iterations.\n", 2128 | "Fold 5\n", 2129 | "0:\tlearn: 2.9424549\ttest: 2.9442838\tbest: 2.9442838 (0)\ttotal: 77.9ms\tremaining: 19m 28s\n", 2130 | "100:\tlearn: 0.9617585\ttest: 0.9881392\tbest: 0.9881392 (100)\ttotal: 7.21s\tremaining: 17m 43s\n" 2131 | ] 2132 | }, 2133 | { 2134 | "name": "stdout", 2135 | "output_type": "stream", 2136 | "text": [ 2137 | "200:\tlearn: 0.6761444\ttest: 0.7050895\tbest: 0.7050895 (200)\ttotal: 14.5s\tremaining: 17m 47s\n", 2138 | "300:\tlearn: 0.5715305\ttest: 0.6024932\tbest: 0.6024932 (300)\ttotal: 21.5s\tremaining: 17m 29s\n", 2139 | "400:\tlearn: 0.5181647\ttest: 0.5508480\tbest: 0.5508480 (400)\ttotal: 28.1s\tremaining: 17m 4s\n", 2140 | "500:\tlearn: 0.4840808\ttest: 
0.5186106\tbest: 0.5186106 (500)\ttotal: 34.9s\tremaining: 16m 50s\n", 2141 | "600:\tlearn: 0.4617105\ttest: 0.4980052\tbest: 0.4980052 (600)\ttotal: 41.7s\tremaining: 16m 38s\n", 2142 | "700:\tlearn: 0.4452419\ttest: 0.4829969\tbest: 0.4829969 (700)\ttotal: 48.5s\tremaining: 16m 28s\n", 2143 | "800:\tlearn: 0.4315767\ttest: 0.4711263\tbest: 0.4711263 (800)\ttotal: 57s\tremaining: 16m 50s\n", 2144 | "900:\tlearn: 0.4202764\ttest: 0.4615710\tbest: 0.4615710 (900)\ttotal: 1m 3s\tremaining: 16m 36s\n", 2145 | "1000:\tlearn: 0.4112617\ttest: 0.4540898\tbest: 0.4540898 (1000)\ttotal: 1m 10s\tremaining: 16m 22s\n", 2146 | "1100:\tlearn: 0.4027244\ttest: 0.4473620\tbest: 0.4473620 (1100)\ttotal: 1m 17s\tremaining: 16m 12s\n", 2147 | "1200:\tlearn: 0.3954945\ttest: 0.4418959\tbest: 0.4418959 (1200)\ttotal: 1m 23s\tremaining: 16m 1s\n", 2148 | "1300:\tlearn: 0.3892795\ttest: 0.4374623\tbest: 0.4374623 (1300)\ttotal: 1m 30s\tremaining: 15m 51s\n", 2149 | "1400:\tlearn: 0.3835470\ttest: 0.4335617\tbest: 0.4335617 (1400)\ttotal: 1m 37s\tremaining: 15m 42s\n", 2150 | "1500:\tlearn: 0.3784162\ttest: 0.4302539\tbest: 0.4302539 (1500)\ttotal: 1m 43s\tremaining: 15m 33s\n", 2151 | "1600:\tlearn: 0.3741321\ttest: 0.4274136\tbest: 0.4274136 (1600)\ttotal: 1m 50s\tremaining: 15m 23s\n", 2152 | "1700:\tlearn: 0.3701685\ttest: 0.4250229\tbest: 0.4250229 (1700)\ttotal: 1m 56s\tremaining: 15m 14s\n", 2153 | "1800:\tlearn: 0.3667586\ttest: 0.4230024\tbest: 0.4230024 (1800)\ttotal: 2m 3s\tremaining: 15m 4s\n", 2154 | "1900:\tlearn: 0.3637877\ttest: 0.4213772\tbest: 0.4213772 (1900)\ttotal: 2m 9s\tremaining: 14m 54s\n", 2155 | "2000:\tlearn: 0.3608081\ttest: 0.4197317\tbest: 0.4197317 (2000)\ttotal: 2m 16s\tremaining: 14m 47s\n", 2156 | "2100:\tlearn: 0.3581836\ttest: 0.4184556\tbest: 0.4184556 (2100)\ttotal: 2m 24s\tremaining: 14m 45s\n", 2157 | "2200:\tlearn: 0.3556847\ttest: 0.4172725\tbest: 0.4172725 (2200)\ttotal: 2m 30s\tremaining: 14m 35s\n", 2158 | "2300:\tlearn: 0.3534802\ttest: 
0.4162296\tbest: 0.4162296 (2300)\ttotal: 2m 36s\tremaining: 14m 26s\n", 2159 | "2400:\tlearn: 0.3513135\ttest: 0.4152141\tbest: 0.4152141 (2400)\ttotal: 2m 43s\tremaining: 14m 16s\n", 2160 | "2500:\tlearn: 0.3494048\ttest: 0.4144232\tbest: 0.4144232 (2500)\ttotal: 2m 49s\tremaining: 14m 7s\n", 2161 | "2600:\tlearn: 0.3474297\ttest: 0.4134853\tbest: 0.4134853 (2600)\ttotal: 2m 55s\tremaining: 13m 58s\n", 2162 | "2700:\tlearn: 0.3455128\ttest: 0.4126680\tbest: 0.4126680 (2700)\ttotal: 3m 2s\tremaining: 13m 50s\n", 2163 | "2800:\tlearn: 0.3438843\ttest: 0.4120378\tbest: 0.4120378 (2800)\ttotal: 3m 8s\tremaining: 13m 41s\n", 2164 | "2900:\tlearn: 0.3423575\ttest: 0.4114719\tbest: 0.4114719 (2900)\ttotal: 3m 14s\tremaining: 13m 32s\n", 2165 | "3000:\tlearn: 0.3407821\ttest: 0.4109110\tbest: 0.4109110 (3000)\ttotal: 3m 21s\tremaining: 13m 24s\n", 2166 | "3100:\tlearn: 0.3391884\ttest: 0.4103074\tbest: 0.4103074 (3100)\ttotal: 3m 27s\tremaining: 13m 16s\n", 2167 | "3200:\tlearn: 0.3376242\ttest: 0.4097487\tbest: 0.4097487 (3200)\ttotal: 3m 33s\tremaining: 13m 8s\n", 2168 | "3300:\tlearn: 0.3362089\ttest: 0.4093418\tbest: 0.4093418 (3300)\ttotal: 3m 40s\tremaining: 13m\n", 2169 | "3400:\tlearn: 0.3347776\ttest: 0.4088332\tbest: 0.4088332 (3400)\ttotal: 3m 46s\tremaining: 12m 52s\n", 2170 | "3500:\tlearn: 0.3334630\ttest: 0.4084179\tbest: 0.4084131 (3499)\ttotal: 3m 52s\tremaining: 12m 45s\n", 2171 | "3600:\tlearn: 0.3321925\ttest: 0.4079383\tbest: 0.4079383 (3600)\ttotal: 4m 1s\tremaining: 12m 45s\n", 2172 | "3700:\tlearn: 0.3309955\ttest: 0.4075805\tbest: 0.4075805 (3700)\ttotal: 4m 7s\tremaining: 12m 36s\n", 2173 | "3800:\tlearn: 0.3298875\ttest: 0.4072638\tbest: 0.4072638 (3800)\ttotal: 4m 14s\tremaining: 12m 28s\n", 2174 | "3900:\tlearn: 0.3287322\ttest: 0.4069309\tbest: 0.4069309 (3900)\ttotal: 4m 20s\tremaining: 12m 20s\n", 2175 | "4000:\tlearn: 0.3275012\ttest: 0.4065555\tbest: 0.4065555 (4000)\ttotal: 4m 26s\tremaining: 12m 13s\n", 2176 | "4100:\tlearn: 
0.3264662\ttest: 0.4062378\tbest: 0.4062378 (4100)\ttotal: 4m 34s\tremaining: 12m 9s\n", 2177 | "4200:\tlearn: 0.3253504\ttest: 0.4059299\tbest: 0.4059266 (4196)\ttotal: 4m 40s\tremaining: 12m 1s\n", 2178 | "4300:\tlearn: 0.3242990\ttest: 0.4056054\tbest: 0.4056049 (4299)\ttotal: 4m 46s\tremaining: 11m 53s\n", 2179 | "4400:\tlearn: 0.3232095\ttest: 0.4053209\tbest: 0.4053209 (4400)\ttotal: 4m 53s\tremaining: 11m 46s\n", 2180 | "4500:\tlearn: 0.3222201\ttest: 0.4050968\tbest: 0.4050943 (4499)\ttotal: 4m 59s\tremaining: 11m 38s\n", 2181 | "4600:\tlearn: 0.3212505\ttest: 0.4048767\tbest: 0.4048767 (4600)\ttotal: 5m 5s\tremaining: 11m 31s\n", 2182 | "4700:\tlearn: 0.3203304\ttest: 0.4046258\tbest: 0.4046240 (4697)\ttotal: 5m 11s\tremaining: 11m 23s\n", 2183 | "4800:\tlearn: 0.3192728\ttest: 0.4043751\tbest: 0.4043751 (4800)\ttotal: 5m 18s\tremaining: 11m 16s\n", 2184 | "4900:\tlearn: 0.3183672\ttest: 0.4041917\tbest: 0.4041917 (4900)\ttotal: 5m 24s\tremaining: 11m 8s\n", 2185 | "5000:\tlearn: 0.3174759\ttest: 0.4040201\tbest: 0.4040149 (4994)\ttotal: 5m 31s\tremaining: 11m 3s\n", 2186 | "5100:\tlearn: 0.3165674\ttest: 0.4038492\tbest: 0.4038492 (5100)\ttotal: 5m 38s\tremaining: 10m 56s\n", 2187 | "5200:\tlearn: 0.3156905\ttest: 0.4036690\tbest: 0.4036625 (5193)\ttotal: 5m 44s\tremaining: 10m 48s\n", 2188 | "5300:\tlearn: 0.3147239\ttest: 0.4034269\tbest: 0.4034269 (5300)\ttotal: 5m 50s\tremaining: 10m 41s\n", 2189 | "5400:\tlearn: 0.3137961\ttest: 0.4032637\tbest: 0.4032550 (5392)\ttotal: 5m 56s\tremaining: 10m 34s\n", 2190 | "5500:\tlearn: 0.3128559\ttest: 0.4030648\tbest: 0.4030648 (5500)\ttotal: 6m 4s\tremaining: 10m 29s\n", 2191 | "5600:\tlearn: 0.3119545\ttest: 0.4028348\tbest: 0.4028348 (5600)\ttotal: 6m 10s\tremaining: 10m 22s\n", 2192 | "5700:\tlearn: 0.3111319\ttest: 0.4026802\tbest: 0.4026802 (5700)\ttotal: 6m 17s\tremaining: 10m 15s\n", 2193 | "5800:\tlearn: 0.3101547\ttest: 0.4024679\tbest: 0.4024647 (5798)\ttotal: 6m 23s\tremaining: 10m 8s\n", 2194 | 
"5900:\tlearn: 0.3092607\ttest: 0.4023143\tbest: 0.4023143 (5900)\ttotal: 6m 30s\tremaining: 10m 1s\n", 2195 | "6000:\tlearn: 0.3083620\ttest: 0.4021604\tbest: 0.4021546 (5998)\ttotal: 6m 36s\tremaining: 9m 54s\n", 2196 | "6100:\tlearn: 0.3075716\ttest: 0.4019895\tbest: 0.4019892 (6097)\ttotal: 6m 43s\tremaining: 9m 47s\n", 2197 | "6200:\tlearn: 0.3066833\ttest: 0.4018588\tbest: 0.4018588 (6200)\ttotal: 6m 49s\tremaining: 9m 41s\n", 2198 | "6300:\tlearn: 0.3058037\ttest: 0.4017364\tbest: 0.4017361 (6299)\ttotal: 6m 57s\tremaining: 9m 36s\n", 2199 | "6400:\tlearn: 0.3050026\ttest: 0.4016023\tbest: 0.4016011 (6399)\ttotal: 7m 3s\tremaining: 9m 29s\n", 2200 | "6500:\tlearn: 0.3041733\ttest: 0.4014706\tbest: 0.4014706 (6500)\ttotal: 7m 10s\tremaining: 9m 22s\n", 2201 | "6600:\tlearn: 0.3033654\ttest: 0.4013718\tbest: 0.4013692 (6599)\ttotal: 7m 16s\tremaining: 9m 15s\n", 2202 | "6700:\tlearn: 0.3025699\ttest: 0.4013270\tbest: 0.4013242 (6693)\ttotal: 7m 22s\tremaining: 9m 8s\n", 2203 | "6800:\tlearn: 0.3017861\ttest: 0.4012242\tbest: 0.4012234 (6798)\ttotal: 7m 29s\tremaining: 9m 1s\n", 2204 | "6900:\tlearn: 0.3009621\ttest: 0.4010854\tbest: 0.4010819 (6895)\ttotal: 7m 37s\tremaining: 8m 56s\n", 2205 | "7000:\tlearn: 0.3001619\ttest: 0.4010114\tbest: 0.4010114 (7000)\ttotal: 7m 44s\tremaining: 8m 50s\n", 2206 | "7100:\tlearn: 0.2993572\ttest: 0.4009370\tbest: 0.4009341 (7097)\ttotal: 7m 50s\tremaining: 8m 43s\n", 2207 | "7200:\tlearn: 0.2985159\ttest: 0.4008075\tbest: 0.4008075 (7200)\ttotal: 7m 57s\tremaining: 8m 37s\n", 2208 | "7300:\tlearn: 0.2977427\ttest: 0.4007471\tbest: 0.4007430 (7288)\ttotal: 8m 4s\tremaining: 8m 30s\n", 2209 | "7400:\tlearn: 0.2969435\ttest: 0.4006814\tbest: 0.4006808 (7399)\ttotal: 8m 10s\tremaining: 8m 23s\n", 2210 | "7500:\tlearn: 0.2961321\ttest: 0.4006056\tbest: 0.4005972 (7489)\ttotal: 8m 17s\tremaining: 8m 17s\n", 2211 | "7600:\tlearn: 0.2953712\ttest: 0.4005281\tbest: 0.4005214 (7596)\ttotal: 8m 25s\tremaining: 8m 12s\n", 2212 | 
"7700:\tlearn: 0.2945888\ttest: 0.4004464\tbest: 0.4004449 (7699)\ttotal: 8m 30s\tremaining: 8m 4s\n", 2213 | "7800:\tlearn: 0.2938290\ttest: 0.4003592\tbest: 0.4003577 (7793)\ttotal: 8m 35s\tremaining: 7m 55s\n", 2214 | "7900:\tlearn: 0.2930511\ttest: 0.4002797\tbest: 0.4002797 (7900)\ttotal: 8m 40s\tremaining: 7m 47s\n", 2215 | "8000:\tlearn: 0.2922237\ttest: 0.4001815\tbest: 0.4001756 (7996)\ttotal: 8m 45s\tremaining: 7m 39s\n", 2216 | "8100:\tlearn: 0.2915275\ttest: 0.4001321\tbest: 0.4001321 (8100)\ttotal: 8m 50s\tremaining: 7m 31s\n", 2217 | "8200:\tlearn: 0.2907568\ttest: 0.4000595\tbest: 0.4000480 (8187)\ttotal: 8m 55s\tremaining: 7m 23s\n", 2218 | "8300:\tlearn: 0.2900331\ttest: 0.3999890\tbest: 0.3999875 (8299)\ttotal: 9m\tremaining: 7m 16s\n", 2219 | "8400:\tlearn: 0.2892897\ttest: 0.3999585\tbest: 0.3999585 (8400)\ttotal: 9m 5s\tremaining: 7m 8s\n", 2220 | "8500:\tlearn: 0.2885418\ttest: 0.3998574\tbest: 0.3998548 (8497)\ttotal: 9m 10s\tremaining: 7m\n", 2221 | "8600:\tlearn: 0.2878089\ttest: 0.3998342\tbest: 0.3998304 (8597)\ttotal: 9m 15s\tremaining: 6m 52s\n", 2222 | "8700:\tlearn: 0.2870797\ttest: 0.3998222\tbest: 0.3998087 (8641)\ttotal: 9m 19s\tremaining: 6m 45s\n", 2223 | "8800:\tlearn: 0.2863412\ttest: 0.3997846\tbest: 0.3997766 (8794)\ttotal: 9m 24s\tremaining: 6m 37s\n", 2224 | "8900:\tlearn: 0.2856493\ttest: 0.3997696\tbest: 0.3997518 (8854)\ttotal: 9m 29s\tremaining: 6m 30s\n" 2225 | ] 2226 | }, 2227 | { 2228 | "name": "stdout", 2229 | "output_type": "stream", 2230 | "text": [ 2231 | "9000:\tlearn: 0.2848238\ttest: 0.3997300\tbest: 0.3997300 (8999)\ttotal: 9m 34s\tremaining: 6m 22s\n", 2232 | "9100:\tlearn: 0.2841335\ttest: 0.3997097\tbest: 0.3997093 (9098)\ttotal: 9m 39s\tremaining: 6m 15s\n", 2233 | "9200:\tlearn: 0.2834343\ttest: 0.3997057\tbest: 0.3996978 (9150)\ttotal: 9m 44s\tremaining: 6m 8s\n", 2234 | "9300:\tlearn: 0.2826697\ttest: 0.3996663\tbest: 0.3996599 (9260)\ttotal: 9m 49s\tremaining: 6m 1s\n", 2235 | "9400:\tlearn: 
0.2818948\ttest: 0.3996026\tbest: 0.3996021 (9399)\ttotal: 9m 54s\tremaining: 5m 53s\n", 2236 | "9500:\tlearn: 0.2811680\ttest: 0.3995901\tbest: 0.3995835 (9493)\ttotal: 9m 59s\tremaining: 5m 46s\n", 2237 | "9600:\tlearn: 0.2804824\ttest: 0.3995799\tbest: 0.3995784 (9599)\ttotal: 10m 3s\tremaining: 5m 39s\n", 2238 | "9700:\tlearn: 0.2797218\ttest: 0.3995514\tbest: 0.3995498 (9697)\ttotal: 10m 8s\tremaining: 5m 32s\n", 2239 | "9800:\tlearn: 0.2790184\ttest: 0.3995658\tbest: 0.3995486 (9725)\ttotal: 10m 13s\tremaining: 5m 25s\n", 2240 | "9900:\tlearn: 0.2783224\ttest: 0.3995481\tbest: 0.3995409 (9897)\ttotal: 10m 18s\tremaining: 5m 18s\n", 2241 | "10000:\tlearn: 0.2776369\ttest: 0.3995318\tbest: 0.3995216 (9981)\ttotal: 10m 23s\tremaining: 5m 11s\n", 2242 | "10100:\tlearn: 0.2769338\ttest: 0.3994822\tbest: 0.3994822 (10100)\ttotal: 10m 28s\tremaining: 5m 4s\n", 2243 | "10200:\tlearn: 0.2762047\ttest: 0.3994717\tbest: 0.3994592 (10138)\ttotal: 10m 33s\tremaining: 4m 58s\n", 2244 | "10300:\tlearn: 0.2754445\ttest: 0.3994536\tbest: 0.3994536 (10300)\ttotal: 10m 38s\tremaining: 4m 51s\n", 2245 | "10400:\tlearn: 0.2747585\ttest: 0.3994544\tbest: 0.3994426 (10388)\ttotal: 10m 43s\tremaining: 4m 44s\n", 2246 | "10500:\tlearn: 0.2740382\ttest: 0.3994316\tbest: 0.3994234 (10488)\ttotal: 10m 48s\tremaining: 4m 37s\n", 2247 | "10600:\tlearn: 0.2733845\ttest: 0.3994220\tbest: 0.3994157 (10593)\ttotal: 10m 53s\tremaining: 4m 31s\n", 2248 | "10700:\tlearn: 0.2726328\ttest: 0.3994026\tbest: 0.3994018 (10697)\ttotal: 10m 58s\tremaining: 4m 24s\n", 2249 | "10800:\tlearn: 0.2719250\ttest: 0.3994230\tbest: 0.3993875 (10720)\ttotal: 11m 3s\tremaining: 4m 17s\n", 2250 | "10900:\tlearn: 0.2712554\ttest: 0.3993818\tbest: 0.3993764 (10893)\ttotal: 11m 8s\tremaining: 4m 11s\n", 2251 | "11000:\tlearn: 0.2705212\ttest: 0.3993897\tbest: 0.3993764 (10893)\ttotal: 11m 13s\tremaining: 4m 4s\n", 2252 | "bestTest = 0.3993763864\n", 2253 | "bestIteration = 10893\n", 2254 | "Shrink model to first 
10894 iterations.\n", 2255 | "MEAN SCORE = 0.37665441095147056\n" 2256 | ] 2257 | } 2258 | ], 2259 | "source": [ 2260 | "model_cat = CatBoostClassifier(\n", 2261 | " **{\n", 2262 | " 'depth': 5,\n", 2263 | " 'n_estimators': 15000,\n", 2264 | " 'learning_rate': 0.01,\n", 2265 | " 'random_state': 567,\n", 2266 | " 'task_type': 'GPU',\n", 2267 | " 'thread_count': 1,\n", 2268 | " \"verbose\": 100,\n", 2269 | " \"use_best_model\": True,\n", 2270 | " 'nan_mode': 'Max',\n", 2271 | " })\n", 2272 | "\n", 2273 | "probs = []\n", 2274 | "probs_train = []\n", 2275 | "i = 1\n", 2276 | "scoring = 0\n", 2277 | "group_kfold = GroupKFold(n_splits=5)\n", 2278 | "cols = X_train.drop(columns=['ID', 'ID2', 'join_date']).columns\n", 2279 | "for train_index, test_index in group_kfold.split(X_train, y_train,\n", 2280 | " np.array(X_train['ID'])):\n", 2281 | " X_real_train, X_valid = X_train.iloc[train_index], X_train.iloc[test_index]\n", 2282 | " y_real_train, y_valid = y_train.iloc[train_index], y_train.iloc[test_index]\n", 2283 | " print('Fold', i)\n", 2284 | " model_cat.fit(\n", 2285 | " X_real_train[cols],\n", 2286 | " y_real_train,\n", 2287 | " cat_features=cat_features,\n", 2288 | " eval_set=[(X_valid[cols], y_valid)],\n", 2289 | " early_stopping_rounds = 200,\n", 2290 | " )\n", 2291 | " scoring += model_cat.get_best_score()['validation']['MultiClass']\n", 2292 | "\n", 2293 | " proba = model_cat.predict_proba(X_test[cols])\n", 2294 | " probs.append(proba)\n", 2295 | " probs_train.append(model_cat.predict_proba(X_train[cols]))\n", 2296 | " i += 1\n", 2297 | "scoring /= 5\n", 2298 | "print('MEAN SCORE =', scoring)" 2299 | ] 2300 | }, 2301 | { 2302 | "cell_type": "code", 2303 | "execution_count": 52, 2304 | "metadata": {}, 2305 | "outputs": [ 2306 | { 2307 | "data": { 2308 | "text/html": [ 2309 | "
\n", 2310 | "\n", 2323 | "\n", 2324 | " \n", 2325 | " \n", 2326 | " \n", 2327 | " \n", 2328 | " \n", 2329 | " \n", 2330 | " \n", 2331 | " \n", 2332 | " \n", 2333 | " \n", 2334 | " \n", 2335 | " \n", 2336 | " \n", 2337 | " \n", 2338 | " \n", 2339 | " \n", 2340 | " \n", 2341 | " \n", 2342 | " \n", 2343 | " \n", 2344 | " \n", 2345 | " \n", 2346 | " \n", 2347 | " \n", 2348 | " \n", 2349 | " \n", 2350 | " \n", 2351 | " \n", 2352 | " \n", 2353 | " \n", 2354 | " \n", 2355 | " \n", 2356 | " \n", 2357 | " \n", 2358 | " \n", 2359 | " \n", 2360 | " \n", 2361 | " \n", 2362 | " \n", 2363 | " \n", 2364 | " \n", 2365 | " \n", 2366 | " \n", 2367 | " \n", 2368 | " \n", 2369 | " \n", 2370 | " \n", 2371 | " \n", 2372 | " \n", 2373 | " \n", 2374 | " \n", 2375 | " \n", 2376 | " \n", 2377 | " \n", 2378 | " \n", 2379 | " \n", 2380 | " \n", 2381 | " \n", 2382 | " \n", 2383 | " \n", 2384 | " \n", 2385 | " \n", 2386 | " \n", 2387 | " \n", 2388 | " \n", 2389 | " \n", 2390 | " \n", 2391 | " \n", 2392 | " \n", 2393 | " \n", 2394 | " \n", 2395 | " \n", 2396 | " \n", 2397 | " \n", 2398 | " \n", 2399 | " \n", 2400 | " \n", 2401 | " \n", 2402 | " \n", 2403 | " \n", 2404 | " \n", 2405 | " \n", 2406 | " \n", 2407 | " \n", 2408 | " \n", 2409 | " \n", 2410 | " \n", 2411 | " \n", 2412 | " \n", 2413 | " \n", 2414 | " \n", 2415 | " \n", 2416 | "
importance
RVSZ3.425022
sex1.030568
marital_status6.894693
occupation_code2.663958
birth_year2.524650
from_begin1.836085
day_of_year1.021888
age2.946634
mean_age_in_occupation1.728786
std_age_in_occupation2.435206
occupation_code_freq1.001891
branch_ocupation_freq1.370450
branch_ocupcode_freq1.119399
product_comb3.557339
RVSZ_JZ9D2.484825
RVSZ_ECY31.351959
from_arise_product_AHXO_in_branch1.080058
RVSZ__age_mean3.978749
RVSZ__age_std2.444638
dayofweek_cat1.230625
from_begin_cat2.898488
# Feature importances above 1 for the model currently held in `model_cat`
# (in the notebook this expression is rendered as the cell output).
pd.DataFrame(model_cat.feature_importances_,
             index=cols,
             columns=['importance']).query('importance>1')

# Average the per-fold test probabilities.
# The shape and the number of folds are taken from `probs` itself instead of
# being hard-coded, so this keeps working if the test size, the number of
# classes or the number of folds changes.
new_a = np.mean(probs, axis=0)

# One column per class; integer class ids are mapped back to their original
# labels through `le`.
# NOTE(review): `le` is defined earlier in the notebook - presumably the label
# encoder fitted on the target; confirm.
y_test = pd.DataFrame(new_a)
y_test.columns = le.inverse_transform(y_test.columns)
# Overwrite predictions with what is already known from X_test.
#
# For every test row:
#   * a product flagged with 1 in X_test gets probability 1.0;
#   * the remaining products are divided by the sum of their predicted
#     probabilities, so they sum to 1 over the not-flagged products.
#
# Rows are aligned by POSITION: y_test was built from predict_proba(X_test),
# so row k of y_test belongs to row k of X_test whatever X_test's index is.
# Working on whole arrays also avoids one O(n) boolean-mask lookup per cell.
product_cols = list(names_products)
if len(X_test) != len(y_test):
    raise ValueError('X_test and y_test must have the same number of rows')

owned = X_test[product_cols].to_numpy() == 1
predicted = y_test[product_cols].to_numpy(dtype=float)

# Per-row sum of the predictions of the products that are NOT flagged.
remaining = np.where(owned, 0.0, predicted).sum(axis=1, keepdims=True)

# A row whose products are all flagged has remaining == 0; its quotients are
# discarded by the np.where below, so the division warnings are silenced.
with np.errstate(divide='ignore', invalid='ignore'):
    rescaled = predicted / remaining

y_test[product_cols] = np.where(owned, 1.0, rescaled)
\n", 2533 | "\n", 2546 | "\n", 2547 | " \n", 2548 | " \n", 2549 | " \n", 2550 | " \n", 2551 | " \n", 2552 | " \n", 2553 | " \n", 2554 | " \n", 2555 | " \n", 2556 | " \n", 2557 | " \n", 2558 | " \n", 2559 | " \n", 2560 | " \n", 2561 | " \n", 2562 | " \n", 2563 | " \n", 2564 | " \n", 2565 | " \n", 2566 | " \n", 2567 | " \n", 2568 | " \n", 2569 | " \n", 2570 | " \n", 2571 | " \n", 2572 | " \n", 2573 | " \n", 2574 | " \n", 2575 | " \n", 2576 | " \n", 2577 | " \n", 2578 | " \n", 2579 | " \n", 2580 | " \n", 2581 | "
ID X PCODELabel
0F86J5PC X 66FJ0.000079
1F86J5PC X 7POT0.000080
2F86J5PC X 8NN10.000002
3F86J5PC X AHXO0.000017
4F86J5PC X BSTQ0.000023
# Reformat the predictions to the submission layout: one row per
# (test ID, product) pair with the key "<ID> X <PCODE>" and its probability.
# Rows are emitted test-row by test-row, products in y_test column order.
test_ids = X_test['ID'].to_numpy()
answer_mass = [
    [row_id + ' X ' + pcode, label]
    for row_id, labels in zip(test_ids, y_test.to_numpy())
    for pcode, label in zip(y_test.columns, labels)
]

df_answer = pd.DataFrame(answer_mass, columns=['ID X PCODE', 'Label'])
df_answer.head()
\n", 2621 | "\n", 2634 | "\n", 2635 | " \n", 2636 | " \n", 2637 | " \n", 2638 | " \n", 2639 | " \n", 2640 | " \n", 2641 | " \n", 2642 | " \n", 2643 | " \n", 2644 | " \n", 2645 | " \n", 2646 | " \n", 2647 | " \n", 2648 | " \n", 2649 | " \n", 2650 | " \n", 2651 | " \n", 2652 | " \n", 2653 | " \n", 2654 | " \n", 2655 | " \n", 2656 | " \n", 2657 | " \n", 2658 | " \n", 2659 | " \n", 2660 | " \n", 2661 | " \n", 2662 | " \n", 2663 | " \n", 2664 | " \n", 2665 | " \n", 2666 | " \n", 2667 | " \n", 2668 | " \n", 2669 | "
ID X PCODELabel
00021EE1 X P5DA0.000005
10029J1L X P5DA0.000114
2004QK71 X P5DA0.000091
3005AP9V X P5DA0.000021
40096G27 X P5DA0.000015
# --- Blending models --------------------------------------------------------
# Two-step weighted blend on the shared key 'ID X PCODE':
#   step 1: 0.8 * submiss.csv + 0.2 * df_answer (this notebook's predictions)
#   step 2: 0.6 * (step 1)    + 0.4 * submis_1.csv
# i.e. 0.48 * submiss.csv + 0.12 * df_answer + 0.40 * submis_1.csv overall.
# The join key is given explicitly instead of relying on "all common columns".
df_new = pd.read_csv('submiss.csv')
df = df_new.merge(df_answer.rename(columns={'Label': 'lbl'}), on='ID X PCODE')
df['Label'] = df['Label'] * 0.8 + df['lbl'] * 0.2
df = df.drop(columns=['lbl'])

df_new = pd.read_csv('submis_1.csv')
df = df.merge(df_new.rename(columns={'Label': 'lbl'}), on='ID X PCODE')
df['Label'] = df['Label'] * 0.6 + df['lbl'] * 0.4
df = df.drop(columns=['lbl'])
df.head()

# --- Back to the wide view --------------------------------------------------
# One row per ID, one column per product code, values are the blended labels.
df_replaced = df.copy()
id_and_pcode = [key.split(' X ') for key in df_replaced['ID X PCODE']]
df_replaced['ID'] = [parts[0] for parts in id_and_pcode]
df_replaced['PCODE'] = [parts[1] for parts in id_and_pcode]
df_replaced = df_replaced.set_index(['ID', 'PCODE'])['Label'].unstack().reset_index()

# --- Extra read of the raw data ---------------------------------------------
train_base = pd.read_csv('Train.csv')
test_base = pd.read_csv('Test.csv')

# --- Rules learned from the training statistics -----------------------------
# dict_post[(pr1, pr2)] = (1, v) means: among training rows with pr1 == 1,
# more than `threshold` of the (more than 100) rows have pr2 == v.
threshold = 0.999
dict_post = {}
for pr1 in names_products:
    owners = train_base[train_base[pr1] == 1]  # invariant for the inner loop
    for pr2 in names_products:
        if pr1 == pr2:
            continue
        stats = owners[pr2].value_counts()
        stats0 = stats.get(0, 0)
        stats1 = stats.get(1, 0)
        total = stats0 + stats1
        if total <= 100:
            continue
        # A share above 0.999 makes a tie impossible, so the majority value
        # is well defined here.
        if max(stats0, stats1) / total > threshold:
            dict_post[(pr1, pr2)] = (1, 0 if stats0 > stats1 else 1)

# --- Replace predictions that the rules make (almost) certain ---------------
# Only rules whose implied value is 1 are applied. For a row where pr1 is 1
# and pr2 is not 1: every product that is not 1 is pushed down to 1e-53 and
# pr2 is set to 1.0. Conditions are evaluated on the values as they were
# BEFORE any replacement, and rules are applied in dict order, so when several
# rules hit the same row the last one decides which product ends up at 1.0.
product_cols = list(names_products)
col_pos = {name: pos for pos, name in enumerate(product_cols)}
original = df_replaced[product_cols].to_numpy(dtype=float)
updated = original.copy()
not_one = original != 1
for (pr1, pr2), (_, implied) in dict_post.items():
    if implied != 1:
        continue
    hit = (original[:, col_pos[pr1]] == 1) & not_one[:, col_pos[pr2]]
    updated[hit[:, None] & not_one] = 1e-53
    # The notebook wrote 0.9999999999999999999999999999999999 here, which is
    # exactly 1.0 in double precision.
    updated[hit, col_pos[pr2]] = 1.0
df_replaced[product_cols] = updated
2800 | "execution_count": 78, 2801 | "metadata": {}, 2802 | "outputs": [ 2803 | { 2804 | "data": { 2805 | "text/html": [ 2806 | "
\n", 2807 | "\n", 2820 | "\n", 2821 | " \n", 2822 | " \n", 2823 | " \n", 2824 | " \n", 2825 | " \n", 2826 | " \n", 2827 | " \n", 2828 | " \n", 2829 | " \n", 2830 | " \n", 2831 | " \n", 2832 | " \n", 2833 | " \n", 2834 | " \n", 2835 | " \n", 2836 | " \n", 2837 | " \n", 2838 | " \n", 2839 | " \n", 2840 | " \n", 2841 | " \n", 2842 | " \n", 2843 | " \n", 2844 | " \n", 2845 | " \n", 2846 | " \n", 2847 | " \n", 2848 | " \n", 2849 | " \n", 2850 | " \n", 2851 | " \n", 2852 | " \n", 2853 | " \n", 2854 | " \n", 2855 | "
ID X PCODELabel
00021EE1 X P5DA1.000000e-53
10029J1L X P5DA1.138897e-04
2004QK71 X P5DA9.056966e-05
3005AP9V X P5DA2.055958e-05
40096G27 X P5DA1.524471e-05
# Making submission view: melt the wide ID x product table back to one row per
# (ID, product) pair, keyed as "<ID> X <PCODE>".
# `value_vars` receives the list of column labels, as pandas.melt documents,
# rather than a DataFrame slice.
product_codes = [
    'P5DA', 'RIBP', '8NN1', '7POT', '66FJ', 'GYSR', 'SOP4',
    'RVSZ', 'PYUQ', 'LJR9', 'N2MW', 'AHXO', 'BSTQ', 'FM3X',
    'K6QO', 'QBOL', 'JWFN', 'JZ9D', 'J9JW', 'GHYX', 'ECY3',
]
df_replaced = df_replaced.melt(
    id_vars=['ID'],
    value_vars=product_codes,
    var_name='PCODE',
    value_name='Label',
)
df_replaced['ID X PCODE'] = df_replaced['ID'] + ' X ' + df_replaced['PCODE']
df_replaced = df_replaced[['ID X PCODE', 'Label']]
df_replaced.head()

# Write the final blended and post-processed predictions; this is the file to
# submit.
df_replaced.to_csv('submis.csv', index=False)
"base_numbering": 1, 2927 | "nav_menu": { 2928 | "height": "142px", 2929 | "width": "160px" 2930 | }, 2931 | "number_sections": true, 2932 | "sideBar": true, 2933 | "skip_h1_title": false, 2934 | "title_cell": "Table of Contents", 2935 | "title_sidebar": "Contents", 2936 | "toc_cell": false, 2937 | "toc_position": { 2938 | "height": "calc(100% - 180px)", 2939 | "left": "10px", 2940 | "top": "150px", 2941 | "width": "220.6px" 2942 | }, 2943 | "toc_section_display": true, 2944 | "toc_window_display": true 2945 | } 2946 | }, 2947 | "nbformat": 4, 2948 | "nbformat_minor": 1 2949 | } 2950 | --------------------------------------------------------------------------------