├── README.md ├── kaggle-survey-2018.zip └── End2EndXGBoost-Solution.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # 2022-odsc-xgboost 2 | -------------------------------------------------------------------------------- /kaggle-survey-2018.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mattharrison/2022-odsc-xgboost/main/kaggle-survey-2018.zip -------------------------------------------------------------------------------- /End2EndXGBoost-Solution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# End to End XGBoost\n", 8 | "\n", 9 | "https://github.com/mattharrison/\n", 10 | "\n", 11 | "©2022 MetaSnake\n", 12 | "\n", 13 | "`@__mharrison__`" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "import xgboost\n", 23 | "xgboost.__version__" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## Libraries\n", 80 | "We will also use SHAP, xgbfir, openpyxl, and hyperopt." 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "# for colab\n", 90 | "!pip install dtreeviz feature_engine pybaobabdt xgbfir shap" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": { 97 | "lines_to_next_cell": 0 98 | }, 99 | "outputs": [], 100 | "source": [ 101 | "from feature_engine import encoding, imputation\n", 102 | "import matplotlib.pyplot as plt\n", 103 | "import numpy as np\n", 104 | "import pandas as pd\n", 105 | "\n", 106 | "from sklearn import base, compose, datasets, ensemble, \\\n", 107 | "    metrics, model_selection, pipeline, preprocessing, tree\n", 108 | "import xgboost as xgb\n", 109 | "import yellowbrick.model_selection as ms\n", 110 | "from yellowbrick import classifier\n", 111 | "\n", 112 | "import urllib.request  # import the submodule explicitly; plain `import urllib` doesn't expose .request\n", 113 | "import zipfile" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": { 134 | "lines_to_next_cell": 2 135 | }, 136 | "outputs": [], 137 | "source": [] 138 | }, 139 | 
{ 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "## Datasets\n", 144 | "\n", 145 | "I'll be demoing with Kaggle 2018 survey data\n" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "url = 'https://github.com/mattharrison/datasets/raw/master/data/kaggle-survey-2018.zip'\n", 155 | "fin = urllib.request.urlopen(url)\n", 156 | "#fin = open('kaggle-survey-2018.zip', mode='rb')\n", 157 | "data = fin.read()\n", 158 | "with open('kaggle-survey-2018.zip', mode='wb') as fout:\n", 159 | " fout.write(data)\n", 160 | "with zipfile.ZipFile('kaggle-survey-2018.zip') as z:\n", 161 | " print(z.namelist())\n", 162 | " kag = pd.read_csv(z.open('multipleChoiceResponses.csv'))\n", 163 | " kag_questions = kag.iloc[0]\n", 164 | " raw = kag.iloc[1:]" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": { 171 | "scrolled": true 172 | }, 173 | "outputs": [], 174 | "source": [ 175 | "def topn(ser, n=5, default='other'):\n", 176 | " counts = ser.value_counts()\n", 177 | " return ser.where(ser.isin(counts.index[:n]), default)\n", 178 | "\n", 179 | "def tweak_kag(df):\n", 180 | " return (df\n", 181 | " #.query('Q3.isin([\"United States of America\", \"China\", \"India\"]) '\\\n", 182 | " # 'and Q6.isin([\"Data Scientist\", \"Software Engineer\"])')\n", 183 | " .loc[df.Q3.isin([\"United States of America\", \"China\", \"India\"]) &\n", 184 | " df.Q6.isin([\"Data Scientist\", \"Software Engineer\"])]\n", 185 | " .pipe(lambda df_:\n", 186 | " df_.assign(**(df_.Q1.pipe(pd.get_dummies, drop_first=True, prefix='gender')),\n", 187 | " age=df_.Q2.str.slice(0,2).astype(int),\n", 188 | " **(df_.Q3.pipe(pd.get_dummies, drop_first=True, prefix='country')),\n", 189 | " education=df_.Q4.replace({'Master’s degree': 18,\n", 190 | " 'Bachelor’s degree': 16,\n", 191 | " 'Doctoral degree': 20,\n", 192 | " 'Some college/university study without earning a bachelor’s degree': 13,\n", 193 | " 'Professional degree': 19,\n", 194 | " 'I prefer not to answer': None,\n", 195 | " 'No formal education past high school': 12}),\n", 196 | " **(df_.Q5\n", 197 | " .pipe(topn, n=3)\n", 198 | " .replace({\n", 199 | " 'Computer science (software engineering, etc.)': 'cs',\n", 200 | " 'Engineering (non-computer focused)': 'eng',\n", 201 | " 'Mathematics or statistics': 'stat'})\n", 202 | " .pipe(pd.get_dummies, drop_first=True, prefix='major')),\n", 203 | " title=df_.Q6,\n", 204 | " years_exp=(df_.Q8.str.replace('+','', regex=False)\n", 205 | " .str.split('-', expand=True)\n", 206 | " .iloc[:,0]\n", 207 | " .astype(float)),\n", 208 | " compensation=(df_.Q9.str.replace('+','', regex=False)\n", 209 | " .str.replace(',','', regex=False)\n", 210 | " .str.replace('500000', '500', regex=False)\n", 211 | " .str.replace('I do not wish to disclose my approximate yearly compensation', '0', regex=False)\n", 212 | " .str.split('-', expand=True)\n", 213 | " .iloc[:,0]\n", 214 | " .fillna(0)\n", 215 | " .astype(int)\n", 216 | " .mul(1_000)\n", 217 | " ),\n", 218 | " python=df_.Q16_Part_1.fillna(0).replace('Python', 1),\n", 219 | " r=df_.Q16_Part_2.fillna(0).replace('R', 1),\n", 220 | " sql=df_.Q16_Part_3.fillna(0).replace('SQL', 1)\n", 221 | " )#assign\n", 222 | " \n", 223 | " )#pipe\n", 224 | " .rename(columns=lambda col:col.replace(' ', '_'))\n", 225 | " .loc[:, 'gender_Male':] \n", 226 | " .dropna()\n", 227 | " )\n", 228 | "kag = tweak_kag(raw)\n", 229 | "kag_X = 
kag.drop(columns='title')\n", 230 | "kag_y = (kag.title == 'Data Scientist')\n", 231 | "kag_X_train, kag_X_test, kag_y_train, kag_y_test = model_selection.train_test_split(\n", 232 | "    kag_X, kag_y, stratify=kag_y, random_state=42)" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "kag_y" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "## Stumps, Trees, and Forests\n", 270 | "\n", 271 | "Decision trees use a greedy algorithm to split on the feature (column) that results in the most \"pure\" split." 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [ 280 | "# True - DS\n", 281 | "kag_y.value_counts()" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "metadata": { 288 | "lines_to_next_cell": 2, 289 | "scrolled": true 290 | }, 291 | "outputs": [], 292 | "source": [ 293 | "stump = tree.DecisionTreeClassifier(max_depth=1)\n", 294 | "stump.fit(kag_X_train, kag_y_train)\n", 295 | "stump.score(kag_X_test, kag_y_test)\n" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "# False - SE, Data Scientist - DS\n", 305 | "stump.classes_" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": { 312 | "scrolled": true 313 | }, 314 | "outputs": [], 315 | "source": [ 316 | "features = list(c for c in kag_X_train.columns)\n", 317 | "_ = tree.plot_tree(stump, feature_names=features, filled=True, \n", 318 | "                   class_names=['SE', 'DS'])" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "## Underfit\n", 340 | "A stump is too simple. It has too much *bias*.\n", 341 | "\n", 342 | "Solutions:\n", 343 | "\n", 344 | "* Add more features\n", 345 | "* Use a more complex model\n", 346 | "\n", 347 | "For a tree we can let it grow deeper, which should do both." 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": null, 353 | "metadata": {}, 354 | "outputs": [], 355 | "source": [] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "metadata": {}, 360 | "source": [ 361 | "## Overfitting\n", 362 | "\n", 363 | "A model is too complicated. It has too much *variance*.\n", 364 | "\n", 365 | "Solutions:\n", 366 | "\n", 367 | "* Simplify or constrain (*regularize*)\n", 368 | "* Add more samples\n", 369 | "\n", 370 | "For a tree we can prune back the growth so that the leaf nodes aren't overly specific." 
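]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A sketch of that pruning idea (the `max_depth`/`min_samples_leaf` values here are hypothetical; compare with the unconstrained tree below):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# constrain growth so the leaves stay general\n",
"pruned = tree.DecisionTreeClassifier(max_depth=7, min_samples_leaf=20)\n",
"pruned.fit(kag_X_train, kag_y_train)\n",
"pruned.score(kag_X_test, kag_y_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [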
371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "metadata": { 377 | "lines_to_next_cell": 2, 378 | "scrolled": false 379 | }, 380 | "outputs": [], 381 | "source": [ 382 | "hi_variance = tree.DecisionTreeClassifier(max_depth=None)\n", 383 | "hi_variance.fit(kag_X_train, kag_y_train)\n", 384 | "hi_variance.score(kag_X_test, kag_y_test)" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": null, 390 | "metadata": { 391 | "scrolled": false 392 | }, 393 | "outputs": [], 394 | "source": [ 395 | "features = list(c for c in kag_X_train.columns)\n", 396 | "_ = tree.plot_tree(hi_variance, feature_names=features, filled=True, \n", 397 | "                   class_names=['SE', 'DS'])" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": null, 403 | "metadata": { 404 | "scrolled": false 405 | }, 406 | "outputs": [], 407 | "source": [ 408 | "# limit view to first 2\n", 409 | "features = list(c for c in kag_X_train.columns)\n", 410 | "_ = tree.plot_tree(hi_variance, feature_names=features, filled=True, \n", 411 | "                   class_names=['SE', 'DS'], max_depth=2)" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": null, 417 | "metadata": {}, 418 | "outputs": [], 419 | "source": [] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": null, 424 | "metadata": {}, 425 | "outputs": [], 426 | "source": [] 427 | }, 428 | { 429 | "cell_type": "markdown", 430 | "metadata": {}, 431 | "source": [ 432 | "\n", 433 | "## Tree Hyperparameters\n", 434 | "\n", 435 | "*max_\\** parameters - Raise to make more complex (overfit|more variance), lower to simplify (underfit|more bias)\n", 436 | "\n", 437 | "*min_\\** parameters - Lower to make more complex (overfit|more variance), raise to simplify (underfit|more bias)\n", 438 | "\n", 439 | "* 'max_depth=None' - Tree depth\n", 440 | "* 'max_features=None' - Number of features to examine for a split\n", 441 | "* 'max_leaf_nodes=None' - Number of leaves\n", 442 | "* 'min_impurity_decrease=0' - Split only when the decrease in *impurity* is >= this value. (*Impurity*: 0 - 100% accurate, .3 - 70%. Going from 70% to 100% accurate is a decrease of .3) \n", 443 | "* 'min_samples_leaf=1' - Minimum samples at each leaf.\n", 444 | "* 'min_samples_split=2' - Minimum samples required to split a node.\n", 445 | "* 'min_weight_fraction_leaf=0' - The fraction of the total weights required to be a leaf.\n" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": null, 451 | "metadata": { 452 | "scrolled": true 453 | }, 454 | "outputs": [], 455 | "source": [ 456 | "print(dir(stump))" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": null, 462 | "metadata": {}, 463 | "outputs": [], 464 | "source": [] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": null, 469 | "metadata": {}, 470 | "outputs": [], 471 | "source": [] 472 | }, 473 | { 474 | "cell_type": "markdown", 475 | "metadata": {}, 476 | "source": [ 477 | "## Random Forest\n", 478 | "\n", 479 | "Uses *bagging* to ensemble many trees in an attempt to lower variance." 
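]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of what bagging means, hand-rolled with bootstrap samples (`RandomForestClassifier` in the next cell automates this and also samples columns at each split; the tree count of 25 is arbitrary):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# bagging by hand: each tree sees a different bootstrap sample of the rows\n",
"rng = np.random.default_rng(42)\n",
"preds = []\n",
"for _ in range(25):\n",
"    idx = rng.integers(0, len(kag_X_train), len(kag_X_train))  # sample rows w/ replacement\n",
"    t = tree.DecisionTreeClassifier()\n",
"    t.fit(kag_X_train.iloc[idx], kag_y_train.iloc[idx])\n",
"    preds.append(t.predict(kag_X_test))\n",
"metrics.accuracy_score(kag_y_test, np.mean(preds, axis=0) > .5)  # majority vote"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [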
480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": null, 485 | "metadata": { 486 | "lines_to_next_cell": 2, 487 | "scrolled": false 488 | }, 489 | "outputs": [], 490 | "source": [ 491 | "rf = ensemble.RandomForestClassifier(random_state=42)\n", 492 | "rf.fit(kag_X_train, kag_y_train)\n", 493 | "rf.score(kag_X_test, kag_y_test)" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": null, 499 | "metadata": { 500 | "scrolled": true 501 | }, 502 | "outputs": [], 503 | "source": [ 504 | "len(rf.estimators_)" 505 | ] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "execution_count": null, 510 | "metadata": { 511 | "scrolled": false 512 | }, 513 | "outputs": [], 514 | "source": [ 515 | "features = list(c for c in kag_X_train.columns)\n", 516 | "_ = tree.plot_tree(rf.estimators_[0], feature_names=features, filled=True, \n", 517 | "                   class_names=['SE', 'DS'])" 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": null, 523 | "metadata": {}, 524 | "outputs": [], 525 | "source": [] 526 | }, 527 | { 528 | "cell_type": "markdown", 529 | "metadata": {}, 530 | "source": [ 531 | "## Random Forest Hyperparameters\n", 532 | "\n", 533 | "*max_\\** parameters - Raise to make more complex (overfit|more variance), lower to simplify (underfit|more bias)\n", 534 | "\n", 535 | "*min_\\** parameters - Lower to make more complex (overfit|more variance), raise to simplify (underfit|more bias)\n", 536 | "\n", 537 | "* 'n_estimators=100' - Number of trees - should be *max_estimators*\n", 538 | "* 'oob_score=False' - Can estimate score when training (by using rows that weren't randomly selected). No need to hold out data (see the sketch after this list)\n", 539 | "* 'warm_start=False' - Can add more trees w/o starting over\n", 540 | "\n", 541 | "From tree:\n", 542 | "\n", 543 | "* 'max_depth=None' - Tree depth (1 to Infinity (`None`))\n", 544 | "* 'max_features=\"sqrt\"' - Number of features to examine for a split (1 to number of features (int). Float for a percent (0.0 to 1.0). \"log2\" log2(n_features) or \"sqrt\" sqrt(n_features). (Default square root of number of features.)\n", 545 | "* 'max_leaf_nodes=None' - Number of leaves. Default (`None`) is unlimited.\n", 546 | "* 'min_impurity_decrease=0' - Split only when the decrease in *impurity* is >= this value. (0.0 to 1.0) (*Impurity* : 0 - 100% accurate, .3 - 70%) \n", 547 | "* 'min_samples_leaf=1' - Minimum samples at each leaf. (1 to n_samples).\n", 548 | "* 'min_samples_split=2' - Minimum samples required to split a node. (2 to n_samples)\n", 549 | "* 'min_weight_fraction_leaf=0' - The fraction (0.0 to 1.0) of the total weights required to be a leaf." 
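]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A short sketch of `oob_score` from the list above: score the forest on the rows each tree did *not* see, so no held-out set is needed:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# out-of-bag rows act as a built-in validation set\n",
"rf_oob = ensemble.RandomForestClassifier(random_state=42, oob_score=True)\n",
"rf_oob.fit(kag_X_train, kag_y_train)\n",
"rf_oob.oob_score_, rf_oob.score(kag_X_test, kag_y_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [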
550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": null, 555 | "metadata": { 556 | "scrolled": false 557 | }, 558 | "outputs": [], 559 | "source": [ 560 | "print(dir(rf))" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": null, 566 | "metadata": { 567 | "scrolled": true 568 | }, 569 | "outputs": [], 570 | "source": [ 571 | "# visualize how changing n_estimators affects score\n", 572 | "results = []\n", 573 | "rf_ws = ensemble.RandomForestClassifier(random_state=42, warm_start=True, n_estimators=1)\n", 574 | "rf_ws.fit(kag_X_train, kag_y_train)\n", 575 | "for i in range(2,100):\n", 576 | " rf_ws.set_params(n_estimators=i)\n", 577 | " rf_ws.fit(kag_X_train, kag_y_train)\n", 578 | " # see other metrics\n", 579 | " results.append(metrics.f1_score(kag_y_test, rf_ws.predict(kag_X_test)))\n", 580 | "pd.Series(results, index=range(2, 100)).plot(figsize=(8,4)) " 581 | ] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": null, 586 | "metadata": { 587 | "scrolled": true 588 | }, 589 | "outputs": [], 590 | "source": [ 591 | "# visualize how changing max_depth affects score\n", 592 | "results = []\n", 593 | "for i in range(1,20):\n", 594 | " rf_ws = ensemble.RandomForestClassifier(random_state=42, \n", 595 | " max_depth=i)\n", 596 | " rf_ws.fit(kag_X_train, kag_y_train)\n", 597 | " results.append(metrics.f1_score(kag_y_test, rf_ws.predict(kag_X_test)))\n", 598 | "pd.Series(results, index=range(1,20)).plot(figsize=(8,4)) " 599 | ] 600 | }, 601 | { 602 | "cell_type": "code", 603 | "execution_count": null, 604 | "metadata": { 605 | "lines_to_next_cell": 2 606 | }, 607 | "outputs": [], 608 | "source": [] 609 | }, 610 | { 611 | "cell_type": "code", 612 | "execution_count": null, 613 | "metadata": {}, 614 | "outputs": [], 615 | "source": [] 616 | }, 617 | { 618 | "cell_type": "code", 619 | "execution_count": null, 620 | "metadata": {}, 621 | "outputs": [], 622 | "source": [] 623 | }, 624 | { 625 | "cell_type": "code", 626 | "execution_count": null, 627 | "metadata": {}, 628 | "outputs": [], 629 | "source": [] 630 | }, 631 | { 632 | "cell_type": "code", 633 | "execution_count": null, 634 | "metadata": {}, 635 | "outputs": [], 636 | "source": [] 637 | }, 638 | { 639 | "cell_type": "code", 640 | "execution_count": null, 641 | "metadata": {}, 642 | "outputs": [], 643 | "source": [] 644 | }, 645 | { 646 | "cell_type": "code", 647 | "execution_count": null, 648 | "metadata": {}, 649 | "outputs": [], 650 | "source": [] 651 | }, 652 | { 653 | "cell_type": "markdown", 654 | "metadata": {}, 655 | "source": [ 656 | "## XGBoost\n", 657 | "\n", 658 | "Uses *boosting* to train a series of (weak) trees that try to correct the error of the previous output. (For classification this is mapped to a probability)\n", 659 | "\n", 660 | "Like golfing (you continue to putt or use a different club depending on first error). Decision tree would be a single tee off. Random forest would be averaging the tee offs. 
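Each boost is another putt at whatever error is left.\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of that boosting loop (squared-error flavor on our data; XGBoost's real algorithm works with gradients and hessians of the loss, so this is an analogy, not its exact math):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# boosting by hand: every tree fits what the ensemble still gets wrong\n",
"y_num = kag_y_train.astype(int)\n",
"pred = np.full(len(y_num), y_num.mean())      # start from the base rate\n",
"for _ in range(20):\n",
"    resid = y_num - pred                      # current error\n",
"    weak = tree.DecisionTreeRegressor(max_depth=2)\n",
"    weak.fit(kag_X_train, resid)              # weak learner on the residual\n",
"    pred += .3 * weak.predict(kag_X_train)    # shrink each step (learning rate)\n",
"metrics.accuracy_score(y_num, pred > .5)      # training accuracy of the hand-rolled boost"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"XGBoost adds, on top of this basic loop: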
\n", 661 | "\n", 662 | "* Regularization\n", 663 | "* Parallel Processing\n", 664 | "* Missing Number Support\n", 665 | "* Category Support" 666 | ] 667 | }, 668 | { 669 | "cell_type": "code", 670 | "execution_count": null, 671 | "metadata": { 672 | "lines_to_next_cell": 2, 673 | "scrolled": true 674 | }, 675 | "outputs": [], 676 | "source": [ 677 | "xg = xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False)\n", 678 | "xg.fit(kag_X_train, kag_y_train)\n", 679 | "xg.score(kag_X_test, kag_y_test)" 680 | ] 681 | }, 682 | { 683 | "cell_type": "code", 684 | "execution_count": null, 685 | "metadata": { 686 | "lines_to_next_cell": 2, 687 | "scrolled": false 688 | }, 689 | "outputs": [], 690 | "source": [ 691 | "# Let's try w/ depth of 2 and 2 trees\n", 692 | "xg = xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False, max_depth=2, n_estimators=2)\n", 693 | "xg.fit(kag_X_train, kag_y_train)\n", 694 | "xg.score(kag_X_test, kag_y_test)" 695 | ] 696 | }, 697 | { 698 | "cell_type": "code", 699 | "execution_count": null, 700 | "metadata": {}, 701 | "outputs": [], 702 | "source": [ 703 | "# first tree\n", 704 | "xgb.to_graphviz(xg, size='1,1', num_trees=0, fontsize='1')" 705 | ] 706 | }, 707 | { 708 | "cell_type": "code", 709 | "execution_count": null, 710 | "metadata": { 711 | "scrolled": true 712 | }, 713 | "outputs": [], 714 | "source": [ 715 | "# second tree\n", 716 | "xgb.to_graphviz(xg, size='1,1', num_trees=1, fontsize='1')" 717 | ] 718 | }, 719 | { 720 | "cell_type": "code", 721 | "execution_count": null, 722 | "metadata": {}, 723 | "outputs": [], 724 | "source": [ 725 | "# let's go down the left path with\n", 726 | "# this data\n", 727 | "row = pd.Series({'gender_Male': 0.0, 'gender_Prefer_not_to_say': 0.0, \n", 728 | " 'gender_Prefer_to_self-describe': 0.0, 'age': 30.0, 'country_India': 0.0, \n", 729 | " 'country_United_States_of_America': 1.0, 'education': 16.0, 'major_eng': 0.0, \n", 730 | " 'major_other': 0.0, 'major_stat': 0.0, 'years_exp': 0.0, 'compensation': 0.0, \n", 731 | " 'python': 0.0, 'r': 0.0, 'sql': 0.0}).to_frame().T\n", 732 | "row" 733 | ] 734 | }, 735 | { 736 | "cell_type": "code", 737 | "execution_count": null, 738 | "metadata": {}, 739 | "outputs": [], 740 | "source": [ 741 | "# result for DS = .4522\n", 742 | "# < .5 ... 
so Software Engineer!\n", 743 | "# this is [prob SE, prob DS]\n", 744 | "xg.predict_proba(row)" 745 | ] 746 | }, 747 | { 748 | "cell_type": "code", 749 | "execution_count": null, 750 | "metadata": {}, 751 | "outputs": [], 752 | "source": [ 753 | "xg.predict(row)" 754 | ] 755 | }, 756 | { 757 | "cell_type": "code", 758 | "execution_count": null, 759 | "metadata": { 760 | "lines_to_next_cell": 0, 761 | "scrolled": false 762 | }, 763 | "outputs": [], 764 | "source": [ 765 | "# sum up the leaf values and pass the total through the inverse logit\n", 766 | "# Example: no r, low ed, low exp\n", 767 | "# -.251 + 0.0602\n", 768 | "\n", 769 | "vals = np.linspace(-10, 10)\n", 770 | "def inv_logit(p):\n", 771 | "    return np.exp(p) / (1 + np.exp(p))\n", 772 | "\n", 773 | "x = -.251 + 0.0602\n", 774 | "y = inv_logit(-.251 + 0.0602)\n", 775 | "print(f'({x:.2}, {y:.2})')\n", 776 | "fig, ax = plt.subplots(figsize=(6,4))\n", 777 | "ax.plot(vals, inv_logit(vals))\n", 778 | "ax.plot([x], [y], marker='o')\n", 779 | "ax.set_xlim([-5, 5])\n", 780 | "_ = ax.set_xticks([-3, -2, -1, 0, 1, 2, 3])\n", 781 | "_ = ax.set_yticks([0,.4, .5, .6, 1])" 782 | ] 783 | }, 784 | { 785 | "cell_type": "code", 786 | "execution_count": null, 787 | "metadata": {}, 788 | "outputs": [], 789 | "source": [] 790 | }, 791 | { 792 | "cell_type": "code", 793 | "execution_count": null, 794 | "metadata": { 795 | "lines_to_next_cell": 2 796 | }, 797 | "outputs": [], 798 | "source": [] 799 | }, 800 | { 801 | "cell_type": "code", 802 | "execution_count": null, 803 | "metadata": {}, 804 | "outputs": [], 805 | "source": [] 806 | }, 807 | { 808 | "cell_type": "markdown", 809 | "metadata": {}, 810 | "source": [ 811 | "## Early Stopping\n", 812 | "Because you can keep \"putting\", you can track how far away you are from the hole and stop when you are closest." 
813 | ] 814 | }, 815 | { 816 | "cell_type": "code", 817 | "execution_count": null, 818 | "metadata": { 819 | "lines_to_next_cell": 2, 820 | "scrolled": false 821 | }, 822 | "outputs": [], 823 | "source": [ 824 | "# defaults\n", 825 | "# 100 putts\n", 826 | "xg = xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False)\n", 827 | "xg.fit(kag_X_train, kag_y_train)\n", 828 | "xg.score(kag_X_test, kag_y_test)" 829 | ] 830 | }, 831 | { 832 | "cell_type": "code", 833 | "execution_count": null, 834 | "metadata": { 835 | "scrolled": true 836 | }, 837 | "outputs": [], 838 | "source": [ 839 | "# Early stopping\n", 840 | "# Go up to 100 but stop after you haven't improved for 20 hits\n", 841 | "# Min value at round 9\n", 842 | "\n", 843 | "xg = xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False,\n", 844 | "                       early_stopping_rounds=20)\n", 845 | "xg.fit(kag_X_train, kag_y_train,\n", 846 | "       eval_set=[(kag_X_train, kag_y_train),\n", 847 | "                 (kag_X_test, kag_y_test)])\n", 848 | "xg.score(kag_X_test, kag_y_test)" 849 | ] 850 | }, 851 | { 852 | "cell_type": "code", 853 | "execution_count": null, 854 | "metadata": { 855 | "scrolled": true 856 | }, 857 | "outputs": [], 858 | "source": [ 859 | "xg.best_ntree_limit" 860 | ] 861 | }, 862 | { 863 | "cell_type": "code", 864 | "execution_count": null, 865 | "metadata": { 866 | "scrolled": true 867 | }, 868 | "outputs": [], 869 | "source": [ 870 | "# we can get the evaluation metrics\n", 871 | "# validation_0 is for training data\n", 872 | "# validation_1 is for testing data\n", 873 | "results = xg.evals_result()\n", 874 | "results" 875 | ] 876 | }, 877 | { 878 | "cell_type": "code", 879 | "execution_count": null, 880 | "metadata": {}, 881 | "outputs": [], 882 | "source": [ 883 | "# Testing score is best at 11 trees\n", 884 | "results = xg.evals_result()\n", 885 | "ax = pd.DataFrame({'training': results['validation_0']['logloss'],\n", 886 | "                   'testing': results['validation_1']['logloss'],\n", 887 | "                  }).shift().plot(figsize=(5,4))\n", 888 | "ax.set_xlabel('ntrees')" 889 | ] 890 | }, 891 | { 892 | "cell_type": "code", 893 | "execution_count": null, 894 | "metadata": {}, 895 | "outputs": [], 896 | "source": [] 897 | }, 898 | { 899 | "cell_type": "code", 900 | "execution_count": null, 901 | "metadata": {}, 902 | "outputs": [], 903 | "source": [] 904 | }, 905 | { 906 | "cell_type": "code", 907 | "execution_count": null, 908 | "metadata": {}, 909 | "outputs": [], 910 | "source": [] 911 | }, 912 | { 913 | "cell_type": "code", 914 | "execution_count": null, 915 | "metadata": {}, 916 | "outputs": [], 917 | "source": [] 918 | }, 919 | { 920 | "cell_type": "markdown", 921 | "metadata": {}, 922 | "source": [ 923 | "## XGBoost Hyperparameters\n", 924 | "\n", 925 | "*max_\\** parameters - Raise to make more complex (overfit|more variance), lower to simplify (underfit|more bias)\n", 926 | "\n", 927 | "*min_\\** parameters - Lower to make more complex (overfit|more variance), raise to simplify (underfit|more bias)\n", 928 | "\n", 929 | "* Boosting\n", 930 | "\n", 931 | "    * ``n_estimators=100`` - number of trees (or boosting rounds). Larger is more complex. Default 100. Use ``early_stopping_rounds`` with ``.fit`` to prevent overfitting.\n", 932 | "\n", 933 | "    * ``learning_rate=.3`` (called ``eta`` too) - after each boosting step, shrink feature weights. Smaller is more conservative. Can be used with ``n_estimators`` to adjust time for convergence. [0,1], default .3\n", 934 | "\n", 935 | "    * ``gamma=0`` / ``min_split_loss`` - L0 regularization. 
Global regularization on the number of leaves. Minimum loss reduction required to make a split. Larger is more conservative. [0, ∞], default 0 - no regularization.\n", 936 | "\n", 937 | "\n", 938 | "* Regularization\n", 939 | "\n", 940 | "    * ``reg_lambda=1`` - L2 regularization (sum of squared weights). Increase to be more conservative. Default 1\n", 941 | "    * ``reg_alpha=0`` - L1 regularization (sum of absolute weights). Increase to be more conservative. Default 0\n", 942 | "\n", 943 | "* Sampling - Use different rows\n", 944 | "\n", 945 | "    * ``subsample=1`` - Use % of samples (this is rows!) for next boosting round. Lower to be more conservative. [0, 1], default 1. (When not equal to 1.0, the model does *stochastic gradient boosting*, i.e. there is some randomness in the model.)\n", 946 | "\n", 947 | "\n", 948 | "Column sampling parameters - Use different columns (not rows!):\n", 949 | "\n", 950 | "    * ``colsample_bytree=1`` - Fraction of columns for each boosting round.\n", 951 | "    \n", 952 | "    * ``colsample_bylevel=1`` - Fraction of columns for each depth level.\n", 953 | "    \n", 954 | "    * ``colsample_bynode=1`` - Fraction of columns for each node.\n", 955 | "    \n", 956 | "\n", 957 | "From tree:\n", 958 | "\n", 959 | "    * ``max_depth=6`` - depth of tree. Larger is more complex (more likely to overfit). How many feature interactions you can have. Each level doubles time. [0, ∞], default 6\n", 960 | "    * ``min_child_weight=1`` - Minimum sum of instance weight (hessian) needed in a child to keep splitting. Larger will be more conservative.\n", 961 | "\n", 962 | "\n", 963 | "Imbalanced data:\n", 964 | "\n", 965 | "* ``scale_pos_weight=1`` - ratio negative/positive. Default 1\n", 966 | "* Use ``'auc'`` or ``'aucpr'`` for the ``eval_metric`` (rather than the classification default ``'logloss'``)\n", 967 | "* ``max_delta_step=0`` - try values from 1-10. 
Default 0\n", 968 | "\n", 969 | "\n", 970 | "\n" 971 | ] 972 | }, 973 | { 974 | "cell_type": "code", 975 | "execution_count": null, 976 | "metadata": { 977 | "scrolled": true 978 | }, 979 | "outputs": [], 980 | "source": [ 981 | "# try gamma on xgb\n", 982 | "fig, ax = plt.subplots(figsize=(8,4))\n", 983 | "ms.validation_curve(xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False),\n", 984 | " kag_X, kag_y,\n", 985 | " param_name='gamma', param_range=[0, .5, 1,2,5,10, 20])" 986 | ] 987 | }, 988 | { 989 | "cell_type": "code", 990 | "execution_count": null, 991 | "metadata": { 992 | "scrolled": true 993 | }, 994 | "outputs": [], 995 | "source": [ 996 | "fig, ax = plt.subplots(figsize=(8,4))\n", 997 | "ms.validation_curve(xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False), \n", 998 | " kag_X, kag_y,\n", 999 | " param_name='max_depth', param_range=[1,2,3,4,5,10])" 1000 | ] 1001 | }, 1002 | { 1003 | "cell_type": "code", 1004 | "execution_count": null, 1005 | "metadata": { 1006 | "scrolled": true 1007 | }, 1008 | "outputs": [], 1009 | "source": [ 1010 | "# note this depends on n_estimators\n", 1011 | "# should really use early stopping but yellowbrick doesn't support this 😢\n", 1012 | "fig, ax = plt.subplots(figsize=(8,4))\n", 1013 | "ms.validation_curve(xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False), \n", 1014 | " kag_X, kag_y,\n", 1015 | " param_name='learning_rate', param_range=[0.001, .01, .1, .2, .5, .9, 1])" 1016 | ] 1017 | }, 1018 | { 1019 | "cell_type": "code", 1020 | "execution_count": null, 1021 | "metadata": {}, 1022 | "outputs": [], 1023 | "source": [ 1024 | "params = {'learning_rate': 0.1,\n", 1025 | " 'max_depth': 3,\n", 1026 | " 'n_estimators': 200,\n", 1027 | " 'n_jobs': -1,\n", 1028 | " 'random_state': 42,\n", 1029 | " 'reg_lambda': 0,\n", 1030 | " 'subsample': 1}" 1031 | ] 1032 | }, 1033 | { 1034 | "cell_type": "code", 1035 | "execution_count": null, 1036 | "metadata": { 1037 | "lines_to_next_cell": 2, 1038 | "scrolled": true 1039 | }, 1040 | "outputs": [], 1041 | "source": [ 1042 | "# this takes a while to run (about 2 minutes)\n", 1043 | "# can set scoring in GridSearchCV to \n", 1044 | "# recall, precision, f1, accuracy\n", 1045 | "params = {'reg_lambda': [0], # No effect\n", 1046 | " 'learning_rate': [.1, .3], # makes each boost more conservative (0 - no shrinkage) \n", 1047 | " #'colsample_bylevel': [.3, 1], # use 0, 50%, or 100% of columns in boost step\n", 1048 | " 'subsample': [.7, 1],\n", 1049 | " #'gamma': [0, 1],\n", 1050 | " 'max_depth': [1, 2, 3],\n", 1051 | " 'random_state': [42],\n", 1052 | " 'n_jobs': [-1],\n", 1053 | " #'early_stopping_rounds':[10],\n", 1054 | " 'n_estimators': [200]}\n", 1055 | "kag_xgb2 = xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False)\n", 1056 | "cv = (model_selection.GridSearchCV(kag_xgb2, params, cv=3, n_jobs=-1)\n", 1057 | " .fit(kag_X_train, kag_y_train,\n", 1058 | " eval_set=[(kag_X_test, kag_y_test)],\n", 1059 | " early_stopping_rounds=5) \n", 1060 | " )" 1061 | ] 1062 | }, 1063 | { 1064 | "cell_type": "code", 1065 | "execution_count": null, 1066 | "metadata": { 1067 | "scrolled": false 1068 | }, 1069 | "outputs": [], 1070 | "source": [ 1071 | "cv.best_params_" 1072 | ] 1073 | }, 1074 | { 1075 | "cell_type": "code", 1076 | "execution_count": null, 1077 | "metadata": { 1078 | "lines_to_next_cell": 0 1079 | }, 1080 | "outputs": [], 1081 | "source": [ 1082 | "# vs default\n", 1083 | "params = {'learning_rate': 0.3,\n", 1084 | " 'max_depth': 2,\n", 1085 | " 'n_estimators': 
200,\n", 1086 | " 'n_jobs': -1,\n", 1087 | " 'random_state': 42,\n", 1088 | " 'reg_lambda': 0,\n", 1089 | " 'subsample': 0.7}\n", 1090 | "xgb_def2 = xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False)\n", 1091 | "xgb_def2.fit(kag_X_train, kag_y_train)\n", 1092 | "\n", 1093 | "xgb_grid2 = xgb.XGBClassifier(**params, eval_metric='logloss', use_label_encoder=False)\n", 1094 | "xgb_grid2.fit(kag_X_train, kag_y_train)\n", 1095 | "xgb_def2.score(kag_X_test, kag_y_test), xgb_grid2.score(kag_X_test, kag_y_test)" 1096 | ] 1097 | }, 1098 | { 1099 | "cell_type": "code", 1100 | "execution_count": null, 1101 | "metadata": { 1102 | "lines_to_next_cell": 2 1103 | }, 1104 | "outputs": [], 1105 | "source": [] 1106 | }, 1107 | { 1108 | "cell_type": "markdown", 1109 | "metadata": {}, 1110 | "source": [ 1111 | "## Bonus: Tuning with Hyperopt\n" 1112 | ] 1113 | }, 1114 | { 1115 | "cell_type": "code", 1116 | "execution_count": null, 1117 | "metadata": {}, 1118 | "outputs": [], 1119 | "source": [ 1120 | "!pip install hyperopt" 1121 | ] 1122 | }, 1123 | { 1124 | "cell_type": "code", 1125 | "execution_count": null, 1126 | "metadata": {}, 1127 | "outputs": [], 1128 | "source": [ 1129 | "from hyperopt import fmin, tpe, hp, STATUS_OK, Trials\n", 1130 | "from sklearn.metrics import accuracy_score \n", 1131 | "#https://bradleyboehmke.github.io/xgboost_databricks_tuning/index.html#slide21\n", 1132 | "space = {\n", 1133 | " 'learning_rate': hp.loguniform('learning_rate', -7, 0),\n", 1134 | " 'max_depth': hp.quniform('max_depth', 1, 12, 1),\n", 1135 | " 'min_child_weight': hp.loguniform('min_child_weight', -2, 3),\n", 1136 | " 'subsample': hp.uniform('subsample', 0.5, 1),\n", 1137 | " 'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),\n", 1138 | " 'gamma': hp.loguniform('gamma', -10, 10),\n", 1139 | " 'reg_alpha': hp.loguniform('alpha', -10, 10),\n", 1140 | " 'reg_lambda': hp.loguniform('lambda', -10, 10),\n", 1141 | " 'objective': 'binary:logistic',\n", 1142 | " 'eval_metric': 'auc',\n", 1143 | " 'seed': 123,\n", 1144 | "}" 1145 | ] 1146 | }, 1147 | { 1148 | "cell_type": "code", 1149 | "execution_count": null, 1150 | "metadata": {}, 1151 | "outputs": [], 1152 | "source": [ 1153 | "def hyperparameter_tuning(space): \n", 1154 | " model = xgb.XGBClassifier(max_depth = int(space['max_depth']), \n", 1155 | " gamma = space['gamma'], \n", 1156 | " reg_alpha = int(space['reg_alpha']),\n", 1157 | " min_child_weight=space['min_child_weight'], \n", 1158 | " colsample_bytree=space['colsample_bytree'])\n", 1159 | " evaluation = [(kag_X_train, kag_y_train),\n", 1160 | " (kag_X_test, kag_y_test)]\n", 1161 | " model.fit(kag_X_train, kag_y_train,\n", 1162 | " eval_set=evaluation, eval_metric=\"rmse\", \n", 1163 | " early_stopping_rounds=10,verbose=False) \n", 1164 | " \n", 1165 | " pred = model.predict(kag_X_test)\n", 1166 | " accuracy = accuracy_score(kag_y_test, pred>0.5) \n", 1167 | " print (\"SCORE:\", accuracy) \n", 1168 | " #change the metric if you like \n", 1169 | " return {'loss': -accuracy, 'status': STATUS_OK, 'model': model}" 1170 | ] 1171 | }, 1172 | { 1173 | "cell_type": "code", 1174 | "execution_count": null, 1175 | "metadata": {}, 1176 | "outputs": [], 1177 | "source": [ 1178 | "trials = Trials()\n", 1179 | "best = fmin(fn=hyperparameter_tuning, \n", 1180 | " space=space, \n", 1181 | " algo=tpe.suggest, \n", 1182 | " max_evals=1000, \n", 1183 | " trials=trials,\n", 1184 | " #timeout=60*5 # 5 minutes\n", 1185 | " )\n", 1186 | "print (best)" 1187 | ] 1188 | }, 1189 | { 1190 | "cell_type": "code", 1191 
| "execution_count": null, 1192 | "metadata": { 1193 | "scrolled": true 1194 | }, 1195 | "outputs": [], 1196 | "source": [ 1197 | "best # new" 1198 | ] 1199 | }, 1200 | { 1201 | "cell_type": "code", 1202 | "execution_count": null, 1203 | "metadata": { 1204 | "scrolled": true 1205 | }, 1206 | "outputs": [], 1207 | "source": [ 1208 | "hyper_params ={'alpha': 0.19514909424102928,\n", 1209 | " 'colsample_bytree': 0.8227256149391048,\n", 1210 | " 'gamma': 0.010701959121627006,\n", 1211 | " 'lambda': 0.010955985134796302,\n", 1212 | " 'learning_rate': 0.004570442245136879,\n", 1213 | " 'max_depth': 3, \n", 1214 | " 'min_child_weight': 0.2497193683952876,\n", 1215 | " 'subsample': 0.6416201529297743}\n", 1216 | "xgb_hyp = xgb.XGBClassifier(**hyper_params, eval_metric='logloss', \n", 1217 | " use_label_encoder=False,\n", 1218 | " n_estimators=2_000)\n", 1219 | "evaluation = [(kag_X_train, kag_y_train),\n", 1220 | " (kag_X_test, kag_y_test)]\n", 1221 | "xgb_hyp.fit(kag_X_train, kag_y_train, early_stopping_rounds=10,\n", 1222 | " eval_set=evaluation)\n", 1223 | "xgb_hyp.score(kag_X_test, kag_y_test)#" 1224 | ] 1225 | }, 1226 | { 1227 | "cell_type": "code", 1228 | "execution_count": null, 1229 | "metadata": { 1230 | "scrolled": false 1231 | }, 1232 | "outputs": [], 1233 | "source": [ 1234 | "xgb_hyp.score(kag_X_test, kag_y_test)" 1235 | ] 1236 | }, 1237 | { 1238 | "cell_type": "code", 1239 | "execution_count": null, 1240 | "metadata": { 1241 | "scrolled": false 1242 | }, 1243 | "outputs": [], 1244 | "source": [ 1245 | "# vs default and grid\n", 1246 | "xgb_def2.score(kag_X_test, kag_y_test), xgb_grid2.score(kag_X_test, kag_y_test)" 1247 | ] 1248 | }, 1249 | { 1250 | "cell_type": "code", 1251 | "execution_count": null, 1252 | "metadata": {}, 1253 | "outputs": [], 1254 | "source": [ 1255 | "grid = xgb_grid2.get_params()\n", 1256 | "hyp = xgb_hyp.get_params()\n", 1257 | "for k in grid:\n", 1258 | " print(f'{k=:20} grid:{grid[k] or \"\":20} hyp:{hyp[k] or \"\"}')" 1259 | ] 1260 | }, 1261 | { 1262 | "cell_type": "code", 1263 | "execution_count": null, 1264 | "metadata": {}, 1265 | "outputs": [], 1266 | "source": [] 1267 | }, 1268 | { 1269 | "cell_type": "code", 1270 | "execution_count": null, 1271 | "metadata": {}, 1272 | "outputs": [], 1273 | "source": [] 1274 | }, 1275 | { 1276 | "cell_type": "code", 1277 | "execution_count": null, 1278 | "metadata": {}, 1279 | "outputs": [], 1280 | "source": [] 1281 | }, 1282 | { 1283 | "cell_type": "code", 1284 | "execution_count": null, 1285 | "metadata": {}, 1286 | "outputs": [], 1287 | "source": [] 1288 | }, 1289 | { 1290 | "cell_type": "markdown", 1291 | "metadata": {}, 1292 | "source": [ 1293 | "## Model Evaluation\n", 1294 | "Now that we've tuned our model, let's look at how it performs" 1295 | ] 1296 | }, 1297 | { 1298 | "cell_type": "code", 1299 | "execution_count": null, 1300 | "metadata": { 1301 | "scrolled": true 1302 | }, 1303 | "outputs": [], 1304 | "source": [ 1305 | "hyper_params ={'alpha': 0.19514909424102928,\n", 1306 | " 'colsample_bytree': 0.8227256149391048,\n", 1307 | " 'gamma': 0.010701959121627006,\n", 1308 | " 'lambda': 0.010955985134796302,\n", 1309 | " 'learning_rate': 0.004570442245136879,\n", 1310 | " 'max_depth': 3, \n", 1311 | " 'min_child_weight': 0.2497193683952876,\n", 1312 | " 'subsample': 0.6416201529297743}\n", 1313 | "xgb_hyp = xgb.XGBClassifier(**hyper_params, eval_metric='logloss', \n", 1314 | " use_label_encoder=False,\n", 1315 | " n_estimators=2_000)\n", 1316 | "evaluation = [(kag_X_train, kag_y_train),\n", 1317 | " 
(kag_X_test, kag_y_test)]\n", 1318 | "xgb_hyp.fit(kag_X_train, kag_y_train, early_stopping_rounds=10,\n", 1319 | " eval_set=evaluation)" 1320 | ] 1321 | }, 1322 | { 1323 | "cell_type": "code", 1324 | "execution_count": null, 1325 | "metadata": { 1326 | "scrolled": false 1327 | }, 1328 | "outputs": [], 1329 | "source": [ 1330 | "metrics.accuracy_score(kag_y_test, xgb_hyp.predict(kag_X_test))" 1331 | ] 1332 | }, 1333 | { 1334 | "cell_type": "code", 1335 | "execution_count": null, 1336 | "metadata": { 1337 | "scrolled": true 1338 | }, 1339 | "outputs": [], 1340 | "source": [ 1341 | "fig, ax = plt.subplots(figsize=(8,4))\n", 1342 | "classifier.confusion_matrix(xgb_hyp, kag_X_train, kag_y_train,\n", 1343 | " kag_X_test, kag_y_test,\n", 1344 | " classes=['SE', 'DS']\n", 1345 | " )" 1346 | ] 1347 | }, 1348 | { 1349 | "cell_type": "code", 1350 | "execution_count": null, 1351 | "metadata": { 1352 | "scrolled": true 1353 | }, 1354 | "outputs": [], 1355 | "source": [ 1356 | "fig, ax = plt.subplots(figsize=(8,4))\n", 1357 | "metrics.RocCurveDisplay.from_estimator(xgb_hyp,\n", 1358 | " kag_X_test, kag_y_test,ax=ax)" 1359 | ] 1360 | }, 1361 | { 1362 | "cell_type": "code", 1363 | "execution_count": null, 1364 | "metadata": { 1365 | "scrolled": false 1366 | }, 1367 | "outputs": [], 1368 | "source": [ 1369 | "fig, ax = plt.subplots(figsize=(8,4))\n", 1370 | "classifier.precision_recall_curve(xgb_hyp, kag_X_train, kag_y_train,\n", 1371 | " kag_X_test, kag_y_test,\n", 1372 | " classes=['SE', 'DS'],\n", 1373 | " micro=False, macro=False\n", 1374 | " )" 1375 | ] 1376 | }, 1377 | { 1378 | "cell_type": "code", 1379 | "execution_count": null, 1380 | "metadata": { 1381 | "scrolled": false 1382 | }, 1383 | "outputs": [], 1384 | "source": [ 1385 | "fig, ax = plt.subplots(figsize=(8,4))\n", 1386 | "classifier.classification_report(xgb_hyp, kag_X_train, kag_y_train,\n", 1387 | " kag_X_test, kag_y_test,\n", 1388 | " classes=['SE', 'DS'],\n", 1389 | " micro=False, macro=False\n", 1390 | " )" 1391 | ] 1392 | }, 1393 | { 1394 | "cell_type": "code", 1395 | "execution_count": null, 1396 | "metadata": {}, 1397 | "outputs": [], 1398 | "source": [] 1399 | }, 1400 | { 1401 | "cell_type": "code", 1402 | "execution_count": null, 1403 | "metadata": {}, 1404 | "outputs": [], 1405 | "source": [] 1406 | }, 1407 | { 1408 | "cell_type": "code", 1409 | "execution_count": null, 1410 | "metadata": {}, 1411 | "outputs": [], 1412 | "source": [] 1413 | }, 1414 | { 1415 | "cell_type": "code", 1416 | "execution_count": null, 1417 | "metadata": {}, 1418 | "outputs": [], 1419 | "source": [] 1420 | }, 1421 | { 1422 | "cell_type": "code", 1423 | "execution_count": null, 1424 | "metadata": {}, 1425 | "outputs": [], 1426 | "source": [] 1427 | }, 1428 | { 1429 | "cell_type": "markdown", 1430 | "metadata": {}, 1431 | "source": [ 1432 | "## Training For Different Metrics\n", 1433 | "\n", 1434 | "We tuned our model. But we tuned it against accuracy. What if we want to optimize for recall?" 
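]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"One way to do it: put `scoring='recall'` on the search itself. A sketch with a trimmed version of the earlier grid (the parameter values here are illustrative):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# same idea as the earlier grid search, but optimize recall instead of accuracy\n",
"params = {'learning_rate': [.1, .3], 'max_depth': [1, 2, 3],\n",
"          'n_estimators': [200], 'random_state': [42]}\n",
"cv_rec = model_selection.GridSearchCV(\n",
"    xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False),\n",
"    params, cv=3, scoring='recall', n_jobs=-1)\n",
"cv_rec.fit(kag_X_train, kag_y_train)\n",
"cv_rec.best_params_, metrics.recall_score(kag_y_test, cv_rec.predict(kag_X_test))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [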
1435 | ] 1436 | }, 1437 | { 1438 | "cell_type": "code", 1439 | "execution_count": null, 1440 | "metadata": { 1441 | "scrolled": true 1442 | }, 1443 | "outputs": [], 1444 | "source": [ 1445 | "# accuracy tuning\n", 1446 | "fig, ax = plt.subplots(figsize=(8,4))\n", 1447 | "ms.validation_curve(xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False), \n", 1448 | " kag_X_train, kag_y_train,\n", 1449 | " # param_name='max_depth', param_range=[1,2,5,10]\n", 1450 | " param_name='learning_rate', param_range=[0.001, .01, .1, .2, .5, .9, 1]\n", 1451 | " )" 1452 | ] 1453 | }, 1454 | { 1455 | "cell_type": "code", 1456 | "execution_count": null, 1457 | "metadata": { 1458 | "scrolled": true 1459 | }, 1460 | "outputs": [], 1461 | "source": [ 1462 | "# recall tuning\n", 1463 | "fig, ax = plt.subplots(figsize=(8,4))\n", 1464 | "ms.validation_curve(xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False), \n", 1465 | " kag_X_train, kag_y_train,\n", 1466 | " scoring='recall',\n", 1467 | " #param_name='max_depth', param_range=[1,2,5,10]\n", 1468 | " param_name='learning_rate', param_range=[0.001, .01, .1, .2, .5, .9, 1]\n", 1469 | " )" 1470 | ] 1471 | }, 1472 | { 1473 | "cell_type": "code", 1474 | "execution_count": null, 1475 | "metadata": { 1476 | "scrolled": true 1477 | }, 1478 | "outputs": [], 1479 | "source": [ 1480 | "fig, ax = plt.subplots(figsize=(8,4))\n", 1481 | "ms.validation_curve(xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False), \n", 1482 | " kag_X_train, kag_y_train,\n", 1483 | " scoring='f1',\n", 1484 | " #param_name='max_depth', param_range=[1,2,5,10]\n", 1485 | " param_name='learning_rate', param_range=[0.001, .01, .1, .2, .5, .9, 1]\n", 1486 | " )" 1487 | ] 1488 | }, 1489 | { 1490 | "cell_type": "code", 1491 | "execution_count": null, 1492 | "metadata": {}, 1493 | "outputs": [], 1494 | "source": [] 1495 | }, 1496 | { 1497 | "cell_type": "code", 1498 | "execution_count": null, 1499 | "metadata": {}, 1500 | "outputs": [], 1501 | "source": [] 1502 | }, 1503 | { 1504 | "cell_type": "code", 1505 | "execution_count": null, 1506 | "metadata": {}, 1507 | "outputs": [], 1508 | "source": [] 1509 | }, 1510 | { 1511 | "cell_type": "markdown", 1512 | "metadata": {}, 1513 | "source": [ 1514 | "## Model Interpretation" 1515 | ] 1516 | }, 1517 | { 1518 | "cell_type": "code", 1519 | "execution_count": null, 1520 | "metadata": { 1521 | "lines_to_next_cell": 2, 1522 | "scrolled": true 1523 | }, 1524 | "outputs": [], 1525 | "source": [ 1526 | "# Trees are great when they overfit... 
They can explain what they overfit\n", 1527 | "# (You can use these for \"surrogate models\")\n", 1528 | "hi_variance = tree.DecisionTreeClassifier(max_depth=None)\n", 1529 | "hi_variance.fit(kag_X_train, kag_y_train)\n", 1530 | "hi_variance.score(kag_X_test, kag_y_test)" 1531 | ] 1532 | }, 1533 | { 1534 | "cell_type": "code", 1535 | "execution_count": null, 1536 | "metadata": { 1537 | "scrolled": false 1538 | }, 1539 | "outputs": [], 1540 | "source": [ 1541 | "# Feature importance shows the magnitude (not direction) of impact\n", 1542 | "(pd.Series(hi_variance.feature_importances_, index=kag_X_train.columns)\n", 1543 | " .sort_values()\n", 1544 | " .plot.barh()\n", 1545 | ")" 1546 | ] 1547 | }, 1548 | { 1549 | "cell_type": "code", 1550 | "execution_count": null, 1551 | "metadata": { 1552 | "scrolled": false 1553 | }, 1554 | "outputs": [], 1555 | "source": [ 1556 | "# XGBoost also supports feature importance\n", 1557 | "xgb_def = xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False)\n", 1558 | "xgb_def.fit(kag_X_train, kag_y_train)" 1559 | ] 1560 | }, 1561 | { 1562 | "cell_type": "code", 1563 | "execution_count": null, 1564 | "metadata": { 1565 | "scrolled": true 1566 | }, 1567 | "outputs": [], 1568 | "source": [ 1569 | "(pd.Series(xgb_def.feature_importances_, index=kag_X_train.columns)\n", 1570 | " .sort_values()\n", 1571 | " .plot.barh()\n", 1572 | ")" 1573 | ] 1574 | }, 1575 | { 1576 | "cell_type": "code", 1577 | "execution_count": null, 1578 | "metadata": {}, 1579 | "outputs": [], 1580 | "source": [ 1581 | "# * \"weight\" is the number of times a feature appears in a tree\n", 1582 | "# * \"gain\" is the average gain of splits which use the feature\n", 1583 | "# * \"cover\" is the average coverage of splits which use the feature\n", 1584 | "xgb.plot_importance(xgb_def, importance_type='cover')" 1585 | ] 1586 | }, 1587 | { 1588 | "cell_type": "code", 1589 | "execution_count": null, 1590 | "metadata": {}, 1591 | "outputs": [], 1592 | "source": [] 1593 | }, 1594 | { 1595 | "cell_type": "code", 1596 | "execution_count": null, 1597 | "metadata": {}, 1598 | "outputs": [], 1599 | "source": [] 1600 | }, 1601 | { 1602 | "cell_type": "code", 1603 | "execution_count": null, 1604 | "metadata": {}, 1605 | "outputs": [], 1606 | "source": [] 1607 | }, 1608 | { 1609 | "cell_type": "code", 1610 | "execution_count": null, 1611 | "metadata": {}, 1612 | "outputs": [], 1613 | "source": [] 1614 | }, 1615 | { 1616 | "cell_type": "code", 1617 | "execution_count": null, 1618 | "metadata": {}, 1619 | "outputs": [], 1620 | "source": [] 1621 | }, 1622 | { 1623 | "cell_type": "code", 1624 | "execution_count": null, 1625 | "metadata": {}, 1626 | "outputs": [], 1627 | "source": [] 1628 | }, 1629 | { 1630 | "cell_type": "markdown", 1631 | "metadata": {}, 1632 | "source": [ 1633 | "## xgbfir (Feature Interactions Reshaped)\n", 1634 | "    *Gain*: Total gain of each feature or feature interaction\n", 1635 | "    \n", 1636 | "    *FScore*: Amount of possible splits taken on a feature or feature interaction\n", 1637 | "    \n", 1638 | "    *wFScore*: Amount of possible splits taken on a feature or feature interaction weighted by the probability of the splits to take place\n", 1639 | "    \n", 1640 | "    *Average wFScore*: wFScore divided by FScore\n", 1641 | "    \n", 1642 | "    *Average Gain*: Gain divided by FScore\n", 1643 | "    \n", 1644 | "    *Expected Gain*: Total gain of each feature or feature interaction weighted by the probability to gather the gain\n" 1645 | ] 1646 | }, 1647 | { 1648 | "cell_type": "code", 1649 | 
"execution_count": null, 1650 | "metadata": {}, 1651 | "outputs": [], 1652 | "source": [ 1653 | "!pip install openpyxl" 1654 | ] 1655 | }, 1656 | { 1657 | "cell_type": "code", 1658 | "execution_count": null, 1659 | "metadata": { 1660 | "scrolled": true 1661 | }, 1662 | "outputs": [], 1663 | "source": [ 1664 | "import xgbfir\n", 1665 | "xgbfir.saveXgbFI(xgb_def, feature_names=kag_X_train.columns, OutputXlsxFile='fir.xlsx')\n", 1666 | "pd.read_excel('fir.xlsx')" 1667 | ] 1668 | }, 1669 | { 1670 | "cell_type": "code", 1671 | "execution_count": null, 1672 | "metadata": { 1673 | "scrolled": true 1674 | }, 1675 | "outputs": [], 1676 | "source": [ 1677 | "pd.read_excel('fir.xlsx', sheet_name='Interaction Depth 1')" 1678 | ] 1679 | }, 1680 | { 1681 | "cell_type": "code", 1682 | "execution_count": null, 1683 | "metadata": { 1684 | "scrolled": true 1685 | }, 1686 | "outputs": [], 1687 | "source": [ 1688 | "pd.read_excel('fir.xlsx', sheet_name='Interaction Depth 2')" 1689 | ] 1690 | }, 1691 | { 1692 | "cell_type": "code", 1693 | "execution_count": null, 1694 | "metadata": {}, 1695 | "outputs": [], 1696 | "source": [] 1697 | }, 1698 | { 1699 | "cell_type": "code", 1700 | "execution_count": null, 1701 | "metadata": {}, 1702 | "outputs": [], 1703 | "source": [] 1704 | }, 1705 | { 1706 | "cell_type": "code", 1707 | "execution_count": null, 1708 | "metadata": {}, 1709 | "outputs": [], 1710 | "source": [] 1711 | }, 1712 | { 1713 | "cell_type": "code", 1714 | "execution_count": null, 1715 | "metadata": {}, 1716 | "outputs": [], 1717 | "source": [] 1718 | }, 1719 | { 1720 | "cell_type": "code", 1721 | "execution_count": null, 1722 | "metadata": {}, 1723 | "outputs": [], 1724 | "source": [] 1725 | }, 1726 | { 1727 | "cell_type": "code", 1728 | "execution_count": null, 1729 | "metadata": {}, 1730 | "outputs": [], 1731 | "source": [] 1732 | }, 1733 | { 1734 | "cell_type": "markdown", 1735 | "metadata": {}, 1736 | "source": [ 1737 | "# SHAP (SHapley Additive exPlantations)\n", 1738 | "Should be *globally* consistent and accurate\n", 1739 | "\n", 1740 | " Shapley value (SHAP).\n", 1741 | " \n", 1742 | " From game theory, indicates how to distribute attribution of label\n", 1743 | "\n" 1744 | ] 1745 | }, 1746 | { 1747 | "cell_type": "code", 1748 | "execution_count": null, 1749 | "metadata": {}, 1750 | "outputs": [], 1751 | "source": [ 1752 | "import shap\n", 1753 | "shap.initjs()\n", 1754 | "\n", 1755 | "# make sure you initialize the js side\n", 1756 | "shap_ex = shap.TreeExplainer(xgb_def)\n", 1757 | "vals = shap_ex.shap_values(kag_X_test)" 1758 | ] 1759 | }, 1760 | { 1761 | "cell_type": "code", 1762 | "execution_count": null, 1763 | "metadata": {}, 1764 | "outputs": [], 1765 | "source": [ 1766 | "# Let's explain an individual\n", 1767 | "kag_X_test.iloc[0]" 1768 | ] 1769 | }, 1770 | { 1771 | "cell_type": "code", 1772 | "execution_count": null, 1773 | "metadata": {}, 1774 | "outputs": [], 1775 | "source": [ 1776 | "xgb_def.predict(kag_X_test.iloc[[0]]) # predicts SE... why?" 
1777 | ] 1778 | }, 1779 | { 1780 | "cell_type": "code", 1781 | "execution_count": null, 1782 | "metadata": {}, 1783 | "outputs": [], 1784 | "source": [ 1785 | "# label is also SE\n", 1786 | "kag_y_test.iloc[0]" 1787 | ] 1788 | }, 1789 | { 1790 | "cell_type": "code", 1791 | "execution_count": null, 1792 | "metadata": { 1793 | "scrolled": false 1794 | }, 1795 | "outputs": [], 1796 | "source": [ 1797 | "# values show direction of feature impact\n", 1798 | "# for this individual\n", 1799 | "pd.Series(vals[0], index=kag_X_test.columns).plot.barh()" 1800 | ] 1801 | }, 1802 | { 1803 | "cell_type": "code", 1804 | "execution_count": null, 1805 | "metadata": {}, 1806 | "outputs": [], 1807 | "source": [ 1808 | "# the base value. We sum up the scores.\n", 1809 | "# > 0 Positive Case\n", 1810 | "shap_ex.expected_value" 1811 | ] 1812 | }, 1813 | { 1814 | "cell_type": "code", 1815 | "execution_count": null, 1816 | "metadata": {}, 1817 | "outputs": [], 1818 | "source": [ 1819 | "# < 0 therefore ... SE\n", 1820 | "shap_ex.expected_value + vals[0].sum()" 1821 | ] 1822 | }, 1823 | { 1824 | "cell_type": "code", 1825 | "execution_count": null, 1826 | "metadata": { 1827 | "scrolled": true 1828 | }, 1829 | "outputs": [], 1830 | "source": [ 1831 | "# use matplotlib if having js issues\n", 1832 | "# blue - SE\n", 1833 | "# red - DS\n", 1834 | "shap.force_plot(shap_ex.expected_value, \n", 1835 | " vals[0,:], kag_X_test.iloc[0], #matplotlib=True\n", 1836 | " )" 1837 | ] 1838 | }, 1839 | { 1840 | "cell_type": "code", 1841 | "execution_count": null, 1842 | "metadata": { 1843 | "scrolled": false 1844 | }, 1845 | "outputs": [], 1846 | "source": [ 1847 | "# Explain a feature\n", 1848 | "shap.dependence_plot('years_exp', vals, kag_X_test)" 1849 | ] 1850 | }, 1851 | { 1852 | "cell_type": "code", 1853 | "execution_count": null, 1854 | "metadata": { 1855 | "scrolled": false 1856 | }, 1857 | "outputs": [], 1858 | "source": [ 1859 | "# Explain another feature\n", 1860 | "shap.dependence_plot('age', vals, kag_X_test)" 1861 | ] 1862 | }, 1863 | { 1864 | "cell_type": "code", 1865 | "execution_count": null, 1866 | "metadata": { 1867 | "scrolled": false 1868 | }, 1869 | "outputs": [], 1870 | "source": [ 1871 | "# Explain a feature with an interaction\n", 1872 | "shap.dependence_plot('compensation', vals, kag_X_test, interaction_index='age')" 1873 | ] 1874 | }, 1875 | { 1876 | "cell_type": "code", 1877 | "execution_count": null, 1878 | "metadata": {}, 1879 | "outputs": [], 1880 | "source": [ 1881 | "# Explain global features\n", 1882 | "shap.summary_plot(vals, kag_X_test)" 1883 | ] 1884 | }, 1885 | { 1886 | "cell_type": "code", 1887 | "execution_count": null, 1888 | "metadata": {}, 1889 | "outputs": [], 1890 | "source": [] 1891 | }, 1892 | { 1893 | "cell_type": "code", 1894 | "execution_count": null, 1895 | "metadata": {}, 1896 | "outputs": [], 1897 | "source": [] 1898 | }, 1899 | { 1900 | "cell_type": "markdown", 1901 | "metadata": {}, 1902 | "source": [ 1903 | "# Summary\n", 1904 | "\n", 1905 | "XGBoost is very powerful. Combining with other tools will take you a long way.\n", 1906 | "\n", 1907 | "Explore your data and your results.\n", 1908 | "\n", 1909 | "Lots of libraries. 
Some are better integrated.\n", 1910 | "\n", 1911 | "Suggestions:\n", 1912 | "\n", 1913 | "* Pandas skills come in useful for manipulating data\n", 1914 | "* Make sure you discuss business value with stakeholders\n", 1915 | "\n", 1916 | "\n", 1917 | "Questions?\n", 1918 | "\n", 1919 | "\n", 1920 | "Connect on LinkedIn or Twitter `@__mharrison__`" 1921 | ] 1922 | }, 1923 | { 1924 | "cell_type": "code", 1925 | "execution_count": null, 1926 | "metadata": {}, 1927 | "outputs": [], 1928 | "source": [ 1929 | "import random\n", 1930 | "random.randrange(1,9)" 1931 | ] 1932 | }, 1933 | { 1934 | "cell_type": "code", 1935 | "execution_count": null, 1936 | "metadata": {}, 1937 | "outputs": [], 1938 | "source": [ 1939 | "random.randrange(1,5)" 1940 | ] 1941 | } 1942 | ], 1943 | "metadata": { 1944 | "jupytext": { 1945 | "encoding": "# -*- coding: utf-8 -*-", 1946 | "formats": "ipynb,py:light" 1947 | }, 1948 | "kernelspec": { 1949 | "display_name": "Python 3", 1950 | "language": "python", 1951 | "name": "python3" 1952 | }, 1953 | "language_info": { 1954 | "codemirror_mode": { 1955 | "name": "ipython", 1956 | "version": 3 1957 | }, 1958 | "file_extension": ".py", 1959 | "mimetype": "text/x-python", 1960 | "name": "python", 1961 | "nbconvert_exporter": "python", 1962 | "pygments_lexer": "ipython3", 1963 | "version": "3.8.10" 1964 | } 1965 | }, 1966 | "nbformat": 4, 1967 | "nbformat_minor": 4 1968 | } 1969 | --------------------------------------------------------------------------------