├── README.md ├── kaggle-survey-2018.zip └── End2EndXGBoost-Solution.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # 2022-odsc-xgboost 2 | -------------------------------------------------------------------------------- /kaggle-survey-2018.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mattharrison/2022-odsc-xgboost/main/kaggle-survey-2018.zip -------------------------------------------------------------------------------- /End2EndXGBoost-Solution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# End to End XGBoost\n", 8 | "\n", 9 | "https://github.com/mattharrison/\n", 10 | "\n", 11 | "©2022 MetaSnake\n", 12 | "\n", 13 | "`@__mharrison__`" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "import xgboost\n", 23 | "xgboost.__version__" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## Libraries\n", 80 | "We will also use SHAP, xgbfir, openpyxl, and hyperopt." 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "# for colab\n", 90 | "!pip install dtreeviz feature_engine pybaobabdt xgbfir shap" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": { 97 | "lines_to_next_cell": 0 98 | }, 99 | "outputs": [], 100 | "source": [ 101 | "from feature_engine import encoding, imputation\n", 102 | "import matplotlib.pyplot as plt\n", 103 | "import numpy as np\n", 104 | "import pandas as pd\n", 105 | "\n", 106 | "from sklearn import base, compose, datasets, ensemble, \\\n", 107 | "    metrics, model_selection, pipeline, preprocessing, tree\n", 108 | "import xgboost as xgb\n", 109 | "import yellowbrick.model_selection as ms\n", 110 | "from yellowbrick import classifier\n", 111 | "\n", 112 | "import urllib.request  # import the submodule explicitly; plain `import urllib` doesn't expose .request\n", 113 | "import zipfile" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": { 134 | "lines_to_next_cell": 2 135 | }, 136 | "outputs": [], 137 | "source": [] 138 | }, 139 | 
{ 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "## Datasets\n", 144 | "\n", 145 | "I'll be demoing with Kaggle 2018 survey data\n" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "url = 'https://github.com/mattharrison/datasets/raw/master/data/kaggle-survey-2018.zip'\n", 155 | "fin = urllib.request.urlopen(url)\n", 156 | "#fin = open('kaggle-survey-2018.zip', mode='rb')\n", 157 | "data = fin.read()\n", 158 | "with open('kaggle-survey-2018.zip', mode='wb') as fout:\n", 159 | " fout.write(data)\n", 160 | "with zipfile.ZipFile('kaggle-survey-2018.zip') as z:\n", 161 | " print(z.namelist())\n", 162 | " kag = pd.read_csv(z.open('multipleChoiceResponses.csv'))\n", 163 | " kag_questions = kag.iloc[0]\n", 164 | " raw = kag.iloc[1:]" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": { 171 | "scrolled": true 172 | }, 173 | "outputs": [], 174 | "source": [ 175 | "def topn(ser, n=5, default='other'):\n", 176 | " counts = ser.value_counts()\n", 177 | " return ser.where(ser.isin(counts.index[:n]), default)\n", 178 | "\n", 179 | "def tweak_kag(df):\n", 180 | " return (df\n", 181 | " #.query('Q3.isin([\"United States of America\", \"China\", \"India\"]) '\\\n", 182 | " # 'and Q6.isin([\"Data Scientist\", \"Software Engineer\"])')\n", 183 | " .loc[df.Q3.isin([\"United States of America\", \"China\", \"India\"]) &\n", 184 | " df.Q6.isin([\"Data Scientist\", \"Software Engineer\"])]\n", 185 | " .pipe(lambda df_:\n", 186 | " df_.assign(**(df_.Q1.pipe(pd.get_dummies, drop_first=True, prefix='gender')),\n", 187 | " age=df_.Q2.str.slice(0,2).astype(int),\n", 188 | " **(df_.Q3.pipe(pd.get_dummies, drop_first=True, prefix='country')),\n", 189 | " education=df_.Q4.replace({'Master’s degree': 18,\n", 190 | " 'Bachelor’s degree': 16,\n", 191 | " 'Doctoral degree': 20,\n", 192 | " 'Some college/university study without earning a bachelor’s degree': 13,\n", 193 | " 'Professional degree': 19,\n", 194 | " 'I prefer not to answer': None,\n", 195 | " 'No formal education past high school': 12}),\n", 196 | " **(df_.Q5\n", 197 | " .pipe(topn, n=3)\n", 198 | " .replace({\n", 199 | " 'Computer science (software engineering, etc.)': 'cs',\n", 200 | " 'Engineering (non-computer focused)': 'eng',\n", 201 | " 'Mathematics or statistics': 'stat'})\n", 202 | " .pipe(pd.get_dummies, drop_first=True, prefix='major')),\n", 203 | " title=df_.Q6,\n", 204 | " years_exp=(df_.Q8.str.replace('+','', regex=False)\n", 205 | " .str.split('-', expand=True)\n", 206 | " .iloc[:,0]\n", 207 | " .astype(float)),\n", 208 | " compensation=(df_.Q9.str.replace('+','', regex=False)\n", 209 | " .str.replace(',','', regex=False)\n", 210 | " .str.replace('500000', '500', regex=False)\n", 211 | " .str.replace('I do not wish to disclose my approximate yearly compensation', '0', regex=False)\n", 212 | " .str.split('-', expand=True)\n", 213 | " .iloc[:,0]\n", 214 | " .fillna(0)\n", 215 | " .astype(int)\n", 216 | " .mul(1_000)\n", 217 | " ),\n", 218 | " python=df_.Q16_Part_1.fillna(0).replace('Python', 1),\n", 219 | " r=df_.Q16_Part_2.fillna(0).replace('R', 1),\n", 220 | " sql=df_.Q16_Part_3.fillna(0).replace('SQL', 1)\n", 221 | " )#assign\n", 222 | " \n", 223 | " )#pipe\n", 224 | " .rename(columns=lambda col:col.replace(' ', '_'))\n", 225 | " .loc[:, 'gender_Male':] \n", 226 | " .dropna()\n", 227 | " )\n", 228 | "kag = tweak_kag(raw)\n", 229 | "kag_X = 
kag.drop(columns='title')\n", 230 | "kag_y = (kag.title == 'Data Scientist')\n", 231 | "kag_X_train, kag_X_test, kag_y_train, kag_y_test = model_selection.train_test_split(\n", 232 | "    kag_X, kag_y, stratify=kag_y, random_state=42)" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "kag_y" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "## Stumps, Trees, and Forests\n", 270 | "\n", 271 | "Decision trees use a greedy algorithm to split on the feature (column) that results in the most \"pure\" split." 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [ 280 | "# True - DS\n", 281 | "kag_y.value_counts()" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "metadata": { 288 | "lines_to_next_cell": 2, 289 | "scrolled": true 290 | }, 291 | "outputs": [], 292 | "source": [ 293 | "stump = tree.DecisionTreeClassifier(max_depth=1)\n", 294 | "stump.fit(kag_X_train, kag_y_train)\n", 295 | "stump.score(kag_X_test, kag_y_test)\n" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "# False - SE, Data Scientist - DS\n", 305 | "stump.classes_" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": { 312 | "scrolled": true 313 | }, 314 | "outputs": [], 315 | "source": [ 316 | "features = list(c for c in kag_X_train.columns)\n", 317 | "_ = tree.plot_tree(stump, feature_names=features, filled=True, \n", 318 | "                   class_names=['SE', 'DS'])" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "## Underfit\n", 340 | "A stump is too simple. It has too much *bias*.\n", 341 | "\n", 342 | "Solutions:\n", 343 | "\n", 344 | "* Add more features\n", 345 | "* Use a more complex model\n", 346 | "\n", 347 | "For a tree we can let it grow deeper, which should do both." 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": null, 353 | "metadata": {}, 354 | "outputs": [], 355 | "source": [] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "metadata": {}, 360 | "source": [ 361 | "## Overfitting\n", 362 | "\n", 363 | "A model is too complicated. It has too much *variance*.\n", 364 | "\n", 365 | "Solutions:\n", 366 | "\n", 367 | "* Simplify or constrain (*regularize*)\n", 368 | "* Add more samples\n", 369 | "\n", 370 | "For a tree we can prune back the growth so that the leaf nodes aren't overly specific." 
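]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A sketch of that pruning idea (the `max_depth`/`min_samples_leaf` values here are hypothetical; compare with the unconstrained tree below):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# constrain growth so the leaves stay general\n",
"pruned = tree.DecisionTreeClassifier(max_depth=7, min_samples_leaf=20)\n",
"pruned.fit(kag_X_train, kag_y_train)\n",
"pruned.score(kag_X_test, kag_y_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [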
371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "metadata": { 377 | "lines_to_next_cell": 2, 378 | "scrolled": false 379 | }, 380 | "outputs": [], 381 | "source": [ 382 | "hi_variance = tree.DecisionTreeClassifier(max_depth=None)\n", 383 | "hi_variance.fit(kag_X_train, kag_y_train)\n", 384 | "hi_variance.score(kag_X_test, kag_y_test)" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": null, 390 | "metadata": { 391 | "scrolled": false 392 | }, 393 | "outputs": [], 394 | "source": [ 395 | "features = list(c for c in kag_X_train.columns)\n", 396 | "_ = tree.plot_tree(hi_variance, feature_names=features, filled=True, \n", 397 | "                   class_names=['SE', 'DS'])" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": null, 403 | "metadata": { 404 | "scrolled": false 405 | }, 406 | "outputs": [], 407 | "source": [ 408 | "# limit view to first 2\n", 409 | "features = list(c for c in kag_X_train.columns)\n", 410 | "_ = tree.plot_tree(hi_variance, feature_names=features, filled=True, \n", 411 | "                   class_names=['SE', 'DS'], max_depth=2)" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": null, 417 | "metadata": {}, 418 | "outputs": [], 419 | "source": [] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": null, 424 | "metadata": {}, 425 | "outputs": [], 426 | "source": [] 427 | }, 428 | { 429 | "cell_type": "markdown", 430 | "metadata": {}, 431 | "source": [ 432 | "\n", 433 | "## Tree Hyperparameters\n", 434 | "\n", 435 | "*max_\\** parameters - Raise to make more complex (overfit|more variance), lower to simplify (underfit|more bias)\n", 436 | "\n", 437 | "*min_\\** parameters - Lower to make more complex (overfit|more variance), raise to simplify (underfit|more bias)\n", 438 | "\n", 439 | "* 'max_depth=None' - Tree depth\n", 440 | "* 'max_features=None' - Number of features to examine for a split\n", 441 | "* 'max_leaf_nodes=None' - Number of leaves\n", 442 | "* 'min_impurity_decrease=0' - Split only when the decrease in *impurity* is >= this value. (*Impurity*: 0 - 100% accurate, .3 - 70%. Going from 70% to 100% accurate is a decrease of .3) \n", 443 | "* 'min_samples_leaf=1' - Minimum samples at each leaf.\n", 444 | "* 'min_samples_split=2' - Minimum samples required to split a node.\n", 445 | "* 'min_weight_fraction_leaf=0' - The fraction of the total weights required to be a leaf.\n" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": null, 451 | "metadata": { 452 | "scrolled": true 453 | }, 454 | "outputs": [], 455 | "source": [ 456 | "print(dir(stump))" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": null, 462 | "metadata": {}, 463 | "outputs": [], 464 | "source": [] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": null, 469 | "metadata": {}, 470 | "outputs": [], 471 | "source": [] 472 | }, 473 | { 474 | "cell_type": "markdown", 475 | "metadata": {}, 476 | "source": [ 477 | "## Random Forest\n", 478 | "\n", 479 | "Uses *bagging* to ensemble many trees in an attempt to lower variance." 
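]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of what bagging means, hand-rolled with bootstrap samples (`RandomForestClassifier` in the next cell automates this and also samples columns at each split; the tree count of 25 is arbitrary):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# bagging by hand: each tree sees a different bootstrap sample of the rows\n",
"rng = np.random.default_rng(42)\n",
"preds = []\n",
"for _ in range(25):\n",
"    idx = rng.integers(0, len(kag_X_train), len(kag_X_train))  # sample rows w/ replacement\n",
"    t = tree.DecisionTreeClassifier()\n",
"    t.fit(kag_X_train.iloc[idx], kag_y_train.iloc[idx])\n",
"    preds.append(t.predict(kag_X_test))\n",
"metrics.accuracy_score(kag_y_test, np.mean(preds, axis=0) > .5)  # majority vote"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [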
480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": null, 485 | "metadata": { 486 | "lines_to_next_cell": 2, 487 | "scrolled": false 488 | }, 489 | "outputs": [], 490 | "source": [ 491 | "rf = ensemble.RandomForestClassifier(random_state=42)\n", 492 | "rf.fit(kag_X_train, kag_y_train)\n", 493 | "rf.score(kag_X_test, kag_y_test)" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": null, 499 | "metadata": { 500 | "scrolled": true 501 | }, 502 | "outputs": [], 503 | "source": [ 504 | "len(rf.estimators_)" 505 | ] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "execution_count": null, 510 | "metadata": { 511 | "scrolled": false 512 | }, 513 | "outputs": [], 514 | "source": [ 515 | "features = list(c for c in kag_X_train.columns)\n", 516 | "_ = tree.plot_tree(rf.estimators_[0], feature_names=features, filled=True, \n", 517 | "                   class_names=['SE', 'DS'])" 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": null, 523 | "metadata": {}, 524 | "outputs": [], 525 | "source": [] 526 | }, 527 | { 528 | "cell_type": "markdown", 529 | "metadata": {}, 530 | "source": [ 531 | "## Random Forest Hyperparameters\n", 532 | "\n", 533 | "*max_\\** parameters - Raise to make more complex (overfit|more variance), lower to simplify (underfit|more bias)\n", 534 | "\n", 535 | "*min_\\** parameters - Lower to make more complex (overfit|more variance), raise to simplify (underfit|more bias)\n", 536 | "\n", 537 | "* 'n_estimators=100' - Number of trees - should be *max_estimators*\n", 538 | "* 'oob_score=False' - Can estimate score when training (by using rows that weren't randomly selected). No need to hold out data (see the sketch after this list)\n", 539 | "* 'warm_start=False' - Can add more trees w/o starting over\n", 540 | "\n", 541 | "From tree:\n", 542 | "\n", 543 | "* 'max_depth=None' - Tree depth (1 to Infinity (`None`))\n", 544 | "* 'max_features=\"sqrt\"' - Number of features to examine for a split (1 to number of features (int). Float for a percent (0.0 to 1.0). \"log2\" log2(n_features) or \"sqrt\" sqrt(n_features). (Default square root of number of features.)\n", 545 | "* 'max_leaf_nodes=None' - Number of leaves. Default (`None`) is unlimited.\n", 546 | "* 'min_impurity_decrease=0' - Split only when the decrease in *impurity* is >= this value. (0.0 to 1.0) (*Impurity* : 0 - 100% accurate, .3 - 70%) \n", 547 | "* 'min_samples_leaf=1' - Minimum samples at each leaf. (1 to n_samples).\n", 548 | "* 'min_samples_split=2' - Minimum samples required to split a node. (2 to n_samples)\n", 549 | "* 'min_weight_fraction_leaf=0' - The fraction (0.0 to 1.0) of the total weights required to be a leaf." 
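]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A short sketch of `oob_score` from the list above: score the forest on the rows each tree did *not* see, so no held-out set is needed:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# out-of-bag rows act as a built-in validation set\n",
"rf_oob = ensemble.RandomForestClassifier(random_state=42, oob_score=True)\n",
"rf_oob.fit(kag_X_train, kag_y_train)\n",
"rf_oob.oob_score_, rf_oob.score(kag_X_test, kag_y_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [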
550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": null, 555 | "metadata": { 556 | "scrolled": false 557 | }, 558 | "outputs": [], 559 | "source": [ 560 | "print(dir(rf))" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": null, 566 | "metadata": { 567 | "scrolled": true 568 | }, 569 | "outputs": [], 570 | "source": [ 571 | "# visualize how changing n_estimators affects score\n", 572 | "results = []\n", 573 | "rf_ws = ensemble.RandomForestClassifier(random_state=42, warm_start=True, n_estimators=1)\n", 574 | "rf_ws.fit(kag_X_train, kag_y_train)\n", 575 | "for i in range(2,100):\n", 576 | " rf_ws.set_params(n_estimators=i)\n", 577 | " rf_ws.fit(kag_X_train, kag_y_train)\n", 578 | " # see other metrics\n", 579 | " results.append(metrics.f1_score(kag_y_test, rf_ws.predict(kag_X_test)))\n", 580 | "pd.Series(results, index=range(2, 100)).plot(figsize=(8,4)) " 581 | ] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": null, 586 | "metadata": { 587 | "scrolled": true 588 | }, 589 | "outputs": [], 590 | "source": [ 591 | "# visualize how changing max_depth affects score\n", 592 | "results = []\n", 593 | "for i in range(1,20):\n", 594 | " rf_ws = ensemble.RandomForestClassifier(random_state=42, \n", 595 | " max_depth=i)\n", 596 | " rf_ws.fit(kag_X_train, kag_y_train)\n", 597 | " results.append(metrics.f1_score(kag_y_test, rf_ws.predict(kag_X_test)))\n", 598 | "pd.Series(results, index=range(1,20)).plot(figsize=(8,4)) " 599 | ] 600 | }, 601 | { 602 | "cell_type": "code", 603 | "execution_count": null, 604 | "metadata": { 605 | "lines_to_next_cell": 2 606 | }, 607 | "outputs": [], 608 | "source": [] 609 | }, 610 | { 611 | "cell_type": "code", 612 | "execution_count": null, 613 | "metadata": {}, 614 | "outputs": [], 615 | "source": [] 616 | }, 617 | { 618 | "cell_type": "code", 619 | "execution_count": null, 620 | "metadata": {}, 621 | "outputs": [], 622 | "source": [] 623 | }, 624 | { 625 | "cell_type": "code", 626 | "execution_count": null, 627 | "metadata": {}, 628 | "outputs": [], 629 | "source": [] 630 | }, 631 | { 632 | "cell_type": "code", 633 | "execution_count": null, 634 | "metadata": {}, 635 | "outputs": [], 636 | "source": [] 637 | }, 638 | { 639 | "cell_type": "code", 640 | "execution_count": null, 641 | "metadata": {}, 642 | "outputs": [], 643 | "source": [] 644 | }, 645 | { 646 | "cell_type": "code", 647 | "execution_count": null, 648 | "metadata": {}, 649 | "outputs": [], 650 | "source": [] 651 | }, 652 | { 653 | "cell_type": "markdown", 654 | "metadata": {}, 655 | "source": [ 656 | "## XGBoost\n", 657 | "\n", 658 | "Uses *boosting* to train a series of (weak) trees that try to correct the error of the previous output. (For classification this is mapped to a probability)\n", 659 | "\n", 660 | "Like golfing (you continue to putt or use a different club depending on first error). Decision tree would be a single tee off. Random forest would be averaging the tee offs. 
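Each boost is another putt at whatever error is left.\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of that boosting loop (squared-error flavor on our data; XGBoost's real algorithm works with gradients and hessians of the loss, so this is an analogy, not its exact math):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# boosting by hand: every tree fits what the ensemble still gets wrong\n",
"y_num = kag_y_train.astype(int)\n",
"pred = np.full(len(y_num), y_num.mean())      # start from the base rate\n",
"for _ in range(20):\n",
"    resid = y_num - pred                      # current error\n",
"    weak = tree.DecisionTreeRegressor(max_depth=2)\n",
"    weak.fit(kag_X_train, resid)              # weak learner on the residual\n",
"    pred += .3 * weak.predict(kag_X_train)    # shrink each step (learning rate)\n",
"metrics.accuracy_score(y_num, pred > .5)      # training accuracy of the hand-rolled boost"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"XGBoost adds, on top of this basic loop: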
\n", 661 | "\n", 662 | "* Regularization\n", 663 | "* Parallel Processing\n", 664 | "* Missing Number Support\n", 665 | "* Category Support" 666 | ] 667 | }, 668 | { 669 | "cell_type": "code", 670 | "execution_count": null, 671 | "metadata": { 672 | "lines_to_next_cell": 2, 673 | "scrolled": true 674 | }, 675 | "outputs": [], 676 | "source": [ 677 | "xg = xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False)\n", 678 | "xg.fit(kag_X_train, kag_y_train)\n", 679 | "xg.score(kag_X_test, kag_y_test)" 680 | ] 681 | }, 682 | { 683 | "cell_type": "code", 684 | "execution_count": null, 685 | "metadata": { 686 | "lines_to_next_cell": 2, 687 | "scrolled": false 688 | }, 689 | "outputs": [], 690 | "source": [ 691 | "# Let's try w/ depth of 2 and 2 trees\n", 692 | "xg = xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False, max_depth=2, n_estimators=2)\n", 693 | "xg.fit(kag_X_train, kag_y_train)\n", 694 | "xg.score(kag_X_test, kag_y_test)" 695 | ] 696 | }, 697 | { 698 | "cell_type": "code", 699 | "execution_count": null, 700 | "metadata": {}, 701 | "outputs": [], 702 | "source": [ 703 | "# first tree\n", 704 | "xgb.to_graphviz(xg, size='1,1', num_trees=0, fontsize='1')" 705 | ] 706 | }, 707 | { 708 | "cell_type": "code", 709 | "execution_count": null, 710 | "metadata": { 711 | "scrolled": true 712 | }, 713 | "outputs": [], 714 | "source": [ 715 | "# second tree\n", 716 | "xgb.to_graphviz(xg, size='1,1', num_trees=1, fontsize='1')" 717 | ] 718 | }, 719 | { 720 | "cell_type": "code", 721 | "execution_count": null, 722 | "metadata": {}, 723 | "outputs": [], 724 | "source": [ 725 | "# let's go down the left path with\n", 726 | "# this data\n", 727 | "row = pd.Series({'gender_Male': 0.0, 'gender_Prefer_not_to_say': 0.0, \n", 728 | " 'gender_Prefer_to_self-describe': 0.0, 'age': 30.0, 'country_India': 0.0, \n", 729 | " 'country_United_States_of_America': 1.0, 'education': 16.0, 'major_eng': 0.0, \n", 730 | " 'major_other': 0.0, 'major_stat': 0.0, 'years_exp': 0.0, 'compensation': 0.0, \n", 731 | " 'python': 0.0, 'r': 0.0, 'sql': 0.0}).to_frame().T\n", 732 | "row" 733 | ] 734 | }, 735 | { 736 | "cell_type": "code", 737 | "execution_count": null, 738 | "metadata": {}, 739 | "outputs": [], 740 | "source": [ 741 | "# result for DS = .4522\n", 742 | "# < .5 ... 
so Software Engineer!\n", 743 | "# this is [prob SE, prob DS]\n", 744 | "xg.predict_proba(row)" 745 | ] 746 | }, 747 | { 748 | "cell_type": "code", 749 | "execution_count": null, 750 | "metadata": {}, 751 | "outputs": [], 752 | "source": [ 753 | "xg.predict(row)" 754 | ] 755 | }, 756 | { 757 | "cell_type": "code", 758 | "execution_count": null, 759 | "metadata": { 760 | "lines_to_next_cell": 0, 761 | "scrolled": false 762 | }, 763 | "outputs": [], 764 | "source": [ 765 | "# sum up the leaf values and pass the total through the inverse logit\n", 766 | "# Example: no r, low ed, low exp\n", 767 | "# -.251 + 0.0602\n", 768 | "\n", 769 | "vals = np.linspace(-10, 10)\n", 770 | "def inv_logit(p):\n", 771 | "    return np.exp(p) / (1 + np.exp(p))\n", 772 | "\n", 773 | "x = -.251 + 0.0602\n", 774 | "y = inv_logit(-.251 + 0.0602)\n", 775 | "print(f'({x:.2}, {y:.2})')\n", 776 | "fig, ax = plt.subplots(figsize=(6,4))\n", 777 | "ax.plot(vals, inv_logit(vals))\n", 778 | "ax.plot([x], [y], marker='o')\n", 779 | "ax.set_xlim([-5, 5])\n", 780 | "_ = ax.set_xticks([-3, -2, -1, 0, 1, 2, 3])\n", 781 | "_ = ax.set_yticks([0,.4, .5, .6, 1])" 782 | ] 783 | }, 784 | { 785 | "cell_type": "code", 786 | "execution_count": null, 787 | "metadata": {}, 788 | "outputs": [], 789 | "source": [] 790 | }, 791 | { 792 | "cell_type": "code", 793 | "execution_count": null, 794 | "metadata": { 795 | "lines_to_next_cell": 2 796 | }, 797 | "outputs": [], 798 | "source": [] 799 | }, 800 | { 801 | "cell_type": "code", 802 | "execution_count": null, 803 | "metadata": {}, 804 | "outputs": [], 805 | "source": [] 806 | }, 807 | { 808 | "cell_type": "markdown", 809 | "metadata": {}, 810 | "source": [ 811 | "## Early Stopping\n", 812 | "Because you can keep \"putting\", you can track how far away you are from the hole and stop when you are closest." 
813 | ] 814 | }, 815 | { 816 | "cell_type": "code", 817 | "execution_count": null, 818 | "metadata": { 819 | "lines_to_next_cell": 2, 820 | "scrolled": false 821 | }, 822 | "outputs": [], 823 | "source": [ 824 | "# defaults\n", 825 | "# 100 putts\n", 826 | "xg = xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False)\n", 827 | "xg.fit(kag_X_train, kag_y_train)\n", 828 | "xg.score(kag_X_test, kag_y_test)" 829 | ] 830 | }, 831 | { 832 | "cell_type": "code", 833 | "execution_count": null, 834 | "metadata": { 835 | "scrolled": true 836 | }, 837 | "outputs": [], 838 | "source": [ 839 | "# Early stopping\n", 840 | "# Go up to 100 but stop after you haven't improved for 20 hits\n", 841 | "# Min value at round 9\n", 842 | "\n", 843 | "xg = xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False,\n", 844 | "                       early_stopping_rounds=20)\n", 845 | "xg.fit(kag_X_train, kag_y_train,\n", 846 | "       eval_set=[(kag_X_train, kag_y_train),\n", 847 | "                 (kag_X_test, kag_y_test)])\n", 848 | "xg.score(kag_X_test, kag_y_test)" 849 | ] 850 | }, 851 | { 852 | "cell_type": "code", 853 | "execution_count": null, 854 | "metadata": { 855 | "scrolled": true 856 | }, 857 | "outputs": [], 858 | "source": [ 859 | "xg.best_ntree_limit" 860 | ] 861 | }, 862 | { 863 | "cell_type": "code", 864 | "execution_count": null, 865 | "metadata": { 866 | "scrolled": true 867 | }, 868 | "outputs": [], 869 | "source": [ 870 | "# we can get the evaluation metrics\n", 871 | "# validation_0 is for training data\n", 872 | "# validation_1 is for testing data\n", 873 | "results = xg.evals_result()\n", 874 | "results" 875 | ] 876 | }, 877 | { 878 | "cell_type": "code", 879 | "execution_count": null, 880 | "metadata": {}, 881 | "outputs": [], 882 | "source": [ 883 | "# Testing score is best at 11 trees\n", 884 | "results = xg.evals_result()\n", 885 | "ax = pd.DataFrame({'training': results['validation_0']['logloss'],\n", 886 | "                   'testing': results['validation_1']['logloss'],\n", 887 | "                  }).shift().plot(figsize=(5,4))\n", 888 | "ax.set_xlabel('ntrees')" 889 | ] 890 | }, 891 | { 892 | "cell_type": "code", 893 | "execution_count": null, 894 | "metadata": {}, 895 | "outputs": [], 896 | "source": [] 897 | }, 898 | { 899 | "cell_type": "code", 900 | "execution_count": null, 901 | "metadata": {}, 902 | "outputs": [], 903 | "source": [] 904 | }, 905 | { 906 | "cell_type": "code", 907 | "execution_count": null, 908 | "metadata": {}, 909 | "outputs": [], 910 | "source": [] 911 | }, 912 | { 913 | "cell_type": "code", 914 | "execution_count": null, 915 | "metadata": {}, 916 | "outputs": [], 917 | "source": [] 918 | }, 919 | { 920 | "cell_type": "markdown", 921 | "metadata": {}, 922 | "source": [ 923 | "## XGBoost Hyperparameters\n", 924 | "\n", 925 | "*max_\\** parameters - Raise to make more complex (overfit|more variance), lower to simplify (underfit|more bias)\n", 926 | "\n", 927 | "*min_\\** parameters - Lower to make more complex (overfit|more variance), raise to simplify (underfit|more bias)\n", 928 | "\n", 929 | "* Boosting\n", 930 | "\n", 931 | "    * ``n_estimators=100`` - number of trees (or boosting rounds). Larger is more complex. Default 100. Use ``early_stopping_rounds`` with ``.fit`` to prevent overfitting.\n", 932 | "\n", 933 | "    * ``learning_rate=.3`` (called ``eta`` too) - after each boosting step, shrink feature weights. Smaller is more conservative. Can be used with ``n_estimators`` to adjust time for convergence. [0,1], default .3\n", 934 | "\n", 935 | "    * ``gamma=0`` / ``min_split_loss`` - L0 regularization. 
Global regularization on the number of leaves. Minimum loss reduction required to make a split. Larger is more conservative. [0, ∞], default 0 - no regularization.\n", 936 | "\n", 937 | "\n", 938 | "* Regularization\n", 939 | "\n", 940 | "    * ``reg_lambda=1`` - L2 regularization (sum of squared weights). Increase to be more conservative. Default 1\n", 941 | "    * ``reg_alpha=0`` - L1 regularization (sum of absolute weights). Increase to be more conservative. Default 0\n", 942 | "\n", 943 | "* Sampling - Use different rows\n", 944 | "\n", 945 | "    * ``subsample=1`` - Use % of samples (this is rows!) for next boosting round. Lower to be more conservative. [0, 1], default 1. (When not equal to 1.0, the model does *stochastic gradient boosting*, i.e. there is some randomness in the model.)\n", 946 | "\n", 947 | "\n", 948 | "Column sampling parameters - Use different columns (not rows!):\n", 949 | "\n", 950 | "    * ``colsample_bytree=1`` - Fraction of columns for each boosting round.\n", 951 | "    \n", 952 | "    * ``colsample_bylevel=1`` - Fraction of columns for each depth level.\n", 953 | "    \n", 954 | "    * ``colsample_bynode=1`` - Fraction of columns for each node.\n", 955 | "    \n", 956 | "\n", 957 | "From tree:\n", 958 | "\n", 959 | "    * ``max_depth=6`` - depth of tree. Larger is more complex (more likely to overfit). How many feature interactions you can have. Each level doubles time. [0, ∞], default 6\n", 960 | "    * ``min_child_weight=1`` - Minimum sum of instance weight (hessian) needed in a child to keep splitting. Larger will be more conservative.\n", 961 | "\n", 962 | "\n", 963 | "Imbalanced data:\n", 964 | "\n", 965 | "* ``scale_pos_weight=1`` - ratio negative/positive. Default 1\n", 966 | "* Use ``'auc'`` or ``'aucpr'`` for the ``eval_metric`` (rather than the classification default ``'logloss'``)\n", 967 | "* ``max_delta_step=0`` - try values from 1-10. 
Default 0\n", 968 | "\n", 969 | "\n", 970 | "\n" 971 | ] 972 | }, 973 | { 974 | "cell_type": "code", 975 | "execution_count": null, 976 | "metadata": { 977 | "scrolled": true 978 | }, 979 | "outputs": [], 980 | "source": [ 981 | "# try gamma on xgb\n", 982 | "fig, ax = plt.subplots(figsize=(8,4))\n", 983 | "ms.validation_curve(xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False),\n", 984 | " kag_X, kag_y,\n", 985 | " param_name='gamma', param_range=[0, .5, 1,2,5,10, 20])" 986 | ] 987 | }, 988 | { 989 | "cell_type": "code", 990 | "execution_count": null, 991 | "metadata": { 992 | "scrolled": true 993 | }, 994 | "outputs": [], 995 | "source": [ 996 | "fig, ax = plt.subplots(figsize=(8,4))\n", 997 | "ms.validation_curve(xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False), \n", 998 | " kag_X, kag_y,\n", 999 | " param_name='max_depth', param_range=[1,2,3,4,5,10])" 1000 | ] 1001 | }, 1002 | { 1003 | "cell_type": "code", 1004 | "execution_count": null, 1005 | "metadata": { 1006 | "scrolled": true 1007 | }, 1008 | "outputs": [], 1009 | "source": [ 1010 | "# note this depends on n_estimators\n", 1011 | "# should really use early stopping but yellowbrick doesn't support this 😢\n", 1012 | "fig, ax = plt.subplots(figsize=(8,4))\n", 1013 | "ms.validation_curve(xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False), \n", 1014 | " kag_X, kag_y,\n", 1015 | " param_name='learning_rate', param_range=[0.001, .01, .1, .2, .5, .9, 1])" 1016 | ] 1017 | }, 1018 | { 1019 | "cell_type": "code", 1020 | "execution_count": null, 1021 | "metadata": {}, 1022 | "outputs": [], 1023 | "source": [ 1024 | "params = {'learning_rate': 0.1,\n", 1025 | " 'max_depth': 3,\n", 1026 | " 'n_estimators': 200,\n", 1027 | " 'n_jobs': -1,\n", 1028 | " 'random_state': 42,\n", 1029 | " 'reg_lambda': 0,\n", 1030 | " 'subsample': 1}" 1031 | ] 1032 | }, 1033 | { 1034 | "cell_type": "code", 1035 | "execution_count": null, 1036 | "metadata": { 1037 | "lines_to_next_cell": 2, 1038 | "scrolled": true 1039 | }, 1040 | "outputs": [], 1041 | "source": [ 1042 | "# this takes a while to run (about 2 minutes)\n", 1043 | "# can set scoring in GridSearchCV to \n", 1044 | "# recall, precision, f1, accuracy\n", 1045 | "params = {'reg_lambda': [0], # No effect\n", 1046 | " 'learning_rate': [.1, .3], # makes each boost more conservative (0 - no shrinkage) \n", 1047 | " #'colsample_bylevel': [.3, 1], # use 0, 50%, or 100% of columns in boost step\n", 1048 | " 'subsample': [.7, 1],\n", 1049 | " #'gamma': [0, 1],\n", 1050 | " 'max_depth': [1, 2, 3],\n", 1051 | " 'random_state': [42],\n", 1052 | " 'n_jobs': [-1],\n", 1053 | " #'early_stopping_rounds':[10],\n", 1054 | " 'n_estimators': [200]}\n", 1055 | "kag_xgb2 = xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False)\n", 1056 | "cv = (model_selection.GridSearchCV(kag_xgb2, params, cv=3, n_jobs=-1)\n", 1057 | " .fit(kag_X_train, kag_y_train,\n", 1058 | " eval_set=[(kag_X_test, kag_y_test)],\n", 1059 | " early_stopping_rounds=5) \n", 1060 | " )" 1061 | ] 1062 | }, 1063 | { 1064 | "cell_type": "code", 1065 | "execution_count": null, 1066 | "metadata": { 1067 | "scrolled": false 1068 | }, 1069 | "outputs": [], 1070 | "source": [ 1071 | "cv.best_params_" 1072 | ] 1073 | }, 1074 | { 1075 | "cell_type": "code", 1076 | "execution_count": null, 1077 | "metadata": { 1078 | "lines_to_next_cell": 0 1079 | }, 1080 | "outputs": [], 1081 | "source": [ 1082 | "# vs default\n", 1083 | "params = {'learning_rate': 0.3,\n", 1084 | " 'max_depth': 2,\n", 1085 | " 'n_estimators': 
200,\n", 1086 | " 'n_jobs': -1,\n", 1087 | " 'random_state': 42,\n", 1088 | " 'reg_lambda': 0,\n", 1089 | " 'subsample': 0.7}\n", 1090 | "xgb_def2 = xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False)\n", 1091 | "xgb_def2.fit(kag_X_train, kag_y_train)\n", 1092 | "\n", 1093 | "xgb_grid2 = xgb.XGBClassifier(**params, eval_metric='logloss', use_label_encoder=False)\n", 1094 | "xgb_grid2.fit(kag_X_train, kag_y_train)\n", 1095 | "xgb_def2.score(kag_X_test, kag_y_test), xgb_grid2.score(kag_X_test, kag_y_test)" 1096 | ] 1097 | }, 1098 | { 1099 | "cell_type": "code", 1100 | "execution_count": null, 1101 | "metadata": { 1102 | "lines_to_next_cell": 2 1103 | }, 1104 | "outputs": [], 1105 | "source": [] 1106 | }, 1107 | { 1108 | "cell_type": "markdown", 1109 | "metadata": {}, 1110 | "source": [ 1111 | "## Bonus: Tuning with Hyperopt\n" 1112 | ] 1113 | }, 1114 | { 1115 | "cell_type": "code", 1116 | "execution_count": null, 1117 | "metadata": {}, 1118 | "outputs": [], 1119 | "source": [ 1120 | "!pip install hyperopt" 1121 | ] 1122 | }, 1123 | { 1124 | "cell_type": "code", 1125 | "execution_count": null, 1126 | "metadata": {}, 1127 | "outputs": [], 1128 | "source": [ 1129 | "from hyperopt import fmin, tpe, hp, STATUS_OK, Trials\n", 1130 | "from sklearn.metrics import accuracy_score \n", 1131 | "#https://bradleyboehmke.github.io/xgboost_databricks_tuning/index.html#slide21\n", 1132 | "space = {\n", 1133 | " 'learning_rate': hp.loguniform('learning_rate', -7, 0),\n", 1134 | " 'max_depth': hp.quniform('max_depth', 1, 12, 1),\n", 1135 | " 'min_child_weight': hp.loguniform('min_child_weight', -2, 3),\n", 1136 | " 'subsample': hp.uniform('subsample', 0.5, 1),\n", 1137 | " 'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),\n", 1138 | " 'gamma': hp.loguniform('gamma', -10, 10),\n", 1139 | " 'reg_alpha': hp.loguniform('alpha', -10, 10),\n", 1140 | " 'reg_lambda': hp.loguniform('lambda', -10, 10),\n", 1141 | " 'objective': 'binary:logistic',\n", 1142 | " 'eval_metric': 'auc',\n", 1143 | " 'seed': 123,\n", 1144 | "}" 1145 | ] 1146 | }, 1147 | { 1148 | "cell_type": "code", 1149 | "execution_count": null, 1150 | "metadata": {}, 1151 | "outputs": [], 1152 | "source": [ 1153 | "def hyperparameter_tuning(space): \n", 1154 | " model = xgb.XGBClassifier(max_depth = int(space['max_depth']), \n", 1155 | " gamma = space['gamma'], \n", 1156 | " reg_alpha = int(space['reg_alpha']),\n", 1157 | " min_child_weight=space['min_child_weight'], \n", 1158 | " colsample_bytree=space['colsample_bytree'])\n", 1159 | " evaluation = [(kag_X_train, kag_y_train),\n", 1160 | " (kag_X_test, kag_y_test)]\n", 1161 | " model.fit(kag_X_train, kag_y_train,\n", 1162 | " eval_set=evaluation, eval_metric=\"rmse\", \n", 1163 | " early_stopping_rounds=10,verbose=False) \n", 1164 | " \n", 1165 | " pred = model.predict(kag_X_test)\n", 1166 | " accuracy = accuracy_score(kag_y_test, pred>0.5) \n", 1167 | " print (\"SCORE:\", accuracy) \n", 1168 | " #change the metric if you like \n", 1169 | " return {'loss': -accuracy, 'status': STATUS_OK, 'model': model}" 1170 | ] 1171 | }, 1172 | { 1173 | "cell_type": "code", 1174 | "execution_count": null, 1175 | "metadata": {}, 1176 | "outputs": [], 1177 | "source": [ 1178 | "trials = Trials()\n", 1179 | "best = fmin(fn=hyperparameter_tuning, \n", 1180 | " space=space, \n", 1181 | " algo=tpe.suggest, \n", 1182 | " max_evals=1000, \n", 1183 | " trials=trials,\n", 1184 | " #timeout=60*5 # 5 minutes\n", 1185 | " )\n", 1186 | "print (best)" 1187 | ] 1188 | }, 1189 | { 1190 | "cell_type": "code", 1191 
| "execution_count": null, 1192 | "metadata": { 1193 | "scrolled": true 1194 | }, 1195 | "outputs": [], 1196 | "source": [ 1197 | "best # new" 1198 | ] 1199 | }, 1200 | { 1201 | "cell_type": "code", 1202 | "execution_count": null, 1203 | "metadata": { 1204 | "scrolled": true 1205 | }, 1206 | "outputs": [], 1207 | "source": [ 1208 | "hyper_params ={'alpha': 0.19514909424102928,\n", 1209 | " 'colsample_bytree': 0.8227256149391048,\n", 1210 | " 'gamma': 0.010701959121627006,\n", 1211 | " 'lambda': 0.010955985134796302,\n", 1212 | " 'learning_rate': 0.004570442245136879,\n", 1213 | " 'max_depth': 3, \n", 1214 | " 'min_child_weight': 0.2497193683952876,\n", 1215 | " 'subsample': 0.6416201529297743}\n", 1216 | "xgb_hyp = xgb.XGBClassifier(**hyper_params, eval_metric='logloss', \n", 1217 | " use_label_encoder=False,\n", 1218 | " n_estimators=2_000)\n", 1219 | "evaluation = [(kag_X_train, kag_y_train),\n", 1220 | " (kag_X_test, kag_y_test)]\n", 1221 | "xgb_hyp.fit(kag_X_train, kag_y_train, early_stopping_rounds=10,\n", 1222 | " eval_set=evaluation)\n", 1223 | "xgb_hyp.score(kag_X_test, kag_y_test)#" 1224 | ] 1225 | }, 1226 | { 1227 | "cell_type": "code", 1228 | "execution_count": null, 1229 | "metadata": { 1230 | "scrolled": false 1231 | }, 1232 | "outputs": [], 1233 | "source": [ 1234 | "xgb_hyp.score(kag_X_test, kag_y_test)" 1235 | ] 1236 | }, 1237 | { 1238 | "cell_type": "code", 1239 | "execution_count": null, 1240 | "metadata": { 1241 | "scrolled": false 1242 | }, 1243 | "outputs": [], 1244 | "source": [ 1245 | "# vs default and grid\n", 1246 | "xgb_def2.score(kag_X_test, kag_y_test), xgb_grid2.score(kag_X_test, kag_y_test)" 1247 | ] 1248 | }, 1249 | { 1250 | "cell_type": "code", 1251 | "execution_count": null, 1252 | "metadata": {}, 1253 | "outputs": [], 1254 | "source": [ 1255 | "grid = xgb_grid2.get_params()\n", 1256 | "hyp = xgb_hyp.get_params()\n", 1257 | "for k in grid:\n", 1258 | " print(f'{k=:20} grid:{grid[k] or \"\":20} hyp:{hyp[k] or \"\"}')" 1259 | ] 1260 | }, 1261 | { 1262 | "cell_type": "code", 1263 | "execution_count": null, 1264 | "metadata": {}, 1265 | "outputs": [], 1266 | "source": [] 1267 | }, 1268 | { 1269 | "cell_type": "code", 1270 | "execution_count": null, 1271 | "metadata": {}, 1272 | "outputs": [], 1273 | "source": [] 1274 | }, 1275 | { 1276 | "cell_type": "code", 1277 | "execution_count": null, 1278 | "metadata": {}, 1279 | "outputs": [], 1280 | "source": [] 1281 | }, 1282 | { 1283 | "cell_type": "code", 1284 | "execution_count": null, 1285 | "metadata": {}, 1286 | "outputs": [], 1287 | "source": [] 1288 | }, 1289 | { 1290 | "cell_type": "markdown", 1291 | "metadata": {}, 1292 | "source": [ 1293 | "## Model Evaluation\n", 1294 | "Now that we've tuned our model, let's look at how it performs" 1295 | ] 1296 | }, 1297 | { 1298 | "cell_type": "code", 1299 | "execution_count": null, 1300 | "metadata": { 1301 | "scrolled": true 1302 | }, 1303 | "outputs": [], 1304 | "source": [ 1305 | "hyper_params ={'alpha': 0.19514909424102928,\n", 1306 | " 'colsample_bytree': 0.8227256149391048,\n", 1307 | " 'gamma': 0.010701959121627006,\n", 1308 | " 'lambda': 0.010955985134796302,\n", 1309 | " 'learning_rate': 0.004570442245136879,\n", 1310 | " 'max_depth': 3, \n", 1311 | " 'min_child_weight': 0.2497193683952876,\n", 1312 | " 'subsample': 0.6416201529297743}\n", 1313 | "xgb_hyp = xgb.XGBClassifier(**hyper_params, eval_metric='logloss', \n", 1314 | " use_label_encoder=False,\n", 1315 | " n_estimators=2_000)\n", 1316 | "evaluation = [(kag_X_train, kag_y_train),\n", 1317 | " 
(kag_X_test, kag_y_test)]\n", 1318 | "xgb_hyp.fit(kag_X_train, kag_y_train, early_stopping_rounds=10,\n", 1319 | " eval_set=evaluation)" 1320 | ] 1321 | }, 1322 | { 1323 | "cell_type": "code", 1324 | "execution_count": null, 1325 | "metadata": { 1326 | "scrolled": false 1327 | }, 1328 | "outputs": [], 1329 | "source": [ 1330 | "metrics.accuracy_score(kag_y_test, xgb_hyp.predict(kag_X_test))" 1331 | ] 1332 | }, 1333 | { 1334 | "cell_type": "code", 1335 | "execution_count": null, 1336 | "metadata": { 1337 | "scrolled": true 1338 | }, 1339 | "outputs": [], 1340 | "source": [ 1341 | "fig, ax = plt.subplots(figsize=(8,4))\n", 1342 | "classifier.confusion_matrix(xgb_hyp, kag_X_train, kag_y_train,\n", 1343 | " kag_X_test, kag_y_test,\n", 1344 | " classes=['SE', 'DS']\n", 1345 | " )" 1346 | ] 1347 | }, 1348 | { 1349 | "cell_type": "code", 1350 | "execution_count": null, 1351 | "metadata": { 1352 | "scrolled": true 1353 | }, 1354 | "outputs": [], 1355 | "source": [ 1356 | "fig, ax = plt.subplots(figsize=(8,4))\n", 1357 | "metrics.RocCurveDisplay.from_estimator(xgb_hyp,\n", 1358 | " kag_X_test, kag_y_test,ax=ax)" 1359 | ] 1360 | }, 1361 | { 1362 | "cell_type": "code", 1363 | "execution_count": null, 1364 | "metadata": { 1365 | "scrolled": false 1366 | }, 1367 | "outputs": [], 1368 | "source": [ 1369 | "fig, ax = plt.subplots(figsize=(8,4))\n", 1370 | "classifier.precision_recall_curve(xgb_hyp, kag_X_train, kag_y_train,\n", 1371 | " kag_X_test, kag_y_test,\n", 1372 | " classes=['SE', 'DS'],\n", 1373 | " micro=False, macro=False\n", 1374 | " )" 1375 | ] 1376 | }, 1377 | { 1378 | "cell_type": "code", 1379 | "execution_count": null, 1380 | "metadata": { 1381 | "scrolled": false 1382 | }, 1383 | "outputs": [], 1384 | "source": [ 1385 | "fig, ax = plt.subplots(figsize=(8,4))\n", 1386 | "classifier.classification_report(xgb_hyp, kag_X_train, kag_y_train,\n", 1387 | " kag_X_test, kag_y_test,\n", 1388 | " classes=['SE', 'DS'],\n", 1389 | " micro=False, macro=False\n", 1390 | " )" 1391 | ] 1392 | }, 1393 | { 1394 | "cell_type": "code", 1395 | "execution_count": null, 1396 | "metadata": {}, 1397 | "outputs": [], 1398 | "source": [] 1399 | }, 1400 | { 1401 | "cell_type": "code", 1402 | "execution_count": null, 1403 | "metadata": {}, 1404 | "outputs": [], 1405 | "source": [] 1406 | }, 1407 | { 1408 | "cell_type": "code", 1409 | "execution_count": null, 1410 | "metadata": {}, 1411 | "outputs": [], 1412 | "source": [] 1413 | }, 1414 | { 1415 | "cell_type": "code", 1416 | "execution_count": null, 1417 | "metadata": {}, 1418 | "outputs": [], 1419 | "source": [] 1420 | }, 1421 | { 1422 | "cell_type": "code", 1423 | "execution_count": null, 1424 | "metadata": {}, 1425 | "outputs": [], 1426 | "source": [] 1427 | }, 1428 | { 1429 | "cell_type": "markdown", 1430 | "metadata": {}, 1431 | "source": [ 1432 | "## Training For Different Metrics\n", 1433 | "\n", 1434 | "We tuned our model. But we tuned it against accuracy. What if we want to optimize for recall?" 
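]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"One way to do it: put `scoring='recall'` on the search itself. A sketch with a trimmed version of the earlier grid (the parameter values here are illustrative):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# same idea as the earlier grid search, but optimize recall instead of accuracy\n",
"params = {'learning_rate': [.1, .3], 'max_depth': [1, 2, 3],\n",
"          'n_estimators': [200], 'random_state': [42]}\n",
"cv_rec = model_selection.GridSearchCV(\n",
"    xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False),\n",
"    params, cv=3, scoring='recall', n_jobs=-1)\n",
"cv_rec.fit(kag_X_train, kag_y_train)\n",
"cv_rec.best_params_, metrics.recall_score(kag_y_test, cv_rec.predict(kag_X_test))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [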
1435 | ] 1436 | }, 1437 | { 1438 | "cell_type": "code", 1439 | "execution_count": null, 1440 | "metadata": { 1441 | "scrolled": true 1442 | }, 1443 | "outputs": [], 1444 | "source": [ 1445 | "# accuracy tuning\n", 1446 | "fig, ax = plt.subplots(figsize=(8,4))\n", 1447 | "ms.validation_curve(xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False), \n", 1448 | " kag_X_train, kag_y_train,\n", 1449 | " # param_name='max_depth', param_range=[1,2,5,10]\n", 1450 | " param_name='learning_rate', param_range=[0.001, .01, .1, .2, .5, .9, 1]\n", 1451 | " )" 1452 | ] 1453 | }, 1454 | { 1455 | "cell_type": "code", 1456 | "execution_count": null, 1457 | "metadata": { 1458 | "scrolled": true 1459 | }, 1460 | "outputs": [], 1461 | "source": [ 1462 | "# recall tuning\n", 1463 | "fig, ax = plt.subplots(figsize=(8,4))\n", 1464 | "ms.validation_curve(xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False), \n", 1465 | " kag_X_train, kag_y_train,\n", 1466 | " scoring='recall',\n", 1467 | " #param_name='max_depth', param_range=[1,2,5,10]\n", 1468 | " param_name='learning_rate', param_range=[0.001, .01, .1, .2, .5, .9, 1]\n", 1469 | " )" 1470 | ] 1471 | }, 1472 | { 1473 | "cell_type": "code", 1474 | "execution_count": null, 1475 | "metadata": { 1476 | "scrolled": true 1477 | }, 1478 | "outputs": [], 1479 | "source": [ 1480 | "fig, ax = plt.subplots(figsize=(8,4))\n", 1481 | "ms.validation_curve(xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False), \n", 1482 | " kag_X_train, kag_y_train,\n", 1483 | " scoring='f1',\n", 1484 | " #param_name='max_depth', param_range=[1,2,5,10]\n", 1485 | " param_name='learning_rate', param_range=[0.001, .01, .1, .2, .5, .9, 1]\n", 1486 | " )" 1487 | ] 1488 | }, 1489 | { 1490 | "cell_type": "code", 1491 | "execution_count": null, 1492 | "metadata": {}, 1493 | "outputs": [], 1494 | "source": [] 1495 | }, 1496 | { 1497 | "cell_type": "code", 1498 | "execution_count": null, 1499 | "metadata": {}, 1500 | "outputs": [], 1501 | "source": [] 1502 | }, 1503 | { 1504 | "cell_type": "code", 1505 | "execution_count": null, 1506 | "metadata": {}, 1507 | "outputs": [], 1508 | "source": [] 1509 | }, 1510 | { 1511 | "cell_type": "markdown", 1512 | "metadata": {}, 1513 | "source": [ 1514 | "## Model Interpretation" 1515 | ] 1516 | }, 1517 | { 1518 | "cell_type": "code", 1519 | "execution_count": null, 1520 | "metadata": { 1521 | "lines_to_next_cell": 2, 1522 | "scrolled": true 1523 | }, 1524 | "outputs": [], 1525 | "source": [ 1526 | "# Trees are great when they overfit... 
They can explain what they overfit\n", 1527 | "# (You can use these for \"surrogate models\")\n", 1528 | "hi_variance = tree.DecisionTreeClassifier(max_depth=None)\n", 1529 | "hi_variance.fit(kag_X_train, kag_y_train)\n", 1530 | "hi_variance.score(kag_X_test, kag_y_test)" 1531 | ] 1532 | }, 1533 | { 1534 | "cell_type": "code", 1535 | "execution_count": null, 1536 | "metadata": { 1537 | "scrolled": false 1538 | }, 1539 | "outputs": [], 1540 | "source": [ 1541 | "# Feature importance shows the magnitude (not direction) of impact\n", 1542 | "(pd.Series(hi_variance.feature_importances_, index=kag_X_train.columns)\n", 1543 | " .sort_values()\n", 1544 | " .plot.barh()\n", 1545 | ")" 1546 | ] 1547 | }, 1548 | { 1549 | "cell_type": "code", 1550 | "execution_count": null, 1551 | "metadata": { 1552 | "scrolled": false 1553 | }, 1554 | "outputs": [], 1555 | "source": [ 1556 | "# XGBoost also supports feature importance\n", 1557 | "xgb_def = xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False)\n", 1558 | "xgb_def.fit(kag_X_train, kag_y_train)" 1559 | ] 1560 | }, 1561 | { 1562 | "cell_type": "code", 1563 | "execution_count": null, 1564 | "metadata": { 1565 | "scrolled": true 1566 | }, 1567 | "outputs": [], 1568 | "source": [ 1569 | "(pd.Series(xgb_def.feature_importances_, index=kag_X_train.columns)\n", 1570 | " .sort_values()\n", 1571 | " .plot.barh()\n", 1572 | ")" 1573 | ] 1574 | }, 1575 | { 1576 | "cell_type": "code", 1577 | "execution_count": null, 1578 | "metadata": {}, 1579 | "outputs": [], 1580 | "source": [ 1581 | "# * \"weight\" is the number of times a feature appears in a tree\n", 1582 | "# * \"gain\" is the average gain of splits which use the feature\n", 1583 | "# * \"cover\" is the average coverage of splits which use the feature\n", 1584 | "xgb.plot_importance(xgb_def, importance_type='cover')" 1585 | ] 1586 | }, 1587 | { 1588 | "cell_type": "code", 1589 | "execution_count": null, 1590 | "metadata": {}, 1591 | "outputs": [], 1592 | "source": [] 1593 | }, 1594 | { 1595 | "cell_type": "code", 1596 | "execution_count": null, 1597 | "metadata": {}, 1598 | "outputs": [], 1599 | "source": [] 1600 | }, 1601 | { 1602 | "cell_type": "code", 1603 | "execution_count": null, 1604 | "metadata": {}, 1605 | "outputs": [], 1606 | "source": [] 1607 | }, 1608 | { 1609 | "cell_type": "code", 1610 | "execution_count": null, 1611 | "metadata": {}, 1612 | "outputs": [], 1613 | "source": [] 1614 | }, 1615 | { 1616 | "cell_type": "code", 1617 | "execution_count": null, 1618 | "metadata": {}, 1619 | "outputs": [], 1620 | "source": [] 1621 | }, 1622 | { 1623 | "cell_type": "code", 1624 | "execution_count": null, 1625 | "metadata": {}, 1626 | "outputs": [], 1627 | "source": [] 1628 | }, 1629 | { 1630 | "cell_type": "markdown", 1631 | "metadata": {}, 1632 | "source": [ 1633 | "## xgbfir (Feature Interactions Reshaped)\n", 1634 | "    *Gain*: Total gain of each feature or feature interaction\n", 1635 | "    \n", 1636 | "    *FScore*: Amount of possible splits taken on a feature or feature interaction\n", 1637 | "    \n", 1638 | "    *wFScore*: Amount of possible splits taken on a feature or feature interaction weighted by the probability of the splits to take place\n", 1639 | "    \n", 1640 | "    *Average wFScore*: wFScore divided by FScore\n", 1641 | "    \n", 1642 | "    *Average Gain*: Gain divided by FScore\n", 1643 | "    \n", 1644 | "    *Expected Gain*: Total gain of each feature or feature interaction weighted by the probability to gather the gain\n" 1645 | ] 1646 | }, 1647 | { 1648 | "cell_type": "code", 1649 | 
"execution_count": null, 1650 | "metadata": {}, 1651 | "outputs": [], 1652 | "source": [ 1653 | "!pip install openpyxl" 1654 | ] 1655 | }, 1656 | { 1657 | "cell_type": "code", 1658 | "execution_count": null, 1659 | "metadata": { 1660 | "scrolled": true 1661 | }, 1662 | "outputs": [], 1663 | "source": [ 1664 | "import xgbfir\n", 1665 | "xgbfir.saveXgbFI(xgb_def, feature_names=kag_X_train.columns, OutputXlsxFile='fir.xlsx')\n", 1666 | "pd.read_excel('fir.xlsx')" 1667 | ] 1668 | }, 1669 | { 1670 | "cell_type": "code", 1671 | "execution_count": null, 1672 | "metadata": { 1673 | "scrolled": true 1674 | }, 1675 | "outputs": [], 1676 | "source": [ 1677 | "pd.read_excel('fir.xlsx', sheet_name='Interaction Depth 1')" 1678 | ] 1679 | }, 1680 | { 1681 | "cell_type": "code", 1682 | "execution_count": null, 1683 | "metadata": { 1684 | "scrolled": true 1685 | }, 1686 | "outputs": [], 1687 | "source": [ 1688 | "pd.read_excel('fir.xlsx', sheet_name='Interaction Depth 2')" 1689 | ] 1690 | }, 1691 | { 1692 | "cell_type": "code", 1693 | "execution_count": null, 1694 | "metadata": {}, 1695 | "outputs": [], 1696 | "source": [] 1697 | }, 1698 | { 1699 | "cell_type": "code", 1700 | "execution_count": null, 1701 | "metadata": {}, 1702 | "outputs": [], 1703 | "source": [] 1704 | }, 1705 | { 1706 | "cell_type": "code", 1707 | "execution_count": null, 1708 | "metadata": {}, 1709 | "outputs": [], 1710 | "source": [] 1711 | }, 1712 | { 1713 | "cell_type": "code", 1714 | "execution_count": null, 1715 | "metadata": {}, 1716 | "outputs": [], 1717 | "source": [] 1718 | }, 1719 | { 1720 | "cell_type": "code", 1721 | "execution_count": null, 1722 | "metadata": {}, 1723 | "outputs": [], 1724 | "source": [] 1725 | }, 1726 | { 1727 | "cell_type": "code", 1728 | "execution_count": null, 1729 | "metadata": {}, 1730 | "outputs": [], 1731 | "source": [] 1732 | }, 1733 | { 1734 | "cell_type": "markdown", 1735 | "metadata": {}, 1736 | "source": [ 1737 | "# SHAP (SHapley Additive exPlantations)\n", 1738 | "Should be *globally* consistent and accurate\n", 1739 | "\n", 1740 | " Shapley value (SHAP).\n", 1741 | " \n", 1742 | " From game theory, indicates how to distribute attribution of label\n", 1743 | "\n" 1744 | ] 1745 | }, 1746 | { 1747 | "cell_type": "code", 1748 | "execution_count": null, 1749 | "metadata": {}, 1750 | "outputs": [], 1751 | "source": [ 1752 | "import shap\n", 1753 | "shap.initjs()\n", 1754 | "\n", 1755 | "# make sure you initialize the js side\n", 1756 | "shap_ex = shap.TreeExplainer(xgb_def)\n", 1757 | "vals = shap_ex.shap_values(kag_X_test)" 1758 | ] 1759 | }, 1760 | { 1761 | "cell_type": "code", 1762 | "execution_count": null, 1763 | "metadata": {}, 1764 | "outputs": [], 1765 | "source": [ 1766 | "# Let's explain an individual\n", 1767 | "kag_X_test.iloc[0]" 1768 | ] 1769 | }, 1770 | { 1771 | "cell_type": "code", 1772 | "execution_count": null, 1773 | "metadata": {}, 1774 | "outputs": [], 1775 | "source": [ 1776 | "xgb_def.predict(kag_X_test.iloc[[0]]) # predicts SE... why?" 
1777 | ] 1778 | }, 1779 | { 1780 | "cell_type": "code", 1781 | "execution_count": null, 1782 | "metadata": {}, 1783 | "outputs": [], 1784 | "source": [ 1785 | "# label is also SE\n", 1786 | "kag_y_test.iloc[0]" 1787 | ] 1788 | }, 1789 | { 1790 | "cell_type": "code", 1791 | "execution_count": null, 1792 | "metadata": { 1793 | "scrolled": false 1794 | }, 1795 | "outputs": [], 1796 | "source": [ 1797 | "# values show direction of feature impact\n", 1798 | "# for this individual\n", 1799 | "pd.Series(vals[0], index=kag_X_test.columns).plot.barh()" 1800 | ] 1801 | }, 1802 | { 1803 | "cell_type": "code", 1804 | "execution_count": null, 1805 | "metadata": {}, 1806 | "outputs": [], 1807 | "source": [ 1808 | "# the base value. We sum up the scores.\n", 1809 | "# > 0 Positive Case\n", 1810 | "shap_ex.expected_value" 1811 | ] 1812 | }, 1813 | { 1814 | "cell_type": "code", 1815 | "execution_count": null, 1816 | "metadata": {}, 1817 | "outputs": [], 1818 | "source": [ 1819 | "# < 0 therefore ... SE\n", 1820 | "shap_ex.expected_value + vals[0].sum()" 1821 | ] 1822 | }, 1823 | { 1824 | "cell_type": "code", 1825 | "execution_count": null, 1826 | "metadata": { 1827 | "scrolled": true 1828 | }, 1829 | "outputs": [], 1830 | "source": [ 1831 | "# use matplotlib if having js issues\n", 1832 | "# blue - SE\n", 1833 | "# red - DS\n", 1834 | "shap.force_plot(shap_ex.expected_value, \n", 1835 | " vals[0,:], kag_X_test.iloc[0], #matplotlib=True\n", 1836 | " )" 1837 | ] 1838 | }, 1839 | { 1840 | "cell_type": "code", 1841 | "execution_count": null, 1842 | "metadata": { 1843 | "scrolled": false 1844 | }, 1845 | "outputs": [], 1846 | "source": [ 1847 | "# Explain a feature\n", 1848 | "shap.dependence_plot('years_exp', vals, kag_X_test)" 1849 | ] 1850 | }, 1851 | { 1852 | "cell_type": "code", 1853 | "execution_count": null, 1854 | "metadata": { 1855 | "scrolled": false 1856 | }, 1857 | "outputs": [], 1858 | "source": [ 1859 | "# Explain another feature\n", 1860 | "shap.dependence_plot('age', vals, kag_X_test)" 1861 | ] 1862 | }, 1863 | { 1864 | "cell_type": "code", 1865 | "execution_count": null, 1866 | "metadata": { 1867 | "scrolled": false 1868 | }, 1869 | "outputs": [], 1870 | "source": [ 1871 | "# Explain a feature with an interaction\n", 1872 | "shap.dependence_plot('compensation', vals, kag_X_test, interaction_index='age')" 1873 | ] 1874 | }, 1875 | { 1876 | "cell_type": "code", 1877 | "execution_count": null, 1878 | "metadata": {}, 1879 | "outputs": [], 1880 | "source": [ 1881 | "# Explain global features\n", 1882 | "shap.summary_plot(vals, kag_X_test)" 1883 | ] 1884 | }, 1885 | { 1886 | "cell_type": "code", 1887 | "execution_count": null, 1888 | "metadata": {}, 1889 | "outputs": [], 1890 | "source": [] 1891 | }, 1892 | { 1893 | "cell_type": "code", 1894 | "execution_count": null, 1895 | "metadata": {}, 1896 | "outputs": [], 1897 | "source": [] 1898 | }, 1899 | { 1900 | "cell_type": "markdown", 1901 | "metadata": {}, 1902 | "source": [ 1903 | "# Summary\n", 1904 | "\n", 1905 | "XGBoost is very powerful. Combining with other tools will take you a long way.\n", 1906 | "\n", 1907 | "Explore your data and your results.\n", 1908 | "\n", 1909 | "Lots of libraries. 
Some are better integrated.\n", 1910 | "\n", 1911 | "Suggestions:\n", 1912 | "\n", 1913 | "* Pandas skills come in useful for manipulating data\n", 1914 | "* Make sure you discuss business value with stakeholders\n", 1915 | "\n", 1916 | "\n", 1917 | "Questions?\n", 1918 | "\n", 1919 | "\n", 1920 | "Connect on LinkedIn or Twitter `@__mharrison__`" 1921 | ] 1922 | }, 1923 | { 1924 | "cell_type": "code", 1925 | "execution_count": null, 1926 | "metadata": {}, 1927 | "outputs": [], 1928 | "source": [ 1929 | "import random\n", 1930 | "random.randrange(1,9)" 1931 | ] 1932 | }, 1933 | { 1934 | "cell_type": "code", 1935 | "execution_count": null, 1936 | "metadata": {}, 1937 | "outputs": [], 1938 | "source": [ 1939 | "random.randrange(1,5)" 1940 | ] 1941 | } 1942 | ], 1943 | "metadata": { 1944 | "jupytext": { 1945 | "encoding": "# -*- coding: utf-8 -*-", 1946 | "formats": "ipynb,py:light" 1947 | }, 1948 | "kernelspec": { 1949 | "display_name": "Python 3", 1950 | "language": "python", 1951 | "name": "python3" 1952 | }, 1953 | "language_info": { 1954 | "codemirror_mode": { 1955 | "name": "ipython", 1956 | "version": 3 1957 | }, 1958 | "file_extension": ".py", 1959 | "mimetype": "text/x-python", 1960 | "name": "python", 1961 | "nbconvert_exporter": "python", 1962 | "pygments_lexer": "ipython3", 1963 | "version": "3.8.10" 1964 | } 1965 | }, 1966 | "nbformat": 4, 1967 | "nbformat_minor": 4 1968 | } 1969 | --------------------------------------------------------------------------------