├── .gitignore ├── 01-Constant-features.ipynb ├── 02-One-way-ANOVA.ipynb ├── 03-Lasso.ipynb ├── 04-Feature-shuffling.ipynb ├── 2022_DataTalksClub_FeatureSelection.pdf ├── LICENSE └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Jupyter Notebook 2 | .ipynb_checkpoints 3 | 4 | # datasets 5 | *.csv 6 | *.zip -------------------------------------------------------------------------------- /01-Constant-features.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Constant features" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd\n", 17 | "\n", 18 | "from sklearn.datasets import make_classification\n", 19 | "from sklearn.model_selection import train_test_split\n", 20 | "\n", 21 | "from sklearn.feature_selection import VarianceThreshold\n", 22 | "from feature_engine.selection import DropConstantFeatures" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "data": { 32 | "text/html": [ 33 | "
\n", 34 | "\n", 47 | "\n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | "
0123456789
01-0.376539-0.620180-0.157567-1.1208051-1.5745781.6780461.0801801
110.762409-0.784210-0.096479-0.40875810.210942-0.850449-0.4613011
212.2279340.547727-0.341481-0.8175771-2.6636782.4400421.6989191
310.061129-0.995868-0.214351-0.5589571-2.1491672.294192-1.3839651
410.0463490.834756-0.104845-0.4555281-0.9110180.8980981.0682591
\n", 131 | "
" 132 | ], 133 | "text/plain": [ 134 | " 0 1 2 3 4 5 6 7 8 \\\n", 135 | "0 1 -0.376539 -0.620180 -0.157567 -1.120805 1 -1.574578 1.678046 1.080180 \n", 136 | "1 1 0.762409 -0.784210 -0.096479 -0.408758 1 0.210942 -0.850449 -0.461301 \n", 137 | "2 1 2.227934 0.547727 -0.341481 -0.817577 1 -2.663678 2.440042 1.698919 \n", 138 | "3 1 0.061129 -0.995868 -0.214351 -0.558957 1 -2.149167 2.294192 -1.383965 \n", 139 | "4 1 0.046349 0.834756 -0.104845 -0.455528 1 -0.911018 0.898098 1.068259 \n", 140 | "\n", 141 | " 9 \n", 142 | "0 1 \n", 143 | "1 1 \n", 144 | "2 1 \n", 145 | "3 1 \n", 146 | "4 1 " 147 | ] 148 | }, 149 | "execution_count": 2, 150 | "metadata": {}, 151 | "output_type": "execute_result" 152 | } 153 | ], 154 | "source": [ 155 | "# Toy dataset with redundant and constant features\n", 156 | "\n", 157 | "X, y = make_classification(\n", 158 | " n_samples=1000,\n", 159 | " n_features=10,\n", 160 | " n_classes=2,\n", 161 | " random_state=10,\n", 162 | ")\n", 163 | "\n", 164 | "X = pd.DataFrame(X)\n", 165 | "y = pd.Series(y)\n", 166 | "\n", 167 | "# Add constant features\n", 168 | "X[[0, 5, 9]] = 1\n", 169 | "\n", 170 | "X.head()" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 3, 176 | "metadata": {}, 177 | "outputs": [ 178 | { 179 | "data": { 180 | "text/plain": [ 181 | "((700, 10), (300, 10))" 182 | ] 183 | }, 184 | "execution_count": 3, 185 | "metadata": {}, 186 | "output_type": "execute_result" 187 | } 188 | ], 189 | "source": [ 190 | "# separate dataset into train and test\n", 191 | "\n", 192 | "X_train, X_test, y_train, y_test = train_test_split(\n", 193 | " X, y, test_size=0.3, random_state=0,\n", 194 | ")\n", 195 | "\n", 196 | "X_train.shape, X_test.shape" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": [ 203 | "## VarianceThreshold from Scikit-learn\n", 204 | "\n", 205 | "Only works with numerical variables. Categorical variables need to be encoded first." 
206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 4, 211 | "metadata": {}, 212 | "outputs": [ 213 | { 214 | "data": { 215 | "text/html": [ 216 | "
VarianceThreshold(threshold=0)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" 217 | ], 218 | "text/plain": [ 219 | "VarianceThreshold(threshold=0)" 220 | ] 221 | }, 222 | "execution_count": 4, 223 | "metadata": {}, 224 | "output_type": "execute_result" 225 | } 226 | ], 227 | "source": [ 228 | "# To remove constant features\n", 229 | "sel = VarianceThreshold(threshold=0)\n", 230 | "\n", 231 | "# fit finds the features with zero variance\n", 232 | "sel.fit(X_train) " 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 5, 238 | "metadata": {}, 239 | "outputs": [ 240 | { 241 | "data": { 242 | "text/plain": [ 243 | "7" 244 | ] 245 | }, 246 | "execution_count": 5, 247 | "metadata": {}, 248 | "output_type": "execute_result" 249 | } 250 | ], 251 | "source": [ 252 | "# get_support returns a boolean vector that flags\n", 253 | "# the features to keep\n", 254 | "\n", 255 | "# Number of selected features (the non-constant)\n", 256 | "\n", 257 | "sum(sel.get_support())" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 6, 263 | "metadata": {}, 264 | "outputs": [ 265 | { 266 | "data": { 267 | "text/plain": [ 268 | "Int64Index([0, 5, 9], dtype='int64')" 269 | ] 270 | }, 271 | "execution_count": 6, 272 | "metadata": {}, 273 | "output_type": "execute_result" 274 | } 275 | ], 276 | "source": [ 277 | "# the constant features\n", 278 | "\n", 279 | "constant = X_train.columns[~sel.get_support()]\n", 280 | "\n", 281 | "constant" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": 7, 287 | "metadata": {}, 288 | "outputs": [ 289 | { 290 | "data": { 291 | "text/plain": [ 292 | "array(['x1', 'x2', 'x3', 'x4', 'x6', 'x7', 'x8'], dtype=object)" 293 | ] 294 | }, 295 | "execution_count": 7, 296 | "metadata": {}, 297 | "output_type": "execute_result" 298 | } 299 | ], 300 | "source": [ 301 | "sel.get_feature_names_out()" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 8, 307 | "metadata": {}, 308 | "outputs": [ 309 | { 310 | "data": { 311 | "text/plain": [ 312 | "((700, 7), 
(300, 7))" 313 | ] 314 | }, 315 | "execution_count": 8, 316 | "metadata": {}, 317 | "output_type": "execute_result" 318 | } 319 | ], 320 | "source": [ 321 | "# drop constant features\n", 322 | "\n", 323 | "X_train_t = sel.transform(X_train)\n", 324 | "X_test_t = sel.transform(X_test)\n", 325 | "\n", 326 | "X_train_t.shape, X_test_t.shape" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": 9, 332 | "metadata": {}, 333 | "outputs": [ 334 | { 335 | "data": { 336 | "text/html": [ 337 | "
\n", 338 | "\n", 351 | "\n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | "
x1x2x3x4x6x7x8
00.0398011.501392-0.1892401.546828-1.8311931.9196340.209412
1-0.078494-1.536507-0.4968060.965100-0.873804-1.2468720.629114
2-0.7317120.972453-0.309300-1.432922-0.419046-0.9759840.377169
3-0.1211870.516685-0.800862-0.736170-1.219396-2.312341-1.027631
4-2.0891870.899235-0.2411111.2875360.643273-2.3109120.085618
\n", 417 | "
" 418 | ], 419 | "text/plain": [ 420 | " x1 x2 x3 x4 x6 x7 x8\n", 421 | "0 0.039801 1.501392 -0.189240 1.546828 -1.831193 1.919634 0.209412\n", 422 | "1 -0.078494 -1.536507 -0.496806 0.965100 -0.873804 -1.246872 0.629114\n", 423 | "2 -0.731712 0.972453 -0.309300 -1.432922 -0.419046 -0.975984 0.377169\n", 424 | "3 -0.121187 0.516685 -0.800862 -0.736170 -1.219396 -2.312341 -1.027631\n", 425 | "4 -2.089187 0.899235 -0.241111 1.287536 0.643273 -2.310912 0.085618" 426 | ] 427 | }, 428 | "execution_count": 9, 429 | "metadata": {}, 430 | "output_type": "execute_result" 431 | } 432 | ], 433 | "source": [ 434 | "# sklearn returns numpy arrays. Convert to dataframe\n", 435 | "\n", 436 | "X_train_t = pd.DataFrame(X_train_t, columns=sel.get_feature_names_out())\n", 437 | "X_test_t = pd.DataFrame(X_test_t, columns=sel.get_feature_names_out())\n", 438 | "\n", 439 | "# show result\n", 440 | "X_train_t.head()" 441 | ] 442 | }, 443 | { 444 | "cell_type": "markdown", 445 | "metadata": {}, 446 | "source": [ 447 | "## DropConstantFeatures from Feature-engine\n", 448 | "\n", 449 | "Works with numerical and categorical variables." 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": 10, 455 | "metadata": {}, 456 | "outputs": [ 457 | { 458 | "data": { 459 | "text/html": [ 460 | "
DropConstantFeatures()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" 461 | ], 462 | "text/plain": [ 463 | "DropConstantFeatures()" 464 | ] 465 | }, 466 | "execution_count": 10, 467 | "metadata": {}, 468 | "output_type": "execute_result" 469 | } 470 | ], 471 | "source": [ 472 | "# To remove constant features\n", 473 | "sel = DropConstantFeatures(tol=1)\n", 474 | "\n", 475 | "# fit finds the features with only 1 value\n", 476 | "sel.fit(X_train) " 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": 11, 482 | "metadata": {}, 483 | "outputs": [ 484 | { 485 | "data": { 486 | "text/plain": [ 487 | "[0, 5, 9]" 488 | ] 489 | }, 490 | "execution_count": 11, 491 | "metadata": {}, 492 | "output_type": "execute_result" 493 | } 494 | ], 495 | "source": [ 496 | "# the constant features\n", 497 | "\n", 498 | "sel.features_to_drop_" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": 12, 504 | "metadata": {}, 505 | "outputs": [ 506 | { 507 | "data": { 508 | "text/plain": [ 509 | "((700, 7), (300, 7))" 510 | ] 511 | }, 512 | "execution_count": 12, 513 | "metadata": {}, 514 | "output_type": "execute_result" 515 | } 516 | ], 517 | "source": [ 518 | "# drop constant features\n", 519 | "\n", 520 | "X_train_t = sel.transform(X_train)\n", 521 | "X_test_t = sel.transform(X_test)\n", 522 | "\n", 523 | "X_train_t.shape, X_test_t.shape" 524 | ] 525 | }, 526 | { 527 | "cell_type": "code", 528 | "execution_count": 13, 529 | "metadata": {}, 530 | "outputs": [ 531 | { 532 | "data": { 533 | "text/html": [ 534 | "
\n", 535 | "\n", 548 | "\n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | "
1234678
1050.0398011.501392-0.1892401.546828-1.8311931.9196340.209412
68-0.078494-1.536507-0.4968060.965100-0.873804-1.2468720.629114
479-0.7317120.972453-0.309300-1.432922-0.419046-0.9759840.377169
399-0.1211870.516685-0.800862-0.736170-1.219396-2.312341-1.027631
434-2.0891870.899235-0.2411111.2875360.643273-2.3109120.085618
\n", 614 | "
" 615 | ], 616 | "text/plain": [ 617 | " 1 2 3 4 6 7 8\n", 618 | "105 0.039801 1.501392 -0.189240 1.546828 -1.831193 1.919634 0.209412\n", 619 | "68 -0.078494 -1.536507 -0.496806 0.965100 -0.873804 -1.246872 0.629114\n", 620 | "479 -0.731712 0.972453 -0.309300 -1.432922 -0.419046 -0.975984 0.377169\n", 621 | "399 -0.121187 0.516685 -0.800862 -0.736170 -1.219396 -2.312341 -1.027631\n", 622 | "434 -2.089187 0.899235 -0.241111 1.287536 0.643273 -2.310912 0.085618" 623 | ] 624 | }, 625 | "execution_count": 13, 626 | "metadata": {}, 627 | "output_type": "execute_result" 628 | } 629 | ], 630 | "source": [ 631 | "# the result is already a dataframe\n", 632 | "\n", 633 | "X_train_t.head()" 634 | ] 635 | } 636 | ], 637 | "metadata": { 638 | "kernelspec": { 639 | "display_name": "fsml", 640 | "language": "python", 641 | "name": "fsml" 642 | }, 643 | "language_info": { 644 | "codemirror_mode": { 645 | "name": "ipython", 646 | "version": 3 647 | }, 648 | "file_extension": ".py", 649 | "mimetype": "text/x-python", 650 | "name": "python", 651 | "nbconvert_exporter": "python", 652 | "pygments_lexer": "ipython3", 653 | "version": "3.10.5" 654 | }, 655 | "toc": { 656 | "base_numbering": 1, 657 | "nav_menu": {}, 658 | "number_sections": true, 659 | "sideBar": true, 660 | "skip_h1_title": false, 661 | "title_cell": "Table of Contents", 662 | "title_sidebar": "Contents", 663 | "toc_cell": false, 664 | "toc_position": { 665 | "height": "583px", 666 | "left": "0px", 667 | "right": "20px", 668 | "top": "107px", 669 | "width": "319px" 670 | }, 671 | "toc_section_display": "block", 672 | "toc_window_display": true 673 | } 674 | }, 675 | "nbformat": 4, 676 | "nbformat_minor": 2 677 | } 678 | -------------------------------------------------------------------------------- /03-Lasso.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Lasso" 8 | ] 9 | }, 10 | 
{ 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import pandas as pd\n", 18 | "from sklearn.datasets import fetch_california_housing\n", 19 | "from sklearn.feature_selection import SelectFromModel\n", 20 | "from sklearn.linear_model import Lasso\n", 21 | "from sklearn.model_selection import train_test_split\n", 22 | "from sklearn.preprocessing import StandardScaler" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "# load the California House price data\n", 32 | "\n", 33 | "X, y = fetch_california_housing(return_X_y=True, as_frame=True)\n", 34 | "\n", 35 | "# Separate data into train and test sets\n", 36 | "\n", 37 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 3, 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "data": { 47 | "text/html": [ 48 | "
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" 49 | ], 50 | "text/plain": [ 51 | "StandardScaler()" 52 | ] 53 | }, 54 | "execution_count": 3, 55 | "metadata": {}, 56 | "output_type": "execute_result" 57 | } 58 | ], 59 | "source": [ 60 | "# scale the features\n", 61 | "\n", 62 | "scaler = StandardScaler()\n", 63 | "scaler.fit(X_train)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 4, 69 | "metadata": { 70 | "scrolled": true 71 | }, 72 | "outputs": [ 73 | { 74 | "data": { 75 | "text/html": [ 76 | "
SelectFromModel(estimator=Lasso(alpha=0.001, random_state=10))
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" 77 | ], 78 | "text/plain": [ 79 | "SelectFromModel(estimator=Lasso(alpha=0.001, random_state=10))" 80 | ] 81 | }, 82 | "execution_count": 4, 83 | "metadata": {}, 84 | "output_type": "execute_result" 85 | } 86 | ], 87 | "source": [ 88 | "# here, again I will train a Lasso Linear regression and select\n", 89 | "# the non-zero features in one line.\n", 90 | "\n", 91 | "# bear in mind that the linear regression object from sklearn does\n", 92 | "# not allow for regularisation. So, if you want to make a regularised\n", 93 | "# linear regression you need to import specifically \"Lasso\"\n", 94 | "\n", 95 | "sel_ = SelectFromModel(Lasso(alpha=0.001, random_state=10))\n", 96 | "sel_.fit(scaler.transform(X_train), y_train)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 5, 102 | "metadata": {}, 103 | "outputs": [ 104 | { 105 | "data": { 106 | "text/plain": [ 107 | "array([ True, True, True, True, True, True, True, True])" 108 | ] 109 | }, 110 | "execution_count": 5, 111 | "metadata": {}, 112 | "output_type": "execute_result" 113 | } 114 | ], 115 | "source": [ 116 | "sel_.get_support()" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 6, 122 | "metadata": {}, 123 | "outputs": [ 124 | { 125 | "name": "stdout", 126 | "output_type": "stream", 127 | "text": [ 128 | "total features: 8\n", 129 | "selected features: 8\n", 130 | "features with coefficients shrank to zero: 0\n" 131 | ] 132 | } 133 | ], 134 | "source": [ 135 | "# make a list with the selected features and print the outputs\n", 136 | "selected_feat = X_train.columns[(sel_.get_support())]\n", 137 | "\n", 138 | "print('total features: {}'.format((X_train.shape[1])))\n", 139 | "print('selected features: {}'.format(len(selected_feat)))\n", 140 | "print('features with coefficients shrank to zero: {}'.format(\n", 141 | " np.sum(sel_.estimator_.coef_ == 0)))" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 7, 147 | "metadata": {}, 148 | 
"outputs": [ 149 | { 150 | "data": { 151 | "text/plain": [ 152 | "((15480, 8), (5160, 8))" 153 | ] 154 | }, 155 | "execution_count": 7, 156 | "metadata": {}, 157 | "output_type": "execute_result" 158 | } 159 | ], 160 | "source": [ 161 | "# we can then remove the features from the training and testing set\n", 162 | "# like this:\n", 163 | "\n", 164 | "X_train_selected = sel_.transform(scaler.transform(X_train))\n", 165 | "X_test_selected = sel_.transform(scaler.transform(X_test))\n", 166 | "\n", 167 | "X_train_selected.shape, X_test_selected.shape" 168 | ] 169 | } 170 | ], 171 | "metadata": { 172 | "kernelspec": { 173 | "display_name": "fsml", 174 | "language": "python", 175 | "name": "fsml" 176 | }, 177 | "language_info": { 178 | "codemirror_mode": { 179 | "name": "ipython", 180 | "version": 3 181 | }, 182 | "file_extension": ".py", 183 | "mimetype": "text/x-python", 184 | "name": "python", 185 | "nbconvert_exporter": "python", 186 | "pygments_lexer": "ipython3", 187 | "version": "3.10.5" 188 | }, 189 | "toc": { 190 | "base_numbering": 1, 191 | "nav_menu": {}, 192 | "number_sections": true, 193 | "sideBar": true, 194 | "skip_h1_title": false, 195 | "title_cell": "Table of Contents", 196 | "title_sidebar": "Contents", 197 | "toc_cell": false, 198 | "toc_position": {}, 199 | "toc_section_display": "block", 200 | "toc_window_display": true 201 | } 202 | }, 203 | "nbformat": 4, 204 | "nbformat_minor": 2 205 | } 206 | -------------------------------------------------------------------------------- /04-Feature-shuffling.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Selection by Feature Shuffling" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd\n", 17 | "import matplotlib.pyplot as plt\n", 18 | "\n", 19 | "from sklearn.base 
import clone\n", 20 | "from sklearn.datasets import load_breast_cancer\n", 21 | "from sklearn.ensemble import RandomForestClassifier\n", 22 | "from sklearn.metrics import roc_auc_score\n", 23 | "from sklearn.model_selection import train_test_split\n", 24 | "\n", 25 | "from feature_engine.selection import SelectByShuffling" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 2, 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "data": { 35 | "text/html": [ 36 | "
\n", 37 | "\n", 50 | "\n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | "
mean radiusmean texturemean perimetermean areamean smoothnessmean compactnessmean concavitymean concave pointsmean symmetrymean fractal dimension...worst radiusworst textureworst perimeterworst areaworst smoothnessworst compactnessworst concavityworst concave pointsworst symmetryworst fractal dimension
29311.8517.4675.54432.70.083720.056420.0268800.0228000.18750.05715...13.0625.7584.35517.80.13690.175800.131600.091400.31010.07007
33211.2219.8671.94387.30.105400.067790.0050060.0075830.19400.06028...11.9825.7876.91436.10.14240.096690.013350.020220.32920.06522
56520.1328.25131.201261.00.097800.103400.1440000.0979100.17520.05533...23.6938.25155.001731.00.11660.192200.321500.162800.25720.06637
27813.5917.8486.24572.30.079480.040520.0199700.0123800.15730.05520...15.5026.1098.91739.10.10500.076220.106000.051850.23350.06263
48916.6920.20107.10857.60.074970.071120.0364900.0230700.18460.05325...19.1826.56127.301084.00.10090.292000.247700.087370.46770.07623
\n", 200 | "

5 rows × 30 columns

\n", 201 | "
" 202 | ], 203 | "text/plain": [ 204 | " mean radius mean texture mean perimeter mean area mean smoothness \\\n", 205 | "293 11.85 17.46 75.54 432.7 0.08372 \n", 206 | "332 11.22 19.86 71.94 387.3 0.10540 \n", 207 | "565 20.13 28.25 131.20 1261.0 0.09780 \n", 208 | "278 13.59 17.84 86.24 572.3 0.07948 \n", 209 | "489 16.69 20.20 107.10 857.6 0.07497 \n", 210 | "\n", 211 | " mean compactness mean concavity mean concave points mean symmetry \\\n", 212 | "293 0.05642 0.026880 0.022800 0.1875 \n", 213 | "332 0.06779 0.005006 0.007583 0.1940 \n", 214 | "565 0.10340 0.144000 0.097910 0.1752 \n", 215 | "278 0.04052 0.019970 0.012380 0.1573 \n", 216 | "489 0.07112 0.036490 0.023070 0.1846 \n", 217 | "\n", 218 | " mean fractal dimension ... worst radius worst texture \\\n", 219 | "293 0.05715 ... 13.06 25.75 \n", 220 | "332 0.06028 ... 11.98 25.78 \n", 221 | "565 0.05533 ... 23.69 38.25 \n", 222 | "278 0.05520 ... 15.50 26.10 \n", 223 | "489 0.05325 ... 19.18 26.56 \n", 224 | "\n", 225 | " worst perimeter worst area worst smoothness worst compactness \\\n", 226 | "293 84.35 517.8 0.1369 0.17580 \n", 227 | "332 76.91 436.1 0.1424 0.09669 \n", 228 | "565 155.00 1731.0 0.1166 0.19220 \n", 229 | "278 98.91 739.1 0.1050 0.07622 \n", 230 | "489 127.30 1084.0 0.1009 0.29200 \n", 231 | "\n", 232 | " worst concavity worst concave points worst symmetry \\\n", 233 | "293 0.13160 0.09140 0.3101 \n", 234 | "332 0.01335 0.02022 0.3292 \n", 235 | "565 0.32150 0.16280 0.2572 \n", 236 | "278 0.10600 0.05185 0.2335 \n", 237 | "489 0.24770 0.08737 0.4677 \n", 238 | "\n", 239 | " worst fractal dimension \n", 240 | "293 0.07007 \n", 241 | "332 0.06522 \n", 242 | "565 0.06637 \n", 243 | "278 0.06263 \n", 244 | "489 0.07623 \n", 245 | "\n", 246 | "[5 rows x 30 columns]" 247 | ] 248 | }, 249 | "execution_count": 2, 250 | "metadata": {}, 251 | "output_type": "execute_result" 252 | } 253 | ], 254 | "source": [ 255 | "# load dataset\n", 256 | "\n", 257 | "breast_cancer = load_breast_cancer()\n", 258 
| "X = pd.DataFrame(breast_cancer.data, columns=breast_cancer.feature_names)\n", 259 | "y = breast_cancer.target\n", 260 | "\n", 261 | "# Separate data into train and test sets\n", 262 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)\n", 263 | "\n", 264 | "X_train.head()" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 3, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "# the ML model for which we want to select features\n", 274 | "\n", 275 | "rf = RandomForestClassifier(\n", 276 | " n_estimators=10,\n", 277 | " random_state=2909,\n", 278 | ")" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 4, 284 | "metadata": {}, 285 | "outputs": [ 286 | { 287 | "data": { 288 | "text/html": [ 289 | "
SelectByShuffling(estimator=RandomForestClassifier(n_estimators=10,\n",
290 |        "                                                   random_state=2909),\n",
291 |        "                  random_state=1, threshold=0)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" 294 | ], 295 | "text/plain": [ 296 | "SelectByShuffling(estimator=RandomForestClassifier(n_estimators=10,\n", 297 | " random_state=2909),\n", 298 | " random_state=1, threshold=0)" 299 | ] 300 | }, 301 | "execution_count": 4, 302 | "metadata": {}, 303 | "output_type": "execute_result" 304 | } 305 | ], 306 | "source": [ 307 | "sel = SelectByShuffling(\n", 308 | " estimator=rf, # the ML model\n", 309 | " scoring='roc_auc', # the metric to evaluate\n", 310 | " threshold=0, # the minimum performance drop a feature must cause to be kept\n", 311 | " cv=3, # cross validation\n", 312 | " random_state=1 # seed\n", 313 | ")\n", 314 | "\n", 315 | "sel.fit(X_train, y_train)" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 5, 321 | "metadata": {}, 322 | "outputs": [ 323 | { 324 | "data": { 325 | "text/plain": [ 326 | "0.9822274044237158" 327 | ] 328 | }, 329 | "execution_count": 5, 330 | "metadata": {}, 331 | "output_type": "execute_result" 332 | } 333 | ], 334 | "source": [ 335 | "# performance of model trained with all features\n", 336 | "\n", 337 | "sel.initial_model_performance_" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 6, 343 | "metadata": {}, 344 | "outputs": [ 345 | { 346 | "data": { 347 | "image/png": "\n", 348 | "text/plain": [ 349 | "
" 350 | ] 351 | }, 352 | "metadata": { 353 | "needs_background": "light" 354 | }, 355 | "output_type": "display_data" 356 | } 357 | ], 358 | "source": [ 359 | "# Changes in performance after shuffling each feature\n", 360 | "\n", 361 | "pd.Series(sel.performance_drifts_).plot.bar(figsize=(20,6))\n", 362 | "plt.title('Performance change after shuffling features')\n", 363 | "plt.ylabel('ROC-AUC change when feature was shuffled')\n", 364 | "plt.show()" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": 7, 370 | "metadata": {}, 371 | "outputs": [ 372 | { 373 | "data": { 374 | "text/plain": [ 375 | "['mean smoothness',\n", 376 | " 'mean compactness',\n", 377 | " 'mean concavity',\n", 378 | " 'mean symmetry',\n", 379 | " 'mean fractal dimension',\n", 380 | " 'radius error',\n", 381 | " 'texture error',\n", 382 | " 'perimeter error',\n", 383 | " 'smoothness error',\n", 384 | " 'compactness error',\n", 385 | " 'concavity error',\n", 386 | " 'concave points error',\n", 387 | " 'symmetry error',\n", 388 | " 'fractal dimension error',\n", 389 | " 'worst area',\n", 390 | " 'worst compactness',\n", 391 | " 'worst symmetry',\n", 392 | " 'worst fractal dimension']" 393 | ] 394 | }, 395 | "execution_count": 7, 396 | "metadata": {}, 397 | "output_type": "execute_result" 398 | } 399 | ], 400 | "source": [ 401 | "# the features to remove\n", 402 | "\n", 403 | "sel.features_to_drop_" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": 8, 409 | "metadata": {}, 410 | "outputs": [ 411 | { 412 | "data": { 413 | "text/plain": [ 414 | "Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',\n", 415 | " 'mean smoothness', 'mean compactness', 'mean concavity',\n", 416 | " 'mean concave points', 'mean symmetry', 'mean fractal dimension',\n", 417 | " 'radius error', 'texture error', 'perimeter error', 'area error',\n", 418 | " 'smoothness error', 'compactness error', 'concavity error',\n", 419 | " 'concave points error', 'symmetry 
error', 'fractal dimension error',\n", 420 | " 'worst radius', 'worst texture', 'worst perimeter', 'worst area',\n", 421 | " 'worst smoothness', 'worst compactness', 'worst concavity',\n", 422 | " 'worst concave points', 'worst symmetry', 'worst fractal dimension'],\n", 423 | " dtype='object')" 424 | ] 425 | }, 426 | "execution_count": 8, 427 | "metadata": {}, 428 | "output_type": "execute_result" 429 | } 430 | ], 431 | "source": [ 432 | "# the selected features\n", 433 | "\n", 434 | "(pd.Series(sel.performance_drifts_)>0).index" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": 9, 440 | "metadata": {}, 441 | "outputs": [ 442 | { 443 | "data": { 444 | "text/html": [ 445 | "
\n", 446 | "\n", 459 | "\n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | "
mean radiusmean texturemean perimetermean areamean concave pointsarea errorworst radiusworst textureworst perimeterworst smoothnessworst concavityworst concave points
51213.4020.5288.64556.70.0817233.6716.4129.66113.300.157400.510600.20510
45713.2125.2584.10537.90.0206817.5814.3534.2391.290.128900.139000.06005
43914.0215.6689.59606.50.0265219.2514.9119.3196.530.103400.062600.08216
29814.2618.1791.22633.10.0137420.5616.2225.26105.800.094450.156500.07530
3713.0318.4282.61523.80.0292314.1613.3022.8184.460.097010.048330.05013
\n", 555 | "
" 556 | ], 557 | "text/plain": [ 558 | " mean radius mean texture mean perimeter mean area \\\n", 559 | "512 13.40 20.52 88.64 556.7 \n", 560 | "457 13.21 25.25 84.10 537.9 \n", 561 | "439 14.02 15.66 89.59 606.5 \n", 562 | "298 14.26 18.17 91.22 633.1 \n", 563 | "37 13.03 18.42 82.61 523.8 \n", 564 | "\n", 565 | " mean concave points area error worst radius worst texture \\\n", 566 | "512 0.08172 33.67 16.41 29.66 \n", 567 | "457 0.02068 17.58 14.35 34.23 \n", 568 | "439 0.02652 19.25 14.91 19.31 \n", 569 | "298 0.01374 20.56 16.22 25.26 \n", 570 | "37 0.02923 14.16 13.30 22.81 \n", 571 | "\n", 572 | " worst perimeter worst smoothness worst concavity worst concave points \n", 573 | "512 113.30 0.15740 0.51060 0.20510 \n", 574 | "457 91.29 0.12890 0.13900 0.06005 \n", 575 | "439 96.53 0.10340 0.06260 0.08216 \n", 576 | "298 105.80 0.09445 0.15650 0.07530 \n", 577 | "37 84.46 0.09701 0.04833 0.05013 " 578 | ] 579 | }, 580 | "execution_count": 9, 581 | "metadata": {}, 582 | "output_type": "execute_result" 583 | } 584 | ], 585 | "source": [ 586 | "# reduce dataset\n", 587 | "\n", 588 | "X_train_t = sel.transform(X_train)\n", 589 | "X_test_t = sel.transform(X_test)\n", 590 | "\n", 591 | "X_test_t.head()" 592 | ] 593 | } 594 | ], 595 | "metadata": { 596 | "kernelspec": { 597 | "display_name": "fsml", 598 | "language": "python", 599 | "name": "fsml" 600 | }, 601 | "language_info": { 602 | "codemirror_mode": { 603 | "name": "ipython", 604 | "version": 3 605 | }, 606 | "file_extension": ".py", 607 | "mimetype": "text/x-python", 608 | "name": "python", 609 | "nbconvert_exporter": "python", 610 | "pygments_lexer": "ipython3", 611 | "version": "3.10.5" 612 | }, 613 | "toc": { 614 | "base_numbering": 1, 615 | "nav_menu": {}, 616 | "number_sections": true, 617 | "sideBar": true, 618 | "skip_h1_title": false, 619 | "title_cell": "Table of Contents", 620 | "title_sidebar": "Contents", 621 | "toc_cell": false, 622 | "toc_position": {}, 623 | "toc_section_display": "block", 624 | 
"toc_window_display": true 625 | } 626 | }, 627 | "nbformat": 4, 628 | "nbformat_minor": 2 629 | } 630 | -------------------------------------------------------------------------------- /2022_DataTalksClub_FeatureSelection.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solegalli/DataTalks.Club2022/a0cea4cdfccaf17fb714d0012142ce4a20a7b8c2/2022_DataTalksClub_FeatureSelection.pdf -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) - Soledad Galli 4 | Feature Selection in Machine Learning - Book: 5 | https://leanpub.com/feature-selection-in-machine-learning 6 | 7 | 8 | Redistribution and use in source and binary forms, with or without 9 | modification, are permitted provided that the following conditions are met: 10 | 11 | 1. Redistributions of source code must retain the above copyright notice, this 12 | list of conditions and the following disclaimer. 13 | 14 | 2. Redistributions in binary form must reproduce the above copyright notice, 15 | this list of conditions and the following disclaimer in the documentation 16 | and/or other materials provided with the distribution. 17 | 18 | 3. Neither the name of the copyright holder nor the names of its 19 | contributors may be used to endorse or promote products derived from 20 | this software without specific prior written permission. 21 | 22 | 23 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 24 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 26 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 27 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 29 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 30 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 31 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 32 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![PythonVersion](https://img.shields.io/badge/python-3.6%20|%203.7%20|%203.8%20|%203.9-success) 2 | [![License https://github.com/solegalli/DataTalks.Club2022/blob/main/LICENSE](https://img.shields.io/badge/license-BSD-success.svg)](https://github.com/solegalli/DataTalks.Club2022/blob/main/LICENSE) 3 | [![Sponsorship https://www.trainindata.com/](https://img.shields.io/badge/Powered%20By-TrainInData-orange.svg)](https://www.trainindata.com/) 4 | 5 | ## Feature Selection in Machine Learning 6 | 7 | ## Links 8 | 9 | - [Book](https://leanpub.com/feature-selection-in-machine-learning) 10 | - [Course](https://www.trainindata.com/p/feature-selection-for-machine-learning) 11 | 12 | ## Talk: Contents 13 | 14 | 1. Constant Features 15 | 2. ANOVA 16 | 3. Lasso 17 | 4. Feature Shuffling 18 | --------------------------------------------------------------------------------