├── .gitignore ├── 01-Constant-features.ipynb ├── 02-One-way-ANOVA.ipynb ├── 03-Lasso.ipynb ├── 04-Feature-shuffling.ipynb ├── 2022_DataTalksClub_FeatureSelection.pdf ├── LICENSE └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Jupyter Notebook 2 | .ipynb_checkpoints 3 | 4 | # datasets 5 | *.csv 6 | *.zip -------------------------------------------------------------------------------- /01-Constant-features.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Constant features" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd\n", 17 | "\n", 18 | "from sklearn.datasets import make_classification\n", 19 | "from sklearn.model_selection import train_test_split\n", 20 | "\n", 21 | "from sklearn.feature_selection import VarianceThreshold\n", 22 | "from feature_engine.selection import DropConstantFeatures" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "data": { 32 | "text/html": [ 33 | "
\n", 51 | " | 0 | \n", 52 | "1 | \n", 53 | "2 | \n", 54 | "3 | \n", 55 | "4 | \n", 56 | "5 | \n", 57 | "6 | \n", 58 | "7 | \n", 59 | "8 | \n", 60 | "9 | \n", 61 | "
---|---|---|---|---|---|---|---|---|---|---|
0 | \n", 66 | "1 | \n", 67 | "-0.376539 | \n", 68 | "-0.620180 | \n", 69 | "-0.157567 | \n", 70 | "-1.120805 | \n", 71 | "1 | \n", 72 | "-1.574578 | \n", 73 | "1.678046 | \n", 74 | "1.080180 | \n", 75 | "1 | \n", 76 | "
1 | \n", 79 | "1 | \n", 80 | "0.762409 | \n", 81 | "-0.784210 | \n", 82 | "-0.096479 | \n", 83 | "-0.408758 | \n", 84 | "1 | \n", 85 | "0.210942 | \n", 86 | "-0.850449 | \n", 87 | "-0.461301 | \n", 88 | "1 | \n", 89 | "
2 | \n", 92 | "1 | \n", 93 | "2.227934 | \n", 94 | "0.547727 | \n", 95 | "-0.341481 | \n", 96 | "-0.817577 | \n", 97 | "1 | \n", 98 | "-2.663678 | \n", 99 | "2.440042 | \n", 100 | "1.698919 | \n", 101 | "1 | \n", 102 | "
3 | \n", 105 | "1 | \n", 106 | "0.061129 | \n", 107 | "-0.995868 | \n", 108 | "-0.214351 | \n", 109 | "-0.558957 | \n", 110 | "1 | \n", 111 | "-2.149167 | \n", 112 | "2.294192 | \n", 113 | "-1.383965 | \n", 114 | "1 | \n", 115 | "
4 | \n", 118 | "1 | \n", 119 | "0.046349 | \n", 120 | "0.834756 | \n", 121 | "-0.104845 | \n", 122 | "-0.455528 | \n", 123 | "1 | \n", 124 | "-0.911018 | \n", 125 | "0.898098 | \n", 126 | "1.068259 | \n", 127 | "1 | \n", 128 | "
VarianceThreshold(threshold=0)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
VarianceThreshold(threshold=0)
\n", 355 | " | x1 | \n", 356 | "x2 | \n", 357 | "x3 | \n", 358 | "x4 | \n", 359 | "x6 | \n", 360 | "x7 | \n", 361 | "x8 | \n", 362 | "
---|---|---|---|---|---|---|---|
0 | \n", 367 | "0.039801 | \n", 368 | "1.501392 | \n", 369 | "-0.189240 | \n", 370 | "1.546828 | \n", 371 | "-1.831193 | \n", 372 | "1.919634 | \n", 373 | "0.209412 | \n", 374 | "
1 | \n", 377 | "-0.078494 | \n", 378 | "-1.536507 | \n", 379 | "-0.496806 | \n", 380 | "0.965100 | \n", 381 | "-0.873804 | \n", 382 | "-1.246872 | \n", 383 | "0.629114 | \n", 384 | "
2 | \n", 387 | "-0.731712 | \n", 388 | "0.972453 | \n", 389 | "-0.309300 | \n", 390 | "-1.432922 | \n", 391 | "-0.419046 | \n", 392 | "-0.975984 | \n", 393 | "0.377169 | \n", 394 | "
3 | \n", 397 | "-0.121187 | \n", 398 | "0.516685 | \n", 399 | "-0.800862 | \n", 400 | "-0.736170 | \n", 401 | "-1.219396 | \n", 402 | "-2.312341 | \n", 403 | "-1.027631 | \n", 404 | "
4 | \n", 407 | "-2.089187 | \n", 408 | "0.899235 | \n", 409 | "-0.241111 | \n", 410 | "1.287536 | \n", 411 | "0.643273 | \n", 412 | "-2.310912 | \n", 413 | "0.085618 | \n", 414 | "
DropConstantFeatures()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DropConstantFeatures()
\n", 552 | " | 1 | \n", 553 | "2 | \n", 554 | "3 | \n", 555 | "4 | \n", 556 | "6 | \n", 557 | "7 | \n", 558 | "8 | \n", 559 | "
---|---|---|---|---|---|---|---|
105 | \n", 564 | "0.039801 | \n", 565 | "1.501392 | \n", 566 | "-0.189240 | \n", 567 | "1.546828 | \n", 568 | "-1.831193 | \n", 569 | "1.919634 | \n", 570 | "0.209412 | \n", 571 | "
68 | \n", 574 | "-0.078494 | \n", 575 | "-1.536507 | \n", 576 | "-0.496806 | \n", 577 | "0.965100 | \n", 578 | "-0.873804 | \n", 579 | "-1.246872 | \n", 580 | "0.629114 | \n", 581 | "
479 | \n", 584 | "-0.731712 | \n", 585 | "0.972453 | \n", 586 | "-0.309300 | \n", 587 | "-1.432922 | \n", 588 | "-0.419046 | \n", 589 | "-0.975984 | \n", 590 | "0.377169 | \n", 591 | "
399 | \n", 594 | "-0.121187 | \n", 595 | "0.516685 | \n", 596 | "-0.800862 | \n", 597 | "-0.736170 | \n", 598 | "-1.219396 | \n", 599 | "-2.312341 | \n", 600 | "-1.027631 | \n", 601 | "
434 | \n", 604 | "-2.089187 | \n", 605 | "0.899235 | \n", 606 | "-0.241111 | \n", 607 | "1.287536 | \n", 608 | "0.643273 | \n", 609 | "-2.310912 | \n", 610 | "0.085618 | \n", 611 | "
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
SelectFromModel(estimator=Lasso(alpha=0.001, random_state=10))In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
SelectFromModel(estimator=Lasso(alpha=0.001, random_state=10))
Lasso(alpha=0.001, random_state=10)
Lasso(alpha=0.001, random_state=10)
\n", 54 | " | mean radius | \n", 55 | "mean texture | \n", 56 | "mean perimeter | \n", 57 | "mean area | \n", 58 | "mean smoothness | \n", 59 | "mean compactness | \n", 60 | "mean concavity | \n", 61 | "mean concave points | \n", 62 | "mean symmetry | \n", 63 | "mean fractal dimension | \n", 64 | "... | \n", 65 | "worst radius | \n", 66 | "worst texture | \n", 67 | "worst perimeter | \n", 68 | "worst area | \n", 69 | "worst smoothness | \n", 70 | "worst compactness | \n", 71 | "worst concavity | \n", 72 | "worst concave points | \n", 73 | "worst symmetry | \n", 74 | "worst fractal dimension | \n", 75 | "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
293 | \n", 80 | "11.85 | \n", 81 | "17.46 | \n", 82 | "75.54 | \n", 83 | "432.7 | \n", 84 | "0.08372 | \n", 85 | "0.05642 | \n", 86 | "0.026880 | \n", 87 | "0.022800 | \n", 88 | "0.1875 | \n", 89 | "0.05715 | \n", 90 | "... | \n", 91 | "13.06 | \n", 92 | "25.75 | \n", 93 | "84.35 | \n", 94 | "517.8 | \n", 95 | "0.1369 | \n", 96 | "0.17580 | \n", 97 | "0.13160 | \n", 98 | "0.09140 | \n", 99 | "0.3101 | \n", 100 | "0.07007 | \n", 101 | "
332 | \n", 104 | "11.22 | \n", 105 | "19.86 | \n", 106 | "71.94 | \n", 107 | "387.3 | \n", 108 | "0.10540 | \n", 109 | "0.06779 | \n", 110 | "0.005006 | \n", 111 | "0.007583 | \n", 112 | "0.1940 | \n", 113 | "0.06028 | \n", 114 | "... | \n", 115 | "11.98 | \n", 116 | "25.78 | \n", 117 | "76.91 | \n", 118 | "436.1 | \n", 119 | "0.1424 | \n", 120 | "0.09669 | \n", 121 | "0.01335 | \n", 122 | "0.02022 | \n", 123 | "0.3292 | \n", 124 | "0.06522 | \n", 125 | "
565 | \n", 128 | "20.13 | \n", 129 | "28.25 | \n", 130 | "131.20 | \n", 131 | "1261.0 | \n", 132 | "0.09780 | \n", 133 | "0.10340 | \n", 134 | "0.144000 | \n", 135 | "0.097910 | \n", 136 | "0.1752 | \n", 137 | "0.05533 | \n", 138 | "... | \n", 139 | "23.69 | \n", 140 | "38.25 | \n", 141 | "155.00 | \n", 142 | "1731.0 | \n", 143 | "0.1166 | \n", 144 | "0.19220 | \n", 145 | "0.32150 | \n", 146 | "0.16280 | \n", 147 | "0.2572 | \n", 148 | "0.06637 | \n", 149 | "
278 | \n", 152 | "13.59 | \n", 153 | "17.84 | \n", 154 | "86.24 | \n", 155 | "572.3 | \n", 156 | "0.07948 | \n", 157 | "0.04052 | \n", 158 | "0.019970 | \n", 159 | "0.012380 | \n", 160 | "0.1573 | \n", 161 | "0.05520 | \n", 162 | "... | \n", 163 | "15.50 | \n", 164 | "26.10 | \n", 165 | "98.91 | \n", 166 | "739.1 | \n", 167 | "0.1050 | \n", 168 | "0.07622 | \n", 169 | "0.10600 | \n", 170 | "0.05185 | \n", 171 | "0.2335 | \n", 172 | "0.06263 | \n", 173 | "
489 | \n", 176 | "16.69 | \n", 177 | "20.20 | \n", 178 | "107.10 | \n", 179 | "857.6 | \n", 180 | "0.07497 | \n", 181 | "0.07112 | \n", 182 | "0.036490 | \n", 183 | "0.023070 | \n", 184 | "0.1846 | \n", 185 | "0.05325 | \n", 186 | "... | \n", 187 | "19.18 | \n", 188 | "26.56 | \n", 189 | "127.30 | \n", 190 | "1084.0 | \n", 191 | "0.1009 | \n", 192 | "0.29200 | \n", 193 | "0.24770 | \n", 194 | "0.08737 | \n", 195 | "0.4677 | \n", 196 | "0.07623 | \n", 197 | "
5 rows × 30 columns
\n", 201 | "SelectByShuffling(estimator=RandomForestClassifier(n_estimators=10,\n", 290 | " random_state=2909),\n", 291 | " random_state=1, threshold=0)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
SelectByShuffling(estimator=RandomForestClassifier(n_estimators=10,\n", 292 | " random_state=2909),\n", 293 | " random_state=1, threshold=0)
RandomForestClassifier(n_estimators=10, random_state=2909)
RandomForestClassifier(n_estimators=10, random_state=2909)
\n", 463 | " | mean radius | \n", 464 | "mean texture | \n", 465 | "mean perimeter | \n", 466 | "mean area | \n", 467 | "mean concave points | \n", 468 | "area error | \n", 469 | "worst radius | \n", 470 | "worst texture | \n", 471 | "worst perimeter | \n", 472 | "worst smoothness | \n", 473 | "worst concavity | \n", 474 | "worst concave points | \n", 475 | "
---|---|---|---|---|---|---|---|---|---|---|---|---|
512 | \n", 480 | "13.40 | \n", 481 | "20.52 | \n", 482 | "88.64 | \n", 483 | "556.7 | \n", 484 | "0.08172 | \n", 485 | "33.67 | \n", 486 | "16.41 | \n", 487 | "29.66 | \n", 488 | "113.30 | \n", 489 | "0.15740 | \n", 490 | "0.51060 | \n", 491 | "0.20510 | \n", 492 | "
457 | \n", 495 | "13.21 | \n", 496 | "25.25 | \n", 497 | "84.10 | \n", 498 | "537.9 | \n", 499 | "0.02068 | \n", 500 | "17.58 | \n", 501 | "14.35 | \n", 502 | "34.23 | \n", 503 | "91.29 | \n", 504 | "0.12890 | \n", 505 | "0.13900 | \n", 506 | "0.06005 | \n", 507 | "
439 | \n", 510 | "14.02 | \n", 511 | "15.66 | \n", 512 | "89.59 | \n", 513 | "606.5 | \n", 514 | "0.02652 | \n", 515 | "19.25 | \n", 516 | "14.91 | \n", 517 | "19.31 | \n", 518 | "96.53 | \n", 519 | "0.10340 | \n", 520 | "0.06260 | \n", 521 | "0.08216 | \n", 522 | "
298 | \n", 525 | "14.26 | \n", 526 | "18.17 | \n", 527 | "91.22 | \n", 528 | "633.1 | \n", 529 | "0.01374 | \n", 530 | "20.56 | \n", 531 | "16.22 | \n", 532 | "25.26 | \n", 533 | "105.80 | \n", 534 | "0.09445 | \n", 535 | "0.15650 | \n", 536 | "0.07530 | \n", 537 | "
37 | \n", 540 | "13.03 | \n", 541 | "18.42 | \n", 542 | "82.61 | \n", 543 | "523.8 | \n", 544 | "0.02923 | \n", 545 | "14.16 | \n", 546 | "13.30 | \n", 547 | "22.81 | \n", 548 | "84.46 | \n", 549 | "0.09701 | \n", 550 | "0.04833 | \n", 551 | "0.05013 | \n", 552 | "