├── .gitignore ├── LICENSE ├── README.md ├── article ├── css │ └── book.css ├── images │ ├── R_class_acc_random.svg │ ├── R_class_dorp_random.svg │ ├── R_class_drop_random.svg │ ├── R_class_gini_random.svg │ ├── R_regr_MSE_random.svg │ ├── R_regr_RSS_random.svg │ ├── R_regr_drop_random.svg │ ├── cancer-dependencies.png │ ├── cancer_corr.svg │ ├── cancer_dep.svg │ ├── cancer_dep_less4.svg │ ├── cancer_dflt_imp.svg │ ├── cancer_dropcol_imp.svg │ ├── cancer_imp.svg │ ├── cls_dflt.svg │ ├── cls_dflt_R.svg │ ├── cls_dflt_random.pdf │ ├── cls_dflt_random.svg │ ├── cls_dflt_random_R.pdf │ ├── cls_dflt_random_R.svg │ ├── cls_dflt_random_R_annotated.png │ ├── cls_dflt_random_annotated.png │ ├── cls_drop_R.svg │ ├── cls_drop_random_R.svg │ ├── cls_dropcol.svg │ ├── cls_dropcol_random.svg │ ├── cls_permute.svg │ ├── cls_permute_R.svg │ ├── cls_permute_random.svg │ ├── cls_permute_random_R.pdf │ ├── cls_permute_random_R.svg │ ├── collinear_dflt.svg │ ├── collinear_dflt_longitude_dup.svg │ ├── collinear_dflt_longitude_noise_0.0005.svg │ ├── collinear_dflt_longitude_noise_0.0010.svg │ ├── collinear_dflt_longitude_noise_3.0000.svg │ ├── collinear_dropcol.svg │ ├── collinear_dropcol_bathrooms_dup.svg │ ├── collinear_dropcol_longitude_dup.svg │ ├── collinear_permute.svg │ ├── collinear_permute_bedrooms_noise_1.0000.svg │ ├── collinear_permute_bedrooms_noise_2.0000.svg │ ├── collinear_permute_bedrooms_noise_3.0000.svg │ ├── collinear_permute_longitude_dup.svg │ ├── collinear_permute_longitude_noise_0.0005.svg │ ├── collinear_permute_longitude_noise_0.0010.svg │ ├── corr_matrix.png │ ├── corrheatmap.png │ ├── corrheatmap.svg │ ├── dependencies.png │ ├── diagrams.graffle │ ├── grouped_dup_imp.svg │ ├── grouped_imp.svg │ ├── imp.svg │ ├── latlong_imp.svg │ ├── regr_dflt.pdf │ ├── regr_dflt.svg │ ├── regr_dflt_R.pdf │ ├── regr_dflt_R.svg │ ├── regr_dflt_random.pdf │ ├── regr_dflt_random.svg │ ├── regr_dflt_random_R.pdf │ ├── regr_dflt_random_R.svg │ ├── regr_dflt_random_R_annotated.png │ ├── regr_dflt_random_annotated.png │ ├── regr_drop_R.svg │ ├── regr_drop_random_R.svg │ ├── regr_dropcol.svg │ ├── regr_dropcol_random.svg │ ├── regr_permute.svg │ ├── regr_permute_R.svg │ ├── regr_permute_random.svg │ ├── regr_permute_random_R.svg │ ├── rent-pimp-sample-size.svg │ ├── subset_imp.svg │ └── test-dataset-pimp-sample-size.svg └── index.html ├── notebooks ├── .ipynb_checkpoints │ └── R_to_Python-checkpoint.ipynb ├── R_to_Python.ipynb ├── collinear.ipynb ├── data │ ├── imp_R_class_acc.csv │ ├── imp_R_class_drop.csv │ ├── imp_R_class_gini.csv │ ├── imp_R_regr_MSE.csv │ ├── imp_R_regr_RSS.csv │ ├── imp_R_regr_drop.csv │ ├── rent-cls.csv │ └── rent.csv ├── imp_R_regr_MSE.csv ├── imp_R_regr_RSS.csv ├── permutation-importances-classifier.Rmd ├── permutation-importances-classifier.html ├── permutation-importances-classifier.ipynb ├── permutation-importances-regressor.Rmd ├── permutation-importances-regressor.html ├── permutation-importances-regressor.ipynb ├── pimp.ipynb ├── pimp_plots.ipynb └── rfpimp-collinear.ipynb └── src ├── LICENSE ├── play_plot.py ├── rfpimp.py ├── run_feat_dep.py ├── run_perm_imp.py ├── run_perm_imp_cancer.py ├── run_rent_imp.py ├── setup.cfg └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info/ 2 | __pycache__/ 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 
Terence Parr 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Feature importances for scikit-learn machine learning models 2 | 3 | By Terence Parr and Kerem Turgutlu. See [Explained.ai](http://explained.ai) for more stuff. 4 | 5 | The scikit-learn Random Forest feature importance strategy is the mean-decrease-in-impurity (or Gini importance) mechanism, which is unreliable. 6 | To get reliable results, use permutation importance, provided in the `rfpimp` package in the `src` dir. Install with: 7 | 8 | `pip install rfpimp` 9 | 10 | We include permutation and drop-column importance measures that work with any sklearn model. Yes, `rfpimp` is an increasingly ill-suited name, but we still like it. 11 | 12 | ## Description 13 | 14 | See Beware Default Random Forest Importances for a deeper discussion of the issues surrounding feature importances in random forests (authored by Terence Parr, Kerem Turgutlu, Christopher Csiszar, and Jeremy Howard). 15 | 16 | The mean-decrease-in-impurity importance of a feature is computed by measuring how effective the feature is at reducing uncertainty (classifiers) or variance (regressors) when creating decision trees within random forests. The problem is that this mechanism, while fast, does not always give an accurate picture of importance. Strobl et al. pointed out in Bias in random forest variable importance measures: Illustrations, sources and a solution that “the variable importance measures of Breiman's original random forest method ... are not reliable in situations where potential predictor variables vary in their scale of measurement or their number of categories.” 17 | 18 | A more reliable method is permutation importance, which measures the importance of a feature as follows. Record a baseline accuracy (classifier) or R2 score (regressor) by passing a validation set or the out-of-bag (OOB) samples through the random forest. Permute the column values of a single predictor feature and then pass all test samples back through the random forest and recompute the accuracy or R2. The importance of that feature is the drop in overall accuracy or R2 caused by permuting the column, i.e., the baseline score minus the score after permutation.
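In code, the idea looks roughly like the following minimal sketch (for illustration only; it is not the `rfpimp` implementation, and it assumes an already-fitted sklearn model plus a held-out `X_valid`/`y_valid` pair):

```python
import numpy as np
import pandas as pd

def permutation_importances_sketch(model, X_valid, y_valid):
    # baseline score: accuracy for classifiers, R^2 for regressors
    baseline = model.score(X_valid, y_valid)
    imp = {}
    for col in X_valid.columns:
        saved = X_valid[col].copy()
        # shuffle one column to break its relationship with the target
        X_valid[col] = np.random.permutation(X_valid[col].values)
        imp[col] = baseline - model.score(X_valid, y_valid)  # drop in score
        X_valid[col] = saved  # restore the original column
    return pd.Series(imp).sort_values(ascending=False)
```

In practice you would call `rfpimp`'s `importances()` (shown in the sample code below) rather than roll your own.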
The permutation mechanism is much more computationally expensive than the mean-decrease-in-impurity mechanism, but the results are more reliable. 19 | 20 | ## Sample code 21 | 22 | See the [notebooks directory](https://github.com/parrt/random-forest-importances/blob/master/notebooks) for things like [Collinear features](https://github.com/parrt/random-forest-importances/blob/master/notebooks/collinear.ipynb) and [Plotting feature importances](https://github.com/parrt/random-forest-importances/blob/master/notebooks/pimp_plots.ipynb). 23 | 24 | Here's some sample Python code that uses the `rfpimp` package contained in the `src` directory. The data can be found in rent.csv, which is a subset of the data from Kaggle's Two Sigma Connect: Rental Listing Inquiries competition. 25 | 26 | 27 | ```python 28 | from rfpimp import * 29 | import pandas as pd 30 | from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier 31 | from sklearn.model_selection import train_test_split 32 | import numpy as np 33 | df_orig = pd.read_csv("/Users/parrt/github/random-forest-importances/notebooks/data/rent.csv") 34 | 35 | df = df_orig.copy() 36 | 37 | # attenuate the effect of outliers in price 38 | df['price'] = np.log(df['price']) 39 | 40 | df_train, df_test = train_test_split(df, test_size=0.20) 41 | 42 | features = ['bathrooms','bedrooms','longitude','latitude', 43 | 'price'] 44 | df_train = df_train[features] 45 | df_test = df_test[features] 46 | 47 | X_train, y_train = df_train.drop('price',axis=1), df_train['price'] 48 | X_test, y_test = df_test.drop('price',axis=1), df_test['price'] 49 | X_train['random'] = np.random.random(size=len(X_train)) 50 | X_test['random'] = np.random.random(size=len(X_test)) 51 | 52 | rf = RandomForestRegressor(n_estimators=100, n_jobs=-1) 53 | rf.fit(X_train, y_train) 54 | 55 | imp = importances(rf, X_test, y_test) # permutation 56 | viz = plot_importances(imp) 57 | viz.view() 58 | 59 | 60 | df_train, df_test = train_test_split(df_orig, test_size=0.20) 61 | features = ['bathrooms','bedrooms','price','longitude','latitude', 62 | 'interest_level'] 63 | df_train = df_train[features] 64 | df_test = df_test[features] 65 | 66 | X_train, y_train = df_train.drop('interest_level',axis=1), df_train['interest_level'] 67 | X_test, y_test = df_test.drop('interest_level',axis=1), df_test['interest_level'] 68 | # Add column of random numbers 69 | X_train['random'] = np.random.random(size=len(X_train)) 70 | X_test['random'] = np.random.random(size=len(X_test)) 71 | 72 | rf = RandomForestClassifier(n_estimators=100, 73 | min_samples_leaf=5, 74 | n_jobs=-1, 75 | oob_score=True) 76 | rf.fit(X_train, y_train) 77 | 78 | imp = importances(rf, X_test, y_test, n_samples=-1) 79 | viz = plot_importances(imp) 80 | viz.view() 81 | ``` 82 | ### Feature correlation 83 | 84 | See [Feature collinearity heatmap](notebooks/rfpimp-collinear.ipynb). We can get Spearman's correlation matrix: 85 | 86 | 87 | 88 | ### Feature dependencies 89 | 90 | The features we use in machine learning are rarely completely independent, which makes interpreting feature importance tricky. We could compute correlation coefficients, but they only identify linear relationships. One way to at least detect whether a feature x depends on the other features is to train a model that uses x as the dependent variable and all of the other features as independent variables. Because random forests give us an easy out-of-bag (OOB) error estimate, the feature dependence functions rely on random forest models.
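The sketch below illustrates the idea (for illustration only, not the `rfpimp` implementation; the hypothetical helper `feature_dependence_scores` assumes every column of `X` is numeric so that each feature can be treated as a regression target):

```python
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

def feature_dependence_scores(X: pd.DataFrame, n_estimators=50):
    scores = {}
    for col in X.columns:
        rf = RandomForestRegressor(n_estimators=n_estimators,
                                   oob_score=True, n_jobs=-1)
        # predict this feature from all of the other features
        rf.fit(X.drop(columns=[col]), X[col])
        scores[col] = rf.oob_score_  # OOB R^2: higher means more dependent
    return pd.Series(scores).sort_values(ascending=False)
```

`rfpimp` provides `feature_dependence_matrix()` and `plot_dependence_heatmap()` for this, so the sketch is only meant to show what is being measured.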
The R^2 of this per-feature model indicates how well the other features predict feature x: the higher the R^2, the more dependent feature x is on the remaining features. 91 | 92 | You can also get a feature dependence matrix / heatmap, a non-symmetric data frame in which each row treats one feature as the model target and reports how important every other feature is for predicting it. Example: 93 | 94 | 95 | -------------------------------------------------------------------------------- /article/css/book.css: -------------------------------------------------------------------------------- 1 | /* 2 | .fig-container { 3 | display: flex; 4 | overflow-x: scroll; 5 | flex: auto; 6 | width: 100%; 7 | flex-wrap: wrap; 8 | flex-flow: row wrap; 9 | } 10 | .fig-container > div { 11 | flex: 1; 12 | } 13 | */ 14 | #toc { 15 | background: #f9f9f9 none repeat scroll 0 0; 16 | border: 1px solid #aaa; 17 | display: table; 18 | font-size: 83%; 19 | padding: 5px; 20 | width: auto; 21 | } 22 | .toc_title { 23 | font-weight: 700; 24 | text-align: center; 25 | } 26 | #toc li, #toc ul, #toc ul li{ 27 | list-style: outside none none !important; 28 | } 29 | #toc a { 30 | color: #3A4145; 31 | text-decoration: none; 32 | } 33 | #toc a:hover { 34 | text-decoration: underline; 35 | } 36 | #author { 37 | font-size: 110%; 38 | } 39 | body { 40 | font-family: "Merriweather", serif; 41 | font-size: 100%; 42 | letter-spacing: 0.01rem; 43 | line-height: 1.5em; 44 | color: #3A4145; 45 | background: #FFFFFF; 46 | max-width: 800px; 47 | margin-left: 2%; 48 | -webkit-text-size-adjust: 100%; 49 | } 50 | p { 51 | margin: 15px 5px 15px 0px; 52 | } 53 | .watermark { 54 | font-size: 100%; 55 | right: 10px; 56 | opacity: 0.4; 57 | color: BLACK; 58 | position: absolute; 59 | top: 30px; 60 | } 61 | .caption { 62 | font-size: 80%; 63 | } 64 | @media (max-width: 415px) { 65 | /* iphone 6+ 414x628 browser window, iphone 6 is 375 x 559 in browser */ 66 | body { 67 | font-size: 100%; 68 | } 69 | .watermark { 70 | position: static; 71 | } 72 | } 73 | @media (max-width: 736px) { 74 | /* iphone7+ 736 wide */ 75 | body { 76 | font-size: 100%; 77 | } 78 | .watermark { 79 | position: static; 80 | } 81 | } 82 | h1 { 83 | line-height: 1.2em; 84 | } 85 | .inlinecode { 86 | font-family: "Monaco", monospace; 87 | font-size: 80%; 88 | } 89 | .codeblk { 90 | font-family: "Monaco", monospace; 91 | font-size: 80%; 92 | box-sizing: border-box; 93 | margin: 0 0 1em 0; 94 | border: #E3EDF3 1px solid; 95 | width: 100%; 96 | padding: 2px; 97 | background: #F7FAFB; 98 | border-radius: 2px; 99 | line-height: 1.3em; 100 | white-space: pre-wrap; 101 | } 102 | table { 103 | border-spacing: 15px 0px 104 | } 105 | table th { 106 | line-height: 1em; 107 | } 108 | .scrollbar_wrapper { 109 | overflow: auto; 110 | } 111 | table.figure { 112 | border: 0; 113 | width: 100%; 114 | } 115 | table.figure td { 116 | width: 50%; 117 | vertical-align: top; 118 | line-height: 1.1em; 119 | font-size: 80%; 120 | } 121 | 122 | table.dataframe { 123 | max-width: none; 124 | padding: 0.5em 0.5em; 125 | border: none; 126 | display: table; 127 | border-spacing: 0px 0px; 128 | border-collapse:separate; 129 | } 130 | table.dataframe td { 131 | font-family: "Helvetica", sans-serif; 132 | font-size: 80%; 133 | text-align: right; 134 | text-size-adjust: 100%; 135 | vertical-align: top; 136 | padding: 2px 5px; 137 | border-bottom: 0px; 138 | line-height: 1.1em; 139 | } 140 | table.dataframe th { 141 | font-family: "Helvetica", sans-serif; 142 | font-size: 80%; 143 | text-align: right; 144 | line-height: 1em; 145 |
text-size-adjust: 100%; 146 | vertical-align: bottom; 147 | padding: 0px 5px; 148 | } 149 | -------------------------------------------------------------------------------- /article/images/cancer-dependencies.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/cancer-dependencies.png -------------------------------------------------------------------------------- /article/images/cls_dflt_R.svg: -------------------------------------------------------------------------------- [SVG markup omitted: horizontal bar chart of feature importances; bar labels bathrooms, bedrooms, longitude, latitude, price; x-axis ticks 0, 1000, 2000, 3000, 4000] -------------------------------------------------------------------------------- /article/images/cls_dflt_random.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/cls_dflt_random.pdf -------------------------------------------------------------------------------- /article/images/cls_dflt_random_R.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/cls_dflt_random_R.pdf -------------------------------------------------------------------------------- /article/images/cls_dflt_random_R_annotated.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/cls_dflt_random_R_annotated.png -------------------------------------------------------------------------------- /article/images/cls_dflt_random_annotated.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/cls_dflt_random_annotated.png -------------------------------------------------------------------------------- /article/images/cls_drop_R.svg: -------------------------------------------------------------------------------- [SVG markup omitted: horizontal bar chart of feature importances; bar labels bathrooms, bedrooms, longitude, latitude, price; x-axis ticks 0.00, 0.02, 0.04, 0.06] -------------------------------------------------------------------------------- /article/images/cls_permute_R.svg: -------------------------------------------------------------------------------- [SVG markup omitted: horizontal bar chart of feature importances; bar labels bathrooms, latitude, longitude, bedrooms, price; x-axis ticks 0.00, 0.05, 0.10] -------------------------------------------------------------------------------- /article/images/cls_permute_random_R.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/cls_permute_random_R.pdf -------------------------------------------------------------------------------- /article/images/collinear_dropcol.svg: -------------------------------------------------------------------------------- [SVG markup omitted: feature-importance plot; no text labels recoverable from the extracted markup] -------------------------------------------------------------------------------- /article/images/collinear_permute.svg: -------------------------------------------------------------------------------- [SVG markup omitted: feature-importance plot; no text labels recoverable from the extracted markup] -------------------------------------------------------------------------------- /article/images/collinear_permute_bedrooms_noise_3.0000.svg: -------------------------------------------------------------------------------- [SVG markup omitted: feature-importance plot; no text labels recoverable from the extracted markup] -------------------------------------------------------------------------------- /article/images/corr_matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/corr_matrix.png -------------------------------------------------------------------------------- /article/images/corrheatmap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/corrheatmap.png -------------------------------------------------------------------------------- /article/images/dependencies.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/dependencies.png -------------------------------------------------------------------------------- /article/images/diagrams.graffle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/diagrams.graffle -------------------------------------------------------------------------------- /article/images/regr_dflt.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/regr_dflt.pdf -------------------------------------------------------------------------------- /article/images/regr_dflt_R.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/regr_dflt_R.pdf -------------------------------------------------------------------------------- /article/images/regr_dflt_R.svg: -------------------------------------------------------------------------------- [SVG markup omitted: horizontal bar chart of feature importances; bar labels bedrooms, latitude, longitude, bathrooms; x-axis ticks 0, 1000, 2000, 3000, 4000] -------------------------------------------------------------------------------- /article/images/regr_dflt_random.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/regr_dflt_random.pdf -------------------------------------------------------------------------------- /article/images/regr_dflt_random_R.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/regr_dflt_random_R.pdf -------------------------------------------------------------------------------- /article/images/regr_dflt_random_R_annotated.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/regr_dflt_random_R_annotated.png -------------------------------------------------------------------------------- /article/images/regr_dflt_random_annotated.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/regr_dflt_random_annotated.png -------------------------------------------------------------------------------- /article/images/regr_drop_R.svg: -------------------------------------------------------------------------------- [SVG markup omitted: horizontal bar chart of feature importances; bar labels bathrooms, bedrooms, latitude, longitude; x-axis ticks 0.00, 0.05, 0.10] -------------------------------------------------------------------------------- /article/images/regr_dropcol_random.svg: -------------------------------------------------------------------------------- [SVG markup omitted: feature-importance plot; no text labels recoverable from the extracted markup] -------------------------------------------------------------------------------- /article/images/regr_permute.svg: -------------------------------------------------------------------------------- [SVG markup omitted: feature-importance plot; no text labels recoverable from the extracted markup] -------------------------------------------------------------------------------- /article/images/regr_permute_R.svg: -------------------------------------------------------------------------------- [SVG markup omitted: horizontal bar chart of feature importances; bar labels bathrooms, bedrooms, latitude, longitude; x-axis ticks 0.00, 0.05, 0.10, 0.15] -------------------------------------------------------------------------------- /article/images/regr_permute_random.svg: -------------------------------------------------------------------------------- [SVG markup omitted: feature-importance plot; no text labels recoverable from the extracted markup] -------------------------------------------------------------------------------- /notebooks/data/imp_R_class_acc.csv: -------------------------------------------------------------------------------- 1 | "","MeanDecreaseAccuracy" 2 | "bathrooms",12.2722647086318 3 | "bedrooms",74.0723470287167 4 |
"price",70.8236306582764 5 | "longitude",40.752136232213 6 | "latitude",47.9714935062742 7 | "random",2.45974994364518 8 | -------------------------------------------------------------------------------- /notebooks/data/imp_R_class_drop.csv: -------------------------------------------------------------------------------- 1 | "","Feature","Importance" 2 | "1","bathrooms",-0.00620954989555456 3 | "2","bedrooms",0.0242944066060036 4 | "3","longitude",0.0201834528683037 5 | "4","latitude",0.0208219205776745 6 | "5","random",-0.0149920610836133 7 | "6","price",0.0542021893740692 8 | -------------------------------------------------------------------------------- /notebooks/data/imp_R_class_gini.csv: -------------------------------------------------------------------------------- 1 | "","MeanDecreaseGini" 2 | "bathrooms",342.994453966487 3 | "bedrooms",957.076393066943 4 | "price",4676.08772137443 5 | "longitude",3803.1220992054 6 | "latitude",3845.96138345266 7 | "random",3945.05069793106 8 | -------------------------------------------------------------------------------- /notebooks/data/imp_R_regr_MSE.csv: -------------------------------------------------------------------------------- 1 | "","X.IncMSE" 2 | "bathrooms",0.0893860247102489 3 | "bedrooms",0.105265104336058 4 | "longitude",0.140712080833606 5 | "latitude",0.12343161145159 6 | "random",-5.68879621046926e-05 7 | -------------------------------------------------------------------------------- /notebooks/data/imp_R_regr_RSS.csv: -------------------------------------------------------------------------------- 1 | "","IncNodePurity" 2 | "bathrooms",3652.79853374327 3 | "bedrooms",1357.02440889748 4 | "longitude",2155.35387929004 5 | "latitude",1528.75792511292 6 | "random",365.143595904819 7 | -------------------------------------------------------------------------------- /notebooks/data/imp_R_regr_drop.csv: -------------------------------------------------------------------------------- 1 | "","Feature","Importance" 2 | "1","bathrooms",0.0423248371964218 3 | "2","bedrooms",0.0913196641210466 4 | "3","longitude",0.130007078476311 5 | "4","latitude",0.106649812499106 6 | "5","random",-0.0226813443094983 7 | -------------------------------------------------------------------------------- /notebooks/imp_R_regr_MSE.csv: -------------------------------------------------------------------------------- 1 | "","X.IncMSE" 2 | "bathrooms",0.0867924590431635 3 | "bedrooms",0.105465708124454 4 | "longitude",0.130956279570924 5 | "latitude",0.113012571751182 6 | "random",0.000373724167032719 7 | -------------------------------------------------------------------------------- /notebooks/imp_R_regr_RSS.csv: -------------------------------------------------------------------------------- 1 | "","IncNodePurity" 2 | "bathrooms",3564.40533324239 3 | "bedrooms",1426.25307431068 4 | "longitude",2164.30801062486 5 | "latitude",1539.71789928082 6 | "random",375.64002923271 7 | -------------------------------------------------------------------------------- /notebooks/permutation-importances-classifier.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Terence Parr Feat Imp" 3 | author: "Christopher Csiszar" 4 | date: "3/22/2018" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE, warning = F, message = F) 10 | ``` 11 | 12 | ## Biases in RF importance calculations 13 | 14 | Simple RF Classification done in R to see how biases their RF feature 
importance metrics are 15 | 16 | ```{r cars} 17 | # RF Classification - feature importances 18 | 19 | library(tidyverse) 20 | library(randomForest) 21 | library(cowplot) 22 | library(gridExtra) 23 | 24 | setwd("~/Downloads/") 25 | rent <- read.csv('rent.csv') 26 | class(rent$interest_level) 27 | #interest.map <- c("low"=1, "medium"=2, "high"=3) 28 | 29 | #ent$interest_level <- interest.map[as.character(rent$interest_level)] 30 | 31 | summary(rent) 32 | 33 | ## plotting functions ## 34 | 35 | create_rfplot <- function(rf, type){ 36 | imp <- importance(rf, type=type, scale = F) 37 | featureImportance <- data.frame(Feature=row.names(imp), Importance=imp[,1]) 38 | 39 | p <- ggplot(featureImportance, aes(x=reorder(Feature, Importance), y=Importance)) + 40 | geom_bar(stat="identity", fill="#53cfff", width = 0.65) + 41 | coord_flip() + 42 | theme_light(base_size=20) + 43 | theme(axis.title.x=element_blank(), 44 | axis.title.y=element_blank(), 45 | axis.text.x = element_text(size = 15, color = "black"), 46 | axis.text.y = element_text(size = 15, color = "black")) 47 | return(p) 48 | } 49 | 50 | create_ggplot <- function(featureImportance){ 51 | p <- ggplot(featureImportance, aes(x=reorder(Feature, Importance), y=Importance)) + 52 | geom_bar(stat="identity", fill="#53cfff", width = 0.65) + 53 | coord_flip() + 54 | theme_light(base_size=20) + 55 | theme(axis.title.x=element_blank(), 56 | axis.title.y=element_blank(), 57 | axis.text.x = element_text(size = 15, color = "black"), 58 | axis.text.y = element_text(size = 15, color = "black")) 59 | return(p) 60 | } 61 | ``` 62 | 63 | ```{r} 64 | rent$interest_level <- as.factor(rent$interest_level) 65 | head(rent) 66 | ``` 67 | 68 | 69 | ## Type = 1, mean decrease in Accuracy 70 | 71 | ```{r cars3} 72 | ####### no random column ######### 73 | set.seed(1) 74 | rent$random <- sample(100, size = nrow(rent), replace = TRUE) 75 | 76 | #Fit Random Forest Model 77 | rf1 = randomForest(interest_level ~ ., 78 | ntree = 40, 79 | data = rent[, 1:6], 80 | nodesize = 1, importance = TRUE) 81 | #print(rf) 82 | 83 | importance(rf1, type = 1) 84 | #round(importance(rf), 2) 85 | 86 | # Variable Importance 87 | k = varImpPlot(rf1, 88 | sort = T, 89 | main="Top - Variable Importance") 90 | 91 | #p1 <- create_rfplot(rf1, type = 1) 92 | #ggsave('../article/images/cls_permute_R.svg', 93 | # plot = p1, device = 'svg', height = 4, width = 6) 94 | ######## with random column ######## 95 | 96 | 97 | #Fit Random Forest Model 98 | rf2 = randomForest(interest_level ~ ., 99 | ntree = 40, 100 | data = rent, 101 | nodesize = 1, importance = TRUE) 102 | #print(rf) 103 | 104 | #importance(rf2, type = 1) 105 | 106 | imp1 <- data.frame(importance(rf2, type = 2)) 107 | write.csv(imp1, file="imp_R_class_gini.csv") 108 | #round(importance(rf), 2) 109 | 110 | imp1 <- data.frame(importance(rf2, type = 1)) 111 | write.csv(imp1, file="imp_R_class_acc.csv") 112 | 113 | # Variable Importance 114 | #varImpPlot(rf, 115 | #sort = T, 116 | #main="Top - Variable Importance") 117 | 118 | #p2 <- create_rfplot(rf2, type = 1) 119 | #ggsave('../article/images/cls_permute_random_R.svg', 120 | # plot = p2, device = 'svg', height = 4, width = 6) 121 | ``` 122 | 123 | ## Type = 2, mean decrease in Gini 124 | 125 | ```{r cars4} 126 | ####### no random column ######### 127 | 128 | 129 | #p1 <- create_rfplot(rf1, type = 2) 130 | #ggsave('../article/images/cls_dflt_R.svg', 131 | #plot = p1, device = 'svg', height = 4, width = 6) 132 | 133 | ######## with random column ######## 134 | 135 | #imp1 <- 
data.frame(importance(rf2, type = 2)) 136 | #write.csv(imp1, file="imp_R_class_gini.csv") 137 | 138 | #p2 <- create_rfplot(rf2, type = 2) 139 | #ggsave('../article/images/cls_dflt_random_R.svg', 140 | #plot = p2, device = 'svg', height = 4, width = 6) 141 | 142 | ``` 143 | 144 | ## Cost by dropping column analysis 145 | 146 | ```{r cars5, eval=FALSE} 147 | ####### no random column ######### 148 | get_drop_imp <- function(rent, columns){ 149 | X <- rent[,c(columns, 'interest_level')] # data 150 | rf <- randomForest(interest_level~., data = X, 151 | ntree = 40, mtry=2, nodesize=1, importance=T) 152 | full_rsq <- -1*mean(rf$err.rate) # negated mean OOB error rate used as the baseline score (higher is better) 153 | 154 | imp <- c() 155 | for (c in columns){ 156 | X_sub <- X[, !(colnames(X) == c)] 157 | rf <- randomForest(interest_level~., data = X_sub, 158 | ntree = 40, mtry=2, nodesize=1, importance=T) 159 | sub_rsq <- -1*mean(rf$err.rate) # score of the model refit without column c 160 | diff_rsq <- full_rsq - sub_rsq 161 | imp <- c(imp, diff_rsq) 162 | } 163 | featureImportance <- data.frame(Feature=columns, Importance=imp) 164 | return(featureImportance) 165 | } 166 | 167 | columns <- c('bathrooms', 'bedrooms', 'longitude', 'latitude', 'price') 168 | featureImportance <- get_drop_imp(rent[, 1:6], columns) 169 | 170 | write.csv(featureImportance, file="imp_R_class_gini.csv") 171 | #p1 <- create_ggplot(featureImportance) 172 | #ggsave('../article/images/cls_drop_R.svg', 173 | #plot = p1, device = 'svg', height = 4, width = 6) 174 | 175 | columns <- c('bathrooms', 'bedrooms', 'longitude', 'latitude', 'random', 'price') 176 | featureImportance <- get_drop_imp(rent, columns) 177 | 178 | write.csv(featureImportance, file="imp_R_class_drop.csv") 179 | #p2 <- create_ggplot(featureImportance) 180 | #ggsave('../article/images/cls_drop_random_R.svg', 181 | #plot = p2, device = 'svg', height = 4, width = 6) 182 | ``` 183 | 184 | ## Takeaways 185 | 186 | R's random forest implementation offers several different feature importance metrics. The “decrease in accuracy” metric places the `random` column dead last, as expected, while the “decrease in Gini” metric is badly biased toward high-cardinality features, ranking the `random` column second most important. 187 | 188 | Note also that, because of its low cardinality, `bedrooms` appears less important under the Gini-decrease metric.
189 | 190 | More of RF feature importance interpretation in R: 191 | 192 | https://cran.r-project.org/web/packages/randomForest/randomForest.pdf 193 | 194 | https://stats.stackexchange.com/questions/197827/how-to-interpret-mean-decrease-in-accuracy-and-mean-decrease-gini-in-random-fore 195 | https://stackoverflow.com/questions/736514/r-random-forests-variable-importance 196 | -------------------------------------------------------------------------------- /notebooks/permutation-importances-regressor.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "R Notebook" 3 | output: pdf_notebook 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE, warning = F, message = F) 8 | ``` 9 | 10 | ```{r warning=FALSE, message=FALSE} 11 | library(tidyverse) 12 | library(randomForest) 13 | library(cowplot) 14 | library(gridExtra) 15 | ``` 16 | 17 | ```{r} 18 | rents <- read.csv('./data/rent.csv') 19 | glimpse(rents) 20 | ``` 21 | 22 | ```{r} 23 | features <- c('bathrooms', 'bedrooms', 'longitude', 'latitude', 'price') 24 | df <- rents[,features] 25 | df$price <- log(df$price) 26 | # with random column 27 | df['random'] <- runif(nrow(df)) 28 | head(df) 29 | ``` 30 | 31 | ## PLOTTING FUNCTIONS 32 | 33 | ```{r} 34 | create_rfplot <- function(rf, type){ 35 | imp <- importance(rf, type=type, scale = F) 36 | featureImportance <- data.frame(Feature=row.names(imp), Importance=imp[,1]) 37 | 38 | p <- ggplot(featureImportance, aes(x=reorder(Feature, Importance), y=Importance)) + 39 | geom_bar(stat="identity", fill="#53cfff", width = 0.65) + 40 | coord_flip() + 41 | theme_light(base_size=20) + 42 | theme(axis.title.x=element_blank(), 43 | axis.title.y=element_blank(), 44 | axis.text.x = element_text(size = 15, color = "black"), 45 | axis.text.y = element_text(size = 15, color = "black")) 46 | return(p) 47 | } 48 | 49 | create_ggplot <- function(featureImportance){ 50 | p <- ggplot(featureImportance, aes(x=reorder(Feature, Importance), y=Importance)) + 51 | geom_bar(stat="identity", fill="#53cfff", width = 0.65) + 52 | coord_flip() + 53 | theme_light(base_size=20) + 54 | theme(axis.title.x=element_blank(), 55 | axis.title.y=element_blank(), 56 | axis.text.x = element_text(size = 15, color = "black"), 57 | axis.text.y = element_text(size = 15, color = "black")) 58 | return(p) 59 | } 60 | ``` 61 | 62 | ## BUILT-IN IMPORTANCE 63 | 64 | **Important Note: Unscaled Feature importances are used while assessing built-in feature importances** 65 | 66 | "Here are the definitions of the variable importance measures. The first measure is computed from permuting OOB data: For each tree, the prediction error on the out-of-bag portion of the data is recorded (error rate for classification, MSE for regression). Then the same is done after permuting each predictor variable. The difference between the two are then averaged over all trees, and normalized by the standard deviation of the differences. If the standard deviation of the differences is equal to 0 for a variable, the division is not done (but the average is almost always equal to 0 in that case). 67 | 68 | The second measure is the total decrease in node impurities from splitting on the variable, averaged over all trees. For classification, the node impurity is measured by the Gini index. For regression, it is measured by residual sum of squares." 
69 | 70 | From : http://ugrad.stat.ubc.ca/R/library/randomForest/html/importance.html 71 | 72 | 73 | #### TYPE 1 = Mean decrease in MSE by **Permutation** 74 | 75 | ```{r} 76 | # without random column 77 | rf1 <- randomForest(price~., data = df[, 1:5], mtry=4, 78 | ntree = 40, importance=T) 79 | importance(rf1, scale=F) 80 | p1 <- create_rfplot(rf1, type = 1) 81 | #ggsave('../article/images/regr_permute_R.svg', 82 | #plot = p1, device = 'svg', height = 4, width = 6) 83 | ``` 84 | 85 | ```{r} 86 | # with random column 87 | rf2 <- randomForest(price~., data = df, mtry = 4, 88 | ntree = 40, importance=T) 89 | importance(rf2, scale=F) 90 | p2 <- create_rfplot(rf2, type = 1) 91 | #ggsave('../article/images/regr_permute_random_R.svg', 92 | #plot = p2, device = 'svg', height = 4, width = 6) 93 | imp1 <- data.frame(importance(rf2, type = 1, scale=F)) 94 | write.csv(imp1, file="./data/imp_R_regr_MSE.csv") 95 | ``` 96 | 97 | #### TYPE 2 = Mean decrease in node impurity (RSS) by splitting on columns, **Python's default** 98 | 99 | ```{r} 100 | # without random column 101 | rf1 <- randomForest(price~., data = df[, 1:5], mtry=4, 102 | ntree = 40, importance=T) 103 | p1 <- create_rfplot(rf1, type = 2) 104 | #ggsave('../article/images/regr_dflt_R.svg', 105 | #plot = p1, device = 'svg', height = 4, width = 6) 106 | 107 | ``` 108 | 109 | 110 | ```{r} 111 | # with random column 112 | rf2 <- randomForest(price~., data = df, mtry = 4, 113 | ntree = 40, importance=T) 114 | p2 <- create_rfplot(rf2, type = 2) 115 | #ggsave('../article/images/regr_dflt_random_R.svg', 116 | #plot = p2, device = 'svg', height = 4, width = 6) 117 | imp1 <- data.frame(importance(rf2, type = 2,scale=F)) 118 | write.csv(imp1, file="./data/imp_R_regr_RSS.csv") 119 | ``` 120 | 121 | 122 | 123 | ## EXAMINE COST BY DROPPING 124 | 125 | 126 | ```{r, eval=F} 127 | # PARAMS : ntree = 40, mtry = 2, nodesize = 1 128 | 129 | get_drop_imp <- function(df, columns){ 130 | X <- df[,c(columns, 'price')] # data 131 | rf <- randomForest(price~., data = X, 132 | ntree = 40, mtry=2, nodesize=1, importance=T) 133 | full_rsq <- mean(rf$rsq) # R-squared 134 | 135 | imp <- c() 136 | for (c in columns){ 137 | X_sub <- X[, !(colnames(X) == c)] 138 | rf <- randomForest(price~., data = X_sub, 139 | ntree = 40, mtry=2, nodesize=1, importance=T) 140 | sub_rsq <- mean(rf$rsq) # R-squared 141 | diff_rsq <- full_rsq - sub_rsq 142 | imp <- c(imp, diff_rsq) 143 | } 144 | featureImportance <- data.frame(Feature=columns, Importance=imp) 145 | return(featureImportance) 146 | } 147 | ``` 148 | 149 | ```{r, eval=F} 150 | columns <- c('bathrooms', 'bedrooms', 'longitude', 'latitude') 151 | featureImportance <- get_drop_imp(df, columns) 152 | p1 <- create_ggplot(featureImportance) 153 | #ggsave('../article/images/regr_drop_R.svg', 154 | #plot = p1, device = 'svg', height = 4, width = 6) 155 | ``` 156 | 157 | ```{r, eval=F} 158 | columns <- c('bathrooms', 'bedrooms', 'longitude', 'latitude', 'random') 159 | featureImportance <- get_drop_imp(df, columns) 160 | p2 <- create_ggplot(featureImportance) 161 | #ggsave('../article/images/regr_drop_random_R.svg', 162 | #plot = p2, device = 'svg', height = 4, width = 6) 163 | 164 | write.csv(featureImportance, file="./data/imp_R_regr_drop.csv") 165 | ``` 166 | 167 | 168 | 169 | -------------------------------------------------------------------------------- /src/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Terence Parr and Kerem Turgutlu 4 | 5 | 
Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/play_plot.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | import seaborn as sns 5 | from sklearn.ensemble import RandomForestClassifier 6 | from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_error 7 | from sklearn.model_selection import cross_val_score 8 | from sklearn.model_selection import train_test_split 9 | from sklearn.base import clone 10 | from rfpimp import * 11 | from sklearn.ensemble import RandomForestClassifier 12 | from sklearn.datasets import load_breast_cancer 13 | from sklearn.model_selection import train_test_split 14 | from sklearn.metrics import accuracy_score 15 | 16 | df_all = pd.read_csv("../notebooks/data/rent-cls.csv") 17 | 18 | num_features = ['bathrooms','bedrooms','latitude','longitude','price'] 19 | target = 'interest_level' 20 | 21 | df = df_all[num_features + [target]] 22 | 23 | 24 | def test1(): 25 | # compute median per num bedrooms 26 | df_median_price_per_bedrooms = df.groupby(by='bedrooms')['price'].median().reset_index() 27 | beds_to_median = df_median_price_per_bedrooms.to_dict(orient='dict')['price'] 28 | df['median_price_per_bedrooms'] = df['bedrooms'].map(beds_to_median) 29 | # compute ratio of price to median price for that num of bedrooms 30 | df['price_to_median_beds'] = df['price'] / df['median_price_per_bedrooms'] 31 | # ratio of num bedrooms to price 32 | df["beds_per_price"] = df["bedrooms"] / df["price"] 33 | # total rooms (bed, bath) 34 | df["beds_baths"] = df["bedrooms"]+df["bathrooms"] 35 | del df['median_price_per_bedrooms'] # don't need after computation 36 | 37 | df_train, df_test = train_test_split(df, test_size=0.15) 38 | 39 | X_train, y_train = df_train.drop('interest_level',axis=1), df_train['interest_level'] 40 | X_test, y_test = df_test.drop('interest_level',axis=1), df_test['interest_level'] 41 | 42 | rf = RandomForestClassifier(n_estimators=50, n_jobs=-1, 43 | max_features=1.0, 44 | min_samples_leaf=10, oob_score=True) 45 | rf.fit(X_train, y_train) 46 | 47 | I = importances(rf, X_test, y_test) 48 | return I 49 | 50 | 51 | def test2(): 52 | df_train, df_test = train_test_split(df, test_size=0.15) 53 | X_train, y_train = df_train.drop('interest_level',axis=1), df_train['interest_level'] 54 | X_test, y_test = 
df_test.drop('interest_level',axis=1), df_test['interest_level'] 55 | rf = RandomForestClassifier(n_estimators=50, n_jobs=-1, 56 | max_features=1.0, 57 | min_samples_leaf=10, oob_score=True) 58 | rf.fit(X_train, y_train) 59 | I = importances(rf, X_test, y_test, features=['bedrooms','bathrooms',['latitude', 'longitude']]) 60 | return I 61 | 62 | 63 | def test3(): 64 | 65 | cancer = load_breast_cancer() 66 | 67 | X, y = cancer.data, cancer.target 68 | # show first 5 columns only 69 | # df = pd.DataFrame(X[:, 0:10], columns=cancer.feature_names[0:10]) 70 | df = pd.DataFrame(X, columns=cancer.feature_names) 71 | #df['diagnosis'] = cancer.target 72 | X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.3) 73 | 74 | cl = RandomForestClassifier(n_estimators=20) 75 | cl.fit(X_train, y_train) 76 | 77 | I = importances(cl, X_test, y_test) 78 | return I 79 | 80 | 81 | viz = plot_importances(test1()) 82 | viz.save(filename='/tmp/t.svg') 83 | I = test2() 84 | viz = plot_importances(I) 85 | # viz.save(filename='/tmp/t2.svg') 86 | viz.view() 87 | 88 | # I = test3() 89 | # viz = plot_importances(I) 90 | # viz.save(filename='/tmp/t3.svg') 91 | 92 | #cancer = load_breast_cancer() 93 | # X, y = cancer.data, cancer.target 94 | # df = pd.DataFrame(X, columns=cancer.feature_names) 95 | #viz = plot_dependence_heatmap(D, figsize=(12, 12)) 96 | 97 | # D = feature_dependence_matrix(df, n_samples=5000) 98 | # viz = plot_dependence_heatmap(D, figsize=(4,4)) 99 | # viz.view() 100 | 101 | #print(feature_dependence_matrix(df)) -------------------------------------------------------------------------------- /src/run_feat_dep.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | import seaborn as sns 5 | from sklearn.ensemble import RandomForestRegressor 6 | from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_error 7 | from pandas.api.types import is_string_dtype, is_numeric_dtype, is_object_dtype, is_categorical_dtype 8 | from sklearn.model_selection import cross_val_score 9 | from sklearn.model_selection import train_test_split 10 | from sklearn.base import clone 11 | 12 | from timeit import default_timer as timer 13 | 14 | from rfpimp import * 15 | 16 | df = pd.read_feather("/Users/parrt/github/mlbook-private/data/bulldozer-train-num.feather") 17 | 18 | X_train, y_train = df.drop('SalePrice', axis=1), df['SalePrice'] 19 | 20 | rf = RandomForestRegressor(n_estimators=50, 21 | n_jobs=-1, 22 | oob_score=True, 23 | max_features=.4) 24 | 25 | start = timer() # ------------ 26 | 27 | D = oob_dependences(rf, X_train, 2000) # like 10 seconds 28 | DM = feature_dependence_matrix(X_train, rf, 2000) # like 15 minutes 29 | 30 | end = timer() # ------------ 31 | print(f"{end - start:.2f}s") 32 | 33 | print(D) 34 | print(DM) 35 | -------------------------------------------------------------------------------- /src/run_perm_imp.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | import seaborn as sns 5 | from sklearn.ensemble import RandomForestRegressor 6 | from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_error 7 | from pandas.api.types import is_string_dtype, is_numeric_dtype, is_object_dtype, is_categorical_dtype 8 | from sklearn.model_selection import cross_val_score 9 | from sklearn.model_selection import 
train_test_split 10 | from sklearn.base import clone 11 | import cProfile, pstats, io 12 | from pstats import SortKey 13 | 14 | from timeit import default_timer as timer 15 | 16 | from rfpimp import * 17 | 18 | df = pd.read_feather("/Users/parrt/github/mlbook-private/data/bulldozer-train-num.feather") 19 | 20 | rf = RandomForestRegressor(n_estimators=50, 21 | n_jobs=-1, 22 | oob_score=True, 23 | max_features=.4) 24 | X_train, y_train = df.drop('SalePrice', axis=1), df['SalePrice'] 25 | 26 | print("Data loaded") 27 | 28 | rf.fit(X_train, y_train) 29 | 30 | print("Model fit") 31 | 32 | start = timer() # ------------ 33 | 34 | #I = oob_importances(rf, X_train, y_train, n_samples=3000) 35 | profiler = cProfile.Profile() 36 | profiler.enable() 37 | I = importances(rf, X_train, y_train, n_samples=3000) 38 | profiler.disable() 39 | 40 | end = timer() # ------------ 41 | print(f"{end - start:.2f}s") 42 | 43 | s = io.StringIO() 44 | sortby = SortKey.TIME 45 | ps = pstats.Stats(profiler, stream=s).sort_stats(sortby) 46 | ps.print_stats() 47 | print(s.getvalue()) 48 | 49 | viz = plot_importances(I) 50 | viz.view() 51 | -------------------------------------------------------------------------------- /src/run_perm_imp_cancer.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | import seaborn as sns 5 | from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier 6 | from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_error 7 | from pandas.api.types import is_string_dtype, is_numeric_dtype, is_object_dtype, is_categorical_dtype 8 | from sklearn.model_selection import cross_val_score 9 | from sklearn.model_selection import train_test_split 10 | from sklearn.base import clone 11 | 12 | from timeit import default_timer as timer 13 | 14 | from rfpimp import * 15 | 16 | df = pd.read_csv("/Users/parrt/github/qiforest/data/cancer.csv") 17 | N = len(df)-20 18 | target='diagnosis' 19 | anomaly = df[df[target] == 1] 20 | normal = df[df[target] == 0] 21 | df = pd.concat([anomaly[0:20], normal[0:N]]) 22 | 23 | X, y = df.drop('diagnosis', axis=1), df['diagnosis'] 24 | 25 | weights = 1 / (np.bincount(y) / len(X)) 26 | rf = RandomForestClassifier(n_estimators=50, 27 | n_jobs=-1, 28 | oob_score=True, 29 | class_weight={0: weights[0], 1: weights[1]}, 30 | max_features=.4) 31 | rf.fit(X, y) 32 | start = timer() # ------------ 33 | 34 | jeremy_trick_RF_sample_size(100) 35 | I = oob_importances(rf, X, y, n_samples=3000) 36 | print(I) 37 | jeremy_trick_reset_RF_sample_size() 38 | 39 | # sample_weights = df.loc[df.target==0, ] 40 | I = importances(rf, X, y, features=X.columns, n_samples=3000) 41 | print(I) 42 | end = timer() # ------------ 43 | print(f"{end - start:.2f}s") 44 | 45 | viz = plot_importances(I) 46 | viz.view() 47 | -------------------------------------------------------------------------------- /src/run_rent_imp.py: -------------------------------------------------------------------------------- 1 | from rfpimp import * 2 | import pandas as pd 3 | from sklearn.ensemble import RandomForestRegressor 4 | from sklearn.model_selection import train_test_split 5 | 6 | df_orig = pd.read_csv("/Users/parrt/github/random-forest-importances/notebooks/data/rent.csv") 7 | 8 | df = df_orig.copy() 9 | 10 | # attentuate affect of outliers in price 11 | df['price'] = np.log(df['price']) 12 | 13 | df_train, df_test = train_test_split(df, test_size=0.20) 14 | 15 | 
features = ['bathrooms','bedrooms','longitude','latitude', 16 | 'price'] 17 | df_train = df_train[features] 18 | df_test = df_test[features] 19 | 20 | X_train, y_train = df_train.drop('price',axis=1), df_train['price'] 21 | X_test, y_test = df_test.drop('price',axis=1), df_test['price'] 22 | X_train['random'] = np.random.random(size=len(X_train)) 23 | X_test['random'] = np.random.random(size=len(X_test)) 24 | 25 | rf = RandomForestRegressor(n_estimators=100, n_jobs=-1) 26 | rf.fit(X_train, y_train) 27 | 28 | imp = importances(rf, X_test, y_test) # permutation 29 | viz = plot_importances(imp) 30 | viz.view() 31 | 32 | 33 | df_train, df_test = train_test_split(df_orig, test_size=0.20) 34 | features = ['bathrooms','bedrooms','price','longitude','latitude', 35 | 'interest_level'] 36 | df_train = df_train[features] 37 | df_test = df_test[features] 38 | 39 | X_train, y_train = df_train.drop('interest_level',axis=1), df_train['interest_level'] 40 | X_test, y_test = df_test.drop('interest_level',axis=1), df_test['interest_level'] 41 | # Add column of random numbers 42 | X_train['random'] = np.random.random(size=len(X_train)) 43 | X_test['random'] = np.random.random(size=len(X_test)) 44 | 45 | rf = RandomForestClassifier(n_estimators=100, 46 | min_samples_leaf=5, 47 | n_jobs=-1, 48 | oob_score=True) 49 | rf.fit(X_train, y_train) 50 | 51 | imp = importances(rf, X_test, y_test, n_samples=-1) 52 | viz = plot_importances(imp) 53 | viz.view() 54 | 55 | -------------------------------------------------------------------------------- /src/setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description_file = README.md 3 | -------------------------------------------------------------------------------- /src/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | long_description = """A library that provides feature importances, based upon 4 | the permutation importance strategy, for general scikit-learn 5 | models and implementations specifically for random forest out-of-bag scores. 6 | Built by Terence Parr and Kerem Turgutlu. 7 | See Beware Default 8 | Random Forest Importances for a deeper discussion of the issues surrounding 9 | feature importances in random forests. 10 | """ 11 | 12 | setup( 13 | name='rfpimp', 14 | version='1.3.7', 15 | url='https://github.com/parrt/random-forest-importances', 16 | license='MIT', 17 | py_modules=['rfpimp'], 18 | python_requires='>=3.6', 19 | author='Terence Parr, Kerem Turgutlu', 20 | author_email='parrt@antlr.org, kcturgutlu@dons.usfca.edu', 21 | install_requires=['numpy','pandas','scikit-learn','matplotlib'], 22 | description='Permutation and drop-column importance for scikit-learn random forests and other models', 23 | long_description=long_description, 24 | long_description_content_type="text/markdown", 25 | keywords='scikit-learn random forest feature permutation importances', 26 | classifiers=['License :: OSI Approved :: MIT License', 27 | 'Intended Audience :: Developers'] 28 | ) 29 | --------------------------------------------------------------------------------