├── .gitignore
├── LICENSE
├── README.md
├── article
│   ├── css
│   │   └── book.css
│   ├── images
│   │   ├── R_class_acc_random.svg
│   │   ├── R_class_dorp_random.svg
│   │   ├── R_class_drop_random.svg
│   │   ├── R_class_gini_random.svg
│   │   ├── R_regr_MSE_random.svg
│   │   ├── R_regr_RSS_random.svg
│   │   ├── R_regr_drop_random.svg
│   │   ├── cancer-dependencies.png
│   │   ├── cancer_corr.svg
│   │   ├── cancer_dep.svg
│   │   ├── cancer_dep_less4.svg
│   │   ├── cancer_dflt_imp.svg
│   │   ├── cancer_dropcol_imp.svg
│   │   ├── cancer_imp.svg
│   │   ├── cls_dflt.svg
│   │   ├── cls_dflt_R.svg
│   │   ├── cls_dflt_random.pdf
│   │   ├── cls_dflt_random.svg
│   │   ├── cls_dflt_random_R.pdf
│   │   ├── cls_dflt_random_R.svg
│   │   ├── cls_dflt_random_R_annotated.png
│   │   ├── cls_dflt_random_annotated.png
│   │   ├── cls_drop_R.svg
│   │   ├── cls_drop_random_R.svg
│   │   ├── cls_dropcol.svg
│   │   ├── cls_dropcol_random.svg
│   │   ├── cls_permute.svg
│   │   ├── cls_permute_R.svg
│   │   ├── cls_permute_random.svg
│   │   ├── cls_permute_random_R.pdf
│   │   ├── cls_permute_random_R.svg
│   │   ├── collinear_dflt.svg
│   │   ├── collinear_dflt_longitude_dup.svg
│   │   ├── collinear_dflt_longitude_noise_0.0005.svg
│   │   ├── collinear_dflt_longitude_noise_0.0010.svg
│   │   ├── collinear_dflt_longitude_noise_3.0000.svg
│   │   ├── collinear_dropcol.svg
│   │   ├── collinear_dropcol_bathrooms_dup.svg
│   │   ├── collinear_dropcol_longitude_dup.svg
│   │   ├── collinear_permute.svg
│   │   ├── collinear_permute_bedrooms_noise_1.0000.svg
│   │   ├── collinear_permute_bedrooms_noise_2.0000.svg
│   │   ├── collinear_permute_bedrooms_noise_3.0000.svg
│   │   ├── collinear_permute_longitude_dup.svg
│   │   ├── collinear_permute_longitude_noise_0.0005.svg
│   │   ├── collinear_permute_longitude_noise_0.0010.svg
│   │   ├── corr_matrix.png
│   │   ├── corrheatmap.png
│   │   ├── corrheatmap.svg
│   │   ├── dependencies.png
│   │   ├── diagrams.graffle
│   │   ├── grouped_dup_imp.svg
│   │   ├── grouped_imp.svg
│   │   ├── imp.svg
│   │   ├── latlong_imp.svg
│   │   ├── regr_dflt.pdf
│   │   ├── regr_dflt.svg
│   │   ├── regr_dflt_R.pdf
│   │   ├── regr_dflt_R.svg
│   │   ├── regr_dflt_random.pdf
│   │   ├── regr_dflt_random.svg
│   │   ├── regr_dflt_random_R.pdf
│   │   ├── regr_dflt_random_R.svg
│   │   ├── regr_dflt_random_R_annotated.png
│   │   ├── regr_dflt_random_annotated.png
│   │   ├── regr_drop_R.svg
│   │   ├── regr_drop_random_R.svg
│   │   ├── regr_dropcol.svg
│   │   ├── regr_dropcol_random.svg
│   │   ├── regr_permute.svg
│   │   ├── regr_permute_R.svg
│   │   ├── regr_permute_random.svg
│   │   ├── regr_permute_random_R.svg
│   │   ├── rent-pimp-sample-size.svg
│   │   ├── subset_imp.svg
│   │   └── test-dataset-pimp-sample-size.svg
│   └── index.html
├── notebooks
│   ├── .ipynb_checkpoints
│   │   └── R_to_Python-checkpoint.ipynb
│   ├── R_to_Python.ipynb
│   ├── collinear.ipynb
│   ├── data
│   │   ├── imp_R_class_acc.csv
│   │   ├── imp_R_class_drop.csv
│   │   ├── imp_R_class_gini.csv
│   │   ├── imp_R_regr_MSE.csv
│   │   ├── imp_R_regr_RSS.csv
│   │   ├── imp_R_regr_drop.csv
│   │   ├── rent-cls.csv
│   │   └── rent.csv
│   ├── imp_R_regr_MSE.csv
│   ├── imp_R_regr_RSS.csv
│   ├── permutation-importances-classifier.Rmd
│   ├── permutation-importances-classifier.html
│   ├── permutation-importances-classifier.ipynb
│   ├── permutation-importances-regressor.Rmd
│   ├── permutation-importances-regressor.html
│   ├── permutation-importances-regressor.ipynb
│   ├── pimp.ipynb
│   ├── pimp_plots.ipynb
│   └── rfpimp-collinear.ipynb
└── src
    ├── LICENSE
    ├── play_plot.py
    ├── rfpimp.py
    ├── run_feat_dep.py
    ├── run_perm_imp.py
    ├── run_perm_imp_cancer.py
    ├── run_rent_imp.py
    ├── setup.cfg
    └── setup.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.egg-info/
2 | __pycache__/
3 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Terence Parr
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Feature importances for scikit-learn machine learning models
2 |
3 | By Terence Parr and Kerem Turgutlu. See [Explained.ai](http://explained.ai) for more stuff.
4 |
5 | Scikit-learn's Random Forest feature importance strategy is the mean-decrease-in-impurity (or Gini importance) mechanism, which is unreliable.
6 | To get reliable results, use permutation importance, provided in the `rfpimp` package in the `src` dir. Install with:
7 |
8 | `pip install rfpimp`
9 |
10 | We include permutation and drop-column importance measures that work with any sklearn model. Yes, `rfpimp` is an increasingly-ill-suited name, but we still like it.
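
For contrast, here is a minimal sketch of drop-column importance: retrain the model without each column and measure the change in validation score. The function name is ours for illustration; `rfpimp` ships the full implementation.

```python
from sklearn.base import clone

def dropcol_importance_sketch(model, X_train, y_train, X_valid, y_valid):
    # Baseline: model trained on all columns, scored on validation data
    model_ = clone(model)
    model_.fit(X_train, y_train)
    baseline = model_.score(X_valid, y_valid)
    imp = {}
    for col in X_train.columns:
        # Retrain from scratch without this column; importance is the score drop
        model_ = clone(model)
        model_.fit(X_train.drop(col, axis=1), y_train)
        imp[col] = baseline - model_.score(X_valid.drop(col, axis=1), y_valid)
    return imp
```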
11 |
12 | ## Description
13 |
14 | See *Beware Default Random Forest Importances* for a deeper discussion of the issues surrounding feature importances in random forests (authored by Terence Parr, Kerem Turgutlu, Christopher Csiszar, and Jeremy Howard).
15 |
16 | The mean-decrease-in-impurity importance of a feature is computed by measuring how effective the feature is at reducing uncertainty (classifiers) or variance (regressors) when creating decision trees within random forests. The problem is that this mechanism, while fast, does not always give an accurate picture of importance. Strobl et al. pointed out in *Bias in random forest variable importance measures: Illustrations, sources and a solution* that “the variable importance measures of Breiman's original random forest method ... are not reliable in situations where potential predictor variables vary in their scale of measurement or their number of categories.”
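
For reference, scikit-learn exposes this mean-decrease-in-impurity measure through a fitted model's `feature_importances_` attribute. A quick look, using the breast cancer dataset that also appears elsewhere in this repo:

```python
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier

cancer = load_breast_cancer()
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
rf.fit(cancer.data, cancer.target)

# The (biased) impurity-based importances, normalized to sum to 1
mdi = pd.Series(rf.feature_importances_, index=cancer.feature_names)
print(mdi.sort_values(ascending=False).head(5))
```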
17 |
18 | A more reliable method is permutation importance, which measures the importance of a feature as follows. Record a baseline accuracy (classifier) or R^2 score (regressor) by passing a validation set or the out-of-bag (OOB) samples through the random forest. Permute the column values of a single predictor feature, pass all test samples back through the random forest, and recompute the accuracy or R^2. The importance of that feature is the drop from the baseline caused by permuting the column. The permutation mechanism is much more computationally expensive than the mean-decrease-in-impurity mechanism, but the results are more reliable.
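
The mechanism is simple enough to sketch directly. Below is a minimal, model-agnostic version of the algorithm just described; the `importances` function in `rfpimp` is the full-featured implementation:

```python
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score

def permutation_importances_sketch(model, X_valid, y_valid, metric=r2_score):
    # Baseline score on the untouched validation set
    baseline = metric(y_valid, model.predict(X_valid))
    imp = {}
    for col in X_valid.columns:
        saved = X_valid[col].copy()
        # Shuffle one column, rescore, then restore it
        X_valid[col] = np.random.permutation(X_valid[col].values)
        imp[col] = baseline - metric(y_valid, model.predict(X_valid))
        X_valid[col] = saved
    return pd.Series(imp).sort_values(ascending=False)
```

For a classifier, pass `sklearn.metrics.accuracy_score` as the metric instead of `r2_score`.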
19 |
20 | ## Sample code
21 |
22 | See the [notebooks directory](https://github.com/parrt/random-forest-importances/blob/master/notebooks) for things like [Collinear features](https://github.com/parrt/random-forest-importances/blob/master/notebooks/collinear.ipynb) and [Plotting feature importances](https://github.com/parrt/random-forest-importances/blob/master/notebooks/pimp_plots.ipynb).
23 |
24 | Here's some sample Python code that uses the `rfpimp` package contained in the `src` directory. The data can be found in `notebooks/data/rent.csv`, a subset of the data from Kaggle's *Two Sigma Connect: Rental Listing Inquiries* competition.
25 |
26 |
27 | ```python
28 | from rfpimp import *
29 | import numpy as np
30 | import pandas as pd
31 | from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
32 | from sklearn.model_selection import train_test_split
32 |
33 | df_orig = pd.read_csv("notebooks/data/rent.csv")  # path relative to repo root
34 |
35 | df = df_orig.copy()
36 |
37 | # attenuate effect of outliers in price
38 | df['price'] = np.log(df['price'])
39 |
40 | df_train, df_test = train_test_split(df, test_size=0.20)
41 |
42 | features = ['bathrooms','bedrooms','longitude','latitude',
43 | 'price']
44 | df_train = df_train[features]
45 | df_test = df_test[features]
46 |
47 | X_train, y_train = df_train.drop('price',axis=1), df_train['price']
48 | X_test, y_test = df_test.drop('price',axis=1), df_test['price']
49 | X_train['random'] = np.random.random(size=len(X_train))
50 | X_test['random'] = np.random.random(size=len(X_test))
51 |
52 | rf = RandomForestRegressor(n_estimators=100, n_jobs=-1)
53 | rf.fit(X_train, y_train)
54 |
55 | imp = importances(rf, X_test, y_test) # permutation
56 | viz = plot_importances(imp)
57 | viz.view()
58 |
59 |
60 | df_train, df_test = train_test_split(df_orig, test_size=0.20)
61 | features = ['bathrooms','bedrooms','price','longitude','latitude',
62 | 'interest_level']
63 | df_train = df_train[features]
64 | df_test = df_test[features]
65 |
66 | X_train, y_train = df_train.drop('interest_level',axis=1), df_train['interest_level']
67 | X_test, y_test = df_test.drop('interest_level',axis=1), df_test['interest_level']
68 | # Add column of random numbers
69 | X_train['random'] = np.random.random(size=len(X_train))
70 | X_test['random'] = np.random.random(size=len(X_test))
71 |
72 | rf = RandomForestClassifier(n_estimators=100,
73 | min_samples_leaf=5,
74 | n_jobs=-1,
75 | oob_score=True)
76 | rf.fit(X_train, y_train)
77 |
78 | imp = importances(rf, X_test, y_test, n_samples=-1)
79 | viz = plot_importances(imp)
80 | viz.view()
81 | ```
82 | ### Feature correlation
83 |
84 | See [Feature collinearity heatmap](notebooks/rfpimp-collinear.ipynb). We can get Spearman's correlation matrix:
85 |
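As a sketch (the linked notebook shows a nicer rendering), the matrix itself comes straight from pandas, and seaborn can plot it:

```python
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.read_csv("notebooks/data/rent.csv")
features = ['bathrooms', 'bedrooms', 'longitude', 'latitude', 'price']
corr = df[features].corr(method='spearman')  # rank correlation, not just linear
sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", vmin=-1, vmax=1)
plt.show()
```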
86 |
87 |
88 | ### Feature dependencies
89 |
90 | The features we use in machine learning are rarely completely independent, which makes interpreting feature importance tricky. We could compute correlation coefficients, but they identify only linear relationships. A way to at least identify whether a feature x is dependent on other features is to train a model using x as the dependent variable and all other features as independent variables. Because random forests give us an easy out-of-bag (OOB) error estimate, the feature dependence functions rely on random forest models. The OOB R^2 score from that model indicates how easy it is to predict feature x using the other features: the higher the score, the more dependent feature x is.
91 |
92 | You can also get a feature dependence matrix / heatmap. The return value is a non-symmetric data frame in which each row gives, for one feature used as the model target, the importance of every other feature in predicting it. Example:
93 |
94 |
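A minimal sketch using the package's helpers, mirroring `src/play_plot.py` in this repo (argument details may vary across `rfpimp` versions):

```python
import pandas as pd
from rfpimp import feature_dependence_matrix, plot_dependence_heatmap
from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()
df = pd.DataFrame(cancer.data, columns=cancer.feature_names)

D = feature_dependence_matrix(df, n_samples=5000)  # fits one RF per feature as target
viz = plot_dependence_heatmap(D, figsize=(12, 12))
viz.view()
```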
95 |
--------------------------------------------------------------------------------
/article/css/book.css:
--------------------------------------------------------------------------------
1 | /*
2 | .fig-container {
3 | display: flex;
4 | overflow-x: scroll;
5 | flex: auto;
6 | width: 100%;
7 | flex-wrap: wrap;
8 | flex-flow: row wrap;
9 | }
10 | .fig-container > div {
11 | flex: 1;
12 | }
13 | */
14 | #toc {
15 | background: #f9f9f9 none repeat scroll 0 0;
16 | border: 1px solid #aaa;
17 | display: table;
18 | font-size: 83%;
19 | padding: 5px;
20 | width: auto;
21 | }
22 | .toc_title {
23 | font-weight: 700;
24 | text-align: center;
25 | }
26 | #toc li, #toc ul, #toc ul li{
27 | list-style: outside none none !important;
28 | }
29 | #toc a {
30 | color: #3A4145;
31 | text-decoration: none;
32 | }
33 | #toc a:hover {
34 | text-decoration: underline;
35 | }
36 | #author {
37 | font-size: 110%;
38 | }
39 | body {
40 | font-family: "Merriweather", serif;
41 | font-size: 100%;
42 | letter-spacing: 0.01rem;
43 | line-height: 1.5em;
44 | color: #3A4145;
45 | background: #FFFFFF;
46 | max-width: 800px;
47 | margin-left: 2%;
48 | -webkit-text-size-adjust: 100%;
49 | }
50 | p {
51 | margin: 15px 5px 15px 0px;
52 | }
53 | .watermark {
54 | font-size: 100%;
55 | right: 10px;
56 | opacity: 0.4;
57 | color: BLACK;
58 | position: absolute;
59 | top: 30px;
60 | }
61 | .caption {
62 | font-size: 80%;
63 | }
64 | @media (max-width: 415px) {
65 | /* iphone 6+ 414x628 browser window, iphone 6 is 375 x 559 in browser */
66 | body {
67 | font-size: 100%;
68 | }
69 | .watermark {
70 | position: static;
71 | }
72 | }
73 | @media (max-width: 736px) {
74 | /* iphone7+ 736 wide */
75 | body {
76 | font-size: 100%;
77 | }
78 | .watermark {
79 | position: static;
80 | }
81 | }
82 | h1 {
83 | line-height: 1.2em;
84 | }
85 | .inlinecode {
86 | font-family: "Monaco", monospace;
87 | font-size: 80%;
88 | }
89 | .codeblk {
90 | font-family: "Monaco", monospace;
91 | font-size: 80%;
92 | box-sizing: border-box;
93 | margin: 0 0 1em 0;
94 | border: #E3EDF3 1px solid;
95 | width: 100%;
96 | padding: 2px;
97 | background: #F7FAFB;
98 | border-radius: 2px;
99 | line-height: 1.3em;
100 | white-space: pre-wrap;
101 | }
102 | table {
103 | border-spacing: 15px 0px
104 | }
105 | table th {
106 | line-height: 1em;
107 | }
108 | .scrollbar_wrapper {
109 | overflow: auto;
110 | }
111 | table.figure {
112 | border: 0;
113 | width: 100%;
114 | }
115 | table.figure td {
116 | width: 50%;
117 | vertical-align: top;
118 | line-height: 1.1em;
119 | font-size: 80%;
120 | }
121 |
122 | table.dataframe {
123 | max-width: none;
124 | padding: 0.5em 0.5em;
125 | border: none;
126 | display: table;
127 | border-spacing: 0px 0px;
128 | border-collapse:separate;
129 | }
130 | table.dataframe td {
131 | font-family: "Helvetica", sans-serif;
132 | font-size: 80%;
133 | text-align: right;
134 | text-size-adjust: 100%;
135 | vertical-align: top;
136 | padding: 2px 5px;
137 | border-bottom: 0px;
138 | line-height: 1.1em;
139 | }
140 | table.dataframe th {
141 | font-family: "Helvetica", sans-serif;
142 | font-size: 80%;
143 | text-align: right;
144 | line-height: 1em;
145 | text-size-adjust: 100%;
146 | vertical-align: bottom;
147 | padding: 0px 5px;
148 | }
149 |
--------------------------------------------------------------------------------
/article/images/cancer-dependencies.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/cancer-dependencies.png
--------------------------------------------------------------------------------
/article/images/cls_dflt_R.svg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/cls_dflt_R.svg
--------------------------------------------------------------------------------
/article/images/cls_dflt_random.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/cls_dflt_random.pdf
--------------------------------------------------------------------------------
/article/images/cls_dflt_random_R.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/cls_dflt_random_R.pdf
--------------------------------------------------------------------------------
/article/images/cls_dflt_random_R_annotated.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/cls_dflt_random_R_annotated.png
--------------------------------------------------------------------------------
/article/images/cls_dflt_random_annotated.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/cls_dflt_random_annotated.png
--------------------------------------------------------------------------------
/article/images/cls_drop_R.svg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/cls_drop_R.svg
--------------------------------------------------------------------------------
/article/images/cls_permute_R.svg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/cls_permute_R.svg
--------------------------------------------------------------------------------
/article/images/cls_permute_random_R.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/cls_permute_random_R.pdf
--------------------------------------------------------------------------------
/article/images/collinear_dropcol.svg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/collinear_dropcol.svg
--------------------------------------------------------------------------------
/article/images/collinear_permute.svg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/collinear_permute.svg
--------------------------------------------------------------------------------
/article/images/collinear_permute_bedrooms_noise_3.0000.svg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/collinear_permute_bedrooms_noise_3.0000.svg
--------------------------------------------------------------------------------
/article/images/corr_matrix.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/corr_matrix.png
--------------------------------------------------------------------------------
/article/images/corrheatmap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/corrheatmap.png
--------------------------------------------------------------------------------
/article/images/dependencies.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/dependencies.png
--------------------------------------------------------------------------------
/article/images/diagrams.graffle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/diagrams.graffle
--------------------------------------------------------------------------------
/article/images/regr_dflt.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/regr_dflt.pdf
--------------------------------------------------------------------------------
/article/images/regr_dflt_R.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/regr_dflt_R.pdf
--------------------------------------------------------------------------------
/article/images/regr_dflt_R.svg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/regr_dflt_R.svg
--------------------------------------------------------------------------------
/article/images/regr_dflt_random.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/regr_dflt_random.pdf
--------------------------------------------------------------------------------
/article/images/regr_dflt_random_R.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/regr_dflt_random_R.pdf
--------------------------------------------------------------------------------
/article/images/regr_dflt_random_R_annotated.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/regr_dflt_random_R_annotated.png
--------------------------------------------------------------------------------
/article/images/regr_dflt_random_annotated.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/regr_dflt_random_annotated.png
--------------------------------------------------------------------------------
/article/images/regr_drop_R.svg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/regr_drop_R.svg
--------------------------------------------------------------------------------
/article/images/regr_dropcol_random.svg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/regr_dropcol_random.svg
--------------------------------------------------------------------------------
/article/images/regr_permute.svg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/regr_permute.svg
--------------------------------------------------------------------------------
/article/images/regr_permute_R.svg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/regr_permute_R.svg
--------------------------------------------------------------------------------
/article/images/regr_permute_random.svg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parrt/random-forest-importances/92ef2de1cd47ce5fd8e654a01f7876fa246b8665/article/images/regr_permute_random.svg
--------------------------------------------------------------------------------
/notebooks/data/imp_R_class_acc.csv:
--------------------------------------------------------------------------------
1 | "","MeanDecreaseAccuracy"
2 | "bathrooms",12.2722647086318
3 | "bedrooms",74.0723470287167
4 | "price",70.8236306582764
5 | "longitude",40.752136232213
6 | "latitude",47.9714935062742
7 | "random",2.45974994364518
8 |
--------------------------------------------------------------------------------
/notebooks/data/imp_R_class_drop.csv:
--------------------------------------------------------------------------------
1 | "","Feature","Importance"
2 | "1","bathrooms",-0.00620954989555456
3 | "2","bedrooms",0.0242944066060036
4 | "3","longitude",0.0201834528683037
5 | "4","latitude",0.0208219205776745
6 | "5","random",-0.0149920610836133
7 | "6","price",0.0542021893740692
8 |
--------------------------------------------------------------------------------
/notebooks/data/imp_R_class_gini.csv:
--------------------------------------------------------------------------------
1 | "","MeanDecreaseGini"
2 | "bathrooms",342.994453966487
3 | "bedrooms",957.076393066943
4 | "price",4676.08772137443
5 | "longitude",3803.1220992054
6 | "latitude",3845.96138345266
7 | "random",3945.05069793106
8 |
--------------------------------------------------------------------------------
/notebooks/data/imp_R_regr_MSE.csv:
--------------------------------------------------------------------------------
1 | "","X.IncMSE"
2 | "bathrooms",0.0893860247102489
3 | "bedrooms",0.105265104336058
4 | "longitude",0.140712080833606
5 | "latitude",0.12343161145159
6 | "random",-5.68879621046926e-05
7 |
--------------------------------------------------------------------------------
/notebooks/data/imp_R_regr_RSS.csv:
--------------------------------------------------------------------------------
1 | "","IncNodePurity"
2 | "bathrooms",3652.79853374327
3 | "bedrooms",1357.02440889748
4 | "longitude",2155.35387929004
5 | "latitude",1528.75792511292
6 | "random",365.143595904819
7 |
--------------------------------------------------------------------------------
/notebooks/data/imp_R_regr_drop.csv:
--------------------------------------------------------------------------------
1 | "","Feature","Importance"
2 | "1","bathrooms",0.0423248371964218
3 | "2","bedrooms",0.0913196641210466
4 | "3","longitude",0.130007078476311
5 | "4","latitude",0.106649812499106
6 | "5","random",-0.0226813443094983
7 |
--------------------------------------------------------------------------------
/notebooks/imp_R_regr_MSE.csv:
--------------------------------------------------------------------------------
1 | "","X.IncMSE"
2 | "bathrooms",0.0867924590431635
3 | "bedrooms",0.105465708124454
4 | "longitude",0.130956279570924
5 | "latitude",0.113012571751182
6 | "random",0.000373724167032719
7 |
--------------------------------------------------------------------------------
/notebooks/imp_R_regr_RSS.csv:
--------------------------------------------------------------------------------
1 | "","IncNodePurity"
2 | "bathrooms",3564.40533324239
3 | "bedrooms",1426.25307431068
4 | "longitude",2164.30801062486
5 | "latitude",1539.71789928082
6 | "random",375.64002923271
7 |
--------------------------------------------------------------------------------
/notebooks/permutation-importances-classifier.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Terence Parr Feat Imp"
3 | author: "Christopher Csiszar"
4 | date: "3/22/2018"
5 | output: html_document
6 | ---
7 |
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE, warning = F, message = F)
10 | ```
11 |
12 | ## Biases in RF importance calculations
13 |
14 | A simple RF classification done in R, to see how biased its feature importance metrics are.
15 |
16 | ```{r cars}
17 | # RF Classification - feature importances
18 |
19 | library(tidyverse)
20 | library(randomForest)
21 | library(cowplot)
22 | library(gridExtra)
23 |
24 | setwd("~/Downloads/")
25 | rent <- read.csv('rent.csv')
26 | class(rent$interest_level)
27 | #interest.map <- c("low"=1, "medium"=2, "high"=3)
28 |
29 | #ent$interest_level <- interest.map[as.character(rent$interest_level)]
30 |
31 | summary(rent)
32 |
33 | ## plotting functions ##
34 |
35 | create_rfplot <- function(rf, type){
36 | imp <- importance(rf, type=type, scale = F)
37 | featureImportance <- data.frame(Feature=row.names(imp), Importance=imp[,1])
38 |
39 | p <- ggplot(featureImportance, aes(x=reorder(Feature, Importance), y=Importance)) +
40 | geom_bar(stat="identity", fill="#53cfff", width = 0.65) +
41 | coord_flip() +
42 | theme_light(base_size=20) +
43 | theme(axis.title.x=element_blank(),
44 | axis.title.y=element_blank(),
45 | axis.text.x = element_text(size = 15, color = "black"),
46 | axis.text.y = element_text(size = 15, color = "black"))
47 | return(p)
48 | }
49 |
50 | create_ggplot <- function(featureImportance){
51 | p <- ggplot(featureImportance, aes(x=reorder(Feature, Importance), y=Importance)) +
52 | geom_bar(stat="identity", fill="#53cfff", width = 0.65) +
53 | coord_flip() +
54 | theme_light(base_size=20) +
55 | theme(axis.title.x=element_blank(),
56 | axis.title.y=element_blank(),
57 | axis.text.x = element_text(size = 15, color = "black"),
58 | axis.text.y = element_text(size = 15, color = "black"))
59 | return(p)
60 | }
61 | ```
62 |
63 | ```{r}
64 | rent$interest_level <- as.factor(rent$interest_level)
65 | head(rent)
66 | ```
67 |
68 |
69 | ## Type = 1, mean decrease in Accuracy
70 |
71 | ```{r cars3}
72 | ####### no random column #########
73 | set.seed(1)
74 | rent$random <- sample(100, size = nrow(rent), replace = TRUE)
75 |
76 | #Fit Random Forest Model
77 | rf1 = randomForest(interest_level ~ .,
78 | ntree = 40,
79 | data = rent[, 1:6],
80 | nodesize = 1, importance = TRUE)
81 | #print(rf)
82 |
83 | importance(rf1, type = 1)
84 | #round(importance(rf), 2)
85 |
86 | # Variable Importance
87 | k = varImpPlot(rf1,
88 | sort = T,
89 | main="Top - Variable Importance")
90 |
91 | #p1 <- create_rfplot(rf1, type = 1)
92 | #ggsave('../article/images/cls_permute_R.svg',
93 | # plot = p1, device = 'svg', height = 4, width = 6)
94 | ######## with random column ########
95 |
96 |
97 | #Fit Random Forest Model
98 | rf2 = randomForest(interest_level ~ .,
99 | ntree = 40,
100 | data = rent,
101 | nodesize = 1, importance = TRUE)
102 | #print(rf)
103 |
104 | #importance(rf2, type = 1)
105 |
106 | imp1 <- data.frame(importance(rf2, type = 2))
107 | write.csv(imp1, file="imp_R_class_gini.csv")
108 | #round(importance(rf), 2)
109 |
110 | imp1 <- data.frame(importance(rf2, type = 1))
111 | write.csv(imp1, file="imp_R_class_acc.csv")
112 |
113 | # Variable Importance
114 | #varImpPlot(rf,
115 | #sort = T,
116 | #main="Top - Variable Importance")
117 |
118 | #p2 <- create_rfplot(rf2, type = 1)
119 | #ggsave('../article/images/cls_permute_random_R.svg',
120 | # plot = p2, device = 'svg', height = 4, width = 6)
121 | ```
122 |
123 | ## Type = 2, mean decrease in Gini
124 |
125 | ```{r cars4}
126 | ####### no random column #########
127 |
128 |
129 | #p1 <- create_rfplot(rf1, type = 2)
130 | #ggsave('../article/images/cls_dflt_R.svg',
131 | #plot = p1, device = 'svg', height = 4, width = 6)
132 |
133 | ######## with random column ########
134 |
135 | #imp1 <- data.frame(importance(rf2, type = 2))
136 | #write.csv(imp1, file="imp_R_class_gini.csv")
137 |
138 | #p2 <- create_rfplot(rf2, type = 2)
139 | #ggsave('../article/images/cls_dflt_random_R.svg',
140 | #plot = p2, device = 'svg', height = 4, width = 6)
141 |
142 | ```
143 |
144 | ## Cost by dropping column analysis
145 |
146 | ```{r cars5, eval=FALSE}
147 | ####### no random column #########
148 | get_drop_imp <- function(rent, columns){
149 | X <- rent[,c(columns, 'interest_level')] # data
150 | rf <- randomForest(interest_level~., data = X,
151 | ntree = 40, mtry=2, nodesize=1, importance=T)
152 |   full_rsq <- -1*mean(rf$err.rate) # negated mean OOB error rate (higher is better)
153 |
154 | imp <- c()
155 | for (c in columns){
156 | X_sub <- X[, !(colnames(X) == c)]
157 | rf <- randomForest(interest_level~., data = X_sub,
158 | ntree = 40, mtry=2, nodesize=1, importance=T)
159 |     sub_rsq <- -1*mean(rf$err.rate) # negated mean OOB error rate
160 | diff_rsq <- full_rsq - sub_rsq
161 | imp <- c(imp, diff_rsq)
162 | }
163 | featureImportance <- data.frame(Feature=columns, Importance=imp)
164 | return(featureImportance)
165 | }
166 |
167 | columns <- c('bathrooms', 'bedrooms', 'longitude', 'latitude', 'price')
168 | featureImportance <- get_drop_imp(rent[, 1:6], columns)
169 |
170 | write.csv(featureImportance, file="imp_R_class_drop_norandom.csv") # drop-column importances, no random column
171 | #p1 <- create_ggplot(featureImportance)
172 | #ggsave('../article/images/cls_drop_R.svg',
173 | #plot = p1, device = 'svg', height = 4, width = 6)
174 |
175 | columns <- c('bathrooms', 'bedrooms', 'longitude', 'latitude', 'random', 'price')
176 | featureImportance <- get_drop_imp(rent, columns)
177 |
178 | write.csv(featureImportance, file="imp_R_class_drop.csv")
179 | #p2 <- create_ggplot(featureImportance)
180 | #ggsave('../article/images/cls_drop_random_R.svg',
181 | #plot = p2, device = 'svg', height = 4, width = 6)
182 | ```
183 |
184 | ## Takeaways
185 |
186 | R's RF feature importance comes in several different metrics. The "decrease in accuracy" metric places the `random` column dead last, as expected, while the "decrease in Gini" metric is badly biased toward high-cardinality features, ranking the `random` column second most important.
187 |
188 | Note also that, because of its low cardinality, `bedrooms` ranks as less important under the Gini-decrease metric.
189 |
190 | More on RF feature importance interpretation in R:
191 |
192 | https://cran.r-project.org/web/packages/randomForest/randomForest.pdf
193 |
194 | https://stats.stackexchange.com/questions/197827/how-to-interpret-mean-decrease-in-accuracy-and-mean-decrease-gini-in-random-fore
195 | https://stackoverflow.com/questions/736514/r-random-forests-variable-importance
196 |
--------------------------------------------------------------------------------
/notebooks/permutation-importances-regressor.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "R Notebook"
3 | output: pdf_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE, warning = F, message = F)
8 | ```
9 |
10 | ```{r warning=FALSE, message=FALSE}
11 | library(tidyverse)
12 | library(randomForest)
13 | library(cowplot)
14 | library(gridExtra)
15 | ```
16 |
17 | ```{r}
18 | rents <- read.csv('./data/rent.csv')
19 | glimpse(rents)
20 | ```
21 |
22 | ```{r}
23 | features <- c('bathrooms', 'bedrooms', 'longitude', 'latitude', 'price')
24 | df <- rents[,features]
25 | df$price <- log(df$price)
26 | # with random column
27 | df['random'] <- runif(nrow(df))
28 | head(df)
29 | ```
30 |
31 | ## PLOTTING FUNCTIONS
32 |
33 | ```{r}
34 | create_rfplot <- function(rf, type){
35 | imp <- importance(rf, type=type, scale = F)
36 | featureImportance <- data.frame(Feature=row.names(imp), Importance=imp[,1])
37 |
38 | p <- ggplot(featureImportance, aes(x=reorder(Feature, Importance), y=Importance)) +
39 | geom_bar(stat="identity", fill="#53cfff", width = 0.65) +
40 | coord_flip() +
41 | theme_light(base_size=20) +
42 | theme(axis.title.x=element_blank(),
43 | axis.title.y=element_blank(),
44 | axis.text.x = element_text(size = 15, color = "black"),
45 | axis.text.y = element_text(size = 15, color = "black"))
46 | return(p)
47 | }
48 |
49 | create_ggplot <- function(featureImportance){
50 | p <- ggplot(featureImportance, aes(x=reorder(Feature, Importance), y=Importance)) +
51 | geom_bar(stat="identity", fill="#53cfff", width = 0.65) +
52 | coord_flip() +
53 | theme_light(base_size=20) +
54 | theme(axis.title.x=element_blank(),
55 | axis.title.y=element_blank(),
56 | axis.text.x = element_text(size = 15, color = "black"),
57 | axis.text.y = element_text(size = 15, color = "black"))
58 | return(p)
59 | }
60 | ```
61 |
62 | ## BUILT-IN IMPORTANCE
63 |
64 | **Important note: unscaled (`scale = F`) feature importances are used throughout when assessing the built-in measures.**
65 |
66 | "Here are the definitions of the variable importance measures. The first measure is computed from permuting OOB data: For each tree, the prediction error on the out-of-bag portion of the data is recorded (error rate for classification, MSE for regression). Then the same is done after permuting each predictor variable. The difference between the two are then averaged over all trees, and normalized by the standard deviation of the differences. If the standard deviation of the differences is equal to 0 for a variable, the division is not done (but the average is almost always equal to 0 in that case).
67 |
68 | The second measure is the total decrease in node impurities from splitting on the variable, averaged over all trees. For classification, the node impurity is measured by the Gini index. For regression, it is measured by residual sum of squares."
69 |
70 | From: http://ugrad.stat.ubc.ca/R/library/randomForest/html/importance.html
71 |
72 |
73 | #### TYPE 1 = Mean decrease in MSE by **Permutation**
74 |
75 | ```{r}
76 | # without random column
77 | rf1 <- randomForest(price~., data = df[, 1:5], mtry=4,
78 | ntree = 40, importance=T)
79 | importance(rf1, scale=F)
80 | p1 <- create_rfplot(rf1, type = 1)
81 | #ggsave('../article/images/regr_permute_R.svg',
82 | #plot = p1, device = 'svg', height = 4, width = 6)
83 | ```
84 |
85 | ```{r}
86 | # with random column
87 | rf2 <- randomForest(price~., data = df, mtry = 4,
88 | ntree = 40, importance=T)
89 | importance(rf2, scale=F)
90 | p2 <- create_rfplot(rf2, type = 1)
91 | #ggsave('../article/images/regr_permute_random_R.svg',
92 | #plot = p2, device = 'svg', height = 4, width = 6)
93 | imp1 <- data.frame(importance(rf2, type = 1, scale=F))
94 | write.csv(imp1, file="./data/imp_R_regr_MSE.csv")
95 | ```
96 |
97 | #### TYPE 2 = Mean decrease in node impurity (RSS) by splitting on columns, **Python's default**
98 |
99 | ```{r}
100 | # without random column
101 | rf1 <- randomForest(price~., data = df[, 1:5], mtry=4,
102 | ntree = 40, importance=T)
103 | p1 <- create_rfplot(rf1, type = 2)
104 | #ggsave('../article/images/regr_dflt_R.svg',
105 | #plot = p1, device = 'svg', height = 4, width = 6)
106 |
107 | ```
108 |
109 |
110 | ```{r}
111 | # with random column
112 | rf2 <- randomForest(price~., data = df, mtry = 4,
113 | ntree = 40, importance=T)
114 | p2 <- create_rfplot(rf2, type = 2)
115 | #ggsave('../article/images/regr_dflt_random_R.svg',
116 | #plot = p2, device = 'svg', height = 4, width = 6)
117 | imp1 <- data.frame(importance(rf2, type = 2,scale=F))
118 | write.csv(imp1, file="./data/imp_R_regr_RSS.csv")
119 | ```
120 |
121 |
122 |
123 | ## EXAMINE COST BY DROPPING
124 |
125 |
126 | ```{r, eval=F}
127 | # PARAMS : ntree = 40, mtry = 2, nodesize = 1
128 |
129 | get_drop_imp <- function(df, columns){
130 | X <- df[,c(columns, 'price')] # data
131 | rf <- randomForest(price~., data = X,
132 | ntree = 40, mtry=2, nodesize=1, importance=T)
133 | full_rsq <- mean(rf$rsq) # R-squared
134 |
135 | imp <- c()
136 | for (c in columns){
137 | X_sub <- X[, !(colnames(X) == c)]
138 | rf <- randomForest(price~., data = X_sub,
139 | ntree = 40, mtry=2, nodesize=1, importance=T)
140 | sub_rsq <- mean(rf$rsq) # R-squared
141 | diff_rsq <- full_rsq - sub_rsq
142 | imp <- c(imp, diff_rsq)
143 | }
144 | featureImportance <- data.frame(Feature=columns, Importance=imp)
145 | return(featureImportance)
146 | }
147 | ```
148 |
149 | ```{r, eval=F}
150 | columns <- c('bathrooms', 'bedrooms', 'longitude', 'latitude')
151 | featureImportance <- get_drop_imp(df, columns)
152 | p1 <- create_ggplot(featureImportance)
153 | #ggsave('../article/images/regr_drop_R.svg',
154 | #plot = p1, device = 'svg', height = 4, width = 6)
155 | ```
156 |
157 | ```{r, eval=F}
158 | columns <- c('bathrooms', 'bedrooms', 'longitude', 'latitude', 'random')
159 | featureImportance <- get_drop_imp(df, columns)
160 | p2 <- create_ggplot(featureImportance)
161 | #ggsave('../article/images/regr_drop_random_R.svg',
162 | #plot = p2, device = 'svg', height = 4, width = 6)
163 |
164 | write.csv(featureImportance, file="./data/imp_R_regr_drop.csv")
165 | ```
166 |
167 |
168 |
169 |
--------------------------------------------------------------------------------
/src/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Terence Parr and Kerem Turgutlu
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/src/play_plot.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import matplotlib.pyplot as plt
3 | import numpy as np
4 | import seaborn as sns
5 | from sklearn.ensemble import RandomForestClassifier
6 | from sklearn.datasets import load_breast_cancer
7 | from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score
8 | from sklearn.model_selection import cross_val_score, train_test_split
9 | from sklearn.base import clone
10 | from rfpimp import *
15 |
16 | df_all = pd.read_csv("../notebooks/data/rent-cls.csv")
17 |
18 | num_features = ['bathrooms','bedrooms','latitude','longitude','price']
19 | target = 'interest_level'
20 |
21 | df = df_all[num_features + [target]]
22 |
23 |
24 | def test1():
25 | # compute median per num bedrooms
26 | df_median_price_per_bedrooms = df.groupby(by='bedrooms')['price'].median().reset_index()
27 | beds_to_median = df_median_price_per_bedrooms.to_dict(orient='dict')['price']
28 | df['median_price_per_bedrooms'] = df['bedrooms'].map(beds_to_median)
29 | # compute ratio of price to median price for that num of bedrooms
30 | df['price_to_median_beds'] = df['price'] / df['median_price_per_bedrooms']
31 | # ratio of num bedrooms to price
32 | df["beds_per_price"] = df["bedrooms"] / df["price"]
33 | # total rooms (bed, bath)
34 | df["beds_baths"] = df["bedrooms"]+df["bathrooms"]
35 | del df['median_price_per_bedrooms'] # don't need after computation
36 |
37 | df_train, df_test = train_test_split(df, test_size=0.15)
38 |
39 | X_train, y_train = df_train.drop('interest_level',axis=1), df_train['interest_level']
40 | X_test, y_test = df_test.drop('interest_level',axis=1), df_test['interest_level']
41 |
42 | rf = RandomForestClassifier(n_estimators=50, n_jobs=-1,
43 | max_features=1.0,
44 | min_samples_leaf=10, oob_score=True)
45 | rf.fit(X_train, y_train)
46 |
47 | I = importances(rf, X_test, y_test)
48 | return I
49 |
50 |
51 | def test2():
52 | df_train, df_test = train_test_split(df, test_size=0.15)
53 | X_train, y_train = df_train.drop('interest_level',axis=1), df_train['interest_level']
54 | X_test, y_test = df_test.drop('interest_level',axis=1), df_test['interest_level']
55 | rf = RandomForestClassifier(n_estimators=50, n_jobs=-1,
56 | max_features=1.0,
57 | min_samples_leaf=10, oob_score=True)
58 | rf.fit(X_train, y_train)
59 | I = importances(rf, X_test, y_test, features=['bedrooms','bathrooms',['latitude', 'longitude']])
60 | return I
61 |
62 |
63 | def test3():
64 |
65 | cancer = load_breast_cancer()
66 |
67 | X, y = cancer.data, cancer.target
68 | # show first 5 columns only
69 | # df = pd.DataFrame(X[:, 0:10], columns=cancer.feature_names[0:10])
70 | df = pd.DataFrame(X, columns=cancer.feature_names)
71 | #df['diagnosis'] = cancer.target
72 | X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.3)
73 |
74 | cl = RandomForestClassifier(n_estimators=20)
75 | cl.fit(X_train, y_train)
76 |
77 | I = importances(cl, X_test, y_test)
78 | return I
79 |
80 |
81 | viz = plot_importances(test1())
82 | viz.save(filename='/tmp/t.svg')
83 | I = test2()
84 | viz = plot_importances(I)
85 | # viz.save(filename='/tmp/t2.svg')
86 | viz.view()
87 |
88 | # I = test3()
89 | # viz = plot_importances(I)
90 | # viz.save(filename='/tmp/t3.svg')
91 |
92 | #cancer = load_breast_cancer()
93 | # X, y = cancer.data, cancer.target
94 | # df = pd.DataFrame(X, columns=cancer.feature_names)
95 | #viz = plot_dependence_heatmap(D, figsize=(12, 12))
96 |
97 | # D = feature_dependence_matrix(df, n_samples=5000)
98 | # viz = plot_dependence_heatmap(D, figsize=(4,4))
99 | # viz.view()
100 |
101 | #print(feature_dependence_matrix(df))
--------------------------------------------------------------------------------
/src/run_feat_dep.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import matplotlib.pyplot as plt
3 | import numpy as np
4 | import seaborn as sns
5 | from sklearn.ensemble import RandomForestRegressor
6 | from sklearn.metrics import mean_absolute_error, mean_squared_error
7 | from pandas.api.types import is_string_dtype, is_numeric_dtype, is_object_dtype, is_categorical_dtype
8 | from sklearn.model_selection import cross_val_score
9 | from sklearn.model_selection import train_test_split
10 | from sklearn.base import clone
11 |
12 | from timeit import default_timer as timer
13 |
14 | from rfpimp import *
15 |
16 | df = pd.read_feather("/Users/parrt/github/mlbook-private/data/bulldozer-train-num.feather")
17 |
18 | X_train, y_train = df.drop('SalePrice', axis=1), df['SalePrice']
19 |
20 | rf = RandomForestRegressor(n_estimators=50,
21 | n_jobs=-1,
22 | oob_score=True,
23 | max_features=.4)
24 |
25 | start = timer() # ------------
26 |
27 | D = oob_dependences(rf, X_train, 2000) # like 10 seconds
28 | DM = feature_dependence_matrix(X_train, rf, 2000) # like 15 minutes
29 |
30 | end = timer() # ------------
31 | print(f"{end - start:.2f}s")
32 |
33 | print(D)
34 | print(DM)
35 |
--------------------------------------------------------------------------------
/src/run_perm_imp.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import matplotlib.pyplot as plt
3 | import numpy as np
4 | import seaborn as sns
5 | from sklearn.ensemble import RandomForestRegressor
6 | from sklearn.metrics import mean_absolute_error, mean_squared_error
7 | from pandas.api.types import is_string_dtype, is_numeric_dtype, is_object_dtype, is_categorical_dtype
8 | from sklearn.model_selection import cross_val_score
9 | from sklearn.model_selection import train_test_split
10 | from sklearn.base import clone
11 | import cProfile, pstats, io
12 | from pstats import SortKey
13 |
14 | from timeit import default_timer as timer
15 |
16 | from rfpimp import *
17 |
18 | df = pd.read_feather("/Users/parrt/github/mlbook-private/data/bulldozer-train-num.feather")
19 |
20 | rf = RandomForestRegressor(n_estimators=50,
21 | n_jobs=-1,
22 | oob_score=True,
23 | max_features=.4)
24 | X_train, y_train = df.drop('SalePrice', axis=1), df['SalePrice']
25 |
26 | print("Data loaded")
27 |
28 | rf.fit(X_train, y_train)
29 |
30 | print("Model fit")
31 |
32 | start = timer() # ------------
33 |
34 | #I = oob_importances(rf, X_train, y_train, n_samples=3000)
35 | profiler = cProfile.Profile()
36 | profiler.enable()
37 | I = importances(rf, X_train, y_train, n_samples=3000)
38 | profiler.disable()
39 |
40 | end = timer() # ------------
41 | print(f"{end - start:.2f}s")
42 |
43 | s = io.StringIO()
44 | sortby = SortKey.TIME
45 | ps = pstats.Stats(profiler, stream=s).sort_stats(sortby)
46 | ps.print_stats()
47 | print(s.getvalue())
48 |
49 | viz = plot_importances(I)
50 | viz.view()
51 |
--------------------------------------------------------------------------------
/src/run_perm_imp_cancer.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import matplotlib.pyplot as plt
3 | import numpy as np
4 | import seaborn as sns
5 | from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
6 | from sklearn.metrics import mean_absolute_error, mean_squared_error
7 | from pandas.api.types import is_string_dtype, is_numeric_dtype, is_object_dtype, is_categorical_dtype
8 | from sklearn.model_selection import cross_val_score
9 | from sklearn.model_selection import train_test_split
10 | from sklearn.base import clone
11 |
12 | from timeit import default_timer as timer
13 |
14 | from rfpimp import *
15 |
16 | df = pd.read_csv("/Users/parrt/github/qiforest/data/cancer.csv")
17 | N = len(df)-20
18 | target='diagnosis'
19 | anomaly = df[df[target] == 1]
20 | normal = df[df[target] == 0]
21 | df = pd.concat([anomaly[0:20], normal[0:N]])
22 |
23 | X, y = df.drop('diagnosis', axis=1), df['diagnosis']
24 |
25 | weights = 1 / (np.bincount(y) / len(X))
26 | rf = RandomForestClassifier(n_estimators=50,
27 | n_jobs=-1,
28 | oob_score=True,
29 | class_weight={0: weights[0], 1: weights[1]},
30 | max_features=.4)
31 | rf.fit(X, y)
32 | start = timer() # ------------
33 |
34 | jeremy_trick_RF_sample_size(100)
35 | I = oob_importances(rf, X, y, n_samples=3000)
36 | print(I)
37 | jeremy_trick_reset_RF_sample_size()
38 |
39 | # sample_weights = df.loc[df.target==0, ]
40 | I = importances(rf, X, y, features=X.columns, n_samples=3000)
41 | print(I)
42 | end = timer() # ------------
43 | print(f"{end - start:.2f}s")
44 |
45 | viz = plot_importances(I)
46 | viz.view()
47 |
--------------------------------------------------------------------------------
/src/run_rent_imp.py:
--------------------------------------------------------------------------------
1 | from rfpimp import *
2 | import numpy as np
3 | import pandas as pd
4 | from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
5 | from sklearn.model_selection import train_test_split
5 |
6 | df_orig = pd.read_csv("/Users/parrt/github/random-forest-importances/notebooks/data/rent.csv")
7 |
8 | df = df_orig.copy()
9 |
10 | # attenuate effect of outliers in price
11 | df['price'] = np.log(df['price'])
12 |
13 | df_train, df_test = train_test_split(df, test_size=0.20)
14 |
15 | features = ['bathrooms','bedrooms','longitude','latitude',
16 | 'price']
17 | df_train = df_train[features]
18 | df_test = df_test[features]
19 |
20 | X_train, y_train = df_train.drop('price',axis=1), df_train['price']
21 | X_test, y_test = df_test.drop('price',axis=1), df_test['price']
22 | X_train['random'] = np.random.random(size=len(X_train))
23 | X_test['random'] = np.random.random(size=len(X_test))
24 |
25 | rf = RandomForestRegressor(n_estimators=100, n_jobs=-1)
26 | rf.fit(X_train, y_train)
27 |
28 | imp = importances(rf, X_test, y_test) # permutation
29 | viz = plot_importances(imp)
30 | viz.view()
31 |
32 |
33 | df_train, df_test = train_test_split(df_orig, test_size=0.20)
34 | features = ['bathrooms','bedrooms','price','longitude','latitude',
35 | 'interest_level']
36 | df_train = df_train[features]
37 | df_test = df_test[features]
38 |
39 | X_train, y_train = df_train.drop('interest_level',axis=1), df_train['interest_level']
40 | X_test, y_test = df_test.drop('interest_level',axis=1), df_test['interest_level']
41 | # Add column of random numbers
42 | X_train['random'] = np.random.random(size=len(X_train))
43 | X_test['random'] = np.random.random(size=len(X_test))
44 |
45 | rf = RandomForestClassifier(n_estimators=100,
46 | min_samples_leaf=5,
47 | n_jobs=-1,
48 | oob_score=True)
49 | rf.fit(X_train, y_train)
50 |
51 | imp = importances(rf, X_test, y_test, n_samples=-1)
52 | viz = plot_importances(imp)
53 | viz.view()
54 |
55 |
--------------------------------------------------------------------------------
/src/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description_file = README.md
3 |
--------------------------------------------------------------------------------
/src/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 |
3 | long_description = """A library that provides feature importances, based upon
4 | the permutation importance strategy, for general scikit-learn
5 | models and implementations specifically for random forest out-of-bag scores.
6 | Built by Terence Parr and Kerem Turgutlu.
7 | See Beware Default
8 | Random Forest Importances for a deeper discussion of the issues surrounding
9 | feature importances in random forests.
10 | """
11 |
12 | setup(
13 | name='rfpimp',
14 | version='1.3.7',
15 | url='https://github.com/parrt/random-forest-importances',
16 | license='MIT',
17 | py_modules=['rfpimp'],
18 | python_requires='>=3.6',
19 | author='Terence Parr, Kerem Turgutlu',
20 | author_email='parrt@antlr.org, kcturgutlu@dons.usfca.edu',
21 | install_requires=['numpy','pandas','scikit-learn','matplotlib'],
22 | description='Permutation and drop-column importance for scikit-learn random forests and other models',
23 | long_description=long_description,
24 | long_description_content_type="text/markdown",
25 | keywords='scikit-learn random forest feature permutation importances',
26 | classifiers=['License :: OSI Approved :: MIT License',
27 | 'Intended Audience :: Developers']
28 | )
29 |
--------------------------------------------------------------------------------