├── .gitignore
├── LICENSE
├── README.html
├── README.md
├── docs
│   └── vimpy_logo.png
├── setup.py
└── vimpy
    ├── __init__.py
    ├── cv_vim.py
    ├── predictiveness_measures.py
    ├── spvim.py
    ├── spvim_ic.py
    ├── vim.py
    └── vimpy_utils.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # pyc files
2 | # ---------
3 | *.pyc
4 |
5 | # all things for pypi
6 | # -------------------
7 | dist/*
8 | build/*
9 | test/*
10 | vimpy.egg*
11 | olddist/*
12 |
13 | # virtual environment
14 | # -------------------
15 | venv/
16 | py3env
17 | README_cv.md
18 |
19 | # test scripts
20 | # -------------------
21 | test_precompute_cv.py
22 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018-2020 Brian D. Williamson
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Python/`vimpy`: inference on algorithm-agnostic variable importance
2 |
3 | [![PyPI version](https://badge.fury.io/py/vimpy.svg)](https://badge.fury.io/py/vimpy)
4 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
5 |
6 |
7 | **Software author:** [Brian Williamson](https://bdwilliamson.github.io/)
8 |
9 | **Methodology authors:** [Brian Williamson](https://bdwilliamson.github.io/), [Peter Gilbert](https://www.fredhutch.org/en/faculty-lab-directory/gilbert-peter.html), [Noah Simon](http://faculty.washington.edu/nrsimon/), [Marco Carone](http://faculty.washington.edu/mcarone/about.html)
10 |
11 | **R package:** https://github.com/bdwilliamson/vimp
12 |
13 | ## Introduction
14 |
15 | In predictive modeling applications, it is often of interest to determine the relative contribution of subsets of features in explaining an outcome; this is often called variable importance. It is useful to consider variable importance as a function of the unknown, underlying data-generating mechanism rather than the specific predictive algorithm used to fit the data. This package provides functions that, given fitted values from predictive algorithms, compute nonparametric estimates of variable importance based on $R^2$, deviance, classification accuracy, and area under the receiver operating characteristic curve, along with asymptotically valid confidence intervals for the true importance.
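
As a concrete example, for the $R^2$-based measure the predictiveness of a prediction function $f$ is $V(f) = 1 - E\{(Y - f(X))^2\}/\mathrm{Var}(Y)$, and the importance of a feature group $s$ is the difference between the predictiveness attainable using all features and the predictiveness attainable using all features except those in $s$:

$$\psi_s = V(f_{\text{full}}) - V(f_{\text{reduced}}).$$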
16 |
17 | For more details, please see the accompanying manuscripts "Nonparametric variable importance assessment using machine learning techniques" by Williamson, Gilbert, Carone, and Simon (*Biometrics*, 2020), ["A unified approach for inference on algorithm-agnostic variable importance"](https://arxiv.org/abs/2004.03683) by Williamson, Gilbert, Simon, and Carone (*arXiv*, 2020), and ["Efficient nonparametric statistical inference on population feature importance using Shapley values"](https://arxiv.org/abs/2006.09481) by Williamson and Feng (*arXiv*, 2020; to appear in the Proceedings of the Thirty-seventh International Conference on Machine Learning [ICML 2020]).
18 |
19 | ## Installation
20 |
21 | You may install a stable release of `vimpy` from PyPI using `pip` by running `pip install vimpy` from a Terminal window. Alternatively, you may install within a `virtualenv` environment.
22 |
23 | You may install the current development version of `vimpy` by cloning this repository and installing from the local source.
24 |
25 | ## Issues
26 |
27 | If you encounter any bugs or have any specific feature requests, please [file an issue](https://github.com/bdwilliamson/vimpy/issues).
28 |
29 | ## Example
30 |
31 | This example shows how to use `vimpy` in a simple setting with simulated data and using a single regression function. For more examples and detailed explanation, please see the [`R` vignette](https://bdwilliamson.github.io/vimp/articles/introduction_to_vimp.html).
32 |
33 | ```python
34 | ## load required libraries
35 | import numpy as np
36 | import vimpy
37 | from sklearn.ensemble import GradientBoostingRegressor
38 | from sklearn.model_selection import GridSearchCV
39 |
40 | ## -------------------------------------------------------------
41 | ## problem setup
42 | ## -------------------------------------------------------------
43 | ## define a function for the conditional mean of Y given X
44 | def cond_mean(x = None):
45 | f1 = np.where(np.logical_and(-2 <= x[:, 0], x[:, 0] < 2), np.floor(x[:, 0]), 0)
46 | f2 = np.where(x[:, 1] <= 0, 1, 0)
47 | f3 = np.where(x[:, 2] > 0, 1, 0)
48 | f6 = np.absolute(x[:, 5]/4) ** 3
49 | f7 = np.absolute(x[:, 6]/4) ** 5
50 | f11 = (7./3)*np.cos(x[:, 10]/2)
51 | ret = f1 + f2 + f3 + f6 + f7 + f11
52 | return ret
53 |
54 | ## create data
55 | np.random.seed(4747)
56 | n = 100
57 | p = 15
58 | s = 1 # importance desired for X_1
59 | x = np.zeros((n, p))
60 | for i in range(0, x.shape[1]) :
61 | x[:,i] = np.random.normal(0, 2, n)
62 |
63 | y = cond_mean(x) + np.random.normal(0, 1, n)
64 |
65 | ## -------------------------------------------------------------
66 | ## preliminary step: get regression estimators
67 | ## -------------------------------------------------------------
68 | ## use grid search to get optimal number of trees and learning rate
69 | ntrees = np.arange(100, 500, 100)
70 | lr = np.arange(.01, .1, .05)
71 |
72 | param_grid = [{'n_estimators':ntrees, 'learning_rate':lr}]
73 |
74 | ## set up cv objects
75 | cv_full = GridSearchCV(GradientBoostingRegressor(loss = 'ls', max_depth = 1), param_grid = param_grid, cv = 5)
76 | cv_small = GridSearchCV(GradientBoostingRegressor(loss = 'ls', max_depth = 1), param_grid = param_grid, cv = 5)
77 |
78 | ## -------------------------------------------------------------
79 | ## get variable importance estimates
80 | ## -------------------------------------------------------------
81 | # set seed
82 | np.random.seed(12345)
83 | ## set up the vimp object
84 | vimp = vimpy.vim(y = y, x = x, s = 1, pred_func = cv_full, measure_type = "r_squared")
85 | ## get the point estimate of variable importance
86 | vimp.get_point_est()
87 | ## get the influence function estimate
88 | vimp.get_influence_function()
89 | ## get a standard error
90 | vimp.get_se()
91 | ## get a confidence interval
92 | vimp.get_ci()
93 | ## do a hypothesis test, compute p-value
94 | vimp.hypothesis_test(alpha = 0.05, delta = 0)
95 | ## display the estimates, etc.
96 | vimp.vimp_
97 | vimp.se_
98 | vimp.ci_
99 | vimp.p_value_
100 | vimp.hyp_test_
101 |
102 | ## -------------------------------------------------------------
103 | ## using precomputed fitted values
104 | ## -------------------------------------------------------------
105 | np.random.seed(12345)
106 | folds_outer = np.random.choice(a = np.arange(2), size = n, replace = True, p = np.array([0.5, 0.5]))
107 | ## fit the full regression
108 | cv_full.fit(x[folds_outer == 1, :], y[folds_outer == 1])
109 | full_fit = cv_full.best_estimator_.predict(x[folds_outer == 1, :])
110 |
111 | ## fit the reduced regression
112 | x_small = np.delete(x[folds_outer == 0, :], s, 1) # delete the columns in s
113 | cv_small.fit(x_small, y[folds_outer == 0])
114 | small_fit = cv_small.best_estimator_.predict(x_small)
115 | ## get variable importance estimates
116 | np.random.seed(12345)
117 | vimp_precompute = vimpy.vim(y = y, x = x, s = 1, f = full_fit, r = small_fit, measure_type = "r_squared", folds = folds_outer)
118 | ## get the point estimate of variable importance
119 | vimp_precompute.get_point_est()
120 | ## get the influence function estimate
121 | vimp_precompute.get_influence_function()
122 | ## get a standard error
123 | vimp_precompute.get_se()
124 | ## get a confidence interval
125 | vimp_precompute.get_ci()
126 | ## do a hypothesis test, compute p-value
127 | vimp_precompute.hypothesis_test(alpha = 0.05, delta = 0)
128 | ## display the estimates, etc.
129 | vimp_precompute.vimp_
130 | vimp_precompute.se_
131 | vimp_precompute.ci_
132 | vimp_precompute.p_value_
133 | vimp_precompute.hyp_test_
134 |
135 | ## -------------------------------------------------------------
136 | ## get variable importance estimates using cross-validation
137 | ## -------------------------------------------------------------
138 | np.random.seed(12345)
139 | ## set up the vimp object
140 | vimp_cv = vimpy.cv_vim(y = y, x = x, s = 1, pred_func = cv_full, V = 5, measure_type = "r_squared")
141 | ## get the point estimate
142 | vimp_cv.get_point_est()
143 | ## get the influence function estimate and standard error
144 | vimp_cv.get_influence_function()
145 | vimp_cv.get_se()
146 | ## get a confidence interval
147 | vimp_cv.get_ci()
148 | ## do a hypothesis test, compute p-value
149 | vimp_cv.hypothesis_test(alpha = 0.05, delta = 0)
150 | ## display estimates, etc.
151 | vimp_cv.vimp_
152 | vimp_cv.se_
153 | vimp_cv.ci_
154 | vimp_cv.p_value_
155 | vimp_cv.hyp_test_
156 | ```
157 |
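The package also provides the `spvim` class (see `vimpy/spvim.py`) for Shapley-based population variable importance. The following is a minimal sketch of its interface, assuming the same simulated data and `cv_full` estimator from above; it is illustrative rather than a tuned analysis.

```python
## -------------------------------------------------------------
## Shapley-based variable importance (illustrative sketch)
## -------------------------------------------------------------
np.random.seed(12345)
## set up the spvim object
sp_vimp = vimpy.spvim(y = y, x = x, measure_type = "r_squared", V = 5, pred_func = cv_full)
## get point estimates (gamma controls how many feature subsets are sampled)
sp_vimp.get_point_est(gamma = 1)
## get influence function estimates, standard errors, and confidence intervals
sp_vimp.get_influence_functions()
sp_vimp.get_ses()
sp_vimp.get_cis()
## do hypothesis tests, compute p-values
sp_vimp.hypothesis_test(alpha = 0.05, delta = 0)
## display the estimates, etc.
sp_vimp.vimp_
sp_vimp.ses_
sp_vimp.cis_
sp_vimp.p_values_
sp_vimp.hyp_tests_
```
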
158 | ## Logo
159 |
160 | The logo was created using [hexSticker](https://github.com/GuangchuangYu/hexSticker), [lisa](https://github.com/tyluRp/lisa), and a [python image](https://svgsilh.com/image/145410.html) distributed under the CC0 license. Many thanks to the maintainers of these packages and the [Color Lisa](https://lisa-pkg.netlify.app/) team.
161 |
--------------------------------------------------------------------------------
/docs/vimpy_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bdwilliamson/vimpy/681eb21e1ff1141dc9fbaa35261e24dd17296857/docs/vimpy_logo.png
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 |
3 | with open("README.md", "r") as fh:
4 | long_description = fh.read()
5 |
6 | setuptools.setup(
7 | name="vimpy",
8 | version="2.1.1",
9 | author="Brian Williamson",
10 | author_email="brianw26@uw.edu",
11 | description="vimpy: perform inference on algorithm-agnostic variable importance in python",
12 | license='MIT',
13 | long_description=long_description,
14 | long_description_content_type="text/markdown",
15 | url="https://github.com/bdwilliamson/vimpy",
16 | packages=setuptools.find_packages(),
17 | install_requires=[
18 | 'numpy',
19 | 'scipy'
20 | ],
21 | classifiers=(
22 | "Programming Language :: Python :: 3.3",
23 | "License :: OSI Approved :: MIT License",
24 | "Operating System :: OS Independent",
25 | ),
26 | )
27 |
--------------------------------------------------------------------------------
/vimpy/__init__.py:
--------------------------------------------------------------------------------
1 | # __init__.py
2 | from .vim import vim
3 | from .cv_vim import cv_vim
4 | from .spvim import spvim
5 | from .spvim_ic import shapley_influence_function, shapley_se
6 | from .predictiveness_measures import *
7 | from .vimpy_utils import *
8 | name="vimpy"
9 |
--------------------------------------------------------------------------------
/vimpy/cv_vim.py:
--------------------------------------------------------------------------------
1 | ## Python class for cross-validated estimates of variable importance
2 | ## compute estimates and confidence intervals, do hypothesis testing
3 |
4 | ## import required libraries
5 | import numpy as np
6 | from scipy.stats import norm
7 | from .predictiveness_measures import cv_predictiveness, cv_predictiveness_precomputed
8 | from .vimpy_utils import get_measure_function
9 |
10 |
11 | class cv_vim:
12 |
13 | ## define initialization values
14 | """
15 | @param y the outcome
16 | @param x the feature data
17 | @param s the feature group of interest
18 | @param measure_type the predictiveness measure to use (for now, one of "r_squared", "auc", "accuracy", "deviance")
19 | @param V the number of cross-fitting folds (defaults to 5)
20 | @param pred_func the function that predicts outcome given features
21 | @param ensemble is pred_func an ensemble (True) or a single function (False, default)
22 | @param f fitted values from regression of outcome on all features (only used if pred_func is not specified)
23 | @param r fitted values from regression of outcome on reduced set of features (only used if pred_func is not specified)
24 | @param folds a list of length 3: outer folds, for hypothesis testing; inner folds based on the outer folds == 1 (for cross-fitting); inner folds based on outer folds == 0 (for cross-fitting)
25 | @param na_rm remove NAs prior to computing predictiveness? (defaults to False)
26 | """
27 | def __init__(self, y, x, s, measure_type, V = 5, pred_func = None, ensemble = False, f = None, r = None, folds = None, na_rm = False):
28 | self.y_ = y
29 | self.x_ = x
30 | self.s_ = s
31 | self.n_ = y.shape[0]
32 | self.p_ = x.shape[1]
33 | self.pred_func_ = pred_func
34 | self.f_ = f
35 | self.r_ = r
36 | assert (pred_func is not None or (f is not None and r is not None))
37 | self.V_ = V
38 | self.measure_type_ = measure_type
39 | self.measure_ = get_measure_function(measure_type)
40 | self.vimp_ = []
41 | self.se_ = []
42 | self.ci_ = []
43 | self.hyp_test_ = []
44 | self.test_statistic_ = []
45 | self.p_value_ = []
46 | self.v_full_ = []
47 | self.preds_full_ = []
48 | self.v_redu_ = []
49 | self.preds_redu_ = []
50 | self.se_full_ = []
51 | self.se_redu_ = []
52 | self.ci_full_ = []
53 | self.ci_redu_ = []
54 |         ## set up outer folds for hypothesis testing
56 | if folds is None:
57 | self.folds_outer_ = np.random.choice(a = np.arange(2), size = self.n_, replace = True, p = np.array([0.5, 0.5]))
58 | self.folds_inner_1 = []
59 | self.folds_inner_0 = []
60 | else:
61 | assert (f is not None)
62 | self.folds_outer_ = folds[0]
63 | self.folds_inner_1 = folds[1]
64 | self.folds_inner_0 = folds[2]
65 | self.ic_ = np.zeros((max(np.sum(self.folds_outer_ == 0), np.sum(self.folds_outer_ == 1))))
66 | self.ic_full_ = np.zeros((max(np.sum(self.folds_outer_ == 0), np.sum(self.folds_outer_ == 1))))
67 | self.ic_redu_ = np.zeros((max(np.sum(self.folds_outer_ == 0), np.sum(self.folds_outer_ == 1))))
68 | ## if only two unique values in y, assume binary
69 | self.binary_ = (np.unique(y).shape[0] == 2)
70 | self.na_rm_ = na_rm
71 | self.ensemble_ = ensemble
72 |
73 | ## calculate the plug-in estimator
74 | def get_point_est(self):
75 | if self.pred_func_ is not None:
76 | predictiveness_func = cv_predictiveness
77 | this_full_func = self.pred_func_
78 | this_redu_func = self.pred_func_
79 | folds_1 = None
80 | folds_0 = None
81 | else:
82 | predictiveness_func = cv_predictiveness_precomputed
83 | this_full_func = self.f_
84 | this_redu_func = self.r_
85 | folds_1 = self.folds_inner_1
86 | folds_0 = self.folds_inner_0
87 | self.v_full_, self.preds_full_, ic_full, self.folds_inner_1, self.cc_1 = predictiveness_func(self.x_[self.folds_outer_ == 1, :], self.y_[self.folds_outer_ == 1], np.arange(self.p_), self.measure_, this_full_func, V = self.V_, stratified = self.binary_, na_rm = self.na_rm_, folds = folds_1, ensemble = self.ensemble_)
88 | self.v_redu_, self.preds_redu_, ic_redu, self.folds_inner_0, self.cc_0 = predictiveness_func(self.x_[self.folds_outer_ == 0, :], self.y_[self.folds_outer_ == 0], np.delete(np.arange(self.p_), self.s_), self.measure_, this_redu_func, V = self.V_, stratified = self.binary_, na_rm = self.na_rm_, folds = folds_0, ensemble = self.ensemble_)
89 | self.vimp_ = self.v_full_ - self.v_redu_
90 | self.ic_full_[:ic_full.shape[0]] = ic_full
91 | self.ic_redu_[:ic_redu.shape[0]] = ic_redu
92 | return self
93 |
94 | ## calculate the influence function
95 | def get_influence_function(self):
96 | self.ic_ = self.ic_full_ - self.ic_redu_
97 | return self
98 |
99 | ## calculate the standard error
100 | def get_se(self):
101 | self.se_full_ = np.sqrt(np.mean(self.ic_full_ ** 2)) / np.sqrt(self.ic_full_.shape[0])
102 | self.se_redu_ = np.sqrt(np.mean(self.ic_redu_ ** 2)) / np.sqrt(self.ic_redu_.shape[0])
103 | self.se_ = np.sqrt(np.mean(self.ic_ ** 2)) / np.sqrt(self.ic_.shape[0])
104 | return self
105 |
106 | ## calculate the ci based on the estimate and the standard error
107 | def get_ci(self, level = 0.95):
108 | ## get alpha from the level
109 | a = (1 - level) / 2.
110 | a = np.array([a, 1 - a])
111 | ## calculate the quantiles
112 | fac = norm.ppf(a)
113 | ## create cis for vimp, predictiveness
114 | self.ci_ = self.vimp_ + np.outer((self.se_), fac)
115 | self.ci_full_ = self.v_full_ + np.outer((self.se_full_), fac)
116 | self.ci_redu_ = self.v_redu_ + np.outer((self.se_redu_), fac)
117 | return self
118 |
119 | ## do a hypothesis test
120 | def hypothesis_test(self, alpha = 0.05, delta = 0):
121 | self.test_statistic_ = (self.v_full_ - self.v_redu_ - delta) / np.sqrt(self.se_full_ ** 2 + self.se_redu_ ** 2)
122 | self.p_value_ = 1 - norm.cdf(self.test_statistic_)
123 | self.hyp_test_ = self.p_value_ < alpha
124 | return(self)
125 |
--------------------------------------------------------------------------------
/vimpy/predictiveness_measures.py:
--------------------------------------------------------------------------------
1 | ## Compute predictiveness measures and their corresponding influence functions
2 |
3 |
4 | # general cv predictiveness
5 | def cv_predictiveness(x, y, S, measure, pred_func, V = 5, stratified = True, na_rm = False, folds = None, ensemble = False):
6 | """
7 | Compute a cross-validated measure of predictiveness based on the data and the chosen measure
8 |
9 | @param x: the features
10 | @param y: the outcome
11 | @param S: the covariates to fit
12 | @param measure: measure of predictiveness
13 | @param pred_func: function that fits to the data
14 | @param V: the number of CV folds
15 | @param stratified: should the folds be stratified?
16 | @param na_rm: should we do a complete-case analysis (True) or not (False)
17 |     @param folds: ignored here (folds are always regenerated internally); included for signature compatibility
18 | @param ensemble is this an ensemble (True) or not (False)
19 |
20 | @return cross-validated measure of predictiveness, along with preds and ics
21 | """
22 | import numpy as np
23 | from .vimpy_utils import make_folds
24 | ## if na_rm = True, do a complete-case analysis
25 | if na_rm:
26 | xs = x[:, S]
27 | cc = np.sum(np.isnan(xs), axis = 1) == 0
28 | newx = x[cc, :]
29 | newy = y[cc]
30 | else:
31 | cc = np.repeat(True, x.shape[0])
32 | newx = x
33 | newy = y
34 | ## set up CV folds
35 | folds = make_folds(newy, V, stratified = stratified)
36 | ## do CV
37 | preds = np.empty((y.shape[0],))
38 | preds.fill(np.nan)
39 | ics = np.empty((y.shape[0],))
40 | ics.fill(np.nan)
41 | vs = np.empty((V,))
42 | cc_cond = np.flatnonzero(cc)
43 | if V == 1:
44 | x_train, y_train = newx, newy
45 | pred_func.fit(x_train[:, S], np.ravel(y_train))
46 | if ensemble:
47 |             preds_v = np.mean(pred_func.transform(x_train[:, S]), axis = 1)  # average predictions across ensemble members
48 | else:
49 | try:
50 | preds_v = pred_func.predict_proba(x_train[:, S])[:, 1]
51 | except AttributeError:
52 | preds_v = pred_func.predict(x_train[:, S])
53 |
54 | preds[cc_cond] = preds_v
55 | vs[0] = measure(y_train, preds_v)
56 | ics[cc_cond] = compute_ic(y_train, preds_v, measure.__name__)
57 | else:
58 | for v in range(V):
59 | fold_cond = np.flatnonzero(folds == v)
60 | x_train, y_train = newx[folds != v, :], newy[folds != v]
61 | x_test, y_test = newx[folds == v, :], newy[folds == v]
62 | pred_func.fit(x_train[:, S], np.ravel(y_train))
63 | if ensemble:
64 |                 preds_v = np.mean(pred_func.transform(x_test[:, S]), axis = 1)  # average predictions across ensemble members
65 | else:
66 | try:
67 | preds_v = pred_func.predict_proba(x_test[:, S])[:, 1]
68 | except AttributeError:
69 | preds_v = pred_func.predict(x_test[:, S])
70 |
71 | preds[cc_cond[fold_cond]] = preds_v
72 | vs[v] = measure(y_test, preds_v)
73 | ics[cc_cond[fold_cond]] = compute_ic(y_test, preds_v, measure.__name__)
74 | return np.mean(vs), preds, ics, folds, cc
75 |
76 |
77 | # general predictiveness based on precomputed fits
78 | def cv_predictiveness_precomputed(x, y, S, measure, f, V = 5, stratified = True, folds = None, na_rm = False, ensemble = False):
79 | """
80 | Compute a cross-validated measure of predictiveness based on the data, the chosen measure, and the sets of fitted values f and r
81 |
82 | @param x: the features
83 | @param y: the outcome
84 | @param S: the covariates to fit
85 | @param measure: measure of predictiveness
86 | @param f: fitted values based on S
87 | @param V: the number of CV folds
88 | @param stratified: should the folds be stratified?
89 | @param folds: the CV folds
90 | @param na_rm: should we do a complete-case analysis (True) or not (False)
91 | @param ensemble: is this an ensemble or not (dummy)
92 |
93 | @return cross-validated measure of predictiveness, along with preds and ics
94 | """
95 | import numpy as np
96 | from .vimpy_utils import make_folds
97 | ## if na_rm = True, do a complete-case analysis
98 | if na_rm:
99 | xs = x[:, S]
100 | cc = np.sum(np.isnan(xs), axis = 1) == 0
101 | newy = y[cc]
102 | else:
103 | cc = np.repeat(True, x.shape[0])
104 | newy = y
105 | ## set up CV folds
106 | if folds is None:
107 | folds = make_folds(newy, V, stratified = stratified)
108 | ## do CV
109 | preds = np.empty((y.shape[0],))
110 | preds.fill(np.nan)
111 | ics = np.empty((y.shape[0],))
112 | ics.fill(np.nan)
113 | vs = np.empty((V,))
114 | cc_cond = np.flatnonzero(cc)
115 | if V == 1:
116 | y_train = newy
117 | preds_v = f
118 | preds[cc_cond] = preds_v[cc_cond]
119 | vs[0] = measure(y_train, preds_v)
120 | ics[cc_cond] = compute_ic(y_train, preds_v, measure.__name__)
121 | else:
122 | for v in range(V):
123 | fold_cond = np.flatnonzero(folds == v)
124 | y_test = newy[folds == v]
125 | preds_v = f[folds == v]
126 | preds[cc_cond[fold_cond]] = preds_v
127 | vs[v] = measure(y_test, preds_v)
128 | ics[cc_cond[fold_cond]] = compute_ic(y_test, preds_v, measure.__name__)
129 | return np.mean(vs), preds, ics, folds, cc
130 |
131 |
132 | def accuracy(y, preds):
133 | """
134 | Compute accuracy for a given set of predictions and outcomes
135 |
136 | @param y: the outcome
137 | @param preds: the predictions based on a subset of features
138 |
139 | @return the accuracy
140 | """
141 | import sklearn.metrics as skm
142 |
143 | if len(preds.shape) == 2:
144 | if preds.shape[1] > 1:
145 | return [1. - skm.zero_one_loss(y_true = y, y_pred = preds[:, i], normalize = True) for i in range(preds.shape[1])]
146 | else:
147 | return 1. - skm.zero_one_loss(y_true = y, y_pred = preds, normalize = True)
148 | else:
149 | return 1. - skm.zero_one_loss(y_true = y, y_pred = preds, normalize = True)
150 |
151 |
152 | def auc(y, preds, *args, **kwargs):
153 | """
154 | Compute AUC for a given set of predictions and outcomes
155 |
156 | @param y: the outcome
157 | @param preds: the predictions based on a given subset of features
158 |
159 | @return the AUC
160 | """
161 | import sklearn.metrics as skm
162 |
163 | if len(preds.shape) == 2:
164 | if preds.shape[1] > 1:
165 | return [skm.roc_auc_score(y_true = y, y_score = preds[:, i], average = "micro") for i in range(preds.shape[1])]
166 | else:
167 | return skm.roc_auc_score(y_true = y, y_score = preds, average = "micro")
168 | else:
169 | return skm.roc_auc_score(y_true = y, y_score = preds, average = "micro")
170 |
171 |
172 | def cross_entropy(y, preds):
173 | """
174 | Compute cross-entropy for a given set of predictions and outcomes
175 |
176 | @param y: the outcome
177 | @param preds: the predictions based on a subset of features
178 |
179 | @return the cross-entropy
180 | """
181 | import sklearn.metrics as skm
182 |
183 | if len(preds.shape) == 2:
184 | if preds.shape[1] > 1:
185 | return [(-2) * skm.log_loss(y_true = y, y_pred = preds[:, i], normalize = True) for i in range(preds.shape[1])]
186 | else:
187 | return (-2) * skm.log_loss(y_true = y, y_pred = preds, normalize = True)
188 | else:
189 | return (-2) * skm.log_loss(y_true = y, y_pred = preds, normalize = True)
190 |
191 |
192 | def deviance(y, preds):
193 | """
194 | Compute deviance for a given set of predictions and outcomes
195 |
196 | @param y: the outcome
197 | @param preds: the predictions based on a subset of features
198 |
199 | @return the deviance
200 | """
201 | import sklearn.metrics as skm
202 | import numpy as np
203 | denom = (-1) * np.sum(np.log(np.mean(y, axis = 0)))
204 |
205 | if len(preds.shape) == 2:
206 | if preds.shape[1] > 1:
207 | return [(-2) * skm.log_loss(y_true = y, y_pred = preds[:, i], normalize = True) / denom for i in range(preds.shape[1])]
208 | else:
209 | return (-2) * skm.log_loss(y_true = y, y_pred = preds, normalize = True) / denom
210 | else:
211 | return (-2) * skm.log_loss(y_true = y, y_pred = preds, normalize = True) / denom
212 |
213 |
214 | def r_squared(y, preds):
215 | """
216 |     Compute R^2 for a given set of predictions and outcomes
217 |
218 | @param y: the outcome
219 | @param preds: the predictions based on a given subset of features
220 |
221 | @return the R^2
222 | """
223 | import sklearn.metrics as skm
224 | import numpy as np
225 | var = np.mean((y - np.mean(y)) ** 2)
226 |
227 | if len(preds.shape) == 2:
228 | if preds.shape[1] > 1:
229 | return [1. - skm.mean_squared_error(y_true = y, y_pred = preds[:, i]) / var for i in range(preds.shape[1])]
230 | else:
231 | return 1. - skm.mean_squared_error(y_true = y, y_pred = preds) / var
232 | else:
233 | return 1. - skm.mean_squared_error(y_true = y, y_pred = preds) / var
234 |
235 |
236 | ## ------------------------------------------------------------------
237 | ## influence functions
238 | ## ------------------------------------------------------------------
239 | def compute_ic(y, preds, measure):
240 | """
241 | Compute IC based on the given measure
242 |
243 | @param y: the outcome
244 | @param preds: the predictions based on the current subset of features
245 | @param measure: the predictiveness measure
246 |
247 | @return an n-vector of the IC for the given predictiveness measure
248 | """
249 |
250 | ## do the correct thing
251 | if measure == "accuracy":
252 | return accuracy_ic(y, preds)
253 | elif measure == "auc":
254 | return auc_ic(y, preds)
255 | elif measure == "cross_entropy":
256 | return cross_entropy_ic(y, preds)
257 | elif measure == "deviance":
258 | return deviance_ic(y, preds)
259 | elif measure == "r_squared":
260 | return r_squared_ic(y, preds)
261 | else:
262 | raise ValueError("We do not currently support the entered predictiveness measure. Please provide a different predictiveness measure.")
263 |
264 |
265 | def accuracy_ic(y, preds):
266 | """
267 | Compute the IC for accuracy
268 |
269 | @param y: the outcome
270 | @param preds: the predictions based on a given subset of features
271 |
272 | @return the IC for accuracy
273 | """
274 | import numpy as np
275 | if len(preds.shape) == 2:
276 | if preds.shape[1] > 1:
277 | return np.array([one_accuracy_ic(y, preds[:, m]) for m in range(preds.shape[1])])
278 | else:
279 | return np.array([one_accuracy_ic(y, preds)])
280 | else:
281 | return np.array([one_accuracy_ic(y, preds)])
282 |
283 |
284 | def one_accuracy_ic(y, preds):
285 | """
286 | Compute the IC for one accuracy
287 |
288 | @param y: the outcome
289 | @param preds: the predictions based on a given subset of features
290 |
291 | @return the IC for accuracy
292 | """
293 | import sklearn.metrics as skm
294 |
295 | misclassification = skm.zero_one_loss(y_true = y, y_pred = preds, normalize = True)
296 | return (-1) * (((preds > 1. / 2) != y) - misclassification)
297 |
298 |
299 | def auc_ic(y, preds):
300 | """
301 | Compute the IC for AUC
302 |
303 | @param y: the outcome
304 | @param preds: the predictions based on a given subset of features
305 |
306 | @return the IC for AUC
307 | """
308 | import numpy as np
309 | if len(preds.shape) == 2:
310 | if preds.shape[1] > 1:
311 | return np.array([one_auc_ic(y, preds[:, m]) for m in range(preds.shape[1])])
312 | else:
313 | return np.array([one_auc_ic(y, preds)])
314 | else:
315 | return np.array([one_auc_ic(y, preds)])
316 |
317 |
318 | def one_auc_ic(y, preds):
319 | """
320 | Compute the IC for one AUC
321 |
322 | @param y: the outcome
323 | @param preds: the predictions based on a given subset of features
324 |
325 | @return the IC for AUC
326 | """
327 | import numpy as np
328 | import sklearn.metrics as skm
329 |
330 | p_1 = np.mean(y)
331 | p_0 = 1 - p_1
332 |
333 | sens = np.array([np.mean(preds[(y == 0).reshape(preds.shape)] < x) for x in preds])
334 | spec = np.array([np.mean(preds[(y == 1).reshape(preds.shape)] > x) for x in preds])
335 |
336 | contrib_1 = (y == 1).reshape(preds.shape) / p_1 * sens
337 | contrib_0 = (y == 0).reshape(preds.shape) / p_0 * spec
338 |
339 | auc = skm.roc_auc_score(y_true = y, y_score = preds, average = "micro")
340 | return contrib_1 + contrib_0 - ((y == 0).reshape(preds.shape) / p_0 + (y == 1).reshape(preds.shape) / p_1) * auc
341 |
342 |
343 | def cross_entropy_ic(y, preds):
344 | """
345 | Compute the IC for cross-entropy
346 |
347 | @param y: the outcome
348 | @param preds: the predictions based on a given subset of features
349 |
350 | @return the IC for cross-entropy
351 | """
352 | import numpy as np
353 | if len(preds.shape) == 2:
354 | if preds.shape[1] > 1:
355 | return np.array([one_cross_entropy_ic(y, preds[:, m]) for m in range(preds.shape[1])])
356 | else:
357 | return np.array([one_cross_entropy_ic(y, preds)])
358 | else:
359 | return np.array([one_cross_entropy_ic(y, preds)])
360 |
361 |
362 | def one_cross_entropy_ic(y, preds):
363 | """
364 | Compute the IC for one cross-entropy
365 |
366 | @param y: the outcome
367 | @param preds: the predictions based on a given subset of features
368 |
369 | @return the IC for cross-entropy
370 | """
371 | import sklearn.metrics as skm
372 | import numpy as np
373 | cross_entropy = (-2) * skm.log_loss(y_true = y, y_pred = preds, normalize = True)
374 | ic_cross_entropy = (-2) * np.sum(y * np.log(preds), axis = 1) - cross_entropy
375 | return ic_cross_entropy
376 |
377 |
378 | def deviance_ic(y, preds):
379 | """
380 | Compute the IC for deviance
381 |
382 | @param y: the outcome
383 | @param preds: the predictions based on a given subset of features
384 |
385 | @return the IC for deviance
386 | """
387 | import numpy as np
388 | if len(preds.shape) == 2:
389 | if preds.shape[1] > 1:
390 | return np.array([one_deviance_ic(y, preds[:, m]) for m in range(preds.shape[1])])
391 | else:
392 | return np.array([one_deviance_ic(y, preds)])
393 | else:
394 | return np.array([one_deviance_ic(y, preds)])
395 |
396 |
397 | def one_deviance_ic(y, preds):
398 | """
399 | Compute the IC for one deviance
400 |
401 | @param y: the outcome
402 | @param preds: the predictions based on a given subset of features
403 |
404 | @return the IC for deviance
405 | """
406 | import sklearn.metrics as skm
407 | import numpy as np
408 | cross_entropy = (-2) * skm.log_loss(y_true = y, y_pred = preds, normalize = True)
409 | p = np.mean(y, axis = 0)
410 | denom = (-1) * np.sum(np.log(p))
411 | ic_cross_entropy = (-2) * np.sum(y * np.log(preds), axis = 1) - cross_entropy
412 | ic_denom = ((-1.) / p) * ((y == 1) - p)
413 | grad = np.array([1. / denom, (-1) * cross_entropy / (denom ** 2)])
414 | return np.dot(grad, np.stack((ic_cross_entropy, ic_denom)))
415 |
416 |
417 | def r_squared_ic(y, preds):
418 | """
419 | Compute the IC for R-squared
420 |
421 | @param y: the outcome
422 | @param preds: the predictions based on a given subset of features
423 |
424 | @return the IC for R-squared
425 | """
426 | import numpy as np
427 | if len(preds.shape) == 2:
428 | if preds.shape[1] > 1:
429 | return np.array([one_r2_ic(y, preds[:, m]) for m in range(preds.shape[1])])
430 | else:
431 | return np.array([one_r2_ic(y, preds)])
432 | else:
433 | return np.array([one_r2_ic(y, preds)])
434 |
435 |
436 | def one_r2_ic(y, preds):
437 | """
438 | Compute the IC for one R-squared
439 |
440 | @param y: the outcome
441 | @param preds: the predictions based on a given subset of features
442 |
443 | @return the IC for R-squared
444 | """
445 | import sklearn.metrics as skm
446 | import numpy as np
447 | y_flat = np.ravel(y)
448 | mse = skm.mean_squared_error(y_true = y_flat, y_pred = preds)
449 | var = np.mean((y_flat - np.mean(y_flat)) ** 2)
450 | ic_mse = (y_flat - preds) ** 2 - mse
451 | ic_var = (y_flat - np.mean(y_flat)) ** 2 - var
452 | grad = np.array([1. / var, (-1) * mse / (var ** 2)])
453 | return np.dot(grad, np.stack((ic_mse, ic_var)))
454 |
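## ------------------------------------------------------------------
## illustrative sketch (not part of the package API): pairing a
## predictiveness measure with its influence function to obtain an
## estimate and a standard error; assumes numpy arrays x and y and an
## sklearn-style regressor reg
## ------------------------------------------------------------------
##   import numpy as np
##   v, preds, ics, folds, cc = cv_predictiveness(x, y, np.arange(x.shape[1]),
##                                                r_squared, reg, V = 5, stratified = False)
##   se = np.sqrt(np.mean(ics ** 2) / ics.shape[0])
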
--------------------------------------------------------------------------------
/vimpy/spvim.py:
--------------------------------------------------------------------------------
1 | ## Python class for estimates of Shapley population variable importance
2 | ## compute estimates and confidence intervals, do hypothesis testing
3 |
4 | ## import required libraries
5 | import numpy as np
6 | from scipy.stats import norm
7 | from .predictiveness_measures import cv_predictiveness, compute_ic
8 | from .spvim_ic import shapley_influence_function, shapley_se
9 | from .vimpy_utils import get_measure_function, choose, shapley_hyp_test
10 |
11 |
12 | class spvim:
13 |
14 | ## define initialization values
15 | """
16 | @param y the outcome
17 | @param x the feature data
18 | @param measure_type the predictiveness measure to use (a function)
19 | @param V the number of cross-validation folds
20 | @param pred_func the function that predicts outcome given features
21 | @param ensemble is pred_func an ensemble (True) or a single function (False, default)
22 | @param na_rm remove NAs prior to computing predictiveness? (defaults to False)
23 | """
24 | def __init__(self, y, x, measure_type, V, pred_func, ensemble = False, na_rm = False):
25 | self.y_ = y
26 | self.x_ = x
27 | self.n_ = y.shape[0]
28 | self.p_ = x.shape[1]
29 | self.pred_func_ = pred_func
30 | self.V_ = V
31 | self.measure_type_ = measure_type
32 | self.measure_ = get_measure_function(measure_type)
33 | self.ics_ = []
34 | self.vimp_ = []
35 | self.lambdas_ = []
36 | self.ses_ = []
37 | self.cis_ = []
38 | self.na_rm_ = na_rm
39 | self.Z_ = []
40 | self.z_counts_ = []
41 | self.v_ = []
42 | self.v_ics_ = []
43 | self.W_ = []
44 | self.gamma_ = []
45 | self.test_statistics_ = []
46 | self.p_values_ = []
47 | self.hyp_tests_ = []
48 | self.G_ = np.vstack((np.append(1, np.zeros(self.p_)), np.ones(self.p_ + 1) - np.append(1, np.zeros(self.p_))))
49 | ## set up outer folds for hypothesis testing
50 | self.folds_outer_ = np.random.choice(a = np.arange(2), size = self.n_, replace = True, p = np.array([0.25, 0.75]))
51 | self.folds_inner_ = []
52 | ## if only two unique values in y, assume binary
53 | self.binary_ = (np.unique(y).shape[0] == 2)
54 | self.ensemble_ = ensemble
55 | self.cc_ = []
56 |
57 | def _get_kkt_matrix(self):
58 | # kkt matrix for constrained wls
59 | A_W = np.sqrt(self.W_).dot(self.Z_)
60 | kkt_matrix_11 = 2 * A_W.transpose().dot(A_W)
61 | kkt_matrix_12 = self.G_.transpose()
62 | kkt_matrix_21 = self.G_
63 | kkt_matrix_22 = np.zeros((kkt_matrix_21.shape[0], kkt_matrix_12.shape[1]))
64 | kkt_matrix = np.vstack((np.hstack((kkt_matrix_11, kkt_matrix_12)), np.hstack((kkt_matrix_21, kkt_matrix_22))))
65 | return(kkt_matrix)
66 |
67 | def _get_ls_matrix(self, c_n):
68 | A_W = np.sqrt(self.W_).dot(self.Z_)
69 | v_W = np.sqrt(self.W_).dot(self.v_)
70 | ls_matrix = np.vstack((2 * A_W.transpose().dot(v_W.reshape((len(v_W), 1))), c_n.reshape((c_n.shape[0], 1))))
71 | return(ls_matrix)
72 |
73 | ## calculate the point estimates
74 | def get_point_est(self, gamma = 1):
75 | self.gamma_ = gamma
76 | ## sample subsets, set up Z
77 | max_subset = np.array(list(range(self.p_)))
78 | sampling_weights = np.append(np.append(1, [choose(self.p_ - 2, s - 1) ** (-1) for s in range(1, self.p_)]), 1)
79 | subset_sizes = np.random.choice(np.arange(0, self.p_ + 1), p = sampling_weights / sum(sampling_weights), size = self.gamma_ * self.x_.shape[0], replace = True)
80 | S_lst_all = [np.sort(np.random.choice(np.arange(0, self.p_), subset_size, replace = False)) for subset_size in list(subset_sizes)]
81 | ## only need to continue with the unique subsets S
82 | Z_lst_all = [np.in1d(max_subset, S).astype(np.float64) for S in S_lst_all]
83 | Z, z_counts = np.unique(np.array(Z_lst_all), axis = 0, return_counts = True)
84 | Z_lst = list(Z)
85 | Z_aug_lst = [np.append(1, z) for z in Z_lst]
86 | S_lst = [max_subset[z.astype(bool).tolist()] for z in Z_lst]
87 | ## get v, preds, ic for null set
88 | preds_none = np.repeat(np.mean(self.y_[self.folds_outer_ == 1]), np.sum(self.folds_outer_ == 1))
89 | v_none = self.measure_(self.y_[self.folds_outer_ == 1], preds_none)
90 | ic_none = compute_ic(self.y_[self.folds_outer_ == 1], preds_none, self.measure_.__name__)
91 | ## get v, preds, ic for remaining non-null groups in S
92 | v_lst, preds_lst, ic_lst, self.folds_inner_, self.cc_ = zip(*(cv_predictiveness(self.x_[self.folds_outer_ == 1, :], self.y_[self.folds_outer_ == 1], s, self.measure_, self.pred_func_, V = self.V_, stratified = self.binary_, na_rm = self.na_rm_) for s in S_lst[1:]))
93 | ## set up full lists
94 | v_lst_all = [v_none] + list(v_lst)
95 | ic_lst_all = [ic_none] + list(ic_lst)
96 | self.Z_ = np.array(Z_aug_lst)
97 | self.z_counts_ = z_counts
98 | self.W_ = np.diag(z_counts / np.sum(z_counts))
99 | self.v_ = np.array(v_lst_all)
100 | self.v_ics_ = ic_lst_all
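        ## constraints for the weighted least squares: the intercept equals the
        ## null-set predictiveness, and the Shapley values sum to the difference
        ## between the full-set and null-set predictiveness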
101 | c_n = np.array([v_none, v_lst_all[len(v_lst)] - v_none])
102 | kkt_matrix = self._get_kkt_matrix()
103 | ls_matrix = self._get_ls_matrix(c_n)
104 | ls_solution = np.linalg.inv(kkt_matrix).dot(ls_matrix)
105 | self.vimp_ = ls_solution[0:(self.p_ + 1), :]
106 | self.lambdas_ = ls_solution[(self.p_ + 1):ls_solution.shape[0], :]
107 | return(self)
108 |
109 | ## calculate the influence function
110 | def get_influence_functions(self):
111 | c_n = np.array([self.v_[0], self.v_[self.v_.shape[0] - 1] - self.v_[0]], dtype = object)
112 | v_ic_array = np.vstack([self.v_ics_[0], np.stack(self.v_ics_[1:], axis = 0)])
113 | self.ics_ = shapley_influence_function(self.Z_, self.z_counts_, self.W_, self.v_, self.vimp_, self.G_, c_n, v_ic_array, self.measure_.__name__)
114 | return self
115 |
116 | ## calculate standard errors
117 | def get_ses(self):
118 | ses = [shapley_se(self.ics_, idx, self.gamma_) for idx in range(self.p_ + 1)]
119 | self.ses_ = np.array(ses)
120 | return self
121 |
122 | ## calculate the ci based on the estimate and the standard error
123 | def get_cis(self, level = 0.95):
124 | ## get alpha from the level
125 | a = (1 - level) / 2.
126 | a = np.array([a, 1 - a])
127 | ## calculate the quantiles
128 | fac = norm.ppf(a)
129 | ## create it
130 | self.cis_ = self.vimp_ + np.outer((self.ses_), fac)
131 | return self
132 |
133 | ## do a hypothesis test
134 | def hypothesis_test(self, alpha = 0.05, delta = 0):
135 | ## null predictiveness
136 | preds_none_0 = np.repeat(np.mean(self.y_[self.folds_outer_ == 0]), np.sum(self.folds_outer_ == 0))
137 | v_none_0 = self.measure_(self.y_[self.folds_outer_ == 0], preds_none_0)
138 | ic_none_0 = compute_ic(self.y_[self.folds_outer_ == 0], preds_none_0, self.measure_.__name__)
139 | sigma_none_0 = np.sqrt(np.mean((ic_none_0) ** 2)) / np.sqrt(np.sum(self.folds_outer_ == 0))
140 | ## get shapley values + null predictiveness on first split
141 | shapley_vals_plus = self.vimp_ + self.vimp_[0]
142 | sigmas_one = np.sqrt(self.ses_ ** 2 + sigma_none_0 ** 2)
143 | self.test_statistics_, self.p_values_, self.hyp_tests_ = shapley_hyp_test(shapley_vals_plus[1:], v_none_0, sigmas_one, sigma_none_0, level = alpha, delta = delta, p = self.p_)
144 | return self
145 |
--------------------------------------------------------------------------------
/vimpy/spvim_ic.py:
--------------------------------------------------------------------------------
1 | # influence functions for shapley values
2 |
3 | def shapley_influence_function(Z, z_counts, W, v, psi, G, c_n, ics, measure):
4 | """
5 | Compute influence function for the given predictiveness measure
6 |
7 | @param Z the subsets of the power set with estimates
8 | @param W the matrix of weights
9 |     @param v the estimated predictiveness
10 | @param psi the estimated Shapley values
11 | @param G the constrained ls matrix
12 | @param c_n the constraints
13 | @param ics a list of all ics
14 | @param measure the predictiveness measure
15 | """
16 | import numpy as np
17 |
18 | ## compute contribution from estimating V
19 | Z_W = Z.transpose().dot(W)
20 | A_m = Z_W.dot(Z)
21 | A_m_inv = np.linalg.inv(A_m)
22 | phi_01 = A_m_inv.dot(Z_W).dot(ics)
23 |
24 | ## compute contribution from estimating Q
25 | qr_decomp = np.linalg.qr(G.transpose(), mode = 'complete')
26 | U_2 = qr_decomp[0][:, 3:(Z.shape[1])]
27 | V = U_2.transpose().dot(Z.transpose().dot(W).dot(Z)).dot(U_2)
28 | phi_02_shared_mat = (-1) * U_2.dot(np.linalg.inv(V))
29 | phi_02_uniq_vectors = np.array([(Z[z, :].dot(psi) - v[z]) * (U_2.transpose().dot(Z[z, :])) for z in range(Z.shape[0])], dtype = np.float64).transpose()
30 | phi_02_uniq = phi_02_shared_mat.dot(phi_02_uniq_vectors)
31 | phi_02 = np.repeat(phi_02_uniq, z_counts, axis=1)
32 |
33 | return {'contrib_v': phi_01, 'contrib_s': phi_02}
34 |
35 |
36 | def shapley_se(shapley_ics, idx, gamma, na_rm = True):
37 | """
38 | Standard error for the desired Shapley value
39 |
40 | @param shapley_ics: all influence function estimates
41 | @param idx: the index of interest
42 | @param gamma: the constant for sampling
43 | @param na_rm: remove NaNs?
44 |
45 | @return the standard error corresponding to the shapley value at idx
46 | """
47 | import numpy as np
48 | if na_rm:
49 | var_v = np.nanvar(shapley_ics['contrib_v'][idx, :])
50 | var_s = np.nanvar(shapley_ics['contrib_s'][idx, :])
51 | else:
52 | var_v = np.var(shapley_ics['contrib_v'][idx, :])
53 | var_s = np.var(shapley_ics['contrib_s'][idx, :])
54 | se = np.sqrt(var_v / shapley_ics['contrib_v'].shape[1] + var_s / shapley_ics['contrib_s'].shape[1] / gamma)
55 | return se
56 |
--------------------------------------------------------------------------------
/vimpy/vim.py:
--------------------------------------------------------------------------------
1 | ## Python class for estimates of variable importance
2 | ## compute estimates and confidence intervals, do hypothesis testing
3 |
4 | ## import required libraries
5 | import numpy as np
6 | from scipy.stats import norm
7 | from .predictiveness_measures import cv_predictiveness, cv_predictiveness_precomputed
8 | from .vimpy_utils import get_measure_function
9 |
10 |
11 | class vim:
12 |
13 | ## define initialization values
14 | """
15 | @param y the outcome
16 | @param x the feature data
17 | @param s the feature group of interest
18 | @param measure_type the predictiveness measure to use (for now, one of "r_squared", "auc", "accuracy", "deviance")
19 | @param pred_func the function that predicts outcome given features
20 | @param ensemble is pred_func an ensemble (True) or a single function (False, default)
21 | @param f fitted values from regression of outcome on all features (only used if pred_func is not specified)
22 | @param r fitted values from regression of outcome on reduced set of features (only used if pred_func is not specified)
23 | @param folds outer folds, for hypothesis testing (only used if pred_func is not specified)
24 | @param na_rm remove NAs prior to computing predictiveness? (defaults to False)
25 |
26 | @return an object of class vim
27 | """
28 | def __init__(self, y, x, s, measure_type, pred_func = None, ensemble = False, f = None, r = None, folds = None, na_rm = False):
29 | self.y_ = y
30 | self.x_ = x
31 | self.s_ = s
32 | self.n_ = y.shape[0]
33 | self.p_ = x.shape[1]
34 | self.pred_func_ = pred_func
35 | self.f_ = f
36 | self.r_ = r
37 | assert (pred_func is not None or (f is not None and r is not None))
38 | self.measure_type_ = measure_type
39 | self.measure_ = get_measure_function(measure_type)
40 | self.vimp_ = []
41 | self.se_ = []
42 | self.ci_ = []
43 | self.hyp_test_ = []
44 | self.test_statistic_ = []
45 | self.p_value_ = []
46 | self.v_full_ = []
47 | self.preds_full_ = []
48 | self.v_redu_ = []
49 | self.preds_redu_ = []
50 | self.se_full_ = []
51 | self.se_redu_ = []
52 | self.ci_full_ = []
53 | self.ci_redu_ = []
54 | ## set up outer folds for hypothesis testing
55 | if folds is None:
56 | self.folds_outer_ = np.random.choice(a = np.arange(2), size = self.n_, replace = True, p = np.array([0.5, 0.5]))
57 | else:
58 | assert (f is not None)
59 | self.folds_outer_ = folds
60 | self.folds_inner_1 = []
61 | self.folds_inner_0 = []
62 | self.cc_1 = []
63 | self.cc_0 = []
64 | self.ic_ = np.zeros((max(np.sum(self.folds_outer_ == 0), np.sum(self.folds_outer_ == 1))))
65 | self.ic_full_ = np.zeros((max(np.sum(self.folds_outer_ == 0), np.sum(self.folds_outer_ == 1))))
66 | self.ic_redu_ = np.zeros((max(np.sum(self.folds_outer_ == 0), np.sum(self.folds_outer_ == 1))))
67 | ## if only two unique values in y, assume binary
68 | self.binary_ = (np.unique(y).shape[0] == 2)
69 | self.na_rm_ = na_rm
70 | self.ensemble_ = ensemble
71 |
72 | ## calculate the variable importance estimate
73 | def get_point_est(self):
74 | if self.pred_func_ is not None:
75 | predictiveness_func = cv_predictiveness
76 | this_full_func = self.pred_func_
77 | this_redu_func = self.pred_func_
78 | folds = None
79 | else:
80 | predictiveness_func = cv_predictiveness_precomputed
81 | this_full_func = self.f_
82 | this_redu_func = self.r_
83 | folds = None
84 | self.v_full_, self.preds_full_, ic_full, self.folds_inner_1, self.cc_1 = predictiveness_func(self.x_[self.folds_outer_ == 1, :], self.y_[self.folds_outer_ == 1], np.arange(self.p_), self.measure_, this_full_func, V = 1, stratified = self.binary_, na_rm = self.na_rm_, folds = folds, ensemble = self.ensemble_)
85 | self.v_redu_, self.preds_redu_, ic_redu, self.folds_inner_0, self.cc_0 = predictiveness_func(self.x_[self.folds_outer_ == 0, :], self.y_[self.folds_outer_ == 0], np.delete(np.arange(self.p_), self.s_), self.measure_, this_redu_func, V = 1, stratified = self.binary_, na_rm = self.na_rm_, folds = folds, ensemble = self.ensemble_)
86 | self.vimp_ = self.v_full_ - self.v_redu_
87 | self.ic_full_[:ic_full.shape[0]] = ic_full
88 | self.ic_redu_[:ic_redu.shape[0]] = ic_redu
89 | return self
90 |
91 | ## calculate the influence function
92 | def get_influence_function(self):
93 | self.ic_ = self.ic_full_ - self.ic_redu_
94 | return self
95 |
96 | ## calculate the standard error
97 | def get_se(self):
98 | self.se_full_ = np.sqrt(np.mean(self.ic_full_ ** 2)) / np.sqrt(self.ic_full_.shape[0])
99 | self.se_redu_ = np.sqrt(np.mean(self.ic_redu_ ** 2)) / np.sqrt(self.ic_redu_.shape[0])
100 | self.se_ = np.sqrt(np.mean(self.ic_ ** 2)) / np.sqrt(self.ic_.shape[0])
101 | return self
102 |
103 | ## calculate the ci based on the estimate and the standard error
104 | def get_ci(self, level = 0.95):
105 | ## get alpha from the level
106 | a = (1 - level) / 2.
107 | a = np.array([a, 1 - a])
108 | ## calculate the quantiles
109 | fac = norm.ppf(a)
110 | ## create cis for vimp, predictiveness
111 | self.ci_ = self.vimp_ + np.outer((self.se_), fac)
112 | self.ci_full_ = self.v_full_ + np.outer((self.se_full_), fac)
113 | self.ci_redu_ = self.v_redu_ + np.outer((self.se_redu_), fac)
114 | return self
115 |
116 | ## do a hypothesis test
117 | def hypothesis_test(self, alpha = 0.05, delta = 0):
118 | self.test_statistic_ = (self.v_full_ - self.v_redu_ - delta) / np.sqrt(self.se_full_ ** 2 + self.se_redu_ ** 2)
119 | self.p_value_ = 1 - norm.cdf(self.test_statistic_)
120 | self.hyp_test_ = self.p_value_ < alpha
121 | return(self)
122 |
--------------------------------------------------------------------------------
/vimpy/vimpy_utils.py:
--------------------------------------------------------------------------------
1 | ## utility functions
2 |
3 |
4 | def deprecated(func):
5 | '''This is a decorator which can be used to mark functions
6 | as deprecated. It will result in a warning being emitted
7 | when the function is used.'''
8 | import warnings
9 |
10 | def new_func(*args, **kwargs):
11 | warnings.warn("Call to deprecated function {}.".format(func.__name__),
12 | category=DeprecationWarning)
13 | return func(*args, **kwargs)
14 | new_func.__name__ = func.__name__
15 | new_func.__doc__ = func.__doc__
16 | new_func.__dict__.update(func.__dict__)
17 | return new_func
18 |
19 |
20 | ## get the measure function, IC function
21 | def get_measure_function(type):
22 | from .predictiveness_measures import r_squared, accuracy, auc, deviance
23 | if type == "r_squared":
24 | measure = r_squared
25 | elif type == "accuracy":
26 | measure = accuracy
27 | elif type == "auc":
28 | measure = auc
29 | elif type == "deviance":
30 | measure = deviance
31 | else:
32 | raise ValueError("We do not currently support the entered predictiveness measure. Please provide a different predictiveness measure.")
33 | return measure
34 |
35 |
36 | def choose(n, k):
37 | import math
38 | return int(math.factorial(n) / (math.factorial(k) * math.factorial(n - k)))
39 |
40 |
41 | def make_folds(y, V, stratified = True):
42 | """
43 | Create folds for CV (potentially stratified)
44 | """
45 | import numpy as np
46 | if stratified:
47 | y_1 = y == 1
48 | y_0 = y == 0
49 | folds_1 = np.resize(np.arange(V), sum(y_1))
50 | np.random.shuffle(folds_1)
51 | folds_0 = np.resize(np.arange(V), sum(y_0))
52 | np.random.shuffle(folds_0)
53 | folds = np.empty((y.shape[0]))
54 | folds[np.ravel(y_1)] = folds_1
55 | folds[np.ravel(y_0)] = folds_0
56 | else:
57 | folds = np.resize(np.arange(V), y.shape[0])
58 | np.random.shuffle(folds)
59 | return folds
60 |
61 |
62 | ## hypothesis testing with shapley values
63 | def shapley_hyp_test(vs_one_1, v_none_0, sigmas_one, sigma_none, delta = 0, level = 0.05, p = 3):
64 | """
65 | Hypothesis testing for Shapley values
66 |
67 | @param vs_one_1: one-feature measures of predictiveness
68 | @param v_none_0: null-model predictiveness
69 | @param sigmas_one: ses
70 | @param sigma_none: null-model se
71 | @param delta: value for testing
72 | @param level: significance level
73 |
74 | @return: test_statistics (the test statistics), p_vals (p-values), hyp_tests (the hypothesis testing results)
75 | """
76 | import numpy as np
77 | from scipy.stats import norm
78 |
79 | test_statistics = [(vs_one_1[v] - v_none_0 - delta) / (np.sqrt(sigmas_one[v] ** 2 + sigma_none ** 2)) for v in range(p)]
80 | p_values = 1. - norm.cdf(test_statistics)
81 | hyp_tests = p_values < level
82 | return test_statistics, p_values, hyp_tests
83 |
--------------------------------------------------------------------------------