├── .gitignore ├── LICENSE ├── README.html ├── README.md ├── docs └── vimpy_logo.png ├── setup.py └── vimpy ├── __init__.py ├── cv_vim.py ├── predictiveness_measures.py ├── spvim.py ├── spvim_ic.py ├── vim.py └── vimpy_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # pyc files 2 | # --------- 3 | *.pyc 4 | 5 | # all things for pypi 6 | # ------------------- 7 | dist/* 8 | build/* 9 | test/* 10 | vimpy.egg* 11 | olddist/* 12 | 13 | # virtual environment 14 | # ------------------- 15 | venv/ 16 | py3env 17 | README_cv.md 18 | 19 | # test scripts 20 | # ------------------- 21 | test_precompute_cv.py 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) [2018--2020] [Brian D. Williamson] 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Python/`vimpy`: inference on algorithm-agnostic variable importance 2 | 3 | [![PyPI version](https://badge.fury.io/py/vimpy.svg)](https://badge.fury.io/py/vimpy) 4 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 5 | 6 | 7 | **Software author:** [Brian Williamson](https://bdwilliamson.github.io/) 8 | 9 | **Methodology authors:** [Brian Williamson](https://bdwilliamson.github.io/), [Peter Gilbert](https://www.fredhutch.org/en/faculty-lab-directory/gilbert-peter.html), [Noah Simon](http://faculty.washington.edu/nrsimon/), [Marco Carone](http://faculty.washington.edu/mcarone/about.html) 10 | 11 | **R package:** https://github.com/bdwilliamson/vimp 12 | 13 | ## Introduction 14 | 15 | In predictive modeling applications, it is often of interest to determine the relative contribution of subsets of features in explaining an outcome; this is often called variable importance. It is useful to consider variable importance as a function of the unknown, underlying data-generating mechanism rather than the specific predictive algorithm used to fit the data. 
This package provides functions that, given fitted values from predictive algorithms, compute nonparametric estimates of variable importance based on $R^2$, deviance, classification accuracy, and area under the receiver operating characteristic curve, along with asymptotically valid confidence intervals for the true importance. 16 | 17 | For more details, please see the accompanying manuscripts "Nonparametric variable importance assessment using machine learning techniques" by Williamson, Gilbert, Carone, and Simon (*Biometrics*, 2020), ["A unified approach for inference on algorithm-agnostic variable importance"](https://arxiv.org/abs/2004.03683) by Williamson, Gilbert, Simon, and Carone (*arXiv*, 2020), and ["Efficient nonparametric statistical inference on population feature importance using Shapley values"](https://arxiv.org/abs/2006.09481) by Williamson and Feng (*arXiv*, 2020; to appear in the Proceedings of the Thirty-seventh International Conference on Machine Learning [ICML 2020]). 18 | 19 | ## Installation 20 | 21 | You may install a stable release of `vimpy` using `pip` by running `pip install vimpy` from a Terminal window. Alternatively, you may install within a `virtualenv` environment. 22 | 23 | You may install the current dev release of `vimpy` by downloading this repository directly. 24 | 25 | ## Issues 26 | 27 | If you encounter any bugs or have any specific feature requests, please [file an issue](https://github.com/bdwilliamson/vimpy/issues). 28 | 29 | ## Example 30 | 31 | This example shows how to use `vimpy` in a simple setting with simulated data and a single regression function. For more examples and detailed explanation, please see the [`R` vignette](https://bdwilliamson.github.io/vimp/articles/introduction_to_vimp.html).
32 | 33 | ```python 34 | ## load required libraries 35 | import numpy as np 36 | import vimpy 37 | from sklearn.ensemble import GradientBoostingRegressor 38 | from sklearn.model_selection import GridSearchCV 39 | 40 | ## ------------------------------------------------------------- 41 | ## problem setup 42 | ## ------------------------------------------------------------- 43 | ## define a function for the conditional mean of Y given X 44 | def cond_mean(x = None): 45 | f1 = np.where(np.logical_and(-2 <= x[:, 0], x[:, 0] < 2), np.floor(x[:, 0]), 0) 46 | f2 = np.where(x[:, 1] <= 0, 1, 0) 47 | f3 = np.where(x[:, 2] > 0, 1, 0) 48 | f6 = np.absolute(x[:, 5]/4) ** 3 49 | f7 = np.absolute(x[:, 6]/4) ** 5 50 | f11 = (7./3)*np.cos(x[:, 10]/2) 51 | ret = f1 + f2 + f3 + f6 + f7 + f11 52 | return ret 53 | 54 | ## create data 55 | np.random.seed(4747) 56 | n = 100 57 | p = 15 58 | s = 1 # importance desired for X_1 59 | x = np.zeros((n, p)) 60 | for i in range(0, x.shape[1]) : 61 | x[:,i] = np.random.normal(0, 2, n) 62 | 63 | y = cond_mean(x) + np.random.normal(0, 1, n) 64 | 65 | ## ------------------------------------------------------------- 66 | ## preliminary step: get regression estimators 67 | ## ------------------------------------------------------------- 68 | ## use grid search to get optimal number of trees and learning rate 69 | ntrees = np.arange(100, 500, 100) 70 | lr = np.arange(.01, .1, .05) 71 | 72 | param_grid = [{'n_estimators':ntrees, 'learning_rate':lr}] 73 | 74 | ## set up cv objects 75 | cv_full = GridSearchCV(GradientBoostingRegressor(loss = 'ls', max_depth = 1), param_grid = param_grid, cv = 5) 76 | cv_small = GridSearchCV(GradientBoostingRegressor(loss = 'ls', max_depth = 1), param_grid = param_grid, cv = 5) 77 | 78 | ## ------------------------------------------------------------- 79 | ## get variable importance estimates 80 | ## ------------------------------------------------------------- 81 | # set seed 82 | np.random.seed(12345) 83 | ## set up the vimp object 84 | vimp = vimpy.vim(y = y, x = x, s = 1, pred_func = cv_full, measure_type = "r_squared") 85 | ## get the point estimate of variable importance 86 | vimp.get_point_est() 87 | ## get the influence function estimate 88 | vimp.get_influence_function() 89 | ## get a standard error 90 | vimp.get_se() 91 | ## get a confidence interval 92 | vimp.get_ci() 93 | ## do a hypothesis test, compute p-value 94 | vimp.hypothesis_test(alpha = 0.05, delta = 0) 95 | ## display the estimates, etc. 
96 | vimp.vimp_ 97 | vimp.se_ 98 | vimp.ci_ 99 | vimp.p_value_ 100 | vimp.hyp_test_ 101 | 102 | ## ------------------------------------------------------------- 103 | ## using precomputed fitted values 104 | ## ------------------------------------------------------------- 105 | np.random.seed(12345) 106 | folds_outer = np.random.choice(a = np.arange(2), size = n, replace = True, p = np.array([0.5, 0.5])) 107 | ## fit the full regression 108 | cv_full.fit(x[folds_outer == 1, :], y[folds_outer == 1]) 109 | full_fit = cv_full.best_estimator_.predict(x[folds_outer == 1, :]) 110 | 111 | ## fit the reduced regression 112 | x_small = np.delete(x[folds_outer == 0, :], s, 1) # delete the columns in s 113 | cv_small.fit(x_small, y[folds_outer == 0]) 114 | small_fit = cv_small.best_estimator_.predict(x_small) 115 | ## get variable importance estimates 116 | np.random.seed(12345) 117 | vimp_precompute = vimpy.vim(y = y, x = x, s = 1, f = full_fit, r = small_fit, measure_type = "r_squared", folds = folds_outer) 118 | ## get the point estimate of variable importance 119 | vimp_precompute.get_point_est() 120 | ## get the influence function estimate 121 | vimp_precompute.get_influence_function() 122 | ## get a standard error 123 | vimp_precompute.get_se() 124 | ## get a confidence interval 125 | vimp_precompute.get_ci() 126 | ## do a hypothesis test, compute p-value 127 | vimp_precompute.hypothesis_test(alpha = 0.05, delta = 0) 128 | ## display the estimates, etc. 129 | vimp_precompute.vimp_ 130 | vimp_precompute.se_ 131 | vimp_precompute.ci_ 132 | vimp_precompute.p_value_ 133 | vimp_precompute.hyp_test_ 134 | 135 | ## ------------------------------------------------------------- 136 | ## get variable importance estimates using cross-validation 137 | ## ------------------------------------------------------------- 138 | np.random.seed(12345) 139 | ## set up the vimp object 140 | vimp_cv = vimpy.cv_vim(y = y, x = x, s = 1, pred_func = cv_full, V = 5, measure_type = "r_squared") 141 | ## get the point estimate 142 | vimp_cv.get_point_est() 143 | ## get the standard error 144 | vimp_cv.get_influence_function() 145 | vimp_cv.get_se() 146 | ## get a confidence interval 147 | vimp_cv.get_ci() 148 | ## do a hypothesis test, compute p-value 149 | vimp_cv.hypothesis_test(alpha = 0.05, delta = 0) 150 | ## display estimates, etc. 151 | vimp_cv.vimp_ 152 | vimp_cv.se_ 153 | vimp_cv.ci_ 154 | vimp_cv.p_value_ 155 | vimp_cv.hyp_test_ 156 | ``` 157 | 158 | ## Logo 159 | 160 | The logo was created using [hexSticker](https://github.com/GuangchuangYu/hexSticker), [lisa](https://github.com/tyluRp/lisa), and a [python image](https://svgsilh.com/image/145410.html) distributed under the CC0 license. Many thanks to the maintainers of these packages and the [Color Lisa](https://lisa-pkg.netlify.app/) team. 
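
## Example with a binary outcome

The measures listed in the introduction ("auc", "accuracy", "deviance") are intended for binary outcomes. The snippet below is a minimal sketch rather than one of the package's documented examples: the binary outcome `y_bin`, its data-generating mechanism, and the choice of scikit-learn's `GradientBoostingClassifier` are illustrative assumptions; the `vimpy.vim` calls mirror the `r_squared` example above, swapping in `measure_type = "auc"`.

```python
## minimal sketch: importance of feature index 1, measured via AUC
## (the data-generating mechanism and learner below are illustrative assumptions)
import numpy as np
import vimpy
from sklearn.ensemble import GradientBoostingClassifier

## simulate a binary outcome
np.random.seed(4747)
n, p = 500, 15
x = np.random.normal(0, 2, (n, p))
prob = 1. / (1. + np.exp(-(x[:, 0] + 0.5 * x[:, 1])))
y_bin = np.random.binomial(1, prob, n)

## same workflow as the r_squared example above, with a classifier and the AUC measure
vimp_auc = vimpy.vim(y = y_bin, x = x, s = 1, pred_func = GradientBoostingClassifier(max_depth = 1), measure_type = "auc")
vimp_auc.get_point_est()
vimp_auc.get_influence_function()
vimp_auc.get_se()
vimp_auc.get_ci()
vimp_auc.hypothesis_test(alpha = 0.05, delta = 0)
## display the estimate, confidence interval, and p-value
vimp_auc.vimp_
vimp_auc.ci_
vimp_auc.p_value_
```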
161 | -------------------------------------------------------------------------------- /docs/vimpy_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bdwilliamson/vimpy/681eb21e1ff1141dc9fbaa35261e24dd17296857/docs/vimpy_logo.png -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r") as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name="vimpy", 8 | version="2.1.1", 9 | author="Brian Williamson", 10 | author_email="brianw26@uw.edu", 11 | description="vimpy: perform inference on algorithm-agnostic variable importance in python", 12 | license='MIT', 13 | long_description=long_description, 14 | long_description_content_type="text/markdown", 15 | url="https://github.com/bdwilliamson/vimpy", 16 | packages=setuptools.find_packages(), 17 | install_requires=[ 18 | 'numpy', 19 | 'scipy' 20 | ], 21 | classifiers=( 22 | "Programming Language :: Python :: 3.3", 23 | "License :: OSI Approved :: MIT License", 24 | "Operating System :: OS Independent", 25 | ), 26 | ) 27 | -------------------------------------------------------------------------------- /vimpy/__init__.py: -------------------------------------------------------------------------------- 1 | # __init__.py 2 | from .vim import vim 3 | from .cv_vim import cv_vim 4 | from .spvim import spvim 5 | from .spvim_ic import shapley_influence_function, shapley_se 6 | from .predictiveness_measures import * 7 | from .vimpy_utils import * 8 | name="vimpy" 9 | -------------------------------------------------------------------------------- /vimpy/cv_vim.py: -------------------------------------------------------------------------------- 1 | ## Python class for cross-validated estimates of variable importance 2 | ## compute estimates and confidence intervals, do hypothesis testing 3 | 4 | ## import required libraries 5 | import numpy as np 6 | from scipy.stats import norm 7 | from .predictiveness_measures import cv_predictiveness, cv_predictiveness_precomputed 8 | from .vimpy_utils import get_measure_function 9 | 10 | 11 | class cv_vim: 12 | 13 | ## define initialization values 14 | """ 15 | @param y the outcome 16 | @param x the feature data 17 | @param s the feature group of interest 18 | @param measure_type the predictiveness measure to use (for now, one of "r_squared", "auc", "accuracy", "deviance") 19 | @param V the number of cross-fitting folds (defaults to 5) 20 | @param pred_func the function that predicts outcome given features 21 | @param ensemble is pred_func an ensemble (True) or a single function (False, default) 22 | @param f fitted values from regression of outcome on all features (only used if pred_func is not specified) 23 | @param r fitted values from regression of outcome on reduced set of features (only used if pred_func is not specified) 24 | @param folds a list of length 3: outer folds, for hypothesis testing; inner folds based on the outer folds == 1 (for cross-fitting); inner folds based on outer folds == 0 (for cross-fitting) 25 | @param na_rm remove NAs prior to computing predictiveness? 
(defaults to False) 26 | """ 27 | def __init__(self, y, x, s, measure_type, V = 5, pred_func = None, ensemble = False, f = None, r = None, folds = None, na_rm = False): 28 | self.y_ = y 29 | self.x_ = x 30 | self.s_ = s 31 | self.n_ = y.shape[0] 32 | self.p_ = x.shape[1] 33 | self.pred_func_ = pred_func 34 | self.f_ = f 35 | self.r_ = r 36 | assert (pred_func is not None or (f is not None and r is not None)) 37 | self.V_ = V 38 | self.measure_type_ = measure_type 39 | self.measure_ = get_measure_function(measure_type) 40 | self.vimp_ = [] 41 | self.se_ = [] 42 | self.ci_ = [] 43 | self.hyp_test_ = [] 44 | self.test_statistic_ = [] 45 | self.p_value_ = [] 46 | self.v_full_ = [] 47 | self.preds_full_ = [] 48 | self.v_redu_ = [] 49 | self.preds_redu_ = [] 50 | self.se_full_ = [] 51 | self.se_redu_ = [] 52 | self.ci_full_ = [] 53 | self.ci_redu_ = [] 54 | ## set up outer folds for hypothesis testing 55 | ## set up outer folds for hypothesis testing 56 | if folds is None: 57 | self.folds_outer_ = np.random.choice(a = np.arange(2), size = self.n_, replace = True, p = np.array([0.5, 0.5])) 58 | self.folds_inner_1 = [] 59 | self.folds_inner_0 = [] 60 | else: 61 | assert (f is not None) 62 | self.folds_outer_ = folds[0] 63 | self.folds_inner_1 = folds[1] 64 | self.folds_inner_0 = folds[2] 65 | self.ic_ = np.zeros((max(np.sum(self.folds_outer_ == 0), np.sum(self.folds_outer_ == 1)))) 66 | self.ic_full_ = np.zeros((max(np.sum(self.folds_outer_ == 0), np.sum(self.folds_outer_ == 1)))) 67 | self.ic_redu_ = np.zeros((max(np.sum(self.folds_outer_ == 0), np.sum(self.folds_outer_ == 1)))) 68 | ## if only two unique values in y, assume binary 69 | self.binary_ = (np.unique(y).shape[0] == 2) 70 | self.na_rm_ = na_rm 71 | self.ensemble_ = ensemble 72 | 73 | ## calculate the plug-in estimator 74 | def get_point_est(self): 75 | if self.pred_func_ is not None: 76 | predictiveness_func = cv_predictiveness 77 | this_full_func = self.pred_func_ 78 | this_redu_func = self.pred_func_ 79 | folds_1 = None 80 | folds_0 = None 81 | else: 82 | predictiveness_func = cv_predictiveness_precomputed 83 | this_full_func = self.f_ 84 | this_redu_func = self.r_ 85 | folds_1 = self.folds_inner_1 86 | folds_0 = self.folds_inner_0 87 | self.v_full_, self.preds_full_, ic_full, self.folds_inner_1, self.cc_1 = predictiveness_func(self.x_[self.folds_outer_ == 1, :], self.y_[self.folds_outer_ == 1], np.arange(self.p_), self.measure_, this_full_func, V = self.V_, stratified = self.binary_, na_rm = self.na_rm_, folds = folds_1, ensemble = self.ensemble_) 88 | self.v_redu_, self.preds_redu_, ic_redu, self.folds_inner_0, self.cc_0 = predictiveness_func(self.x_[self.folds_outer_ == 0, :], self.y_[self.folds_outer_ == 0], np.delete(np.arange(self.p_), self.s_), self.measure_, this_redu_func, V = self.V_, stratified = self.binary_, na_rm = self.na_rm_, folds = folds_0, ensemble = self.ensemble_) 89 | self.vimp_ = self.v_full_ - self.v_redu_ 90 | self.ic_full_[:ic_full.shape[0]] = ic_full 91 | self.ic_redu_[:ic_redu.shape[0]] = ic_redu 92 | return self 93 | 94 | ## calculate the influence function 95 | def get_influence_function(self): 96 | self.ic_ = self.ic_full_ - self.ic_redu_ 97 | return self 98 | 99 | ## calculate the standard error 100 | def get_se(self): 101 | self.se_full_ = np.sqrt(np.mean(self.ic_full_ ** 2)) / np.sqrt(self.ic_full_.shape[0]) 102 | self.se_redu_ = np.sqrt(np.mean(self.ic_redu_ ** 2)) / np.sqrt(self.ic_redu_.shape[0]) 103 | self.se_ = np.sqrt(np.mean(self.ic_ ** 2)) / np.sqrt(self.ic_.shape[0]) 104 | return self 105 | 
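    ## note: get_se() above computes influence-function-based standard errors,
    ## se = sqrt(mean(ic ** 2) / n), separately for the full and reduced predictiveness
    ## and for their difference; get_ci() below then forms the corresponding Wald-type
    ## intervals, estimate + norm.ppf([(1 - level) / 2, (1 + level) / 2]) * se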
106 | ## calculate the ci based on the estimate and the standard error 107 | def get_ci(self, level = 0.95): 108 | ## get alpha from the level 109 | a = (1 - level) / 2. 110 | a = np.array([a, 1 - a]) 111 | ## calculate the quantiles 112 | fac = norm.ppf(a) 113 | ## create cis for vimp, predictiveness 114 | self.ci_ = self.vimp_ + np.outer((self.se_), fac) 115 | self.ci_full_ = self.v_full_ + np.outer((self.se_full_), fac) 116 | self.ci_redu_ = self.v_redu_ + np.outer((self.se_redu_), fac) 117 | return self 118 | 119 | ## do a hypothesis test 120 | def hypothesis_test(self, alpha = 0.05, delta = 0): 121 | self.test_statistic_ = (self.v_full_ - self.v_redu_ - delta) / np.sqrt(self.se_full_ ** 2 + self.se_redu_ ** 2) 122 | self.p_value_ = 1 - norm.cdf(self.test_statistic_) 123 | self.hyp_test_ = self.p_value_ < alpha 124 | return(self) 125 | -------------------------------------------------------------------------------- /vimpy/predictiveness_measures.py: -------------------------------------------------------------------------------- 1 | ## Compute predictiveness measures and their corresponding influence functions 2 | 3 | 4 | # general cv predictiveness 5 | def cv_predictiveness(x, y, S, measure, pred_func, V = 5, stratified = True, na_rm = False, folds = None, ensemble = False): 6 | """ 7 | Compute a cross-validated measure of predictiveness based on the data and the chosen measure 8 | 9 | @param x: the features 10 | @param y: the outcome 11 | @param S: the covariates to fit 12 | @param measure: measure of predictiveness 13 | @param pred_func: function that fits to the data 14 | @param V: the number of CV folds 15 | @param stratified: should the folds be stratified? 16 | @param na_rm: should we do a complete-case analysis (True) or not (False) 17 | @param folds (dummy) 18 | @param ensemble is this an ensemble (True) or not (False) 19 | 20 | @return cross-validated measure of predictiveness, along with preds and ics 21 | """ 22 | import numpy as np 23 | from .vimpy_utils import make_folds 24 | ## if na_rm = True, do a complete-case analysis 25 | if na_rm: 26 | xs = x[:, S] 27 | cc = np.sum(np.isnan(xs), axis = 1) == 0 28 | newx = x[cc, :] 29 | newy = y[cc] 30 | else: 31 | cc = np.repeat(True, x.shape[0]) 32 | newx = x 33 | newy = y 34 | ## set up CV folds 35 | folds = make_folds(newy, V, stratified = stratified) 36 | ## do CV 37 | preds = np.empty((y.shape[0],)) 38 | preds.fill(np.nan) 39 | ics = np.empty((y.shape[0],)) 40 | ics.fill(np.nan) 41 | vs = np.empty((V,)) 42 | cc_cond = np.flatnonzero(cc) 43 | if V == 1: 44 | x_train, y_train = newx, newy 45 | pred_func.fit(x_train[:, S], np.ravel(y_train)) 46 | if ensemble: 47 | preds_v = np.mean(pred_func.transform(x_train[:, S])) 48 | else: 49 | try: 50 | preds_v = pred_func.predict_proba(x_train[:, S])[:, 1] 51 | except AttributeError: 52 | preds_v = pred_func.predict(x_train[:, S]) 53 | 54 | preds[cc_cond] = preds_v 55 | vs[0] = measure(y_train, preds_v) 56 | ics[cc_cond] = compute_ic(y_train, preds_v, measure.__name__) 57 | else: 58 | for v in range(V): 59 | fold_cond = np.flatnonzero(folds == v) 60 | x_train, y_train = newx[folds != v, :], newy[folds != v] 61 | x_test, y_test = newx[folds == v, :], newy[folds == v] 62 | pred_func.fit(x_train[:, S], np.ravel(y_train)) 63 | if ensemble: 64 | preds_v = np.mean(pred_func.transform(x_test[:, S])) 65 | else: 66 | try: 67 | preds_v = pred_func.predict_proba(x_test[:, S])[:, 1] 68 | except AttributeError: 69 | preds_v = pred_func.predict(x_test[:, S]) 70 | 71 | preds[cc_cond[fold_cond]] = preds_v 
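            ## evaluate the predictiveness measure on the held-out fold and store its
            ## influence-function contributions (used downstream to construct standard errors)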
72 | vs[v] = measure(y_test, preds_v) 73 | ics[cc_cond[fold_cond]] = compute_ic(y_test, preds_v, measure.__name__) 74 | return np.mean(vs), preds, ics, folds, cc 75 | 76 | 77 | # general predictiveness based on precomputed fits 78 | def cv_predictiveness_precomputed(x, y, S, measure, f, V = 5, stratified = True, folds = None, na_rm = False, ensemble = False): 79 | """ 80 | Compute a cross-validated measure of predictiveness based on the data, the chosen measure, and the sets of fitted values f and r 81 | 82 | @param x: the features 83 | @param y: the outcome 84 | @param S: the covariates to fit 85 | @param measure: measure of predictiveness 86 | @param f: fitted values based on S 87 | @param V: the number of CV folds 88 | @param stratified: should the folds be stratified? 89 | @param folds: the CV folds 90 | @param na_rm: should we do a complete-case analysis (True) or not (False) 91 | @param ensemble: is this an ensemble or not (dummy) 92 | 93 | @return cross-validated measure of predictiveness, along with preds and ics 94 | """ 95 | import numpy as np 96 | from .vimpy_utils import make_folds 97 | ## if na_rm = True, do a complete-case analysis 98 | if na_rm: 99 | xs = x[:, S] 100 | cc = np.sum(np.isnan(xs), axis = 1) == 0 101 | newy = y[cc] 102 | else: 103 | cc = np.repeat(True, x.shape[0]) 104 | newy = y 105 | ## set up CV folds 106 | if folds is None: 107 | folds = make_folds(newy, V, stratified = stratified) 108 | ## do CV 109 | preds = np.empty((y.shape[0],)) 110 | preds.fill(np.nan) 111 | ics = np.empty((y.shape[0],)) 112 | ics.fill(np.nan) 113 | vs = np.empty((V,)) 114 | cc_cond = np.flatnonzero(cc) 115 | if V == 1: 116 | y_train = newy 117 | preds_v = f 118 | preds[cc_cond] = preds_v[cc_cond] 119 | vs[0] = measure(y_train, preds_v) 120 | ics[cc_cond] = compute_ic(y_train, preds_v, measure.__name__) 121 | else: 122 | for v in range(V): 123 | fold_cond = np.flatnonzero(folds == v) 124 | y_test = newy[folds == v] 125 | preds_v = f[folds == v] 126 | preds[cc_cond[fold_cond]] = preds_v 127 | vs[v] = measure(y_test, preds_v) 128 | ics[cc_cond[fold_cond]] = compute_ic(y_test, preds_v, measure.__name__) 129 | return np.mean(vs), preds, ics, folds, cc 130 | 131 | 132 | def accuracy(y, preds): 133 | """ 134 | Compute accuracy for a given set of predictions and outcomes 135 | 136 | @param y: the outcome 137 | @param preds: the predictions based on a subset of features 138 | 139 | @return the accuracy 140 | """ 141 | import sklearn.metrics as skm 142 | 143 | if len(preds.shape) == 2: 144 | if preds.shape[1] > 1: 145 | return [1. - skm.zero_one_loss(y_true = y, y_pred = preds[:, i], normalize = True) for i in range(preds.shape[1])] 146 | else: 147 | return 1. - skm.zero_one_loss(y_true = y, y_pred = preds, normalize = True) 148 | else: 149 | return 1. 
- skm.zero_one_loss(y_true = y, y_pred = preds, normalize = True) 150 | 151 | 152 | def auc(y, preds, *args, **kwargs): 153 | """ 154 | Compute AUC for a given set of predictions and outcomes 155 | 156 | @param y: the outcome 157 | @param preds: the predictions based on a given subset of features 158 | 159 | @return the AUC 160 | """ 161 | import sklearn.metrics as skm 162 | 163 | if len(preds.shape) == 2: 164 | if preds.shape[1] > 1: 165 | return [skm.roc_auc_score(y_true = y, y_score = preds[:, i], average = "micro") for i in range(preds.shape[1])] 166 | else: 167 | return skm.roc_auc_score(y_true = y, y_score = preds, average = "micro") 168 | else: 169 | return skm.roc_auc_score(y_true = y, y_score = preds, average = "micro") 170 | 171 | 172 | def cross_entropy(y, preds): 173 | """ 174 | Compute cross-entropy for a given set of predictions and outcomes 175 | 176 | @param y: the outcome 177 | @param preds: the predictions based on a subset of features 178 | 179 | @return the cross-entropy 180 | """ 181 | import sklearn.metrics as skm 182 | 183 | if len(preds.shape) == 2: 184 | if preds.shape[1] > 1: 185 | return [(-2) * skm.log_loss(y_true = y, y_pred = preds[:, i], normalize = True) for i in range(preds.shape[1])] 186 | else: 187 | return (-2) * skm.log_loss(y_true = y, y_pred = preds, normalize = True) 188 | else: 189 | return (-2) * skm.log_loss(y_true = y, y_pred = preds, normalize = True) 190 | 191 | 192 | def deviance(y, preds): 193 | """ 194 | Compute deviance for a given set of predictions and outcomes 195 | 196 | @param y: the outcome 197 | @param preds: the predictions based on a subset of features 198 | 199 | @return the deviance 200 | """ 201 | import sklearn.metrics as skm 202 | import numpy as np 203 | denom = (-1) * np.sum(np.log(np.mean(y, axis = 0))) 204 | 205 | if len(preds.shape) == 2: 206 | if preds.shape[1] > 1: 207 | return [(-2) * skm.log_loss(y_true = y, y_pred = preds[:, i], normalize = True) / denom for i in range(preds.shape[1])] 208 | else: 209 | return (-2) * skm.log_loss(y_true = y, y_pred = preds, normalize = True) / denom 210 | else: 211 | return (-2) * skm.log_loss(y_true = y, y_pred = preds, normalize = True) / denom 212 | 213 | 214 | def r_squared(y, preds): 215 | """ 216 | Compute R^s for a given set of predictions and outcomes 217 | 218 | @param y: the outcome 219 | @param preds: the predictions based on a given subset of features 220 | 221 | @return the R^2 222 | """ 223 | import sklearn.metrics as skm 224 | import numpy as np 225 | var = np.mean((y - np.mean(y)) ** 2) 226 | 227 | if len(preds.shape) == 2: 228 | if preds.shape[1] > 1: 229 | return [1. - skm.mean_squared_error(y_true = y, y_pred = preds[:, i]) / var for i in range(preds.shape[1])] 230 | else: 231 | return 1. - skm.mean_squared_error(y_true = y, y_pred = preds) / var 232 | else: 233 | return 1. 
- skm.mean_squared_error(y_true = y, y_pred = preds) / var 234 | 235 | 236 | ## ------------------------------------------------------------------ 237 | ## influence functions 238 | ## ------------------------------------------------------------------ 239 | def compute_ic(y, preds, measure): 240 | """ 241 | Compute IC based on the given measure 242 | 243 | @param y: the outcome 244 | @param preds: the predictions based on the current subset of features 245 | @param measure: the predictiveness measure 246 | 247 | @return an n-vector of the IC for the given predictiveness measure 248 | """ 249 | 250 | ## do the correct thing 251 | if measure == "accuracy": 252 | return accuracy_ic(y, preds) 253 | elif measure == "auc": 254 | return auc_ic(y, preds) 255 | elif measure == "cross_entropy": 256 | return cross_entropy_ic(y, preds) 257 | elif measure == "deviance": 258 | return deviance_ic(y, preds) 259 | elif measure == "r_squared": 260 | return r_squared_ic(y, preds) 261 | else: 262 | raise ValueError("We do not currently support the entered predictiveness measure. Please provide a different predictiveness measure.") 263 | 264 | 265 | def accuracy_ic(y, preds): 266 | """ 267 | Compute the IC for accuracy 268 | 269 | @param y: the outcome 270 | @param preds: the predictions based on a given subset of features 271 | 272 | @return the IC for accuracy 273 | """ 274 | import numpy as np 275 | if len(preds.shape) == 2: 276 | if preds.shape[1] > 1: 277 | return np.array([one_accuracy_ic(y, preds[:, m]) for m in range(preds.shape[1])]) 278 | else: 279 | return np.array([one_accuracy_ic(y, preds)]) 280 | else: 281 | return np.array([one_accuracy_ic(y, preds)]) 282 | 283 | 284 | def one_accuracy_ic(y, preds): 285 | """ 286 | Compute the IC for one accuracy 287 | 288 | @param y: the outcome 289 | @param preds: the predictions based on a given subset of features 290 | 291 | @return the IC for accuracy 292 | """ 293 | import sklearn.metrics as skm 294 | 295 | misclassification = skm.zero_one_loss(y_true = y, y_pred = preds, normalize = True) 296 | return (-1) * (((preds > 1. 
/ 2) != y) - misclassification) 297 | 298 | 299 | def auc_ic(y, preds): 300 | """ 301 | Compute the IC for AUC 302 | 303 | @param y: the outcome 304 | @param preds: the predictions based on a given subset of features 305 | 306 | @return the IC for AUC 307 | """ 308 | import numpy as np 309 | if len(preds.shape) == 2: 310 | if preds.shape[1] > 1: 311 | return np.array([one_auc_ic(y, preds[:, m]) for m in range(preds.shape[1])]) 312 | else: 313 | return np.array([one_auc_ic(y, preds)]) 314 | else: 315 | return np.array([one_auc_ic(y, preds)]) 316 | 317 | 318 | def one_auc_ic(y, preds): 319 | """ 320 | Compute the IC for one AUC 321 | 322 | @param y: the outcome 323 | @param preds: the predictions based on a given subset of features 324 | 325 | @return the IC for AUC 326 | """ 327 | import numpy as np 328 | import sklearn.metrics as skm 329 | 330 | p_1 = np.mean(y) 331 | p_0 = 1 - p_1 332 | 333 | sens = np.array([np.mean(preds[(y == 0).reshape(preds.shape)] < x) for x in preds]) 334 | spec = np.array([np.mean(preds[(y == 1).reshape(preds.shape)] > x) for x in preds]) 335 | 336 | contrib_1 = (y == 1).reshape(preds.shape) / p_1 * sens 337 | contrib_0 = (y == 0).reshape(preds.shape) / p_0 * spec 338 | 339 | auc = skm.roc_auc_score(y_true = y, y_score = preds, average = "micro") 340 | return contrib_1 + contrib_0 - ((y == 0).reshape(preds.shape) / p_0 + (y == 1).reshape(preds.shape) / p_1) * auc 341 | 342 | 343 | def cross_entropy_ic(y, preds): 344 | """ 345 | Compute the IC for cross-entropy 346 | 347 | @param y: the outcome 348 | @param preds: the predictions based on a given subset of features 349 | 350 | @return the IC for cross-entropy 351 | """ 352 | import numpy as np 353 | if len(preds.shape) == 2: 354 | if preds.shape[1] > 1: 355 | return np.array([one_cross_entropy_ic(y, preds[:, m]) for m in range(preds.shape[1])]) 356 | else: 357 | return np.array([one_cross_entropy_ic(y, preds)]) 358 | else: 359 | return np.array([one_cross_entropy_ic(y, preds)]) 360 | 361 | 362 | def one_cross_entropy_ic(y, preds): 363 | """ 364 | Compute the IC for one cross-entropy 365 | 366 | @param y: the outcome 367 | @param preds: the predictions based on a given subset of features 368 | 369 | @return the IC for cross-entropy 370 | """ 371 | import sklearn.metrics as skm 372 | import numpy as np 373 | cross_entropy = (-2) * skm.log_loss(y_true = y, y_pred = preds, normalize = True) 374 | ic_cross_entropy = (-2) * np.sum(y * np.log(preds), axis = 1) - cross_entropy 375 | return ic_cross_entropy 376 | 377 | 378 | def deviance_ic(y, preds): 379 | """ 380 | Compute the IC for deviance 381 | 382 | @param y: the outcome 383 | @param preds: the predictions based on a given subset of features 384 | 385 | @return the IC for deviance 386 | """ 387 | import numpy as np 388 | if len(preds.shape) == 2: 389 | if preds.shape[1] > 1: 390 | return np.array([one_deviance_ic(y, preds[:, m]) for m in range(preds.shape[1])]) 391 | else: 392 | return np.array([one_deviance_ic(y, preds)]) 393 | else: 394 | return np.array([one_deviance_ic(y, preds)]) 395 | 396 | 397 | def one_deviance_ic(y, preds): 398 | """ 399 | Compute the IC for one deviance 400 | 401 | @param y: the outcome 402 | @param preds: the predictions based on a given subset of features 403 | 404 | @return the IC for deviance 405 | """ 406 | import sklearn.metrics as skm 407 | import numpy as np 408 | cross_entropy = (-2) * skm.log_loss(y_true = y, y_pred = preds, normalize = True) 409 | p = np.mean(y, axis = 0) 410 | denom = (-1) * np.sum(np.log(p)) 411 | 
ic_cross_entropy = (-2) * np.sum(y * np.log(preds), axis = 1) - cross_entropy 412 | ic_denom = ((-1.) / p) * ((y == 1) - p) 413 | grad = np.array([1. / denom, (-1) * cross_entropy / (denom ** 2)]) 414 | return np.dot(grad, np.stack((ic_cross_entropy, ic_denom))) 415 | 416 | 417 | def r_squared_ic(y, preds): 418 | """ 419 | Compute the IC for R-squared 420 | 421 | @param y: the outcome 422 | @param preds: the predictions based on a given subset of features 423 | 424 | @return the IC for R-squared 425 | """ 426 | import numpy as np 427 | if len(preds.shape) == 2: 428 | if preds.shape[1] > 1: 429 | return np.array([one_r2_ic(y, preds[:, m]) for m in range(preds.shape[1])]) 430 | else: 431 | return np.array([one_r2_ic(y, preds)]) 432 | else: 433 | return np.array([one_r2_ic(y, preds)]) 434 | 435 | 436 | def one_r2_ic(y, preds): 437 | """ 438 | Compute the IC for one R-squared 439 | 440 | @param y: the outcome 441 | @param preds: the predictions based on a given subset of features 442 | 443 | @return the IC for R-squared 444 | """ 445 | import sklearn.metrics as skm 446 | import numpy as np 447 | y_flat = np.ravel(y) 448 | mse = skm.mean_squared_error(y_true = y_flat, y_pred = preds) 449 | var = np.mean((y_flat - np.mean(y_flat)) ** 2) 450 | ic_mse = (y_flat - preds) ** 2 - mse 451 | ic_var = (y_flat - np.mean(y_flat)) ** 2 - var 452 | grad = np.array([1. / var, (-1) * mse / (var ** 2)]) 453 | return np.dot(grad, np.stack((ic_mse, ic_var))) 454 | -------------------------------------------------------------------------------- /vimpy/spvim.py: -------------------------------------------------------------------------------- 1 | ## Python class for estimates of Shapley population variable importance 2 | ## compute estimates and confidence intervals, do hypothesis testing 3 | 4 | ## import required libraries 5 | import numpy as np 6 | from scipy.stats import norm 7 | from .predictiveness_measures import cv_predictiveness, compute_ic 8 | from .spvim_ic import shapley_influence_function, shapley_se 9 | from .vimpy_utils import get_measure_function, choose, shapley_hyp_test 10 | 11 | 12 | class spvim: 13 | 14 | ## define initialization values 15 | """ 16 | @param y the outcome 17 | @param x the feature data 18 | @param measure_type the predictiveness measure to use (a function) 19 | @param V the number of cross-validation folds 20 | @param pred_func the function that predicts outcome given features 21 | @param ensemble is pred_func an ensemble (True) or a single function (False, default) 22 | @param na_rm remove NAs prior to computing predictiveness? 
(defaults to False) 23 | """ 24 | def __init__(self, y, x, measure_type, V, pred_func, ensemble = False, na_rm = False): 25 | self.y_ = y 26 | self.x_ = x 27 | self.n_ = y.shape[0] 28 | self.p_ = x.shape[1] 29 | self.pred_func_ = pred_func 30 | self.V_ = V 31 | self.measure_type_ = measure_type 32 | self.measure_ = get_measure_function(measure_type) 33 | self.ics_ = [] 34 | self.vimp_ = [] 35 | self.lambdas_ = [] 36 | self.ses_ = [] 37 | self.cis_ = [] 38 | self.na_rm_ = na_rm 39 | self.Z_ = [] 40 | self.z_counts_ = [] 41 | self.v_ = [] 42 | self.v_ics_ = [] 43 | self.W_ = [] 44 | self.gamma_ = [] 45 | self.test_statistics_ = [] 46 | self.p_values_ = [] 47 | self.hyp_tests_ = [] 48 | self.G_ = np.vstack((np.append(1, np.zeros(self.p_)), np.ones(self.p_ + 1) - np.append(1, np.zeros(self.p_)))) 49 | ## set up outer folds for hypothesis testing 50 | self.folds_outer_ = np.random.choice(a = np.arange(2), size = self.n_, replace = True, p = np.array([0.25, 0.75])) 51 | self.folds_inner_ = [] 52 | ## if only two unique values in y, assume binary 53 | self.binary_ = (np.unique(y).shape[0] == 2) 54 | self.ensemble_ = ensemble 55 | self.cc_ = [] 56 | 57 | def _get_kkt_matrix(self): 58 | # kkt matrix for constrained wls 59 | A_W = np.sqrt(self.W_).dot(self.Z_) 60 | kkt_matrix_11 = 2 * A_W.transpose().dot(A_W) 61 | kkt_matrix_12 = self.G_.transpose() 62 | kkt_matrix_21 = self.G_ 63 | kkt_matrix_22 = np.zeros((kkt_matrix_21.shape[0], kkt_matrix_12.shape[1])) 64 | kkt_matrix = np.vstack((np.hstack((kkt_matrix_11, kkt_matrix_12)), np.hstack((kkt_matrix_21, kkt_matrix_22)))) 65 | return(kkt_matrix) 66 | 67 | def _get_ls_matrix(self, c_n): 68 | A_W = np.sqrt(self.W_).dot(self.Z_) 69 | v_W = np.sqrt(self.W_).dot(self.v_) 70 | ls_matrix = np.vstack((2 * A_W.transpose().dot(v_W.reshape((len(v_W), 1))), c_n.reshape((c_n.shape[0], 1)))) 71 | return(ls_matrix) 72 | 73 | ## calculate the point estimates 74 | def get_point_est(self, gamma = 1): 75 | self.gamma_ = gamma 76 | ## sample subsets, set up Z 77 | max_subset = np.array(list(range(self.p_))) 78 | sampling_weights = np.append(np.append(1, [choose(self.p_ - 2, s - 1) ** (-1) for s in range(1, self.p_)]), 1) 79 | subset_sizes = np.random.choice(np.arange(0, self.p_ + 1), p = sampling_weights / sum(sampling_weights), size = self.gamma_ * self.x_.shape[0], replace = True) 80 | S_lst_all = [np.sort(np.random.choice(np.arange(0, self.p_), subset_size, replace = False)) for subset_size in list(subset_sizes)] 81 | ## only need to continue with the unique subsets S 82 | Z_lst_all = [np.in1d(max_subset, S).astype(np.float64) for S in S_lst_all] 83 | Z, z_counts = np.unique(np.array(Z_lst_all), axis = 0, return_counts = True) 84 | Z_lst = list(Z) 85 | Z_aug_lst = [np.append(1, z) for z in Z_lst] 86 | S_lst = [max_subset[z.astype(bool).tolist()] for z in Z_lst] 87 | ## get v, preds, ic for null set 88 | preds_none = np.repeat(np.mean(self.y_[self.folds_outer_ == 1]), np.sum(self.folds_outer_ == 1)) 89 | v_none = self.measure_(self.y_[self.folds_outer_ == 1], preds_none) 90 | ic_none = compute_ic(self.y_[self.folds_outer_ == 1], preds_none, self.measure_.__name__) 91 | ## get v, preds, ic for remaining non-null groups in S 92 | v_lst, preds_lst, ic_lst, self.folds_inner_, self.cc_ = zip(*(cv_predictiveness(self.x_[self.folds_outer_ == 1, :], self.y_[self.folds_outer_ == 1], s, self.measure_, self.pred_func_, V = self.V_, stratified = self.binary_, na_rm = self.na_rm_) for s in S_lst[1:])) 93 | ## set up full lists 94 | v_lst_all = [v_none] + list(v_lst) 95 | ic_lst_all 
= [ic_none] + list(ic_lst) 96 | self.Z_ = np.array(Z_aug_lst) 97 | self.z_counts_ = z_counts 98 | self.W_ = np.diag(z_counts / np.sum(z_counts)) 99 | self.v_ = np.array(v_lst_all) 100 | self.v_ics_ = ic_lst_all 101 | c_n = np.array([v_none, v_lst_all[len(v_lst)] - v_none]) 102 | kkt_matrix = self._get_kkt_matrix() 103 | ls_matrix = self._get_ls_matrix(c_n) 104 | ls_solution = np.linalg.inv(kkt_matrix).dot(ls_matrix) 105 | self.vimp_ = ls_solution[0:(self.p_ + 1), :] 106 | self.lambdas_ = ls_solution[(self.p_ + 1):ls_solution.shape[0], :] 107 | return(self) 108 | 109 | ## calculate the influence function 110 | def get_influence_functions(self): 111 | c_n = np.array([self.v_[0], self.v_[self.v_.shape[0] - 1] - self.v_[0]], dtype = object) 112 | v_ic_array = np.vstack([self.v_ics_[0], np.stack(self.v_ics_[1:], axis = 0)]) 113 | self.ics_ = shapley_influence_function(self.Z_, self.z_counts_, self.W_, self.v_, self.vimp_, self.G_, c_n, v_ic_array, self.measure_.__name__) 114 | return self 115 | 116 | ## calculate standard errors 117 | def get_ses(self): 118 | ses = [shapley_se(self.ics_, idx, self.gamma_) for idx in range(self.p_ + 1)] 119 | self.ses_ = np.array(ses) 120 | return self 121 | 122 | ## calculate the ci based on the estimate and the standard error 123 | def get_cis(self, level = 0.95): 124 | ## get alpha from the level 125 | a = (1 - level) / 2. 126 | a = np.array([a, 1 - a]) 127 | ## calculate the quantiles 128 | fac = norm.ppf(a) 129 | ## create it 130 | self.cis_ = self.vimp_ + np.outer((self.ses_), fac) 131 | return self 132 | 133 | ## do a hypothesis test 134 | def hypothesis_test(self, alpha = 0.05, delta = 0): 135 | ## null predictiveness 136 | preds_none_0 = np.repeat(np.mean(self.y_[self.folds_outer_ == 0]), np.sum(self.folds_outer_ == 0)) 137 | v_none_0 = self.measure_(self.y_[self.folds_outer_ == 0], preds_none_0) 138 | ic_none_0 = compute_ic(self.y_[self.folds_outer_ == 0], preds_none_0, self.measure_.__name__) 139 | sigma_none_0 = np.sqrt(np.mean((ic_none_0) ** 2)) / np.sqrt(np.sum(self.folds_outer_ == 0)) 140 | ## get shapley values + null predictiveness on first split 141 | shapley_vals_plus = self.vimp_ + self.vimp_[0] 142 | sigmas_one = np.sqrt(self.ses_ ** 2 + sigma_none_0 ** 2) 143 | self.test_statistics_, self.p_values_, self.hyp_tests_ = shapley_hyp_test(shapley_vals_plus[1:], v_none_0, sigmas_one, sigma_none_0, level = alpha, delta = delta, p = self.p_) 144 | return self 145 | -------------------------------------------------------------------------------- /vimpy/spvim_ic.py: -------------------------------------------------------------------------------- 1 | # influence functions for shapley values 2 | 3 | def shapley_influence_function(Z, z_counts, W, v, psi, G, c_n, ics, measure): 4 | """ 5 | Compute influence function for the given predictiveness measure 6 | 7 | @param Z the subsets of the power set with estimates 8 | @param W the matrix of weights 9 | @param v the estimated predictivness 10 | @param psi the estimated Shapley values 11 | @param G the constrained ls matrix 12 | @param c_n the constraints 13 | @param ics a list of all ics 14 | @param measure the predictiveness measure 15 | """ 16 | import numpy as np 17 | 18 | ## compute contribution from estimating V 19 | Z_W = Z.transpose().dot(W) 20 | A_m = Z_W.dot(Z) 21 | A_m_inv = np.linalg.inv(A_m) 22 | phi_01 = A_m_inv.dot(Z_W).dot(ics) 23 | 24 | ## compute contribution from estimating Q 25 | qr_decomp = np.linalg.qr(G.transpose(), mode = 'complete') 26 | U_2 = qr_decomp[0][:, 3:(Z.shape[1])] 27 | V = 
U_2.transpose().dot(Z.transpose().dot(W).dot(Z)).dot(U_2) 28 | phi_02_shared_mat = (-1) * U_2.dot(np.linalg.inv(V)) 29 | phi_02_uniq_vectors = np.array([(Z[z, :].dot(psi) - v[z]) * (U_2.transpose().dot(Z[z, :])) for z in range(Z.shape[0])], dtype = np.float64).transpose() 30 | phi_02_uniq = phi_02_shared_mat.dot(phi_02_uniq_vectors) 31 | phi_02 = np.repeat(phi_02_uniq, z_counts, axis=1) 32 | 33 | return {'contrib_v': phi_01, 'contrib_s': phi_02} 34 | 35 | 36 | def shapley_se(shapley_ics, idx, gamma, na_rm = True): 37 | """ 38 | Standard error for the desired Shapley value 39 | 40 | @param shapley_ics: all influence function estimates 41 | @param idx: the index of interest 42 | @param gamma: the constant for sampling 43 | @param na_rm: remove NaNs? 44 | 45 | @return the standard error corresponding to the shapley value at idx 46 | """ 47 | import numpy as np 48 | if na_rm: 49 | var_v = np.nanvar(shapley_ics['contrib_v'][idx, :]) 50 | var_s = np.nanvar(shapley_ics['contrib_s'][idx, :]) 51 | else: 52 | var_v = np.var(shapley_ics['contrib_v'][idx, :]) 53 | var_s = np.var(shapley_ics['contrib_s'][idx, :]) 54 | se = np.sqrt(var_v / shapley_ics['contrib_v'].shape[1] + var_s / shapley_ics['contrib_s'].shape[1] / gamma) 55 | return se 56 | -------------------------------------------------------------------------------- /vimpy/vim.py: -------------------------------------------------------------------------------- 1 | ## Python class for estimates of variable importance 2 | ## compute estimates and confidence intervals, do hypothesis testing 3 | 4 | ## import required libraries 5 | import numpy as np 6 | from scipy.stats import norm 7 | from .predictiveness_measures import cv_predictiveness, cv_predictiveness_precomputed 8 | from .vimpy_utils import get_measure_function 9 | 10 | 11 | class vim: 12 | 13 | ## define initialization values 14 | """ 15 | @param y the outcome 16 | @param x the feature data 17 | @param s the feature group of interest 18 | @param measure_type the predictiveness measure to use (for now, one of "r_squared", "auc", "accuracy", "deviance") 19 | @param pred_func the function that predicts outcome given features 20 | @param ensemble is pred_func an ensemble (True) or a single function (False, default) 21 | @param f fitted values from regression of outcome on all features (only used if pred_func is not specified) 22 | @param r fitted values from regression of outcome on reduced set of features (only used if pred_func is not specified) 23 | @param folds outer folds, for hypothesis testing (only used if pred_func is not specified) 24 | @param na_rm remove NAs prior to computing predictiveness? 
(defaults to False) 25 | 26 | @return an object of class vim 27 | """ 28 | def __init__(self, y, x, s, measure_type, pred_func = None, ensemble = False, f = None, r = None, folds = None, na_rm = False): 29 | self.y_ = y 30 | self.x_ = x 31 | self.s_ = s 32 | self.n_ = y.shape[0] 33 | self.p_ = x.shape[1] 34 | self.pred_func_ = pred_func 35 | self.f_ = f 36 | self.r_ = r 37 | assert (pred_func is not None or (f is not None and r is not None)) 38 | self.measure_type_ = measure_type 39 | self.measure_ = get_measure_function(measure_type) 40 | self.vimp_ = [] 41 | self.se_ = [] 42 | self.ci_ = [] 43 | self.hyp_test_ = [] 44 | self.test_statistic_ = [] 45 | self.p_value_ = [] 46 | self.v_full_ = [] 47 | self.preds_full_ = [] 48 | self.v_redu_ = [] 49 | self.preds_redu_ = [] 50 | self.se_full_ = [] 51 | self.se_redu_ = [] 52 | self.ci_full_ = [] 53 | self.ci_redu_ = [] 54 | ## set up outer folds for hypothesis testing 55 | if folds is None: 56 | self.folds_outer_ = np.random.choice(a = np.arange(2), size = self.n_, replace = True, p = np.array([0.5, 0.5])) 57 | else: 58 | assert (f is not None) 59 | self.folds_outer_ = folds 60 | self.folds_inner_1 = [] 61 | self.folds_inner_0 = [] 62 | self.cc_1 = [] 63 | self.cc_0 = [] 64 | self.ic_ = np.zeros((max(np.sum(self.folds_outer_ == 0), np.sum(self.folds_outer_ == 1)))) 65 | self.ic_full_ = np.zeros((max(np.sum(self.folds_outer_ == 0), np.sum(self.folds_outer_ == 1)))) 66 | self.ic_redu_ = np.zeros((max(np.sum(self.folds_outer_ == 0), np.sum(self.folds_outer_ == 1)))) 67 | ## if only two unique values in y, assume binary 68 | self.binary_ = (np.unique(y).shape[0] == 2) 69 | self.na_rm_ = na_rm 70 | self.ensemble_ = ensemble 71 | 72 | ## calculate the variable importance estimate 73 | def get_point_est(self): 74 | if self.pred_func_ is not None: 75 | predictiveness_func = cv_predictiveness 76 | this_full_func = self.pred_func_ 77 | this_redu_func = self.pred_func_ 78 | folds = None 79 | else: 80 | predictiveness_func = cv_predictiveness_precomputed 81 | this_full_func = self.f_ 82 | this_redu_func = self.r_ 83 | folds = None 84 | self.v_full_, self.preds_full_, ic_full, self.folds_inner_1, self.cc_1 = predictiveness_func(self.x_[self.folds_outer_ == 1, :], self.y_[self.folds_outer_ == 1], np.arange(self.p_), self.measure_, this_full_func, V = 1, stratified = self.binary_, na_rm = self.na_rm_, folds = folds, ensemble = self.ensemble_) 85 | self.v_redu_, self.preds_redu_, ic_redu, self.folds_inner_0, self.cc_0 = predictiveness_func(self.x_[self.folds_outer_ == 0, :], self.y_[self.folds_outer_ == 0], np.delete(np.arange(self.p_), self.s_), self.measure_, this_redu_func, V = 1, stratified = self.binary_, na_rm = self.na_rm_, folds = folds, ensemble = self.ensemble_) 86 | self.vimp_ = self.v_full_ - self.v_redu_ 87 | self.ic_full_[:ic_full.shape[0]] = ic_full 88 | self.ic_redu_[:ic_redu.shape[0]] = ic_redu 89 | return self 90 | 91 | ## calculate the influence function 92 | def get_influence_function(self): 93 | self.ic_ = self.ic_full_ - self.ic_redu_ 94 | return self 95 | 96 | ## calculate the standard error 97 | def get_se(self): 98 | self.se_full_ = np.sqrt(np.mean(self.ic_full_ ** 2)) / np.sqrt(self.ic_full_.shape[0]) 99 | self.se_redu_ = np.sqrt(np.mean(self.ic_redu_ ** 2)) / np.sqrt(self.ic_redu_.shape[0]) 100 | self.se_ = np.sqrt(np.mean(self.ic_ ** 2)) / np.sqrt(self.ic_.shape[0]) 101 | return self 102 | 103 | ## calculate the ci based on the estimate and the standard error 104 | def get_ci(self, level = 0.95): 105 | ## get alpha from the level 106 
| a = (1 - level) / 2. 107 | a = np.array([a, 1 - a]) 108 | ## calculate the quantiles 109 | fac = norm.ppf(a) 110 | ## create cis for vimp, predictiveness 111 | self.ci_ = self.vimp_ + np.outer((self.se_), fac) 112 | self.ci_full_ = self.v_full_ + np.outer((self.se_full_), fac) 113 | self.ci_redu_ = self.v_redu_ + np.outer((self.se_redu_), fac) 114 | return self 115 | 116 | ## do a hypothesis test 117 | def hypothesis_test(self, alpha = 0.05, delta = 0): 118 | self.test_statistic_ = (self.v_full_ - self.v_redu_ - delta) / np.sqrt(self.se_full_ ** 2 + self.se_redu_ ** 2) 119 | self.p_value_ = 1 - norm.cdf(self.test_statistic_) 120 | self.hyp_test_ = self.p_value_ < alpha 121 | return(self) 122 | -------------------------------------------------------------------------------- /vimpy/vimpy_utils.py: -------------------------------------------------------------------------------- 1 | ## utility functions 2 | 3 | 4 | def deprecated(func): 5 | '''This is a decorator which can be used to mark functions 6 | as deprecated. It will result in a warning being emitted 7 | when the function is used.''' 8 | import warnings 9 | 10 | def new_func(*args, **kwargs): 11 | warnings.warn("Call to deprecated function {}.".format(func.__name__), 12 | category=DeprecationWarning) 13 | return func(*args, **kwargs) 14 | new_func.__name__ = func.__name__ 15 | new_func.__doc__ = func.__doc__ 16 | new_func.__dict__.update(func.__dict__) 17 | return new_func 18 | 19 | 20 | ## get the measure function, IC function 21 | def get_measure_function(type): 22 | from .predictiveness_measures import r_squared, accuracy, auc, deviance 23 | if type == "r_squared": 24 | measure = r_squared 25 | elif type == "accuracy": 26 | measure = accuracy 27 | elif type == "auc": 28 | measure = auc 29 | elif type == "deviance": 30 | measure = deviance 31 | else: 32 | raise ValueError("We do not currently support the entered predictiveness measure. Please provide a different predictiveness measure.") 33 | return measure 34 | 35 | 36 | def choose(n, k): 37 | import math 38 | return int(math.factorial(n) / (math.factorial(k) * math.factorial(n - k))) 39 | 40 | 41 | def make_folds(y, V, stratified = True): 42 | """ 43 | Create folds for CV (potentially stratified) 44 | """ 45 | import numpy as np 46 | if stratified: 47 | y_1 = y == 1 48 | y_0 = y == 0 49 | folds_1 = np.resize(np.arange(V), sum(y_1)) 50 | np.random.shuffle(folds_1) 51 | folds_0 = np.resize(np.arange(V), sum(y_0)) 52 | np.random.shuffle(folds_0) 53 | folds = np.empty((y.shape[0])) 54 | folds[np.ravel(y_1)] = folds_1 55 | folds[np.ravel(y_0)] = folds_0 56 | else: 57 | folds = np.resize(np.arange(V), y.shape[0]) 58 | np.random.shuffle(folds) 59 | return folds 60 | 61 | 62 | ## hypothesis testing with shapley values 63 | def shapley_hyp_test(vs_one_1, v_none_0, sigmas_one, sigma_none, delta = 0, level = 0.05, p = 3): 64 | """ 65 | Hypothesis testing for Shapley values 66 | 67 | @param vs_one_1: one-feature measures of predictiveness 68 | @param v_none_0: null-model predictiveness 69 | @param sigmas_one: ses 70 | @param sigma_none: null-model se 71 | @param delta: value for testing 72 | @param level: significance level 73 | 74 | @return: test_statistics (the test statistics), p_vals (p-values), hyp_tests (the hypothesis testing results) 75 | """ 76 | import numpy as np 77 | from scipy.stats import norm 78 | 79 | test_statistics = [(vs_one_1[v] - v_none_0 - delta) / (np.sqrt(sigmas_one[v] ** 2 + sigma_none ** 2)) for v in range(p)] 80 | p_values = 1. 
- norm.cdf(test_statistics) 81 | hyp_tests = p_values < level 82 | return test_statistics, p_values, hyp_tests 83 | --------------------------------------------------------------------------------