├── .github
│   ├── .gitignore
│   ├── workflows
│   │   ├── remove-old-artifacts.yml
│   │   ├── lint.yaml
│   │   ├── pkgdown.yaml
│   │   └── R-CMD-check.yaml
│   └── CONTRIBUTING.md
├── .covrignore
├── inst
│   ├── joss_paper
│   │   ├── .gitignore
│   │   ├── paper.pdf
│   │   ├── paper.md
│   │   ├── paper.Rmd
│   │   └── paper.bib
│   ├── scripts
│   │   ├── devel
│   │   │   ├── master_res.rds
│   │   │   ├── master_res2.rds
│   │   │   ├── devel_non_exact_grouping.R
│   │   │   ├── same_seed_as_master.R
│   │   │   ├── compare_indep_implementations.R
│   │   │   └── compare_explain_batch.R
│   │   ├── create_lm_model_object.R
│   │   ├── create_xgboost_model_object.R
│   │   ├── shap_python_script.py
│   │   ├── readme_example.R
│   │   ├── example_custom_model.R
│   │   ├── example_ctree_method.R
│   │   └── compare_shap_python.R
│   ├── model_objects
│   │   ├── lm_model_object.rds
│   │   ├── xgboost_model_object.rds
│   │   ├── xgboost_model_object_raw
│   │   ├── xgboost_model_object_cat.rds
│   │   └── xgboost_model_object_raw_cat
│   └── REFERENCES.bib
├── src
│   ├── .gitignore
│   ├── Makevars
│   ├── Makevars.win
│   ├── features.cpp
│   ├── impute_data.cpp
│   ├── distance.cpp
│   ├── weighted_matrix.cpp
│   └── RcppExports.cpp
├── vignettes
│   └── .gitignore
├── LICENSE
├── tests
│   ├── testthat.R
│   └── testthat
│       ├── model_objects
│       │   └── lm_model_object.rds
│       ├── test_objects
│       │   ├── shapley_explainer_obj.rds
│       │   ├── explanation_explain_obj_list.rds
│       │   ├── shapley_explainer_group1_2_obj.rds
│       │   ├── shapley_explainer_group1_obj.rds
│       │   ├── shapley_explainer_group2_2_obj.rds
│       │   ├── shapley_explainer_group2_obj.rds
│       │   ├── explanation_explain_group_obj_list.rds
│       │   ├── explanation_explain_obj_list_fixed.rds
│       │   └── explanation_explain_obj_list_no_ctree.rds
│       ├── test-transformation.R
│       ├── test-plot.R
│       ├── test-predictions.R
│       ├── test-src_weighted_matrix.R
│       ├── test-src_impute_data.R
│       ├── manual_test_scripts
│       │   └── test_custom_models.R
│       ├── test-observations.R
│       └── test-a-shapley.R
├── man
│   ├── figures
│   │   ├── README-basic_example-1.png
│   │   └── NR-logo_utvidet_r32g60b136_small.png
│   ├── get_supported_models.Rd
│   ├── rss_cpp.Rd
│   ├── feature_matrix_cpp.Rd
│   ├── compute_shapley.Rd
│   ├── correction_matrix_cpp.Rd
│   ├── gaussian_transform.Rd
│   ├── inv_gaussian_transform.Rd
│   ├── get_list_approaches.Rd
│   ├── feature_group.Rd
│   ├── hat_matrix_cpp.Rd
│   ├── sample_gaussian.Rd
│   ├── aicc_full_cpp.Rd
│   ├── check_groups.Rd
│   ├── gaussian_transform_separate.Rd
│   ├── aicc_full_single_cpp.Rd
│   ├── weight_matrix_cpp.Rd
│   ├── weight_matrix.Rd
│   ├── create_S_batch.Rd
│   ├── apply_dummies.Rd
│   ├── feature_group_not_exact.Rd
│   ├── sample_copula.Rd
│   ├── shapley_weights.Rd
│   ├── mahalanobis_distance_cpp.Rd
│   ├── process_groups.Rd
│   ├── update_data.Rd
│   ├── sample_combinations.Rd
│   ├── get_data_specs.Rd
│   ├── model_checker.Rd
│   ├── prepare_and_predict.Rd
│   ├── preprocess_data.Rd
│   ├── check_features.Rd
│   ├── make_dummies.Rd
│   ├── observation_impute.Rd
│   ├── get_model_specs.Rd
│   ├── prediction.Rd
│   ├── observation_impute_cpp.Rd
│   ├── prepare_data.Rd
│   ├── plot.shapr.Rd
│   ├── feature_combinations.Rd
│   ├── sample_ctree.Rd
│   ├── create_ctree.Rd
│   ├── predict_model.Rd
│   └── shapr.Rd
├── R
│   ├── utils.R
│   ├── zzz.R
│   ├── shapr-package.R
│   ├── transformation.R
│   ├── predictions.R
│   ├── plot.R
│   └── RcppExports.R
├── .Rbuildignore
├── .gitignore
├── shapr.Rproj
├── _pkgdown.yml
├── .lintr
├── LICENSE.md
├── CODE_OF_CONDUCT.md
├── DESCRIPTION
├── cran-comments.md
├── NAMESPACE
├── NEWS.md
└── README.Rmd

/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 |
--------------------------------------------------------------------------------
/.covrignore:
--------------------------------------------------------------------------------
1 | R/global.R
2 | R/zzz.R
3 |
--------------------------------------------------------------------------------
/inst/joss_paper/.gitignore:
--------------------------------------------------------------------------------
1 | *.log
2 |
--------------------------------------------------------------------------------
/src/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | *.so
3 | *.dll
4 |
--------------------------------------------------------------------------------
/vignettes/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | *.R
3 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2019
2 | COPYRIGHT HOLDER: Norsk Regnesentral
3 |
--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(shapr)
3 |
4 | test_check("shapr")
5 |
--------------------------------------------------------------------------------
/inst/joss_paper/paper.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtr13/shapr/master/inst/joss_paper/paper.pdf
--------------------------------------------------------------------------------
/inst/scripts/devel/master_res.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtr13/shapr/master/inst/scripts/devel/master_res.rds
--------------------------------------------------------------------------------
/inst/scripts/devel/master_res2.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtr13/shapr/master/inst/scripts/devel/master_res2.rds
--------------------------------------------------------------------------------
/inst/model_objects/lm_model_object.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtr13/shapr/master/inst/model_objects/lm_model_object.rds
--------------------------------------------------------------------------------
/man/figures/README-basic_example-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtr13/shapr/master/man/figures/README-basic_example-1.png
--------------------------------------------------------------------------------
/inst/model_objects/xgboost_model_object.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtr13/shapr/master/inst/model_objects/xgboost_model_object.rds
--------------------------------------------------------------------------------
/inst/model_objects/xgboost_model_object_raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtr13/shapr/master/inst/model_objects/xgboost_model_object_raw
--------------------------------------------------------------------------------
/inst/model_objects/xgboost_model_object_cat.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtr13/shapr/master/inst/model_objects/xgboost_model_object_cat.rds
--------------------------------------------------------------------------------
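The .rds and .pdf entries above are binary artifacts, so the dump lists their download URLs instead of file contents. Below is a minimal sketch (not itself a file in the repository) of how such a fixture can be inspected from the package root; the printed values assume the model fitted in inst/scripts/create_lm_model_object.R further down:

    # Load the stored linear model fixture and inspect it
    model <- readRDS("inst/model_objects/lm_model_object.rds")
    class(model)  # "lm"
    coef(model)   # intercept plus lstat, rm, dis and indus

--------------------------------------------------------------------------------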
/inst/model_objects/xgboost_model_object_raw_cat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtr13/shapr/master/inst/model_objects/xgboost_model_object_raw_cat
--------------------------------------------------------------------------------
/man/figures/NR-logo_utvidet_r32g60b136_small.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtr13/shapr/master/man/figures/NR-logo_utvidet_r32g60b136_small.png
--------------------------------------------------------------------------------
/tests/testthat/model_objects/lm_model_object.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtr13/shapr/master/tests/testthat/model_objects/lm_model_object.rds
--------------------------------------------------------------------------------
/src/Makevars:
--------------------------------------------------------------------------------
1 | CXX_STD = CXX11
2 | PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS)
3 | PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)
4 |
--------------------------------------------------------------------------------
/src/Makevars.win:
--------------------------------------------------------------------------------
1 | CXX_STD = CXX11
2 | PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS)
3 | PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)
4 |
--------------------------------------------------------------------------------
/tests/testthat/test_objects/shapley_explainer_obj.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtr13/shapr/master/tests/testthat/test_objects/shapley_explainer_obj.rds
--------------------------------------------------------------------------------
/tests/testthat/test_objects/explanation_explain_obj_list.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtr13/shapr/master/tests/testthat/test_objects/explanation_explain_obj_list.rds
--------------------------------------------------------------------------------
/tests/testthat/test_objects/shapley_explainer_group1_2_obj.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtr13/shapr/master/tests/testthat/test_objects/shapley_explainer_group1_2_obj.rds
--------------------------------------------------------------------------------
/tests/testthat/test_objects/shapley_explainer_group1_obj.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtr13/shapr/master/tests/testthat/test_objects/shapley_explainer_group1_obj.rds
--------------------------------------------------------------------------------
/tests/testthat/test_objects/shapley_explainer_group2_2_obj.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtr13/shapr/master/tests/testthat/test_objects/shapley_explainer_group2_2_obj.rds
--------------------------------------------------------------------------------
/tests/testthat/test_objects/shapley_explainer_group2_obj.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtr13/shapr/master/tests/testthat/test_objects/shapley_explainer_group2_obj.rds
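--------------------------------------------------------------------------------
The shapley_explainer_*.rds files above are snapshot fixtures for the test suite. A hedged sketch of the usual snapshot pattern follows; the compared field ($X) and the reuse of x_train and model from inst/scripts/readme_example.R are illustrative assumptions, not the package's actual tests:

    ref <- readRDS("tests/testthat/test_objects/shapley_explainer_obj.rds")
    new <- shapr(x_train, model)          # x_train and model as in inst/scripts/readme_example.R
    testthat::expect_equal(new$X, ref$X)  # fresh output must match the stored snapshot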
--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
1 | #' @keywords internal
2 | unique_features <- function(x) {
3 |   unique(
4 |     unlist(
5 |       strsplit(x, split = ":", fixed = TRUE)
6 |     )
7 |   )
8 | }
9 |
--------------------------------------------------------------------------------
/tests/testthat/test_objects/explanation_explain_group_obj_list.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtr13/shapr/master/tests/testthat/test_objects/explanation_explain_group_obj_list.rds
--------------------------------------------------------------------------------
/tests/testthat/test_objects/explanation_explain_obj_list_fixed.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtr13/shapr/master/tests/testthat/test_objects/explanation_explain_obj_list_fixed.rds
--------------------------------------------------------------------------------
/tests/testthat/test_objects/explanation_explain_obj_list_no_ctree.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtr13/shapr/master/tests/testthat/test_objects/explanation_explain_obj_list_no_ctree.rds
--------------------------------------------------------------------------------
/R/zzz.R:
--------------------------------------------------------------------------------
1 | .onLoad <- function(libname = find.package("shapr"), pkgname = "shapr") {
2 |
3 |   # CRAN Note avoidance
4 |   utils::globalVariables(
5 |     c(
6 |       ".", ".N", ".I", ".GRP", ".SD"
7 |     )
8 |   )
9 |   invisible()
10 | }
11 |
--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^README\.Rmd$
2 | ^.*\.Rproj$
3 | ^\.Rproj\.user$
4 | ^Rprof\.out$
5 | ^\.circleci$
6 | ^\.lintr$
7 | ^\.covrignore$
8 | ^LICENSE\.md$
9 | inst/compare_lundberg\.xgb\.obj
10 | ^\.github$
11 | ^CODE_OF_CONDUCT\.md$
12 | ^cran-comments\.md$
13 | ^docs$
14 | ^CRAN-RELEASE$
15 | ^inst/scripts$
16 | ^doc$
17 | ^Meta$
18 | ^_pkgdown\.yml$
19 | ^pkgdown$
20 | ^.Rprofile
21 |
--------------------------------------------------------------------------------
/man/get_supported_models.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/models.R
3 | \name{get_supported_models}
4 | \alias{get_supported_models}
5 | \title{Provides a data.table with the supported models}
6 | \usage{
7 | get_supported_models()
8 | }
9 | \description{
10 | Provides a data.table with the supported models
11 | }
12 | \keyword{internal}
13 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | Rprof.out
6 | src/*.o
7 | src/*.so
8 |
9 | *.o
10 |
11 | *.so
12 |
13 | tmp/
14 |
15 | \.RDataTmp
16 |
17 | *.out
18 |
19 | *.Rout
20 |
21 | src-i386/
22 |
23 | src-x64/
24 |
25 | paper_experiments/res/single_res/
26 |
27 | paper_experiments/res/single_res_old/
28 | inst/doc
29 |
30 | docs/*
31 | doc
32 | Meta
33 | docs
--------------------------------------------------------------------------------
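For a concrete sense of unique_features() in R/utils.R above: it splits colon-separated feature combination labels and de-duplicates the result. A small usage example:

    unique_features(c("lstat:rm", "rm:dis", "indus"))
    #> [1] "lstat" "rm"    "dis"   "indus"

--------------------------------------------------------------------------------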
/inst/scripts/create_lm_model_object.R:
--------------------------------------------------------------------------------
1 | # Load data -----------
2 | data("Boston", package = "MASS")
3 | df <- tail(Boston, 50)
4 |
5 | # Fit linear model
6 | set.seed(123)
7 | model <- lm(medv ~ lstat + rm + dis + indus, data = df)
8 |
9 | saveRDS(object = model, "inst/model_objects/lm_model_object.rds")
10 |
11 | # Used for testing as well, so we need a copy in the testthat directory
12 | saveRDS(object = model, "tests/testthat/model_objects/lm_model_object.rds")
13 |
--------------------------------------------------------------------------------
/src/features.cpp:
--------------------------------------------------------------------------------
1 | #include <Rcpp.h>
2 | using namespace Rcpp;
3 |
4 | //' @keywords internal
5 | // [[Rcpp::export]]
6 | List sample_features_cpp(int m, IntegerVector n_features) {
7 |
8 |   int n = n_features.length();
9 |   List l(n);
10 |
11 |   for (int i = 0; i < n; i++) {
12 |
13 |     int s = n_features[i];
14 |     IntegerVector k = sample(m, s);
15 |     std::sort(k.begin(), k.end());
16 |     l[i] = k;
17 |
18 |   }
19 |
20 |   return l;
21 | }
22 |
--------------------------------------------------------------------------------
/.github/workflows/remove-old-artifacts.yml:
--------------------------------------------------------------------------------
1 | name: Remove old artifacts
2 |
3 | on:
4 |   schedule:
5 |     # Every day at 1am
6 |     - cron: '0 1 * * *'
7 |
8 | jobs:
9 |   remove-old-artifacts:
10 |     runs-on: ubuntu-latest
11 |     timeout-minutes: 10
12 |
13 |     steps:
14 |       - name: Remove old artifacts
15 |         uses: c-hive/gha-remove-artifacts@v1
16 |         with:
17 |           age: '1 week'
18 |           skip-recent: 6
19 |           # Optional inputs
20 |           # skip-tags: true
21 |
--------------------------------------------------------------------------------
/shapr.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: Default
4 | SaveWorkspace: Default
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 |
12 | RnwWeave: knitr
13 | LaTeX: pdfLaTeX
14 |
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 |
18 | BuildType: Package
19 | PackageUseDevtools: Yes
20 | PackageInstallArgs: --no-multiarch --with-keep.source --no-lock
21 | PackageRoxygenize: rd,collate,namespace,vignette
22 |
--------------------------------------------------------------------------------
/man/rss_cpp.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/RcppExports.R
3 | \name{rss_cpp}
4 | \alias{rss_cpp}
5 | \title{sigma_hat_sq-function}
6 | \usage{
7 | rss_cpp(H, y)
8 | }
9 | \arguments{
10 | \item{H}{Matrix. Output from \code{\link{hat_matrix_cpp}}}
11 |
12 | \item{y}{Vector representing the response variable}
13 | }
14 | \value{
15 | Scalar
16 | }
17 | \description{
18 | sigma_hat_sq-function
19 | }
20 | \author{
21 | Martin Jullum
22 | }
23 | \keyword{internal}
24 |
--------------------------------------------------------------------------------
/man/feature_matrix_cpp.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/RcppExports.R
3 | \name{feature_matrix_cpp}
4 | \alias{feature_matrix_cpp}
5 | \title{Get feature matrix}
6 | \usage{
7 | feature_matrix_cpp(features, m)
8 | }
9 | \arguments{
10 | \item{features}{List}
11 |
12 | \item{m}{Positive integer. Total number of features}
13 | }
14 | \value{
15 | Matrix
16 | }
17 | \description{
18 | Get feature matrix
19 | }
20 | \author{
21 | Nikolai Sellereite
22 | }
23 | \keyword{internal}
24 |
--------------------------------------------------------------------------------
/_pkgdown.yml:
--------------------------------------------------------------------------------
1 | url: https://norskregnesentral.github.io/shapr/
2 |
3 | navbar:
4 |   structure:
5 |     left: [home, articles, news, reference]
6 |     right: [github]
7 |   components:
8 |     articles:
9 |       text: Vignettes
10 |       menu:
11 |         - text: "Explaining individual machine learning predictions with Shapley values"
12 |           href: articles/understanding_shapr.html
13 |     news:
14 |       text: News
15 |       href: news/index.html
16 |     reference:
17 |       text: Manual
18 |       href: reference/index.html
19 |
--------------------------------------------------------------------------------
/man/compute_shapley.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/predictions.R
3 | \name{compute_shapley}
4 | \alias{compute_shapley}
5 | \title{Compute Shapley values}
6 | \usage{
7 | compute_shapley(explainer, contribution_mat)
8 | }
9 | \arguments{
10 | \item{explainer}{An \code{explain} object.}
11 |
12 | \item{contribution_mat}{The contribution matrix.}
13 | }
14 | \value{
15 | A \code{data.table} with Shapley values for each test observation.
16 | }
17 | \description{
18 | Compute Shapley values
19 | }
20 | \keyword{internal}
21 |
--------------------------------------------------------------------------------
/man/correction_matrix_cpp.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/RcppExports.R
3 | \name{correction_matrix_cpp}
4 | \alias{correction_matrix_cpp}
5 | \title{Correction term with trace_input in the AICc formula}
6 | \usage{
7 | correction_matrix_cpp(tr_H, n)
8 | }
9 | \arguments{
10 | \item{tr_H}{numeric giving the trace of H}
11 |
12 | \item{n}{numeric giving the number of rows in H}
13 | }
14 | \value{
15 | Scalar
16 | }
17 | \description{
18 | Correction term with trace_input in the AICc formula
19 | }
20 | \author{
21 | Martin Jullum
22 | }
23 | \keyword{internal}
24 |
--------------------------------------------------------------------------------
/man/gaussian_transform.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/transformation.R
3 | \name{gaussian_transform}
4 | \alias{gaussian_transform}
5 | \title{Transforms a sample to a standardized normal distribution}
6 | \usage{
7 | gaussian_transform(x)
8 | }
9 | \arguments{
10 | \item{x}{Numeric vector. The data which should be transformed to a standard normal distribution.}
11 | }
12 | \value{
13 | Numeric vector of length \code{length(x)}
14 | }
15 | \description{
16 | Transforms a sample to a standardized normal distribution
17 | }
18 | \author{
19 | Martin Jullum
20 | }
21 | \keyword{internal}
22 |
--------------------------------------------------------------------------------
/R/shapr-package.R:
--------------------------------------------------------------------------------
1 | #' @importFrom data.table data.table fread fwrite setnames := year month
2 | #' uniqueN setkey as.data.table copy between is.data.table setcolorder rbindlist
3 | #'
4 | #' @importFrom graphics plot hist rect
5 | #'
6 | #' @importFrom utils head tail methods
7 | #'
8 | #' @importFrom stats predict
9 | #'
10 | #' @importFrom stats as.formula
11 | #'
12 | #' @importFrom stats model.matrix
13 | #'
14 | #' @importFrom stats model.frame
15 | #'
16 | #' @importFrom stats setNames
17 | #'
18 | #' @importFrom stats contrasts
19 | #'
20 | #' @importFrom Rcpp sourceCpp
21 | #'
22 | #' @keywords internal
23 | #'
24 | #' @useDynLib shapr, .registration = TRUE
25 | NULL
26 |
--------------------------------------------------------------------------------
/.lintr:
--------------------------------------------------------------------------------
1 | linters: with_defaults(
2 |   line_length_linter = lintr::line_length_linter(120),
3 |   object_name_linter = NULL,
4 |   object_usage_linter = NULL,
5 |   seq_linter = NULL,
6 |   cyclocomp_linter = lintr::cyclocomp_linter()
7 |   )
8 | exclusions: list(
9 |   "inst/scripts/compare_shap_python.R",
10 |   "inst/scripts/create_lm_model_object.R",
11 |   "inst/scripts/create_xgboost_model_object.R",
12 |   "inst/scripts/example_ctree_model.R",
13 |   "inst/scripts/example_custom_model.R",
14 |   "inst/scripts/readme_example.R",
15 |   "inst/scripts/shap_python_script.py",
16 |   "inst/scripts/devel/compare_indep_implementations.R",
17 |   "R/RcppExports.R",
18 |   "R/zzz.R"
19 |   )
20 |
--------------------------------------------------------------------------------
/man/inv_gaussian_transform.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/transformation.R
3 | \name{inv_gaussian_transform}
4 | \alias{inv_gaussian_transform}
5 | \title{Transforms new data to a standardized normal distribution}
6 | \usage{
7 | inv_gaussian_transform(zx, n_z)
8 | }
9 | \arguments{
10 | \item{zx}{Numeric vector. The first \code{n_z} items are the Gaussian data, and the last part is
11 | the data with the original transformation.}
12 |
13 | \item{n_z}{Positive integer. Number of elements of \code{zx} that belong to the new data.}
14 | }
15 | \value{
16 | Numeric vector of length \code{n_z}
17 | }
18 | \description{
19 | Transforms new data to a standardized normal distribution
20 | }
21 | \author{
22 | Martin Jullum
23 | }
24 | \keyword{internal}
25 |
--------------------------------------------------------------------------------
/man/get_list_approaches.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/explanation.R
3 | \name{get_list_approaches}
4 | \alias{get_list_approaches}
5 | \title{Helper function used in \code{\link{explain.combined}}}
6 | \usage{
7 | get_list_approaches(n_features, approach)
8 | }
9 | \arguments{
10 | \item{n_features}{Integer vector. Note that
11 | \code{length(n_features) <= 2^m}, where \code{m} equals the number
12 | of features.}
13 |
14 | \item{approach}{Character vector of length \code{m}. All elements should be
15 | either \code{"empirical"}, \code{"gaussian"} or \code{"copula"}.}
16 | }
17 | \value{
18 | List
19 | }
20 | \description{
21 | Helper function used in \code{\link{explain.combined}}
22 | }
23 | \author{
24 | Nikolai Sellereite
25 | }
26 | \keyword{internal}
27 |
--------------------------------------------------------------------------------
/man/feature_group.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/features.R
3 | \name{feature_group}
4 | \alias{feature_group}
5 | \title{Analogue to feature_exact, but for groups instead.}
6 | \usage{
7 | feature_group(group_num, weight_zero_m = 10^6)
8 | }
9 | \arguments{
10 | \item{group_num}{List. Contains vector of integers indicating the feature numbers for the
11 | different groups.}
12 |
13 | \item{weight_zero_m}{Positive integer. Represents the Shapley weight for two special
14 | cases, i.e. the case where you have either \code{0} or \code{m} features/feature groups.}
15 | }
16 | \value{
17 | data.table with all feature group combinations, Shapley weights, etc.
18 | }
19 | \description{
20 | Analogue to feature_exact, but for groups instead.
21 | }
22 | \keyword{internal}
23 |
--------------------------------------------------------------------------------
/.github/workflows/lint.yaml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/master/examples
2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
3 | on:
4 |   push:
5 |     branches: [main, master]
6 |   pull_request:
7 |     branches: [main, master]
8 |
9 | name: lint
10 |
11 | jobs:
12 |   lint:
13 |     runs-on: ubuntu-latest
14 |     env:
15 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
16 |     steps:
17 |       - uses: actions/checkout@v2
18 |
19 |       - uses: r-lib/actions/setup-r@v1
20 |         with:
21 |           use-public-rspm: true
22 |
23 |       - uses: r-lib/actions/setup-r-dependencies@v1
24 |         with:
25 |           extra-packages: lintr
26 |
27 |       - name: Lint
28 |         run: lintr::lint_package()
29 |         shell: Rscript {0}
30 |
--------------------------------------------------------------------------------
/man/hat_matrix_cpp.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/RcppExports.R
3 | \name{hat_matrix_cpp}
4 | \alias{hat_matrix_cpp}
5 | \title{Computing single H matrix in AICc-function using the Mahalanobis distance}
6 | \usage{
7 | hat_matrix_cpp(X, mcov, S_scale_dist, h)
8 | }
9 | \arguments{
10 | \item{X}{matrix with "covariates"}
11 |
12 | \item{mcov}{covariance matrix}
13 |
14 | \item{S_scale_dist}{logical indicating whether the Mahalanobis distance should be scaled with the number of variables}
15 |
16 | \item{h}{numeric specifying the scaling (sigma)}
17 | }
18 | \value{
19 | Matrix of dimension \code{ncol(X)*ncol(X)}
20 | }
21 | \description{
22 | Computing single H matrix in AICc-function using the Mahalanobis distance
23 | }
24 | \author{
25 | Martin Jullum
26 | }
27 | \keyword{internal}
28 |
--------------------------------------------------------------------------------
/man/sample_gaussian.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/sampling.R
3 | \name{sample_gaussian}
4 | \alias{sample_gaussian}
5 | \title{Sample conditional Gaussian variables}
6 | \usage{
7 | sample_gaussian(index_given, n_samples, mu, cov_mat, m, x_test)
8 | }
9 | \arguments{
10 | \item{index_given}{Integer vector. The indices of the features to condition upon. Note that
11 | \code{min(index_given) >= 1} and \code{max(index_given) <= m}.}
12 |
13 | \item{m}{Positive integer. The total number of features.}
14 |
15 | \item{x_test}{Numeric matrix. Contains the features of the observation whose
16 | predictions ought to be explained (test data).}
17 | }
18 | \value{
19 | data.table
20 | }
21 | \description{
22 | Sample conditional Gaussian variables
23 | }
24 | \author{
25 | Martin Jullum
26 | }
27 | \keyword{internal}
28 |
--------------------------------------------------------------------------------
/inst/scripts/create_xgboost_model_object.R:
--------------------------------------------------------------------------------
1 | library(xgboost)
2 |
3 | data("Boston", package = "MASS")
4 |
5 | x_var <- c("lstat", "rm", "dis", "indus")
6 | y_var <- "medv"
7 |
8 | x_train <- as.matrix(tail(Boston[, x_var], -6))
9 | y_train <- tail(Boston[, y_var], -6)
10 | x_test <- as.matrix(head(Boston[, x_var], 6))
11 |
12 | # Creating a larger test data set (300 observations) for more realistic function time calls.
13 | # Modifying x_test to repeat the 6 test observations 50 times
14 | x_test <- rep(1, 50) %x% x_test  # Kronecker product stacks 50 copies of the 6 rows
15 | colnames(x_test) <- colnames(x_train)
16 |
17 | # Fitting a basic xgboost model to the training data
18 | model <- xgboost(
19 |   data = x_train,
20 |   label = y_train,
21 |   nround = 20
22 | )
23 |
24 | saveRDS(model, file = "inst/model_objects/xgboost_model_object.rds")
25 |
26 | xgb.save(model = model, fname = "inst/model_objects/xgboost_model_object_raw")
27 |
--------------------------------------------------------------------------------
/man/aicc_full_cpp.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/RcppExports.R
3 | \name{aicc_full_cpp}
4 | \alias{aicc_full_cpp}
5 | \title{AICc formula for several sets, alternative definition}
6 | \usage{
7 | aicc_full_cpp(h, X_list, mcov_list, S_scale_dist, y_list, negative)
8 | }
9 | \arguments{
10 | \item{h}{Numeric. Specifies the scaling (sigma)}
11 |
12 | \item{X_list}{List}
13 |
14 | \item{mcov_list}{List}
15 |
16 | \item{S_scale_dist}{Logical. Indicates whether Mahalanobis distance should be scaled with the
17 | number of variables}
18 |
19 | \item{y_list}{List.}
20 |
21 | \item{negative}{Logical.}
22 | }
23 | \value{
24 | Scalar with the numeric value of the AICc formula
25 | }
26 | \description{
27 | AICc formula for several sets, alternative definition
28 | }
29 | \author{
30 | Martin Jullum
31 | }
32 | \keyword{internal}
33 |
--------------------------------------------------------------------------------
/man/check_groups.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/features.R
3 | \name{check_groups}
4 | \alias{check_groups}
5 | \title{Check that the group parameter has the right form and content}
6 | \usage{
7 | check_groups(feature_labels, group)
8 | }
9 | \arguments{
10 | \item{feature_labels}{Vector of characters. Contains the feature labels used by the model}
11 |
12 | \item{group}{List. If \code{NULL} regular feature wise Shapley values are computed.
13 | If provided, group wise Shapley values are computed. \code{group} then has length equal to
14 | the number of groups. The list element contains character vectors with the features included
15 | in each of the different groups.}
16 | }
17 | \value{
18 | Error or NULL
19 | }
20 | \description{
21 | Check that the group parameter has the right form and content
22 | }
23 | \keyword{internal}
24 |
--------------------------------------------------------------------------------
/man/gaussian_transform_separate.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/transformation.R
3 | \name{gaussian_transform_separate}
4 | \alias{gaussian_transform_separate}
5 | \title{Transforms new data to standardized normal (dimension 1) based on other data transformations}
6 | \usage{
7 | gaussian_transform_separate(yx, n_y)
8 | }
9 | \arguments{
10 | \item{yx}{Numeric vector. The first \code{n_y} items are the data to be transformed, and the last
11 | part is the data with the original transformation.}
12 |
13 | \item{n_y}{Positive integer. Number of elements of \code{yx} that belong to the Gaussian data.}
14 | }
15 | \value{
16 | Vector of back-transformed Gaussian data
17 | }
18 | \description{
19 | Transforms new data to standardized normal (dimension 1) based on other data transformations
20 | }
21 | \author{
22 | Martin Jullum
23 | }
24 | \keyword{internal}
25 |
--------------------------------------------------------------------------------
/man/aicc_full_single_cpp.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/RcppExports.R
3 | \name{aicc_full_single_cpp}
4 | \alias{aicc_full_single_cpp}
5 | \title{Temporary function for computing the full AICc with several X's, etc.}
6 | \usage{
7 | aicc_full_single_cpp(X, mcov, S_scale_dist, h, y)
8 | }
9 | \arguments{
10 | \item{X}{matrix with "covariates"}
11 |
12 | \item{mcov}{covariance matrix}
13 |
14 | \item{S_scale_dist}{logical indicating whether the Mahalanobis distance should be scaled with the number of variables}
15 |
16 | \item{h}{numeric specifying the scaling (sigma)}
17 |
18 | \item{y}{vector with the "response variable"}
19 | }
20 | \value{
21 | Scalar with the numeric value of the AICc formula
22 | }
23 | \description{
24 | Temporary function for computing the full AICc with several X's, etc.
25 | }
26 | \author{
27 | Martin Jullum
28 | }
29 | \keyword{internal}
30 |
--------------------------------------------------------------------------------
/man/weight_matrix_cpp.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/RcppExports.R
3 | \name{weight_matrix_cpp}
4 | \alias{weight_matrix_cpp}
5 | \title{Calculate weight matrix}
6 | \usage{
7 | weight_matrix_cpp(subsets, m, n, w)
8 | }
9 | \arguments{
10 | \item{subsets}{List. Each of the elements equals an integer
11 | vector representing a valid combination of features/feature groups.}
12 |
13 | \item{m}{Integer. Number of features/feature groups}
14 |
15 | \item{n}{Integer. Number of combinations}
16 |
17 | \item{w}{Numeric vector of length \code{n}, i.e. \code{w[i]} equals
18 | the Shapley weight of feature/feature group combination \code{i}, represented by
19 | \code{subsets[[i]]}.}
20 | }
21 | \value{
22 | Matrix of dimension \code{n x (m + 1)}
23 | }
24 | \description{
25 | Calculate weight matrix
26 | }
27 | \author{
28 | Nikolai Sellereite
29 | }
30 | \keyword{internal}
31 |
--------------------------------------------------------------------------------
/man/weight_matrix.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/shapley.R
3 | \name{weight_matrix}
4 | \alias{weight_matrix}
5 | \title{Calculate weighted matrix}
6 | \usage{
7 | weight_matrix(X, normalize_W_weights = TRUE, is_groupwise = FALSE)
8 | }
9 | \arguments{
10 | \item{X}{data.table}
11 |
12 | \item{normalize_W_weights}{Logical. Whether to normalize the weights for the combinations to sum to 1 for
13 | increased numerical stability before solving the WLS (weighted least squares). Applies to all combinations
14 | except combination \code{1} and \code{2^m}.}
15 |
16 | \item{is_groupwise}{Logical. Indicating whether group wise Shapley values are to be computed.}
17 | }
18 | \value{
19 | Numeric matrix. See \code{\link{weight_matrix_cpp}} for more information.
20 | }
21 | \description{
22 | Calculate weighted matrix
23 | }
24 | \author{
25 | Nikolai Sellereite, Martin Jullum
26 | }
27 | \keyword{internal}
28 |
--------------------------------------------------------------------------------
/man/create_S_batch.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/explanation.R
3 | \name{create_S_batch}
4 | \alias{create_S_batch}
5 | \title{Compute Shapley values in batches}
6 | \usage{
7 | create_S_batch(explainer, n_batches, index_S = NULL)
8 | }
9 | \arguments{
10 | \item{explainer}{The binary matrix \code{S} returned from \code{\link{shapr}}.}
11 |
12 | \item{n_batches}{Numeric value specifying how many batches \code{S} should be split into.}
13 |
14 | \item{index_S}{Numeric vector specifying which rows of \code{S} should be considered.}
15 | }
16 | \value{
17 | A list of length \code{n_batches}.
18 | }
19 | \description{
20 | Create a list of indexes used to compute Shapley values in batches.
21 | }
22 | \details{
23 | If \code{index_S} is not \code{NULL}, the number of batches is scaled such that the
24 | total number of batches is equal to \code{n_batches}, and not the number within the rows specified by \code{index_S}.
25 | }
26 | \keyword{internal}
27 |
--------------------------------------------------------------------------------
/man/apply_dummies.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/features.R
3 | \name{apply_dummies}
4 | \alias{apply_dummies}
5 | \title{Apply dummy variables - this is an internal function intended only to be used in
6 | predict_model.xgb.Booster()}
7 | \usage{
8 | apply_dummies(feature_list, testdata)
9 | }
10 | \arguments{
11 | \item{feature_list}{List. The \code{feature_list} object in the output object after running
12 | \code{\link[shapr:make_dummies]{make_dummies}}}
13 |
14 | \item{testdata}{data.table or data.frame. New data that has the same
15 | feature names, types, and levels as \code{feature_list}.}
16 | }
17 | \value{
18 | A data.table with all features but where the factors in \code{testdata} are
19 | one-hot encoded variables as specified in feature_list
20 | }
21 | \description{
22 | Apply dummy variables - this is an internal function intended only to be used in
23 | predict_model.xgb.Booster()
24 | }
25 | \author{
26 | Annabelle Redelmeier, Martin Jullum
27 | }
28 | \keyword{internal}
29 |
--------------------------------------------------------------------------------
/.github/workflows/pkgdown.yaml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/master/examples
2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
3 | on:
4 |   push:
5 |     branches: [main, master, cranversion]
6 |     tags: ['*']
7 |
8 | name: pkgdown
9 |
10 | jobs:
11 |   pkgdown:
12 |     runs-on: ubuntu-latest
13 |     env:
14 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
15 |     steps:
16 |       - uses: actions/checkout@v2
17 |
18 |       - uses: r-lib/actions/setup-pandoc@v1
19 |
20 |       - uses: r-lib/actions/setup-r@v1
21 |         with:
22 |           use-public-rspm: true
23 |
24 |       - uses: r-lib/actions/setup-r-dependencies@v1
25 |         with:
26 |           extra-packages: pkgdown
27 |           needs: website
28 |
29 |       - name: Deploy package
30 |         run: |
31 |           git config --local user.name "$GITHUB_ACTOR"
32 |           git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com"
33 |           Rscript -e 'pkgdown::deploy_to_branch(new_process = FALSE)'
34 |
--------------------------------------------------------------------------------
/man/feature_group_not_exact.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/features.R
3 | \name{feature_group_not_exact}
4 | \alias{feature_group_not_exact}
5 | \title{Analogue to feature_not_exact, but for groups instead.}
6 | \usage{
7 | feature_group_not_exact(group_num, n_combinations = 200, weight_zero_m = 10^6)
8 | }
9 | \arguments{
10 | \item{group_num}{List. Contains vector of integers indicating the feature numbers for the
11 | different groups.}
12 |
13 | \item{n_combinations}{Integer. The number of feature combinations to sample. If \code{NULL},
14 | the exact method is used and all combinations are considered. The maximum number of
15 | combinations equals \code{2^ncol(x)}.}
16 |
17 | \item{weight_zero_m}{Positive integer. Represents the Shapley weight for two special
18 | cases, i.e. the case where you have either \code{0} or \code{m} features/feature groups.}
19 | }
20 | \value{
21 | data.table with all feature group combinations, Shapley weights, etc.
22 | }
23 | \description{
24 | Analogue to feature_not_exact, but for groups instead.
25 | }
26 | \keyword{internal}
27 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | # MIT License
2 |
3 | Copyright (c) 2019 Norsk Regnesentral
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/man/sample_copula.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/sampling.R
3 | \name{sample_copula}
4 | \alias{sample_copula}
5 | \title{Sample conditional variables using the Gaussian copula approach}
6 | \usage{
7 | sample_copula(
8 |   index_given,
9 |   n_samples,
10 |   mu,
11 |   cov_mat,
12 |   m,
13 |   x_test_gaussian,
14 |   x_train,
15 |   x_test
16 | )
17 | }
18 | \arguments{
19 | \item{index_given}{Integer vector. The indices of the features to condition upon. Note that
20 | \code{min(index_given) >= 1} and \code{max(index_given) <= m}.}
21 |
22 | \item{m}{Positive integer. The total number of features.}
23 |
24 | \item{x_test_gaussian}{Numeric matrix. Contains the observation whose predictions ought to be explained (test data),
25 | after quantile-transforming them to standard Gaussian variables.}
26 |
27 | \item{x_test}{Numeric matrix. Contains the features of the observation whose
28 | predictions ought to be explained (test data).}
29 | }
30 | \value{
31 | data.table
32 | }
33 | \description{
34 | Sample conditional variables using the Gaussian copula approach
35 | }
36 | \author{
37 | Martin Jullum
38 | }
39 | \keyword{internal}
40 |
--------------------------------------------------------------------------------
/man/shapley_weights.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/shapley.R
3 | \name{shapley_weights}
4 | \alias{shapley_weights}
5 | \title{Calculate Shapley weight}
6 | \usage{
7 | shapley_weights(m, N, n_components, weight_zero_m = 10^6)
8 | }
9 | \arguments{
10 | \item{m}{Positive integer. Total number of features/feature groups.}
11 |
12 | \item{N}{Positive integer. The number of unique combinations when sampling \code{n_components} features/feature
13 | groups, without replacement, from a sample space consisting of \code{m} different features/feature groups.}
14 |
15 | \item{n_components}{Positive integer. Represents the number of features/feature groups you want to sample from
16 | a feature space consisting of \code{m} unique features/feature groups. Note that \code{0 <= n_components <= m}.}
17 |
18 | \item{weight_zero_m}{Positive integer. Represents the Shapley weight for two special
19 | cases, i.e. the case where you have either \code{0} or \code{m} features/feature groups.}
20 | }
21 | \value{
22 | Numeric
23 | }
24 | \description{
25 | Calculate Shapley weight
26 | }
27 | \author{
28 | Nikolai Sellereite
29 | }
30 | \keyword{internal}
31 |
--------------------------------------------------------------------------------
/inst/scripts/shap_python_script.py:
--------------------------------------------------------------------------------
1 | #### Python code ####
2 | import xgboost as xgb
3 | import shap
4 | import numpy as np
5 | import pandas as pd
6 | import time
7 |
8 | model = xgb.Booster()  # init model
9 | model.load_model("inst/model_objects/xgboost_model_object_raw")
10 |
11 | ## kernel shap sends data as numpy array which has no column names, so we fix it
12 | def xgb_predict(data_asarray):
13 |     data_asDmatrix = xgb.DMatrix(data_asarray)
14 |     return model.predict(data_asDmatrix)
15 |
16 | py_pred_test = xgb_predict(r.x_test)  # Test predictions in python
17 |
18 | #
19 |
20 | #### Applying kernelshap
21 |
22 | time_py_start = time.perf_counter()
23 |
24 | shap_kernel_explainer = shap.KernelExplainer(xgb_predict, r.x_train)
25 | Kshap_shap0 = shap_kernel_explainer.shap_values(r.x_test, nsamples = int(100000), l1_reg = 0)
26 |
27 | time_py_end = time.perf_counter()
28 |
29 | time_py = time_py_end - time_py_start
30 |
31 | getattr(shap_kernel_explainer, 'expected_value')  # This is phi0, not used at all below
32 |
33 | Kshap_shap = pd.DataFrame(Kshap_shap0, columns = r.x_var)
34 |
35 | Kshap_shap.insert(0, "none", getattr(shap_kernel_explainer, 'expected_value'), True)  # Adding the none column
36 |
--------------------------------------------------------------------------------
/man/mahalanobis_distance_cpp.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/RcppExports.R
3 | \name{mahalanobis_distance_cpp}
4 | \alias{mahalanobis_distance_cpp}
5 | \title{(Generalized) Mahalanobis distance}
6 | \usage{
7 | mahalanobis_distance_cpp(
8 |   featureList,
9 |   Xtrain_mat,
10 |   Xtest_mat,
11 |   mcov,
12 |   S_scale_dist
13 | )
14 | }
15 | \arguments{
16 | \item{featureList}{List of vectors indicating all feature combinations that should be included in the computations. Assumes that the first one is empty.}
17 |
18 | \item{Xtrain_mat}{Matrix}
19 |
20 | \item{Xtest_mat}{Matrix}
21 |
22 | \item{mcov}{Matrix. The Sigma-matrix in the Mahalanobis distance formula: \code{stats::cov(Xtrain_mat)} gives the Mahalanobis distance,
23 | \code{diag(m)} gives the Euclidean distance.}
24 |
25 | \item{S_scale_dist}{Logical indicating whether the Mahalanobis distance should be scaled with the number of variables}
26 | }
27 | \value{
28 | Array of three dimensions. Contains the squared distances between all training and test observations for all feature combinations passed to the function.
29 | }
30 | \description{
31 | Used to get the Euclidean distance as well by setting \code{mcov} = \code{diag(m)}.
32 | }
33 | \author{
34 | Martin Jullum
35 | }
36 | \keyword{internal}
37 |
--------------------------------------------------------------------------------
/man/process_groups.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/preprocess_data.R
3 | \name{process_groups}
4 | \alias{process_groups}
5 | \title{Process (check and update names) the group list}
6 | \usage{
7 | process_groups(group, feature_labels)
8 | }
9 | \arguments{
10 | \item{group}{List. If \code{NULL} regular feature wise Shapley values are computed.
11 | If provided, group wise Shapley values are computed. \code{group} then has length equal to
12 | the number of groups. The list element contains character vectors with the features included
13 | in each of the different groups.}
14 |
15 | \item{feature_labels}{Vector of characters. Contains the feature labels used by the model}
16 | }
17 | \value{
18 | List with two named elements: \code{group}: the input, but with group names added if non-existing;
19 | \code{group_num}: a corresponding group list with feature names replaced by feature numbers.
20 | }
21 | \description{
22 | Process (check and update names) the group list
23 | }
24 | \details{
25 | This function checks the provided \code{group} list against the \code{feature_labels} used by the model,
26 | adds group names where they are missing, and maps the feature names in each group to feature numbers.
27 | }
28 | \author{
29 | Martin Jullum
30 | }
31 | \keyword{internal}
32 |
--------------------------------------------------------------------------------
/inst/scripts/readme_example.R:
--------------------------------------------------------------------------------
1 | library(xgboost)
2 | library(shapr)
3 |
4 | data("Boston", package = "MASS")
5 |
6 | x_var <- c("lstat", "rm", "dis", "indus")
7 | y_var <- "medv"
8 |
9 | x_train <- as.matrix(Boston[-1:-6, x_var])
10 | y_train <- Boston[-1:-6, y_var]
11 | x_test <- as.matrix(Boston[1:6, x_var])
12 |
13 | # Looking at the dependence between the features
14 | cor(x_train)
15 |
16 | # Fitting a basic xgboost model to the training data
17 | model <- xgboost(
18 |   data = x_train,
19 |   label = y_train,
20 |   nround = 20,
21 |   verbose = FALSE
22 | )
23 |
24 | # Prepare the data for explanation
25 | explainer <- shapr(x_train, model)
26 |
27 | # Specifying the phi_0, i.e. the expected prediction without any features
28 | p <- mean(y_train)
29 |
30 | # Computing the actual Shapley values with kernelSHAP accounting for feature dependence using
31 | # the empirical (conditional) distribution approach with bandwidth parameter sigma = 0.1 (default)
32 | explanation <- explain(
33 |   x_test,
34 |   approach = "empirical",
35 |   explainer = explainer,
36 |   prediction_zero = p
37 | )
38 |
39 | # Printing the Shapley values for the test data.
40 | # For more information about the interpretation of the values in the table, see ?shapr::explain.
41 | print(explanation$dt)
42 |
43 | # Finally we plot the resulting explanations
44 | plot(explanation)
45 |
--------------------------------------------------------------------------------
/man/update_data.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/preprocess_data.R
3 | \name{update_data}
4 | \alias{update_data}
5 | \title{Updates data by reference according to the updater argument.}
6 | \usage{
7 | update_data(data, updater)
8 | }
9 | \arguments{
10 | \item{data}{data.table. Data that ought to be updated.}
11 |
12 | \item{updater}{List. The object should be the output from
13 | \code{\link[shapr:check_features]{check_features}}.}
14 | }
15 | \value{
16 | NULL.
17 | }
18 | \description{
19 | \code{data} is updated, i.e. unused columns and factor levels are removed as described in
20 | \code{updater}. This is done by reference, i.e. it updates the object passed as \code{data} even if nothing is
21 | returned by the function itself.
22 | }
23 | \examples{
24 | # Load example data
25 | if (requireNamespace("MASS", quietly = TRUE)) {
26 |   data("Boston", package = "MASS")
27 |   # Split data into test- and training data
28 |   x_train <- data.table::as.data.table(head(Boston))
29 |   x_train[, rad := as.factor(rad)]
30 |   data_features <- get_data_specs(x_train)
31 |   model <- lm(medv ~ lstat + rm + rad + indus, data = x_train)
32 |
33 |   model_features <- get_model_specs(model)
34 |   updater <- check_features(model_features, data_features)
35 |   update_data(x_train, updater)
36 | }
37 | }
38 | \author{
39 | Martin Jullum
40 | }
41 | \keyword{internal}
42 |
--------------------------------------------------------------------------------
/tests/testthat/test-transformation.R:
--------------------------------------------------------------------------------
1 | context("test-transformation.R")
2 |
3 | test_that("Test inv_gaussian_transform", {
4 |
5 |   # Example -----------
6 |   zx <- rnorm(50)
7 |   n_z <- 30
8 |
9 |   x <- inv_gaussian_transform(zx, n_z)
10 |
11 |   # Tests -----------
12 |   expect_true(is.atomic(x))
13 |   expect_true(is.double(x))
14 |
15 |   expect_equal(length(x), n_z)
16 |   expect_true(min(x) >= min(zx[-c(1:n_z)]))
17 |   expect_true(max(x) <= max(zx[-c(1:n_z)]))
18 |
19 |   # Errors -----------
20 |   expect_error(inv_gaussian_transform(zx, length(zx)))
21 |   expect_error(inv_gaussian_transform(zx, length(zx) + 1))
22 | })
23 |
24 | test_that("Test gaussian_transform_separate", {
25 |
26 |   # Example -----------
27 |   yx <- rnorm(50)
28 |   n_y <- 30
29 |
30 |   x <- gaussian_transform_separate(yx, n_y)
31 |
32 |   # Tests -----------
33 |   expect_true(is.atomic(x))
34 |   expect_true(is.double(x))
35 |   expect_equal(length(x), n_y)
36 |
37 |   # Errors -----------
38 |   expect_error(gaussian_transform_separate(yx, length(yx)))
39 |   expect_error(gaussian_transform_separate(yx, length(yx) + 1))
40 | })
41 |
42 | test_that("Test gaussian_transform", {
43 |
44 |   # Example -----------
45 |   y <- rnorm(50)
46 |   x <- gaussian_transform(y)
47 |
48 |   # Tests -----------
49 |   expect_true(is.atomic(x))
50 |   expect_true(is.double(x))
51 |   expect_equal(length(x), length(y))
52 | })
53 |
--------------------------------------------------------------------------------
/man/sample_combinations.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/sampling.R
3 | \name{sample_combinations}
4 | \alias{sample_combinations}
5 | \title{Helper function to sample a combination of training and testing rows, which does not risk
6 | getting the same observation twice. Need to improve this help file.}
7 | \usage{
8 | sample_combinations(ntrain, ntest, nsamples, joint_sampling = TRUE)
9 | }
10 | \arguments{
11 | \item{ntrain}{Positive integer. Number of training observations to sample from.}
12 |
13 | \item{ntest}{Positive integer. Number of test observations to sample from.}
14 |
15 | \item{nsamples}{Positive integer. Number of samples.}
16 |
17 | \item{joint_sampling}{Logical. Indicates whether train and test data should be sampled
18 | separately or in a joint sampling space. If they are sampled separately (which typically
19 | would be used when optimizing more than one distribution at once), we sample with replacement
20 | if \code{nsamples > ntrain}. Note that this solution is not optimal. Be careful if you're
21 | doing optimization over every test observation when \code{nsamples > ntrain}.}
22 | }
23 | \value{
24 | data.frame
25 | }
26 | \description{
27 | Helper function to sample a combination of training and testing rows, which does not risk
28 | getting the same observation twice. Need to improve this help file.
29 | }
30 | \author{
31 | Martin Jullum
32 | }
33 | \keyword{internal}
34 |
--------------------------------------------------------------------------------
/inst/scripts/devel/devel_non_exact_grouping.R:
--------------------------------------------------------------------------------
1 |
2 | ### NOTE: THIS NO LONGER WORKS, as we switch to exact when a large n_combinations is used, but the checks
3 | ### confirm that the code works as intended.
4 |
5 | library(xgboost)
6 | library(shapr)
7 | library(data.table)
8 |
9 | #### Testing grouping function
10 |
11 | data("Boston", package = "MASS")
12 |
13 | x_var <- c("lstat", "rm", "dis", "indus", "age", "tax", "ptratio", "black", "nox")
14 | y_var <- "medv"
15 |
16 | x_train <- as.matrix(Boston[-1:-6, x_var])
17 | y_train <- Boston[-1:-6, y_var]
18 | x_test <- as.matrix(Boston[1:6, x_var])
19 |
20 | # Looking at the dependence between the features
21 | cor(x_train)
22 |
23 | # Fitting a basic xgboost model to the training data
24 | model <- xgboost(
25 |   data = x_train,
26 |   label = y_train,
27 |   nround = 20,
28 |   verbose = FALSE
29 | )
30 |
31 | group <- list(A = x_var[1:3], B = x_var[4:5], C = x_var[7], D = x_var[c(6, 8)], E = x_var[9])
32 |
33 | explainer1 <- shapr(x_train, model, group = group, n_combinations = 10^6)
34 |
35 | explainer2 <- shapr(x_train, model, group = group)
36 | p <- mean(y_train)  # phi_0: the expected prediction without any features
37 | explanation1 <- explain(
38 |   x_test,
39 |   approach = "independence",
40 |   explainer = explainer1,
41 |   prediction_zero = p
42 | )
43 |
44 | explanation2 <- explain(
45 |   x_test,
46 |   approach = "independence",
47 |   explainer = explainer2,
48 |   prediction_zero = p
49 | )
50 |
51 |
52 | explanation2$dt - explanation1$dt  # All small differences!
53 |
54 |
55 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Code of Conduct
2 |
3 | As contributors and maintainers of this project, we pledge to respect all people who
4 | contribute through reporting issues, posting feature requests, updating documentation,
5 | submitting pull requests or patches, and other activities.
6 |
7 | We are committed to making participation in this project a harassment-free experience for
8 | everyone, regardless of level of experience, gender, gender identity and expression,
9 | sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion.
10 |
11 | Examples of unacceptable behavior by participants include the use of sexual language or
12 | imagery, derogatory comments or personal attacks, trolling, public or private harassment,
13 | insults, or other unprofessional conduct.
14 |
15 | Project maintainers have the right and responsibility to remove, edit, or reject comments,
16 | commits, code, wiki edits, issues, and other contributions that are not aligned to this
17 | Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed
18 | from the project team.
19 |
20 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by
21 | opening an issue or contacting one or more of the project maintainers.
22 | 23 | This Code of Conduct is adapted from the Contributor Covenant 24 | (https://www.contributor-covenant.org), version 1.0.0, available at 25 | https://contributor-covenant.org/version/1/0/0/. 26 | -------------------------------------------------------------------------------- /man/get_data_specs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/preprocess_data.R 3 | \name{get_data_specs} 4 | \alias{get_data_specs} 5 | \title{Fetches feature information from a given data set} 6 | \usage{ 7 | get_data_specs(x) 8 | } 9 | \arguments{ 10 | \item{x}{matrix, data.frame or data.table The data to extract feature information from.} 11 | } 12 | \value{ 13 | A list with the following elements: 14 | \describe{ 15 | \item{labels}{character vector with the feature names to compute Shapley values for} 16 | \item{classes}{a named character vector with the labels as names and the class types as elements} 17 | \item{factor_levels}{a named list with the labels as names and character vectors with the factor levels as elements 18 | (NULL if the feature is not a factor)} 19 | } 20 | } 21 | \description{ 22 | Fetches feature information from a given data set 23 | } 24 | \details{ 25 | This function is used to extract the feature information to be checked against the corresponding 26 | information extracted from the model and other data sets. The function is called from 27 | \code{\link[shapr:preprocess_data]{preprocess_data}} 28 | and \code{\link[shapr:make_dummies]{make_dummies}} 29 | } 30 | \examples{ 31 | # Load example data 32 | if (requireNamespace("MASS", quietly = TRUE)) { 33 | data("Boston", package = "MASS") 34 | # Split data into test- and training data 35 | x_train <- data.table::as.data.table(head(Boston)) 36 | x_train[, rad := as.factor(rad)] 37 | get_data_specs(x_train) 38 | } 39 | } 40 | \author{ 41 | Martin Jullum 42 | } 43 | \keyword{internal} 44 | -------------------------------------------------------------------------------- /inst/scripts/devel/same_seed_as_master.R: -------------------------------------------------------------------------------- 1 | library(xgboost) 2 | #library(shapr) 3 | library(data.table) 4 | 5 | data("Boston", package = "MASS") 6 | 7 | x_var <- c("lstat", "rm", "dis", "indus")#,"nox","age","tax","ptratio") 8 | y_var <- "medv" 9 | 10 | x_train <- as.matrix(Boston[-1:-6, x_var]) 11 | y_train <- Boston[-1:-6, y_var] 12 | x_test <- as.matrix(Boston[1:6, x_var]) 13 | 14 | # Fitting a basic xgboost model to the training data 15 | model <- xgboost( 16 | data = x_train, 17 | label = y_train, 18 | nround = 20, 19 | verbose = FALSE 20 | ) 21 | # THIS IS GENERATED FROM MASTER BRANCH 22 | # Prepare the data for explanation 23 | explainer <- shapr(x_train, model,n_combinations = 100) 24 | p = mean(y_train) 25 | gauss = explain(x_test, explainer, "gaussian", prediction_zero = p, n_samples = 10000) 26 | emp = explain(x_test, explainer, "empirical", prediction_zero = p, n_samples = 10000) 27 | copula = explain(x_test, explainer, "copula", prediction_zero = p, n_samples = 10000) 28 | indep = explain(x_test, explainer, "independence", prediction_zero = p, n_samples = 10000) 29 | comb = explain(x_test, explainer, c("gaussian", "gaussian", "empirical", "empirical"), prediction_zero = p, n_samples = 10000) 30 | ctree = explain(x_test, explainer, "ctree", mincriterion = 0.95, prediction_zero = p, n_samples = 10000) 31 | ctree2 = explain(x_test, explainer, "ctree", 
mincriterion = c(0.95, 0.95, 0.95, 0.95), prediction_zero = p, n_samples = 10000)
32 |
33 |
34 | # results from master
35 |
36 | res_master = readRDS("inst/scripts/devel/master_res.rds")
37 |
38 | all.equal(comb$dt, res_master$comb$dt) # TRUE
39 | all.equal(comb$p, res_master$comb$p) # TRUE
40 |
-------------------------------------------------------------------------------- /man/model_checker.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/models.R
3 | \name{model_checker}
4 | \alias{model_checker}
5 | \alias{model_checker.default}
6 | \alias{model_checker.lm}
7 | \alias{model_checker.glm}
8 | \alias{model_checker.ranger}
9 | \alias{model_checker.gam}
10 | \alias{model_checker.xgb.Booster}
11 | \title{Check that the type of model is supported by the explanation method}
12 | \usage{
13 | model_checker(x)
14 |
15 | \method{model_checker}{default}(x)
16 |
17 | \method{model_checker}{lm}(x)
18 |
19 | \method{model_checker}{glm}(x)
20 |
21 | \method{model_checker}{ranger}(x)
22 |
23 | \method{model_checker}{gam}(x)
24 |
25 | \method{model_checker}{xgb.Booster}(x)
26 | }
27 | \arguments{
28 | \item{x}{Model object for the model to be explained.}
29 | }
30 | \value{
31 | Error or NULL
32 | }
33 | \description{
34 | The function checks whether the model given by \code{x} is supported.
35 | If \code{x} is not a supported model the function will return an error message, otherwise it returns NULL
36 | (meaning all models of this class are supported).
37 | }
38 | \details{
39 | See \code{\link{predict_model}} for more information about
40 | what type of models \code{shapr} currently supports.
41 | }
42 | \examples{
43 | if (requireNamespace("MASS", quietly = TRUE)) {
44 |   # Load example data
45 |   data("Boston", package = "MASS")
46 |   # Split data into test- and training data
47 |   x_train <- head(Boston, -3)
48 |   # Fit a linear model
49 |   model <- lm(medv ~ lstat + rm + dis + indus, data = x_train)
50 |
51 |   # Checking the model object
52 |   model_checker(x = model)
53 | }
54 | }
55 | \keyword{internal}
-------------------------------------------------------------------------------- /man/prepare_and_predict.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/explanation.R
3 | \name{prepare_and_predict}
4 | \alias{prepare_and_predict}
5 | \title{Calculate Shapley values}
6 | \usage{
7 | prepare_and_predict(explainer, n_batches, prediction_zero, ...)
8 | }
9 | \arguments{
10 | \item{explainer}{An \code{explainer} object to use for explaining the observations.
11 | See \code{\link{shapr}}.}
12 |
13 | \item{n_batches}{Positive integer.
14 | Specifies how many batches the total number of feature combinations should be split into when calculating the
15 | contribution function for each test observation.
16 | The default value is 1.
17 | Increasing the number of batches may significantly reduce the RAM allocation for models with many features.
18 | This typically comes with a small increase in computation time.}
19 |
20 | \item{prediction_zero}{Numeric. The prediction value for unseen data, typically equal to the mean of
21 | the response.}
22 |
23 | \item{...}{Arguments passed to \code{\link{prepare_data}}, with the exception of \code{only_return_contrib_dt},
24 | which is only passed to \code{explain}. If \code{TRUE}, the
25 | \code{data.table} from \code{\link{prediction}} is returned, else an object of class \code{shapr}.
26 | Each column (except for \code{row_id}) corresponds to the vector \code{v_D} in Equation 7 of the reference.
27 | The Shapley values can be calculated by \code{t(explainer$W \%*\% dt_contrib[, -"row_id"])}}
28 | }
29 | \value{
30 | A list. See \code{\link{explain}} for more information.
31 | }
32 | \description{
33 | Sample covariate values, predict and calculate Shapley values. The sampling and prediction can be done in batches
34 | if \code{n_batches} is greater than 1.
35 | }
36 | \keyword{internal}
37 |
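To make the note on only_return_contrib_dt above concrete, a schematic sketch from the editor (made-up dimensions, plain base R, not package code) of how the contribution matrix maps to Shapley values:

m <- 3                                                # number of features
n_comb <- 2^m                                         # number of feature combinations
n_test <- 2                                           # number of test observations
W <- matrix(rnorm((m + 1) * n_comb), nrow = m + 1)    # stands in for explainer$W
v_D <- matrix(rnorm(n_comb * n_test), nrow = n_comb)  # one contribution vector v_D per test observation
phi <- t(W %*% v_D)  # one row per test observation: phi0, then one Shapley value per feature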
-------------------------------------------------------------------------------- /man/preprocess_data.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/preprocess_data.R
3 | \name{preprocess_data}
4 | \alias{preprocess_data}
5 | \title{Process (check and update) data according to specified feature list}
6 | \usage{
7 | preprocess_data(x, feature_list)
8 | }
9 | \arguments{
10 | \item{x}{matrix, data.frame or data.table. The data to check input for and update
11 | according to the specification in \code{feature_list}.}
12 |
13 | \item{feature_list}{List. Output from running \code{\link[shapr:get_data_specs]{get_data_specs}} or
14 | \code{\link[shapr:get_model_specs]{get_model_specs}}}
15 | }
16 | \value{
17 | List with two named elements: \code{x_dt}: checked and updated data \code{x} in data.table format, and
18 | \code{update_feature_list}: the output from \code{\link[shapr:check_features]{check_features}}
19 | }
20 | \description{
21 | Process (check and update) data according to specified feature list
22 | }
23 | \details{
24 | This function takes care of all preprocessing and checking of the provided data in \code{x} against
25 | the feature_list, which is typically the output from \code{\link[shapr:get_model_specs]{get_model_specs}}
26 | }
27 | \examples{
28 | # Load example data
29 | if (requireNamespace("MASS", quietly = TRUE)) {
30 |   data("Boston", package = "MASS")
31 |   # Split data into test- and training data
32 |   x_train <- data.table::as.data.table(head(Boston))
33 |   x_train[, rad := as.factor(rad)]
34 |   data_features <- get_data_specs(x_train)
35 |   model <- lm(medv ~ lstat + rm + rad + indus, data = x_train)
36 |
37 |   model_features <- get_model_specs(model)
38 |   preprocess_data(x_train, model_features)
39 | }
40 | }
41 | \author{
42 | Martin Jullum
43 | }
44 | \keyword{internal}
-------------------------------------------------------------------------------- /tests/testthat/test-plot.R: --------------------------------------------------------------------------------
1 | context("test-plot.R")
2 |
3 | test_that("Test plot.shapr", {
4 |   if (requireNamespace("ggplot2", quietly = TRUE)) {
5 |
6 |     # Example -----------
7 |     x <- matrix(c(
8 |       4.98, 9.14, 4.03, 2.94, 5.33,
9 |       6.575, 6.421, 7.185, 6.998, 7.147,
10 |       4.0900, 4.9671, 4.9671, 6.0622, 6.0622,
11 |       2.31, 7.07, 7.07, 2.18, 2.18
12 |     ),
13 |     ncol = 4
14 |     )
15 |
16 |     colnames(x) <- c("lstat", "rm", "dis", "indus")
17 |
18 |     explanation <- list()
19 |     explanation$p <- c(31.30145, 23.25194, 33.11547, 33.43015, 31.72984)
20 |     explanation$dt <- data.table::data.table(
21 |       "none" = rep(22.00, 5),
22 |       "lstat" = c(5.2632, 0.1672, 5.9888, 8.2142, 0.5060),
23 |       "rm" = c(-1.2527, -0.7088, 5.5451, 0.7508, 5.6875),
24 |       "dis" = c(0.2920, 0.9689, 0.5660, 0.1893, 0.8432),
25 |       "indus" = c(4.5529, 0.3787, -1.4304, 1.8298, 2.2471)
26 |     )
27 |     explanation$x_test <- x
28 |     explanation$is_groupwise <- FALSE
29 |     attr(explanation, "class") <- c("shapr", "list")
30 |
31 |
32 |     # Test -----------
33 |     p <- plot(explanation, plot_phi0 = FALSE)
34 |
35 |     expect_equal(colnames(x), unique(as.character(p$data$variable)))
36 |     expect_equal(explanation$p, unique(p$data$pred))
37 |     expect_equal(sort(as.vector(as.matrix(explanation$dt[, -c("none")]))), sort(p$data$phi))
38 |
39 |     p <- plot(explanation, plot_phi0 = TRUE)
40 |
41 |     expect_equal(colnames(explanation$dt), unique(as.character(p$data$variable)))
42 |     expect_equal(explanation$p, unique(p$data$pred))
43 |     expect_equal(sort(as.vector(as.matrix(explanation$dt))), sort(p$data$phi))
44 |
45 |     p <- plot(explanation, plot_phi0 = TRUE, top_k_features = 2)
46 |
47 |     expect_equal(2, max(p$data$rank))
48 |   }
49 | })
50 |
-------------------------------------------------------------------------------- /man/check_features.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/preprocess_data.R
3 | \name{check_features}
4 | \alias{check_features}
5 | \title{Checks that two extracted feature lists have exactly the same properties}
6 | \usage{
7 | check_features(f_list_1, f_list_2, use_1_as_truth = T)
8 | }
9 | \arguments{
10 | \item{f_list_1, f_list_2}{List. As extracted from either \code{get_data_specs} or \code{get_model_specs}.}
11 |
12 | \item{use_1_as_truth}{Logical. If TRUE, \code{f_list_2} is compared to \code{f_list_1}, i.e. additional elements
13 | are allowed in \code{f_list_2}, and if \code{f_list_1}'s feature classes contain NAs, the feature class check is
14 | ignored regardless of what is specified in \code{f_list_1}. If FALSE, \code{f_list_1} and \code{f_list_2} are
15 | treated as equals and need to contain exactly the same elements. Set to TRUE when comparing a model and data, and FALSE
16 | when comparing two data sets.}
17 | }
18 | \value{
19 | List. \code{f_list_1} is returned as provided if all checks can be carried out with it. If some info is
20 | missing from \code{f_list_1}, the function continues the consistency checking using \code{f_list_2} and returns that.
21 | }
22 | \description{
23 | Checks that two extracted feature lists have exactly the same properties
24 | }
25 | \examples{
26 | # Load example data
27 | if (requireNamespace("MASS", quietly = TRUE)) {
28 |   data("Boston", package = "MASS")
29 |   # Split data into test- and training data
30 |   x_train <- data.table::as.data.table(head(Boston))
31 |   x_train[, rad := as.factor(rad)]
32 |   data_features <- get_data_specs(x_train)
33 |   model <- lm(medv ~ lstat + rm + rad + indus, data = x_train)
34 |
35 |   model_features <- get_model_specs(model)
36 |   check_features(model_features, data_features)
37 | }
38 | }
39 | \author{
40 | Martin Jullum
41 | }
42 | \keyword{internal}
-------------------------------------------------------------------------------- /man/make_dummies.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/features.R
3 | \name{make_dummies}
4 | \alias{make_dummies}
5 | \title{Initiate the making of dummy variables}
6 | \usage{
7 | make_dummies(traindata, testdata)
8 | }
9 | \arguments{
10 | \item{traindata}{data.table or data.frame.}
11 |
12 | \item{testdata}{data.table or data.frame.
New data that has the same
13 | feature names, types, and levels as \code{traindata}.}
14 | }
15 | \value{
16 | A list that contains the following entries:
17 | \describe{
18 |   \item{feature_list}{List. Output from \code{check_features}}
19 |   \item{train_dummies}{A data.frame containing all of the factors in \code{traindata} as
20 |   one-hot encoded variables.}
21 |   \item{test_dummies}{A data.frame containing all of the factors in \code{testdata} as
22 |   one-hot encoded variables.}
23 |   \item{traindata_new}{Original traindata with correct column ordering and factor levels. To be passed to
24 |   \code{\link[shapr:shapr]{shapr}}.}
25 |   \item{testdata_new}{Original testdata with correct column ordering and factor levels. To be passed to
26 |   \code{\link[shapr:explain]{explain}}.}
27 | }
28 | }
29 | \description{
30 | Initiate the making of dummy variables
31 | }
32 | \examples{
33 | if (requireNamespace("MASS", quietly = TRUE)) {
34 |   data("Boston", package = "MASS")
35 |   x_var <- c("lstat", "rm", "dis", "indus")
36 |   y_var <- "medv"
37 |   x_train <- as.data.frame(Boston[401:411, x_var])
38 |   y_train <- Boston[401:411, y_var]
39 |   x_test <- as.data.frame(Boston[1:4, x_var])
40 |
41 |   # Convert to factors for illustration purposes
42 |   x_train$rm <- factor(round(x_train$rm))
43 |   x_test$rm <- factor(round(x_test$rm), levels = levels(x_train$rm))
44 |
45 |   dummylist <- make_dummies(traindata = x_train, testdata = x_test)
46 | }
47 | }
48 | \author{
49 | Annabelle Redelmeier, Martin Jullum
50 | }
-------------------------------------------------------------------------------- /.github/CONTRIBUTING.md: --------------------------------------------------------------------------------
1 | # Contributing to shapr
2 |
3 | This outlines how to propose a change to shapr.
4 |
5 | ### Fixing typos
6 |
7 | Small typos or grammatical errors in documentation may be edited directly using
8 | the GitHub web interface, so long as the changes are made in the _source_ file.
9 |
10 | * YES: you edit a roxygen comment in a `.R` file below `R/`.
11 | * NO: you edit an `.Rd` file below `man/`.
12 |
13 | ### Prerequisites
14 |
15 | Before you make a substantial pull request, you should always file an issue and
16 | make sure someone from the team agrees that it's a problem. If you've found a
17 | bug, create an associated issue and illustrate the bug with a minimal
18 | [reprex](https://www.tidyverse.org/help/#reprex).
19 |
20 | ### Pull request process
21 |
22 | * We recommend that you create a Git branch for each pull request (PR).
23 | * Look at the CI build status before and after making changes.
24 |   The `README` should contain badges for any continuous integration services used
25 |   by the package.
26 | * New code should follow the tidyverse [style guide](http://style.tidyverse.org).
27 |   You can use the [styler](https://CRAN.R-project.org/package=styler) package to
28 |   apply these styles (see the sketch after this list), but please don't restyle code
29 |   that has nothing to do with your PR.
30 | * We use [roxygen2](https://cran.r-project.org/package=roxygen2) for documentation.
31 | * We use [testthat](https://cran.r-project.org/package=testthat). Contributions
32 |   with test cases included are easier to accept.
33 | * For user-facing changes, add a bullet to the top of `NEWS.md` below the
34 |   current development version header describing the changes made followed by your
35 |   GitHub username, and links to relevant issue(s)/PR(s).
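A rough sketch of the local workflow these bullets describe (assuming the `devtools` and `styler` packages are installed; the file path below is just a placeholder):

```r
styler::style_file("R/my_changed_file.R") # placeholder path: style only the files you touched
devtools::document()                      # regenerate the roxygen2 documentation
devtools::test()                          # run the testthat suite
devtools::check()                         # full R CMD check before opening the PR
```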
36 | 37 | ### Code of Conduct 38 | 39 | Please note that the shapr project is released with a 40 | [Contributor Code of Conduct](CODE_OF_CONDUCT.md). By contributing to this 41 | project you agree to abide by its terms. 42 | -------------------------------------------------------------------------------- /tests/testthat/test-predictions.R: -------------------------------------------------------------------------------- 1 | context("test-predictions.R") 2 | 3 | test_that("Test prediction", { 4 | 5 | # Example ----------- 6 | if (requireNamespace("MASS", quietly = TRUE)) { 7 | data("Boston", package = "MASS") 8 | dt_train <- data.table::as.data.table(Boston) 9 | features <- c("lstat", "rm", "dis", "indus") 10 | n_combinations <- 10 11 | n_features <- 4 12 | prediction_zero <- .5 13 | n_xtest <- 8 14 | explainer <- list() 15 | explainer$model <- stats::lm(formula = "medv ~ lstat + rm + dis + indus", data = head(dt_train, -n_xtest)) 16 | explainer$x_test <- tail(dt_train[, .SD, .SDcols = features], n_xtest) 17 | explainer$W <- matrix(1, nrow = n_features + 1, ncol = n_combinations) 18 | explainer$is_groupwise <- FALSE 19 | explainer$S <- matrix(1, nrow = n_combinations, ncol = n_features) 20 | dt <- dt_train[, .SD, .SDcols = features][rep(1:.N, 4)] 21 | dt[, id := rep_len(1:n_xtest, .N)] 22 | dt[, id_combination := rep_len(1:n_combinations, .N), id] 23 | dt[, w := runif(.N)] 24 | max_id_combination <- dt[, max(id_combination)] 25 | dt <- dt[!(id_combination == max_id_combination)] 26 | dt_lastrows <- data.table::data.table( 27 | explainer$x_test, 28 | id = 1:n_xtest, 29 | id_combination = max_id_combination, 30 | w = 1.0 31 | ) 32 | dt <- rbind(dt, dt_lastrows, dt_lastrows, dt_lastrows) 33 | 34 | x <- prediction(dt, prediction_zero, explainer) 35 | 36 | # Test ----------- 37 | lnms <- c("p", "dt_mat") 38 | expect_equal(class(x), "list") 39 | expect_equal(names(x), lnms) 40 | expect_equal(x$p, predict_model(explainer$model, explainer$x_test)) 41 | expect_true(data.table::is.data.table(x$dt_mat)) 42 | 43 | # t(W %*% x$dt_mat) = shapley values 44 | expect_equal(ncol(explainer$W), nrow(x$dt_mat)) 45 | 46 | # Tests errors 47 | expect_error(prediction(dt[id < n_xtest], prediction_zero, explainer)) 48 | } 49 | }) 50 | -------------------------------------------------------------------------------- /man/observation_impute.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/observations.R 3 | \name{observation_impute} 4 | \alias{observation_impute} 5 | \title{Generate permutations of training data using test observations} 6 | \usage{ 7 | observation_impute( 8 | W_kernel, 9 | S, 10 | x_train, 11 | x_test, 12 | w_threshold = 0.7, 13 | n_samples = 1000 14 | ) 15 | } 16 | \arguments{ 17 | \item{W_kernel}{Numeric matrix. Contains all nonscaled weights between training and test 18 | observations for all feature combinations. The dimension equals \code{n_train x m}.} 19 | 20 | \item{S}{Integer matrix of dimension \code{n_combinations x m}, where \code{n_combinations} 21 | and \code{m} equals the total number of sampled/non-sampled feature combinations and 22 | the total number of unique features, respectively. 
Note that \code{m = ncol(x_train)}.} 23 | 24 | \item{x_train}{Numeric matrix} 25 | 26 | \item{x_test}{Numeric matrix} 27 | 28 | \item{w_threshold}{Numeric vector of length 1, with \code{0 < w_threshold <= 1} representing the minimum proportion 29 | of the total empirical weight that data samples should use. If e.g. \code{w_threshold = .8} we will choose the 30 | \code{K} samples with the largest weight so that the sum of the weights accounts for 80\% of the total weight. 31 | \code{w_threshold} is the \eqn{\eta} parameter in equation (15) of Aas et al (2021).} 32 | 33 | \item{n_samples}{Positive integer. Indicating the maximum number of samples to use in the 34 | Monte Carlo integration for every conditional expectation. See also details.} 35 | } 36 | \value{ 37 | data.table 38 | } 39 | \description{ 40 | Generate permutations of training data using test observations 41 | } 42 | \references{ 43 | Aas, K., Jullum, M., & Løland, A. (2021). Explaining individual predictions when features are dependent: 44 | More accurate approximations to Shapley values. Artificial Intelligence, 298, 103502. 45 | } 46 | \author{ 47 | Nikolai Sellereite 48 | } 49 | \keyword{internal} 50 | -------------------------------------------------------------------------------- /man/get_model_specs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/models.R 3 | \name{get_model_specs} 4 | \alias{get_model_specs} 5 | \alias{get_model_specs.default} 6 | \alias{get_model_specs.lm} 7 | \alias{get_model_specs.glm} 8 | \alias{get_model_specs.gam} 9 | \alias{get_model_specs.ranger} 10 | \alias{get_model_specs.xgb.Booster} 11 | \title{Fetches feature information from a given model object} 12 | \usage{ 13 | get_model_specs(x) 14 | 15 | \method{get_model_specs}{default}(x) 16 | 17 | \method{get_model_specs}{lm}(x) 18 | 19 | \method{get_model_specs}{glm}(x) 20 | 21 | \method{get_model_specs}{gam}(x) 22 | 23 | \method{get_model_specs}{ranger}(x) 24 | 25 | \method{get_model_specs}{xgb.Booster}(x) 26 | } 27 | \arguments{ 28 | \item{x}{Model object for the model to be explained.} 29 | } 30 | \value{ 31 | A list with the following elements: 32 | \describe{ 33 | \item{labels}{character vector with the feature names to compute Shapley values for} 34 | \item{classes}{a named character vector with the labels as names and the class type as elements} 35 | \item{factor_levels}{a named list with the labels as names and character vectors with the factor levels as elements 36 | (NULL if the feature is not a factor)} 37 | } 38 | } 39 | \description{ 40 | Fetches feature information from a given model object 41 | } 42 | \details{ 43 | This function is used to extract the feature information to be checked against data passed to \code{shapr} 44 | and \code{explain}. The function is called from \code{preprocess_data}. 
45 | }
46 | \examples{
47 | if (requireNamespace("MASS", quietly = TRUE)) {
48 |   # Load example data
49 |   data("Boston", package = "MASS")
50 |   # Split data into test- and training data
51 |   x_train <- data.table::as.data.table(head(Boston))
52 |   x_train[, rad := as.factor(rad)]
53 |   model <- lm(medv ~ lstat + rm + rad + indus, data = x_train)
54 |
55 |   get_model_specs(model)
56 | }
57 | }
58 | \author{
59 | Martin Jullum
60 | }
61 | \keyword{internal}
62 |
-------------------------------------------------------------------------------- /R/transformation.R: --------------------------------------------------------------------------------
1 | #' Transforms new data to a standardized normal distribution
2 | #'
3 | #' @param zx Numeric vector. The first \code{n_z} items are the Gaussian data, and the last part is
4 | #' the data with the original transformation.
5 | #' @param n_z Positive integer. Number of elements of \code{zx} that belong to the new data.
6 | #'
7 | #' @return Numeric vector of length \code{n_z}
8 | #'
9 | #' @keywords internal
10 | #'
11 | #' @author Martin Jullum
12 | inv_gaussian_transform <- function(zx, n_z) {
13 |   if (n_z >= length(zx)) stop("n_z should be less than length(zx)")
14 |   ind <- 1:n_z
15 |   z <- zx[ind]
16 |   x <- zx[-ind]
17 |   u <- stats::pnorm(z)
18 |   x_new <- stats::quantile(x, probs = u)
19 |   return(as.double(x_new))
20 | }
21 |
22 | #' Transforms new data to standardized normal (dimension 1) based on other data transformations
23 | #'
24 | #' @param yx Numeric vector. The first \code{n_y} items are the data to be transformed, and the last
25 | #' part is the data with the original transformation.
26 | #' @param n_y Positive integer. Number of elements of \code{yx} that belong to the Gaussian data.
27 | #'
28 | #' @return Vector of Gaussian-transformed data
29 | #'
30 | #' @keywords internal
31 | #'
32 | #' @author Martin Jullum
33 | gaussian_transform_separate <- function(yx, n_y) {
34 |   if (n_y >= length(yx)) stop("n_y should be less than length(yx)")
35 |   ind <- 1:n_y
36 |   x <- yx[-ind]
37 |   tmp <- rank(yx)[ind]
38 |   tmp <- tmp - rank(tmp) + 0.5
39 |   u_y <- tmp / (length(x) + 1)
40 |   z_y <- stats::qnorm(u_y)
41 |   return(z_y)
42 | }
43 |
44 | #' Transforms a sample to standardized normal distribution
45 | #'
46 | #' @param x Numeric vector. The data which should be transformed to a standard normal distribution.
47 | #'
48 | #' @return Numeric vector of length \code{length(x)}
49 | #'
50 | #' @keywords internal
51 | #'
52 | #' @author Martin Jullum
53 | gaussian_transform <- function(x) {
54 |   u <- rank(x) / (length(x) + 1)
55 |   z <- stats::qnorm(u)
56 |   return(z)
57 | }
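An editor's sketch of how these internal transforms fit together (assumed usage, not from the package sources; the functions are unexported, hence shapr:::):

set.seed(1)
x <- rexp(100)                     # skewed "original scale" data
z <- shapr:::gaussian_transform(x) # rank-based map to a standard normal
z_new <- rnorm(10)                 # new draws on the Gaussian scale
# the first n_z elements are the Gaussian data, the rest carry the original distribution:
x_new <- shapr:::inv_gaussian_transform(c(z_new, x), n_z = 10)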
-------------------------------------------------------------------------------- /man/prediction.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/predictions.R
3 | \name{prediction}
4 | \alias{prediction}
5 | \title{Calculate Shapley weights for test data}
6 | \usage{
7 | prediction(dt, prediction_zero, explainer)
8 | }
9 | \arguments{
10 | \item{dt}{data.table}
11 |
12 | \item{prediction_zero}{Numeric. The value to use for \code{phi_0}.}
13 |
14 | \item{explainer}{An object of class \code{explainer}. See \code{\link{shapr}}.}
15 | }
16 | \value{
17 | An object of class \code{c("shapr", "list")}. For more details see \code{\link{explain}}.
18 | }
19 | \description{
20 | This function should only be called internally, and not be used as
21 | a stand-alone function.
22 | }
23 | \details{
24 | If \code{dt} does not contain three columns called \code{id}, \code{id_combination} and \code{w}
25 | the function will fail. \code{id} represents a unique key for a given test observation,
26 | and \code{id_combination} is a unique key for which feature combination the row represents. \code{w}
27 | represents the Shapley weight of the feature combination given by \code{id_combination}. In addition
28 | to these three columns, \code{dt} should also have columns which match the variables used
29 | when training the model.
30 |
31 | I.e. if you have fitted a linear model using the features \code{x1},
32 | \code{x2} and \code{x3}, and you want to explain 5 test observations using the exact method, i.e.
33 | setting \code{exact = TRUE} in \code{\link{shapr}}, the following properties should be satisfied
34 | \enumerate{
35 |   \item \code{colnames(dt)} equals \code{c("x1", "x2", "x3", "id", "id_combination", "w")}
36 |   \item \code{dt[, max(id)]} equals the number of test observations
37 |   \item \code{dt[, min(id)]} equals 1L.
38 |   \item \code{dt[, max(id_combination)]} equals \code{2^m} where m equals the number of features.
39 |   \item \code{dt[, min(id_combination)]} equals 1L.
40 |   \item \code{dt[, typeof(w)]} equals \code{double}.
41 | }
42 | }
43 | \author{
44 | Nikolai Sellereite
45 | }
46 | \keyword{internal}
-------------------------------------------------------------------------------- /man/observation_impute_cpp.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/RcppExports.R
3 | \name{observation_impute_cpp}
4 | \alias{observation_impute_cpp}
5 | \title{Get imputed data}
6 | \usage{
7 | observation_impute_cpp(index_xtrain, index_s, xtrain, xtest, S)
8 | }
9 | \arguments{
10 | \item{index_xtrain}{Positive integer. Represents a sequence of row indices from \code{xtrain},
11 | i.e. \code{min(index_xtrain) >= 1} and \code{max(index_xtrain) <= nrow(xtrain)}.}
12 |
13 | \item{index_s}{Positive integer. Represents a sequence of row indices from \code{S},
14 | i.e. \code{min(index_s) >= 1} and \code{max(index_s) <= nrow(S)}.}
15 |
16 | \item{xtrain}{Numeric matrix.}
17 |
18 | \item{xtest}{Numeric matrix. Represents a single test observation.}
19 |
20 | \item{S}{Integer matrix of dimension \code{n_combinations x m}, where \code{n_combinations} equals
21 | the total number of sampled/non-sampled feature combinations and \code{m} equals
22 | the total number of unique features. Note that \code{m = ncol(xtrain)}. See details
23 | for more information.}
24 | }
25 | \value{
26 | Numeric matrix
27 | }
28 | \description{
29 | Get imputed data
30 | }
31 | \details{
32 | \code{S(i, j) = 1} if and only if feature \code{j} is present in feature
33 | combination \code{i}, otherwise \code{S(i, j) = 0}. I.e. if \code{m = 3}, there
34 | are \code{2^3 = 8} unique ways to combine the features. In this case \code{dim(S) = c(8, 3)}.
35 | Let's call the features \code{x1, x2, x3} and take a closer look at the combination
36 | represented by \code{s = c(x1, x2)}. If this combination is represented by the second row,
37 | the following is true: \code{S[2, 1:3] = c(1, 1, 0)}.
38 |
39 | The returned object, \code{X}, is a numeric matrix where
40 | \code{dim(X) = c(length(index_xtrain), ncol(xtrain))}. If feature \code{j} is present in
41 | the k-th observation, that is \code{S[index_s[k], j] == 1}, then \code{X[k, j] = xtest[1, j]}.
42 | Otherwise \code{X[k, j] = xtrain[index_xtrain[k], j]}.
43 | }
44 | \author{
45 | Nikolai Sellereite
46 | }
47 | \keyword{internal}
48 |
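To make the S encoding above concrete, an editor's illustration (plain base R, not package code), with the second row representing the combination s = c(x1, x2) as in the text:

m <- 3
features <- list(integer(0), c(1, 2), 1, 2, 3, c(1, 3), c(2, 3), 1:3)
S <- matrix(0L, nrow = length(features), ncol = m)
for (i in seq_along(features)) S[i, features[[i]]] <- 1L
S[2, ] # c(1, 1, 0): features 1 and 2 are taken from xtest, feature 3 from xtrain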
-------------------------------------------------------------------------------- /tests/testthat/test-src_weighted_matrix.R: --------------------------------------------------------------------------------
1 | context("test-src_weighted_matrix.R")
2 |
3 | test_that("Test weight_matrix_cpp", {
4 |
5 |   ## Example -----------
6 |   m <- 3
7 |   n <- 2^m
8 |   subsets <- unlist(
9 |     lapply(
10 |       0:m,
11 |       utils::combn,
12 |       x = m,
13 |       simplify = FALSE
14 |     ),
15 |     recursive = FALSE
16 |   )
17 |   w_all <- shapley_weights(m, choose(m, 0:m), 0:m)
18 |   w_all[!is.finite(w_all)] <- 10^6
19 |   w <- w_all[sapply(subsets, length) + 1]
20 |   x <- weight_matrix_cpp(
21 |     subsets = subsets,
22 |     m = m,
23 |     n = n,
24 |     w = w
25 |   )
26 |
27 |   ## Exact results -----------
28 |   Z <- matrix(0, nrow = n, ncol = m + 1)
29 |   Z[, 1] <- 1
30 |   for (i in seq_along(subsets)) {
31 |     f <- subsets[[i]]
32 |     if (length(f) > 0) {
33 |       Z[i, f + 1] <- 1
34 |     }
35 |   }
36 |   W <- matrix(0, nrow = n, ncol = n)
37 |   diag(W) <- w
38 |   res <- solve(t(Z) %*% W %*% Z) %*% (t(Z) %*% W)
39 |
40 |   ## Test results -----------
41 |   expect_true(is.matrix(x))
42 |   expect_true(is.double(x))
43 |   expect_equal(nrow(x), m + 1)
44 |   expect_equal(ncol(x), n)
45 |   expect_equal(x, res)
46 | })
47 |
48 | test_that("Test feature_matrix_cpp", {
49 |
50 |   ## Example -----------
51 |   features <- list(
52 |     integer(0),
53 |     1:2,
54 |     10,
55 |     4:8,
56 |     3:7
57 |   )
58 |   m <- 10
59 |   x <- feature_matrix_cpp(features, m)
60 |
61 |   ## Test results -----------
62 |   expect_true(is.matrix(x))
63 |   expect_equal(ncol(x), m)
64 |   expect_equal(nrow(x), length(features))
65 |   expect_true(max(x) <= 1)
66 |   expect_true(min(x) >= 0)
67 |   expect_equal(sapply(features, length), rowSums(x))
68 |   for (i in seq_along(features)) {
69 |     feature_i <- features[[i]]
70 |     n_features <- length(feature_i)
71 |     if (n_features == 0) {
72 |       expect_equal(x[i, ], rep(0, m))
73 |     } else {
74 |       expect_equal(x[i, feature_i], rep(1, n_features))
75 |       expect_equal(x[i, -feature_i], rep(0, m - n_features))
76 |     }
77 |   }
78 |   expect_error(feature_matrix_cpp(list(1, 2:3), 3))
79 | })
-------------------------------------------------------------------------------- /DESCRIPTION: --------------------------------------------------------------------------------
1 | Package: shapr
2 | Version: 0.2.0.9002
3 | Title: Prediction Explanation with Dependence-Aware Shapley Values
4 | Description: Complex machine learning models are often hard to interpret. However, in
5 |     many situations it is crucial to understand and explain why a model made a specific
6 |     prediction. Shapley values are the only prediction explanation framework with a solid
7 |     theoretical foundation. Previously known methods for estimating the Shapley
8 |     values do, however, assume feature independence. This package implements the method
9 |     described in Aas, Jullum and Løland (2019), which accounts for any feature
10 |     dependence, and thereby produces more accurate estimates of the true Shapley values.
11 | Authors@R: c( 12 | person("Nikolai", "Sellereite", email = "nikolaisellereite@gmail.com", role = "aut", comment = c(ORCID = "0000-0002-4671-0337")), 13 | person("Martin", "Jullum", email = "Martin.Jullum@nr.no", role = c("cre", "aut"), comment = c(ORCID = "0000-0003-3908-5155")), 14 | person("Annabelle", "Redelmeier", email = "Annabelle.Redelmeier@nr.no", role = "aut"), 15 | person("Anders", "Løland", email = "Anders.Loland@nr.no", role = "ctb"), 16 | person("Jens Christian", "Wahl", email = "Jens.Christian.Wahl@nr.no", role = "ctb"), 17 | person("Camilla", "Lingjærde", role = "ctb"), 18 | person("Norsk Regnesentral", role = c("cph", "fnd")) 19 | ) 20 | URL: https://norskregnesentral.github.io/shapr/, https://github.com/NorskRegnesentral/shapr 21 | BugReports: https://github.com/NorskRegnesentral/shapr/issues 22 | License: MIT + file LICENSE 23 | Encoding: UTF-8 24 | LazyData: true 25 | ByteCompile: true 26 | Language: en-US 27 | RoxygenNote: 7.1.2 28 | Depends: R (>= 3.5.0) 29 | Imports: 30 | stats, 31 | data.table, 32 | Rcpp (>= 0.12.15), 33 | condMVNorm, 34 | mvnfast, 35 | Matrix 36 | Suggests: 37 | ranger, 38 | xgboost, 39 | mgcv, 40 | testthat, 41 | knitr, 42 | rmarkdown, 43 | roxygen2, 44 | MASS, 45 | ggplot2, 46 | caret, 47 | gbm, 48 | party, 49 | partykit 50 | LinkingTo: 51 | RcppArmadillo, 52 | Rcpp 53 | VignetteBuilder: knitr 54 | -------------------------------------------------------------------------------- /tests/testthat/test-src_impute_data.R: -------------------------------------------------------------------------------- 1 | context("test-src_impute_data.R") 2 | 3 | test_that("Test observation_impute_cpp", { 4 | 5 | # Example data ----------- 6 | if (requireNamespace("datasets", quietly = TRUE)) { 7 | data("mtcars", package = "datasets") 8 | rownames(mtcars) <- NULL 9 | mtcars <- as.matrix(mtcars) 10 | 11 | # Example ----------- 12 | m <- 3 13 | n_combinations <- 2^m 14 | mtcars <- mtcars[1:15, seq(m)] 15 | ntrain <- 14 16 | xtrain <- mtcars[seq(ntrain), ] 17 | xtest <- mtcars[-seq(ntrain), , drop = FALSE] 18 | S <- matrix(0L, n_combinations, m) 19 | features <- list( 20 | integer(), 1, 2, 3, c(1, 2), c(1, 3), c(2, 3), c(1, 2, 3) 21 | ) 22 | for (i in seq_along(features)) { 23 | feature_i <- features[[i]] 24 | if (length(feature_i) > 0) { 25 | S[i, features[[i]]] <- 1L 26 | } 27 | } 28 | 29 | # Tests (invalid input) ----------- 30 | expect_error( 31 | observation_impute_cpp( 32 | index_xtrain = c(1, 2), 33 | index_s = c(1, 2, 3), 34 | xtrain = xtrain, 35 | xtest = xtest, 36 | S = S 37 | ) 38 | ) 39 | expect_error( 40 | observation_impute_cpp( 41 | index_xtrain = c(1, 2), 42 | index_s = c(2, 3), 43 | xtrain = xtrain[, 1:2], 44 | xtest = xtest, 45 | S = S 46 | ) 47 | ) 48 | 49 | # Tests (valid input) ----------- 50 | index_xtrain <- c(1, 2) 51 | index_s <- c(4, 5) 52 | x <- observation_impute_cpp( 53 | index_xtrain = index_xtrain, 54 | index_s = index_s, 55 | xtrain = xtrain, 56 | xtest = xtest, 57 | S = S 58 | ) 59 | 60 | expect_equal(nrow(x), length(index_s)) 61 | expect_equal(ncol(x), ncol(xtrain)) 62 | expect_true(is.matrix(x)) 63 | expect_true(is.double(x)) 64 | 65 | for (i in 1:nrow(x)) { 66 | feature_i <- features[[index_s[i]]] 67 | 68 | for (j in seq(m)) { 69 | if (j %in% feature_i) { 70 | expect_equal(x[i, j], unname(xtest[1, j])) 71 | } else { 72 | expect_equal(x[i, j], unname(xtrain[index_xtrain[i], j])) 73 | } 74 | } 75 | } 76 | } 77 | }) 78 | -------------------------------------------------------------------------------- /man/prepare_data.Rd: 
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/observations.R
3 | \name{prepare_data}
4 | \alias{prepare_data}
5 | \alias{prepare_data.independence}
6 | \alias{prepare_data.empirical}
7 | \alias{prepare_data.gaussian}
8 | \alias{prepare_data.copula}
9 | \alias{prepare_data.ctree}
10 | \title{Generate data used for predictions}
11 | \usage{
12 | prepare_data(x, ...)
13 |
14 | \method{prepare_data}{independence}(x, index_features = NULL, ...)
15 |
16 | \method{prepare_data}{empirical}(x, index_features = NULL, ...)
17 |
18 | \method{prepare_data}{gaussian}(x, index_features = NULL, ...)
19 |
20 | \method{prepare_data}{copula}(x, index_features = NULL, ...)
21 |
22 | \method{prepare_data}{ctree}(
23 |   x,
24 |   index_features = NULL,
25 |   mc_cores = 1,
26 |   mc_cores_create_ctree = mc_cores,
27 |   mc_cores_sample_ctree = mc_cores,
28 |   ...
29 | )
30 | }
31 | \arguments{
32 | \item{x}{Explainer object. See \code{\link{explain}} for more information.}
33 |
34 | \item{...}{Currently not used.}
35 |
36 | \item{index_features}{List. Default is NULL, but if either various methods are being used or various mincriterion are
37 | used for different numbers of conditioned features, this will be a list with the features to pass.}
38 |
39 | \item{mc_cores}{Integer. Only for class \code{ctree} currently. The number of cores to use in parallelization of the
40 | tree building (\code{create_ctree}) and tree sampling (\code{sample_ctree}). Defaults to 1. Note: Uses
41 | parallel::mclapply which relies on forking, i.e. uses only 1 core on Windows systems.}
42 |
43 | \item{mc_cores_create_ctree}{Integer. Same as \code{mc_cores}, but specific for the tree building function.
44 | Defaults to \code{mc_cores}.}
45 |
46 | \item{mc_cores_sample_ctree}{Integer. Same as \code{mc_cores}, but specific for the tree sampling function.
47 | Defaults to \code{mc_cores}.}
48 |
49 | \item{seed}{Positive integer. If \code{NULL} the seed will be inherited from the calling environment.}
50 | }
51 | \value{
52 | A `data.table` containing simulated data passed to \code{\link{prediction}}.
53 | }
54 | \description{
55 | Generate data used for predictions
56 | }
57 | \keyword{internal}
-------------------------------------------------------------------------------- /man/plot.shapr.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/plot.R
3 | \name{plot.shapr}
4 | \alias{plot.shapr}
5 | \title{Plot of the Shapley value explanations}
6 | \usage{
7 | \method{plot}{shapr}(
8 |   x,
9 |   digits = 3,
10 |   plot_phi0 = TRUE,
11 |   index_x_test = NULL,
12 |   top_k_features = NULL,
13 |   ...
14 | )
15 | }
16 | \arguments{
17 | \item{x}{A \code{shapr} object. See \code{\link{explain}}.}
18 |
19 | \item{digits}{Integer. Number of significant digits to use in the feature description}
20 |
21 | \item{plot_phi0}{Logical. Whether to include \code{phi0} in the plot}
22 |
23 | \item{index_x_test}{Integer vector. Which of the test observations to plot. E.g. if you have
24 | explained 10 observations using \code{\link{explain}}, you can generate a plot for the first 5
25 | observations by setting \code{index_x_test = 1:5}.}
26 |
27 | \item{top_k_features}{Integer. How many features to include in the plot.
E.g. if you have 15
28 | features in your model you can plot the 5 most important features, for each explanation, by setting
29 | \code{top_k_features = 5}.}
30 |
31 | \item{...}{Currently not used.}
32 | }
33 | \value{
34 | ggplot object with plots of the Shapley value explanations
35 | }
36 | \description{
37 | Plots the individual prediction explanations.
38 | }
39 | \details{
40 | See \code{vignette("understanding_shapr", package = "shapr")} for an example of
41 | how you should use the function.
42 | }
43 | \examples{
44 | if (requireNamespace("MASS", quietly = TRUE)) {
45 |   # Load example data
46 |   data("Boston", package = "MASS")
47 |
48 |   # Split data into test- and training data
49 |   x_train <- head(Boston, -3)
50 |   x_test <- tail(Boston, 3)
51 |
52 |   # Fit a linear model
53 |   model <- lm(medv ~ lstat + rm + dis + indus, data = x_train)
54 |
55 |   # Create an explainer object
56 |   explainer <- shapr(x_train, model)
57 |
58 |   # Explain predictions
59 |   p <- mean(x_train$medv)
60 |
61 |   # Empirical approach
62 |   explanation <- explain(x_test,
63 |     explainer,
64 |     approach = "empirical",
65 |     prediction_zero = p,
66 |     n_samples = 1e2
67 |   )
68 |
69 |   if (requireNamespace("ggplot2", quietly = TRUE)) {
70 |     # Plot the explanation (this function)
71 |     plot(explanation)
72 |   }
73 | }
74 | }
75 | \author{
76 | Martin Jullum
77 | }
-------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: --------------------------------------------------------------------------------
1 | # Nov 11th, 2021: THIS IS A MODIFIED VERSION OF THE check-full VERSION AS DESCRIBED BELOW
2 | # IT ADDS cranversion TO THE LIST OF BRANCHES TO RUN ON AND REDUCES THE NUMBER OF CONFIGS, OTHERWISE IDENTICAL
3 |
4 | # Workflow derived from https://github.com/r-lib/actions/tree/master/examples
5 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
6 | #
7 | # NOTE: This workflow is overkill for most R packages and
8 | # check-standard.yaml is likely a better choice.
9 | # usethis::use_github_action("check-standard") will install it.
10 | 11 | 12 | on: 13 | push: 14 | branches: [main, master, cranversion] 15 | pull_request: 16 | branches: [main, master, cranversion] 17 | 18 | name: R-CMD-check 19 | 20 | jobs: 21 | R-CMD-check: 22 | runs-on: ${{ matrix.config.os }} 23 | 24 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 25 | 26 | strategy: 27 | fail-fast: false 28 | matrix: 29 | config: 30 | - {os: macOS-latest, r: 'release'} 31 | 32 | - {os: windows-latest, r: 'release'} 33 | 34 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 35 | - {os: ubuntu-latest, r: 'release'} 36 | - {os: ubuntu-latest, r: 'oldrel-1'} 37 | - {os: ubuntu-latest, r: 'oldrel-2'} 38 | # - {os: ubuntu-latest, r: 'oldrel-3'} 39 | 40 | env: 41 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 42 | R_KEEP_PKG_SOURCE: yes 43 | 44 | steps: 45 | - uses: actions/checkout@v2 46 | 47 | - uses: r-lib/actions/setup-pandoc@v1 48 | 49 | - uses: r-lib/actions/setup-r@v1 50 | with: 51 | r-version: ${{ matrix.config.r }} 52 | http-user-agent: ${{ matrix.config.http-user-agent }} 53 | use-public-rspm: true 54 | 55 | - uses: r-lib/actions/setup-r-dependencies@v1 56 | with: 57 | extra-packages: rcmdcheck 58 | 59 | - uses: r-lib/actions/check-r-package@v1 60 | 61 | - name: Show testthat output 62 | if: always() 63 | run: find check -name 'testthat.Rout*' -exec cat '{}' \; || true 64 | shell: bash 65 | 66 | - name: Upload check results 67 | if: failure() 68 | uses: actions/upload-artifact@main 69 | with: 70 | name: ${{ runner.os }}-r${{ matrix.config.r }}-results 71 | path: check 72 | -------------------------------------------------------------------------------- /man/feature_combinations.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/features.R 3 | \name{feature_combinations} 4 | \alias{feature_combinations} 5 | \title{Define feature combinations, and fetch additional information about each unique combination} 6 | \usage{ 7 | feature_combinations( 8 | m, 9 | exact = TRUE, 10 | n_combinations = 200, 11 | weight_zero_m = 10^6, 12 | group_num = NULL 13 | ) 14 | } 15 | \arguments{ 16 | \item{m}{Positive integer. Total number of features.} 17 | 18 | \item{exact}{Logical. If \code{TRUE} all \code{2^m} combinations are generated, otherwise a 19 | subsample of the combinations is used.} 20 | 21 | \item{n_combinations}{Positive integer. Note that if \code{exact = TRUE}, 22 | \code{n_combinations} is ignored. However, if \code{m > 12} you'll need to add a positive integer 23 | value for \code{n_combinations}.} 24 | 25 | \item{weight_zero_m}{Numeric. The value to use as a replacement for infinite combination 26 | weights when doing numerical operations.} 27 | 28 | \item{group_num}{List. Contains vector of integers indicating the feature numbers for the 29 | different groups.} 30 | } 31 | \value{ 32 | A data.table that contains the following columns: 33 | \describe{ 34 | \item{id_combination}{Positive integer. Represents a unique key for each combination. Note that the table 35 | is sorted by \code{id_combination}, so that is always equal to \code{x[["id_combination"]] = 1:nrow(x)}.} 36 | \item{features}{List. Each item of the list is an integer vector where \code{features[[i]]} 37 | represents the indices of the features included in combination \code{i}. Note that all the items 38 | are sorted such that \code{features[[i]] == sort(features[[i]])} is always true.} 39 | \item{n_features}{Vector of positive integers. 
\code{n_features[i]} equals the number of features in combination
40 | \code{i}, i.e. \code{n_features[i] = length(features[[i]])}.}
41 | \item{N}{Positive integer. The number of unique ways to sample \code{n_features[i]} features
42 | from \code{m} different features, without replacement.}
43 | }
44 | \description{
45 | Define feature combinations, and fetch additional information about each unique combination
46 | }
47 | \examples{
48 | # All combinations
49 | x <- feature_combinations(m = 3)
50 | nrow(x) # Equals 2^3 = 8
51 |
52 | # Subsample of combinations
53 | x <- feature_combinations(exact = FALSE, m = 10, n_combinations = 1e2)
54 | }
55 | \author{
56 | Nikolai Sellereite, Martin Jullum
57 | }
58 |
-------------------------------------------------------------------------------- /man/sample_ctree.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/sampling.R
3 | \name{sample_ctree}
4 | \alias{sample_ctree}
5 | \title{Sample ctree variables from a given conditional inference tree}
6 | \usage{
7 | sample_ctree(tree, n_samples, x_test, x_train, p, sample)
8 | }
9 | \arguments{
10 | \item{tree}{List. Contains tree which is an object of type ctree built from the party package.
11 | Also contains given_ind, the features to condition upon.}
12 |
13 | \item{n_samples}{Numeric. Indicates how many samples to use for MCMC.}
14 |
15 | \item{x_test}{Matrix, data.frame or data.table with the features of the observation whose
16 | predictions ought to be explained (test data). Dimension \code{1xp} or \code{px1}.}
17 |
18 | \item{x_train}{Matrix, data.frame or data.table with training data.}
19 |
20 | \item{p}{Positive integer. The number of features.}
21 |
22 | \item{sample}{Boolean.
\code{TRUE} indicates that the method samples from the terminal node
23 | of the tree, whereas \code{FALSE} indicates that the method uses all the observations in
24 | the node if there are fewer than \code{n_samples} of them.}
25 | }
26 | \value{
27 | data.table with \code{n_samples} (conditional) samples
28 | }
29 | \description{
30 | Sample ctree variables from a given conditional inference tree
31 | }
32 | \examples{
33 | if (requireNamespace("MASS", quietly = TRUE) & requireNamespace("party", quietly = TRUE)) {
34 |   m <- 10
35 |   n <- 40
36 |   n_samples <- 50
37 |   mu <- rep(1, m)
38 |   cov_mat <- cov(matrix(rnorm(n * m), n, m))
39 |   x_train <- data.table::data.table(MASS::mvrnorm(n, mu, cov_mat))
40 |   x_test <- MASS::mvrnorm(1, mu, cov_mat)
41 |   x_test_dt <- data.table::setDT(as.list(x_test))
42 |   given_ind <- c(4, 7)
43 |   dependent_ind <- (1:dim(x_train)[2])[-given_ind]
44 |   x <- x_train[, given_ind, with = FALSE]
45 |   y <- x_train[, dependent_ind, with = FALSE]
46 |   df <- data.table::data.table(cbind(y, x))
47 |   colnames(df) <- c(paste0("Y", 1:ncol(y)), paste0("V", given_ind))
48 |   ynam <- paste0("Y", 1:ncol(y))
49 |   fmla <- as.formula(paste(paste(ynam, collapse = "+"), "~ ."))
50 |   datact <- party::ctree(fmla, data = df, controls = party::ctree_control(
51 |     minbucket = 7,
52 |     mincriterion = 0.95
53 |   ))
54 |   tree <- list(tree = datact, given_ind = given_ind, dependent_ind = dependent_ind)
55 |   shapr:::sample_ctree(
56 |     tree = tree, n_samples = n_samples, x_test = x_test_dt, x_train = x_train,
57 |     p = length(x_test), sample = TRUE
58 |   )
59 | }
60 | }
61 | \author{
62 | Annabelle Redelmeier
63 | }
64 | \keyword{internal}
65 |
-------------------------------------------------------------------------------- /cran-comments.md: --------------------------------------------------------------------------------
1 |
2 | # Minor release, shapr 0.2.0
3 |
4 | * Adds the ctree approach from a new paper
5 | * Simplified support for custom models
6 | * Adds a comprehensive check suite for feature consistency
7 |
8 | ## Test environments
9 |
10 | * GitHub Actions (windows-latest): R 4.0
11 | * GitHub Actions (ubuntu-16.04): R 4.0, 3.6, 3.5
12 | * GitHub Actions (macOS-latest): R-release, 4.0
13 | * win-builder (x86_64-w64-mingw32): R 4.0, 3.6, R-devel
14 | * local Ubuntu 18.04: R 3.6
15 | * local Windows 10: R 4.0
16 | * R-hub (windows-x86_64-devel): R-devel
17 | * R-hub (macos-highsierra-release-cran): R-release
18 |
19 | * local Ubuntu 18.04: R 3.6 (without packages in Suggests):
20 | ```devtools::check(vignettes = FALSE, env_vars=c(`_R_CHECK_DEPENDS_ONLY_` = "true"))```
21 |
22 | ## R CMD check results
23 |
24 | There were no ERRORs or WARNINGs.
25 |
26 | There were 2 NOTEs
27 |
28 | *NOTE 1 (on local Windows 10: R 4.0):
29 |
30 | Note: information on .o files for i386 is not available
31 | Note: information on .o files for x64 is not available
32 | File 'C:/Users/jullum/Dropbox/Local_work/Git/shapr.Rcheck/shapr/libs/i386/shapr.dll':
33 |   Found '_exit', possibly from '_exit' (C)
34 |   Found 'abort', possibly from 'abort' (C), 'runtime' (Fortran)
35 |   Found 'exit', possibly from 'exit' (C), 'stop' (Fortran)
36 |   Found 'printf', possibly from 'printf' (C)
37 | File 'C:/Users/jullum/Dropbox/Local_work/Git/shapr.Rcheck/shapr/libs/x64/shapr.dll':
38 |   Found '_exit', possibly from '_exit' (C)
39 |   Found 'abort', possibly from 'abort' (C), 'runtime' (Fortran)
40 |   Found 'exit', possibly from 'exit' (C), 'stop' (Fortran)
41 |   Found 'printf', possibly from 'printf' (C)
42 |
43 | Compiled code should not call entry points which might terminate R nor
44 | write to stdout/stderr instead of to the console, nor use Fortran I/O
45 | nor system RNGs. The detected symbols are linked into the code but
46 | might come from libraries and not actually be called.
47 |
48 | See 'Writing portable packages' in the 'Writing R Extensions' manual.
49 |
50 | > I believe this is a false positive, ref https://stackoverflow.com/questions/64402688/information-on-o-files-for-x64-is-not-available-note-on-r-package-checks-using
51 |
52 | *NOTE 2 (on all winbuilder + R-hub servers)
53 |
54 | Days since last update: 6
55 |
56 | > The previous release was a basic patch after the package was taken off CRAN. This is a proper release with new features.
57 |
58 | ## Downstream dependencies
59 | There are currently no downstream dependencies for this package.
-------------------------------------------------------------------------------- /man/create_ctree.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/sampling.R
3 | \name{create_ctree}
4 | \alias{create_ctree}
5 | \title{Make all conditional inference trees}
6 | \usage{
7 | create_ctree(
8 |   given_ind,
9 |   x_train,
10 |   mincriterion,
11 |   minsplit,
12 |   minbucket,
13 |   use_partykit = "on_error"
14 | )
15 | }
16 | \arguments{
17 | \item{given_ind}{Numeric value. Indicates which features are conditioned on.}
18 |
19 | \item{x_train}{Matrix, data.frame or data.table with the training data.}
20 |
21 | \item{mincriterion}{Numeric value or vector equal to 1 - alpha where alpha is the nominal level of the conditional
22 | independence tests.
23 | Can also be a vector equal to the length of the number of features indicating which mincriterion to use
24 | when conditioning on various numbers of features.}
25 |
26 | \item{minsplit}{Numeric value. Equal to the value that the sum of the left and right daughter nodes need to exceed.}
27 |
28 | \item{minbucket}{Numeric value. Equal to the minimum sum of weights in a terminal node.}
29 |
30 | \item{use_partykit}{String. In some semi-rare cases \code{party::ctree} runs into an error related to the LINPACK
31 | used by R. To get around this problem, one may fall back to using the newer (but slower) \code{partykit::ctree}
32 | function, which is a reimplementation of the same method. Setting this parameter to \code{"on_error"} (default)
33 | falls back to \code{partykit::ctree} if \code{party::ctree} fails. Other options are \code{"never"}, which always
34 | uses \code{party::ctree}, and \code{"always"}, which always uses \code{partykit::ctree}.
A warning message is 35 | created whenever \code{partykit::ctree} is used.} 36 | } 37 | \value{ 38 | List with conditional inference tree and the variables conditioned/not conditioned on. 39 | } 40 | \description{ 41 | Make all conditional inference trees 42 | } 43 | \examples{ 44 | if (requireNamespace("MASS", quietly = TRUE) & requireNamespace("party", quietly = TRUE)) { 45 | m <- 10 46 | n <- 40 47 | n_samples <- 50 48 | mu <- rep(1, m) 49 | cov_mat <- cov(matrix(rnorm(n * m), n, m)) 50 | x_train <- data.table::data.table(MASS::mvrnorm(n, mu, cov_mat)) 51 | given_ind <- c(4, 7) 52 | mincriterion <- 0.95 53 | minsplit <- 20 54 | minbucket <- 7 55 | sample <- TRUE 56 | create_ctree( 57 | given_ind = given_ind, x_train = x_train, 58 | mincriterion = mincriterion, minsplit = minsplit, 59 | minbucket = minbucket, use_partykit = "on_error" 60 | ) 61 | } 62 | } 63 | \author{ 64 | Annabelle Redelmeier, Martin Jullum 65 | } 66 | \keyword{internal} 67 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(explain,combined) 4 | S3method(explain,copula) 5 | S3method(explain,ctree) 6 | S3method(explain,ctree_comb_mincrit) 7 | S3method(explain,empirical) 8 | S3method(explain,gaussian) 9 | S3method(explain,independence) 10 | S3method(get_model_specs,gam) 11 | S3method(get_model_specs,glm) 12 | S3method(get_model_specs,lm) 13 | S3method(get_model_specs,ranger) 14 | S3method(get_model_specs,xgb.Booster) 15 | S3method(model_checker,default) 16 | S3method(model_checker,gam) 17 | S3method(model_checker,glm) 18 | S3method(model_checker,lm) 19 | S3method(model_checker,ranger) 20 | S3method(model_checker,xgb.Booster) 21 | S3method(plot,shapr) 22 | S3method(predict_model,default) 23 | S3method(predict_model,gam) 24 | S3method(predict_model,glm) 25 | S3method(predict_model,lm) 26 | S3method(predict_model,ranger) 27 | S3method(predict_model,xgb.Booster) 28 | S3method(prepare_data,copula) 29 | S3method(prepare_data,ctree) 30 | S3method(prepare_data,empirical) 31 | S3method(prepare_data,gaussian) 32 | S3method(prepare_data,independence) 33 | S3method(print,shapr) 34 | export(aicc_full_single_cpp) 35 | export(check_features) 36 | export(compute_shapley) 37 | export(correction_matrix_cpp) 38 | export(create_ctree) 39 | export(explain) 40 | export(feature_combinations) 41 | export(feature_matrix_cpp) 42 | export(get_data_specs) 43 | export(get_model_specs) 44 | export(hat_matrix_cpp) 45 | export(mahalanobis_distance_cpp) 46 | export(make_dummies) 47 | export(model_checker) 48 | export(observation_impute_cpp) 49 | export(predict_model) 50 | export(prepare_and_predict) 51 | export(prepare_data) 52 | export(preprocess_data) 53 | export(rss_cpp) 54 | export(shapr) 55 | export(update_data) 56 | export(weight_matrix_cpp) 57 | importFrom(Rcpp,sourceCpp) 58 | importFrom(data.table,":=") 59 | importFrom(data.table,as.data.table) 60 | importFrom(data.table,between) 61 | importFrom(data.table,copy) 62 | importFrom(data.table,data.table) 63 | importFrom(data.table,fread) 64 | importFrom(data.table,fwrite) 65 | importFrom(data.table,is.data.table) 66 | importFrom(data.table,month) 67 | importFrom(data.table,rbindlist) 68 | importFrom(data.table,setcolorder) 69 | importFrom(data.table,setkey) 70 | importFrom(data.table,setnames) 71 | importFrom(data.table,uniqueN) 72 | importFrom(data.table,year) 73 | importFrom(graphics,hist) 74 | 
importFrom(graphics,plot) 75 | importFrom(graphics,rect) 76 | importFrom(stats,as.formula) 77 | importFrom(stats,contrasts) 78 | importFrom(stats,model.frame) 79 | importFrom(stats,model.matrix) 80 | importFrom(stats,predict) 81 | importFrom(stats,setNames) 82 | importFrom(utils,head) 83 | importFrom(utils,methods) 84 | importFrom(utils,tail) 85 | useDynLib(shapr, .registration = TRUE) 86 | -------------------------------------------------------------------------------- /man/predict_model.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/models.R 3 | \name{predict_model} 4 | \alias{predict_model} 5 | \alias{predict_model.default} 6 | \alias{predict_model.lm} 7 | \alias{predict_model.glm} 8 | \alias{predict_model.ranger} 9 | \alias{predict_model.xgb.Booster} 10 | \alias{predict_model.gam} 11 | \title{Generate predictions for different model classes} 12 | \usage{ 13 | predict_model(x, newdata) 14 | 15 | \method{predict_model}{default}(x, newdata) 16 | 17 | \method{predict_model}{lm}(x, newdata) 18 | 19 | \method{predict_model}{glm}(x, newdata) 20 | 21 | \method{predict_model}{ranger}(x, newdata) 22 | 23 | \method{predict_model}{xgb.Booster}(x, newdata) 24 | 25 | \method{predict_model}{gam}(x, newdata) 26 | } 27 | \arguments{ 28 | \item{x}{Model object for the model to be explained.} 29 | 30 | \item{newdata}{A data frame (or matrix) in which to look for variables with which to predict.} 31 | } 32 | \value{ 33 | Numeric 34 | } 35 | \description{ 36 | Performs prediction of the response for models fitted with \code{\link[stats]{lm}}, \code{\link[stats]{glm}}, 37 | \code{\link[ranger]{ranger}}, \code{\link[mgcv:gam]{mgcv::gam}} and 38 | \code{\link[xgboost:xgb.train]{xgboost::xgb.train}}, with binary or continuous 39 | response. See details for more information. 40 | } 41 | \details{ 42 | The following models are currently supported: 43 | \itemize{ 44 | \item \code{\link[stats:lm]{stats::lm}} 45 | \item \code{\link[stats:glm]{stats::glm}} 46 | \item \code{\link[ranger:ranger]{ranger::ranger}} 47 | \item \code{\link[mgcv:gam]{mgcv::gam}} 48 | \item \code{\link[xgboost:xgb.train]{xgboost::xgb.train}} 49 | } 50 | 51 | The returned object \code{p} always satisfies the following properties: 52 | \itemize{ 53 | \item \code{is.atomic(p)} equals \code{TRUE} 54 | \item \code{is.double(p)} equals \code{TRUE} 55 | } 56 | 57 | If you have a binary classification model, the probability prediction for a single class 58 | is always returned. 59 | 60 | For more details on how to explain other types of models (i.e.
custom models), see the Advanced usage section 61 | of the vignette: \cr 62 | From R: \code{vignette("understanding_shapr", package = "shapr")} \cr 63 | Web: \url{https://norskregnesentral.github.io/shapr/articles/understanding_shapr.html#explain-custom-models} 64 | } 65 | \examples{ 66 | if (requireNamespace("MASS", quietly = TRUE)) { 67 | # Load example data 68 | data("Boston", package = "MASS") 69 | # Split data into test- and training data 70 | x_train <- head(Boston, -3) 71 | x_test <- tail(Boston, 3) 72 | # Fit a linear model 73 | model <- lm(medv ~ lstat + rm + dis + indus, data = x_train) 74 | 75 | # Predicting for a model with a standardized format 76 | predict_model(x = model, newdata = x_test) 77 | } 78 | } 79 | \author{ 80 | Martin Jullum 81 | } 82 | \keyword{internal} 83 | -------------------------------------------------------------------------------- /inst/scripts/example_custom_model.R: -------------------------------------------------------------------------------- 1 | library(gbm) 2 | library(shapr) 3 | 4 | # Load data 5 | data("Boston", package = "MASS") 6 | 7 | # Create test- and training data 8 | x_var <- c("lstat", "rm", "dis", "indus") 9 | y_var <- "medv" 10 | 11 | x_train <- as.matrix(Boston[-1:-6, x_var]) 12 | y_train <- Boston[-1:-6, y_var] 13 | x_test <- as.matrix(Boston[1:6, x_var]) 14 | 15 | form = as.formula(paste0(y_var,"~",paste0(x_var,collapse="+"))) 16 | 17 | 18 | 19 | xy_train <- data.frame(x_train,medv = y_train) 20 | 21 | 22 | # Fitting a gbm model 23 | set.seed(825) 24 | model <- gbm::gbm( 25 | form, 26 | data = xy_train, 27 | distribution = "gaussian" 28 | ) 29 | 30 | #### Full versions of the two custom model functions #### 31 | 32 | predict_model.gbm <- function(x, newdata) { 33 | 34 | if (!requireNamespace('gbm', quietly = TRUE)) { 35 | stop('The gbm package is required for predicting with gbm models') 36 | } 37 | 38 | model_type <- ifelse( 39 | x$distribution$name %in% c("bernoulli","adaboost"), 40 | "classification", 41 | "regression" 42 | ) 43 | if (model_type == "classification") { 44 | 45 | predict(x, as.data.frame(newdata), type = "response",n.trees = x$n.trees) 46 | } else { 47 | 48 | predict(x, as.data.frame(newdata),n.trees = x$n.trees) 49 | } 50 | } 51 | 52 | get_model_specs.gbm <- function(x){ 53 | feature_list = list() 54 | feature_list$labels <- labels(x$Terms) 55 | m <- length(feature_list$labels) 56 | 57 | feature_list$classes <- attr(x$Terms,"dataClasses")[-1] 58 | feature_list$factor_levels <- setNames(vector("list", m), feature_list$labels) 59 | feature_list$factor_levels[feature_list$classes=="factor"] <- NA # the model object doesn't contain factor levels info 60 | 61 | return(feature_list) 62 | } 63 | 64 | # Prepare the data for explanation 65 | set.seed(123) 66 | explainer <- shapr(xy_train, model) 67 | p0 <- mean(xy_train[,y_var]) 68 | explanation <- explain(x_test, explainer, approach = "empirical", prediction_zero = p0) 69 | # Plot results 70 | plot(explanation) 71 | 72 | 73 | # Minimal version of the custom model functions 74 | # Note: Working only for this exact version of the model class 75 | # Not defining get_model_specs skips all feature 76 | # consistency checking between your data and model 77 | 78 | # Removing the previously defined functions to simulate a fresh start 79 | rm(predict_model.gbm) 80 | rm(get_model_specs.gbm) 81 | 82 | 83 | predict_model.gbm <- function(x, newdata) { 84 | predict(x, as.data.frame(newdata),n.trees = x$n.trees) 85 | } 86 | 87 | 88 | # Prepare the data
for explanation 89 | set.seed(123) 90 | explainer <- shapr(x_train, model) 91 | p0 <- mean(xy_train[,y_var]) 92 | explanation <- explain(x_test, explainer, approach = "empirical", prediction_zero = p0) 93 | # Plot results 94 | plot(explanation) 95 | -------------------------------------------------------------------------------- /src/impute_data.cpp: -------------------------------------------------------------------------------- 1 | #include <Rcpp.h> 2 | using namespace Rcpp; 3 | 4 | //' Get imputed data 5 | //' 6 | //' @param index_xtrain Positive integer vector. Represents a sequence of row indices from \code{xtrain}, 7 | //' i.e. \code{min(index_xtrain) >= 1} and \code{max(index_xtrain) <= nrow(xtrain)}. 8 | //' 9 | //' @param index_s Positive integer vector. Represents a sequence of row indices from \code{S}, 10 | //' i.e. \code{min(index_s) >= 1} and \code{max(index_s) <= nrow(S)}. 11 | //' 12 | //' @param xtrain Numeric matrix. 13 | //' 14 | //' @param xtest Numeric matrix. Represents a single test observation. 15 | //' 16 | //' @param S Integer matrix of dimension \code{n_combinations x m}, where \code{n_combinations} equals 17 | //' the total number of sampled/non-sampled feature combinations and \code{m} equals 18 | //' the total number of unique features. Note that \code{m = ncol(xtrain)}. See details 19 | //' for more information. 20 | //' 21 | //' @details \code{S(i, j) = 1} if and only if feature \code{j} is present in feature 22 | //' combination \code{i}, otherwise \code{S(i, j) = 0}. I.e. if \code{m = 3}, there 23 | //' are \code{2^3 = 8} unique ways to combine the features. In this case \code{dim(S) = c(8, 3)}. 24 | //' Let's call the features \code{x1, x2, x3} and take a closer look at the combination 25 | //' represented by \code{s = c(x1, x2)}. If this combination is represented by the second row, 26 | //' the following is true: \code{S[2, 1:3] = c(1, 1, 0)}. 27 | //' 28 | //' The returned object, \code{X}, is a numeric matrix where 29 | //' \code{dim(X) = c(length(index_xtrain), ncol(xtrain))}. If feature \code{j} is present in 30 | //' the k-th observation, that is \code{S[index_s[k], j] == 1}, \code{X[k, j] = xtest[1, j]}. 31 | //' Otherwise \code{X[k, j] = xtrain[index_xtrain[k], j]}.
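//'
//' (Hedged illustration with hypothetical index values, added for clarity.) With
//' \code{m = 2} and \code{S = matrix(c(0, 0, 1, 0, 0, 1, 1, 1), ncol = 2, byrow = TRUE)},
//' calling the function with \code{index_s = c(2, 3)} and \code{index_xtrain = c(5, 9)}
//' returns a 2 x 2 matrix: row 1 keeps \code{xtest[1, 1]} (feature 1 is present in
//' combination 2) and imputes \code{xtrain[5, 2]}, while row 2 keeps \code{xtest[1, 2]}
//' and imputes \code{xtrain[9, 1]}.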
32 | //' 33 | //' @export 34 | //' @keywords internal 35 | //' 36 | //' @return Numeric matrix 37 | //' 38 | //' @author Nikolai Sellereite 39 | // [[Rcpp::export]] 40 | NumericMatrix observation_impute_cpp(IntegerVector index_xtrain, 41 | IntegerVector index_s, 42 | NumericMatrix xtrain, 43 | NumericMatrix xtest, 44 | IntegerMatrix S) { 45 | 46 | 47 | // Error-checks 48 | if (index_xtrain.length() != index_s.length()) 49 | Rcpp::stop("The length of index_xtrain and index_s should be equal."); 50 | 51 | if (xtrain.ncol() != xtest.ncol()) 52 | Rcpp::stop("Number of columns in xtrain and xtest should be equal."); 53 | 54 | NumericMatrix X(index_xtrain.length(), xtrain.ncol()); 55 | 56 | for (int i = 0; i < X.nrow(); ++i) { 57 | 58 | for (int j = 0; j < X.ncol(); ++j) { 59 | 60 | if (S(index_s[i] - 1, j) > 0) { 61 | X(i, j) = xtest(0, j); 62 | } else { 63 | X(i, j) = xtrain(index_xtrain[i] - 1, j); 64 | } 65 | 66 | } 67 | } 68 | 69 | return X; 70 | } 71 | -------------------------------------------------------------------------------- /inst/scripts/example_ctree_method.R: -------------------------------------------------------------------------------- 1 | library(shapr) 2 | 3 | data("Boston", package = "MASS") 4 | 5 | x_var <- c("lstat", "rm", "dis", "indus") 6 | y_var <- "medv" 7 | 8 | #### 1) Example with just continuous features #### 9 | 10 | x_train <- as.matrix(tail(Boston[, x_var], -6)) 11 | y_train <- tail(Boston[, y_var], -6) 12 | x_test <- as.matrix(head(Boston[, x_var], 6)) 13 | 14 | # Just looking at the dependence between the features 15 | cor(x_train) 16 | 17 | # Fitting a basic xgboost model to the training data 18 | model <- xgboost::xgboost( 19 | data = x_train, 20 | label = y_train, 21 | nround = 20, 22 | verbose = FALSE 23 | ) 24 | 25 | # Prepare the data for explanation 26 | explainer <- shapr(x_train, model) 27 | 28 | # Specifying the phi_0, i.e. the expected prediction without any features 29 | p0 <- mean(y_train) 30 | 31 | # Computing the actual Shapley values with kernelSHAP accounting for feature dependence using 32 | # the ctree approach with default mincriterion = 0.95, minsplit = 20, minbucket = 7, 33 | # and sample = TRUE 34 | explanation <- explain(x_test, explainer, 35 | approach = "ctree", 36 | prediction_zero = p0) 37 | 38 | # Printing the Shapley values for the test data 39 | explanation$dt 40 | 41 | # Finally we plot the resulting explanations 42 | plot(explanation) 43 | 44 | 45 | #### 2) Example with mixed continuous and categorical features #### 46 | library(shapr) 47 | 48 | data("Boston", package = "MASS") 49 | 50 | x_var <- c("lstat", "rm", "dis", "indus") 51 | y_var <- "medv" 52 | 53 | x_train <- as.matrix(tail(Boston[, x_var], -6)) 54 | y_train <- tail(Boston[, y_var], -6) 55 | x_test <- as.matrix(head(Boston[, x_var], 6)) 56 | 57 | x_train_cat <- as.data.frame(x_train) 58 | x_test_cat <- as.data.frame(x_test) 59 | 60 | # convert to factors for illustration purposes 61 | x_train_cat$rm <- factor(round(x_train_cat$rm)) 62 | x_test_cat$rm <- factor(round(x_test_cat$rm), levels = c(8, 9, 7, 4, 5, 6)) 63 | 64 | # Make sure they have the same levels!
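# (Hedged sketch, added for illustration and not run: had the levels differed,
# one way to align the test factor with the training levels is
# x_test_cat$rm <- factor(as.character(x_test_cat$rm),
#                         levels = levels(x_train_cat$rm))
# which leaves the data values unchanged but enforces the training level set.)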
65 | print(levels(x_train_cat$rm)) 66 | print(levels(x_test_cat$rm)) 67 | 68 | # -- special function when using categorical data + xgboost 69 | dummylist <- make_dummies(traindata = x_train_cat, testdata = x_test_cat) 70 | 71 | x_train_dummy <- dummylist$train_dummies 72 | x_test_dummy <- dummylist$test_dummies 73 | 74 | # Fitting a basic xgboost model to the training data 75 | model_cat <- xgboost::xgboost( 76 | data = x_train_dummy, 77 | label = y_train, 78 | nround = 20, 79 | verbose = FALSE 80 | ) 81 | model_cat$feature_list <- dummylist$feature_list 82 | 83 | explainer_cat <- shapr(dummylist$traindata_new, model_cat) 84 | 85 | # Specifying the phi_0, i.e. the expected prediction without any features 86 | p0 <- mean(y_train) 87 | 88 | # dummylist$testdata_new$rm 89 | 90 | explanation_cat <- explain( 91 | dummylist$testdata_new, 92 | approach = "ctree", 93 | explainer = explainer_cat, 94 | prediction_zero = p0 95 | ) 96 | 97 | # Plot the resulting explanations for all 98 | # six test observations 99 | plot(explanation_cat) 100 | -------------------------------------------------------------------------------- /src/distance.cpp: -------------------------------------------------------------------------------- 1 | #include <RcppArmadillo.h> 2 | using namespace Rcpp; 3 | 4 | //' (Generalized) Mahalanobis distance 5 | //' 6 | //' Used to get the Euclidean distance as well by setting \code{mcov} = \code{diag(m)}. 7 | //' 8 | //' @param featureList List of vectors indicating all feature combinations that should be included in the computations. Assumes that the first one is empty. 9 | //' @param mcov Matrix. The Sigma-matrix in the Mahalanobis distance formula. \code{stats::cov(Xtrain_mat)} gives the Mahalanobis distance; 10 | //' \code{diag(m)} gives the Euclidean distance. 11 | //' @param S_scale_dist Logical. Indicates whether the distances for each feature combination should be scaled by the inverse squared number of features in that combination (the \code{S_scale} factor in the function body). 12 | //' @param Xtrain_mat Matrix 13 | //' @param Xtest_mat Matrix 14 | //' 15 | //' @export 16 | //' @keywords internal 17 | //' 18 | //' @return Array of three dimensions. Contains the squared distance between all training and test observations for all feature combinations passed to the function.
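//'
//' (Hedged addition for clarity; this restates what the function body below computes.)
//' For a feature combination \eqn{S}, slice \code{k} of the returned cube holds
//' \deqn{D^2_S(x_i, x^*_j) = (x_{i,S} - x^*_{j,S})^\top \Sigma_S^{-1} (x_{i,S} - x^*_{j,S}),}
//' evaluated through a forward solve with the lower-triangular Cholesky factor of the
//' submatrix \eqn{\Sigma_S} of \code{mcov}, and multiplied by \eqn{1/|S|^2} when
//' \code{S_scale_dist = TRUE}.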
19 | //' @author Martin Jullum 20 | // [[Rcpp::export]] 21 | arma::cube mahalanobis_distance_cpp(Rcpp::List featureList,arma::mat Xtrain_mat, arma::mat Xtest_mat, arma::mat mcov, bool S_scale_dist) { 22 | 23 | using namespace arma; 24 | 25 | // Define variables 26 | int ntrain = Xtrain_mat.n_rows; 27 | int ntest = Xtest_mat.n_rows; 28 | int p = featureList.size(); 29 | 30 | 31 | arma::mat mcov0; 32 | arma::mat cholDec; 33 | arma::mat mu0; 34 | arma::mat mu; 35 | arma::mat X; 36 | 37 | arma::cube out(ntrain,ntest,p,arma::fill::zeros); 38 | 39 | // Declaring some private variables 40 | 41 | double acc; 42 | uint32_t icol, irow, ii; 43 | double S_scale; 44 | IntegerVector temp; 45 | 46 | for (int k = 0; k < p; ++k){ 47 | temp = featureList[k]; 48 | if(temp.length() == 0) { 49 | continue; 50 | } 51 | arma::uvec theseFeatures = featureList[k]; 52 | theseFeatures = theseFeatures-1; 53 | 54 | mcov0 = mcov.submat(theseFeatures,theseFeatures); 55 | X = Xtrain_mat.cols(theseFeatures); 56 | mu0 = Xtest_mat.cols(theseFeatures); 57 | 58 | uint32_t d = X.n_cols; 59 | vec tmp(d); 60 | cholDec = trimatl(chol(mcov0).t()); 61 | vec D = cholDec.diag(); 62 | if(S_scale_dist){ 63 | S_scale = 1.0/pow(theseFeatures.n_elem,2.0); 64 | } else { 65 | S_scale = 1.0; 66 | } 67 | 68 | for (int j = 0; j < ntest; ++j) { 69 | mu = mu0.row(j); 70 | 71 | // For each of the ntrain training observations, forwardsolve the corresponding linear system. 72 | // Forwardsolve because I'm using the lower triangle Cholesky. 73 | for(icol = 0; icol < ntrain; icol++) 74 | { 75 | 76 | for(irow = 0; irow < d; irow++) 77 | { 78 | acc = 0.0; 79 | 80 | for(ii = 0; ii < irow; ii++) acc += tmp.at(ii) * cholDec.at(irow, ii); 81 | 82 | tmp.at(irow) = ( X.at(icol, irow) - mu.at(irow) - acc ) / D.at(irow); 83 | } 84 | 85 | out.at(icol,j,k) = sum(square(tmp)); 86 | } 87 | 88 | } 89 | out.slice(k) *= S_scale; 90 | } 91 | 92 | 93 | return out; 94 | } 95 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # shapr 0.2.0.9000 2 | 3 | * Development version 4 | * Added support for groupSHAP, including check of appropriate groups, examples and tests 5 | * Various modifications to input of internal functions to reflect that Shapley values may be 6 | computed both feature-wise and group-wise 7 | * Fixed bug when passing non-named data to shapr() or explain() (e.g. ```shapr(data[,1:5],model...)```) 8 | 9 | # shapr 0.2.0 10 | 11 | * Minor CRAN release 12 | * Added the new dependence modeling approach "ctree" which handles categorical features in addition 13 | to numerical ones. For more information see our paper https://doi.org/10.1007/978-3-030-57321-8_7 14 | * Added support to explain models which take as input categorical features for model classes like xgboost 15 | which natively take only numeric input. On the user side, an additional call to the new *make_dummies* 16 | function is required. See the vignette for details. 17 | * Slight change in the user procedure for explaining predictions from custom models. This now requires 18 | only a single function *predict_model*. 19 | * Introduced a thorough system for extracting and checking the feature information in the model and the data 20 | passed to *shapr* and *explain*. The features in the data are checked for consistency with what can be extracted 21 | from the model object. If the model object is missing some of the necessary information, the info from the data 22 | is used instead.
The system checks feature labels, classes, and any factor levels. 23 | * Due to the previous point, the *feature_labels* option previously used for custom models is removed. 24 | * Added a manual testing script for custom models (currently cannot be handled by testthat due to environment issues). 25 | * A few under-the-hood changes for checking in the *shapr* function. 26 | 27 | # shapr 0.1.4 28 | 29 | * Patch to fulfill CRAN policy of using packages under Suggests conditionally (in tests and examples) 30 | 31 | # shapr 0.1.3 32 | 33 | * Fix installation error on Solaris 34 | * Updated README with CRAN installation instructions and badges 35 | 36 | # shapr 0.1.2 37 | 38 | * CRAN release 39 | * Removed unused clustering code 40 | * Removed several package dependencies 41 | * Moved automatic check and pkgdown site build from Circle CI to GitHub actions 42 | * Some minor efficiency fixes 43 | * Changed stopping threshold from 12 to 13 features for non-sampling version of 44 | KernelSHAP for consistency with our recommendation 45 | * Changed package title (shortened) 46 | * Minor fixes to fulfill CRAN policy 47 | * Improved documentation 48 | * Revised internal/external and exported/non-exported functions, leading to far 49 | fewer external functions and a cleaner manual. 50 | 51 | # shapr 0.1.1 52 | 53 | * Journal of Open Source Software release 54 | * Improved installation instructions and community guidelines in README 55 | * Improved documentation 56 | * Some minor bugfixes 57 | 58 | # shapr 0.1.0 59 | 60 | * Support for custom models 61 | * Improved documentation 62 | * Automated testing using [testthat](https://github.com/r-lib/testthat) 63 | * Added vignette that gives an introduction to the package 64 | * Added webpage for package using [pkgdown](https://github.com/r-lib/pkgdown) 65 | * Improved API for end user 66 | * Various bugfixes 67 | 68 | # shapr 0.0.0.9000 69 | 70 | * First version of the package. Currently under development. 71 | -------------------------------------------------------------------------------- /src/weighted_matrix.cpp: -------------------------------------------------------------------------------- 1 | #include <RcppArmadillo.h> 2 | using namespace Rcpp; 3 | 4 | //' Calculate weight matrix 5 | //' 6 | //' @param subsets List. Each of the elements equals an integer 7 | //' vector representing a valid combination of features/feature groups. 8 | //' @param m Integer. Number of features/feature groups 9 | //' @param n Integer. Number of combinations 10 | //' @param w Numeric vector of length \code{n}, i.e. \code{w[i]} equals 11 | //' the Shapley weight of feature/feature group combination \code{i}, represented by 12 | //' \code{subsets[[i]]}. 13 | //' 14 | //' @export 15 | //' @keywords internal 16 | //' 17 | //' @return Matrix of dimension n x (m + 1) 18 | //' @author Nikolai Sellereite 19 | // [[Rcpp::export]] 20 | arma::mat weight_matrix_cpp(List subsets, int m, int n, NumericVector w){ 21 | 22 | // Note that Z is an n x (m + 1) matrix, where n is the number 23 | // of unique subsets. All elements in the first column are equal to 1. 24 | // For j > 0, Z(i, j) = 1 if and only if feature/feature group j is present in 25 | // the ith combination of subsets. For example, if Z(i, j) = 1 we know that 26 | // j is present in subsets[i]. 27 | 28 | // Note that w represents the diagonal in W, where W is a diagonal 29 | // n x n matrix. 30 | 31 | // Note that X.t() equals Z.t() * W, where w is the diagonal of W, which by 32 | // definition gives X = W * Z.
Since W is a diagonal matrix we could 33 | // simplify this so that X(i, j) = sum(W(i, k) * Z(k, j)) = W(i, i) * Z(i, j) 34 | // for all combinations of (i, j). 35 | 36 | // Note that R represents a (m + 1) x n matrix, i.e. R = (X.t() * Z)^-1 * X.t(), 37 | // where X.t() = Z.t() * W. 38 | 39 | // See \url{https://arxiv.org/pdf/1903.10464.pdf} for additional details, 40 | // i.e. section 3.2. 41 | 42 | // Define objects 43 | int n_elements; 44 | IntegerVector subset_vec; 45 | arma::mat Z(n, m + 1, arma::fill::zeros), X(n, m + 1, arma::fill::zeros); 46 | arma::mat R(m + 1, n, arma::fill::zeros); 47 | 48 | // Populate Z 49 | for (int i = 0; i < n; i++) { 50 | 51 | // Set all elements in the first column equal to 1 52 | Z(i, 0) = 1; 53 | 54 | // Extract subsets 55 | subset_vec = subsets[i]; 56 | n_elements = subset_vec.length(); 57 | if (n_elements > 0) { 58 | for (int j = 0; j < n_elements; j++) 59 | Z(i, subset_vec[j]) = 1; 60 | } 61 | } 62 | 63 | // Populate X 64 | for (int i = 0; i < n; i++) { 65 | 66 | for (int j = 0; j < Z.n_cols; j++) { 67 | 68 | X(i, j) = w[i] * Z(i, j); 69 | } 70 | } 71 | 72 | R = inv(X.t() * Z) * X.t(); 73 | 74 | return R; 75 | } 76 | 77 | //' Get feature matrix 78 | //' 79 | //' @param features List 80 | //' @param m Positive integer. Total number of features 81 | //' 82 | //' @export 83 | //' @keywords internal 84 | //' 85 | //' @return Matrix 86 | //' @author Nikolai Sellereite 87 | // [[Rcpp::export]] 88 | NumericMatrix feature_matrix_cpp(List features, int m) { 89 | 90 | // Define variables 91 | int n_combinations; 92 | n_combinations = features.length(); 93 | NumericMatrix A(n_combinations, m); 94 | 95 | // Error-check 96 | IntegerVector features_zero = features[0]; 97 | if (features_zero.length() > 0) 98 | Rcpp::stop("The first element of features should be an empty vector, i.e. integer(0)"); 99 | 100 | for (int i = 1; i < n_combinations; ++i) { 101 | 102 | NumericVector feature_vec = features[i]; 103 | 104 | for (int j = 0; j < feature_vec.length(); ++j) { 105 | 106 | A(i, feature_vec[j] - 1) = 1.0; 107 | } 108 | } 109 | 110 | return A; 111 | } 112 | 113 | -------------------------------------------------------------------------------- /man/shapr.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/shapley.R 3 | \name{shapr} 4 | \alias{shapr} 5 | \title{Create an explainer object with Shapley weights for test data.} 6 | \usage{ 7 | shapr(x, model, n_combinations = NULL, group = NULL) 8 | } 9 | \arguments{ 10 | \item{x}{Numeric matrix or data.frame/data.table. Contains the data used to estimate the (conditional) 11 | distributions for the features needed to properly estimate the conditional expectations in the Shapley formula.} 12 | 13 | \item{model}{The model whose predictions we want to explain. Run 14 | \code{\link[shapr:get_supported_models]{shapr:::get_supported_models()}} 15 | for a table of which models \code{shapr} supports natively.} 16 | 17 | \item{n_combinations}{Integer. The number of feature combinations to sample. If \code{NULL}, 18 | the exact method is used and all combinations are considered. The maximum number of 19 | combinations equals \code{2^ncol(x)}.} 20 | 21 | \item{group}{List. If \code{NULL}, regular feature-wise Shapley values are computed. 22 | If provided, group-wise Shapley values are computed. \code{group} then has length equal to 23 | the number of groups.
Each list element contains a character vector with the features included 24 | in the corresponding group.} 25 | } 26 | \value{ 27 | Named list that contains the following items: 28 | \describe{ 29 | \item{exact}{Boolean. Equals \code{TRUE} if \code{n_combinations = NULL} or 30 | \code{n_combinations >= 2^ncol(x)}, otherwise \code{FALSE}.} 31 | \item{n_features}{Positive integer. The number of columns in \code{x}.} 32 | \item{S}{Binary matrix. The number of rows equals the number of unique combinations, and 33 | the number of columns equals the total number of features. I.e. let's say we have a case with 34 | three features. In that case we have \code{2^3 = 8} unique combinations. If the j-th 35 | element of the i-th row equals \code{1}, it indicates that the j-th feature is present in 36 | the i-th combination. Otherwise it equals \code{0}.} 37 | \item{W}{Matrix. This matrix is equal to the matrix \code{R_D} in Equation 7 in the reference 38 | of \code{\link{explain}}. The Shapley value for a test observation will be equal to the matrix-vector product 39 | of \code{W} and the contribution vector.} 40 | \item{X}{data.table. Returned object from \code{\link{feature_combinations}}} 41 | \item{x_train}{data.table. \code{x} transformed into a data.table.} 42 | \item{feature_list}{List. The \code{updated_feature_list} output from 43 | \code{\link[shapr:preprocess_data]{preprocess_data}}} 44 | } 45 | 46 | In addition to the items above, \code{model} and \code{n_combinations} are also present in the returned object. 47 | } 48 | \description{ 49 | Create an explainer object with Shapley weights for test data. 50 | } 51 | \examples{ 52 | if (requireNamespace("MASS", quietly = TRUE)) { 53 | # Load example data 54 | data("Boston", package = "MASS") 55 | df <- Boston 56 | 57 | # Example using the exact method 58 | x_var <- c("lstat", "rm", "dis", "indus") 59 | y_var <- "medv" 60 | df0 <- df[, x_var] 61 | model <- lm(medv ~ lstat + rm + dis + indus, data = df) 62 | explainer <- shapr(df0, model) 63 | 64 | print(nrow(explainer$X)) 65 | # 16 (which equals 2^4) 66 | 67 | # Example using approximation 68 | y_var <- "medv" 69 | model <- lm(medv ~ ., data = df) 70 | explainer <- shapr(df, model, n_combinations = 1e3) 71 | 72 | print(nrow(explainer$X)) 73 | 74 | # Example using approximation where n_combinations > 2^m 75 | x_var <- c("lstat", "rm", "dis", "indus") 76 | y_var <- "medv" 77 | model <- lm(medv ~ lstat + rm + dis + indus, data = df) 78 | explainer <- shapr(df0, model, n_combinations = 1e3) 79 | 80 | print(nrow(explainer$X)) 81 | # 16 (which equals 2^4) 82 | 83 | # Example using groups 84 | group <- list(A=x_var[1:2], B=x_var[3:4]) 85 | 86 | explainer_group <- shapr(df0, model, group = group) 87 | print(nrow(explainer_group$X)) 88 | # 4 (which equals 2^(#groups)) 89 | } 90 | } 91 | \author{ 92 | Nikolai Sellereite 93 | } 94 | -------------------------------------------------------------------------------- /R/predictions.R: -------------------------------------------------------------------------------- 1 | #' Calculate Shapley weights for test data 2 | #' 3 | #' @description This function should only be called internally, and not be used as 4 | #' a stand-alone function. 5 | #' 6 | #' @param dt data.table 7 | #' @param prediction_zero Numeric. The value to use for \code{phi_0}. 8 | #' @param explainer An object of class \code{explainer}. See \code{\link{shapr}}.
9 | #' 10 | #' @details If \code{dt} does not contain the three columns \code{id}, \code{id_combination} and \code{w}, 11 | #' the function will fail. \code{id} represents a unique key for a given test observation, 12 | #' and \code{id_combination} is a unique key for which feature combination the row represents. \code{w} 13 | #' represents the Shapley weight of the feature combination given by \code{id_combination}. In addition 14 | #' to these three columns, \code{dt} should also have columns which match the variables used 15 | #' when training the model. 16 | #' 17 | #' E.g. if you have fitted a linear model using the features \code{x1}, 18 | #' \code{x2} and \code{x3}, and you want to explain 5 test observations using the exact method, i.e. 19 | #' setting \code{exact = TRUE} in \code{\link{shapr}}, the following properties should be satisfied: 20 | #' \enumerate{ 21 | #' \item \code{colnames(dt)} equals \code{c("x1", "x2", "x3", "id", "id_combination", "w")} 22 | #' \item \code{dt[, max(id)]} equals the number of test observations 23 | #' \item \code{dt[, min(id)]} equals 1L. 24 | #' \item \code{dt[, max(id_combination)]} equals \code{2^m} where m equals the number of features. 25 | #' \item \code{dt[, min(id_combination)]} equals 1L. 26 | #' \item \code{dt[, typeof(w)]} equals \code{"double"}. 27 | #' } 28 | #' 29 | #' 30 | #' @return An object of class \code{c("shapr", "list")}. For more details see \code{\link{explain}}. 31 | #' 32 | #' @keywords internal 33 | #' 34 | #' @author Nikolai Sellereite 35 | prediction <- function(dt, prediction_zero, explainer) { 36 | 37 | # Checks on input data 38 | id <- w <- id_combination <- p_hat <- NULL # due to NSE notes in R CMD check 39 | stopifnot( 40 | data.table::is.data.table(dt), 41 | !is.null(dt[["id"]]), 42 | !is.null(dt[["id_combination"]]), 43 | !is.null(dt[["w"]]) 44 | ) 45 | 46 | # Setup 47 | feature_names <- colnames(explainer$x_test) 48 | data.table::setkeyv(dt, c("id", "id_combination")) 49 | 50 | # Check that the number of test observations equals max(id) 51 | stopifnot(nrow(explainer$x_test) == dt[, max(id)]) 52 | 53 | # Reducing the prediction data.table 54 | max_id_combination <- nrow(explainer$S) 55 | V1 <- keep <- NULL # due to NSE notes in R CMD check 56 | dt[, keep := TRUE] 57 | first_element <- dt[, tail(.I, 1), .(id, id_combination)][id_combination %in% c(1, max_id_combination), V1] 58 | dt[id_combination %in% c(1, max_id_combination), keep := FALSE] 59 | dt[first_element, c("keep", "w") := list(TRUE, 1.0)] 60 | dt <- dt[keep == TRUE][, keep := NULL] 61 | 62 | # Predictions 63 | if (!all(dt[, unique(id_combination)] == 1)) { # Avoid warnings when predicting with empty newdata 64 | dt[id_combination != 1, p_hat := predict_model(explainer$model, newdata = .SD), .SDcols = feature_names] 65 | } 66 | dt[id_combination == 1, p_hat := prediction_zero] 67 | 68 | if (dt[, max(id_combination)] < max_id_combination) { 69 | p_all <- NULL 70 | } else { 71 | p_all <- dt[id_combination == max_id_combination, p_hat] 72 | names(p_all) <- 1:nrow(explainer$x_test) 73 | } 74 | 75 | 76 | # Calculate contributions 77 | dt_res <- dt[, .(k = sum((p_hat * w) / sum(w))), .(id, id_combination)] 78 | data.table::setkeyv(dt_res, c("id", "id_combination")) 79 | dt_mat <- data.table::dcast(dt_res, id_combination ~ id, value.var = "k") 80 | dt_mat[, id_combination := NULL] 81 | 82 | r <- list(p = p_all, dt_mat = dt_mat) 83 | 84 | return(r) 85 | } 86 | 87 | 88 | #' Compute Shapley values 89 | #' @param explainer An explainer object, as used internally by \code{\link{explain}}.
90 | #' @param contribution_mat The contribution matrix. 91 | #' @return A \code{data.table} with shapley values for each test observation. 92 | #' @export 93 | #' @keywords internal 94 | compute_shapley <- function(explainer, contribution_mat) { 95 | 96 | feature_names <- colnames(explainer$x_test) 97 | if (!explainer$is_groupwise) { 98 | shap_names <- feature_names 99 | } else { 100 | shap_names <- names(explainer$group) 101 | } 102 | 103 | 104 | kshap <- t(explainer$W %*% contribution_mat) 105 | dt_kshap <- data.table::as.data.table(kshap) 106 | colnames(dt_kshap) <- c("none", shap_names) 107 | 108 | return(dt_kshap) 109 | 110 | } 111 | -------------------------------------------------------------------------------- /inst/scripts/devel/compare_indep_implementations.R: -------------------------------------------------------------------------------- 1 | library(xgboost) 2 | library(shapr) 3 | 4 | data("Boston", package = "MASS") 5 | 6 | x_var <- c("crim", "zn", "indus","chas", "nox", "rm", "age", "dis", "rad", "tax", "ptratio", "black", "lstat")[1:6] 7 | y_var <- "medv" 8 | 9 | x_train <- as.matrix(Boston[-1:-6, x_var]) 10 | y_train <- Boston[-1:-6, y_var] 11 | x_test <- as.matrix(Boston[1:6, x_var]) 12 | 13 | x_test <- x_test[rep(1,1000),] 14 | 15 | # Looking at the dependence between the features 16 | cor(x_train) 17 | 18 | # Fitting a basic xgboost model to the training data 19 | model <- xgboost( 20 | data = x_train, 21 | label = y_train, 22 | nround = 20, 23 | verbose = FALSE 24 | ) 25 | 26 | # Prepare the data for explanation 27 | explainer <- shapr(x_train, model) 28 | 29 | # Specifying the phi_0, i.e. the expected prediction without any features 30 | p <- mean(y_train) 31 | 32 | # Computing the actual Shapley values with kernelSHAP accounting for feature dependence using 33 | # the empirical (conditional) distribution approach with bandwidth parameter sigma = 0.1 (default) 34 | t_old <- proc.time() 35 | explanation_old <- explain( 36 | x_test, 37 | approach = "empirical", 38 | type = "independence", 39 | explainer = explainer, 40 | prediction_zero = p, seed=111,n_samples = 100 41 | ) 42 | print(proc.time()-t_old) 43 | #user system elapsed 44 | #64.228 2.829 16.455 45 | 46 | t_new <- proc.time() 47 | explanation_new <- explain( 48 | x_test, 49 | approach = "independence", 50 | explainer = explainer, 51 | prediction_zero = p,seed = 111,n_samples = 100 52 | ) 53 | print(proc.time()-t_new) 54 | #user system elapsed 55 | #10.376 0.731 4.907 56 | 57 | colMeans(explanation_old$dt) 58 | colMeans(explanation_new$dt) 59 | #> colMeans(explanation_old$dt) 60 | #none crim zn indus chas nox rm 61 | #22.4459999 0.5606242 0.2137357 -0.3738064 -0.2214088 -0.3129846 0.9761603 62 | #> colMeans(explanation_new$dt) 63 | #none crim zn indus chas nox rm 64 | #22.4459999 0.5638315 0.2087042 -0.3697038 -0.2155639 -0.3173748 0.9724273 65 | 66 | t_old <- proc.time() 67 | explanation_full_old <- explain( 68 | x_test, 69 | approach = "empirical", 70 | type = "independence", 71 | explainer = explainer, 72 | prediction_zero = p, seed=111 73 | ) 74 | print(proc.time()-t_old) 75 | #user system elapsed 76 | #96.064 6.679 29.782 77 | 78 | t_new <- proc.time() 79 | explanation_full_new <- explain( 80 | x_test, 81 | approach = "independence", 82 | explainer = explainer, 83 | prediction_zero = p,seed = 111 84 | ) 85 | print(proc.time()-t_new) 86 | #user system elapsed 87 | #40.363 5.978 16.982 88 | 89 | explanation_full_old$dt 90 | explanation_full_new$dt 91 | 92 | #> explanation_full_old$dt 93 | #none crim zn indus 
chas nox rm 94 | #1: 22.446 0.5669854 0.2103575 -0.3720833 -0.2213789 -0.3109162 0.9693561 95 | #2: 22.446 0.5669846 0.2103578 -0.3720834 -0.2213790 -0.3109160 0.9693565 96 | #3: 22.446 0.5669852 0.2103576 -0.3720835 -0.2213789 -0.3109162 0.9693562 97 | #4: 22.446 0.5669855 0.2103575 -0.3720834 -0.2213790 -0.3109163 0.9693562 98 | #5: 22.446 0.5669851 0.2103576 -0.3720833 -0.2213794 -0.3109161 0.9693566 99 | #--- 100 | # 996: 22.446 0.5669856 0.2103575 -0.3720833 -0.2213791 -0.3109163 0.9693562 101 | #997: 22.446 0.5669851 0.2103575 -0.3720833 -0.2213790 -0.3109162 0.9693563 102 | #998: 22.446 0.5669854 0.2103576 -0.3720834 -0.2213790 -0.3109163 0.9693562 103 | #999: 22.446 0.5669853 0.2103575 -0.3720832 -0.2213791 -0.3109162 0.9693562 104 | #1000: 22.446 0.5669846 0.2103577 -0.3720833 -0.2213790 -0.3109161 0.9693565 105 | #> explanation_full_new$dt 106 | #none crim zn indus chas nox rm 107 | #1: 22.446 0.5669853 0.2103576 -0.3720834 -0.221379 -0.3109162 0.9693563 108 | #2: 22.446 0.5669853 0.2103576 -0.3720834 -0.221379 -0.3109162 0.9693563 109 | #3: 22.446 0.5669853 0.2103576 -0.3720834 -0.221379 -0.3109162 0.9693563 110 | #4: 22.446 0.5669853 0.2103576 -0.3720834 -0.221379 -0.3109162 0.9693563 111 | #5: 22.446 0.5669853 0.2103576 -0.3720834 -0.221379 -0.3109162 0.9693563 112 | #--- 113 | # 996: 22.446 0.5669853 0.2103576 -0.3720834 -0.221379 -0.3109162 0.9693563 114 | #997: 22.446 0.5669853 0.2103576 -0.3720834 -0.221379 -0.3109162 0.9693563 115 | #998: 22.446 0.5669853 0.2103576 -0.3720834 -0.221379 -0.3109162 0.9693563 116 | #999: 22.446 0.5669853 0.2103576 -0.3720834 -0.221379 -0.3109162 0.9693563 117 | #1000: 22.446 0.5669853 0.2103576 -0.3720834 -0.221379 -0.3109162 0.9693563 118 | 119 | -------------------------------------------------------------------------------- /inst/REFERENCES.bib: -------------------------------------------------------------------------------- 1 | @inproceedings{redelmeier2020explaining, 2 | title={Explaining predictive models with mixed features using Shapley values and conditional inference trees}, 3 | author={Redelmeier, Annabelle and Jullum, Martin and Aas, Kjersti}, 4 | booktitle={International Cross-Domain Conference for Machine Learning and Knowledge Extraction}, 5 | pages={117--137}, 6 | year={2020}, 7 | organization={Springer} 8 | } 9 | 10 | @article{aas2019explaining, 11 | title={Explaining individual predictions when features are dependent: More accurate approximations to Shapley values}, 12 | author={Aas, Kjersti and Jullum, Martin and L{\o}land, Anders}, 13 | journal={{Artificial Intelligence}}, 14 | volume={298}, 15 | year={2021} 16 | } 17 | 18 | @inproceedings{lundberg2017unified, 19 | title={A unified approach to interpreting model predictions}, 20 | author={Lundberg, Scott M and Lee, Su-In}, 21 | booktitle={Advances in Neural Information Processing Systems}, 22 | pages={4765--4774}, 23 | year={2017} 24 | } 25 | 26 | @article{kononenko2010efficient, 27 | title={An efficient explanation of individual classifications using game theory}, 28 | author={{\v{S}}trumbelj, Erik and Kononenko, Igor}, 29 | journal={Journal of Machine Learning Research}, 30 | volume={11}, 31 | number={Jan}, 32 | pages={1--18}, 33 | year={2010} 34 | } 35 | 36 | @article{vstrumbelj2014explaining, 37 | title={Explaining prediction models and individual predictions with feature contributions}, 38 | author={{\v{S}}trumbelj, Erik and Kononenko, Igor}, 39 | journal={Knowledge and information systems}, 40 | volume={41}, 41 | number={3}, 42 | pages={647--665}, 43 | year={2014}, 
44 | publisher={Springer} 45 | } 46 | 47 | @article{lundberg2018consistent, 48 | title={Consistent individualized feature attribution for tree ensembles}, 49 | author={Lundberg, Scott M and Erion, Gabriel G and Lee, Su-In}, 50 | journal={arXiv preprint arXiv:1802.03888}, 51 | year={2018} 52 | } 53 | 54 | @article{hurvich1998smoothing, 55 | title={Smoothing parameter selection in nonparametric regression using an improved Akaike information criterion}, 56 | author={Hurvich, Clifford M and Simonoff, Jeffrey S and Tsai, Chih-Ling}, 57 | journal={Journal of the Royal Statistical Society: Series B (Statistical Methodology)}, 58 | volume={60}, 59 | number={2}, 60 | pages={271--293}, 61 | year={1998}, 62 | publisher={Wiley Online Library} 63 | } 64 | 65 | @article{Shapley53, 66 | author={Lloyd S. Shapley}, 67 | title={{A Value for N-Person Games}}, 68 | journal={{Contributions to the Theory of Games}}, 69 | volume=2, 70 | year = 1953, 71 | pages ={307--317} 72 | } 73 | 74 | @article{sklar1959fonctions, 75 | title={Fonctions de repartition an dimensions et leurs marges}, 76 | author={Sklar, M}, 77 | journal={Publ. inst. statist. univ. Paris}, 78 | volume={8}, 79 | pages={229--231}, 80 | year={1959} 81 | } 82 | 83 | @article{rosenblatt1956, 84 | author = {Murray Rosenblatt}, 85 | journal = {{The Annals of Mathematical Statistics}}, 86 | pages = {832--837}, 87 | title = {{Remarks on Some Nonparametric Estimates of a Density Function}}, 88 | volume = 27, 89 | year = 1956 90 | } 91 | 92 | @Manual{lime_api, 93 | title = {lime: Local Interpretable Model-Agnostic Explanations}, 94 | author = {Thomas Lin Pedersen and Michaël Benesty}, 95 | year = {2019}, 96 | note = {R package version 0.5.1}, 97 | url = {https://CRAN.R-project.org/package=lime}, 98 | } 99 | 100 | @article{hothorn2006unbiased, 101 | title={Unbiased recursive partitioning: A conditional inference framework}, 102 | author={Hothorn, Torsten and Hornik, Kurt and Zeileis, Achim}, 103 | journal={Journal of Computational and Graphical statistics}, 104 | volume={15}, 105 | number={3}, 106 | pages={651--674}, 107 | year={2006}, 108 | publisher={Taylor \& Francis} 109 | } 110 | 111 | @article{partykit_package, 112 | title = {{partykit}: A Modular Toolkit for Recursive Partytioning in {R}}, 113 | author = {Torsten Hothorn and Achim Zeileis}, 114 | journal = {Journal of Machine Learning Research}, 115 | year = {2015}, 116 | volume = {16}, 117 | pages = {3905-3909}, 118 | } 119 | 120 | @article{Redelmeier2020ctree, 121 | title={Explaining predictive models with mixed features using Shapley values and conditional inference trees}, 122 | author={Redelmeier, Annabelle and Jullum, Martin and Aas, Kjersti}, 123 | journal={Submitted}, 124 | year={2020} 125 | } 126 | 127 | @inproceedings{jullum2021efficient, 128 | title={Efficient and simple prediction explanations with groupShapley: A practical perspective}, 129 | author={Jullum, Martin and Redelmeier, Annabelle and Aas, Kjersti}, 130 | year={2021}, 131 | pages={28-43}, 132 | booktitle={Proceedings of the 2nd Italian Workshop on Explainable Artificial Intelligence}, 133 | publisher={CEUR Workshop Proceedings} 134 | } 135 | -------------------------------------------------------------------------------- /tests/testthat/manual_test_scripts/test_custom_models.R: -------------------------------------------------------------------------------- 1 | # Test custom models 2 | 3 | # Doing all testing from shapr 4 | # Because new functions have to be created (to use gbm with shapr), we cannot use a classic testthat set 
up because 5 | # shapr will not see the functions created inside of the test environment. Therefore we have to test these functions 6 | # a bit differently (and more manual) than other tests. 7 | 8 | library(testthat) 9 | library(shapr) 10 | library(gbm) 11 | library(MASS) 12 | 13 | # Data ----------- 14 | data("Boston", package = "MASS") 15 | y_var <- "medv" 16 | x_train <- tail(Boston, -6) 17 | y_train <- tail(Boston[, y_var], -6) 18 | y_train_binary <- as.factor(tail((Boston[, y_var] > 20) * 1, -6)) 19 | 20 | # convert to factors for testing purposes 21 | x_train$rad <- factor(round(x_train$rad)) 22 | x_train$chas <- factor(round(x_train$chas)) 23 | 24 | train_df <- cbind(x_train, y_train, y_train_binary) 25 | 26 | 27 | x_var_numeric <- c("lstat", "rm", "dis", "indus") 28 | x_var_factor <- c("lstat", "rm", "dis", "indus", "rad", "chas") 29 | 30 | train_df_used_numeric <- x_train[, x_var_numeric] 31 | train_df_used_factor <- x_train[, x_var_factor] 32 | 33 | formula_numeric <- as.formula(paste0("y_train ~ ", paste0(x_var_numeric, collapse = "+"))) 34 | formula_factor <- as.formula(paste0("y_train ~ ", paste0(x_var_factor, collapse = "+"))) 35 | 36 | # Custom model with only numeric features 37 | model_custom <- gbm::gbm(formula_numeric, data = train_df, distribution = "gaussian") 38 | expect_error(shapr(train_df_used_numeric, model_custom)) # Required model objects defined 39 | get_model_specs.gbm <- function(x) { 40 | feature_list <- list() 41 | feature_list$labels <- labels(x$Terms) 42 | m <- length(feature_list$labels) 43 | feature_list$classes <- attr(x$Terms, "dataClasses")[-1] 44 | feature_list$factor_levels <- setNames(vector("list", m), feature_list$labels) 45 | feature_list$factor_levels[feature_list$classes == "factor"] <- NA # the model object don't contain factor levels info 46 | return(feature_list) 47 | } 48 | expect_error(shapr(train_df_used_numeric, model_custom)) # predict_model objects not defined 49 | 50 | predict_model.gbm <- function(x, newdata) { 51 | if (!requireNamespace("gbm", quietly = TRUE)) { 52 | stop("The gbm package is required for predicting train models") 53 | } 54 | model_type <- ifelse( 55 | x$distribution$name %in% c("bernoulli", "adaboost"), 56 | "classification", 57 | "regression" 58 | ) 59 | if (model_type == "classification") { 60 | predict(x, as.data.frame(newdata), type = "response", n.trees = x$n.trees) 61 | } else { 62 | predict(x, as.data.frame(newdata), n.trees = x$n.trees) 63 | } 64 | } 65 | 66 | expect_silent(shapr(train_df_used_numeric, model_custom)) # Both defined, so pass silently 67 | 68 | rm(get_model_specs.gbm) 69 | 70 | expect_message(shapr(train_df_used_numeric, model_custom)) # Only predict_model defined, so warning 71 | rm(predict_model.gbm) 72 | 73 | 74 | # Custom model with factors 75 | model_custom <- gbm::gbm(formula_factor, data = train_df, distribution = "gaussian") 76 | expect_error(shapr(train_df_used_factor, model_custom)) # Required model objects defined 77 | get_model_specs.gbm <- function(x) { 78 | feature_list <- list() 79 | feature_list$labels <- labels(x$Terms) 80 | m <- length(feature_list$labels) 81 | feature_list$classes <- attr(x$Terms, "dataClasses")[-1] 82 | feature_list$factor_levels <- setNames(vector("list", m), feature_list$labels) 83 | feature_list$factor_levels[feature_list$classes == "factor"] <- NA # model object doesn't contain factor level info 84 | return(feature_list) 85 | } 86 | expect_error(shapr(train_df_used_factor, model_custom)) # predict_model objects not defined 87 | 88 | predict_model.gbm 
<- function(x, newdata) { 89 | if (!requireNamespace("gbm", quietly = TRUE)) { 90 | stop("The gbm package is required for predicting with gbm models") 91 | } 92 | model_type <- ifelse( 93 | x$distribution$name %in% c("bernoulli", "adaboost"), 94 | "classification", 95 | "regression" 96 | ) 97 | if (model_type == "classification") { 98 | predict(x, as.data.frame(newdata), type = "response", n.trees = x$n.trees) 99 | } else { 100 | predict(x, as.data.frame(newdata), n.trees = x$n.trees) 101 | } 102 | } 103 | expect_message(shapr(train_df_used_factor, model_custom)) # Both defined, so pass with message as factor_levels is NA 104 | 105 | rm(get_model_specs.gbm) 106 | 107 | expect_message(shapr(train_df_used_factor, model_custom)) # Only predict_model defined, so warning message returned 108 | 109 | rm(predict_model.gbm) 110 | 111 | predict_model.gbm <- function(x, newdata) NULL 112 | 113 | # Erroneous predict_model defined, so throw error + messages 114 | expect_message(expect_error(shapr(train_df_used_factor, model_custom))) 115 | 116 | 117 | rm(predict_model.gbm) 118 | -------------------------------------------------------------------------------- /tests/testthat/test-observations.R: -------------------------------------------------------------------------------- 1 | context("test-observations.R") 2 | 3 | test_that("Test observation_impute", { 4 | if (requireNamespace("MASS", quietly = TRUE)) { 5 | # Examples 6 | n <- 20 7 | m <- 2 8 | sigma <- cov(matrix(MASS::mvrnorm(m * n, 0, 1), nrow = n)) 9 | x_train <- as.matrix(MASS::mvrnorm(n, mu = rep(0, m), Sigma = sigma), ncol = m) 10 | x_test <- t(as.matrix(MASS::mvrnorm(1, mu = rep(0, m), sigma))) 11 | colnames(x_train) <- colnames(x_test) <- paste0("X", seq(m)) 12 | S <- matrix(c(1, 0, 0, 1), nrow = m) 13 | W_kernel <- matrix(rnorm(n * ncol(S), mean = 1 / n, sd = 1 / n^2), nrow = n) 14 | r <- observation_impute(W_kernel, S, x_train, x_test) 15 | 16 | # Test the default argument n_samples 17 | expect_equal( 18 | observation_impute(W_kernel, S, x_train, x_test, n_samples = 1e3), 19 | observation_impute(W_kernel, S, x_train, x_test) 20 | ) 21 | 22 | # Test the default argument w_threshold 23 | expect_equal( 24 | observation_impute(W_kernel, S, x_train, x_test, w_threshold = .7), 25 | observation_impute(W_kernel, S, x_train, x_test) 26 | ) 27 | 28 | # Test that w_threshold reduces number of rows 29 | expect_true( 30 | nrow(observation_impute(W_kernel, S, x_train, x_test, w_threshold = .7)) > 31 | nrow(observation_impute(W_kernel, S, x_train, x_test, w_threshold = 0.5)) 32 | ) 33 | 34 | # Test that n_samples reduces number of rows 35 | expect_true( 36 | nrow(observation_impute(W_kernel, S, x_train, x_test)) > 37 | nrow(observation_impute(W_kernel, S, x_train, x_test, n_samples = 10)) 38 | ) 39 | 40 | # Tests error 41 | expect_error(observation_impute(1, S, x_train, x_test)) 42 | expect_error(observation_impute(W_kernel, 1, x_train, x_test)) 43 | expect_error(observation_impute(W_kernel, tail(S, -1), x_train, x_test)) 44 | expect_error(observation_impute(tail(W_kernel, -1), S, x_train, x_test)) 45 | 46 | # Test single result 47 | cnms <- c(colnames(x_train), "id_combination", "w") 48 | expect_true(data.table::is.data.table(r)) 49 | expect_true(ncol(r) == m + 2) 50 | expect_true(all(colnames(r) == cnms)) 51 | expect_true(all(unlist(lapply(r, is.numeric)))) 52 | expect_true(is.integer(r$id_combination)) 53 | } 54 | }) 55 | 56 | 57 | test_that("Check correct index_feature usage in prepare_data", { 58 | 59 | data("Boston", package = "MASS") 60 | x_var <-
c("lstat", "rm", "dis", "indus") 61 | y_var <- "medv" 62 | 63 | y_train <- tail(Boston[, y_var], 50) 64 | x <- as.matrix(head(Boston[, x_var], 2)) 65 | n_samples <- 100 66 | index_features <- 4:7 67 | w_threshold = 0.95 68 | type = "fixed_sigma" 69 | fixed_sigma_vec = 0.1 70 | n_samples_aicc = 1000 71 | eval_max_aicc = 20 72 | start_aicc = 0.1 73 | mincriterion = 0.95 74 | minsplit = 20 75 | minbucket = 7 76 | sample = TRUE 77 | 78 | explainer <- readRDS(file = "test_objects/shapley_explainer_obj.rds") 79 | explainer$x_test <- as.matrix(preprocess_data(x, explainer$feature_list)$x_dt) 80 | explainer$n_samples <- n_samples 81 | 82 | explainer$approach <- "independence" 83 | dt <- prepare_data(explainer, index_features = index_features) 84 | expect_identical(sort(dt[,unique(id_combination)]),index_features) 85 | 86 | explainer$type <- type 87 | explainer$fixed_sigma_vec <- fixed_sigma_vec 88 | explainer$n_samples_aicc <- n_samples_aicc 89 | explainer$eval_max_aicc <- eval_max_aicc 90 | explainer$start_aicc <- start_aicc 91 | explainer$w_threshold <- w_threshold 92 | explainer$cov_mat <- stats::cov(explainer$x_train) 93 | 94 | explainer$approach <- "empirical" 95 | dt <- prepare_data(explainer, index_features = index_features) 96 | expect_identical(sort(dt[,unique(id_combination)]),index_features) 97 | 98 | explainer$mu <- unname(colMeans(explainer$x_train)) 99 | explainer$approach <- "gaussian" 100 | dt <- prepare_data(explainer, index_features = index_features) 101 | expect_identical(sort(dt[,unique(id_combination)]),index_features) 102 | 103 | explainer$x_test_gaussian <- explainer$x_test # Shortcut 104 | explainer$approach <- "copula" 105 | dt <- prepare_data(explainer, index_features = index_features) 106 | expect_identical(sort(dt[,unique(id_combination)]),index_features) 107 | 108 | explainer$x_test_gaussian <- explainer$x_test # Shortcut 109 | explainer$approach <- "copula" 110 | dt <- prepare_data(explainer, index_features = index_features) 111 | expect_identical(sort(dt[,unique(id_combination)]),index_features) 112 | 113 | explainer$mincriterion <- mincriterion 114 | explainer$minsplit <- minsplit 115 | explainer$minbucket <- minbucket 116 | explainer$sample <- sample 117 | explainer$approach <- "ctree" 118 | explainer$x_test <- preprocess_data(x, explainer$feature_list)$x_dt 119 | dt <- prepare_data(explainer, index_features = index_features) 120 | expect_identical(sort(dt[,unique(id_combination)]),index_features) 121 | 122 | }) 123 | -------------------------------------------------------------------------------- /inst/joss_paper/paper.md: -------------------------------------------------------------------------------- 1 | --- 2 | # Example from https://joss.readthedocs.io/en/latest/submitting.html 3 | title: 'shapr: An R-package for explaining machine learning models with dependence-aware Shapley values' 4 | tags: 5 | - R 6 | - explainable AI 7 | - interpretable machine learning 8 | - shapley values 9 | - feature dependence 10 | authors: 11 | - name: Nikolai Sellereite 12 | orcid: 0000-0002-4671-0337 13 | affiliation: 1 # (Multiple affiliations must be quoted) 14 | - name: Martin Jullum 15 | orcid: 0000-0003-3908-5155 16 | affiliation: 1 17 | affiliations: 18 | - name: Norwegian Computing Center 19 | index: 1 20 | citation_author: Sellereite and Jullum 21 | date: 20 November 2019 22 | year: 2019 23 | formatted_doi: XX.XXXXX/joss.XXXXX 24 | bibliography: paper.bib 25 | output: rticles::joss_article 26 | csl: apa.csl 27 | journal: JOSS 28 | --- 29 | 30 | 31 | # Summary 32 | 33 | A 
common task within machine learning is to train a model to predict an unknown outcome 34 | (response variable) based on a set of known input variables/features. 35 | When using such models for real-life applications, it is often crucial to understand why a certain set of features leads 36 | to a specific prediction. 37 | Most machine learning models are, however, complicated and hard to understand, and they are therefore often viewed as 38 | "black boxes" that produce some output from some input. 39 | 40 | Shapley values [@Shapley53] are a concept from cooperative game theory used to fairly distribute a joint payoff among the 41 | cooperating players. 42 | @kononenko2010efficient and later @lundberg2017unified proposed to use the Shapley value framework to explain 43 | predictions by distributing the prediction value on the input features. 44 | Established methods and implementations for explaining predictions with Shapley values like Shapley 45 | Sampling Values [@vstrumbelj2014explaining], SHAP/Kernel SHAP [@lundberg2017unified], and to some extent 46 | TreeSHAP/TreeExplainer [@lundberg2018consistent; @Lundberg2020], assume that the features are independent when 47 | approximating the Shapley values. 48 | The `R`-package `shapr`, however, implements the methodology proposed by @aas2019explaining, where predictions are explained while 49 | accounting for the dependence between the features, resulting in significantly more accurate approximations to the 50 | Shapley values. 51 | 52 | 53 | 54 | # Implementation 55 | 56 | `shapr` implements a variant of the Kernel SHAP methodology [@lundberg2017unified] for efficiently dealing with the 57 | combinatorial problem related to the Shapley value formula. 58 | The main methodological contribution of @aas2019explaining consists of three different methods to estimate certain conditional 59 | expectation quantities, referred to as the _empirical_, _Gaussian_ and _copula_ approach. Additionally, the user has 60 | the option of combining the three approaches. 61 | The implementation supports explanation of models fitted with the following functions natively: `stats::lm` [@rCore], `stats::glm` [@rCore], 62 | `ranger::ranger` [@ranger], `mgcv::gam` [@mgcv] and `xgboost::xgboost`/`xgboost::xgb.train` [@xgboost]. 63 | Moreover, the package supports explanation of custom models by supplying two simple additional class functions. 64 | 65 | For reference, the package also includes a benchmark implementation of the original (independence assuming) version of 66 | Kernel SHAP [@lundberg2017unified], providing identical results to the "official" Kernel SHAP `Python` package `shap`. 67 | This allows the user to easily see the effect and importance of accounting for the feature dependence. 68 | 69 | The user interface in the package has largely been adopted from the `R`-package `lime` [@limeRpackage]. 70 | The user first sets up the explainability framework with the `shapr` function. 71 | Then the output from `shapr` is provided to the `explain` function, along with the data to explain the prediction 72 | and the method that should be used to estimate the aforementioned conditional expectations. 73 | 74 | The majority of the code is in plain `R` [@rCore], while the most time-consuming operations are coded in `C++` 75 | through the `Rcpp` package [@rcppRpackage] and `RcppArmadillo` package [@eddelbuettel2014rcpparmadillo] for computational speed up.
76 | For RAM efficiency and computational speed up of typical bookkeeping operations, we utilize the `data.table` 77 | package [@datatableRpackage] which does operations "by reference", i.e. without memory copies. 78 | 79 | For a detailed description of the underlying methodology that the package implements, we refer to the 80 | [paper](https://arxiv.org/abs/1903.10464) [@aas2019explaining] which uses the package in examples and simulation 81 | studies. 82 | To get started with the package, we recommend going through the package vignette and introductory examples 83 | available at the package's [pkgdown site](https://norskregnesentral.github.io/shapr/). 84 | 85 | # Acknowledgement 86 | 87 | This work was supported by the Norwegian Research Council grant 237718 (Big Insight). 88 | 89 | 90 | # References 91 | -------------------------------------------------------------------------------- /inst/joss_paper/paper.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | # Example from https://joss.readthedocs.io/en/latest/submitting.html 3 | title: 'shapr: An R-package for explaining machine learning models with dependence-aware Shapley values' 4 | tags: 5 | - R 6 | - explainable AI 7 | - interpretable machine learning 8 | - shapley values 9 | - feature dependence 10 | authors: 11 | - name: Nikolai Sellereite 12 | orcid: 0000-0002-4671-0337 13 | affiliation: 1 # (Multiple affiliations must be quoted) 14 | - name: Martin Jullum 15 | orcid: 0000-0003-3908-5155 16 | affiliation: 1 17 | affiliations: 18 | - name: Norwegian Computing Center 19 | index: 1 20 | citation_author: Sellereite and Jullum 21 | date: 20 November 2019 22 | year: 2019 23 | formatted_doi: XX.XXXXX/joss.XXXXX 24 | bibliography: paper.bib 25 | output: rticles::joss_article 26 | csl: apa.csl 27 | journal: JOSS 28 | --- 29 | 30 | 31 | # Summary 32 | 33 | A common task within machine learning is to train a model to predict an unknown outcome 34 | (response variable) based on a set of known input variables/features. 35 | When using such models for real-life applications, it is often crucial to understand why a certain set of features leads 36 | to a specific prediction. 37 | Most machine learning models are, however, complicated and hard to understand, and they are therefore often viewed as 38 | "black boxes" that produce some output from some input. 39 | 40 | Shapley values [@Shapley53] are a concept from cooperative game theory used to fairly distribute a joint payoff among the 41 | cooperating players. 42 | @kononenko2010efficient and later @lundberg2017unified proposed to use the Shapley value framework to explain 43 | predictions by distributing the prediction value on the input features. 44 | Established methods and implementations for explaining predictions with Shapley values like Shapley 45 | Sampling Values [@vstrumbelj2014explaining], SHAP/Kernel SHAP [@lundberg2017unified], and to some extent 46 | TreeSHAP/TreeExplainer [@lundberg2018consistent; @Lundberg2020], assume that the features are independent when 47 | approximating the Shapley values. 48 | The `R`-package `shapr`, however, implements the methodology proposed by @aas2019explaining, where predictions are explained while 49 | accounting for the dependence between the features, resulting in significantly more accurate approximations to the 50 | Shapley values.
51 | 52 | 53 | 54 | # Implementation 55 | 56 | `shapr` implements a variant of the Kernel SHAP methodology [@lundberg2017unified] for efficiently dealing with the 57 | combinatorial problem related to the Shapley value formula. 58 | The main methodological contribution of @aas2019explaining is three different methods to estimate certain conditional 59 | expectation quantities, referred to as the _empirical_, _Gaussian_ and _copula_ approaches. Additionally, the user has 60 | the option of combining the three approaches. 61 | The implementation natively supports explanation of models fitted with the following functions: `stats::lm` [@rCore], `stats::glm` [@rCore], 62 | `ranger::ranger` [@ranger], `mgcv::gam` [@mgcv] and `xgboost::xgboost`/`xgboost::xgb.train` [@xgboost]. 63 | Moreover, the package supports explanation of custom models by supplying two simple additional class functions. 64 | 65 | For reference, the package also includes a benchmark implementation of the original (independence-assuming) version of 66 | Kernel SHAP [@lundberg2017unified], providing identical results to the "official" Kernel SHAP `Python` package `shap`. 67 | This allows the user to easily see the effect and importance of accounting for the feature dependence. 68 | 69 | The user interface in the package has largely been adopted from the `R`-package `lime` [@limeRpackage]. 70 | The user first sets up the explainability framework with the `shapr` function. 71 | Then the output from `shapr` is provided to the `explain` function, along with the data whose predictions are to be explained 72 | and the method that should be used to estimate the aforementioned conditional expectations. 73 | 74 | The majority of the code is in plain `R` [@rCore], while the most time-consuming operations are coded in `C++` 75 | through the `Rcpp` package [@rcppRpackage] and `RcppArmadillo` package [@eddelbuettel2014rcpparmadillo] for computational speed-up. 76 | 77 | For RAM efficiency and computational speed-up of typical bookkeeping operations, we utilize the `data.table` 78 | package [@datatableRpackage], which performs operations "by reference", i.e. without memory copies. 79 | 80 | For a detailed description of the underlying methodology that the package implements, we refer to the 81 | [paper](https://arxiv.org/abs/1903.10464) [@aas2019explaining], which uses the package in examples and simulation 82 | studies. 83 | To get started with the package, we recommend going through the package vignette and introductory examples 84 | available at the package's [pkgdown site](https://norskregnesentral.github.io/shapr/). 85 | 86 | # Acknowledgement 87 | 88 | This work was supported by the Norwegian Research Council grant 237718 (Big Insight). 89 | 90 | 91 | # References 92 | -------------------------------------------------------------------------------- /R/plot.R: -------------------------------------------------------------------------------- 1 | #' Plot of the Shapley value explanations 2 | #' 3 | #' @description Plots the individual prediction explanations. 4 | #' 5 | #' @param x A \code{shapr} object. See \code{\link{explain}}. 6 | #' @param digits Integer. Number of significant digits to use in the feature description 7 | #' @param plot_phi0 Logical. Whether to include \code{phi0} in the plot 8 | #' @param index_x_test Integer vector. Which of the test observations to plot. E.g. if you have 9 | #' explained 10 observations using \code{\link{explain}}, you can generate a plot for the first 5 10 | #' observations by setting \code{index_x_test = 1:5}.
11 | #' @param top_k_features Integer. How many features to include in the plot. E.g. if you have 15 12 | #' features in your model, you can plot the 5 most important features for each explanation by setting 13 | #' \code{top_k_features = 5}. 14 | #' @param ... Currently not used. 15 | #' 16 | #' @details See \code{vignette("understanding_shapr", package = "shapr")} for an example of 17 | #' how to use the function. 18 | #' 19 | #' @return ggplot object with plots of the Shapley value explanations 20 | #' 21 | #' @export 22 | #' @examples 23 | #' if (requireNamespace("MASS", quietly = TRUE)) { 24 | #' # Load example data 25 | #' data("Boston", package = "MASS") 26 | #' 27 | #' # Split data into test- and training data 28 | #' x_train <- head(Boston, -3) 29 | #' x_test <- tail(Boston, 3) 30 | #' 31 | #' # Fit a linear model 32 | #' model <- lm(medv ~ lstat + rm + dis + indus, data = x_train) 33 | #' 34 | #' # Create an explainer object 35 | #' explainer <- shapr(x_train, model) 36 | #' 37 | #' # Explain predictions 38 | #' p <- mean(x_train$medv) 39 | #' 40 | #' # Empirical approach 41 | #' explanation <- explain(x_test, 42 | #' explainer, 43 | #' approach = "empirical", 44 | #' prediction_zero = p, 45 | #' n_samples = 1e2 46 | #' ) 47 | #' 48 | #' if (requireNamespace("ggplot2", quietly = TRUE)) { 49 | #' # Plot the explanation (this function) 50 | #' plot(explanation) 51 | #' } 52 | #' } 53 | #' @author Martin Jullum 54 | plot.shapr <- function(x, 55 | digits = 3, 56 | plot_phi0 = TRUE, 57 | index_x_test = NULL, 58 | top_k_features = NULL, 59 | ...) { 60 | if (!requireNamespace("ggplot2", quietly = TRUE)) { 61 | stop("ggplot2 is not installed. Please run install.packages('ggplot2')") 62 | } 63 | 64 | if (is.null(index_x_test)) index_x_test <- seq(nrow(x$x_test)) 65 | if (is.null(top_k_features)) top_k_features <- ncol(x$x_test) + 1 66 | id <- phi <- NULL # due to NSE notes in R CMD check 67 | 68 | is_groupwise <- x$is_groupwise 69 | 70 | # melting Kshap 71 | shap_names <- colnames(x$dt)[-1] 72 | KshapDT <- data.table::copy(x$dt) 73 | KshapDT[, id := .I] 74 | meltKshap <- data.table::melt(KshapDT, id.vars = "id", value.name = "phi") 75 | meltKshap[, sign := factor(sign(phi), levels = c(1, -1), labels = c("Increases", "Decreases"))] 76 | 77 | # Converting and melting Xtest 78 | if (!is_groupwise) { 79 | desc_mat <- format(x$x_test, digits = digits) 80 | for (i in 1:ncol(desc_mat)) { 81 | desc_mat[, i] <- paste0(shap_names[i], " = ", desc_mat[, i]) 82 | } 83 | } else { 84 | desc_mat <- format(x$dt[, -1], digits = digits) 85 | for (i in 1:ncol(desc_mat)) { 86 | desc_mat[, i] <- paste0(shap_names[i]) 87 | } 88 | } 89 | 90 | desc_dt <- data.table::as.data.table(cbind(none = "none", desc_mat)) 91 | melt_desc_dt <- data.table::melt(desc_dt[, id := .I], id.vars = "id", value.name = "description") 92 | 93 | # Data table for plotting 94 | plotting_dt <- merge(meltKshap, melt_desc_dt) 95 | 96 | 97 | # Adding the predictions 98 | predDT <- data.table::data.table(id = KshapDT$id, pred = x$p) 99 | plotting_dt <- merge(plotting_dt, predDT, by = "id") 100 | 101 | # Adding header for each individual plot 102 | header <- variable <- pred <- description <- NULL # due to NSE notes in R CMD check 103 | plotting_dt[, header := paste0("id: ", id, ", pred = ", format(pred, digits = digits + 1))] 104 | 105 | if (!plot_phi0) { 106 | plotting_dt <- plotting_dt[variable != "none"] 107 | } 108 | plotting_dt <- plotting_dt[id %in% index_x_test] 109 | plotting_dt[, rank := data.table::frank(-abs(phi)), by =
"id"] 110 | plotting_dt <- plotting_dt[rank <= top_k_features] 111 | plotting_dt[, description := factor(description, levels = unique(description[order(abs(phi))]))] 112 | 113 | # Plotting 114 | gg <- ggplot2::ggplot(plotting_dt) + 115 | ggplot2::facet_wrap(~header, scales = "free_y", labeller = "label_value", ncol = 2) + 116 | ggplot2::geom_col(ggplot2::aes(x = description, y = phi, fill = sign)) + 117 | ggplot2::coord_flip() + 118 | ggplot2::scale_fill_manual(values = c("steelblue", "lightsteelblue"), drop = TRUE) + 119 | ggplot2::labs( 120 | y = "Feature contribution", 121 | x = "Feature", 122 | fill = "", 123 | title = "Shapley value prediction explanation" 124 | ) + 125 | ggplot2::theme( 126 | legend.position = "bottom", 127 | plot.title = ggplot2::element_text(hjust = 0.5) 128 | ) 129 | 130 | return(gg) 131 | } 132 | -------------------------------------------------------------------------------- /inst/scripts/compare_shap_python.R: -------------------------------------------------------------------------------- 1 | library(MASS) 2 | library(xgboost) 3 | library(shapr) 4 | library(data.table) 5 | 6 | # Python settings 7 | # Using the virtual environment here "../../Python/.venv/bin/python", as set by 8 | #Sys.setenv(RETICULATE_PYTHON = "../../Python/.venv/bin/python") in the .Rprofile 9 | library(reticulate) 10 | 11 | # Install some packages 12 | #py_install("xgboost") 13 | #py_install("shap") 14 | #py_install("pandas") 15 | 16 | data("Boston") 17 | 18 | x_var <- c("lstat", "rm", "dis", "indus") 19 | y_var <- "medv" 20 | 21 | x_train <- as.matrix(tail(Boston[, x_var], -6)) 22 | y_train <- tail(Boston[, y_var], -6) 23 | x_test <- as.matrix(head(Boston[, x_var], 6)) 24 | 25 | # Creating a larger test data set (600 observations) for more realistic function time calls. 26 | # Modifying x_test to repeat the 6 test observations 100 times 27 | x_test = rep(1,100) %x% x_test 28 | colnames(x_test) <- colnames(x_train) 29 | 30 | # Reading the R format version of the xgboost model to avoid crash reading same xgboost model in R and Python 31 | model <- readRDS(system.file("model_objects", "xgboost_model_object.rds", package = "shapr")) 32 | 33 | pred_test <- predict(model,x_test) 34 | 35 | # Spedifying the phi_0, i.e. 
the expected prediction without any features 36 | p0 <- mean(predict(model,x_train)) # adjustment from the standard mean(y_train) to comply with the shap implementation 37 | 38 | time_R_start <- proc.time() 39 | # Prepare the data for explanation 40 | explainer <- shapr(x_train, model) 41 | 42 | time_R_prepare <- proc.time() 43 | 44 | # Computing the Shapley values with Kernel SHAP under the independence approach, and below with the empirical 45 | # (conditional) approach using a very large fixed bandwidth, which effectively also assumes independence 46 | explanation_independence <- explain(x_test, explainer, approach = "independence", prediction_zero = p0) 47 | 48 | time_R_indep0 <- proc.time() 49 | 50 | explanation_largesigma <- explain(x_test, explainer, approach = "empirical", type = "fixed_sigma", 51 | fixed_sigma_vec = 10000, w_threshold = 1, prediction_zero = p0) 52 | 53 | time_R_largesigma0 <- proc.time() 54 | 55 | time_R_indep <- time_R_indep0 - time_R_start 56 | time_R_largesigma <- (time_R_largesigma0 - time_R_indep0) + (time_R_prepare - time_R_start) 57 | 58 | # Printing the Shapley values for the test data 59 | Kshap_indep <- explanation_independence$dt 60 | Kshap_largesigma <- explanation_largesigma$dt 61 | 62 | head(Kshap_indep) 63 | #> Kshap_indep 64 | # none lstat rm dis indus 65 | #1: 22,41355 7,116128 0,5203017 -1,91427784 3,1657530 66 | #2: 22,41355 2,173011 -1,2201068 -0,47653736 0,3620256 67 | #3: 22,41355 8,280909 3,7869719 -1,96968536 0,6037250 68 | #4: 22,41355 8,384073 2,9590225 -2,19376523 1,8672685 69 | #5: 22,41355 4,212031 3,8319436 -0,06695137 1,3392699 70 | #6: 22,41355 3,295275 -1,2450126 -0,70618891 1,0924035 71 | 72 | head(Kshap_largesigma) 73 | #> Kshap_largesigma 74 | # none lstat rm dis indus 75 | #1: 22,41355 7,116128 0,5203018 -1,9142779 3,1657530 76 | #2: 22,41355 2,173011 -1,2201069 -0,4765373 0,3620255 77 | #3: 22,41355 8,280910 3,7869718 -1,9696854 0,6037249 78 | #4: 22,41355 8,384073 2,9590226 -2,1937652 1,8672685 79 | #5: 22,41355 4,212031 3,8319435 -0,0669514 1,3392700 80 | #6: 22,41355 3,295275 -1,2450126 -0,7061889 1,0924036 81 | 82 | 83 | # Checking the difference between the methods 84 | mean(abs(as.matrix(Kshap_indep)-as.matrix(Kshap_largesigma))) 85 | #[1] 8.404487e-08 # Numerically identical 86 | 87 | 88 | 89 | #### Running shap from Python #### 90 | reticulate::py_run_file(system.file("scripts", "shap_python_script.py", package = "shapr")) 91 | # Writes Python objects to the list py # 92 | 93 | # Checking that the predictions are identical 94 | sum((pred_test-py$py_pred_test)^2) 95 | 96 | head(Kshap_indep) 97 | #> Kshap_indep 98 | # none lstat rm dis indus 99 | #1: 22,41355 7,116128 0,5203017 -1,91427784 3,1657530 100 | #2: 22,41355 2,173011 -1,2201068 -0,47653736 0,3620256 101 | #3: 22,41355 8,280909 3,7869719 -1,96968536 0,6037250 102 | #4: 22,41355 8,384073 2,9590225 -2,19376523 1,8672685 103 | #5: 22,41355 4,212031 3,8319436 -0,06695137 1,3392699 104 | #6: 22,41355 3,295275 -1,2450126 -0,70618891 1,0924035 105 | 106 | head(py$Kshap_shap) 107 | #> py$Kshap_shap 108 | # none lstat rm dis indus 109 | #1 22,41355 7,116128 0,5203018 -1,91427784 3,1657530 110 | #2 22,41355 2,173011 -1,2201069 -0,47653727 0,3620255 111 | #3 22,41355 8,280910 3,7869719 -1,96968537 0,6037250 112 | #4 22,41355 8,384073 2,9590226 -2,19376508 1,8672686 113 | #5 22,41355 4,212031 3,8319435 -0,06695135 1,3392701 114 | #6 22,41355 3,295275 -1,2450126 -0,70618891 1,0924036 115 | 116 | 117 | # Checking the difference between our R implementation and the shap implementation in Python 118 |
mean(abs(as.matrix(Kshap_indep)-as.matrix(py$Kshap_shap))) 119 | #[1] 1,300368e-07 # Numerically identical 120 | 121 | # Checking the running time of the different methods 122 | time_R_indep[3] 123 | time_R_largesigma[3] 124 | py$time_py 125 | 126 | #> time_R_indep[3] 127 | #elapsed 128 | #7,417 129 | #> time_R_largesigma[3] 130 | #elapsed 131 | #6,271 132 | #> py$time_py 133 | #[1] 21,23536 134 | 135 | # Our R implementation is about 3 times faster than the shap package on this task. 136 | # There may be some overhead from calling Python from R, but it should not be anywhere near that large. 137 | 138 | 139 | 140 | 141 | 142 | 143 | -------------------------------------------------------------------------------- /inst/joss_paper/paper.bib: -------------------------------------------------------------------------------- 1 | @article{aas2019explaining, 2 | title={Explaining individual predictions when features are dependent: More accurate approximations to {S}hapley values}, 3 | author={Aas, Kjersti and Jullum, Martin and L{\o}land, Anders}, 4 | journal={arXiv preprint arXiv:1903.10464}, 5 | year={2019} 6 | } 7 | 8 | @inproceedings{lundberg2017unified, 9 | title={A unified approach to interpreting model predictions}, 10 | author={Lundberg, Scott M and Lee, Su-In}, 11 | booktitle={Advances in Neural Information Processing Systems}, 12 | pages={4765--4774}, 13 | year={2017} 14 | } 15 | 16 | @article{kononenko2010efficient, 17 | title={An efficient explanation of individual classifications using game theory}, 18 | author={{\v{S}}trumbelj, Erik and Kononenko, Igor}, 19 | journal={Journal of Machine Learning Research}, 20 | volume={11}, 21 | number={Jan}, 22 | pages={1--18}, 23 | year={2010} 24 | } 25 | 26 | @article{vstrumbelj2014explaining, 27 | title={Explaining prediction models and individual predictions with feature contributions}, 28 | author={{\v{S}}trumbelj, Erik and Kononenko, Igor}, 29 | journal={Knowledge and Information Systems}, 30 | volume={41}, 31 | number={3}, 32 | pages={647--665}, 33 | year={2014}, 34 | publisher={Springer}, 35 | doi={10.1007/s10115-013-0679-x}, 36 | } 37 | 38 | @article{lundberg2018consistent, 39 | title={Consistent individualized feature attribution for tree ensembles}, 40 | author={Lundberg, Scott M and Erion, Gabriel G and Lee, Su-In}, 41 | journal={arXiv preprint arXiv:1802.03888}, 42 | year={2018} 43 | } 44 | 45 | @article{hurvich1998smoothing, 46 | title={Smoothing parameter selection in nonparametric regression using an improved Akaike information criterion}, 47 | author={Hurvich, Clifford M and Simonoff, Jeffrey S and Tsai, Chih-Ling}, 48 | journal={Journal of the Royal Statistical Society: Series B (Statistical Methodology)}, 49 | volume={60}, 50 | number={2}, 51 | pages={271--293}, 52 | year={1998}, 53 | publisher={Wiley Online Library}, 54 | doi={10.1111/1467-9868.00125} 55 | } 56 | 57 | @Article{Shapley53, 58 | author={Lloyd S. Shapley}, 59 | title={A value for n-person games}, 60 | journal={Contributions to the Theory of Games}, 61 | volume=2, 62 | year = 1953, 63 | pages ={307--317} 64 | } 65 | 66 | @article{sklar1959fonctions, 67 | title={Fonctions de r{\'e}partition {\`a} n dimensions et leurs marges}, 68 | author={Sklar, M}, 69 | journal={Publ. Inst. Statist. Univ.
Paris}, 70 | volume={8}, 71 | pages={229--231}, 72 | year={1959} 73 | } 74 | 75 | @article{rosenblatt1956, 76 | author = {Murray Rosenblatt}, 77 | journal = {{The Annals of Mathematical Statistics}}, 78 | pages = {832--837}, 79 | title = {{Remarks on Some Nonparametric Estimates of a Density Function}}, 80 | volume = 27, 81 | year = 1956, 82 | doi = {10.1214/aoms/1177728190} 83 | } 84 | 85 | 86 | @Manual{limeRpackage, 87 | title = {{lime}: Local Interpretable Model-Agnostic Explanations}, 88 | author = {Thomas Lin Pedersen and Michaël Benesty}, 89 | year = {2019}, 90 | note = {R package version 0.5.1}, 91 | url = {https://CRAN.R-project.org/package=lime}, 92 | } 93 | 94 | 95 | @Article{rcppRpackage, 96 | title = {{Rcpp}: Seamless {R} and {C++} Integration}, 97 | author = {Dirk Eddelbuettel and Romain Fran\c{c}ois}, 98 | journal = {Journal of Statistical Software}, 99 | year = {2011}, 100 | volume = {40}, 101 | number = {8}, 102 | pages = {1--18}, 103 | url = {http://www.jstatsoft.org/v40/i08/}, 104 | doi = {10.18637/jss.v040.i08}, 105 | } 106 | 107 | @Manual{rCore, 108 | title = {R: A language and environment for statistical computing}, 109 | author = {{R Core Team}}, 110 | organization = {R Foundation for Statistical Computing}, 111 | year = {2019}, 112 | url = {https://www.R-project.org/}, 113 | } 114 | 115 | @Manual{datatableRpackage, 116 | title = {{data.table}: Extension of `data.frame`}, 117 | author = {Matt Dowle and Arun Srinivasan}, 118 | year = {2019}, 119 | note = {R package version 1.12.2}, 120 | url = {https://CRAN.R-project.org/package=data.table}, 121 | } 122 | 123 | 124 | @article{Lundberg2020, 125 | author = {Scott M. Lundberg and 126 | Gabriel G. Erion and 127 | Hugh Chen and 128 | Alex DeGrave and 129 | Jordan M. Prutkin and 130 | Bala Nair and 131 | Ronit Katz and 132 | Jonathan Himmelfarb and 133 | Nisha Bansal and 134 | Su{-}In Lee}, 135 | title = {From local explanations to global understanding with explainable {AI} for trees}, 136 | journal = {Nature Machine Intelligence}, 137 | volume = {2}, 138 | pages = {56-67}, 139 | year = {2020}, 140 | doi = {10.1038/s42256-019-0138-9} 141 | } 142 | 143 | @article{eddelbuettel2014rcpparmadillo, 144 | title={RcppArmadillo: {A}ccelerating {R} with high-performance {C}++ linear algebra}, 145 | author={Eddelbuettel, Dirk and Sanderson, Conrad}, 146 | journal={Computational Statistics \& Data Analysis}, 147 | volume={71}, 148 | pages={1054--1063}, 149 | year={2014}, 150 | publisher={Elsevier}, 151 | doi = {10.1016/j.csda.2013.02.005} 152 | } 153 | 154 | @Article{ranger, 155 | title = {{ranger}: A Fast Implementation of Random Forests for High Dimensional Data in {C++} and {R}}, 156 | author = {Marvin N.
Wright and Andreas Ziegler}, 157 | journal = {Journal of Statistical Software}, 158 | year = {2017}, 159 | volume = {77}, 160 | number = {1}, 161 | pages = {1--17}, 162 | doi = {10.18637/jss.v077.i01}, 163 | } 164 | 165 | @Manual{xgboost, 166 | title = {xgboost: Extreme Gradient Boosting}, 167 | author = {Tianqi Chen and Tong He and Michael Benesty and Vadim Khotilovich and Yuan Tang and Hyunsu Cho and Kailong Chen and Rory Mitchell and Ignacio Cano and Tianyi Zhou and Mu Li and Junyuan Xie and Min Lin and Yifeng Geng and Yutian Li}, 168 | year = {2019}, 169 | note = {R package version 0.90.0.2}, 170 | url = {https://CRAN.R-project.org/package=xgboost}, 171 | } 172 | 173 | @Book{mgcv, 174 | title = {Generalized Additive Models: An Introduction with {R}}, 175 | year = {2017}, 176 | author = {S. N. Wood}, 177 | edition = {2}, 178 | publisher = {Chapman and Hall/CRC}, 179 | } 180 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #' Computing single H matrix in AICc-function using the Mahalanobis distance 5 | #' 6 | #' @param X matrix with "covariates" 7 | #' @param mcov covariance matrix 8 | #' @param S_scale_dist logical indicating whether the Mahalanobis distance should be scaled with the number of variables 9 | #' @param h numeric specifying the scaling (sigma) 10 | #' 11 | #' @export 12 | #' @keywords internal 13 | #' 14 | #' @return Matrix of dimension \code{nrow(X)*nrow(X)} 15 | #' @author Martin Jullum 16 | hat_matrix_cpp <- function(X, mcov, S_scale_dist, h) { 17 | .Call(`_shapr_hat_matrix_cpp`, X, mcov, S_scale_dist, h) 18 | } 19 | 20 | #' sigma_hat_sq-function 21 | #' 22 | #' @param H Matrix. Output from \code{\link{hat_matrix_cpp}} 23 | #' @param y Vector representing the response variable 24 | #' 25 | #' @export 26 | #' @keywords internal 27 | #' 28 | #' @return Scalar 29 | #' 30 | #' @author Martin Jullum 31 | rss_cpp <- function(H, y) { 32 | .Call(`_shapr_rss_cpp`, H, y) 33 | } 34 | 35 | #' Correction term with trace input in the AICc formula 36 | #' 37 | #' @param tr_H numeric giving the trace of H 38 | #' @param n numeric giving the number of rows in H 39 | #' @export 40 | #' @keywords internal 41 | #' 42 | #' @return Scalar 43 | #' @author Martin Jullum 44 | correction_matrix_cpp <- function(tr_H, n) { 45 | .Call(`_shapr_correction_matrix_cpp`, tr_H, n) 46 | } 47 | 48 | #' Temporary function for computing the full AICc with several X matrices etc. 49 | #' 50 | #' @param X matrix with "covariates" 51 | #' @param mcov covariance matrix 52 | #' @param S_scale_dist logical indicating whether the Mahalanobis distance should be scaled with the number of variables 53 | #' @param h numeric specifying the scaling (sigma) 54 | #' @param y vector with the "response variable" 55 | #' 56 | #' @export 57 | #' @keywords internal 58 | #' 59 | #' @return Scalar with the numeric value of the AICc formula 60 | #' @author Martin Jullum 61 | aicc_full_single_cpp <- function(X, mcov, S_scale_dist, h, y) { 62 | .Call(`_shapr_aicc_full_single_cpp`, X, mcov, S_scale_dist, h, y) 63 | } 64 | 65 | #' AICc formula for several sets, alternative definition 66 | #' 67 | #' @param h Numeric. Specifies the scaling (sigma) 68 | #' @param X_list List 69 | #' @param mcov_list List 70 | #' @param S_scale_dist Logical.
Indicates whether Mahalanobis distance should be scaled with the 71 | #' number of variables 72 | #' @param y_list List. 73 | #' @param negative Logical. 74 | #' @keywords internal 75 | #' 76 | #' @return Scalar with the numeric value of the AICc formula 77 | #' 78 | #' @author Martin Jullum 79 | aicc_full_cpp <- function(h, X_list, mcov_list, S_scale_dist, y_list, negative) { 80 | .Call(`_shapr_aicc_full_cpp`, h, X_list, mcov_list, S_scale_dist, y_list, negative) 81 | } 82 | 83 | #' (Generalized) Mahalanobis distance 84 | #' 85 | #' Used to get the Euclidean distance as well by setting \code{mcov} = \code{diag(m)}. 86 | #' 87 | #' @param featureList List of vectors indicating all feature combinations that should be included in the computations. Assumes that the first one is empty. 88 | #' @param mcov Matrix. The Sigma-matrix in the Mahalanobis distance formula: \code{stats::cov(Xtrain_mat)} gives the Mahalanobis distance, 89 | #' \code{diag(m)} gives the Euclidean distance. 90 | #' @param S_scale_dist Logical indicating whether the Mahalanobis distance should be scaled with the number of variables 91 | #' @param Xtrain_mat Matrix 92 | #' @param Xtest_mat Matrix 93 | #' 94 | #' @export 95 | #' @keywords internal 96 | #' 97 | #' @return Array of three dimensions. Contains the squared distances between all training and test observations for all feature combinations passed to the function. 98 | #' @author Martin Jullum 99 | mahalanobis_distance_cpp <- function(featureList, Xtrain_mat, Xtest_mat, mcov, S_scale_dist) { 100 | .Call(`_shapr_mahalanobis_distance_cpp`, featureList, Xtrain_mat, Xtest_mat, mcov, S_scale_dist) 101 | } 102 | 103 | #' @keywords internal 104 | sample_features_cpp <- function(m, n_features) { 105 | .Call(`_shapr_sample_features_cpp`, m, n_features) 106 | } 107 | 108 | #' Get imputed data 109 | #' 110 | #' @param index_xtrain Positive integer. Represents a sequence of row indices from \code{xtrain}, 111 | #' i.e. \code{min(index_xtrain) >= 1} and \code{max(index_xtrain) <= nrow(xtrain)}. 112 | #' 113 | #' @param index_s Positive integer. Represents a sequence of row indices from \code{S}, 114 | #' i.e. \code{min(index_s) >= 1} and \code{max(index_s) <= nrow(S)}. 115 | #' 116 | #' @param xtrain Numeric matrix. 117 | #' 118 | #' @param xtest Numeric matrix. Represents a single test observation. 119 | #' 120 | #' @param S Integer matrix of dimension \code{n_combinations x m}, where \code{n_combinations} equals 121 | #' the total number of sampled/non-sampled feature combinations and \code{m} equals 122 | #' the total number of unique features. Note that \code{m = ncol(xtrain)}. See details 123 | #' for more information. 124 | #' 125 | #' @details \code{S(i, j) = 1} if and only if feature \code{j} is present in feature 126 | #' combination \code{i}, otherwise \code{S(i, j) = 0}. I.e. if \code{m = 3}, there 127 | #' are \code{2^3 = 8} unique ways to combine the features. In this case \code{dim(S) = c(8, 3)}. 128 | #' Let's call the features \code{x1, x2, x3} and take a closer look at the combination 129 | #' represented by \code{s = c(x1, x2)}. If this combination is represented by the second row, 130 | #' the following is true: \code{S[2, 1:3] = c(1, 1, 0)}. 131 | #' 132 | #' The returned object, \code{X}, is a numeric matrix where 133 | #' \code{dim(X) = c(length(index_xtrain), ncol(xtrain))}. If feature \code{j} is present in 134 | #' the k-th observation, that is \code{S[index_s[k], j] == 1}, then \code{X[k, j] = xtest[1, j]}. 135 | #' Otherwise \code{X[k, j] = xtrain[index_xtrain[k], j]}.
136 | #' 137 | #' @export 138 | #' @keywords internal 139 | #' 140 | #' @return Numeric matrix 141 | #' 142 | #' @author Nikolai Sellereite 143 | observation_impute_cpp <- function(index_xtrain, index_s, xtrain, xtest, S) { 144 | .Call(`_shapr_observation_impute_cpp`, index_xtrain, index_s, xtrain, xtest, S) 145 | } 146 | 147 | #' Calculate weight matrix 148 | #' 149 | #' @param subsets List. Each of the elements equals an integer 150 | #' vector representing a valid combination of features/feature groups. 151 | #' @param m Integer. Number of features/feature groups 152 | #' @param n Integer. Number of combinations 153 | #' @param w Numeric vector of length \code{n}, i.e. \code{w[i]} equals 154 | #' the Shapley weight of feature/feature group combination \code{i}, represented by 155 | #' \code{subsets[[i]]}. 156 | #' 157 | #' @export 158 | #' @keywords internal 159 | #' 160 | #' @return Matrix of dimension \code{n x (m + 1)} 161 | #' @author Nikolai Sellereite 162 | weight_matrix_cpp <- function(subsets, m, n, w) { 163 | .Call(`_shapr_weight_matrix_cpp`, subsets, m, n, w) 164 | } 165 | 166 | #' Get feature matrix 167 | #' 168 | #' @param features List 169 | #' @param m Positive integer. Total number of features 170 | #' 171 | #' @export 172 | #' @keywords internal 173 | #' 174 | #' @return Matrix 175 | #' @author Nikolai Sellereite 176 | feature_matrix_cpp <- function(features, m) { 177 | .Call(`_shapr_feature_matrix_cpp`, features, m) 178 | } 179 | 180 | -------------------------------------------------------------------------------- /inst/scripts/devel/compare_explain_batch.R: -------------------------------------------------------------------------------- 1 | 2 | library(xgboost) 3 | #library(shapr) 4 | library(data.table) 5 | 6 | data("Boston", package = "MASS") 7 | 8 | x_var <- c("lstat", "rm", "dis", "indus")#,"nox","age","tax","ptratio") 9 | y_var <- "medv" 10 | 11 | x_train <- as.matrix(Boston[-1:-6, x_var]) 12 | y_train <- Boston[-1:-6, y_var] 13 | x_test <- as.matrix(Boston[1:6, x_var]) 14 | 15 | # Fitting a basic xgboost model to the training data 16 | model <- xgboost( 17 | data = x_train, 18 | label = y_train, 19 | nround = 20, 20 | verbose = FALSE 21 | ) 22 | 23 | # THIS IS GENERATED FROM MASTER BRANCH 24 | # Prepare the data for explanation 25 | library(shapr) 26 | explainer <- shapr(x_train, model,n_combinations = 100) 27 | p = mean(y_train) 28 | gauss = explain(x_test, explainer, "gaussian", prediction_zero = p, n_samples = 10000) 29 | emp = explain(x_test, explainer, "empirical", prediction_zero = p, n_samples = 10000) 30 | copula = explain(x_test, explainer, "copula", prediction_zero = p, n_samples = 10000) 31 | indep = explain(x_test, explainer, "independence", prediction_zero = p, n_samples = 10000) 32 | comb = explain(x_test, explainer, c("gaussian", "gaussian", "empirical", "empirical"), prediction_zero = p, n_samples = 10000) 33 | ctree = explain(x_test, explainer, "ctree", mincriterion = 0.95, prediction_zero = p, n_samples = 10000) 34 | ctree2 = explain(x_test, explainer, "ctree", mincriterion = c(0.95, 0.95, 0.95, 0.95), prediction_zero = p, n_samples = 10000) 35 | #saveRDS(list(gauss = gauss, empirical = emp, copula = copula, indep = indep, comb = comb, ctree = ctree, ctree_comb = ctree2), file = "inst/scripts/devel/master_res2.rds") 36 | # saveRDS(list(ctree = ctree, ctree_comb = ctree2), file = "inst/scripts/devel/master_res_ctree.rds") 37 | 38 | 39 | detach("package:shapr", unload = TRUE) 40 | devtools::load_all() 41 | nobs = 6 42 | x_test <- as.matrix(Boston[1:nobs,
x_var]) 43 | explainer <- shapr(x_train, model,n_combinations = 100) 44 | p = mean(y_train) 45 | gauss = explain(x_test, explainer, "gaussian", prediction_zero = p, n_samples = 10000, n_batches = 1) 46 | emp = explain(x_test, explainer, "empirical", prediction_zero = p, n_samples = 10000, n_batches = 1) 47 | copula = explain(x_test, explainer, "copula", prediction_zero = p, n_samples = 10000, n_batches = 1) 48 | indep = explain(x_test, explainer, "independence", prediction_zero = p, n_samples = 10000, n_batches = 1) 49 | comb = explain(x_test, explainer, c("gaussian", "gaussian", "empirical", "empirical"), prediction_zero = p, n_samples = 10000, n_batches = 1) 50 | ctree = explain(x_test, explainer, "ctree", mincriterion = 0.95, prediction_zero = p, n_samples = 10000, n_batches = 1) 51 | ctree2 = explain(x_test, explainer, "ctree", mincriterion = c(0.95, 0.95, 0.95, 0.95), prediction_zero = p, n_samples = 10000, n_batches = 1) 52 | 53 | res = readRDS("inst/scripts/devel/master_res2.rds") 54 | 55 | # Compare res 56 | all.equal(res$gauss$dt, gauss$dt) # TRUE 57 | all.equal(res$empirical$dt, emp$dt) # TRUE 58 | 59 | res$comb$dt 60 | comb$dt 61 | 62 | # With batches 63 | gauss_b = explain(x_test, explainer, "gaussian", prediction_zero = p, n_samples = 10000, n_batches = 3) 64 | emp_b = explain(x_test, explainer, "empirical", prediction_zero = p, n_samples = 10000, n_batches = 3) 65 | 66 | gauss_b$dt 67 | res$gauss$dt 68 | 69 | emp_b$dt 70 | res$empirical$dt 71 | 72 | #### MJ stuff here: 73 | 74 | explain.independence2 <- function(x, explainer, approach, prediction_zero, 75 | n_samples = 1e3, n_batches = 1, seed = 1, only_return_contrib_dt = FALSE, ...) { 76 | 77 | 78 | if (!is.null(seed)) set.seed(seed) 79 | 80 | # Add arguments to explainer object 81 | explainer$x_test <- as.matrix(preprocess_data(x, explainer$feature_list)$x_dt) 82 | explainer$approach <- approach 83 | explainer$n_samples <- n_samples 84 | 85 | r <- prepare_and_predict(explainer, n_batches, prediction_zero, only_return_contrib_dt, ...) 86 | } 87 | 88 | 89 | prepare_data.independence2 <- function(x, index_features = NULL, ...) { 90 | id <- id_combination <- w <- NULL # due to NSE notes in R CMD check 91 | 92 | if (is.null(index_features)) { 93 | index_features <- x$X[, .I] 94 | } 95 | 96 | S <- x$S[index_features, ] 97 | x_train <- as.matrix(x$x_train) 98 | n_train <- nrow(x_train) 99 | n_samples <- min(x$n_samples, n_train) 100 | 101 | index_s <- rep(seq(nrow(S)), each = n_samples) 102 | w <- 1 / x$n_samples 103 | 104 | n_col <- nrow(x$x_test) 105 | 106 | dt_l <- list() 107 | for (i in seq(n_col)) { 108 | x_test <- x$x_test[i, , drop = FALSE] 109 | 110 | # sampling index_xtrain 111 | index_xtrain <- c(replicate(nrow(S), sample(x = seq(n_train), size = n_samples, replace = F))) 112 | 113 | # Generate data used for prediction 114 | dt_p <- observation_impute_cpp( 115 | index_xtrain = index_xtrain, 116 | index_s = index_s, 117 | xtrain = x_train, 118 | xtest = x_test, 119 | S = S 120 | ) 121 | 122 | # Add keys 123 | dt_l[[i]] <- data.table::as.data.table(dt_p) 124 | data.table::setnames(dt_l[[i]], colnames(x_train)) 125 | dt_l[[i]][, id_combination := index_s] 126 | dt_l[[i]][, w := w] # IS THIS NECESSARY? 
127 | dt_l[[i]][, id := i] 128 | } 129 | 130 | 131 | dt <- data.table::rbindlist(dt_l, use.names = TRUE, fill = TRUE) 132 | return(dt) 133 | } 134 | 135 | 136 | 137 | 138 | # Using independence with n_samples > nrow(x_train) such that no sampling is performed 139 | 140 | indep1 = explain(x_test, explainer, "independence", prediction_zero = p, n_samples = 10000, n_batches = 1) 141 | indep2 = explain(x_test, explainer, "independence2", prediction_zero = p, n_samples = 10000, n_batches = 1) 142 | 143 | all.equal(indep1,indep2) # TRUE 144 | 145 | indep1_batch_2 = explain(x_test, explainer, "independence", prediction_zero = p, n_samples = 10000, n_batches = 2) 146 | 147 | all.equal(indep1,indep1_batch_2) # TRUE 148 | 149 | indep1_batch_5 = explain(x_test, explainer, "independence", prediction_zero = p, n_samples = 10000, n_batches = 5) 150 | 151 | all.equal(indep1,indep1_batch_5) # TRUE 152 | 153 | comb_indep_1_batch_1 = explain(x_test, explainer, c("independence", "independence", "independence", "independence"), prediction_zero = p, n_samples = 10000, n_batches = 1) 154 | 155 | all.equal(indep1,comb_indep_1_batch_1) # TRUE 156 | 157 | comb_indep_1_batch_2 = explain(x_test, explainer, c("independence", "independence", "independence", "independence"), prediction_zero = p, n_samples = 10000, n_batches = 2) 158 | 159 | all.equal(indep1,comb_indep_1_batch_2) # TRUE 160 | 161 | comb_indep_1_2_batch_1 = explain(x_test, explainer, c("independence", "independence", "independence2", "independence2"), prediction_zero = p, n_samples = 10000, n_batches = 1) 162 | 163 | all.equal(indep1,comb_indep_1_2_batch_1) #TRUE 164 | 165 | comb_indep_1_2_batch_2 = explain(x_test, explainer, c("independence", "independence", "independence2", "independence2"), prediction_zero = p, n_samples = 10000, n_batches = 2) 166 | 167 | all.equal(indep1,comb_indep_1_2_batch_2) #TRUE 168 | 169 | comb_indep_1_2_batch_5 = explain(x_test, explainer, c("independence", "independence", "independence2", "independence2"), prediction_zero = p, n_samples = 10000, n_batches = 5) 170 | 171 | all.equal(indep1,comb_indep_1_2_batch_5) #TRUE 172 | 173 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | bibliography: ./inst/REFERENCES.bib 4 | --- 5 | 6 | 7 | 8 | ```{r setup, include = FALSE} 9 | knitr::opts_chunk$set( 10 | collapse = TRUE, 11 | comment = "#>", 12 | fig.path = "man/figures/README-", 13 | out.width = "100%", 14 | tidy = "styler" 15 | ) 16 | 17 | ``` 18 | 19 | # shapr 20 | 21 | 22 | [![CRAN_Status_Badge](https://www.r-pkg.org/badges/version-last-release/shapr)](https://cran.r-project.org/package=shapr) 23 | [![CRAN_Downloads_Badge](https://cranlogs.r-pkg.org/badges/grand-total/shapr)](https://cran.r-project.org/package=shapr) 24 | [![R build status](https://github.com/NorskRegnesentral/shapr/workflows/R-CMD-check/badge.svg)](https://github.com/NorskRegnesentral/shapr/actions?query=workflow%3AR-CMD-check) 25 | [![Lifecycle: experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://lifecycle.r-lib.org/articles/stages.html) 26 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 27 | [![DOI](https://joss.theoj.org/papers/10.21105/joss.02027/status.svg)](https://doi.org/10.21105/joss.02027) 28 | 29 | 30 | # NOTE: This package is undergoing severe restructuring. 
A new version (with breaking changes) will be available on GitHub soon. 31 | 32 | The most common machine learning task is to train a model which is able to predict an unknown outcome (response variable) based on a set of known input variables/features. 33 | When using such models for real-life applications, it is often crucial to understand why a certain set of features leads to exactly that prediction. 34 | However, explaining predictions from complex, or seemingly simple, machine learning models is a practical and ethical question, as well as a legal issue. Can I trust the model? Is it biased? Can I explain it to others? We want to explain individual predictions from a complex machine learning model by learning simple, interpretable explanations. 35 | 36 | Shapley values are the only prediction explanation framework with a solid theoretical foundation (@lundberg2017unified). Unless the true distribution of the features is known, and there are fewer than, say, 10-15 features, these Shapley values need to be estimated/approximated. 37 | Popular methods like Shapley Sampling Values (@vstrumbelj2014explaining), SHAP/Kernel SHAP (@lundberg2017unified), and to some extent TreeSHAP (@lundberg2018consistent), assume that the features are independent when approximating the Shapley values for prediction explanation. This may lead to very inaccurate Shapley values, and consequently wrong interpretations of the predictions. @aas2019explaining extends and improves the Kernel SHAP method of @lundberg2017unified to account for the dependence between the features, resulting in significantly more accurate approximations to the Shapley values. 38 | [See the paper for details](https://arxiv.org/abs/1903.10464). 39 | 40 | This package implements the methodology of @aas2019explaining. 41 | 42 | The following methodology/features are currently implemented: 43 | 44 | - Native support of explanation of predictions from models fitted with the following functions 45 | `stats::glm`, `stats::lm`, `ranger::ranger`, `xgboost::xgboost`/`xgboost::xgb.train` and `mgcv::gam`. 46 | - Accounting for feature dependence assuming the features are Gaussian (@aas2019explaining). 47 | - Accounting for feature dependence with a Gaussian copula (Gaussian dependence structure, any marginal) (@aas2019explaining). 48 | - Accounting for feature dependence using the Mahalanobis distance-based empirical (conditional) distribution approach of @aas2019explaining. 49 | - Accounting for feature dependence using conditional inference trees (@redelmeier2020explaining). 50 | - Combining any of the four methods. 51 | - Optional use of the AICc criterion of @hurvich1998smoothing when optimizing the bandwidth parameter in the empirical (conditional) approach of @aas2019explaining. 52 | - Functionality for visualizing the explanations. 53 | - Support for models not supported natively. 54 | 55 | 62 | 63 | Future releases will include: 64 | 65 | - Support for parallelization over explanations, Monte Carlo sampling and feature subsets for non-parallelizable prediction functions. 66 | - Computational improvement of the AICc optimization approach. 67 | - Adaptive selection of method to account for the feature dependence. 68 | 69 | Note that both the features and the prediction must be numeric. The approach is constructed for continuous features. Discrete features may also work just fine with the empirical (conditional) distribution approach. 70 | Unlike SHAP and TreeSHAP, we decompose probability predictions directly to ease the interpretability, i.e.
not via log odds transformations. 71 | The application programming interface (API) of `shapr` is inspired by @lime_api. 72 | 73 | ## Installation 74 | 75 | To install the current stable release from CRAN, use 76 | 77 | ```{r, eval = FALSE} 78 | install.packages("shapr") 79 | ``` 80 | 81 | To install the current development version, use 82 | 83 | ```{r, eval = FALSE} 84 | remotes::install_github("NorskRegnesentral/shapr") 85 | ``` 86 | 87 | If you would like to install all packages of the models we currently support, use 88 | 89 | ```{r, eval = FALSE} 90 | remotes::install_github("NorskRegnesentral/shapr", dependencies = TRUE) 91 | ``` 92 | 93 | 94 | If you would also like to build and view the vignette locally, use 95 | ```{r, eval = FALSE} 96 | remotes::install_github("NorskRegnesentral/shapr", dependencies = TRUE, build_vignettes = TRUE) 97 | vignette("understanding_shapr", "shapr") 98 | ``` 99 | 100 | You can always check out the latest version of the vignette [here](https://norskregnesentral.github.io/shapr/articles/understanding_shapr.html). 101 | 102 | ## Example 103 | `shapr` supports computation of Shapley values with any predictive model which takes a set of numeric features and produces a numeric outcome. 104 | 105 | The following example shows how a simple `xgboost` model is trained using the *Boston Housing Data*, and how `shapr` explains the individual predictions. 106 | 107 | 108 | ```{r basic_example, warning = FALSE} 109 | library(xgboost) 110 | library(shapr) 111 | 112 | data("Boston", package = "MASS") 113 | 114 | x_var <- c("lstat", "rm", "dis", "indus") 115 | y_var <- "medv" 116 | 117 | ind_x_test <- 1:6 118 | x_train <- as.matrix(Boston[-ind_x_test, x_var]) 119 | y_train <- Boston[-ind_x_test, y_var] 120 | x_test <- as.matrix(Boston[ind_x_test, x_var]) 121 | 122 | # Looking at the dependence between the features 123 | cor(x_train) 124 | 125 | # Fitting a basic xgboost model to the training data 126 | model <- xgboost( 127 | data = x_train, 128 | label = y_train, 129 | nround = 20, 130 | verbose = FALSE 131 | ) 132 | 133 | # Prepare the data for explanation 134 | explainer <- shapr(x_train, model) 135 | 136 | # Specifying the phi_0, i.e. the expected prediction without any features 137 | p <- mean(y_train) 138 | 139 | # Computing the actual Shapley values with kernelSHAP accounting for feature dependence using 140 | # the empirical (conditional) distribution approach with bandwidth parameter sigma = 0.1 (default) 141 | explanation <- explain( 142 | x_test, 143 | approach = "empirical", 144 | explainer = explainer, 145 | prediction_zero = p 146 | ) 147 | 148 | # Printing the Shapley values for the test data. 149 | # For more information about the interpretation of the values in the table, see ?shapr::explain. 150 | print(explanation$dt) 151 | 152 | # Finally we plot the resulting explanations 153 | plot(explanation) 154 | ``` 155 | 156 | ## Contribution 157 | 158 | All feedback and suggestions are very welcome. Details on how to contribute can be found 159 | [here](https://norskregnesentral.github.io/shapr/CONTRIBUTING.html). If you have any questions or comments, feel 160 | free to open an issue [here](https://github.com/NorskRegnesentral/shapr/issues). 161 | 162 | Please note that the 'shapr' project is released with a 163 | [Contributor Code of Conduct](https://norskregnesentral.github.io/shapr/CODE_OF_CONDUCT.html). 164 | By contributing to this project, you agree to abide by its terms. 
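## Custom models

As noted in the feature list above, models without native support can also be explained. The following is a minimal, hypothetical sketch of the pattern (the wrapper class `my_lm` and all object names are illustrative assumptions, and the exact specification format should be checked against the vignette, which is the authoritative description of the interface): you supply a `predict_model` method for your model's class, and preferably a `get_model_specs` method so `shapr` can validate the feature setup.

```{r, eval = FALSE}
# Hypothetical custom model class wrapping a fitted lm(); names are illustrative
model <- structure(
  list(
    fit = lm(medv ~ lstat + rm, data = head(MASS::Boston, -6)),
    labels = c("lstat", "rm")
  ),
  class = "my_lm"
)

# Mandatory: how to produce numeric predictions from the custom model
predict_model.my_lm <- function(x, newdata) {
  as.vector(predict(x$fit, newdata = as.data.frame(newdata)))
}

# Recommended: tell shapr which features the model uses, with classes and factor levels
get_model_specs.my_lm <- function(x) {
  m <- length(x$labels)
  list(
    labels = x$labels,
    classes = setNames(rep("numeric", m), x$labels),
    factor_levels = setNames(vector("list", m), x$labels)
  )
}

# With these two methods defined, shapr() and explain() are used exactly as above
```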
165 | 166 | ## References 167 | 168 | 169 | -------------------------------------------------------------------------------- /src/RcppExports.cpp: -------------------------------------------------------------------------------- 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #include 5 | #include 6 | 7 | using namespace Rcpp; 8 | 9 | #ifdef RCPP_USE_GLOBAL_ROSTREAM 10 | Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); 11 | Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); 12 | #endif 13 | 14 | // hat_matrix_cpp 15 | arma::mat hat_matrix_cpp(arma::mat X, arma::mat mcov, bool S_scale_dist, double h); 16 | RcppExport SEXP _shapr_hat_matrix_cpp(SEXP XSEXP, SEXP mcovSEXP, SEXP S_scale_distSEXP, SEXP hSEXP) { 17 | BEGIN_RCPP 18 | Rcpp::RObject rcpp_result_gen; 19 | Rcpp::RNGScope rcpp_rngScope_gen; 20 | Rcpp::traits::input_parameter< arma::mat >::type X(XSEXP); 21 | Rcpp::traits::input_parameter< arma::mat >::type mcov(mcovSEXP); 22 | Rcpp::traits::input_parameter< bool >::type S_scale_dist(S_scale_distSEXP); 23 | Rcpp::traits::input_parameter< double >::type h(hSEXP); 24 | rcpp_result_gen = Rcpp::wrap(hat_matrix_cpp(X, mcov, S_scale_dist, h)); 25 | return rcpp_result_gen; 26 | END_RCPP 27 | } 28 | // rss_cpp 29 | double rss_cpp(arma::mat H, arma::vec y); 30 | RcppExport SEXP _shapr_rss_cpp(SEXP HSEXP, SEXP ySEXP) { 31 | BEGIN_RCPP 32 | Rcpp::RObject rcpp_result_gen; 33 | Rcpp::RNGScope rcpp_rngScope_gen; 34 | Rcpp::traits::input_parameter< arma::mat >::type H(HSEXP); 35 | Rcpp::traits::input_parameter< arma::vec >::type y(ySEXP); 36 | rcpp_result_gen = Rcpp::wrap(rss_cpp(H, y)); 37 | return rcpp_result_gen; 38 | END_RCPP 39 | } 40 | // correction_matrix_cpp 41 | double correction_matrix_cpp(double tr_H, int n); 42 | RcppExport SEXP _shapr_correction_matrix_cpp(SEXP tr_HSEXP, SEXP nSEXP) { 43 | BEGIN_RCPP 44 | Rcpp::RObject rcpp_result_gen; 45 | Rcpp::RNGScope rcpp_rngScope_gen; 46 | Rcpp::traits::input_parameter< double >::type tr_H(tr_HSEXP); 47 | Rcpp::traits::input_parameter< int >::type n(nSEXP); 48 | rcpp_result_gen = Rcpp::wrap(correction_matrix_cpp(tr_H, n)); 49 | return rcpp_result_gen; 50 | END_RCPP 51 | } 52 | // aicc_full_single_cpp 53 | arma::vec aicc_full_single_cpp(arma::mat X, arma::mat mcov, bool S_scale_dist, double h, arma::vec y); 54 | RcppExport SEXP _shapr_aicc_full_single_cpp(SEXP XSEXP, SEXP mcovSEXP, SEXP S_scale_distSEXP, SEXP hSEXP, SEXP ySEXP) { 55 | BEGIN_RCPP 56 | Rcpp::RObject rcpp_result_gen; 57 | Rcpp::RNGScope rcpp_rngScope_gen; 58 | Rcpp::traits::input_parameter< arma::mat >::type X(XSEXP); 59 | Rcpp::traits::input_parameter< arma::mat >::type mcov(mcovSEXP); 60 | Rcpp::traits::input_parameter< bool >::type S_scale_dist(S_scale_distSEXP); 61 | Rcpp::traits::input_parameter< double >::type h(hSEXP); 62 | Rcpp::traits::input_parameter< arma::vec >::type y(ySEXP); 63 | rcpp_result_gen = Rcpp::wrap(aicc_full_single_cpp(X, mcov, S_scale_dist, h, y)); 64 | return rcpp_result_gen; 65 | END_RCPP 66 | } 67 | // aicc_full_cpp 68 | double aicc_full_cpp(double h, Rcpp::List X_list, Rcpp::List mcov_list, bool S_scale_dist, Rcpp::List y_list, bool negative); 69 | RcppExport SEXP _shapr_aicc_full_cpp(SEXP hSEXP, SEXP X_listSEXP, SEXP mcov_listSEXP, SEXP S_scale_distSEXP, SEXP y_listSEXP, SEXP negativeSEXP) { 70 | BEGIN_RCPP 71 | Rcpp::RObject rcpp_result_gen; 72 | Rcpp::RNGScope rcpp_rngScope_gen; 73 | Rcpp::traits::input_parameter< double >::type h(hSEXP); 74 | 
Rcpp::traits::input_parameter< Rcpp::List >::type X_list(X_listSEXP); 75 | Rcpp::traits::input_parameter< Rcpp::List >::type mcov_list(mcov_listSEXP); 76 | Rcpp::traits::input_parameter< bool >::type S_scale_dist(S_scale_distSEXP); 77 | Rcpp::traits::input_parameter< Rcpp::List >::type y_list(y_listSEXP); 78 | Rcpp::traits::input_parameter< bool >::type negative(negativeSEXP); 79 | rcpp_result_gen = Rcpp::wrap(aicc_full_cpp(h, X_list, mcov_list, S_scale_dist, y_list, negative)); 80 | return rcpp_result_gen; 81 | END_RCPP 82 | } 83 | // mahalanobis_distance_cpp 84 | arma::cube mahalanobis_distance_cpp(Rcpp::List featureList, arma::mat Xtrain_mat, arma::mat Xtest_mat, arma::mat mcov, bool S_scale_dist); 85 | RcppExport SEXP _shapr_mahalanobis_distance_cpp(SEXP featureListSEXP, SEXP Xtrain_matSEXP, SEXP Xtest_matSEXP, SEXP mcovSEXP, SEXP S_scale_distSEXP) { 86 | BEGIN_RCPP 87 | Rcpp::RObject rcpp_result_gen; 88 | Rcpp::RNGScope rcpp_rngScope_gen; 89 | Rcpp::traits::input_parameter< Rcpp::List >::type featureList(featureListSEXP); 90 | Rcpp::traits::input_parameter< arma::mat >::type Xtrain_mat(Xtrain_matSEXP); 91 | Rcpp::traits::input_parameter< arma::mat >::type Xtest_mat(Xtest_matSEXP); 92 | Rcpp::traits::input_parameter< arma::mat >::type mcov(mcovSEXP); 93 | Rcpp::traits::input_parameter< bool >::type S_scale_dist(S_scale_distSEXP); 94 | rcpp_result_gen = Rcpp::wrap(mahalanobis_distance_cpp(featureList, Xtrain_mat, Xtest_mat, mcov, S_scale_dist)); 95 | return rcpp_result_gen; 96 | END_RCPP 97 | } 98 | // sample_features_cpp 99 | List sample_features_cpp(int m, IntegerVector n_features); 100 | RcppExport SEXP _shapr_sample_features_cpp(SEXP mSEXP, SEXP n_featuresSEXP) { 101 | BEGIN_RCPP 102 | Rcpp::RObject rcpp_result_gen; 103 | Rcpp::RNGScope rcpp_rngScope_gen; 104 | Rcpp::traits::input_parameter< int >::type m(mSEXP); 105 | Rcpp::traits::input_parameter< IntegerVector >::type n_features(n_featuresSEXP); 106 | rcpp_result_gen = Rcpp::wrap(sample_features_cpp(m, n_features)); 107 | return rcpp_result_gen; 108 | END_RCPP 109 | } 110 | // observation_impute_cpp 111 | NumericMatrix observation_impute_cpp(IntegerVector index_xtrain, IntegerVector index_s, NumericMatrix xtrain, NumericMatrix xtest, IntegerMatrix S); 112 | RcppExport SEXP _shapr_observation_impute_cpp(SEXP index_xtrainSEXP, SEXP index_sSEXP, SEXP xtrainSEXP, SEXP xtestSEXP, SEXP SSEXP) { 113 | BEGIN_RCPP 114 | Rcpp::RObject rcpp_result_gen; 115 | Rcpp::RNGScope rcpp_rngScope_gen; 116 | Rcpp::traits::input_parameter< IntegerVector >::type index_xtrain(index_xtrainSEXP); 117 | Rcpp::traits::input_parameter< IntegerVector >::type index_s(index_sSEXP); 118 | Rcpp::traits::input_parameter< NumericMatrix >::type xtrain(xtrainSEXP); 119 | Rcpp::traits::input_parameter< NumericMatrix >::type xtest(xtestSEXP); 120 | Rcpp::traits::input_parameter< IntegerMatrix >::type S(SSEXP); 121 | rcpp_result_gen = Rcpp::wrap(observation_impute_cpp(index_xtrain, index_s, xtrain, xtest, S)); 122 | return rcpp_result_gen; 123 | END_RCPP 124 | } 125 | // weight_matrix_cpp 126 | arma::mat weight_matrix_cpp(List subsets, int m, int n, NumericVector w); 127 | RcppExport SEXP _shapr_weight_matrix_cpp(SEXP subsetsSEXP, SEXP mSEXP, SEXP nSEXP, SEXP wSEXP) { 128 | BEGIN_RCPP 129 | Rcpp::RObject rcpp_result_gen; 130 | Rcpp::RNGScope rcpp_rngScope_gen; 131 | Rcpp::traits::input_parameter< List >::type subsets(subsetsSEXP); 132 | Rcpp::traits::input_parameter< int >::type m(mSEXP); 133 | Rcpp::traits::input_parameter< int >::type n(nSEXP); 134 | 
Rcpp::traits::input_parameter< NumericVector >::type w(wSEXP); 135 | rcpp_result_gen = Rcpp::wrap(weight_matrix_cpp(subsets, m, n, w)); 136 | return rcpp_result_gen; 137 | END_RCPP 138 | } 139 | // feature_matrix_cpp 140 | NumericMatrix feature_matrix_cpp(List features, int m); 141 | RcppExport SEXP _shapr_feature_matrix_cpp(SEXP featuresSEXP, SEXP mSEXP) { 142 | BEGIN_RCPP 143 | Rcpp::RObject rcpp_result_gen; 144 | Rcpp::RNGScope rcpp_rngScope_gen; 145 | Rcpp::traits::input_parameter< List >::type features(featuresSEXP); 146 | Rcpp::traits::input_parameter< int >::type m(mSEXP); 147 | rcpp_result_gen = Rcpp::wrap(feature_matrix_cpp(features, m)); 148 | return rcpp_result_gen; 149 | END_RCPP 150 | } 151 | 152 | static const R_CallMethodDef CallEntries[] = { 153 | {"_shapr_hat_matrix_cpp", (DL_FUNC) &_shapr_hat_matrix_cpp, 4}, 154 | {"_shapr_rss_cpp", (DL_FUNC) &_shapr_rss_cpp, 2}, 155 | {"_shapr_correction_matrix_cpp", (DL_FUNC) &_shapr_correction_matrix_cpp, 2}, 156 | {"_shapr_aicc_full_single_cpp", (DL_FUNC) &_shapr_aicc_full_single_cpp, 5}, 157 | {"_shapr_aicc_full_cpp", (DL_FUNC) &_shapr_aicc_full_cpp, 6}, 158 | {"_shapr_mahalanobis_distance_cpp", (DL_FUNC) &_shapr_mahalanobis_distance_cpp, 5}, 159 | {"_shapr_sample_features_cpp", (DL_FUNC) &_shapr_sample_features_cpp, 2}, 160 | {"_shapr_observation_impute_cpp", (DL_FUNC) &_shapr_observation_impute_cpp, 5}, 161 | {"_shapr_weight_matrix_cpp", (DL_FUNC) &_shapr_weight_matrix_cpp, 4}, 162 | {"_shapr_feature_matrix_cpp", (DL_FUNC) &_shapr_feature_matrix_cpp, 2}, 163 | {NULL, NULL, 0} 164 | }; 165 | 166 | RcppExport void R_init_shapr(DllInfo *dll) { 167 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 168 | R_useDynamicSymbols(dll, FALSE); 169 | } 170 | -------------------------------------------------------------------------------- /tests/testthat/test-a-shapley.R: -------------------------------------------------------------------------------- 1 | context("test-shapley.R") 2 | 3 | RNGversion(vstr = "3.5.0") 4 | 5 | test_that("Basic test functions in shapley.R", { 6 | 7 | # Load data ----------- 8 | if (requireNamespace("MASS", quietly = TRUE)) { 9 | data("Boston", package = "MASS") 10 | x_var <- c("lstat", "rm", "dis", "indus") 11 | x_train <- tail(Boston[, x_var], 50) 12 | 13 | # Load premade lm model. 
Path needs to be relative to testthat directory in the package 14 | model <- readRDS("model_objects/lm_model_object.rds") 15 | 16 | # Prepare the data for explanation 17 | explainer <- shapr(x_train, model) 18 | 19 | expect_known_value(explainer, 20 | file = "test_objects/shapley_explainer_obj.rds", 21 | update = F 22 | ) 23 | } 24 | }) 25 | 26 | 27 | test_that("Testing data input to shapr in shapley.R", { 28 | if (requireNamespace("MASS", quietly = TRUE)) { 29 | data("Boston", package = "MASS") 30 | 31 | y_var <- "medv" 32 | x_train <- tail(Boston, -6) 33 | y_train <- tail(Boston[, y_var], -6) 34 | y_train_binary <- as.factor(tail((Boston[, y_var] > 20) * 1, -6)) 35 | 36 | # convert to factors for testing purposes 37 | x_train$rad <- factor(round(x_train$rad)) 38 | x_train$chas <- factor(round(x_train$chas)) 39 | 40 | train_df <- cbind(x_train, y_train, y_train_binary) 41 | 42 | 43 | x_var_numeric <- c("lstat", "rm", "dis", "indus") 44 | x_var_factor <- c("lstat", "rm", "dis", "indus", "rad", "chas") 45 | 46 | train_df_used_numeric <- x_train[, x_var_numeric] 47 | train_df_used_factor <- x_train[, x_var_factor] 48 | 49 | formula_numeric <- as.formula(paste0("y_train ~ ", paste0(x_var_numeric, collapse = "+"))) 50 | formula_factor <- as.formula(paste0("y_train ~ ", paste0(x_var_factor, collapse = "+"))) 51 | 52 | formula_binary_numeric <- as.formula(paste0("y_train_binary ~ ", paste0(x_var_numeric, collapse = "+"))) 53 | formula_binary_factor <- as.formula(paste0("y_train_binary ~ ", paste0(x_var_factor, collapse = "+"))) 54 | 55 | dummylist <- make_dummies(traindata = x_train[, x_var_factor], testdata = x_train[, x_var_factor]) 56 | 57 | # List of models to run silently 58 | l_numeric <- list( 59 | stats::lm(formula_numeric, data = train_df), 60 | stats::glm(formula_numeric, data = train_df) 61 | ) 62 | 63 | if (requireNamespace("mgcv", quietly = TRUE)) { 64 | l_numeric[[length(l_numeric) + 1]] <- mgcv::gam(formula_numeric, data = train_df) 65 | } 66 | 67 | l_factor <- list( 68 | stats::lm(formula_factor, data = train_df), 69 | stats::glm(formula_factor, data = train_df) 70 | ) 71 | 72 | if (requireNamespace("mgcv", quietly = TRUE)) { 73 | l_factor[[length(l_factor) + 1]] <- mgcv::gam(formula_factor, data = train_df) 74 | } 75 | 76 | if (requireNamespace("xgboost", quietly = TRUE)) { 77 | l_factor[[length(l_factor) + 1]] <- xgboost::xgboost( 78 | data = dummylist$train_dummies, 79 | label = y_train, 80 | nrounds = 3, verbose = FALSE 81 | ) 82 | l_factor[[length(l_factor)]]$feature_list <- dummylist$feature_list 83 | } 84 | 85 | 86 | for (i in seq_along(l_numeric)) { 87 | expect_silent(shapr(train_df_used_numeric, l_numeric[[i]])) # No modification 88 | expect_message(shapr(train_df, l_numeric[[i]])) # Features dropped 89 | } 90 | 91 | for (i in seq_along(l_factor)) { 92 | expect_silent(shapr(train_df_used_factor, l_factor[[i]])) # No modification 93 | expect_message(shapr(train_df, l_factor[[i]])) # Features dropped 94 | } 95 | 96 | 97 | # Testing errors on incompatible model and data 98 | # Missing features 99 | model <- stats::lm(formula_factor, data = train_df) 100 | data_error <- train_df[, -3] 101 | expect_error(shapr(data_error, model)) 102 | 103 | # Duplicated column names 104 | data_error <- train_df_used_factor 105 | data_error <- cbind(data_error, lstat = 1) 106 | expect_error(shapr(data_error, model)) 107 | 108 | # Empty column names in data 109 | data_error <- train_df 110 | colnames(data_error) <- NULL 111 | expect_error(shapr(data_error, model)) 112 | 113 | # Empty column 
names in model (ok if found in data -- and we trust it) 114 | if (requireNamespace("xgboost", quietly = TRUE)) { 115 | data_with_colnames <- data_without_colnames <- as.matrix(train_df_used_numeric) 116 | colnames(data_without_colnames) <- NULL 117 | 118 | model_xgb <- xgboost::xgboost( 119 | data = data_without_colnames, label = y_train, 120 | nrounds = 3, verbose = FALSE 121 | ) 122 | expect_message(shapr(data_with_colnames, model_xgb)) 123 | } 124 | 125 | # Data feature with incorrect class 126 | data_error <- train_df_used_factor 127 | data_error$lstat <- as.logical(data_error$lstat > 15) 128 | expect_error(shapr(data_error, model)) 129 | 130 | # non-matching factor levels 131 | data_error <- head(train_df_used_factor) 132 | data_error$rad <- droplevels(data_error$rad) 133 | expect_error(shapr(data_error, model)) 134 | } 135 | }) 136 | 137 | test_that("Basic test functions for grouping in shapley.R", { 138 | 139 | # Load data ----------- 140 | if (requireNamespace("MASS", quietly = TRUE)) { 141 | data("Boston", package = "MASS") 142 | x_var <- c("lstat", "rm", "dis", "indus") 143 | x_train <- tail(Boston[, x_var], 50) 144 | 145 | # Load premade lm model. Path needs to be relative to testthat directory in the package 146 | model <- readRDS("model_objects/lm_model_object.rds") 147 | 148 | group1_num <- list( 149 | c(1, 3), 150 | c(2, 4) 151 | ) 152 | 153 | group1 <- lapply(group1_num, function(x) { 154 | x_var[x] 155 | }) 156 | 157 | 158 | group2_num <- list( 159 | c(1), 160 | c(2), 161 | c(3), 162 | c(4) 163 | ) 164 | 165 | group2 <- lapply(group2_num, function(x) { 166 | x_var[x] 167 | }) 168 | 169 | # Prepare the data for explanation 170 | explainer1 <- shapr(x_train, model, group = group1) 171 | explainer2 <- shapr(x_train, model, group = group2) 172 | 173 | set.seed(123) 174 | explainer1_2 <- shapr(x_train, model, group = group1, n_combinations = 5) 175 | set.seed(1234) 176 | explainer2_2 <- shapr(x_train, model, group = group2, n_combinations = 5) 177 | 178 | expect_known_value(explainer1, 179 | file = "test_objects/shapley_explainer_group1_obj.rds", 180 | update = F 181 | ) 182 | expect_known_value(explainer2, 183 | file = "test_objects/shapley_explainer_group2_obj.rds", 184 | update = F 185 | ) 186 | expect_known_value(explainer1_2, 187 | file = "test_objects/shapley_explainer_group1_2_obj.rds", 188 | update = F 189 | ) 190 | expect_known_value(explainer2_2, 191 | file = "test_objects/shapley_explainer_group2_2_obj.rds", 192 | update = F 193 | ) 194 | 195 | } 196 | }) 197 | 198 | 199 | test_that("Testing data input to shapr for grouping in shapley.R", { 200 | if (requireNamespace("MASS", quietly = TRUE)) { 201 | data("Boston", package = "MASS") 202 | 203 | x_var <- c("lstat", "rm", "dis", "indus") 204 | not_x_var <- "crim" 205 | 206 | x_train <- as.matrix(tail(Boston[, x_var], -6)) 207 | xy_train <- tail(Boston, -6) 208 | group_num <- list( 209 | c(1, 3), 210 | c(2, 4) 211 | ) 212 | 213 | group <- lapply(group_num, function(x) { 214 | x_var[x] 215 | }) 216 | names(group) <- c("A", "B") 217 | 218 | group_no_names <- lapply(group_num, function(x) { 219 | x_var[x] 220 | }) 221 | 222 | group_error_1 <- list( 223 | c(x_var[1:2], not_x_var), 224 | x_var[3:4] 225 | ) 226 | 227 | group_error_2 <- list( 228 | x_var[1], 229 | x_var[3:4] 230 | ) 231 | 232 | group_error_3 <- list( 233 | x_var[c(1, 2)], 234 | x_var[c(1, 3, 4)] 235 | ) 236 | 237 | group_error_4 <- list( 238 | x_var[c(1, 2)], 239 | x_var[c(1, 3, 4)] 240 | ) 241 | 242 | 243 | # Fitting models 244 | formula <- 
as.formula(paste0("medv ~ ", paste0(x_var, collapse = "+"))) 245 | model <- stats::lm(formula = formula, data = xy_train) 246 | 247 | 248 | # Expect silent 249 | expect_silent(shapr(x = x_train, model = model, group = group)) 250 | 251 | # Expect message for missing names 252 | expect_message(shapr(x = x_train, model = model, group = group_no_names)) 253 | 254 | 255 | # Expect error when group is not a list 256 | expect_error(shapr(x_train, model, group = x_var)) 257 | 258 | 259 | # Expect error when group does not include the names of the features 260 | expect_error(shapr(x = x_train, model = model, group = group_num)) 261 | 262 | # Expect error when x_train/model does not use a feature mentioned in the group 263 | expect_error(shapr(x_train, model, group = group_error_1)) 264 | 265 | # Expect error when group does not contain a feature used by the model 266 | expect_error(shapr(x_train, model, group = group_error_2)) 267 | 268 | # Expect error when group contains duplicated features 269 | expect_error(shapr(x_train, model, group = group_error_3)) 270 | } 271 | }) 272 | --------------------------------------------------------------------------------