├── .github
│   ├── .gitignore
│   ├── workflows
│   │   ├── remove-old-artifacts.yml
│   │   ├── lint.yaml
│   │   ├── pkgdown.yaml
│   │   └── R-CMD-check.yaml
│   └── CONTRIBUTING.md
├── .covrignore
├── inst
│   ├── joss_paper
│   │   ├── .gitignore
│   │   ├── paper.pdf
│   │   ├── paper.md
│   │   ├── paper.Rmd
│   │   └── paper.bib
│   ├── scripts
│   │   ├── devel
│   │   │   ├── master_res.rds
│   │   │   ├── master_res2.rds
│   │   │   ├── devel_non_exact_grouping.R
│   │   │   ├── same_seed_as_master.R
│   │   │   ├── compare_indep_implementations.R
│   │   │   └── compare_explain_batch.R
│   │   ├── create_lm_model_object.R
│   │   ├── create_xgboost_model_object.R
│   │   ├── shap_python_script.py
│   │   ├── readme_example.R
│   │   ├── example_custom_model.R
│   │   ├── example_ctree_method.R
│   │   └── compare_shap_python.R
│   ├── model_objects
│   │   ├── lm_model_object.rds
│   │   ├── xgboost_model_object.rds
│   │   ├── xgboost_model_object_raw
│   │   ├── xgboost_model_object_cat.rds
│   │   └── xgboost_model_object_raw_cat
│   └── REFERENCES.bib
├── src
│   ├── .gitignore
│   ├── Makevars
│   ├── Makevars.win
│   ├── features.cpp
│   ├── impute_data.cpp
│   ├── distance.cpp
│   ├── weighted_matrix.cpp
│   └── RcppExports.cpp
├── vignettes
│   └── .gitignore
├── LICENSE
├── tests
│   ├── testthat.R
│   └── testthat
│       ├── model_objects
│       │   └── lm_model_object.rds
│       ├── test_objects
│       │   ├── shapley_explainer_obj.rds
│       │   ├── explanation_explain_obj_list.rds
│       │   ├── shapley_explainer_group1_2_obj.rds
│       │   ├── shapley_explainer_group1_obj.rds
│       │   ├── shapley_explainer_group2_2_obj.rds
│       │   ├── shapley_explainer_group2_obj.rds
│       │   ├── explanation_explain_group_obj_list.rds
│       │   ├── explanation_explain_obj_list_fixed.rds
│       │   └── explanation_explain_obj_list_no_ctree.rds
│       ├── test-transformation.R
│       ├── test-plot.R
│       ├── test-predictions.R
│       ├── test-src_weighted_matrix.R
│       ├── test-src_impute_data.R
│       ├── manual_test_scripts
│       │   └── test_custom_models.R
│       ├── test-observations.R
│       └── test-a-shapley.R
├── man
│   ├── figures
│   │   ├── README-basic_example-1.png
│   │   └── NR-logo_utvidet_r32g60b136_small.png
│   ├── get_supported_models.Rd
│   ├── rss_cpp.Rd
│   ├── feature_matrix_cpp.Rd
│   ├── compute_shapley.Rd
│   ├── correction_matrix_cpp.Rd
│   ├── gaussian_transform.Rd
│   ├── inv_gaussian_transform.Rd
│   ├── get_list_approaches.Rd
│   ├── feature_group.Rd
│   ├── hat_matrix_cpp.Rd
│   ├── sample_gaussian.Rd
│   ├── aicc_full_cpp.Rd
│   ├── check_groups.Rd
│   ├── gaussian_transform_separate.Rd
│   ├── aicc_full_single_cpp.Rd
│   ├── weight_matrix_cpp.Rd
│   ├── weight_matrix.Rd
│   ├── create_S_batch.Rd
│   ├── apply_dummies.Rd
│   ├── feature_group_not_exact.Rd
│   ├── sample_copula.Rd
│   ├── shapley_weights.Rd
│   ├── mahalanobis_distance_cpp.Rd
│   ├── process_groups.Rd
│   ├── update_data.Rd
│   ├── sample_combinations.Rd
│   ├── get_data_specs.Rd
│   ├── model_checker.Rd
│   ├── prepare_and_predict.Rd
│   ├── preprocess_data.Rd
│   ├── check_features.Rd
│   ├── make_dummies.Rd
│   ├── observation_impute.Rd
│   ├── get_model_specs.Rd
│   ├── prediction.Rd
│   ├── observation_impute_cpp.Rd
│   ├── prepare_data.Rd
│   ├── plot.shapr.Rd
│   ├── feature_combinations.Rd
│   ├── sample_ctree.Rd
│   ├── create_ctree.Rd
│   ├── predict_model.Rd
│   └── shapr.Rd
├── R
│   ├── utils.R
│   ├── zzz.R
│   ├── shapr-package.R
│   ├── transformation.R
│   ├── predictions.R
│   ├── plot.R
│   └── RcppExports.R
├── .Rbuildignore
├── .gitignore
├── shapr.Rproj
├── _pkgdown.yml
├── .lintr
├── LICENSE.md
├── CODE_OF_CONDUCT.md
├── DESCRIPTION
├── cran-comments.md
├── NAMESPACE
├── NEWS.md
└── README.Rmd

/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 |
--------------------------------------------------------------------------------
/.covrignore:
--------------------------------------------------------------------------------
1 | R/global.R
2 | R/zzz.R
3 |
--------------------------------------------------------------------------------
/inst/joss_paper/.gitignore:
--------------------------------------------------------------------------------
1 | *.log
2 |
--------------------------------------------------------------------------------
/src/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | *.so
3 | *.dll
4 |
--------------------------------------------------------------------------------
/vignettes/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | *.R
3 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2019
2 | COPYRIGHT HOLDER: Norsk Regnesentral
3 |
--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(shapr)
3 |
4 | test_check("shapr")
5 |
--------------------------------------------------------------------------------
/inst/joss_paper/paper.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtr13/shapr/master/inst/joss_paper/paper.pdf
--------------------------------------------------------------------------------
/inst/scripts/devel/master_res.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtr13/shapr/master/inst/scripts/devel/master_res.rds
--------------------------------------------------------------------------------
/inst/scripts/devel/master_res2.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtr13/shapr/master/inst/scripts/devel/master_res2.rds
--------------------------------------------------------------------------------
/inst/model_objects/lm_model_object.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtr13/shapr/master/inst/model_objects/lm_model_object.rds
--------------------------------------------------------------------------------
/man/figures/README-basic_example-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtr13/shapr/master/man/figures/README-basic_example-1.png
--------------------------------------------------------------------------------
/inst/model_objects/xgboost_model_object.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtr13/shapr/master/inst/model_objects/xgboost_model_object.rds
--------------------------------------------------------------------------------
/inst/model_objects/xgboost_model_object_raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtr13/shapr/master/inst/model_objects/xgboost_model_object_raw
--------------------------------------------------------------------------------
/inst/model_objects/xgboost_model_object_cat.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtr13/shapr/master/inst/model_objects/xgboost_model_object_cat.rds
--------------------------------------------------------------------------------
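The .rds and .pdf entries above are binary artifacts, so the dump lists their download URLs instead of file contents. Below is a minimal sketch (not itself a file in the repository) of how such a fixture can be inspected from the package root; the printed values assume the model fitted in inst/scripts/create_lm_model_object.R further down:

    # Load the stored linear model fixture and inspect it
    model <- readRDS("inst/model_objects/lm_model_object.rds")
    class(model)  # "lm"
    coef(model)   # intercept plus lstat, rm, dis and indus

--------------------------------------------------------------------------------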
/inst/model_objects/xgboost_model_object_raw_cat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtr13/shapr/master/inst/model_objects/xgboost_model_object_raw_cat
--------------------------------------------------------------------------------
/man/figures/NR-logo_utvidet_r32g60b136_small.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtr13/shapr/master/man/figures/NR-logo_utvidet_r32g60b136_small.png
--------------------------------------------------------------------------------
/tests/testthat/model_objects/lm_model_object.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtr13/shapr/master/tests/testthat/model_objects/lm_model_object.rds
--------------------------------------------------------------------------------
/src/Makevars:
--------------------------------------------------------------------------------
1 | CXX_STD = CXX11
2 | PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS)
3 | PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)
4 |
--------------------------------------------------------------------------------
/src/Makevars.win:
--------------------------------------------------------------------------------
1 | CXX_STD = CXX11
2 | PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS)
3 | PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)
4 |
--------------------------------------------------------------------------------
/tests/testthat/test_objects/shapley_explainer_obj.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtr13/shapr/master/tests/testthat/test_objects/shapley_explainer_obj.rds
--------------------------------------------------------------------------------
/tests/testthat/test_objects/explanation_explain_obj_list.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtr13/shapr/master/tests/testthat/test_objects/explanation_explain_obj_list.rds
--------------------------------------------------------------------------------
/tests/testthat/test_objects/shapley_explainer_group1_2_obj.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtr13/shapr/master/tests/testthat/test_objects/shapley_explainer_group1_2_obj.rds
--------------------------------------------------------------------------------
/tests/testthat/test_objects/shapley_explainer_group1_obj.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtr13/shapr/master/tests/testthat/test_objects/shapley_explainer_group1_obj.rds
--------------------------------------------------------------------------------
/tests/testthat/test_objects/shapley_explainer_group2_2_obj.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtr13/shapr/master/tests/testthat/test_objects/shapley_explainer_group2_2_obj.rds
--------------------------------------------------------------------------------
/tests/testthat/test_objects/shapley_explainer_group2_obj.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtr13/shapr/master/tests/testthat/test_objects/shapley_explainer_group2_obj.rds
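--------------------------------------------------------------------------------
The shapley_explainer_*.rds files above are snapshot fixtures for the test suite. A hedged sketch of the usual snapshot pattern follows; the compared field ($X) and the reuse of x_train and model from inst/scripts/readme_example.R are illustrative assumptions, not the package's actual tests:

    ref <- readRDS("tests/testthat/test_objects/shapley_explainer_obj.rds")
    new <- shapr(x_train, model)          # x_train and model as in inst/scripts/readme_example.R
    testthat::expect_equal(new$X, ref$X)  # fresh output must match the stored snapshot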
--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
1 | #' @keywords internal
2 | unique_features <- function(x) {
3 |   unique(
4 |     unlist(
5 |       strsplit(x, split = ":", fixed = TRUE)
6 |     )
7 |   )
8 | }
9 |
--------------------------------------------------------------------------------
/tests/testthat/test_objects/explanation_explain_group_obj_list.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtr13/shapr/master/tests/testthat/test_objects/explanation_explain_group_obj_list.rds
--------------------------------------------------------------------------------
/tests/testthat/test_objects/explanation_explain_obj_list_fixed.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtr13/shapr/master/tests/testthat/test_objects/explanation_explain_obj_list_fixed.rds
--------------------------------------------------------------------------------
/tests/testthat/test_objects/explanation_explain_obj_list_no_ctree.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtr13/shapr/master/tests/testthat/test_objects/explanation_explain_obj_list_no_ctree.rds
--------------------------------------------------------------------------------
/R/zzz.R:
--------------------------------------------------------------------------------
1 | .onLoad <- function(libname = find.package("shapr"), pkgname = "shapr") {
2 |
3 |   # CRAN Note avoidance
4 |   utils::globalVariables(
5 |     c(
6 |       ".", ".N", ".I", ".GRP", ".SD"
7 |     )
8 |   )
9 |   invisible()
10 | }
11 |
--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^README\.Rmd$
2 | ^.*\.Rproj$
3 | ^\.Rproj\.user$
4 | ^Rprof\.out$
5 | ^\.circleci$
6 | ^\.lintr$
7 | ^\.covrignore$
8 | ^LICENSE\.md$
9 | inst/compare_lundberg\.xgb\.obj
10 | ^\.github$
11 | ^CODE_OF_CONDUCT\.md$
12 | ^cran-comments\.md$
13 | ^docs$
14 | ^CRAN-RELEASE$
15 | ^inst/scripts$
16 | ^doc$
17 | ^Meta$
18 | ^_pkgdown\.yml$
19 | ^pkgdown$
20 | ^.Rprofile
21 |
--------------------------------------------------------------------------------
/man/get_supported_models.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/models.R
3 | \name{get_supported_models}
4 | \alias{get_supported_models}
5 | \title{Provides a data.table with the supported models}
6 | \usage{
7 | get_supported_models()
8 | }
9 | \description{
10 | Provides a data.table with the supported models
11 | }
12 | \keyword{internal}
13 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | Rprof.out
6 | src/*.o
7 | src/*.so
8 |
9 | *.o
10 |
11 | *.so
12 |
13 | tmp/
14 |
15 | \.RDataTmp
16 |
17 | *.out
18 |
19 | *.Rout
20 |
21 | src-i386/
22 |
23 | src-x64/
24 |
25 | paper_experiments/res/single_res/
26 |
27 | paper_experiments/res/single_res_old/
28 | inst/doc
29 |
30 | docs/*
31 | doc
32 | Meta
33 | docs
--------------------------------------------------------------------------------
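For a concrete sense of unique_features() in R/utils.R above: it splits colon-separated feature combination labels and de-duplicates the result. A small usage example:

    unique_features(c("lstat:rm", "rm:dis", "indus"))
    #> [1] "lstat" "rm"    "dis"   "indus"

--------------------------------------------------------------------------------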
/inst/scripts/create_lm_model_object.R:
--------------------------------------------------------------------------------
1 | # Load data -----------
2 | data("Boston", package = "MASS")
3 | df <- tail(Boston, 50)
4 |
5 | # Fit linear model
6 | set.seed(123)
7 | model <- lm(medv ~ lstat + rm + dis + indus, data = df)
8 |
9 | saveRDS(object = model, "inst/model_objects/lm_model_object.rds")
10 |
11 | # Used for testing as well, so we need a copy in the testthat directory
12 | saveRDS(object = model, "tests/testthat/model_objects/lm_model_object.rds")
13 |
--------------------------------------------------------------------------------
/src/features.cpp:
--------------------------------------------------------------------------------
1 | #include <Rcpp.h>
2 | using namespace Rcpp;
3 |
4 | //' @keywords internal
5 | // [[Rcpp::export]]
6 | List sample_features_cpp(int m, IntegerVector n_features) {
7 |
8 |   int n = n_features.length();
9 |   List l(n);
10 |
11 |   for (int i = 0; i < n; i++) {
12 |
13 |     int s = n_features[i];
14 |     IntegerVector k = sample(m, s);
15 |     std::sort(k.begin(), k.end());
16 |     l[i] = k;
17 |
18 |   }
19 |
20 |   return l;
21 | }
22 |
--------------------------------------------------------------------------------
/.github/workflows/remove-old-artifacts.yml:
--------------------------------------------------------------------------------
1 | name: Remove old artifacts
2 |
3 | on:
4 |   schedule:
5 |     # Every day at 1am
6 |     - cron: '0 1 * * *'
7 |
8 | jobs:
9 |   remove-old-artifacts:
10 |     runs-on: ubuntu-latest
11 |     timeout-minutes: 10
12 |
13 |     steps:
14 |       - name: Remove old artifacts
15 |         uses: c-hive/gha-remove-artifacts@v1
16 |         with:
17 |           age: '1 week'
18 |           skip-recent: 6
19 |           # Optional inputs
20 |           # skip-tags: true
21 |
--------------------------------------------------------------------------------
/shapr.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: Default
4 | SaveWorkspace: Default
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 |
12 | RnwWeave: knitr
13 | LaTeX: pdfLaTeX
14 |
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 |
18 | BuildType: Package
19 | PackageUseDevtools: Yes
20 | PackageInstallArgs: --no-multiarch --with-keep.source --no-lock
21 | PackageRoxygenize: rd,collate,namespace,vignette
22 |
--------------------------------------------------------------------------------
/man/rss_cpp.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/RcppExports.R
3 | \name{rss_cpp}
4 | \alias{rss_cpp}
5 | \title{sigma_hat_sq-function}
6 | \usage{
7 | rss_cpp(H, y)
8 | }
9 | \arguments{
10 | \item{H}{Matrix. Output from \code{\link{hat_matrix_cpp}}}
11 |
12 | \item{y}{Vector representing the response variable}
13 | }
14 | \value{
15 | Scalar
16 | }
17 | \description{
18 | sigma_hat_sq-function
19 | }
20 | \author{
21 | Martin Jullum
22 | }
23 | \keyword{internal}
24 |
--------------------------------------------------------------------------------
/man/feature_matrix_cpp.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/RcppExports.R
3 | \name{feature_matrix_cpp}
4 | \alias{feature_matrix_cpp}
5 | \title{Get feature matrix}
6 | \usage{
7 | feature_matrix_cpp(features, m)
8 | }
9 | \arguments{
10 | \item{features}{List}
11 |
12 | \item{m}{Positive integer. Total number of features}
13 | }
14 | \value{
15 | Matrix
16 | }
17 | \description{
18 | Get feature matrix
19 | }
20 | \author{
21 | Nikolai Sellereite
22 | }
23 | \keyword{internal}
24 |
--------------------------------------------------------------------------------
/_pkgdown.yml:
--------------------------------------------------------------------------------
1 | url: https://norskregnesentral.github.io/shapr/
2 |
3 | navbar:
4 |   structure:
5 |     left: [home, articles, news, reference]
6 |     right: [github]
7 |   components:
8 |     articles:
9 |       text: Vignettes
10 |       menu:
11 |         - text: "Explaining individual machine learning predictions with Shapley values"
12 |           href: articles/understanding_shapr.html
13 |     news:
14 |       text: News
15 |       href: news/index.html
16 |     reference:
17 |       text: Manual
18 |       href: reference/index.html
19 |
--------------------------------------------------------------------------------
/man/compute_shapley.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/predictions.R
3 | \name{compute_shapley}
4 | \alias{compute_shapley}
5 | \title{Compute Shapley values}
6 | \usage{
7 | compute_shapley(explainer, contribution_mat)
8 | }
9 | \arguments{
10 | \item{explainer}{An \code{explain} object.}
11 |
12 | \item{contribution_mat}{The contribution matrix.}
13 | }
14 | \value{
15 | A \code{data.table} with Shapley values for each test observation.
16 | }
17 | \description{
18 | Compute Shapley values
19 | }
20 | \keyword{internal}
21 |
--------------------------------------------------------------------------------
/man/correction_matrix_cpp.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/RcppExports.R
3 | \name{correction_matrix_cpp}
4 | \alias{correction_matrix_cpp}
5 | \title{Correction term with trace_input in the AICc formula}
6 | \usage{
7 | correction_matrix_cpp(tr_H, n)
8 | }
9 | \arguments{
10 | \item{tr_H}{numeric giving the trace of H}
11 |
12 | \item{n}{numeric giving the number of rows in H}
13 | }
14 | \value{
15 | Scalar
16 | }
17 | \description{
18 | Correction term with trace_input in the AICc formula
19 | }
20 | \author{
21 | Martin Jullum
22 | }
23 | \keyword{internal}
24 |
--------------------------------------------------------------------------------
/man/gaussian_transform.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/transformation.R
3 | \name{gaussian_transform}
4 | \alias{gaussian_transform}
5 | \title{Transforms a sample to a standardized normal distribution}
6 | \usage{
7 | gaussian_transform(x)
8 | }
9 | \arguments{
10 | \item{x}{Numeric vector. The data which should be transformed to a standard normal distribution.}
11 | }
12 | \value{
13 | Numeric vector of length \code{length(x)}
14 | }
15 | \description{
16 | Transforms a sample to a standardized normal distribution
17 | }
18 | \author{
19 | Martin Jullum
20 | }
21 | \keyword{internal}
22 |
--------------------------------------------------------------------------------
/R/shapr-package.R:
--------------------------------------------------------------------------------
1 | #' @importFrom data.table data.table fread fwrite setnames := year month
2 | #' uniqueN setkey as.data.table copy between is.data.table setcolorder rbindlist
3 | #'
4 | #' @importFrom graphics plot hist rect
5 | #'
6 | #' @importFrom utils head tail methods
7 | #'
8 | #' @importFrom stats predict
9 | #'
10 | #' @importFrom stats as.formula
11 | #'
12 | #' @importFrom stats model.matrix
13 | #'
14 | #' @importFrom stats model.frame
15 | #'
16 | #' @importFrom stats setNames
17 | #'
18 | #' @importFrom stats contrasts
19 | #'
20 | #' @importFrom Rcpp sourceCpp
21 | #'
22 | #' @keywords internal
23 | #'
24 | #' @useDynLib shapr, .registration = TRUE
25 | NULL
26 |
--------------------------------------------------------------------------------
/.lintr:
--------------------------------------------------------------------------------
1 | linters: with_defaults(
2 |   line_length_linter = lintr::line_length_linter(120),
3 |   object_name_linter = NULL,
4 |   object_usage_linter = NULL,
5 |   seq_linter = NULL,
6 |   cyclocomp_linter = lintr::cyclocomp_linter()
7 |   )
8 | exclusions: list(
9 |   "inst/scripts/compare_shap_python.R",
10 |   "inst/scripts/create_lm_model_object.R",
11 |   "inst/scripts/create_xgboost_model_object.R",
12 |   "inst/scripts/example_ctree_model.R",
13 |   "inst/scripts/example_custom_model.R",
14 |   "inst/scripts/readme_example.R",
15 |   "inst/scripts/shap_python_script.py",
16 |   "inst/scripts/devel/compare_indep_implementations.R",
17 |   "R/RcppExports.R",
18 |   "R/zzz.R"
19 |   )
20 |
--------------------------------------------------------------------------------
/man/inv_gaussian_transform.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/transformation.R
3 | \name{inv_gaussian_transform}
4 | \alias{inv_gaussian_transform}
5 | \title{Transforms new data to a standardized normal distribution}
6 | \usage{
7 | inv_gaussian_transform(zx, n_z)
8 | }
9 | \arguments{
10 | \item{zx}{Numeric vector. The first \code{n_z} items are the Gaussian data, and the last part is
11 | the data with the original transformation.}
12 |
13 | \item{n_z}{Positive integer. Number of elements of \code{zx} that belong to the new data.}
14 | }
15 | \value{
16 | Numeric vector of length \code{n_z}
17 | }
18 | \description{
19 | Transforms new data to a standardized normal distribution
20 | }
21 | \author{
22 | Martin Jullum
23 | }
24 | \keyword{internal}
25 |
--------------------------------------------------------------------------------
/man/get_list_approaches.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/explanation.R
3 | \name{get_list_approaches}
4 | \alias{get_list_approaches}
5 | \title{Helper function used in \code{\link{explain.combined}}}
6 | \usage{
7 | get_list_approaches(n_features, approach)
8 | }
9 | \arguments{
10 | \item{n_features}{Integer vector. Note that
11 | \code{length(n_features) <= 2^m}, where \code{m} equals the number
12 | of features.}
13 |
14 | \item{approach}{Character vector of length \code{m}. All elements should be
15 | either \code{"empirical"}, \code{"gaussian"} or \code{"copula"}.}
16 | }
17 | \value{
18 | List
19 | }
20 | \description{
21 | Helper function used in \code{\link{explain.combined}}
22 | }
23 | \author{
24 | Nikolai Sellereite
25 | }
26 | \keyword{internal}
27 |
--------------------------------------------------------------------------------
/man/feature_group.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/features.R
3 | \name{feature_group}
4 | \alias{feature_group}
5 | \title{Analogue to feature_exact, but for groups instead.}
6 | \usage{
7 | feature_group(group_num, weight_zero_m = 10^6)
8 | }
9 | \arguments{
10 | \item{group_num}{List. Contains vector of integers indicating the feature numbers for the
11 | different groups.}
12 |
13 | \item{weight_zero_m}{Positive integer. Represents the Shapley weight for two special
14 | cases, i.e. the case where you have either \code{0} or \code{m} features/feature groups.}
15 | }
16 | \value{
17 | data.table with all feature group combinations, Shapley weights, etc.
18 | }
19 | \description{
20 | Analogue to feature_exact, but for groups instead.
21 | }
22 | \keyword{internal}
23 |
--------------------------------------------------------------------------------
/.github/workflows/lint.yaml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/master/examples
2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
3 | on:
4 |   push:
5 |     branches: [main, master]
6 |   pull_request:
7 |     branches: [main, master]
8 |
9 | name: lint
10 |
11 | jobs:
12 |   lint:
13 |     runs-on: ubuntu-latest
14 |     env:
15 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
16 |     steps:
17 |       - uses: actions/checkout@v2
18 |
19 |       - uses: r-lib/actions/setup-r@v1
20 |         with:
21 |           use-public-rspm: true
22 |
23 |       - uses: r-lib/actions/setup-r-dependencies@v1
24 |         with:
25 |           extra-packages: lintr
26 |
27 |       - name: Lint
28 |         run: lintr::lint_package()
29 |         shell: Rscript {0}
30 |
--------------------------------------------------------------------------------
/man/hat_matrix_cpp.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/RcppExports.R
3 | \name{hat_matrix_cpp}
4 | \alias{hat_matrix_cpp}
5 | \title{Computing single H matrix in AICc-function using the Mahalanobis distance}
6 | \usage{
7 | hat_matrix_cpp(X, mcov, S_scale_dist, h)
8 | }
9 | \arguments{
10 | \item{X}{matrix with "covariates"}
11 |
12 | \item{mcov}{covariance matrix}
13 |
14 | \item{S_scale_dist}{logical indicating whether the Mahalanobis distance should be scaled with the number of variables}
15 |
16 | \item{h}{numeric specifying the scaling (sigma)}
17 | }
18 | \value{
19 | Matrix of dimension \code{ncol(X)*ncol(X)}
20 | }
21 | \description{
22 | Computing single H matrix in AICc-function using the Mahalanobis distance
23 | }
24 | \author{
25 | Martin Jullum
26 | }
27 | \keyword{internal}
28 |
--------------------------------------------------------------------------------
/man/sample_gaussian.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/sampling.R
3 | \name{sample_gaussian}
4 | \alias{sample_gaussian}
5 | \title{Sample conditional Gaussian variables}
6 | \usage{
7 | sample_gaussian(index_given, n_samples, mu, cov_mat, m, x_test)
8 | }
9 | \arguments{
10 | \item{index_given}{Integer vector. The indices of the features to condition upon. Note that
11 | \code{min(index_given) >= 1} and \code{max(index_given) <= m}.}
12 |
13 | \item{m}{Positive integer. The total number of features.}
14 |
15 | \item{x_test}{Numeric matrix. Contains the features of the observation whose
16 | predictions ought to be explained (test data).}
17 | }
18 | \value{
19 | data.table
20 | }
21 | \description{
22 | Sample conditional Gaussian variables
23 | }
24 | \author{
25 | Martin Jullum
26 | }
27 | \keyword{internal}
28 |
--------------------------------------------------------------------------------
/inst/scripts/create_xgboost_model_object.R:
--------------------------------------------------------------------------------
1 | library(xgboost)
2 |
3 | data("Boston", package = "MASS")
4 |
5 | x_var <- c("lstat", "rm", "dis", "indus")
6 | y_var <- "medv"
7 |
8 | x_train <- as.matrix(tail(Boston[, x_var], -6))
9 | y_train <- tail(Boston[, y_var], -6)
10 | x_test <- as.matrix(head(Boston[, x_var], 6))
11 |
12 | # Creating a larger test data set (300 observations) for more realistic function time calls.
13 | # Modifying x_test to repeat the 6 test observations 50 times
14 | x_test <- rep(1, 50) %x% x_test  # Kronecker product stacks 50 copies of the 6 rows
15 | colnames(x_test) <- colnames(x_train)
16 |
17 | # Fitting a basic xgboost model to the training data
18 | model <- xgboost(
19 |   data = x_train,
20 |   label = y_train,
21 |   nround = 20
22 | )
23 |
24 | saveRDS(model, file = "inst/model_objects/xgboost_model_object.rds")
25 |
26 | xgb.save(model = model, fname = "inst/model_objects/xgboost_model_object_raw")
27 |
--------------------------------------------------------------------------------
/man/aicc_full_cpp.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/RcppExports.R
3 | \name{aicc_full_cpp}
4 | \alias{aicc_full_cpp}
5 | \title{AICc formula for several sets, alternative definition}
6 | \usage{
7 | aicc_full_cpp(h, X_list, mcov_list, S_scale_dist, y_list, negative)
8 | }
9 | \arguments{
10 | \item{h}{Numeric. Specifies the scaling (sigma)}
11 |
12 | \item{X_list}{List}
13 |
14 | \item{mcov_list}{List}
15 |
16 | \item{S_scale_dist}{Logical. Indicates whether Mahalanobis distance should be scaled with the
17 | number of variables}
18 |
19 | \item{y_list}{List.}
20 |
21 | \item{negative}{Logical.}
22 | }
23 | \value{
24 | Scalar with the numeric value of the AICc formula
25 | }
26 | \description{
27 | AICc formula for several sets, alternative definition
28 | }
29 | \author{
30 | Martin Jullum
31 | }
32 | \keyword{internal}
33 |
--------------------------------------------------------------------------------
/man/check_groups.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/features.R
3 | \name{check_groups}
4 | \alias{check_groups}
5 | \title{Check that the group parameter has the right form and content}
6 | \usage{
7 | check_groups(feature_labels, group)
8 | }
9 | \arguments{
10 | \item{feature_labels}{Vector of characters. Contains the feature labels used by the model}
11 |
12 | \item{group}{List. If \code{NULL} regular feature wise Shapley values are computed.
13 | If provided, group wise Shapley values are computed. \code{group} then has length equal to
14 | the number of groups. The list element contains character vectors with the features included
15 | in each of the different groups.}
16 | }
17 | \value{
18 | Error or NULL
19 | }
20 | \description{
21 | Check that the group parameter has the right form and content
22 | }
23 | \keyword{internal}
24 |
--------------------------------------------------------------------------------
/man/gaussian_transform_separate.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/transformation.R
3 | \name{gaussian_transform_separate}
4 | \alias{gaussian_transform_separate}
5 | \title{Transforms new data to standardized normal (dimension 1) based on other data transformations}
6 | \usage{
7 | gaussian_transform_separate(yx, n_y)
8 | }
9 | \arguments{
10 | \item{yx}{Numeric vector. The first \code{n_y} items are the data to be transformed, and the last
11 | part is the data with the original transformation.}
12 |
13 | \item{n_y}{Positive integer. Number of elements of \code{yx} that belong to the Gaussian data.}
14 | }
15 | \value{
16 | Vector of back-transformed Gaussian data
17 | }
18 | \description{
19 | Transforms new data to standardized normal (dimension 1) based on other data transformations
20 | }
21 | \author{
22 | Martin Jullum
23 | }
24 | \keyword{internal}
25 |
--------------------------------------------------------------------------------
/man/aicc_full_single_cpp.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/RcppExports.R
3 | \name{aicc_full_single_cpp}
4 | \alias{aicc_full_single_cpp}
5 | \title{Temporary function for computing the full AICc with several X's, etc.}
6 | \usage{
7 | aicc_full_single_cpp(X, mcov, S_scale_dist, h, y)
8 | }
9 | \arguments{
10 | \item{X}{matrix with "covariates"}
11 |
12 | \item{mcov}{covariance matrix}
13 |
14 | \item{S_scale_dist}{logical indicating whether the Mahalanobis distance should be scaled with the number of variables}
15 |
16 | \item{h}{numeric specifying the scaling (sigma)}
17 |
18 | \item{y}{vector with the "response variable"}
19 | }
20 | \value{
21 | Scalar with the numeric value of the AICc formula
22 | }
23 | \description{
24 | Temporary function for computing the full AICc with several X's, etc.
25 | }
26 | \author{
27 | Martin Jullum
28 | }
29 | \keyword{internal}
30 |
--------------------------------------------------------------------------------
/man/weight_matrix_cpp.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/RcppExports.R
3 | \name{weight_matrix_cpp}
4 | \alias{weight_matrix_cpp}
5 | \title{Calculate weight matrix}
6 | \usage{
7 | weight_matrix_cpp(subsets, m, n, w)
8 | }
9 | \arguments{
10 | \item{subsets}{List. Each of the elements equals an integer
11 | vector representing a valid combination of features/feature groups.}
12 |
13 | \item{m}{Integer. Number of features/feature groups}
14 |
15 | \item{n}{Integer. Number of combinations}
16 |
17 | \item{w}{Numeric vector of length \code{n}, i.e. \code{w[i]} equals
18 | the Shapley weight of feature/feature group combination \code{i}, represented by
19 | \code{subsets[[i]]}.}
20 | }
21 | \value{
22 | Matrix of dimension \code{n x (m + 1)}
23 | }
24 | \description{
25 | Calculate weight matrix
26 | }
27 | \author{
28 | Nikolai Sellereite
29 | }
30 | \keyword{internal}
31 |
--------------------------------------------------------------------------------
/man/weight_matrix.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/shapley.R
3 | \name{weight_matrix}
4 | \alias{weight_matrix}
5 | \title{Calculate weighted matrix}
6 | \usage{
7 | weight_matrix(X, normalize_W_weights = TRUE, is_groupwise = FALSE)
8 | }
9 | \arguments{
10 | \item{X}{data.table}
11 |
12 | \item{normalize_W_weights}{Logical. Whether to normalize the weights for the combinations to sum to 1 for
13 | increased numerical stability before solving the WLS (weighted least squares). Applies to all combinations
14 | except combination \code{1} and \code{2^m}.}
15 |
16 | \item{is_groupwise}{Logical. Indicating whether group wise Shapley values are to be computed.}
17 | }
18 | \value{
19 | Numeric matrix. See \code{\link{weight_matrix_cpp}} for more information.
20 | }
21 | \description{
22 | Calculate weighted matrix
23 | }
24 | \author{
25 | Nikolai Sellereite, Martin Jullum
26 | }
27 | \keyword{internal}
28 |
--------------------------------------------------------------------------------
/man/create_S_batch.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/explanation.R
3 | \name{create_S_batch}
4 | \alias{create_S_batch}
5 | \title{Compute Shapley values in batches}
6 | \usage{
7 | create_S_batch(explainer, n_batches, index_S = NULL)
8 | }
9 | \arguments{
10 | \item{explainer}{The binary matrix \code{S} returned from \code{\link{shapr}}.}
11 |
12 | \item{n_batches}{Numeric value specifying how many batches \code{S} should be split into.}
13 |
14 | \item{index_S}{Numeric vector specifying which rows of \code{S} should be considered.}
15 | }
16 | \value{
17 | A list of length \code{n_batches}.
18 | }
19 | \description{
20 | Create a list of indexes used to compute Shapley values in batches.
21 | }
22 | \details{
23 | If \code{index_S} is not \code{NULL}, the number of batches is scaled such that the
24 | total number of batches is equal to \code{n_batches}, and not the number within the rows specified by \code{index_S}.
25 | }
26 | \keyword{internal}
27 |
--------------------------------------------------------------------------------
/man/apply_dummies.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/features.R
3 | \name{apply_dummies}
4 | \alias{apply_dummies}
5 | \title{Apply dummy variables - this is an internal function intended only to be used in
6 | predict_model.xgb.Booster()}
7 | \usage{
8 | apply_dummies(feature_list, testdata)
9 | }
10 | \arguments{
11 | \item{feature_list}{List. The \code{feature_list} object in the output object after running
12 | \code{\link[shapr:make_dummies]{make_dummies}}}
13 |
14 | \item{testdata}{data.table or data.frame. New data that has the same
15 | feature names, types, and levels as \code{feature_list}.}
16 | }
17 | \value{
18 | A data.table with all features but where the factors in \code{testdata} are
19 | one-hot encoded variables as specified in feature_list
20 | }
21 | \description{
22 | Apply dummy variables - this is an internal function intended only to be used in
23 | predict_model.xgb.Booster()
24 | }
25 | \author{
26 | Annabelle Redelmeier, Martin Jullum
27 | }
28 | \keyword{internal}
29 |
--------------------------------------------------------------------------------
/.github/workflows/pkgdown.yaml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/master/examples
2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
3 | on:
4 |   push:
5 |     branches: [main, master, cranversion]
6 |     tags: ['*']
7 |
8 | name: pkgdown
9 |
10 | jobs:
11 |   pkgdown:
12 |     runs-on: ubuntu-latest
13 |     env:
14 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
15 |     steps:
16 |       - uses: actions/checkout@v2
17 |
18 |       - uses: r-lib/actions/setup-pandoc@v1
19 |
20 |       - uses: r-lib/actions/setup-r@v1
21 |         with:
22 |           use-public-rspm: true
23 |
24 |       - uses: r-lib/actions/setup-r-dependencies@v1
25 |         with:
26 |           extra-packages: pkgdown
27 |           needs: website
28 |
29 |       - name: Deploy package
30 |         run: |
31 |           git config --local user.name "$GITHUB_ACTOR"
32 |           git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com"
33 |           Rscript -e 'pkgdown::deploy_to_branch(new_process = FALSE)'
34 |
--------------------------------------------------------------------------------
/man/feature_group_not_exact.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/features.R
3 | \name{feature_group_not_exact}
4 | \alias{feature_group_not_exact}
5 | \title{Analogue to feature_not_exact, but for groups instead.}
6 | \usage{
7 | feature_group_not_exact(group_num, n_combinations = 200, weight_zero_m = 10^6)
8 | }
9 | \arguments{
10 | \item{group_num}{List. Contains vector of integers indicating the feature numbers for the
11 | different groups.}
12 |
13 | \item{n_combinations}{Integer. The number of feature combinations to sample. If \code{NULL},
14 | the exact method is used and all combinations are considered. The maximum number of
15 | combinations equals \code{2^ncol(x)}.}
16 |
17 | \item{weight_zero_m}{Positive integer. Represents the Shapley weight for two special
18 | cases, i.e. the case where you have either \code{0} or \code{m} features/feature groups.}
19 | }
20 | \value{
21 | data.table with all feature group combinations, Shapley weights, etc.
22 | }
23 | \description{
24 | Analogue to feature_not_exact, but for groups instead.
25 | }
26 | \keyword{internal}
27 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | # MIT License
2 |
3 | Copyright (c) 2019 Norsk Regnesentral
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/man/sample_copula.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/sampling.R
3 | \name{sample_copula}
4 | \alias{sample_copula}
5 | \title{Sample conditional variables using the Gaussian copula approach}
6 | \usage{
7 | sample_copula(
8 |   index_given,
9 |   n_samples,
10 |   mu,
11 |   cov_mat,
12 |   m,
13 |   x_test_gaussian,
14 |   x_train,
15 |   x_test
16 | )
17 | }
18 | \arguments{
19 | \item{index_given}{Integer vector. The indices of the features to condition upon. Note that
20 | \code{min(index_given) >= 1} and \code{max(index_given) <= m}.}
21 |
22 | \item{m}{Positive integer. The total number of features.}
23 |
24 | \item{x_test_gaussian}{Numeric matrix. Contains the observation whose predictions ought to be explained (test data),
25 | after quantile-transforming them to standard Gaussian variables.}
26 |
27 | \item{x_test}{Numeric matrix. Contains the features of the observation whose
28 | predictions ought to be explained (test data).}
29 | }
30 | \value{
31 | data.table
32 | }
33 | \description{
34 | Sample conditional variables using the Gaussian copula approach
35 | }
36 | \author{
37 | Martin Jullum
38 | }
39 | \keyword{internal}
40 |
--------------------------------------------------------------------------------
/man/shapley_weights.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/shapley.R
3 | \name{shapley_weights}
4 | \alias{shapley_weights}
5 | \title{Calculate Shapley weight}
6 | \usage{
7 | shapley_weights(m, N, n_components, weight_zero_m = 10^6)
8 | }
9 | \arguments{
10 | \item{m}{Positive integer. Total number of features/feature groups.}
11 |
12 | \item{N}{Positive integer. The number of unique combinations when sampling \code{n_components} features/feature
13 | groups, without replacement, from a sample space consisting of \code{m} different features/feature groups.}
14 |
15 | \item{n_components}{Positive integer. Represents the number of features/feature groups you want to sample from
16 | a feature space consisting of \code{m} unique features/feature groups. Note that \code{0 <= n_components <= m}.}
17 |
18 | \item{weight_zero_m}{Positive integer. Represents the Shapley weight for two special
19 | cases, i.e. the case where you have either \code{0} or \code{m} features/feature groups.}
20 | }
21 | \value{
22 | Numeric
23 | }
24 | \description{
25 | Calculate Shapley weight
26 | }
27 | \author{
28 | Nikolai Sellereite
29 | }
30 | \keyword{internal}
31 |
--------------------------------------------------------------------------------
/inst/scripts/shap_python_script.py:
--------------------------------------------------------------------------------
1 | #### Python code ####
2 | import xgboost as xgb
3 | import shap
4 | import numpy as np
5 | import pandas as pd
6 | import time
7 |
8 | model = xgb.Booster()  # init model
9 | model.load_model("inst/model_objects/xgboost_model_object_raw")
10 |
11 | ## kernel shap sends data as numpy array which has no column names, so we fix it
12 | def xgb_predict(data_asarray):
13 |     data_asDmatrix = xgb.DMatrix(data_asarray)
14 |     return model.predict(data_asDmatrix)
15 |
16 | py_pred_test = xgb_predict(r.x_test)  # Test predictions in python
17 |
18 | #
19 |
20 | #### Applying kernelshap
21 |
22 | time_py_start = time.perf_counter()
23 |
24 | shap_kernel_explainer = shap.KernelExplainer(xgb_predict, r.x_train)
25 | Kshap_shap0 = shap_kernel_explainer.shap_values(r.x_test, nsamples = int(100000), l1_reg = 0)
26 |
27 | time_py_end = time.perf_counter()
28 |
29 | time_py = time_py_end - time_py_start
30 |
31 | getattr(shap_kernel_explainer, 'expected_value')  # This is phi0, not used at all below
32 |
33 | Kshap_shap = pd.DataFrame(Kshap_shap0, columns = r.x_var)
34 |
35 | Kshap_shap.insert(0, "none", getattr(shap_kernel_explainer, 'expected_value'), True)  # Adding the none column
36 |
--------------------------------------------------------------------------------
/man/mahalanobis_distance_cpp.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/RcppExports.R
3 | \name{mahalanobis_distance_cpp}
4 | \alias{mahalanobis_distance_cpp}
5 | \title{(Generalized) Mahalanobis distance}
6 | \usage{
7 | mahalanobis_distance_cpp(
8 |   featureList,
9 |   Xtrain_mat,
10 |   Xtest_mat,
11 |   mcov,
12 |   S_scale_dist
13 | )
14 | }
15 | \arguments{
16 | \item{featureList}{List of vectors indicating all feature combinations that should be included in the computations. Assumes that the first one is empty.}
17 |
18 | \item{Xtrain_mat}{Matrix}
19 |
20 | \item{Xtest_mat}{Matrix}
21 |
22 | \item{mcov}{Matrix. The Sigma-matrix in the Mahalanobis distance formula: \code{stats::cov(Xtrain_mat)} gives the Mahalanobis distance,
23 | \code{diag(m)} gives the Euclidean distance.}
24 |
25 | \item{S_scale_dist}{Logical indicating whether the Mahalanobis distance should be scaled with the number of variables}
26 | }
27 | \value{
28 | Array of three dimensions. Contains the squared distances between all training and test observations for all feature combinations passed to the function.
29 | }
30 | \description{
31 | Used to get the Euclidean distance as well by setting \code{mcov} = \code{diag(m)}.
32 | }
33 | \author{
34 | Martin Jullum
35 | }
36 | \keyword{internal}
37 |
--------------------------------------------------------------------------------
/man/process_groups.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/preprocess_data.R
3 | \name{process_groups}
4 | \alias{process_groups}
5 | \title{Process (check and update names) the group list}
6 | \usage{
7 | process_groups(group, feature_labels)
8 | }
9 | \arguments{
10 | \item{group}{List. If \code{NULL} regular feature wise Shapley values are computed.
11 | If provided, group wise Shapley values are computed. \code{group} then has length equal to
12 | the number of groups. The list element contains character vectors with the features included
13 | in each of the different groups.}
14 |
15 | \item{feature_labels}{Vector of characters. Contains the feature labels used by the model}
16 | }
17 | \value{
18 | List with two named elements: \code{group}: the input, but with group names added if non-existing;
19 | \code{group_num}: a corresponding group list with feature names replaced by feature numbers.
20 | }
21 | \description{
22 | Process (check and update names) the group list
23 | }
24 | \details{
25 | This function checks the provided \code{group} list against the \code{feature_labels} used by the model,
26 | adds group names where they are missing, and maps the feature names in each group to feature numbers.
27 | }
28 | \author{
29 | Martin Jullum
30 | }
31 | \keyword{internal}
32 |
--------------------------------------------------------------------------------
/inst/scripts/readme_example.R:
--------------------------------------------------------------------------------
1 | library(xgboost)
2 | library(shapr)
3 |
4 | data("Boston", package = "MASS")
5 |
6 | x_var <- c("lstat", "rm", "dis", "indus")
7 | y_var <- "medv"
8 |
9 | x_train <- as.matrix(Boston[-1:-6, x_var])
10 | y_train <- Boston[-1:-6, y_var]
11 | x_test <- as.matrix(Boston[1:6, x_var])
12 |
13 | # Looking at the dependence between the features
14 | cor(x_train)
15 |
16 | # Fitting a basic xgboost model to the training data
17 | model <- xgboost(
18 |   data = x_train,
19 |   label = y_train,
20 |   nround = 20,
21 |   verbose = FALSE
22 | )
23 |
24 | # Prepare the data for explanation
25 | explainer <- shapr(x_train, model)
26 |
27 | # Specifying the phi_0, i.e. the expected prediction without any features
28 | p <- mean(y_train)
29 |
30 | # Computing the actual Shapley values with kernelSHAP accounting for feature dependence using
31 | # the empirical (conditional) distribution approach with bandwidth parameter sigma = 0.1 (default)
32 | explanation <- explain(
33 |   x_test,
34 |   approach = "empirical",
35 |   explainer = explainer,
36 |   prediction_zero = p
37 | )
38 |
39 | # Printing the Shapley values for the test data.
40 | # For more information about the interpretation of the values in the table, see ?shapr::explain.
41 | print(explanation$dt)
42 |
43 | # Finally we plot the resulting explanations
44 | plot(explanation)
45 |
--------------------------------------------------------------------------------
/man/update_data.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/preprocess_data.R
3 | \name{update_data}
4 | \alias{update_data}
5 | \title{Updates data by reference according to the updater argument.}
6 | \usage{
7 | update_data(data, updater)
8 | }
9 | \arguments{
10 | \item{data}{data.table. Data that ought to be updated.}
11 |
12 | \item{updater}{List. The object should be the output from
13 | \code{\link[shapr:check_features]{check_features}}.}
14 | }
15 | \value{
16 | NULL.
17 | }
18 | \description{
19 | \code{data} is updated, i.e. unused columns and factor levels are removed as described in
20 | \code{updater}. This is done by reference, i.e. it updates the object passed as \code{data} even if nothing is
21 | returned by the function itself.
22 | }
23 | \examples{
24 | # Load example data
25 | if (requireNamespace("MASS", quietly = TRUE)) {
26 |   data("Boston", package = "MASS")
27 |   # Split data into test- and training data
28 |   x_train <- data.table::as.data.table(head(Boston))
29 |   x_train[, rad := as.factor(rad)]
30 |   data_features <- get_data_specs(x_train)
31 |   model <- lm(medv ~ lstat + rm + rad + indus, data = x_train)
32 |
33 |   model_features <- get_model_specs(model)
34 |   updater <- check_features(model_features, data_features)
35 |   update_data(x_train, updater)
36 | }
37 | }
38 | \author{
39 | Martin Jullum
40 | }
41 | \keyword{internal}
42 |
--------------------------------------------------------------------------------
/tests/testthat/test-transformation.R:
--------------------------------------------------------------------------------
1 | context("test-transformation.R")
2 |
3 | test_that("Test inv_gaussian_transform", {
4 |
5 |   # Example -----------
6 |   zx <- rnorm(50)
7 |   n_z <- 30
8 |
9 |   x <- inv_gaussian_transform(zx, n_z)
10 |
11 |   # Tests -----------
12 |   expect_true(is.atomic(x))
13 |   expect_true(is.double(x))
14 |
15 |   expect_equal(length(x), n_z)
16 |   expect_true(min(x) >= min(zx[-c(1:n_z)]))
17 |   expect_true(max(x) <= max(zx[-c(1:n_z)]))
18 |
19 |   # Errors -----------
20 |   expect_error(inv_gaussian_transform(zx, length(zx)))
21 |   expect_error(inv_gaussian_transform(zx, length(zx) + 1))
22 | })
23 |
24 | test_that("Test gaussian_transform_separate", {
25 |
26 |   # Example -----------
27 |   yx <- rnorm(50)
28 |   n_y <- 30
29 |
30 |   x <- gaussian_transform_separate(yx, n_y)
31 |
32 |   # Tests -----------
33 |   expect_true(is.atomic(x))
34 |   expect_true(is.double(x))
35 |   expect_equal(length(x), n_y)
36 |
37 |   # Errors -----------
38 |   expect_error(gaussian_transform_separate(yx, length(yx)))
39 |   expect_error(gaussian_transform_separate(yx, length(yx) + 1))
40 | })
41 |
42 | test_that("Test gaussian_transform", {
43 |
44 |   # Example -----------
45 |   y <- rnorm(50)
46 |   x <- gaussian_transform(y)
47 |
48 |   # Tests -----------
49 |   expect_true(is.atomic(x))
50 |   expect_true(is.double(x))
51 |   expect_equal(length(x), length(y))
52 | })
53 |
--------------------------------------------------------------------------------
/man/sample_combinations.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/sampling.R
3 | \name{sample_combinations}
4 | \alias{sample_combinations}
5 | \title{Helper function to sample a combination of training and testing rows, which does not risk
6 | getting the same observation twice. Need to improve this help file.}
7 | \usage{
8 | sample_combinations(ntrain, ntest, nsamples, joint_sampling = TRUE)
9 | }
10 | \arguments{
11 | \item{ntrain}{Positive integer. Number of training observations to sample from.}
12 |
13 | \item{ntest}{Positive integer. Number of test observations to sample from.}
14 |
15 | \item{nsamples}{Positive integer. Number of samples.}
16 |
17 | \item{joint_sampling}{Logical. Indicates whether train and test data should be sampled
18 | separately or in a joint sampling space. If they are sampled separately (which typically
19 | would be used when optimizing more than one distribution at once), we sample with replacement
20 | if \code{nsamples > ntrain}. Note that this solution is not optimal. Be careful if you're
21 | doing optimization over every test observation when \code{nsamples > ntrain}.}
22 | }
23 | \value{
24 | data.frame
25 | }
26 | \description{
27 | Helper function to sample a combination of training and testing rows, which does not risk
28 | getting the same observation twice. Need to improve this help file.
29 | }
30 | \author{
31 | Martin Jullum
32 | }
33 | \keyword{internal}
34 |
--------------------------------------------------------------------------------
/inst/scripts/devel/devel_non_exact_grouping.R:
--------------------------------------------------------------------------------
1 |
2 | ### NOTE: THIS NO LONGER WORKS, as we switch to exact when a large n_combinations is used, but the checks
3 | ### confirm that the code works as intended.
4 |
5 | library(xgboost)
6 | library(shapr)
7 | library(data.table)
8 |
9 | #### Testing grouping function
10 |
11 | data("Boston", package = "MASS")
12 |
13 | x_var <- c("lstat", "rm", "dis", "indus", "age", "tax", "ptratio", "black", "nox")
14 | y_var <- "medv"
15 |
16 | x_train <- as.matrix(Boston[-1:-6, x_var])
17 | y_train <- Boston[-1:-6, y_var]
18 | x_test <- as.matrix(Boston[1:6, x_var])
19 |
20 | # Looking at the dependence between the features
21 | cor(x_train)
22 |
23 | # Fitting a basic xgboost model to the training data
24 | model <- xgboost(
25 |   data = x_train,
26 |   label = y_train,
27 |   nround = 20,
28 |   verbose = FALSE
29 | )
30 |
31 | group <- list(A = x_var[1:3], B = x_var[4:5], C = x_var[7], D = x_var[c(6, 8)], E = x_var[9])
32 |
33 | explainer1 <- shapr(x_train, model, group = group, n_combinations = 10^6)
34 |
35 | explainer2 <- shapr(x_train, model, group = group)
36 | p <- mean(y_train)  # phi_0: the expected prediction without any features
37 | explanation1 <- explain(
38 |   x_test,
39 |   approach = "independence",
40 |   explainer = explainer1,
41 |   prediction_zero = p
42 | )
43 |
44 | explanation2 <- explain(
45 |   x_test,
46 |   approach = "independence",
47 |   explainer = explainer2,
48 |   prediction_zero = p
49 | )
50 |
51 |
52 | explanation2$dt - explanation1$dt  # All small differences!
53 |
54 |
55 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Code of Conduct
2 |
3 | As contributors and maintainers of this project, we pledge to respect all people who
4 | contribute through reporting issues, posting feature requests, updating documentation,
5 | submitting pull requests or patches, and other activities.
6 |
7 | We are committed to making participation in this project a harassment-free experience for
8 | everyone, regardless of level of experience, gender, gender identity and expression,
9 | sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion.
10 |
11 | Examples of unacceptable behavior by participants include the use of sexual language or
12 | imagery, derogatory comments or personal attacks, trolling, public or private harassment,
13 | insults, or other unprofessional conduct.
14 |
15 | Project maintainers have the right and responsibility to remove, edit, or reject comments,
16 | commits, code, wiki edits, issues, and other contributions that are not aligned to this
17 | Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed
18 | from the project team.
19 |
20 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by
21 | opening an issue or contacting one or more of the project maintainers.
22 | 23 | This Code of Conduct is adapted from the Contributor Covenant 24 | (https://www.contributor-covenant.org), version 1.0.0, available at 25 | https://contributor-covenant.org/version/1/0/0/. 26 | -------------------------------------------------------------------------------- /man/get_data_specs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/preprocess_data.R 3 | \name{get_data_specs} 4 | \alias{get_data_specs} 5 | \title{Fetches feature information from a given data set} 6 | \usage{ 7 | get_data_specs(x) 8 | } 9 | \arguments{ 10 | \item{x}{matrix, data.frame or data.table The data to extract feature information from.} 11 | } 12 | \value{ 13 | A list with the following elements: 14 | \describe{ 15 | \item{labels}{character vector with the feature names to compute Shapley values for} 16 | \item{classes}{a named character vector with the labels as names and the class types as elements} 17 | \item{factor_levels}{a named list with the labels as names and character vectors with the factor levels as elements 18 | (NULL if the feature is not a factor)} 19 | } 20 | } 21 | \description{ 22 | Fetches feature information from a given data set 23 | } 24 | \details{ 25 | This function is used to extract the feature information to be checked against the corresponding 26 | information extracted from the model and other data sets. The function is called from 27 | \code{\link[shapr:preprocess_data]{preprocess_data}} 28 | and \code{\link[shapr:make_dummies]{make_dummies}} 29 | } 30 | \examples{ 31 | # Load example data 32 | if (requireNamespace("MASS", quietly = TRUE)) { 33 | data("Boston", package = "MASS") 34 | # Split data into test- and training data 35 | x_train <- data.table::as.data.table(head(Boston)) 36 | x_train[, rad := as.factor(rad)] 37 | get_data_specs(x_train) 38 | } 39 | } 40 | \author{ 41 | Martin Jullum 42 | } 43 | \keyword{internal} 44 | -------------------------------------------------------------------------------- /inst/scripts/devel/same_seed_as_master.R: -------------------------------------------------------------------------------- 1 | library(xgboost) 2 | #library(shapr) 3 | library(data.table) 4 | 5 | data("Boston", package = "MASS") 6 | 7 | x_var <- c("lstat", "rm", "dis", "indus")#,"nox","age","tax","ptratio") 8 | y_var <- "medv" 9 | 10 | x_train <- as.matrix(Boston[-1:-6, x_var]) 11 | y_train <- Boston[-1:-6, y_var] 12 | x_test <- as.matrix(Boston[1:6, x_var]) 13 | 14 | # Fitting a basic xgboost model to the training data 15 | model <- xgboost( 16 | data = x_train, 17 | label = y_train, 18 | nround = 20, 19 | verbose = FALSE 20 | ) 21 | # THIS IS GENERATED FROM MASTER BRANCH 22 | # Prepare the data for explanation 23 | explainer <- shapr(x_train, model,n_combinations = 100) 24 | p = mean(y_train) 25 | gauss = explain(x_test, explainer, "gaussian", prediction_zero = p, n_samples = 10000) 26 | emp = explain(x_test, explainer, "empirical", prediction_zero = p, n_samples = 10000) 27 | copula = explain(x_test, explainer, "copula", prediction_zero = p, n_samples = 10000) 28 | indep = explain(x_test, explainer, "independence", prediction_zero = p, n_samples = 10000) 29 | comb = explain(x_test, explainer, c("gaussian", "gaussian", "empirical", "empirical"), prediction_zero = p, n_samples = 10000) 30 | ctree = explain(x_test, explainer, "ctree", mincriterion = 0.95, prediction_zero = p, n_samples = 10000) 31 | ctree2 = explain(x_test, explainer, "ctree", 
mincriterion = c(0.95, 0.95, 0.95, 0.95), prediction_zero = p, n_samples = 10000)
32 |
33 |
34 | # results from master
35 |
36 | res_master = readRDS("inst/scripts/devel/master_res.rds")
37 |
38 | all.equal(comb$dt, res_master$comb$dt) # TRUE
39 | all.equal(comb$p, res_master$comb$p) # TRUE
40 |
-------------------------------------------------------------------------------- /man/model_checker.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/models.R
3 | \name{model_checker}
4 | \alias{model_checker}
5 | \alias{model_checker.default}
6 | \alias{model_checker.lm}
7 | \alias{model_checker.glm}
8 | \alias{model_checker.ranger}
9 | \alias{model_checker.gam}
10 | \alias{model_checker.xgb.Booster}
11 | \title{Check that the type of model is supported by the explanation method}
12 | \usage{
13 | model_checker(x)
14 |
15 | \method{model_checker}{default}(x)
16 |
17 | \method{model_checker}{lm}(x)
18 |
19 | \method{model_checker}{glm}(x)
20 |
21 | \method{model_checker}{ranger}(x)
22 |
23 | \method{model_checker}{gam}(x)
24 |
25 | \method{model_checker}{xgb.Booster}(x)
26 | }
27 | \arguments{
28 | \item{x}{Model object for the model to be explained.}
29 | }
30 | \value{
31 | Error or NULL
32 | }
33 | \description{
34 | The function checks whether the model given by \code{x} is supported.
35 | If \code{x} is not a supported model the function will return an error message, otherwise it returns NULL
36 | (meaning all models of this class are supported).
37 | }
38 | \details{
39 | See \code{\link{predict_model}} for more information about
40 | what type of models \code{shapr} currently supports.
41 | }
42 | \examples{
43 | if (requireNamespace("MASS", quietly = TRUE)) {
44 |   # Load example data
45 |   data("Boston", package = "MASS")
46 |   # Split data into test- and training data
47 |   x_train <- head(Boston, -3)
48 |   # Fit a linear model
49 |   model <- lm(medv ~ lstat + rm + dis + indus, data = x_train)
50 |
51 |   # Checking the model object
52 |   model_checker(x = model)
53 | }
54 | }
55 | \keyword{internal}
-------------------------------------------------------------------------------- /man/prepare_and_predict.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/explanation.R
3 | \name{prepare_and_predict}
4 | \alias{prepare_and_predict}
5 | \title{Calculate Shapley values}
6 | \usage{
7 | prepare_and_predict(explainer, n_batches, prediction_zero, ...)
8 | }
9 | \arguments{
10 | \item{explainer}{An \code{explainer} object to use for explaining the observations.
11 | See \code{\link{shapr}}.}
12 |
13 | \item{n_batches}{Positive integer.
14 | Specifies how many batches the total number of feature combinations should be split into when calculating the
15 | contribution function for each test observation.
16 | The default value is 1.
17 | Increasing the number of batches may significantly reduce the RAM allocation for models with many features.
18 | This typically comes with a small increase in computation time.}
19 |
20 | \item{prediction_zero}{Numeric. The prediction value for unseen data, typically equal to the mean of
21 | the response.}
22 |
23 | \item{...}{Arguments passed to \code{\link{prepare_data}}, with the exception of \code{only_return_contrib_dt},
24 | which is only passed to \code{explain}. If \code{TRUE}, the
25 | \code{data.table} from \code{\link{prediction}} is returned, else an object of class \code{shapr}.
26 | Each column (except for \code{row_id}) corresponds to the vector \code{v_D} in Equation 7 of the reference.
27 | The Shapley values can be calculated by \code{t(explainer$W \%*\% dt_contrib[, -"row_id"])}}
28 | }
29 | \value{
30 | A list. See \code{\link{explain}} for more information.
31 | }
32 | \description{
33 | Sample covariate values, predict and calculate Shapley values. The sampling and prediction can be done in batches
34 | if \code{n_batches} is greater than 1.
35 | }
36 | \keyword{internal}
37 |
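To make the note on only_return_contrib_dt above concrete, a schematic sketch from the editor (made-up dimensions, plain base R, not package code) of how the contribution matrix maps to Shapley values:

m <- 3                                                # number of features
n_comb <- 2^m                                         # number of feature combinations
n_test <- 2                                           # number of test observations
W <- matrix(rnorm((m + 1) * n_comb), nrow = m + 1)    # stands in for explainer$W
v_D <- matrix(rnorm(n_comb * n_test), nrow = n_comb)  # one contribution vector v_D per test observation
phi <- t(W %*% v_D)  # one row per test observation: phi0, then one Shapley value per feature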
-------------------------------------------------------------------------------- /man/preprocess_data.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/preprocess_data.R
3 | \name{preprocess_data}
4 | \alias{preprocess_data}
5 | \title{Process (check and update) data according to specified feature list}
6 | \usage{
7 | preprocess_data(x, feature_list)
8 | }
9 | \arguments{
10 | \item{x}{matrix, data.frame or data.table. The data to check input for and update
11 | according to the specification in \code{feature_list}.}
12 |
13 | \item{feature_list}{List. Output from running \code{\link[shapr:get_data_specs]{get_data_specs}} or
14 | \code{\link[shapr:get_model_specs]{get_model_specs}}}
15 | }
16 | \value{
17 | List with two named elements: \code{x_dt}: checked and updated data \code{x} in data.table format, and
18 | \code{update_feature_list}: the output from \code{\link[shapr:check_features]{check_features}}
19 | }
20 | \description{
21 | Process (check and update) data according to specified feature list
22 | }
23 | \details{
24 | This function takes care of all preprocessing and checking of the provided data in \code{x} against
25 | the feature_list, which is typically the output from \code{\link[shapr:get_model_specs]{get_model_specs}}
26 | }
27 | \examples{
28 | # Load example data
29 | if (requireNamespace("MASS", quietly = TRUE)) {
30 |   data("Boston", package = "MASS")
31 |   # Split data into test- and training data
32 |   x_train <- data.table::as.data.table(head(Boston))
33 |   x_train[, rad := as.factor(rad)]
34 |   data_features <- get_data_specs(x_train)
35 |   model <- lm(medv ~ lstat + rm + rad + indus, data = x_train)
36 |
37 |   model_features <- get_model_specs(model)
38 |   preprocess_data(x_train, model_features)
39 | }
40 | }
41 | \author{
42 | Martin Jullum
43 | }
44 | \keyword{internal}
-------------------------------------------------------------------------------- /tests/testthat/test-plot.R: --------------------------------------------------------------------------------
1 | context("test-plot.R")
2 |
3 | test_that("Test plot.shapr", {
4 |   if (requireNamespace("ggplot2", quietly = TRUE)) {
5 |
6 |     # Example -----------
7 |     x <- matrix(c(
8 |       4.98, 9.14, 4.03, 2.94, 5.33,
9 |       6.575, 6.421, 7.185, 6.998, 7.147,
10 |       4.0900, 4.9671, 4.9671, 6.0622, 6.0622,
11 |       2.31, 7.07, 7.07, 2.18, 2.18
12 |     ),
13 |     ncol = 4
14 |     )
15 |
16 |     colnames(x) <- c("lstat", "rm", "dis", "indus")
17 |
18 |     explanation <- list()
19 |     explanation$p <- c(31.30145, 23.25194, 33.11547, 33.43015, 31.72984)
20 |     explanation$dt <- data.table::data.table(
21 |       "none" = rep(22.00, 5),
22 |       "lstat" = c(5.2632, 0.1672, 5.9888, 8.2142, 0.5060),
23 |       "rm" = c(-1.2527, -0.7088, 5.5451, 0.7508, 5.6875),
24 |       "dis" = c(0.2920, 0.9689, 0.5660, 0.1893, 0.8432),
25 |       "indus" = c(4.5529, 0.3787, -1.4304, 1.8298, 2.2471)
26 |     )
27 |     explanation$x_test <- x
28 |     explanation$is_groupwise <- FALSE
29 |     attr(explanation, "class") <- c("shapr", "list")
30 |
31 |
32 |     # Test -----------
33 |     p <- plot(explanation, plot_phi0 = FALSE)
34 |
35 |     expect_equal(colnames(x), unique(as.character(p$data$variable)))
36 |     expect_equal(explanation$p, unique(p$data$pred))
37 |     expect_equal(sort(as.vector(as.matrix(explanation$dt[, -c("none")]))), sort(p$data$phi))
38 |
39 |     p <- plot(explanation, plot_phi0 = TRUE)
40 |
41 |     expect_equal(colnames(explanation$dt), unique(as.character(p$data$variable)))
42 |     expect_equal(explanation$p, unique(p$data$pred))
43 |     expect_equal(sort(as.vector(as.matrix(explanation$dt))), sort(p$data$phi))
44 |
45 |     p <- plot(explanation, plot_phi0 = TRUE, top_k_features = 2)
46 |
47 |     expect_equal(2, max(p$data$rank))
48 |   }
49 | })
50 |
-------------------------------------------------------------------------------- /man/check_features.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/preprocess_data.R
3 | \name{check_features}
4 | \alias{check_features}
5 | \title{Checks that two extracted feature lists have exactly the same properties}
6 | \usage{
7 | check_features(f_list_1, f_list_2, use_1_as_truth = T)
8 | }
9 | \arguments{
10 | \item{f_list_1, f_list_2}{List. As extracted from either \code{get_data_specs} or \code{get_model_specs}.}
11 |
12 | \item{use_1_as_truth}{Logical. If TRUE, \code{f_list_2} is compared to \code{f_list_1}, i.e. additional elements
13 | are allowed in \code{f_list_2}, and if \code{f_list_1}'s feature classes contain NAs, the feature class check is
14 | ignored regardless of what is specified in \code{f_list_1}. If FALSE, \code{f_list_1} and \code{f_list_2} are
15 | treated as equals and need to contain exactly the same elements. Set to TRUE when comparing a model and data, and FALSE
16 | when comparing two data sets.}
17 | }
18 | \value{
19 | List. \code{f_list_1} is returned as provided if all checks can be carried out with it. If some info is
20 | missing from \code{f_list_1}, the function continues the consistency checking using \code{f_list_2} and returns that.
21 | }
22 | \description{
23 | Checks that two extracted feature lists have exactly the same properties
24 | }
25 | \examples{
26 | # Load example data
27 | if (requireNamespace("MASS", quietly = TRUE)) {
28 |   data("Boston", package = "MASS")
29 |   # Split data into test- and training data
30 |   x_train <- data.table::as.data.table(head(Boston))
31 |   x_train[, rad := as.factor(rad)]
32 |   data_features <- get_data_specs(x_train)
33 |   model <- lm(medv ~ lstat + rm + rad + indus, data = x_train)
34 |
35 |   model_features <- get_model_specs(model)
36 |   check_features(model_features, data_features)
37 | }
38 | }
39 | \author{
40 | Martin Jullum
41 | }
42 | \keyword{internal}
-------------------------------------------------------------------------------- /man/make_dummies.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/features.R
3 | \name{make_dummies}
4 | \alias{make_dummies}
5 | \title{Initiate the making of dummy variables}
6 | \usage{
7 | make_dummies(traindata, testdata)
8 | }
9 | \arguments{
10 | \item{traindata}{data.table or data.frame.}
11 |
12 | \item{testdata}{data.table or data.frame.
New data that has the same
13 | feature names, types, and levels as \code{traindata}.}
14 | }
15 | \value{
16 | A list that contains the following entries:
17 | \describe{
18 |   \item{feature_list}{List. Output from \code{check_features}}
19 |   \item{train_dummies}{A data.frame containing all of the factors in \code{traindata} as
20 |   one-hot encoded variables.}
21 |   \item{test_dummies}{A data.frame containing all of the factors in \code{testdata} as
22 |   one-hot encoded variables.}
23 |   \item{traindata_new}{Original traindata with correct column ordering and factor levels. To be passed to
24 |   \code{\link[shapr:shapr]{shapr}}.}
25 |   \item{testdata_new}{Original testdata with correct column ordering and factor levels. To be passed to
26 |   \code{\link[shapr:explain]{explain}}.}
27 | }
28 | }
29 | \description{
30 | Initiate the making of dummy variables
31 | }
32 | \examples{
33 | if (requireNamespace("MASS", quietly = TRUE)) {
34 |   data("Boston", package = "MASS")
35 |   x_var <- c("lstat", "rm", "dis", "indus")
36 |   y_var <- "medv"
37 |   x_train <- as.data.frame(Boston[401:411, x_var])
38 |   y_train <- Boston[401:411, y_var]
39 |   x_test <- as.data.frame(Boston[1:4, x_var])
40 |
41 |   # Convert to factors for illustration purposes
42 |   x_train$rm <- factor(round(x_train$rm))
43 |   x_test$rm <- factor(round(x_test$rm), levels = levels(x_train$rm))
44 |
45 |   dummylist <- make_dummies(traindata = x_train, testdata = x_test)
46 | }
47 | }
48 | \author{
49 | Annabelle Redelmeier, Martin Jullum
50 | }
-------------------------------------------------------------------------------- /.github/CONTRIBUTING.md: --------------------------------------------------------------------------------
1 | # Contributing to shapr
2 |
3 | This outlines how to propose a change to shapr.
4 |
5 | ### Fixing typos
6 |
7 | Small typos or grammatical errors in documentation may be edited directly using
8 | the GitHub web interface, so long as the changes are made in the _source_ file.
9 |
10 | * YES: you edit a roxygen comment in a `.R` file below `R/`.
11 | * NO: you edit an `.Rd` file below `man/`.
12 |
13 | ### Prerequisites
14 |
15 | Before you make a substantial pull request, you should always file an issue and
16 | make sure someone from the team agrees that it's a problem. If you've found a
17 | bug, create an associated issue and illustrate the bug with a minimal
18 | [reprex](https://www.tidyverse.org/help/#reprex).
19 |
20 | ### Pull request process
21 |
22 | * We recommend that you create a Git branch for each pull request (PR).
23 | * Look at the CI build status before and after making changes.
24 |   The `README` should contain badges for any continuous integration services used
25 |   by the package.
26 | * New code should follow the tidyverse [style guide](http://style.tidyverse.org).
27 |   You can use the [styler](https://CRAN.R-project.org/package=styler) package to
28 |   apply these styles (see the sketch after this list), but please don't restyle code
29 |   that has nothing to do with your PR.
30 | * We use [roxygen2](https://cran.r-project.org/package=roxygen2) for documentation.
31 | * We use [testthat](https://cran.r-project.org/package=testthat). Contributions
32 |   with test cases included are easier to accept.
33 | * For user-facing changes, add a bullet to the top of `NEWS.md` below the
34 |   current development version header describing the changes made followed by your
35 |   GitHub username, and links to relevant issue(s)/PR(s).
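A rough sketch of the local workflow these bullets describe (assuming the `devtools` and `styler` packages are installed; the file path below is just a placeholder):

```r
styler::style_file("R/my_changed_file.R") # placeholder path: style only the files you touched
devtools::document()                      # regenerate the roxygen2 documentation
devtools::test()                          # run the testthat suite
devtools::check()                         # full R CMD check before opening the PR
```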
36 | 37 | ### Code of Conduct 38 | 39 | Please note that the shapr project is released with a 40 | [Contributor Code of Conduct](CODE_OF_CONDUCT.md). By contributing to this 41 | project you agree to abide by its terms. 42 | -------------------------------------------------------------------------------- /tests/testthat/test-predictions.R: -------------------------------------------------------------------------------- 1 | context("test-predictions.R") 2 | 3 | test_that("Test prediction", { 4 | 5 | # Example ----------- 6 | if (requireNamespace("MASS", quietly = TRUE)) { 7 | data("Boston", package = "MASS") 8 | dt_train <- data.table::as.data.table(Boston) 9 | features <- c("lstat", "rm", "dis", "indus") 10 | n_combinations <- 10 11 | n_features <- 4 12 | prediction_zero <- .5 13 | n_xtest <- 8 14 | explainer <- list() 15 | explainer$model <- stats::lm(formula = "medv ~ lstat + rm + dis + indus", data = head(dt_train, -n_xtest)) 16 | explainer$x_test <- tail(dt_train[, .SD, .SDcols = features], n_xtest) 17 | explainer$W <- matrix(1, nrow = n_features + 1, ncol = n_combinations) 18 | explainer$is_groupwise <- FALSE 19 | explainer$S <- matrix(1, nrow = n_combinations, ncol = n_features) 20 | dt <- dt_train[, .SD, .SDcols = features][rep(1:.N, 4)] 21 | dt[, id := rep_len(1:n_xtest, .N)] 22 | dt[, id_combination := rep_len(1:n_combinations, .N), id] 23 | dt[, w := runif(.N)] 24 | max_id_combination <- dt[, max(id_combination)] 25 | dt <- dt[!(id_combination == max_id_combination)] 26 | dt_lastrows <- data.table::data.table( 27 | explainer$x_test, 28 | id = 1:n_xtest, 29 | id_combination = max_id_combination, 30 | w = 1.0 31 | ) 32 | dt <- rbind(dt, dt_lastrows, dt_lastrows, dt_lastrows) 33 | 34 | x <- prediction(dt, prediction_zero, explainer) 35 | 36 | # Test ----------- 37 | lnms <- c("p", "dt_mat") 38 | expect_equal(class(x), "list") 39 | expect_equal(names(x), lnms) 40 | expect_equal(x$p, predict_model(explainer$model, explainer$x_test)) 41 | expect_true(data.table::is.data.table(x$dt_mat)) 42 | 43 | # t(W %*% x$dt_mat) = shapley values 44 | expect_equal(ncol(explainer$W), nrow(x$dt_mat)) 45 | 46 | # Tests errors 47 | expect_error(prediction(dt[id < n_xtest], prediction_zero, explainer)) 48 | } 49 | }) 50 | -------------------------------------------------------------------------------- /man/observation_impute.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/observations.R 3 | \name{observation_impute} 4 | \alias{observation_impute} 5 | \title{Generate permutations of training data using test observations} 6 | \usage{ 7 | observation_impute( 8 | W_kernel, 9 | S, 10 | x_train, 11 | x_test, 12 | w_threshold = 0.7, 13 | n_samples = 1000 14 | ) 15 | } 16 | \arguments{ 17 | \item{W_kernel}{Numeric matrix. Contains all nonscaled weights between training and test 18 | observations for all feature combinations. The dimension equals \code{n_train x m}.} 19 | 20 | \item{S}{Integer matrix of dimension \code{n_combinations x m}, where \code{n_combinations} 21 | and \code{m} equals the total number of sampled/non-sampled feature combinations and 22 | the total number of unique features, respectively. 
Note that \code{m = ncol(x_train)}.} 23 | 24 | \item{x_train}{Numeric matrix} 25 | 26 | \item{x_test}{Numeric matrix} 27 | 28 | \item{w_threshold}{Numeric vector of length 1, with \code{0 < w_threshold <= 1} representing the minimum proportion 29 | of the total empirical weight that data samples should use. If e.g. \code{w_threshold = .8} we will choose the 30 | \code{K} samples with the largest weight so that the sum of the weights accounts for 80\% of the total weight. 31 | \code{w_threshold} is the \eqn{\eta} parameter in equation (15) of Aas et al (2021).} 32 | 33 | \item{n_samples}{Positive integer. Indicating the maximum number of samples to use in the 34 | Monte Carlo integration for every conditional expectation. See also details.} 35 | } 36 | \value{ 37 | data.table 38 | } 39 | \description{ 40 | Generate permutations of training data using test observations 41 | } 42 | \references{ 43 | Aas, K., Jullum, M., & Løland, A. (2021). Explaining individual predictions when features are dependent: 44 | More accurate approximations to Shapley values. Artificial Intelligence, 298, 103502. 45 | } 46 | \author{ 47 | Nikolai Sellereite 48 | } 49 | \keyword{internal} 50 | -------------------------------------------------------------------------------- /man/get_model_specs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/models.R 3 | \name{get_model_specs} 4 | \alias{get_model_specs} 5 | \alias{get_model_specs.default} 6 | \alias{get_model_specs.lm} 7 | \alias{get_model_specs.glm} 8 | \alias{get_model_specs.gam} 9 | \alias{get_model_specs.ranger} 10 | \alias{get_model_specs.xgb.Booster} 11 | \title{Fetches feature information from a given model object} 12 | \usage{ 13 | get_model_specs(x) 14 | 15 | \method{get_model_specs}{default}(x) 16 | 17 | \method{get_model_specs}{lm}(x) 18 | 19 | \method{get_model_specs}{glm}(x) 20 | 21 | \method{get_model_specs}{gam}(x) 22 | 23 | \method{get_model_specs}{ranger}(x) 24 | 25 | \method{get_model_specs}{xgb.Booster}(x) 26 | } 27 | \arguments{ 28 | \item{x}{Model object for the model to be explained.} 29 | } 30 | \value{ 31 | A list with the following elements: 32 | \describe{ 33 | \item{labels}{character vector with the feature names to compute Shapley values for} 34 | \item{classes}{a named character vector with the labels as names and the class type as elements} 35 | \item{factor_levels}{a named list with the labels as names and character vectors with the factor levels as elements 36 | (NULL if the feature is not a factor)} 37 | } 38 | } 39 | \description{ 40 | Fetches feature information from a given model object 41 | } 42 | \details{ 43 | This function is used to extract the feature information to be checked against data passed to \code{shapr} 44 | and \code{explain}. The function is called from \code{preprocess_data}. 
45 | }
46 | \examples{
47 | if (requireNamespace("MASS", quietly = TRUE)) {
48 |   # Load example data
49 |   data("Boston", package = "MASS")
50 |   # Split data into test- and training data
51 |   x_train <- data.table::as.data.table(head(Boston))
52 |   x_train[, rad := as.factor(rad)]
53 |   model <- lm(medv ~ lstat + rm + rad + indus, data = x_train)
54 |
55 |   get_model_specs(model)
56 | }
57 | }
58 | \author{
59 | Martin Jullum
60 | }
61 | \keyword{internal}
62 |
-------------------------------------------------------------------------------- /R/transformation.R: --------------------------------------------------------------------------------
1 | #' Transforms new data to a standardized normal distribution
2 | #'
3 | #' @param zx Numeric vector. The first \code{n_z} items are the Gaussian data, and the last part is
4 | #' the data with the original transformation.
5 | #' @param n_z Positive integer. Number of elements of \code{zx} that belong to the new data.
6 | #'
7 | #' @return Numeric vector of length \code{n_z}
8 | #'
9 | #' @keywords internal
10 | #'
11 | #' @author Martin Jullum
12 | inv_gaussian_transform <- function(zx, n_z) {
13 |   if (n_z >= length(zx)) stop("n_z should be less than length(zx)")
14 |   ind <- 1:n_z
15 |   z <- zx[ind]
16 |   x <- zx[-ind]
17 |   u <- stats::pnorm(z)
18 |   x_new <- stats::quantile(x, probs = u)
19 |   return(as.double(x_new))
20 | }
21 |
22 | #' Transforms new data to standardized normal (dimension 1) based on other data transformations
23 | #'
24 | #' @param yx Numeric vector. The first \code{n_y} items are the data to be transformed, and the last
25 | #' part is the data with the original transformation.
26 | #' @param n_y Positive integer. Number of elements of \code{yx} that belong to the Gaussian data.
27 | #'
28 | #' @return Vector of Gaussian-transformed data
29 | #'
30 | #' @keywords internal
31 | #'
32 | #' @author Martin Jullum
33 | gaussian_transform_separate <- function(yx, n_y) {
34 |   if (n_y >= length(yx)) stop("n_y should be less than length(yx)")
35 |   ind <- 1:n_y
36 |   x <- yx[-ind]
37 |   tmp <- rank(yx)[ind]
38 |   tmp <- tmp - rank(tmp) + 0.5
39 |   u_y <- tmp / (length(x) + 1)
40 |   z_y <- stats::qnorm(u_y)
41 |   return(z_y)
42 | }
43 |
44 | #' Transforms a sample to standardized normal distribution
45 | #'
46 | #' @param x Numeric vector. The data which should be transformed to a standard normal distribution.
47 | #'
48 | #' @return Numeric vector of length \code{length(x)}
49 | #'
50 | #' @keywords internal
51 | #'
52 | #' @author Martin Jullum
53 | gaussian_transform <- function(x) {
54 |   u <- rank(x) / (length(x) + 1)
55 |   z <- stats::qnorm(u)
56 |   return(z)
57 | }
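An editor's sketch of how these internal transforms fit together (assumed usage, not from the package sources; the functions are unexported, hence shapr:::):

set.seed(1)
x <- rexp(100)                     # skewed "original scale" data
z <- shapr:::gaussian_transform(x) # rank-based map to a standard normal
z_new <- rnorm(10)                 # new draws on the Gaussian scale
# the first n_z elements are the Gaussian data, the rest carry the original distribution:
x_new <- shapr:::inv_gaussian_transform(c(z_new, x), n_z = 10)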
-------------------------------------------------------------------------------- /man/prediction.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/predictions.R
3 | \name{prediction}
4 | \alias{prediction}
5 | \title{Calculate Shapley weights for test data}
6 | \usage{
7 | prediction(dt, prediction_zero, explainer)
8 | }
9 | \arguments{
10 | \item{dt}{data.table}
11 |
12 | \item{prediction_zero}{Numeric. The value to use for \code{phi_0}.}
13 |
14 | \item{explainer}{An object of class \code{explainer}. See \code{\link{shapr}}.}
15 | }
16 | \value{
17 | An object of class \code{c("shapr", "list")}. For more details see \code{\link{explain}}.
18 | }
19 | \description{
20 | This function should only be called internally, and not be used as
21 | a stand-alone function.
22 | }
23 | \details{
24 | If \code{dt} does not contain three columns called \code{id}, \code{id_combination} and \code{w}
25 | the function will fail. \code{id} represents a unique key for a given test observation,
26 | and \code{id_combination} is a unique key for which feature combination the row represents. \code{w}
27 | represents the Shapley weight of the feature combination given by \code{id_combination}. In addition
28 | to these three columns, \code{dt} should also have columns which match the variables used
29 | when training the model.
30 |
31 | I.e. if you have fitted a linear model using the features \code{x1},
32 | \code{x2} and \code{x3}, and you want to explain 5 test observations using the exact method, i.e.
33 | setting \code{exact = TRUE} in \code{\link{shapr}}, the following properties should be satisfied
34 | \enumerate{
35 |   \item \code{colnames(dt)} equals \code{c("x1", "x2", "x3", "id", "id_combination", "w")}
36 |   \item \code{dt[, max(id)]} equals the number of test observations
37 |   \item \code{dt[, min(id)]} equals 1L.
38 |   \item \code{dt[, max(id_combination)]} equals \code{2^m} where m equals the number of features.
39 |   \item \code{dt[, min(id_combination)]} equals 1L.
40 |   \item \code{dt[, typeof(w)]} equals \code{double}.
41 | }
42 | }
43 | \author{
44 | Nikolai Sellereite
45 | }
46 | \keyword{internal}
-------------------------------------------------------------------------------- /man/observation_impute_cpp.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/RcppExports.R
3 | \name{observation_impute_cpp}
4 | \alias{observation_impute_cpp}
5 | \title{Get imputed data}
6 | \usage{
7 | observation_impute_cpp(index_xtrain, index_s, xtrain, xtest, S)
8 | }
9 | \arguments{
10 | \item{index_xtrain}{Positive integer. Represents a sequence of row indices from \code{xtrain},
11 | i.e. \code{min(index_xtrain) >= 1} and \code{max(index_xtrain) <= nrow(xtrain)}.}
12 |
13 | \item{index_s}{Positive integer. Represents a sequence of row indices from \code{S},
14 | i.e. \code{min(index_s) >= 1} and \code{max(index_s) <= nrow(S)}.}
15 |
16 | \item{xtrain}{Numeric matrix.}
17 |
18 | \item{xtest}{Numeric matrix. Represents a single test observation.}
19 |
20 | \item{S}{Integer matrix of dimension \code{n_combinations x m}, where \code{n_combinations} equals
21 | the total number of sampled/non-sampled feature combinations and \code{m} equals
22 | the total number of unique features. Note that \code{m = ncol(xtrain)}. See details
23 | for more information.}
24 | }
25 | \value{
26 | Numeric matrix
27 | }
28 | \description{
29 | Get imputed data
30 | }
31 | \details{
32 | \code{S(i, j) = 1} if and only if feature \code{j} is present in feature
33 | combination \code{i}, otherwise \code{S(i, j) = 0}. I.e. if \code{m = 3}, there
34 | are \code{2^3 = 8} unique ways to combine the features. In this case \code{dim(S) = c(8, 3)}.
35 | Let's call the features \code{x1, x2, x3} and take a closer look at the combination
36 | represented by \code{s = c(x1, x2)}. If this combination is represented by the second row,
37 | the following is true: \code{S[2, 1:3] = c(1, 1, 0)}.
38 |
39 | The returned object, \code{X}, is a numeric matrix where
40 | \code{dim(X) = c(length(index_xtrain), ncol(xtrain))}. If feature \code{j} is present in
41 | the k-th observation, that is \code{S[index_s[k], j] == 1}, then \code{X[k, j] = xtest[1, j]}.
42 | Otherwise \code{X[k, j] = xtrain[index_xtrain[k], j]}.
43 | }
44 | \author{
45 | Nikolai Sellereite
46 | }
47 | \keyword{internal}
48 |
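To make the S encoding above concrete, an editor's illustration (plain base R, not package code), with the second row representing the combination s = c(x1, x2) as in the text:

m <- 3
features <- list(integer(0), c(1, 2), 1, 2, 3, c(1, 3), c(2, 3), 1:3)
S <- matrix(0L, nrow = length(features), ncol = m)
for (i in seq_along(features)) S[i, features[[i]]] <- 1L
S[2, ] # c(1, 1, 0): features 1 and 2 are taken from xtest, feature 3 from xtrain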
-------------------------------------------------------------------------------- /tests/testthat/test-src_weighted_matrix.R: --------------------------------------------------------------------------------
1 | context("test-src_weighted_matrix.R")
2 |
3 | test_that("Test weight_matrix_cpp", {
4 |
5 |   ## Example -----------
6 |   m <- 3
7 |   n <- 2^m
8 |   subsets <- unlist(
9 |     lapply(
10 |       0:m,
11 |       utils::combn,
12 |       x = m,
13 |       simplify = FALSE
14 |     ),
15 |     recursive = FALSE
16 |   )
17 |   w_all <- shapley_weights(m, choose(m, 0:m), 0:m)
18 |   w_all[!is.finite(w_all)] <- 10^6
19 |   w <- w_all[sapply(subsets, length) + 1]
20 |   x <- weight_matrix_cpp(
21 |     subsets = subsets,
22 |     m = m,
23 |     n = n,
24 |     w = w
25 |   )
26 |
27 |   ## Exact results -----------
28 |   Z <- matrix(0, nrow = n, ncol = m + 1)
29 |   Z[, 1] <- 1
30 |   for (i in seq_along(subsets)) {
31 |     f <- subsets[[i]]
32 |     if (length(f) > 0) {
33 |       Z[i, f + 1] <- 1
34 |     }
35 |   }
36 |   W <- matrix(0, nrow = n, ncol = n)
37 |   diag(W) <- w
38 |   res <- solve(t(Z) %*% W %*% Z) %*% (t(Z) %*% W)
39 |
40 |   ## Test results -----------
41 |   expect_true(is.matrix(x))
42 |   expect_true(is.double(x))
43 |   expect_equal(nrow(x), m + 1)
44 |   expect_equal(ncol(x), n)
45 |   expect_equal(x, res)
46 | })
47 |
48 | test_that("Test feature_matrix_cpp", {
49 |
50 |   ## Example -----------
51 |   features <- list(
52 |     integer(0),
53 |     1:2,
54 |     10,
55 |     4:8,
56 |     3:7
57 |   )
58 |   m <- 10
59 |   x <- feature_matrix_cpp(features, m)
60 |
61 |   ## Test results -----------
62 |   expect_true(is.matrix(x))
63 |   expect_equal(ncol(x), m)
64 |   expect_equal(nrow(x), length(features))
65 |   expect_true(max(x) <= 1)
66 |   expect_true(min(x) >= 0)
67 |   expect_equal(sapply(features, length), rowSums(x))
68 |   for (i in seq_along(features)) {
69 |     feature_i <- features[[i]]
70 |     n_features <- length(feature_i)
71 |     if (n_features == 0) {
72 |       expect_equal(x[i, ], rep(0, m))
73 |     } else {
74 |       expect_equal(x[i, feature_i], rep(1, n_features))
75 |       expect_equal(x[i, -feature_i], rep(0, m - n_features))
76 |     }
77 |   }
78 |   expect_error(feature_matrix_cpp(list(1, 2:3), 3))
79 | })
-------------------------------------------------------------------------------- /DESCRIPTION: --------------------------------------------------------------------------------
1 | Package: shapr
2 | Version: 0.2.0.9002
3 | Title: Prediction Explanation with Dependence-Aware Shapley Values
4 | Description: Complex machine learning models are often hard to interpret. However, in
5 |     many situations it is crucial to understand and explain why a model made a specific
6 |     prediction. Shapley values are the only prediction explanation framework with a solid
7 |     theoretical foundation. Previously known methods for estimating the Shapley
8 |     values do, however, assume feature independence. This package implements the method
9 |     described in Aas, Jullum and Løland (2019), which accounts for any feature
10 |     dependence, and thereby produces more accurate estimates of the true Shapley values.
11 | Authors@R: c( 12 | person("Nikolai", "Sellereite", email = "nikolaisellereite@gmail.com", role = "aut", comment = c(ORCID = "0000-0002-4671-0337")), 13 | person("Martin", "Jullum", email = "Martin.Jullum@nr.no", role = c("cre", "aut"), comment = c(ORCID = "0000-0003-3908-5155")), 14 | person("Annabelle", "Redelmeier", email = "Annabelle.Redelmeier@nr.no", role = "aut"), 15 | person("Anders", "Løland", email = "Anders.Loland@nr.no", role = "ctb"), 16 | person("Jens Christian", "Wahl", email = "Jens.Christian.Wahl@nr.no", role = "ctb"), 17 | person("Camilla", "Lingjærde", role = "ctb"), 18 | person("Norsk Regnesentral", role = c("cph", "fnd")) 19 | ) 20 | URL: https://norskregnesentral.github.io/shapr/, https://github.com/NorskRegnesentral/shapr 21 | BugReports: https://github.com/NorskRegnesentral/shapr/issues 22 | License: MIT + file LICENSE 23 | Encoding: UTF-8 24 | LazyData: true 25 | ByteCompile: true 26 | Language: en-US 27 | RoxygenNote: 7.1.2 28 | Depends: R (>= 3.5.0) 29 | Imports: 30 | stats, 31 | data.table, 32 | Rcpp (>= 0.12.15), 33 | condMVNorm, 34 | mvnfast, 35 | Matrix 36 | Suggests: 37 | ranger, 38 | xgboost, 39 | mgcv, 40 | testthat, 41 | knitr, 42 | rmarkdown, 43 | roxygen2, 44 | MASS, 45 | ggplot2, 46 | caret, 47 | gbm, 48 | party, 49 | partykit 50 | LinkingTo: 51 | RcppArmadillo, 52 | Rcpp 53 | VignetteBuilder: knitr 54 | -------------------------------------------------------------------------------- /tests/testthat/test-src_impute_data.R: -------------------------------------------------------------------------------- 1 | context("test-src_impute_data.R") 2 | 3 | test_that("Test observation_impute_cpp", { 4 | 5 | # Example data ----------- 6 | if (requireNamespace("datasets", quietly = TRUE)) { 7 | data("mtcars", package = "datasets") 8 | rownames(mtcars) <- NULL 9 | mtcars <- as.matrix(mtcars) 10 | 11 | # Example ----------- 12 | m <- 3 13 | n_combinations <- 2^m 14 | mtcars <- mtcars[1:15, seq(m)] 15 | ntrain <- 14 16 | xtrain <- mtcars[seq(ntrain), ] 17 | xtest <- mtcars[-seq(ntrain), , drop = FALSE] 18 | S <- matrix(0L, n_combinations, m) 19 | features <- list( 20 | integer(), 1, 2, 3, c(1, 2), c(1, 3), c(2, 3), c(1, 2, 3) 21 | ) 22 | for (i in seq_along(features)) { 23 | feature_i <- features[[i]] 24 | if (length(feature_i) > 0) { 25 | S[i, features[[i]]] <- 1L 26 | } 27 | } 28 | 29 | # Tests (invalid input) ----------- 30 | expect_error( 31 | observation_impute_cpp( 32 | index_xtrain = c(1, 2), 33 | index_s = c(1, 2, 3), 34 | xtrain = xtrain, 35 | xtest = xtest, 36 | S = S 37 | ) 38 | ) 39 | expect_error( 40 | observation_impute_cpp( 41 | index_xtrain = c(1, 2), 42 | index_s = c(2, 3), 43 | xtrain = xtrain[, 1:2], 44 | xtest = xtest, 45 | S = S 46 | ) 47 | ) 48 | 49 | # Tests (valid input) ----------- 50 | index_xtrain <- c(1, 2) 51 | index_s <- c(4, 5) 52 | x <- observation_impute_cpp( 53 | index_xtrain = index_xtrain, 54 | index_s = index_s, 55 | xtrain = xtrain, 56 | xtest = xtest, 57 | S = S 58 | ) 59 | 60 | expect_equal(nrow(x), length(index_s)) 61 | expect_equal(ncol(x), ncol(xtrain)) 62 | expect_true(is.matrix(x)) 63 | expect_true(is.double(x)) 64 | 65 | for (i in 1:nrow(x)) { 66 | feature_i <- features[[index_s[i]]] 67 | 68 | for (j in seq(m)) { 69 | if (j %in% feature_i) { 70 | expect_equal(x[i, j], unname(xtest[1, j])) 71 | } else { 72 | expect_equal(x[i, j], unname(xtrain[index_xtrain[i], j])) 73 | } 74 | } 75 | } 76 | } 77 | }) 78 | -------------------------------------------------------------------------------- /man/prepare_data.Rd: 
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/observations.R
3 | \name{prepare_data}
4 | \alias{prepare_data}
5 | \alias{prepare_data.independence}
6 | \alias{prepare_data.empirical}
7 | \alias{prepare_data.gaussian}
8 | \alias{prepare_data.copula}
9 | \alias{prepare_data.ctree}
10 | \title{Generate data used for predictions}
11 | \usage{
12 | prepare_data(x, ...)
13 |
14 | \method{prepare_data}{independence}(x, index_features = NULL, ...)
15 |
16 | \method{prepare_data}{empirical}(x, index_features = NULL, ...)
17 |
18 | \method{prepare_data}{gaussian}(x, index_features = NULL, ...)
19 |
20 | \method{prepare_data}{copula}(x, index_features = NULL, ...)
21 |
22 | \method{prepare_data}{ctree}(
23 |   x,
24 |   index_features = NULL,
25 |   mc_cores = 1,
26 |   mc_cores_create_ctree = mc_cores,
27 |   mc_cores_sample_ctree = mc_cores,
28 |   ...
29 | )
30 | }
31 | \arguments{
32 | \item{x}{Explainer object. See \code{\link{explain}} for more information.}
33 |
34 | \item{...}{Currently not used.}
35 |
36 | \item{index_features}{List. Default is NULL, but if either various methods are being used or various mincriterion are
37 | used for different numbers of conditioned features, this will be a list with the features to pass.}
38 |
39 | \item{mc_cores}{Integer. Only for class \code{ctree} currently. The number of cores to use in parallelization of the
40 | tree building (\code{create_ctree}) and tree sampling (\code{sample_ctree}). Defaults to 1. Note: Uses
41 | parallel::mclapply which relies on forking, i.e. uses only 1 core on Windows systems.}
42 |
43 | \item{mc_cores_create_ctree}{Integer. Same as \code{mc_cores}, but specific for the tree building function.
44 | Defaults to \code{mc_cores}.}
45 |
46 | \item{mc_cores_sample_ctree}{Integer. Same as \code{mc_cores}, but specific for the tree sampling function.
47 | Defaults to \code{mc_cores}.}
48 |
49 | \item{seed}{Positive integer. If \code{NULL} the seed will be inherited from the calling environment.}
50 | }
51 | \value{
52 | A `data.table` containing simulated data passed to \code{\link{prediction}}.
53 | }
54 | \description{
55 | Generate data used for predictions
56 | }
57 | \keyword{internal}
-------------------------------------------------------------------------------- /man/plot.shapr.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/plot.R
3 | \name{plot.shapr}
4 | \alias{plot.shapr}
5 | \title{Plot of the Shapley value explanations}
6 | \usage{
7 | \method{plot}{shapr}(
8 |   x,
9 |   digits = 3,
10 |   plot_phi0 = TRUE,
11 |   index_x_test = NULL,
12 |   top_k_features = NULL,
13 |   ...
14 | )
15 | }
16 | \arguments{
17 | \item{x}{A \code{shapr} object. See \code{\link{explain}}.}
18 |
19 | \item{digits}{Integer. Number of significant digits to use in the feature description}
20 |
21 | \item{plot_phi0}{Logical. Whether to include \code{phi0} in the plot}
22 |
23 | \item{index_x_test}{Integer vector. Which of the test observations to plot. E.g. if you have
24 | explained 10 observations using \code{\link{explain}}, you can generate a plot for the first 5
25 | observations by setting \code{index_x_test = 1:5}.}
26 |
27 | \item{top_k_features}{Integer. How many features to include in the plot.
E.g. if you have 15
28 | features in your model you can plot the 5 most important features, for each explanation, by setting
29 | \code{top_k_features = 5}.}
30 |
31 | \item{...}{Currently not used.}
32 | }
33 | \value{
34 | ggplot object with plots of the Shapley value explanations
35 | }
36 | \description{
37 | Plots the individual prediction explanations.
38 | }
39 | \details{
40 | See \code{vignette("understanding_shapr", package = "shapr")} for an example of
41 | how you should use the function.
42 | }
43 | \examples{
44 | if (requireNamespace("MASS", quietly = TRUE)) {
45 |   # Load example data
46 |   data("Boston", package = "MASS")
47 |
48 |   # Split data into test- and training data
49 |   x_train <- head(Boston, -3)
50 |   x_test <- tail(Boston, 3)
51 |
52 |   # Fit a linear model
53 |   model <- lm(medv ~ lstat + rm + dis + indus, data = x_train)
54 |
55 |   # Create an explainer object
56 |   explainer <- shapr(x_train, model)
57 |
58 |   # Explain predictions
59 |   p <- mean(x_train$medv)
60 |
61 |   # Empirical approach
62 |   explanation <- explain(x_test,
63 |     explainer,
64 |     approach = "empirical",
65 |     prediction_zero = p,
66 |     n_samples = 1e2
67 |   )
68 |
69 |   if (requireNamespace("ggplot2", quietly = TRUE)) {
70 |     # Plot the explanation (this function)
71 |     plot(explanation)
72 |   }
73 | }
74 | }
75 | \author{
76 | Martin Jullum
77 | }
-------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: --------------------------------------------------------------------------------
1 | # Nov 11th, 2021: THIS IS A MODIFIED VERSION OF THE check-full VERSION AS DESCRIBED BELOW
2 | # IT ADDS cranversion TO THE LIST OF BRANCHES TO RUN ON AND REDUCES THE NUMBER OF CONFIGS, OTHERWISE IDENTICAL
3 |
4 | # Workflow derived from https://github.com/r-lib/actions/tree/master/examples
5 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
6 | #
7 | # NOTE: This workflow is overkill for most R packages and
8 | # check-standard.yaml is likely a better choice.
9 | # usethis::use_github_action("check-standard") will install it.
10 | 11 | 12 | on: 13 | push: 14 | branches: [main, master, cranversion] 15 | pull_request: 16 | branches: [main, master, cranversion] 17 | 18 | name: R-CMD-check 19 | 20 | jobs: 21 | R-CMD-check: 22 | runs-on: ${{ matrix.config.os }} 23 | 24 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 25 | 26 | strategy: 27 | fail-fast: false 28 | matrix: 29 | config: 30 | - {os: macOS-latest, r: 'release'} 31 | 32 | - {os: windows-latest, r: 'release'} 33 | 34 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 35 | - {os: ubuntu-latest, r: 'release'} 36 | - {os: ubuntu-latest, r: 'oldrel-1'} 37 | - {os: ubuntu-latest, r: 'oldrel-2'} 38 | # - {os: ubuntu-latest, r: 'oldrel-3'} 39 | 40 | env: 41 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 42 | R_KEEP_PKG_SOURCE: yes 43 | 44 | steps: 45 | - uses: actions/checkout@v2 46 | 47 | - uses: r-lib/actions/setup-pandoc@v1 48 | 49 | - uses: r-lib/actions/setup-r@v1 50 | with: 51 | r-version: ${{ matrix.config.r }} 52 | http-user-agent: ${{ matrix.config.http-user-agent }} 53 | use-public-rspm: true 54 | 55 | - uses: r-lib/actions/setup-r-dependencies@v1 56 | with: 57 | extra-packages: rcmdcheck 58 | 59 | - uses: r-lib/actions/check-r-package@v1 60 | 61 | - name: Show testthat output 62 | if: always() 63 | run: find check -name 'testthat.Rout*' -exec cat '{}' \; || true 64 | shell: bash 65 | 66 | - name: Upload check results 67 | if: failure() 68 | uses: actions/upload-artifact@main 69 | with: 70 | name: ${{ runner.os }}-r${{ matrix.config.r }}-results 71 | path: check 72 | -------------------------------------------------------------------------------- /man/feature_combinations.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/features.R 3 | \name{feature_combinations} 4 | \alias{feature_combinations} 5 | \title{Define feature combinations, and fetch additional information about each unique combination} 6 | \usage{ 7 | feature_combinations( 8 | m, 9 | exact = TRUE, 10 | n_combinations = 200, 11 | weight_zero_m = 10^6, 12 | group_num = NULL 13 | ) 14 | } 15 | \arguments{ 16 | \item{m}{Positive integer. Total number of features.} 17 | 18 | \item{exact}{Logical. If \code{TRUE} all \code{2^m} combinations are generated, otherwise a 19 | subsample of the combinations is used.} 20 | 21 | \item{n_combinations}{Positive integer. Note that if \code{exact = TRUE}, 22 | \code{n_combinations} is ignored. However, if \code{m > 12} you'll need to add a positive integer 23 | value for \code{n_combinations}.} 24 | 25 | \item{weight_zero_m}{Numeric. The value to use as a replacement for infinite combination 26 | weights when doing numerical operations.} 27 | 28 | \item{group_num}{List. Contains vector of integers indicating the feature numbers for the 29 | different groups.} 30 | } 31 | \value{ 32 | A data.table that contains the following columns: 33 | \describe{ 34 | \item{id_combination}{Positive integer. Represents a unique key for each combination. Note that the table 35 | is sorted by \code{id_combination}, so that is always equal to \code{x[["id_combination"]] = 1:nrow(x)}.} 36 | \item{features}{List. Each item of the list is an integer vector where \code{features[[i]]} 37 | represents the indices of the features included in combination \code{i}. Note that all the items 38 | are sorted such that \code{features[[i]] == sort(features[[i]])} is always true.} 39 | \item{n_features}{Vector of positive integers. 
\code{n_features[i]} equals the number of features in combination
40 | \code{i}, i.e. \code{n_features[i] = length(features[[i]])}.}
41 | \item{N}{Positive integer. The number of unique ways to sample \code{n_features[i]} features
42 | from \code{m} different features, without replacement.}
43 | }
44 | \description{
45 | Define feature combinations, and fetch additional information about each unique combination
46 | }
47 | \examples{
48 | # All combinations
49 | x <- feature_combinations(m = 3)
50 | nrow(x) # Equals 2^3 = 8
51 |
52 | # Subsample of combinations
53 | x <- feature_combinations(exact = FALSE, m = 10, n_combinations = 1e2)
54 | }
55 | \author{
56 | Nikolai Sellereite, Martin Jullum
57 | }
58 |
-------------------------------------------------------------------------------- /man/sample_ctree.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/sampling.R
3 | \name{sample_ctree}
4 | \alias{sample_ctree}
5 | \title{Sample ctree variables from a given conditional inference tree}
6 | \usage{
7 | sample_ctree(tree, n_samples, x_test, x_train, p, sample)
8 | }
9 | \arguments{
10 | \item{tree}{List. Contains tree which is an object of type ctree built from the party package.
11 | Also contains given_ind, the features to condition upon.}
12 |
13 | \item{n_samples}{Numeric. Indicates how many samples to use for MCMC.}
14 |
15 | \item{x_test}{Matrix, data.frame or data.table with the features of the observation whose
16 | predictions ought to be explained (test data). Dimension \code{1xp} or \code{px1}.}
17 |
18 | \item{x_train}{Matrix, data.frame or data.table with training data.}
19 |
20 | \item{p}{Positive integer. The number of features.}
21 |
22 | \item{sample}{Boolean.
\code{TRUE} indicates that the method samples from the terminal node
23 | of the tree, whereas \code{FALSE} indicates that the method uses all the observations in
24 | the node if there are fewer than \code{n_samples} of them.}
25 | }
26 | \value{
27 | data.table with \code{n_samples} (conditional) samples
28 | }
29 | \description{
30 | Sample ctree variables from a given conditional inference tree
31 | }
32 | \examples{
33 | if (requireNamespace("MASS", quietly = TRUE) & requireNamespace("party", quietly = TRUE)) {
34 |   m <- 10
35 |   n <- 40
36 |   n_samples <- 50
37 |   mu <- rep(1, m)
38 |   cov_mat <- cov(matrix(rnorm(n * m), n, m))
39 |   x_train <- data.table::data.table(MASS::mvrnorm(n, mu, cov_mat))
40 |   x_test <- MASS::mvrnorm(1, mu, cov_mat)
41 |   x_test_dt <- data.table::setDT(as.list(x_test))
42 |   given_ind <- c(4, 7)
43 |   dependent_ind <- (1:dim(x_train)[2])[-given_ind]
44 |   x <- x_train[, given_ind, with = FALSE]
45 |   y <- x_train[, dependent_ind, with = FALSE]
46 |   df <- data.table::data.table(cbind(y, x))
47 |   colnames(df) <- c(paste0("Y", 1:ncol(y)), paste0("V", given_ind))
48 |   ynam <- paste0("Y", 1:ncol(y))
49 |   fmla <- as.formula(paste(paste(ynam, collapse = "+"), "~ ."))
50 |   datact <- party::ctree(fmla, data = df, controls = party::ctree_control(
51 |     minbucket = 7,
52 |     mincriterion = 0.95
53 |   ))
54 |   tree <- list(tree = datact, given_ind = given_ind, dependent_ind = dependent_ind)
55 |   shapr:::sample_ctree(
56 |     tree = tree, n_samples = n_samples, x_test = x_test_dt, x_train = x_train,
57 |     p = length(x_test), sample = TRUE
58 |   )
59 | }
60 | }
61 | \author{
62 | Annabelle Redelmeier
63 | }
64 | \keyword{internal}
65 |
-------------------------------------------------------------------------------- /cran-comments.md: --------------------------------------------------------------------------------
1 |
2 | # Minor release, shapr 0.2.0
3 |
4 | * Adds the ctree approach from a new paper
5 | * Simplified support for custom models
6 | * Adds a comprehensive check suite for feature consistency
7 |
8 | ## Test environments
9 |
10 | * GitHub Actions (windows-latest): R 4.0
11 | * GitHub Actions (ubuntu-16.04): R 4.0, 3.6, 3.5
12 | * GitHub Actions (macOS-latest): R-release, 4.0
13 | * win-builder (x86_64-w64-mingw32): R 4.0, 3.6, R-devel
14 | * local Ubuntu 18.04: R 3.6
15 | * local Windows 10: R 4.0
16 | * R-hub (windows-x86_64-devel): R-devel
17 | * R-hub (macos-highsierra-release-cran): R-release
18 |
19 | * local Ubuntu 18.04: R 3.6 (without packages in Suggests):
20 | ```devtools::check(vignettes = FALSE, env_vars=c(`_R_CHECK_DEPENDS_ONLY_` = "true"))```
21 |
22 | ## R CMD check results
23 |
24 | There were no ERRORs or WARNINGs.
25 |
26 | There were 2 NOTEs
27 |
28 | *NOTE 1 (on local Windows 10: R 4.0):
29 |
30 | Note: information on .o files for i386 is not available
31 | Note: information on .o files for x64 is not available
32 | File 'C:/Users/jullum/Dropbox/Local_work/Git/shapr.Rcheck/shapr/libs/i386/shapr.dll':
33 |   Found '_exit', possibly from '_exit' (C)
34 |   Found 'abort', possibly from 'abort' (C), 'runtime' (Fortran)
35 |   Found 'exit', possibly from 'exit' (C), 'stop' (Fortran)
36 |   Found 'printf', possibly from 'printf' (C)
37 | File 'C:/Users/jullum/Dropbox/Local_work/Git/shapr.Rcheck/shapr/libs/x64/shapr.dll':
38 |   Found '_exit', possibly from '_exit' (C)
39 |   Found 'abort', possibly from 'abort' (C), 'runtime' (Fortran)
40 |   Found 'exit', possibly from 'exit' (C), 'stop' (Fortran)
41 |   Found 'printf', possibly from 'printf' (C)
42 |
43 | Compiled code should not call entry points which might terminate R nor
44 | write to stdout/stderr instead of to the console, nor use Fortran I/O
45 | nor system RNGs. The detected symbols are linked into the code but
46 | might come from libraries and not actually be called.
47 |
48 | See 'Writing portable packages' in the 'Writing R Extensions' manual.
49 |
50 | > I believe this is a false positive, ref https://stackoverflow.com/questions/64402688/information-on-o-files-for-x64-is-not-available-note-on-r-package-checks-using
51 |
52 | *NOTE 2 (on all winbuilder + R-hub servers)
53 |
54 | Days since last update: 6
55 |
56 | > The previous release was a basic patch after the package was taken off CRAN. This is a proper release with new features.
57 |
58 | ## Downstream dependencies
59 | There are currently no downstream dependencies for this package.
-------------------------------------------------------------------------------- /man/create_ctree.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/sampling.R
3 | \name{create_ctree}
4 | \alias{create_ctree}
5 | \title{Make all conditional inference trees}
6 | \usage{
7 | create_ctree(
8 |   given_ind,
9 |   x_train,
10 |   mincriterion,
11 |   minsplit,
12 |   minbucket,
13 |   use_partykit = "on_error"
14 | )
15 | }
16 | \arguments{
17 | \item{given_ind}{Numeric value. Indicates which features are conditioned on.}
18 |
19 | \item{x_train}{Matrix, data.frame or data.table with the training data.}
20 |
21 | \item{mincriterion}{Numeric value or vector equal to 1 - alpha where alpha is the nominal level of the conditional
22 | independence tests.
23 | Can also be a vector equal to the length of the number of features indicating which mincriterion to use
24 | when conditioning on various numbers of features.}
25 |
26 | \item{minsplit}{Numeric value. Equal to the value that the sum of the left and right daughter nodes need to exceed.}
27 |
28 | \item{minbucket}{Numeric value. Equal to the minimum sum of weights in a terminal node.}
29 |
30 | \item{use_partykit}{String. In some semi-rare cases \code{party::ctree} runs into an error related to the LINPACK
31 | used by R. To get around this problem, one may fall back to using the newer (but slower) \code{partykit::ctree}
32 | function, which is a reimplementation of the same method. Setting this parameter to \code{"on_error"} (default)
33 | falls back to \code{partykit::ctree} if \code{party::ctree} fails. Other options are \code{"never"}, which always
34 | uses \code{party::ctree}, and \code{"always"}, which always uses \code{partykit::ctree}.
A warning message is 35 | created whenever \code{partykit::ctree} is used.} 36 | } 37 | \value{ 38 | List with conditional inference tree and the variables conditioned/not conditioned on. 39 | } 40 | \description{ 41 | Make all conditional inference trees 42 | } 43 | \examples{ 44 | if (requireNamespace("MASS", quietly = TRUE) & requireNamespace("party", quietly = TRUE)) { 45 | m <- 10 46 | n <- 40 47 | n_samples <- 50 48 | mu <- rep(1, m) 49 | cov_mat <- cov(matrix(rnorm(n * m), n, m)) 50 | x_train <- data.table::data.table(MASS::mvrnorm(n, mu, cov_mat)) 51 | given_ind <- c(4, 7) 52 | mincriterion <- 0.95 53 | minsplit <- 20 54 | minbucket <- 7 55 | sample <- TRUE 56 | create_ctree( 57 | given_ind = given_ind, x_train = x_train, 58 | mincriterion = mincriterion, minsplit = minsplit, 59 | minbucket = minbucket, use_partykit = "on_error" 60 | ) 61 | } 62 | } 63 | \author{ 64 | Annabelle Redelmeier, Martin Jullum 65 | } 66 | \keyword{internal} 67 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(explain,combined) 4 | S3method(explain,copula) 5 | S3method(explain,ctree) 6 | S3method(explain,ctree_comb_mincrit) 7 | S3method(explain,empirical) 8 | S3method(explain,gaussian) 9 | S3method(explain,independence) 10 | S3method(get_model_specs,gam) 11 | S3method(get_model_specs,glm) 12 | S3method(get_model_specs,lm) 13 | S3method(get_model_specs,ranger) 14 | S3method(get_model_specs,xgb.Booster) 15 | S3method(model_checker,default) 16 | S3method(model_checker,gam) 17 | S3method(model_checker,glm) 18 | S3method(model_checker,lm) 19 | S3method(model_checker,ranger) 20 | S3method(model_checker,xgb.Booster) 21 | S3method(plot,shapr) 22 | S3method(predict_model,default) 23 | S3method(predict_model,gam) 24 | S3method(predict_model,glm) 25 | S3method(predict_model,lm) 26 | S3method(predict_model,ranger) 27 | S3method(predict_model,xgb.Booster) 28 | S3method(prepare_data,copula) 29 | S3method(prepare_data,ctree) 30 | S3method(prepare_data,empirical) 31 | S3method(prepare_data,gaussian) 32 | S3method(prepare_data,independence) 33 | S3method(print,shapr) 34 | export(aicc_full_single_cpp) 35 | export(check_features) 36 | export(compute_shapley) 37 | export(correction_matrix_cpp) 38 | export(create_ctree) 39 | export(explain) 40 | export(feature_combinations) 41 | export(feature_matrix_cpp) 42 | export(get_data_specs) 43 | export(get_model_specs) 44 | export(hat_matrix_cpp) 45 | export(mahalanobis_distance_cpp) 46 | export(make_dummies) 47 | export(model_checker) 48 | export(observation_impute_cpp) 49 | export(predict_model) 50 | export(prepare_and_predict) 51 | export(prepare_data) 52 | export(preprocess_data) 53 | export(rss_cpp) 54 | export(shapr) 55 | export(update_data) 56 | export(weight_matrix_cpp) 57 | importFrom(Rcpp,sourceCpp) 58 | importFrom(data.table,":=") 59 | importFrom(data.table,as.data.table) 60 | importFrom(data.table,between) 61 | importFrom(data.table,copy) 62 | importFrom(data.table,data.table) 63 | importFrom(data.table,fread) 64 | importFrom(data.table,fwrite) 65 | importFrom(data.table,is.data.table) 66 | importFrom(data.table,month) 67 | importFrom(data.table,rbindlist) 68 | importFrom(data.table,setcolorder) 69 | importFrom(data.table,setkey) 70 | importFrom(data.table,setnames) 71 | importFrom(data.table,uniqueN) 72 | importFrom(data.table,year) 73 | importFrom(graphics,hist) 74 | 
importFrom(graphics,plot) 75 | importFrom(graphics,rect) 76 | importFrom(stats,as.formula) 77 | importFrom(stats,contrasts) 78 | importFrom(stats,model.frame) 79 | importFrom(stats,model.matrix) 80 | importFrom(stats,predict) 81 | importFrom(stats,setNames) 82 | importFrom(utils,head) 83 | importFrom(utils,methods) 84 | importFrom(utils,tail) 85 | useDynLib(shapr, .registration = TRUE) 86 | -------------------------------------------------------------------------------- /man/predict_model.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/models.R 3 | \name{predict_model} 4 | \alias{predict_model} 5 | \alias{predict_model.default} 6 | \alias{predict_model.lm} 7 | \alias{predict_model.glm} 8 | \alias{predict_model.ranger} 9 | \alias{predict_model.xgb.Booster} 10 | \alias{predict_model.gam} 11 | \title{Generate predictions for different model classes} 12 | \usage{ 13 | predict_model(x, newdata) 14 | 15 | \method{predict_model}{default}(x, newdata) 16 | 17 | \method{predict_model}{lm}(x, newdata) 18 | 19 | \method{predict_model}{glm}(x, newdata) 20 | 21 | \method{predict_model}{ranger}(x, newdata) 22 | 23 | \method{predict_model}{xgb.Booster}(x, newdata) 24 | 25 | \method{predict_model}{gam}(x, newdata) 26 | } 27 | \arguments{ 28 | \item{x}{Model object for the model to be explained.} 29 | 30 | \item{newdata}{A data frame (or matrix) in which to look for variables with which to predict.} 31 | } 32 | \value{ 33 | Numeric 34 | } 35 | \description{ 36 | Performs prediction of the response for models fitted with \code{\link[stats]{lm}}, \code{\link[stats]{glm}}, 37 | \code{\link[ranger]{ranger}}, \code{\link[mgcv:gam]{mgcv::gam}} and 38 | \code{\link[xgboost:xgb.train]{xgboost::xgb.train}}, with binary or continuous 39 | response. See details for more information. 40 | } 41 | \details{ 42 | The following models are currently supported: 43 | \itemize{ 44 | \item \code{\link[stats:lm]{stats::lm}} 45 | \item \code{\link[stats:glm]{stats::glm}} 46 | \item \code{\link[ranger:ranger]{ranger::ranger}} 47 | \item \code{\link[mgcv:gam]{mgcv::gam}} 48 | \item \code{\link[xgboost:xgb.train]{xgboost::xgb.train}} 49 | } 50 | 51 | The returned object \code{p} always satisfies the following properties: 52 | \itemize{ 53 | \item \code{is.atomic(p)} equals \code{TRUE} 54 | \item \code{is.double(p)} equals \code{TRUE} 55 | } 56 | 57 | If you have a binary classification model, the probability prediction for a single class 58 | is always returned. 59 | 60 | For more details on how to explain other types of models (i.e.
custom models), see the Advanced usage section 61 | of the vignette: \cr 62 | From R: \code{vignette("understanding_shapr", package = "shapr")} \cr 63 | Web: \url{https://norskregnesentral.github.io/shapr/articles/understanding_shapr.html#explain-custom-models} 64 | } 65 | \examples{ 66 | if (requireNamespace("MASS", quietly = TRUE)) { 67 | # Load example data 68 | data("Boston", package = "MASS") 69 | # Split data into test- and training data 70 | x_train <- head(Boston, -3) 71 | x_test <- tail(Boston, 3) 72 | # Fit a linear model 73 | model <- lm(medv ~ lstat + rm + dis + indus, data = x_train) 74 | 75 | # Predicting for a model with a standardized format 76 | predict_model(x = model, newdata = x_test) 77 | } 78 | } 79 | \author{ 80 | Martin Jullum 81 | } 82 | \keyword{internal} 83 | -------------------------------------------------------------------------------- /inst/scripts/example_custom_model.R: -------------------------------------------------------------------------------- 1 | library(gbm) 2 | library(shapr) 3 | 4 | # Load data 5 | data("Boston", package = "MASS") 6 | 7 | # Create test- and training data 8 | x_var <- c("lstat", "rm", "dis", "indus") 9 | y_var <- "medv" 10 | 11 | x_train <- as.matrix(Boston[-1:-6, x_var]) 12 | y_train <- Boston[-1:-6, y_var] 13 | x_test <- as.matrix(Boston[1:6, x_var]) 14 | 15 | form = as.formula(paste0(y_var,"~",paste0(x_var,collapse="+"))) 16 | 17 | 18 | 19 | xy_train <- data.frame(x_train,medv = y_train) 20 | 21 | 22 | # Fitting a gbm model 23 | set.seed(825) 24 | model <- gbm::gbm( 25 | form, 26 | data = xy_train, 27 | distribution = "gaussian" 28 | ) 29 | 30 | #### Full versions of the two custom model functions #### 31 | 32 | predict_model.gbm <- function(x, newdata) { 33 | 34 | if (!requireNamespace('gbm', quietly = TRUE)) { 35 | stop('The gbm package is required for predicting with gbm models') 36 | } 37 | 38 | model_type <- ifelse( 39 | x$distribution$name %in% c("bernoulli","adaboost"), 40 | "classification", 41 | "regression" 42 | ) 43 | if (model_type == "classification") { 44 | 45 | predict(x, as.data.frame(newdata), type = "response",n.trees = x$n.trees) 46 | } else { 47 | 48 | predict(x, as.data.frame(newdata),n.trees = x$n.trees) 49 | } 50 | } 51 | 52 | get_model_specs.gbm <- function(x){ 53 | feature_list = list() 54 | feature_list$labels <- labels(x$Terms) 55 | m <- length(feature_list$labels) 56 | 57 | feature_list$classes <- attr(x$Terms,"dataClasses")[-1] 58 | feature_list$factor_levels <- setNames(vector("list", m), feature_list$labels) 59 | feature_list$factor_levels[feature_list$classes=="factor"] <- NA # the model object doesn't contain factor levels info 60 | 61 | return(feature_list) 62 | } 63 | 64 | # Prepare the data for explanation 65 | set.seed(123) 66 | explainer <- shapr(xy_train, model) 67 | p0 <- mean(xy_train[,y_var]) 68 | explanation <- explain(x_test, explainer, approach = "empirical", prediction_zero = p0) 69 | # Plot results 70 | plot(explanation) 71 | 72 | 73 | # Minimal version of the custom model functions 74 | # Note: Working only for this exact version of the model class 75 | # Not defining get_model_specs skips all feature 76 | # consistency checking between your data and model 77 | 78 | # Removing the previously defined functions to simulate a fresh start 79 | rm(predict_model.gbm) 80 | rm(get_model_specs.gbm) 81 | 82 | 83 | predict_model.gbm <- function(x, newdata) { 84 | predict(x, as.data.frame(newdata),n.trees = x$n.trees) 85 | } 86 | 87 | 88 | # Prepare the data
for explanation 89 | set.seed(123) 90 | explainer <- shapr(x_train, model) 91 | p0 <- mean(xy_train[,y_var]) 92 | explanation <- explain(x_test, explainer, approach = "empirical", prediction_zero = p0) 93 | # Plot results 94 | plot(explanation) 95 | -------------------------------------------------------------------------------- /src/impute_data.cpp: -------------------------------------------------------------------------------- 1 | #include <Rcpp.h> 2 | using namespace Rcpp; 3 | 4 | //' Get imputed data 5 | //' 6 | //' @param index_xtrain Positive integer vector. Represents a sequence of row indices from \code{xtrain}, 7 | //' i.e. \code{min(index_xtrain) >= 1} and \code{max(index_xtrain) <= nrow(xtrain)}. 8 | //' 9 | //' @param index_s Positive integer vector. Represents a sequence of row indices from \code{S}, 10 | //' i.e. \code{min(index_s) >= 1} and \code{max(index_s) <= nrow(S)}. 11 | //' 12 | //' @param xtrain Numeric matrix. 13 | //' 14 | //' @param xtest Numeric matrix. Represents a single test observation. 15 | //' 16 | //' @param S Integer matrix of dimension \code{n_combinations x m}, where \code{n_combinations} equals 17 | //' the total number of sampled/non-sampled feature combinations and \code{m} equals 18 | //' the total number of unique features. Note that \code{m = ncol(xtrain)}. See details 19 | //' for more information. 20 | //' 21 | //' @details \code{S(i, j) = 1} if and only if feature \code{j} is present in feature 22 | //' combination \code{i}, otherwise \code{S(i, j) = 0}. I.e. if \code{m = 3}, there 23 | //' are \code{2^3 = 8} unique ways to combine the features. In this case \code{dim(S) = c(8, 3)}. 24 | //' Let's call the features \code{x1, x2, x3} and take a closer look at the combination 25 | //' represented by \code{s = c(x1, x2)}. If this combination is represented by the second row, 26 | //' the following is true: \code{S[2, 1:3] = c(1, 1, 0)}. 27 | //' 28 | //' The returned object, \code{X}, is a numeric matrix where 29 | //' \code{dim(X) = c(length(index_xtrain), ncol(xtrain))}. If feature \code{j} is present in 30 | //' the k-th observation, that is \code{S[index_s[k], j] == 1}, \code{X[k, j] = xtest[1, j]}. 31 | //' Otherwise \code{X[k, j] = xtrain[index_xtrain[k], j]}.
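//'
//' (Hedged illustration with hypothetical index values, added for clarity.) With
//' \code{m = 2} and \code{S = matrix(c(0, 0, 1, 0, 0, 1, 1, 1), ncol = 2, byrow = TRUE)},
//' calling the function with \code{index_s = c(2, 3)} and \code{index_xtrain = c(5, 9)}
//' returns a 2 x 2 matrix: row 1 keeps \code{xtest[1, 1]} (feature 1 is present in
//' combination 2) and imputes \code{xtrain[5, 2]}, while row 2 keeps \code{xtest[1, 2]}
//' and imputes \code{xtrain[9, 1]}.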
32 | //' 33 | //' @export 34 | //' @keywords internal 35 | //' 36 | //' @return Numeric matrix 37 | //' 38 | //' @author Nikolai Sellereite 39 | // [[Rcpp::export]] 40 | NumericMatrix observation_impute_cpp(IntegerVector index_xtrain, 41 | IntegerVector index_s, 42 | NumericMatrix xtrain, 43 | NumericMatrix xtest, 44 | IntegerMatrix S) { 45 | 46 | 47 | // Error-checks 48 | if (index_xtrain.length() != index_s.length()) 49 | Rcpp::stop("The length of index_xtrain and index_s should be equal."); 50 | 51 | if (xtrain.ncol() != xtest.ncol()) 52 | Rcpp::stop("Number of columns in xtrain and xtest should be equal."); 53 | 54 | NumericMatrix X(index_xtrain.length(), xtrain.ncol()); 55 | 56 | for (int i = 0; i < X.nrow(); ++i) { 57 | 58 | for (int j = 0; j < X.ncol(); ++j) { 59 | 60 | if (S(index_s[i] - 1, j) > 0) { 61 | X(i, j) = xtest(0, j); 62 | } else { 63 | X(i, j) = xtrain(index_xtrain[i] - 1, j); 64 | } 65 | 66 | } 67 | } 68 | 69 | return X; 70 | } 71 | -------------------------------------------------------------------------------- /inst/scripts/example_ctree_method.R: -------------------------------------------------------------------------------- 1 | library(shapr) 2 | 3 | data("Boston", package = "MASS") 4 | 5 | x_var <- c("lstat", "rm", "dis", "indus") 6 | y_var <- "medv" 7 | 8 | #### 1) Example with just continuous features #### 9 | 10 | x_train <- as.matrix(tail(Boston[, x_var], -6)) 11 | y_train <- tail(Boston[, y_var], -6) 12 | x_test <- as.matrix(head(Boston[, x_var], 6)) 13 | 14 | # Just looking at the dependence between the features 15 | cor(x_train) 16 | 17 | # Fitting a basic xgboost model to the training data 18 | model <- xgboost::xgboost( 19 | data = x_train, 20 | label = y_train, 21 | nround = 20, 22 | verbose = FALSE 23 | ) 24 | 25 | # Prepare the data for explanation 26 | explainer <- shapr(x_train, model) 27 | 28 | # Specifying the phi_0, i.e. the expected prediction without any features 29 | p0 <- mean(y_train) 30 | 31 | # Computing the actual Shapley values with kernelSHAP accounting for feature dependence using 32 | # the ctree approach with default mincriterion = 0.95, minsplit = 20, minbucket = 7, 33 | # and sample = TRUE 34 | explanation <- explain(x_test, explainer, 35 | approach = "ctree", 36 | prediction_zero = p0) 37 | 38 | # Printing the Shapley values for the test data 39 | explanation$dt 40 | 41 | # Finally we plot the resulting explanations 42 | plot(explanation) 43 | 44 | 45 | #### 2) Example with mixed continuous and categorical features #### 46 | library(shapr) 47 | 48 | data("Boston", package = "MASS") 49 | 50 | x_var <- c("lstat", "rm", "dis", "indus") 51 | y_var <- "medv" 52 | 53 | x_train <- as.matrix(tail(Boston[, x_var], -6)) 54 | y_train <- tail(Boston[, y_var], -6) 55 | x_test <- as.matrix(head(Boston[, x_var], 6)) 56 | 57 | x_train_cat <- as.data.frame(x_train) 58 | x_test_cat <- as.data.frame(x_test) 59 | 60 | # convert to factors for illustration purposes 61 | x_train_cat$rm <- factor(round(x_train_cat$rm)) 62 | x_test_cat$rm <- factor(round(x_test_cat$rm), levels = c(8, 9, 7, 4, 5, 6)) 63 | 64 | # Make sure they have the same levels!
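# (Hedged sketch, added for illustration and not run: had the levels differed,
# one way to align the test factor with the training levels is
# x_test_cat$rm <- factor(as.character(x_test_cat$rm),
#                         levels = levels(x_train_cat$rm))
# which leaves the data values unchanged but enforces the training level set.)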
65 | print(levels(x_train_cat$rm)) 66 | print(levels(x_test_cat$rm)) 67 | 68 | # -- special function when using categorical data + xgboost 69 | dummylist <- make_dummies(traindata = x_train_cat, testdata = x_test_cat) 70 | 71 | x_train_dummy <- dummylist$train_dummies 72 | x_test_dummy <- dummylist$test_dummies 73 | 74 | # Fitting a basic xgboost model to the training data 75 | model_cat <- xgboost::xgboost( 76 | data = x_train_dummy, 77 | label = y_train, 78 | nround = 20, 79 | verbose = FALSE 80 | ) 81 | model_cat$feature_list <- dummylist$feature_list 82 | 83 | explainer_cat <- shapr(dummylist$traindata_new, model_cat) 84 | 85 | # Specifying the phi_0, i.e. the expected prediction without any features 86 | p0 <- mean(y_train) 87 | 88 | # dummylist$testdata_new$rm 89 | 90 | explanation_cat <- explain( 91 | dummylist$testdata_new, 92 | approach = "ctree", 93 | explainer = explainer_cat, 94 | prediction_zero = p0 95 | ) 96 | 97 | # Plot the resulting explanations for all 98 | # six test observations 99 | plot(explanation_cat) 100 | -------------------------------------------------------------------------------- /src/distance.cpp: -------------------------------------------------------------------------------- 1 | #include <RcppArmadillo.h> 2 | using namespace Rcpp; 3 | 4 | //' (Generalized) Mahalanobis distance 5 | //' 6 | //' Used to get the Euclidean distance as well by setting \code{mcov} = \code{diag(m)}. 7 | //' 8 | //' @param featureList List of vectors indicating all feature combinations that should be included in the computations. Assumes that the first one is empty. 9 | //' @param mcov Matrix. The Sigma-matrix in the Mahalanobis distance formula. \code{stats::cov(Xtrain_mat)} gives the Mahalanobis distance; 10 | //' \code{diag(m)} gives the Euclidean distance. 11 | //' @param S_scale_dist Logical. Indicates whether the distances for each feature combination should be scaled by the inverse squared number of features in that combination (the \code{S_scale} factor in the function body). 12 | //' @param Xtrain_mat Matrix 13 | //' @param Xtest_mat Matrix 14 | //' 15 | //' @export 16 | //' @keywords internal 17 | //' 18 | //' @return Array of three dimensions. Contains the squared distance between all training and test observations for all feature combinations passed to the function.
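//'
//' (Hedged addition for clarity; this restates what the function body below computes.)
//' For a feature combination \eqn{S}, slice \code{k} of the returned cube holds
//' \deqn{D^2_S(x_i, x^*_j) = (x_{i,S} - x^*_{j,S})^\top \Sigma_S^{-1} (x_{i,S} - x^*_{j,S}),}
//' evaluated through a forward solve with the lower-triangular Cholesky factor of the
//' submatrix \eqn{\Sigma_S} of \code{mcov}, and multiplied by \eqn{1/|S|^2} when
//' \code{S_scale_dist = TRUE}.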
19 | //' @author Martin Jullum 20 | // [[Rcpp::export]] 21 | arma::cube mahalanobis_distance_cpp(Rcpp::List featureList,arma::mat Xtrain_mat, arma::mat Xtest_mat, arma::mat mcov, bool S_scale_dist) { 22 | 23 | using namespace arma; 24 | 25 | // Define variables 26 | int ntrain = Xtrain_mat.n_rows; 27 | int ntest = Xtest_mat.n_rows; 28 | int p = featureList.size(); 29 | 30 | 31 | arma::mat mcov0; 32 | arma::mat cholDec; 33 | arma::mat mu0; 34 | arma::mat mu; 35 | arma::mat X; 36 | 37 | arma::cube out(ntrain,ntest,p,arma::fill::zeros); 38 | 39 | // Declaring some private variables 40 | 41 | double acc; 42 | uint32_t icol, irow, ii; 43 | double S_scale; 44 | IntegerVector temp; 45 | 46 | for (int k = 0; k < p; ++k){ 47 | temp = featureList[k]; 48 | if(temp.length() == 0) { 49 | continue; 50 | } 51 | arma::uvec theseFeatures = featureList[k]; 52 | theseFeatures = theseFeatures-1; 53 | 54 | mcov0 = mcov.submat(theseFeatures,theseFeatures); 55 | X = Xtrain_mat.cols(theseFeatures); 56 | mu0 = Xtest_mat.cols(theseFeatures); 57 | 58 | uint32_t d = X.n_cols; 59 | vec tmp(d); 60 | cholDec = trimatl(chol(mcov0).t()); 61 | vec D = cholDec.diag(); 62 | if(S_scale_dist){ 63 | S_scale = 1.0/pow(theseFeatures.n_elem,2.0); 64 | } else { 65 | S_scale = 1.0; 66 | } 67 | 68 | for (int j = 0; j < ntest; ++j) { 69 | mu = mu0.row(j); 70 | 71 | // For each of the ntrain training observations, forwardsolve the corresponding linear system. 72 | // Forwardsolve because I'm using the lower triangle Cholesky. 73 | for(icol = 0; icol < ntrain; icol++) 74 | { 75 | 76 | for(irow = 0; irow < d; irow++) 77 | { 78 | acc = 0.0; 79 | 80 | for(ii = 0; ii < irow; ii++) acc += tmp.at(ii) * cholDec.at(irow, ii); 81 | 82 | tmp.at(irow) = ( X.at(icol, irow) - mu.at(irow) - acc ) / D.at(irow); 83 | } 84 | 85 | out.at(icol,j,k) = sum(square(tmp)); 86 | } 87 | 88 | } 89 | out.slice(k) *= S_scale; 90 | } 91 | 92 | 93 | return out; 94 | } 95 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # shapr 0.2.0.9000 2 | 3 | * Development version 4 | * Added support for groupSHAP, including check of appropriate groups, examples and tests 5 | * Various modifications to input of internal functions to reflect that Shapley values may be 6 | computed both feature-wise and group-wise 7 | * Fixed bug when passing non-named data to shapr() or explain() (e.g. ```shapr(data[,1:5],model...)```) 8 | 9 | # shapr 0.2.0 10 | 11 | * Minor CRAN release 12 | * Added the new dependence modeling approach "ctree" which handles categorical features in addition 13 | to numerical ones. For more information see our paper https://doi.org/10.1007/978-3-030-57321-8_7 14 | * Added support to explain models which take as input categorical features for model classes like xgboost 15 | which natively take only numeric input. On the user side, an additional call to the new *make_dummies* 16 | function is required. See the vignette for details. 17 | * Slight change in the user procedure for explaining predictions from custom models. This now requires 18 | only a single function *predict_model*. 19 | * Introduced a thorough system for extracting and checking the feature information in the model and the data 20 | passed to *shapr* and *explain*. The features in the data are checked for consistency with what can be extracted 21 | from the model object. If the model object is missing some of the necessary information, the info from the data 22 | is used instead.
The system checks feature labels, classes, and any factor levels. 23 | * Due to the previous point, the *feature_labels* option previously used for custom models is removed. 24 | * Added a manual testing script for custom models (currently cannot be handled by testthat due to environment issues). 25 | * A few under-the-hood changes for checking in the *shapr* function. 26 | 27 | # shapr 0.1.4 28 | 29 | * Patch to fulfill CRAN policy of using packages under Suggests conditionally (in tests and examples) 30 | 31 | # shapr 0.1.3 32 | 33 | * Fix installation error on Solaris 34 | * Updated README with CRAN installation instructions and badges 35 | 36 | # shapr 0.1.2 37 | 38 | * CRAN release 39 | * Removed unused clustering code 40 | * Removed several package dependencies 41 | * Moved automatic check and pkgdown site build from Circle CI to GitHub actions 42 | * Some minor efficiency fixes 43 | * Changed stopping threshold from 12 to 13 features for non-sampling version of 44 | KernelSHAP for consistency with our recommendation 45 | * Changed package title (shortened) 46 | * Minor fixes to fulfill CRAN policy 47 | * Improved documentation 48 | * Revised internal/external and exported/non-exported functions, leading to far 49 | fewer external functions and a cleaner manual. 50 | 51 | # shapr 0.1.1 52 | 53 | * Journal of Open Source Software release 54 | * Improved installation instructions and community guidelines in README 55 | * Improved documentation 56 | * Some minor bugfixes 57 | 58 | # shapr 0.1.0 59 | 60 | * Support for custom models 61 | * Improved documentation 62 | * Automated testing using [testthat](https://github.com/r-lib/testthat) 63 | * Added vignette that gives an introduction to the package 64 | * Added webpage for package using [pkgdown](https://github.com/r-lib/pkgdown) 65 | * Improved API for end user 66 | * Various bugfixes 67 | 68 | # shapr 0.0.0.9000 69 | 70 | * First version of the package. Currently under development. 71 | -------------------------------------------------------------------------------- /src/weighted_matrix.cpp: -------------------------------------------------------------------------------- 1 | #include <RcppArmadillo.h> 2 | using namespace Rcpp; 3 | 4 | //' Calculate weight matrix 5 | //' 6 | //' @param subsets List. Each of the elements equals an integer 7 | //' vector representing a valid combination of features/feature groups. 8 | //' @param m Integer. Number of features/feature groups 9 | //' @param n Integer. Number of combinations 10 | //' @param w Numeric vector of length \code{n}, i.e. \code{w[i]} equals 11 | //' the Shapley weight of feature/feature group combination \code{i}, represented by 12 | //' \code{subsets[[i]]}. 13 | //' 14 | //' @export 15 | //' @keywords internal 16 | //' 17 | //' @return Matrix of dimension n x (m + 1) 18 | //' @author Nikolai Sellereite 19 | // [[Rcpp::export]] 20 | arma::mat weight_matrix_cpp(List subsets, int m, int n, NumericVector w){ 21 | 22 | // Note that Z is an n x (m + 1) matrix, where n is the number 23 | // of unique subsets. All elements in the first column are equal to 1. 24 | // For j > 0, Z(i, j) = 1 if and only if feature/feature group j is present in 25 | // the ith combination of subsets. For example, if Z(i, j) = 1 we know that 26 | // j is present in subsets[i]. 27 | 28 | // Note that w represents the diagonal in W, where W is a diagonal 29 | // n x n matrix. 30 | 31 | // Note that X.t() equals Z.t() * W, where w is the diagonal of W, which by 32 | // definition gives X = W * Z.
Since W is a diagonal matrix we could 33 | // simplify this so that X(i, j) = sum(W(i, k) * Z(k, j)) = W(i, i) * Z(i, j) 34 | // for all combinations of (i, j). 35 | 36 | // Note that R represents a (m + 1) x n matrix, i.e. R = (X.t() * Z)^-1 * X.t(), 37 | // where X.t() = Z.t() * W. 38 | 39 | // See \url{https://arxiv.org/pdf/1903.10464.pdf} for additional details, 40 | // i.e. section 3.2. 41 | 42 | // Define objects 43 | int n_elements; 44 | IntegerVector subset_vec; 45 | arma::mat Z(n, m + 1, arma::fill::zeros), X(n, m + 1, arma::fill::zeros); 46 | arma::mat R(m + 1, n, arma::fill::zeros); 47 | 48 | // Populate Z 49 | for (int i = 0; i < n; i++) { 50 | 51 | // Set all elements in the first column equal to 1 52 | Z(i, 0) = 1; 53 | 54 | // Extract subsets 55 | subset_vec = subsets[i]; 56 | n_elements = subset_vec.length(); 57 | if (n_elements > 0) { 58 | for (int j = 0; j < n_elements; j++) 59 | Z(i, subset_vec[j]) = 1; 60 | } 61 | } 62 | 63 | // Populate X 64 | for (int i = 0; i < n; i++) { 65 | 66 | for (int j = 0; j < Z.n_cols; j++) { 67 | 68 | X(i, j) = w[i] * Z(i, j); 69 | } 70 | } 71 | 72 | R = inv(X.t() * Z) * X.t(); 73 | 74 | return R; 75 | } 76 | 77 | //' Get feature matrix 78 | //' 79 | //' @param features List 80 | //' @param m Positive integer. Total number of features 81 | //' 82 | //' @export 83 | //' @keywords internal 84 | //' 85 | //' @return Matrix 86 | //' @author Nikolai Sellereite 87 | // [[Rcpp::export]] 88 | NumericMatrix feature_matrix_cpp(List features, int m) { 89 | 90 | // Define variables 91 | int n_combinations; 92 | n_combinations = features.length(); 93 | NumericMatrix A(n_combinations, m); 94 | 95 | // Error-check 96 | IntegerVector features_zero = features[0]; 97 | if (features_zero.length() > 0) 98 | Rcpp::stop("The first element of features should be an empty vector, i.e. integer(0)"); 99 | 100 | for (int i = 1; i < n_combinations; ++i) { 101 | 102 | NumericVector feature_vec = features[i]; 103 | 104 | for (int j = 0; j < feature_vec.length(); ++j) { 105 | 106 | A(i, feature_vec[j] - 1) = 1.0; 107 | } 108 | } 109 | 110 | return A; 111 | } 112 | 113 | -------------------------------------------------------------------------------- /man/shapr.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/shapley.R 3 | \name{shapr} 4 | \alias{shapr} 5 | \title{Create an explainer object with Shapley weights for test data.} 6 | \usage{ 7 | shapr(x, model, n_combinations = NULL, group = NULL) 8 | } 9 | \arguments{ 10 | \item{x}{Numeric matrix or data.frame/data.table. Contains the data used to estimate the (conditional) 11 | distributions for the features needed to properly estimate the conditional expectations in the Shapley formula.} 12 | 13 | \item{model}{The model whose predictions we want to explain. Run 14 | \code{\link[shapr:get_supported_models]{shapr:::get_supported_models()}} 15 | for a table of which models \code{shapr} supports natively.} 16 | 17 | \item{n_combinations}{Integer. The number of feature combinations to sample. If \code{NULL}, 18 | the exact method is used and all combinations are considered. The maximum number of 19 | combinations equals \code{2^ncol(x)}.} 20 | 21 | \item{group}{List. If \code{NULL}, regular feature-wise Shapley values are computed. 22 | If provided, group-wise Shapley values are computed. \code{group} then has length equal to 23 | the number of groups.
Each list element contains a character vector with the features included 24 | in the corresponding group.} 25 | } 26 | \value{ 27 | Named list that contains the following items: 28 | \describe{ 29 | \item{exact}{Boolean. Equals \code{TRUE} if \code{n_combinations = NULL} or 30 | \code{n_combinations >= 2^ncol(x)}, otherwise \code{FALSE}.} 31 | \item{n_features}{Positive integer. The number of columns in \code{x}.} 32 | \item{S}{Binary matrix. The number of rows equals the number of unique combinations, and 33 | the number of columns equals the total number of features. I.e. let's say we have a case with 34 | three features. In that case we have \code{2^3 = 8} unique combinations. If the j-th 35 | element of the i-th row equals \code{1}, it indicates that the j-th feature is present in 36 | the i-th combination. Otherwise it equals \code{0}.} 37 | \item{W}{Matrix. This matrix is equal to the matrix \code{R_D} in Equation 7 in the reference 38 | of \code{\link{explain}}. The Shapley value for a test observation will be equal to the matrix-vector product 39 | of \code{W} and the contribution vector.} 40 | \item{X}{data.table. Returned object from \code{\link{feature_combinations}}} 41 | \item{x_train}{data.table. \code{x} transformed into a data.table.} 42 | \item{feature_list}{List. The \code{updated_feature_list} output from 43 | \code{\link[shapr:preprocess_data]{preprocess_data}}} 44 | } 45 | 46 | In addition to the items above, \code{model} and \code{n_combinations} are also present in the returned object. 47 | } 48 | \description{ 49 | Create an explainer object with Shapley weights for test data. 50 | } 51 | \examples{ 52 | if (requireNamespace("MASS", quietly = TRUE)) { 53 | # Load example data 54 | data("Boston", package = "MASS") 55 | df <- Boston 56 | 57 | # Example using the exact method 58 | x_var <- c("lstat", "rm", "dis", "indus") 59 | y_var <- "medv" 60 | df0 <- df[, x_var] 61 | model <- lm(medv ~ lstat + rm + dis + indus, data = df) 62 | explainer <- shapr(df0, model) 63 | 64 | print(nrow(explainer$X)) 65 | # 16 (which equals 2^4) 66 | 67 | # Example using approximation 68 | y_var <- "medv" 69 | model <- lm(medv ~ ., data = df) 70 | explainer <- shapr(df, model, n_combinations = 1e3) 71 | 72 | print(nrow(explainer$X)) 73 | 74 | # Example using approximation where n_combinations > 2^m 75 | x_var <- c("lstat", "rm", "dis", "indus") 76 | y_var <- "medv" 77 | model <- lm(medv ~ lstat + rm + dis + indus, data = df) 78 | explainer <- shapr(df0, model, n_combinations = 1e3) 79 | 80 | print(nrow(explainer$X)) 81 | # 16 (which equals 2^4) 82 | 83 | # Example using groups 84 | group <- list(A=x_var[1:2], B=x_var[3:4]) 85 | 86 | explainer_group <- shapr(df0, model, group = group) 87 | print(nrow(explainer_group$X)) 88 | # 4 (which equals 2^(#groups)) 89 | } 90 | } 91 | \author{ 92 | Nikolai Sellereite 93 | } 94 | -------------------------------------------------------------------------------- /R/predictions.R: -------------------------------------------------------------------------------- 1 | #' Calculate Shapley weights for test data 2 | #' 3 | #' @description This function should only be called internally, and not be used as 4 | #' a stand-alone function. 5 | #' 6 | #' @param dt data.table 7 | #' @param prediction_zero Numeric. The value to use for \code{phi_0}. 8 | #' @param explainer An object of class \code{explainer}. See \code{\link{shapr}}.
9 | #' 10 | #' @details If \code{dt} does not contain the three columns \code{id}, \code{id_combination} and \code{w}, 11 | #' the function will fail. \code{id} represents a unique key for a given test observation, 12 | #' and \code{id_combination} is a unique key for which feature combination the row represents. \code{w} 13 | #' represents the Shapley weight of the feature combination given by \code{id_combination}. In addition 14 | #' to these three columns, \code{dt} should also have columns which match the variables used 15 | #' when training the model. 16 | #' 17 | #' E.g. if you have fitted a linear model using the features \code{x1}, 18 | #' \code{x2} and \code{x3}, and you want to explain 5 test observations using the exact method, i.e. 19 | #' setting \code{exact = TRUE} in \code{\link{shapr}}, the following properties should be satisfied: 20 | #' \enumerate{ 21 | #' \item \code{colnames(dt)} equals \code{c("x1", "x2", "x3", "id", "id_combination", "w")} 22 | #' \item \code{dt[, max(id)]} equals the number of test observations 23 | #' \item \code{dt[, min(id)]} equals 1L. 24 | #' \item \code{dt[, max(id_combination)]} equals \code{2^m} where m equals the number of features. 25 | #' \item \code{dt[, min(id_combination)]} equals 1L. 26 | #' \item \code{dt[, typeof(w)]} equals \code{"double"}. 27 | #' } 28 | #' 29 | #' 30 | #' @return An object of class \code{c("shapr", "list")}. For more details see \code{\link{explain}}. 31 | #' 32 | #' @keywords internal 33 | #' 34 | #' @author Nikolai Sellereite 35 | prediction <- function(dt, prediction_zero, explainer) { 36 | 37 | # Checks on input data 38 | id <- w <- id_combination <- p_hat <- NULL # due to NSE notes in R CMD check 39 | stopifnot( 40 | data.table::is.data.table(dt), 41 | !is.null(dt[["id"]]), 42 | !is.null(dt[["id_combination"]]), 43 | !is.null(dt[["w"]]) 44 | ) 45 | 46 | # Setup 47 | feature_names <- colnames(explainer$x_test) 48 | data.table::setkeyv(dt, c("id", "id_combination")) 49 | 50 | # Check that the number of test observations equals max(id) 51 | stopifnot(nrow(explainer$x_test) == dt[, max(id)]) 52 | 53 | # Reducing the prediction data.table 54 | max_id_combination <- nrow(explainer$S) 55 | V1 <- keep <- NULL # due to NSE notes in R CMD check 56 | dt[, keep := TRUE] 57 | first_element <- dt[, tail(.I, 1), .(id, id_combination)][id_combination %in% c(1, max_id_combination), V1] 58 | dt[id_combination %in% c(1, max_id_combination), keep := FALSE] 59 | dt[first_element, c("keep", "w") := list(TRUE, 1.0)] 60 | dt <- dt[keep == TRUE][, keep := NULL] 61 | 62 | # Predictions 63 | if (!all(dt[, unique(id_combination)] == 1)) { # Avoid warnings when predicting with empty newdata 64 | dt[id_combination != 1, p_hat := predict_model(explainer$model, newdata = .SD), .SDcols = feature_names] 65 | } 66 | dt[id_combination == 1, p_hat := prediction_zero] 67 | 68 | if (dt[, max(id_combination)] < max_id_combination) { 69 | p_all <- NULL 70 | } else { 71 | p_all <- dt[id_combination == max_id_combination, p_hat] 72 | names(p_all) <- 1:nrow(explainer$x_test) 73 | } 74 | 75 | 76 | # Calculate contributions 77 | dt_res <- dt[, .(k = sum((p_hat * w) / sum(w))), .(id, id_combination)] 78 | data.table::setkeyv(dt_res, c("id", "id_combination")) 79 | dt_mat <- data.table::dcast(dt_res, id_combination ~ id, value.var = "k") 80 | dt_mat[, id_combination := NULL] 81 | 82 | r <- list(p = p_all, dt_mat = dt_mat) 83 | 84 | return(r) 85 | } 86 | 87 | 88 | #' Compute Shapley values 89 | #' @param explainer An explainer object, as used internally by \code{\link{explain}}.
90 | #' @param contribution_mat The contribution matrix. 91 | #' @return A \code{data.table} with shapley values for each test observation. 92 | #' @export 93 | #' @keywords internal 94 | compute_shapley <- function(explainer, contribution_mat) { 95 | 96 | feature_names <- colnames(explainer$x_test) 97 | if (!explainer$is_groupwise) { 98 | shap_names <- feature_names 99 | } else { 100 | shap_names <- names(explainer$group) 101 | } 102 | 103 | 104 | kshap <- t(explainer$W %*% contribution_mat) 105 | dt_kshap <- data.table::as.data.table(kshap) 106 | colnames(dt_kshap) <- c("none", shap_names) 107 | 108 | return(dt_kshap) 109 | 110 | } 111 | -------------------------------------------------------------------------------- /inst/scripts/devel/compare_indep_implementations.R: -------------------------------------------------------------------------------- 1 | library(xgboost) 2 | library(shapr) 3 | 4 | data("Boston", package = "MASS") 5 | 6 | x_var <- c("crim", "zn", "indus","chas", "nox", "rm", "age", "dis", "rad", "tax", "ptratio", "black", "lstat")[1:6] 7 | y_var <- "medv" 8 | 9 | x_train <- as.matrix(Boston[-1:-6, x_var]) 10 | y_train <- Boston[-1:-6, y_var] 11 | x_test <- as.matrix(Boston[1:6, x_var]) 12 | 13 | x_test <- x_test[rep(1,1000),] 14 | 15 | # Looking at the dependence between the features 16 | cor(x_train) 17 | 18 | # Fitting a basic xgboost model to the training data 19 | model <- xgboost( 20 | data = x_train, 21 | label = y_train, 22 | nround = 20, 23 | verbose = FALSE 24 | ) 25 | 26 | # Prepare the data for explanation 27 | explainer <- shapr(x_train, model) 28 | 29 | # Specifying the phi_0, i.e. the expected prediction without any features 30 | p <- mean(y_train) 31 | 32 | # Computing the actual Shapley values with kernelSHAP accounting for feature dependence using 33 | # the empirical (conditional) distribution approach with bandwidth parameter sigma = 0.1 (default) 34 | t_old <- proc.time() 35 | explanation_old <- explain( 36 | x_test, 37 | approach = "empirical", 38 | type = "independence", 39 | explainer = explainer, 40 | prediction_zero = p, seed=111,n_samples = 100 41 | ) 42 | print(proc.time()-t_old) 43 | #user system elapsed 44 | #64.228 2.829 16.455 45 | 46 | t_new <- proc.time() 47 | explanation_new <- explain( 48 | x_test, 49 | approach = "independence", 50 | explainer = explainer, 51 | prediction_zero = p,seed = 111,n_samples = 100 52 | ) 53 | print(proc.time()-t_new) 54 | #user system elapsed 55 | #10.376 0.731 4.907 56 | 57 | colMeans(explanation_old$dt) 58 | colMeans(explanation_new$dt) 59 | #> colMeans(explanation_old$dt) 60 | #none crim zn indus chas nox rm 61 | #22.4459999 0.5606242 0.2137357 -0.3738064 -0.2214088 -0.3129846 0.9761603 62 | #> colMeans(explanation_new$dt) 63 | #none crim zn indus chas nox rm 64 | #22.4459999 0.5638315 0.2087042 -0.3697038 -0.2155639 -0.3173748 0.9724273 65 | 66 | t_old <- proc.time() 67 | explanation_full_old <- explain( 68 | x_test, 69 | approach = "empirical", 70 | type = "independence", 71 | explainer = explainer, 72 | prediction_zero = p, seed=111 73 | ) 74 | print(proc.time()-t_old) 75 | #user system elapsed 76 | #96.064 6.679 29.782 77 | 78 | t_new <- proc.time() 79 | explanation_full_new <- explain( 80 | x_test, 81 | approach = "independence", 82 | explainer = explainer, 83 | prediction_zero = p,seed = 111 84 | ) 85 | print(proc.time()-t_new) 86 | #user system elapsed 87 | #40.363 5.978 16.982 88 | 89 | explanation_full_old$dt 90 | explanation_full_new$dt 91 | 92 | #> explanation_full_old$dt 93 | #none crim zn indus 
chas nox rm 94 | #1: 22.446 0.5669854 0.2103575 -0.3720833 -0.2213789 -0.3109162 0.9693561 95 | #2: 22.446 0.5669846 0.2103578 -0.3720834 -0.2213790 -0.3109160 0.9693565 96 | #3: 22.446 0.5669852 0.2103576 -0.3720835 -0.2213789 -0.3109162 0.9693562 97 | #4: 22.446 0.5669855 0.2103575 -0.3720834 -0.2213790 -0.3109163 0.9693562 98 | #5: 22.446 0.5669851 0.2103576 -0.3720833 -0.2213794 -0.3109161 0.9693566 99 | #--- 100 | # 996: 22.446 0.5669856 0.2103575 -0.3720833 -0.2213791 -0.3109163 0.9693562 101 | #997: 22.446 0.5669851 0.2103575 -0.3720833 -0.2213790 -0.3109162 0.9693563 102 | #998: 22.446 0.5669854 0.2103576 -0.3720834 -0.2213790 -0.3109163 0.9693562 103 | #999: 22.446 0.5669853 0.2103575 -0.3720832 -0.2213791 -0.3109162 0.9693562 104 | #1000: 22.446 0.5669846 0.2103577 -0.3720833 -0.2213790 -0.3109161 0.9693565 105 | #> explanation_full_new$dt 106 | #none crim zn indus chas nox rm 107 | #1: 22.446 0.5669853 0.2103576 -0.3720834 -0.221379 -0.3109162 0.9693563 108 | #2: 22.446 0.5669853 0.2103576 -0.3720834 -0.221379 -0.3109162 0.9693563 109 | #3: 22.446 0.5669853 0.2103576 -0.3720834 -0.221379 -0.3109162 0.9693563 110 | #4: 22.446 0.5669853 0.2103576 -0.3720834 -0.221379 -0.3109162 0.9693563 111 | #5: 22.446 0.5669853 0.2103576 -0.3720834 -0.221379 -0.3109162 0.9693563 112 | #--- 113 | # 996: 22.446 0.5669853 0.2103576 -0.3720834 -0.221379 -0.3109162 0.9693563 114 | #997: 22.446 0.5669853 0.2103576 -0.3720834 -0.221379 -0.3109162 0.9693563 115 | #998: 22.446 0.5669853 0.2103576 -0.3720834 -0.221379 -0.3109162 0.9693563 116 | #999: 22.446 0.5669853 0.2103576 -0.3720834 -0.221379 -0.3109162 0.9693563 117 | #1000: 22.446 0.5669853 0.2103576 -0.3720834 -0.221379 -0.3109162 0.9693563 118 | 119 | -------------------------------------------------------------------------------- /inst/REFERENCES.bib: -------------------------------------------------------------------------------- 1 | @inproceedings{redelmeier2020explaining, 2 | title={Explaining predictive models with mixed features using Shapley values and conditional inference trees}, 3 | author={Redelmeier, Annabelle and Jullum, Martin and Aas, Kjersti}, 4 | booktitle={International Cross-Domain Conference for Machine Learning and Knowledge Extraction}, 5 | pages={117--137}, 6 | year={2020}, 7 | organization={Springer} 8 | } 9 | 10 | @article{aas2019explaining, 11 | title={Explaining individual predictions when features are dependent: More accurate approximations to Shapley values}, 12 | author={Aas, Kjersti and Jullum, Martin and L{\o}land, Anders}, 13 | journal={{Artificial Intelligence}}, 14 | volume={298}, 15 | year={2021} 16 | } 17 | 18 | @inproceedings{lundberg2017unified, 19 | title={A unified approach to interpreting model predictions}, 20 | author={Lundberg, Scott M and Lee, Su-In}, 21 | booktitle={Advances in Neural Information Processing Systems}, 22 | pages={4765--4774}, 23 | year={2017} 24 | } 25 | 26 | @article{kononenko2010efficient, 27 | title={An efficient explanation of individual classifications using game theory}, 28 | author={{\v{S}}trumbelj, Erik and Kononenko, Igor}, 29 | journal={Journal of Machine Learning Research}, 30 | volume={11}, 31 | number={Jan}, 32 | pages={1--18}, 33 | year={2010} 34 | } 35 | 36 | @article{vstrumbelj2014explaining, 37 | title={Explaining prediction models and individual predictions with feature contributions}, 38 | author={{\v{S}}trumbelj, Erik and Kononenko, Igor}, 39 | journal={Knowledge and information systems}, 40 | volume={41}, 41 | number={3}, 42 | pages={647--665}, 43 | year={2014}, 
44 | publisher={Springer} 45 | } 46 | 47 | @article{lundberg2018consistent, 48 | title={Consistent individualized feature attribution for tree ensembles}, 49 | author={Lundberg, Scott M and Erion, Gabriel G and Lee, Su-In}, 50 | journal={arXiv preprint arXiv:1802.03888}, 51 | year={2018} 52 | } 53 | 54 | @article{hurvich1998smoothing, 55 | title={Smoothing parameter selection in nonparametric regression using an improved Akaike information criterion}, 56 | author={Hurvich, Clifford M and Simonoff, Jeffrey S and Tsai, Chih-Ling}, 57 | journal={Journal of the Royal Statistical Society: Series B (Statistical Methodology)}, 58 | volume={60}, 59 | number={2}, 60 | pages={271--293}, 61 | year={1998}, 62 | publisher={Wiley Online Library} 63 | } 64 | 65 | @article{Shapley53, 66 | author={Lloyd S. Shapley}, 67 | title={{A Value for N-Person Games}}, 68 | journal={{Contributions to the Theory of Games}}, 69 | volume=2, 70 | year = 1953, 71 | pages ={307--317} 72 | } 73 | 74 | @article{sklar1959fonctions, 75 | title={Fonctions de repartition an dimensions et leurs marges}, 76 | author={Sklar, M}, 77 | journal={Publ. inst. statist. univ. Paris}, 78 | volume={8}, 79 | pages={229--231}, 80 | year={1959} 81 | } 82 | 83 | @article{rosenblatt1956, 84 | author = {Murray Rosenblatt}, 85 | journal = {{The Annals of Mathematical Statistics}}, 86 | pages = {832--837}, 87 | title = {{Remarks on Some Nonparametric Estimates of a Density Function}}, 88 | volume = 27, 89 | year = 1956 90 | } 91 | 92 | @Manual{lime_api, 93 | title = {lime: Local Interpretable Model-Agnostic Explanations}, 94 | author = {Thomas Lin Pedersen and Michaël Benesty}, 95 | year = {2019}, 96 | note = {R package version 0.5.1}, 97 | url = {https://CRAN.R-project.org/package=lime}, 98 | } 99 | 100 | @article{hothorn2006unbiased, 101 | title={Unbiased recursive partitioning: A conditional inference framework}, 102 | author={Hothorn, Torsten and Hornik, Kurt and Zeileis, Achim}, 103 | journal={Journal of Computational and Graphical statistics}, 104 | volume={15}, 105 | number={3}, 106 | pages={651--674}, 107 | year={2006}, 108 | publisher={Taylor \& Francis} 109 | } 110 | 111 | @article{partykit_package, 112 | title = {{partykit}: A Modular Toolkit for Recursive Partytioning in {R}}, 113 | author = {Torsten Hothorn and Achim Zeileis}, 114 | journal = {Journal of Machine Learning Research}, 115 | year = {2015}, 116 | volume = {16}, 117 | pages = {3905-3909}, 118 | } 119 | 120 | @article{Redelmeier2020ctree, 121 | title={Explaining predictive models with mixed features using Shapley values and conditional inference trees}, 122 | author={Redelmeier, Annabelle and Jullum, Martin and Aas, Kjersti}, 123 | journal={Submitted}, 124 | year={2020} 125 | } 126 | 127 | @inproceedings{jullum2021efficient, 128 | title={Efficient and simple prediction explanations with groupShapley: A practical perspective}, 129 | author={Jullum, Martin and Redelmeier, Annabelle and Aas, Kjersti}, 130 | year={2021}, 131 | pages={28-43}, 132 | booktitle={Proceedings of the 2nd Italian Workshop on Explainable Artificial Intelligence}, 133 | publisher={CEUR Workshop Proceedings} 134 | } 135 | -------------------------------------------------------------------------------- /tests/testthat/manual_test_scripts/test_custom_models.R: -------------------------------------------------------------------------------- 1 | # Test custom models 2 | 3 | # Doing all testing from shapr 4 | # Because new functions have to be created (to use gbm with shapr), we cannot use a classic testthat set 
up because 5 | # shapr will not see the functions created inside of the test environment. Therefore we have to test these functions 6 | # a bit differently (and more manual) than other tests. 7 | 8 | library(testthat) 9 | library(shapr) 10 | library(gbm) 11 | library(MASS) 12 | 13 | # Data ----------- 14 | data("Boston", package = "MASS") 15 | y_var <- "medv" 16 | x_train <- tail(Boston, -6) 17 | y_train <- tail(Boston[, y_var], -6) 18 | y_train_binary <- as.factor(tail((Boston[, y_var] > 20) * 1, -6)) 19 | 20 | # convert to factors for testing purposes 21 | x_train$rad <- factor(round(x_train$rad)) 22 | x_train$chas <- factor(round(x_train$chas)) 23 | 24 | train_df <- cbind(x_train, y_train, y_train_binary) 25 | 26 | 27 | x_var_numeric <- c("lstat", "rm", "dis", "indus") 28 | x_var_factor <- c("lstat", "rm", "dis", "indus", "rad", "chas") 29 | 30 | train_df_used_numeric <- x_train[, x_var_numeric] 31 | train_df_used_factor <- x_train[, x_var_factor] 32 | 33 | formula_numeric <- as.formula(paste0("y_train ~ ", paste0(x_var_numeric, collapse = "+"))) 34 | formula_factor <- as.formula(paste0("y_train ~ ", paste0(x_var_factor, collapse = "+"))) 35 | 36 | # Custom model with only numeric features 37 | model_custom <- gbm::gbm(formula_numeric, data = train_df, distribution = "gaussian") 38 | expect_error(shapr(train_df_used_numeric, model_custom)) # Required model objects defined 39 | get_model_specs.gbm <- function(x) { 40 | feature_list <- list() 41 | feature_list$labels <- labels(x$Terms) 42 | m <- length(feature_list$labels) 43 | feature_list$classes <- attr(x$Terms, "dataClasses")[-1] 44 | feature_list$factor_levels <- setNames(vector("list", m), feature_list$labels) 45 | feature_list$factor_levels[feature_list$classes == "factor"] <- NA # the model object don't contain factor levels info 46 | return(feature_list) 47 | } 48 | expect_error(shapr(train_df_used_numeric, model_custom)) # predict_model objects not defined 49 | 50 | predict_model.gbm <- function(x, newdata) { 51 | if (!requireNamespace("gbm", quietly = TRUE)) { 52 | stop("The gbm package is required for predicting train models") 53 | } 54 | model_type <- ifelse( 55 | x$distribution$name %in% c("bernoulli", "adaboost"), 56 | "classification", 57 | "regression" 58 | ) 59 | if (model_type == "classification") { 60 | predict(x, as.data.frame(newdata), type = "response", n.trees = x$n.trees) 61 | } else { 62 | predict(x, as.data.frame(newdata), n.trees = x$n.trees) 63 | } 64 | } 65 | 66 | expect_silent(shapr(train_df_used_numeric, model_custom)) # Both defined, so pass silently 67 | 68 | rm(get_model_specs.gbm) 69 | 70 | expect_message(shapr(train_df_used_numeric, model_custom)) # Only predict_model defined, so warning 71 | rm(predict_model.gbm) 72 | 73 | 74 | # Custom model with factors 75 | model_custom <- gbm::gbm(formula_factor, data = train_df, distribution = "gaussian") 76 | expect_error(shapr(train_df_used_factor, model_custom)) # Required model objects defined 77 | get_model_specs.gbm <- function(x) { 78 | feature_list <- list() 79 | feature_list$labels <- labels(x$Terms) 80 | m <- length(feature_list$labels) 81 | feature_list$classes <- attr(x$Terms, "dataClasses")[-1] 82 | feature_list$factor_levels <- setNames(vector("list", m), feature_list$labels) 83 | feature_list$factor_levels[feature_list$classes == "factor"] <- NA # model object doesn't contain factor level info 84 | return(feature_list) 85 | } 86 | expect_error(shapr(train_df_used_factor, model_custom)) # predict_model objects not defined 87 | 88 | predict_model.gbm 
<- function(x, newdata) { 89 | if (!requireNamespace("gbm", quietly = TRUE)) { 90 | stop("The gbm package is required for predicting with gbm models") 91 | } 92 | model_type <- ifelse( 93 | x$distribution$name %in% c("bernoulli", "adaboost"), 94 | "classification", 95 | "regression" 96 | ) 97 | if (model_type == "classification") { 98 | predict(x, as.data.frame(newdata), type = "response", n.trees = x$n.trees) 99 | } else { 100 | predict(x, as.data.frame(newdata), n.trees = x$n.trees) 101 | } 102 | } 103 | expect_message(shapr(train_df_used_factor, model_custom)) # Both defined, so pass with message as factor_levels is NA 104 | 105 | rm(get_model_specs.gbm) 106 | 107 | expect_message(shapr(train_df_used_factor, model_custom)) # Only predict_model defined, so warning message returned 108 | 109 | rm(predict_model.gbm) 110 | 111 | predict_model.gbm <- function(x, newdata) NULL 112 | 113 | # Erroneous predict_model defined, so throw error + messages 114 | expect_message(expect_error(shapr(train_df_used_factor, model_custom))) 115 | 116 | 117 | rm(predict_model.gbm) 118 | -------------------------------------------------------------------------------- /tests/testthat/test-observations.R: -------------------------------------------------------------------------------- 1 | context("test-observations.R") 2 | 3 | test_that("Test observation_impute", { 4 | if (requireNamespace("MASS", quietly = TRUE)) { 5 | # Examples 6 | n <- 20 7 | m <- 2 8 | sigma <- cov(matrix(MASS::mvrnorm(m * n, 0, 1), nrow = n)) 9 | x_train <- as.matrix(MASS::mvrnorm(n, mu = rep(0, m), Sigma = sigma), ncol = m) 10 | x_test <- t(as.matrix(MASS::mvrnorm(1, mu = rep(0, m), sigma))) 11 | colnames(x_train) <- colnames(x_test) <- paste0("X", seq(m)) 12 | S <- matrix(c(1, 0, 0, 1), nrow = m) 13 | W_kernel <- matrix(rnorm(n * ncol(S), mean = 1 / n, sd = 1 / n^2), nrow = n) 14 | r <- observation_impute(W_kernel, S, x_train, x_test) 15 | 16 | # Test the default argument n_samples 17 | expect_equal( 18 | observation_impute(W_kernel, S, x_train, x_test, n_samples = 1e3), 19 | observation_impute(W_kernel, S, x_train, x_test) 20 | ) 21 | 22 | # Test the default argument w_threshold 23 | expect_equal( 24 | observation_impute(W_kernel, S, x_train, x_test, w_threshold = .7), 25 | observation_impute(W_kernel, S, x_train, x_test) 26 | ) 27 | 28 | # Test that w_threshold reduces number of rows 29 | expect_true( 30 | nrow(observation_impute(W_kernel, S, x_train, x_test, w_threshold = .7)) > 31 | nrow(observation_impute(W_kernel, S, x_train, x_test, w_threshold = 0.5)) 32 | ) 33 | 34 | # Test that n_samples reduces number of rows 35 | expect_true( 36 | nrow(observation_impute(W_kernel, S, x_train, x_test)) > 37 | nrow(observation_impute(W_kernel, S, x_train, x_test, n_samples = 10)) 38 | ) 39 | 40 | # Tests error 41 | expect_error(observation_impute(1, S, x_train, x_test)) 42 | expect_error(observation_impute(W_kernel, 1, x_train, x_test)) 43 | expect_error(observation_impute(W_kernel, tail(S, -1), x_train, x_test)) 44 | expect_error(observation_impute(tail(W_kernel, -1), S, x_train, x_test)) 45 | 46 | # Test single result 47 | cnms <- c(colnames(x_train), "id_combination", "w") 48 | expect_true(data.table::is.data.table(r)) 49 | expect_true(ncol(r) == m + 2) 50 | expect_true(all(colnames(r) == cnms)) 51 | expect_true(all(unlist(lapply(r, is.numeric)))) 52 | expect_true(is.integer(r$id_combination)) 53 | } 54 | }) 55 | 56 | 57 | test_that("Check correct index_feature usage in prepare_data", { 58 | 59 | data("Boston", package = "MASS") 60 | x_var <-
c("lstat", "rm", "dis", "indus") 61 | y_var <- "medv" 62 | 63 | y_train <- tail(Boston[, y_var], 50) 64 | x <- as.matrix(head(Boston[, x_var], 2)) 65 | n_samples <- 100 66 | index_features <- 4:7 67 | w_threshold = 0.95 68 | type = "fixed_sigma" 69 | fixed_sigma_vec = 0.1 70 | n_samples_aicc = 1000 71 | eval_max_aicc = 20 72 | start_aicc = 0.1 73 | mincriterion = 0.95 74 | minsplit = 20 75 | minbucket = 7 76 | sample = TRUE 77 | 78 | explainer <- readRDS(file = "test_objects/shapley_explainer_obj.rds") 79 | explainer$x_test <- as.matrix(preprocess_data(x, explainer$feature_list)$x_dt) 80 | explainer$n_samples <- n_samples 81 | 82 | explainer$approach <- "independence" 83 | dt <- prepare_data(explainer, index_features = index_features) 84 | expect_identical(sort(dt[,unique(id_combination)]),index_features) 85 | 86 | explainer$type <- type 87 | explainer$fixed_sigma_vec <- fixed_sigma_vec 88 | explainer$n_samples_aicc <- n_samples_aicc 89 | explainer$eval_max_aicc <- eval_max_aicc 90 | explainer$start_aicc <- start_aicc 91 | explainer$w_threshold <- w_threshold 92 | explainer$cov_mat <- stats::cov(explainer$x_train) 93 | 94 | explainer$approach <- "empirical" 95 | dt <- prepare_data(explainer, index_features = index_features) 96 | expect_identical(sort(dt[,unique(id_combination)]),index_features) 97 | 98 | explainer$mu <- unname(colMeans(explainer$x_train)) 99 | explainer$approach <- "gaussian" 100 | dt <- prepare_data(explainer, index_features = index_features) 101 | expect_identical(sort(dt[,unique(id_combination)]),index_features) 102 | 103 | explainer$x_test_gaussian <- explainer$x_test # Shortcut 104 | explainer$approach <- "copula" 105 | dt <- prepare_data(explainer, index_features = index_features) 106 | expect_identical(sort(dt[,unique(id_combination)]),index_features) 107 | 108 | explainer$x_test_gaussian <- explainer$x_test # Shortcut 109 | explainer$approach <- "copula" 110 | dt <- prepare_data(explainer, index_features = index_features) 111 | expect_identical(sort(dt[,unique(id_combination)]),index_features) 112 | 113 | explainer$mincriterion <- mincriterion 114 | explainer$minsplit <- minsplit 115 | explainer$minbucket <- minbucket 116 | explainer$sample <- sample 117 | explainer$approach <- "ctree" 118 | explainer$x_test <- preprocess_data(x, explainer$feature_list)$x_dt 119 | dt <- prepare_data(explainer, index_features = index_features) 120 | expect_identical(sort(dt[,unique(id_combination)]),index_features) 121 | 122 | }) 123 | -------------------------------------------------------------------------------- /inst/joss_paper/paper.md: -------------------------------------------------------------------------------- 1 | --- 2 | # Example from https://joss.readthedocs.io/en/latest/submitting.html 3 | title: 'shapr: An R-package for explaining machine learning models with dependence-aware Shapley values' 4 | tags: 5 | - R 6 | - explainable AI 7 | - interpretable machine learning 8 | - shapley values 9 | - feature dependence 10 | authors: 11 | - name: Nikolai Sellereite 12 | orcid: 0000-0002-4671-0337 13 | affiliation: 1 # (Multiple affiliations must be quoted) 14 | - name: Martin Jullum 15 | orcid: 0000-0003-3908-5155 16 | affiliation: 1 17 | affiliations: 18 | - name: Norwegian Computing Center 19 | index: 1 20 | citation_author: Sellereite and Jullum 21 | date: 20 November 2019 22 | year: 2019 23 | formatted_doi: XX.XXXXX/joss.XXXXX 24 | bibliography: paper.bib 25 | output: rticles::joss_article 26 | csl: apa.csl 27 | journal: JOSS 28 | --- 29 | 30 | 31 | # Summary 32 | 33 | A 
common task within machine learning is to train a model to predict an unknown outcome 34 | (response variable) based on a set of known input variables/features. 35 | When using such models for real-life applications, it is often crucial to understand why a certain set of features leads 36 | to a specific prediction. 37 | Most machine learning models are, however, complicated and hard to understand, and they are therefore often viewed as 38 | "black boxes" that produce some output from some input. 39 | 40 | Shapley values [@Shapley53] are a concept from cooperative game theory used to fairly distribute a joint payoff among the 41 | cooperating players. 42 | @kononenko2010efficient and later @lundberg2017unified proposed to use the Shapley value framework to explain 43 | predictions by distributing the prediction value on the input features. 44 | Established methods and implementations for explaining predictions with Shapley values like Shapley 45 | Sampling Values [@vstrumbelj2014explaining], SHAP/Kernel SHAP [@lundberg2017unified], and to some extent 46 | TreeSHAP/TreeExplainer [@lundberg2018consistent; @Lundberg2020], assume that the features are independent when 47 | approximating the Shapley values. 48 | The `R`-package `shapr`, however, implements the methodology proposed by @aas2019explaining, where predictions are explained while 49 | accounting for the dependence between the features, resulting in significantly more accurate approximations to the 50 | Shapley values. 51 | 52 | 53 | 54 | # Implementation 55 | 56 | `shapr` implements a variant of the Kernel SHAP methodology [@lundberg2017unified] for efficiently dealing with the 57 | combinatorial problem related to the Shapley value formula. 58 | The main methodological contribution of @aas2019explaining consists of three different methods to estimate certain conditional 59 | expectation quantities, referred to as the _empirical_, _Gaussian_ and _copula_ approach. Additionally, the user has 60 | the option of combining the three approaches. 61 | The implementation supports explanation of models fitted with the following functions natively: `stats::lm` [@rCore], `stats::glm` [@rCore], 62 | `ranger::ranger` [@ranger], `mgcv::gam` [@mgcv] and `xgboost::xgboost`/`xgboost::xgb.train` [@xgboost]. 63 | Moreover, the package supports explanation of custom models by supplying two simple additional class functions. 64 | 65 | For reference, the package also includes a benchmark implementation of the original (independence assuming) version of 66 | Kernel SHAP [@lundberg2017unified], providing identical results to the "official" Kernel SHAP `Python` package `shap`. 67 | This allows the user to easily see the effect and importance of accounting for the feature dependence. 68 | 69 | The user interface in the package has largely been adopted from the `R`-package `lime` [@limeRpackage]. 70 | The user first sets up the explainability framework with the `shapr` function. 71 | Then the output from `shapr` is provided to the `explain` function, along with the data to explain the prediction 72 | and the method that should be used to estimate the aforementioned conditional expectations. 73 | 74 | The majority of the code is in plain `R` [@rCore], while the most time-consuming operations are coded in `C++` 75 | through the `Rcpp` package [@rcppRpackage] and `RcppArmadillo` package [@eddelbuettel2014rcpparmadillo] for computational speed up.
76 | For RAM efficiency and computational speed up of typical bookkeeping operations, we utilize the `data.table` 77 | package [@datatableRpackage] which does operations "by reference", i.e. without memory copies. 78 | 79 | For a detailed description of the underlying methodology that the package implements, we refer to the 80 | [paper](https://arxiv.org/abs/1903.10464) [@aas2019explaining] which uses the package in examples and simulation 81 | studies. 82 | To get started with the package, we recommend going through the package vignette and introductory examples 83 | available at the package's [pkgdown site](https://norskregnesentral.github.io/shapr/). 84 | 85 | # Acknowledgement 86 | 87 | This work was supported by the Norwegian Research Council grant 237718 (Big Insight). 88 | 89 | 90 | # References 91 | -------------------------------------------------------------------------------- /inst/joss_paper/paper.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | # Example from https://joss.readthedocs.io/en/latest/submitting.html 3 | title: 'shapr: An R-package for explaining machine learning models with dependence-aware Shapley values' 4 | tags: 5 | - R 6 | - explainable AI 7 | - interpretable machine learning 8 | - shapley values 9 | - feature dependence 10 | authors: 11 | - name: Nikolai Sellereite 12 | orcid: 0000-0002-4671-0337 13 | affiliation: 1 # (Multiple affiliations must be quoted) 14 | - name: Martin Jullum 15 | orcid: 0000-0003-3908-5155 16 | affiliation: 1 17 | affiliations: 18 | - name: Norwegian Computing Center 19 | index: 1 20 | citation_author: Sellereite and Jullum 21 | date: 20 November 2019 22 | year: 2019 23 | formatted_doi: XX.XXXXX/joss.XXXXX 24 | bibliography: paper.bib 25 | output: rticles::joss_article 26 | csl: apa.csl 27 | journal: JOSS 28 | --- 29 | 30 | 31 | # Summary 32 | 33 | A common task within machine learning is to train a model to predict an unknown outcome 34 | (response variable) based on a set of known input variables/features. 35 | When using such models for real-life applications, it is often crucial to understand why a certain set of features leads 36 | to a specific prediction. 37 | Most machine learning models are, however, complicated and hard to understand, and they are therefore often viewed as 38 | "black boxes" that produce some output from some input. 39 | 40 | Shapley values [@Shapley53] are a concept from cooperative game theory used to fairly distribute a joint payoff among the 41 | cooperating players. 42 | @kononenko2010efficient and later @lundberg2017unified proposed to use the Shapley value framework to explain 43 | predictions by distributing the prediction value on the input features. 44 | Established methods and implementations for explaining predictions with Shapley values like Shapley 45 | Sampling Values [@vstrumbelj2014explaining], SHAP/Kernel SHAP [@lundberg2017unified], and to some extent 46 | TreeSHAP/TreeExplainer [@lundberg2018consistent; @Lundberg2020], assume that the features are independent when 47 | approximating the Shapley values. 48 | The `R`-package `shapr`, however, implements the methodology proposed by @aas2019explaining, where predictions are explained while 49 | accounting for the dependence between the features, resulting in significantly more accurate approximations to the 50 | Shapley values.
51 | 52 | 53 | 54 | # Implementation 55 | 56 | `shapr` implements a variant of the Kernel SHAP methodology [@lundberg2017unified] for efficiently dealing with the 57 | combinatorial problem related to the Shapley value formula. 58 | The main methodological contribution of @aas2019explaining is three different methods to estimate certain conditional 59 | expectation quantities, referred to as the _empirical_, _Gaussian_ and _copula_ approaches. Additionally, the user has 60 | the option of combining the three approaches. 61 | The implementation natively supports explanation of models fitted with the following functions: `stats::lm` [@rCore], `stats::glm` [@rCore], 62 | `ranger::ranger` [@ranger], `mgcv::gam` [@mgcv] and `xgboost::xgboost`/`xgboost::xgb.train` [@xgboost]. 63 | Moreover, the package supports explanation of custom models by supplying two simple additional class functions. 64 | 65 | For reference, the package also includes a benchmark implementation of the original (independence-assuming) version of 66 | Kernel SHAP [@lundberg2017unified], providing identical results to the "official" Kernel SHAP `Python` package `shap`. 67 | This allows the user to easily see the effect and importance of accounting for the feature dependence. 68 | 69 | The user interface in the package has largely been adopted from the `R`-package `lime` [@limeRpackage]. 70 | The user first sets up the explainability framework with the `shapr` function. 71 | Then the output from `shapr` is provided to the `explain` function, along with the data whose predictions are to be explained 72 | and the method that should be used to estimate the aforementioned conditional expectations. 73 | 74 | The majority of the code is in plain `R` [@rCore], while the most time-consuming operations are coded in `C++` 75 | through the `Rcpp` package [@rcppRpackage] and `RcppArmadillo` package [@eddelbuettel2014rcpparmadillo] for computational speed-up. 76 | 77 | For RAM efficiency and computational speed-up of typical bookkeeping operations, we utilize the `data.table` 78 | package [@datatableRpackage], which performs operations "by reference", i.e. without memory copies. 79 | 80 | For a detailed description of the underlying methodology that the package implements, we refer to the 81 | [paper](https://arxiv.org/abs/1903.10464) [@aas2019explaining], which uses the package in examples and simulation 82 | studies. 83 | To get started with the package, we recommend going through the package vignette and introductory examples 84 | available at the package's [pkgdown site](https://norskregnesentral.github.io/shapr/). 85 | 86 | # Acknowledgement 87 | 88 | This work was supported by the Norwegian Research Council grant 237718 (Big Insight). 89 | 90 | 91 | # References 92 | -------------------------------------------------------------------------------- /R/plot.R: -------------------------------------------------------------------------------- 1 | #' Plot of the Shapley value explanations 2 | #' 3 | #' @description Plots the individual prediction explanations. 4 | #' 5 | #' @param x A \code{shapr} object. See \code{\link{explain}}. 6 | #' @param digits Integer. Number of significant digits to use in the feature description 7 | #' @param plot_phi0 Logical. Whether to include \code{phi0} in the plot 8 | #' @param index_x_test Integer vector. Which of the test observations to plot. E.g. if you have 9 | #' explained 10 observations using \code{\link{explain}}, you can generate a plot for the first 5 10 | #' observations by setting \code{index_x_test = 1:5}.
11 | #' @param top_k_features Integer. How many features to include in the plot. E.g. if you have 15 12 | #' features in your model, you can plot the 5 most important features for each explanation by setting 13 | #' \code{top_k_features = 5}. 14 | #' @param ... Currently not used. 15 | #' 16 | #' @details See \code{vignette("understanding_shapr", package = "shapr")} for an example of 17 | #' how to use the function. 18 | #' 19 | #' @return ggplot object with plots of the Shapley value explanations 20 | #' 21 | #' @export 22 | #' @examples 23 | #' if (requireNamespace("MASS", quietly = TRUE)) { 24 | #' # Load example data 25 | #' data("Boston", package = "MASS") 26 | #' 27 | #' # Split data into test- and training data 28 | #' x_train <- head(Boston, -3) 29 | #' x_test <- tail(Boston, 3) 30 | #' 31 | #' # Fit a linear model 32 | #' model <- lm(medv ~ lstat + rm + dis + indus, data = x_train) 33 | #' 34 | #' # Create an explainer object 35 | #' explainer <- shapr(x_train, model) 36 | #' 37 | #' # Explain predictions 38 | #' p <- mean(x_train$medv) 39 | #' 40 | #' # Empirical approach 41 | #' explanation <- explain(x_test, 42 | #' explainer, 43 | #' approach = "empirical", 44 | #' prediction_zero = p, 45 | #' n_samples = 1e2 46 | #' ) 47 | #' 48 | #' if (requireNamespace("ggplot2", quietly = TRUE)) { 49 | #' # Plot the explanation (this function) 50 | #' plot(explanation) 51 | #' } 52 | #' } 53 | #' @author Martin Jullum 54 | plot.shapr <- function(x, 55 | digits = 3, 56 | plot_phi0 = TRUE, 57 | index_x_test = NULL, 58 | top_k_features = NULL, 59 | ...) { 60 | if (!requireNamespace("ggplot2", quietly = TRUE)) { 61 | stop("ggplot2 is not installed. Please run install.packages('ggplot2')") 62 | } 63 | 64 | if (is.null(index_x_test)) index_x_test <- seq(nrow(x$x_test)) 65 | if (is.null(top_k_features)) top_k_features <- ncol(x$x_test) + 1 66 | id <- phi <- NULL # due to NSE notes in R CMD check 67 | 68 | is_groupwise <- x$is_groupwise 69 | 70 | # melting Kshap 71 | shap_names <- colnames(x$dt)[-1] 72 | KshapDT <- data.table::copy(x$dt) 73 | KshapDT[, id := .I] 74 | meltKshap <- data.table::melt(KshapDT, id.vars = "id", value.name = "phi") 75 | meltKshap[, sign := factor(sign(phi), levels = c(1, -1), labels = c("Increases", "Decreases"))] 76 | 77 | # Converting and melting Xtest 78 | if (!is_groupwise) { 79 | desc_mat <- format(x$x_test, digits = digits) 80 | for (i in 1:ncol(desc_mat)) { 81 | desc_mat[, i] <- paste0(shap_names[i], " = ", desc_mat[, i]) 82 | } 83 | } else { 84 | desc_mat <- format(x$dt[, -1], digits = digits) 85 | for (i in 1:ncol(desc_mat)) { 86 | desc_mat[, i] <- paste0(shap_names[i]) 87 | } 88 | } 89 | 90 | desc_dt <- data.table::as.data.table(cbind(none = "none", desc_mat)) 91 | melt_desc_dt <- data.table::melt(desc_dt[, id := .I], id.vars = "id", value.name = "description") 92 | 93 | # Data table for plotting 94 | plotting_dt <- merge(meltKshap, melt_desc_dt) 95 | 96 | 97 | # Adding the predictions 98 | predDT <- data.table::data.table(id = KshapDT$id, pred = x$p) 99 | plotting_dt <- merge(plotting_dt, predDT, by = "id") 100 | 101 | # Adding header for each individual plot 102 | header <- variable <- pred <- description <- NULL # due to NSE notes in R CMD check 103 | plotting_dt[, header := paste0("id: ", id, ", pred = ", format(pred, digits = digits + 1))] 104 | 105 | if (!plot_phi0) { 106 | plotting_dt <- plotting_dt[variable != "none"] 107 | } 108 | plotting_dt <- plotting_dt[id %in% index_x_test] 109 | plotting_dt[, rank := data.table::frank(-abs(phi)), by =
"id"] 110 | plotting_dt <- plotting_dt[rank <= top_k_features] 111 | plotting_dt[, description := factor(description, levels = unique(description[order(abs(phi))]))] 112 | 113 | # Plotting 114 | gg <- ggplot2::ggplot(plotting_dt) + 115 | ggplot2::facet_wrap(~header, scales = "free_y", labeller = "label_value", ncol = 2) + 116 | ggplot2::geom_col(ggplot2::aes(x = description, y = phi, fill = sign)) + 117 | ggplot2::coord_flip() + 118 | ggplot2::scale_fill_manual(values = c("steelblue", "lightsteelblue"), drop = TRUE) + 119 | ggplot2::labs( 120 | y = "Feature contribution", 121 | x = "Feature", 122 | fill = "", 123 | title = "Shapley value prediction explanation" 124 | ) + 125 | ggplot2::theme( 126 | legend.position = "bottom", 127 | plot.title = ggplot2::element_text(hjust = 0.5) 128 | ) 129 | 130 | return(gg) 131 | } 132 | -------------------------------------------------------------------------------- /inst/scripts/compare_shap_python.R: -------------------------------------------------------------------------------- 1 | library(MASS) 2 | library(xgboost) 3 | library(shapr) 4 | library(data.table) 5 | 6 | # Python settings 7 | # Using the virtual environment here "../../Python/.venv/bin/python", as set by 8 | #Sys.setenv(RETICULATE_PYTHON = "../../Python/.venv/bin/python") in the .Rprofile 9 | library(reticulate) 10 | 11 | # Install some packages 12 | #py_install("xgboost") 13 | #py_install("shap") 14 | #py_install("pandas") 15 | 16 | data("Boston") 17 | 18 | x_var <- c("lstat", "rm", "dis", "indus") 19 | y_var <- "medv" 20 | 21 | x_train <- as.matrix(tail(Boston[, x_var], -6)) 22 | y_train <- tail(Boston[, y_var], -6) 23 | x_test <- as.matrix(head(Boston[, x_var], 6)) 24 | 25 | # Creating a larger test data set (600 observations) for more realistic function time calls. 26 | # Modifying x_test to repeat the 6 test observations 100 times 27 | x_test = rep(1,100) %x% x_test 28 | colnames(x_test) <- colnames(x_train) 29 | 30 | # Reading the R format version of the xgboost model to avoid crash reading same xgboost model in R and Python 31 | model <- readRDS(system.file("model_objects", "xgboost_model_object.rds", package = "shapr")) 32 | 33 | pred_test <- predict(model,x_test) 34 | 35 | # Spedifying the phi_0, i.e. 
the expected prediction without any features 36 | p0 <- mean(predict(model,x_train)) # adjustment from the standard mean(y_train) to comply with the shap implementation 37 | 38 | time_R_start <- proc.time() 39 | # Prepare the data for explanation 40 | explainer <- shapr(x_train, model) 41 | 42 | time_R_prepare <- proc.time() 43 | 44 | # Computing the Shapley values with Kernel SHAP under the independence approach, and below with the empirical 45 | # (conditional) approach using a very large fixed bandwidth, which effectively also assumes independence 46 | explanation_independence <- explain(x_test, explainer, approach = "independence", prediction_zero = p0) 47 | 48 | time_R_indep0 <- proc.time() 49 | 50 | explanation_largesigma <- explain(x_test, explainer, approach = "empirical", type = "fixed_sigma", 51 | fixed_sigma_vec = 10000, w_threshold = 1, prediction_zero = p0) 52 | 53 | time_R_largesigma0 <- proc.time() 54 | 55 | time_R_indep <- time_R_indep0 - time_R_start 56 | time_R_largesigma <- (time_R_largesigma0 - time_R_indep0) + (time_R_prepare - time_R_start) 57 | 58 | # Printing the Shapley values for the test data 59 | Kshap_indep <- explanation_independence$dt 60 | Kshap_largesigma <- explanation_largesigma$dt 61 | 62 | head(Kshap_indep) 63 | #> Kshap_indep 64 | # none lstat rm dis indus 65 | #1: 22,41355 7,116128 0,5203017 -1,91427784 3,1657530 66 | #2: 22,41355 2,173011 -1,2201068 -0,47653736 0,3620256 67 | #3: 22,41355 8,280909 3,7869719 -1,96968536 0,6037250 68 | #4: 22,41355 8,384073 2,9590225 -2,19376523 1,8672685 69 | #5: 22,41355 4,212031 3,8319436 -0,06695137 1,3392699 70 | #6: 22,41355 3,295275 -1,2450126 -0,70618891 1,0924035 71 | 72 | head(Kshap_largesigma) 73 | #> Kshap_largesigma 74 | # none lstat rm dis indus 75 | #1: 22,41355 7,116128 0,5203018 -1,9142779 3,1657530 76 | #2: 22,41355 2,173011 -1,2201069 -0,4765373 0,3620255 77 | #3: 22,41355 8,280910 3,7869718 -1,9696854 0,6037249 78 | #4: 22,41355 8,384073 2,9590226 -2,1937652 1,8672685 79 | #5: 22,41355 4,212031 3,8319435 -0,0669514 1,3392700 80 | #6: 22,41355 3,295275 -1,2450126 -0,7061889 1,0924036 81 | 82 | 83 | # Checking the difference between the methods 84 | mean(abs(as.matrix(Kshap_indep)-as.matrix(Kshap_largesigma))) 85 | #[1] 8.404487e-08 # Numerically identical 86 | 87 | 88 | 89 | #### Running shap from Python #### 90 | reticulate::py_run_file(system.file("scripts", "shap_python_script.py", package = "shapr")) 91 | # Writes Python objects to the list py # 92 | 93 | # Checking that the predictions are identical 94 | sum((pred_test-py$py_pred_test)^2) 95 | 96 | head(Kshap_indep) 97 | #> Kshap_indep 98 | # none lstat rm dis indus 99 | #1: 22,41355 7,116128 0,5203017 -1,91427784 3,1657530 100 | #2: 22,41355 2,173011 -1,2201068 -0,47653736 0,3620256 101 | #3: 22,41355 8,280909 3,7869719 -1,96968536 0,6037250 102 | #4: 22,41355 8,384073 2,9590225 -2,19376523 1,8672685 103 | #5: 22,41355 4,212031 3,8319436 -0,06695137 1,3392699 104 | #6: 22,41355 3,295275 -1,2450126 -0,70618891 1,0924035 105 | 106 | head(py$Kshap_shap) 107 | #> py$Kshap_shap 108 | # none lstat rm dis indus 109 | #1 22,41355 7,116128 0,5203018 -1,91427784 3,1657530 110 | #2 22,41355 2,173011 -1,2201069 -0,47653727 0,3620255 111 | #3 22,41355 8,280910 3,7869719 -1,96968537 0,6037250 112 | #4 22,41355 8,384073 2,9590226 -2,19376508 1,8672686 113 | #5 22,41355 4,212031 3,8319435 -0,06695135 1,3392701 114 | #6 22,41355 3,295275 -1,2450126 -0,70618891 1,0924036 115 | 116 | 117 | # Checking the difference between our R implementation and the shap implementation in Python 118 |
mean(abs(as.matrix(Kshap_indep)-as.matrix(py$Kshap_shap))) 119 | #[1] 1,300368e-07 # Numerically identical 120 | 121 | # Checking the running time of the different methods 122 | time_R_indep[3] 123 | time_R_largesigma[3] 124 | py$time_py 125 | 126 | #> time_R_indep[3] 127 | #elapsed 128 | #7,417 129 | #> time_R_largesigma[3] 130 | #elapsed 131 | #6,271 132 | #> py$time_py 133 | #[1] 21,23536 134 | 135 | # Our R implementation is about 3 times faster than the shap package on this task. 136 | # There may be some overhead from calling Python from R, but it should not be anywhere near that large. 137 | 138 | 139 | 140 | 141 | 142 | 143 | -------------------------------------------------------------------------------- /inst/joss_paper/paper.bib: -------------------------------------------------------------------------------- 1 | @article{aas2019explaining, 2 | title={Explaining individual predictions when features are dependent: More accurate approximations to {S}hapley values}, 3 | author={Aas, Kjersti and Jullum, Martin and L{\o}land, Anders}, 4 | journal={arXiv preprint arXiv:1903.10464}, 5 | year={2019} 6 | } 7 | 8 | @inproceedings{lundberg2017unified, 9 | title={A unified approach to interpreting model predictions}, 10 | author={Lundberg, Scott M and Lee, Su-In}, 11 | booktitle={Advances in Neural Information Processing Systems}, 12 | pages={4765--4774}, 13 | year={2017} 14 | } 15 | 16 | @article{kononenko2010efficient, 17 | title={An efficient explanation of individual classifications using game theory}, 18 | author={{\v{S}}trumbelj, Erik and Kononenko, Igor}, 19 | journal={Journal of Machine Learning Research}, 20 | volume={11}, 21 | number={Jan}, 22 | pages={1--18}, 23 | year={2010} 24 | } 25 | 26 | @article{vstrumbelj2014explaining, 27 | title={Explaining prediction models and individual predictions with feature contributions}, 28 | author={{\v{S}}trumbelj, Erik and Kononenko, Igor}, 29 | journal={Knowledge and Information Systems}, 30 | volume={41}, 31 | number={3}, 32 | pages={647--665}, 33 | year={2014}, 34 | publisher={Springer}, 35 | doi={10.1007/s10115-013-0679-x}, 36 | } 37 | 38 | @article{lundberg2018consistent, 39 | title={Consistent individualized feature attribution for tree ensembles}, 40 | author={Lundberg, Scott M and Erion, Gabriel G and Lee, Su-In}, 41 | journal={arXiv preprint arXiv:1802.03888}, 42 | year={2018} 43 | } 44 | 45 | @article{hurvich1998smoothing, 46 | title={Smoothing parameter selection in nonparametric regression using an improved Akaike information criterion}, 47 | author={Hurvich, Clifford M and Simonoff, Jeffrey S and Tsai, Chih-Ling}, 48 | journal={Journal of the Royal Statistical Society: Series B (Statistical Methodology)}, 49 | volume={60}, 50 | number={2}, 51 | pages={271--293}, 52 | year={1998}, 53 | publisher={Wiley Online Library}, 54 | doi={10.1111/1467-9868.00125} 55 | } 56 | 57 | @Article{Shapley53, 58 | author={Lloyd S. Shapley}, 59 | title={A value for n-person games}, 60 | journal={Contributions to the Theory of Games}, 61 | volume=2, 62 | year = 1953, 63 | pages ={307--317} 64 | } 65 | 66 | @article{sklar1959fonctions, 67 | title={Fonctions de r{\'e}partition {\`a} n dimensions et leurs marges}, 68 | author={Sklar, M}, 69 | journal={Publ. Inst. Statist. Univ.
Paris}, 70 | volume={8}, 71 | pages={229--231}, 72 | year={1959} 73 | } 74 | 75 | @article{rosenblatt1956, 76 | author = {Murray Rosenblatt}, 77 | journal = {{The Annals of Mathematical Statistics}}, 78 | pages = {832--837}, 79 | title = {{Remarks on Some Nonparametric Estimates of a Density Function}}, 80 | volume = 27, 81 | year = 1956, 82 | doi = {10.1214/aoms/1177728190} 83 | } 84 | 85 | 86 | @Manual{limeRpackage, 87 | title = {{lime}: Local Interpretable Model-Agnostic Explanations}, 88 | author = {Thomas Lin Pedersen and Michaël Benesty}, 89 | year = {2019}, 90 | note = {R package version 0.5.1}, 91 | url = {https://CRAN.R-project.org/package=lime}, 92 | } 93 | 94 | 95 | @Article{rcppRpackage, 96 | title = {{Rcpp}: Seamless {R} and {C++} Integration}, 97 | author = {Dirk Eddelbuettel and Romain Fran\c{c}ois}, 98 | journal = {Journal of Statistical Software}, 99 | year = {2011}, 100 | volume = {40}, 101 | number = {8}, 102 | pages = {1--18}, 103 | url = {http://www.jstatsoft.org/v40/i08/}, 104 | doi = {10.18637/jss.v040.i08}, 105 | } 106 | 107 | @Manual{rCore, 108 | title = {R: A language and environment for statistical computing}, 109 | author = {{R Core Team}}, 110 | organization = {R Foundation for Statistical Computing}, 111 | year = {2019}, 112 | url = {https://www.R-project.org/}, 113 | } 114 | 115 | @Manual{datatableRpackage, 116 | title = {{data.table}: Extension of `data.frame`}, 117 | author = {Matt Dowle and Arun Srinivasan}, 118 | year = {2019}, 119 | note = {R package version 1.12.2}, 120 | url = {https://CRAN.R-project.org/package=data.table}, 121 | } 122 | 123 | 124 | @article{Lundberg2020, 125 | author = {Scott M. Lundberg and 126 | Gabriel G. Erion and 127 | Hugh Chen and 128 | Alex DeGrave and 129 | Jordan M. Prutkin and 130 | Bala Nair and 131 | Ronit Katz and 132 | Jonathan Himmelfarb and 133 | Nisha Bansal and 134 | Su{-}In Lee}, 135 | title = {From local explanations to global understanding with explainable {AI} for trees}, 136 | journal = {Nature Machine Intelligence}, 137 | volume = {2}, 138 | pages = {56-67}, 139 | year = {2020}, 140 | doi = {10.1038/s42256-019-0138-9} 141 | } 142 | 143 | @article{eddelbuettel2014rcpparmadillo, 144 | title={RcppArmadillo: {A}ccelerating {R} with high-performance {C}++ linear algebra}, 145 | author={Eddelbuettel, Dirk and Sanderson, Conrad}, 146 | journal={Computational Statistics \& Data Analysis}, 147 | volume={71}, 148 | pages={1054--1063}, 149 | year={2014}, 150 | publisher={Elsevier}, 151 | doi = {10.1016/j.csda.2013.02.005} 152 | } 153 | 154 | @Article{ranger, 155 | title = {{ranger}: A Fast Implementation of Random Forests for High Dimensional Data in {C++} and {R}}, 156 | author = {Marvin N.
Wright and Andreas Ziegler}, 157 | journal = {Journal of Statistical Software}, 158 | year = {2017}, 159 | volume = {77}, 160 | number = {1}, 161 | pages = {1--17}, 162 | doi = {10.18637/jss.v077.i01}, 163 | } 164 | 165 | @Manual{xgboost, 166 | title = {xgboost: Extreme Gradient Boosting}, 167 | author = {Tianqi Chen and Tong He and Michael Benesty and Vadim Khotilovich and Yuan Tang and Hyunsu Cho and Kailong Chen and Rory Mitchell and Ignacio Cano and Tianyi Zhou and Mu Li and Junyuan Xie and Min Lin and Yifeng Geng and Yutian Li}, 168 | year = {2019}, 169 | note = {R package version 0.90.0.2}, 170 | url = {https://CRAN.R-project.org/package=xgboost}, 171 | } 172 | 173 | @Book{mgcv, 174 | title = {Generalized Additive Models: An Introduction with {R}}, 175 | year = {2017}, 176 | author = {S. N. Wood}, 177 | edition = {2}, 178 | publisher = {Chapman and Hall/CRC}, 179 | } 180 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #' Computing single H matrix in AICc-function using the Mahalanobis distance 5 | #' 6 | #' @param X matrix with "covariates" 7 | #' @param mcov covariance matrix 8 | #' @param S_scale_dist logical indicating whether the Mahalanobis distance should be scaled with the number of variables 9 | #' @param h numeric specifying the scaling (sigma) 10 | #' 11 | #' @export 12 | #' @keywords internal 13 | #' 14 | #' @return Matrix of dimension \code{nrow(X)*nrow(X)} 15 | #' @author Martin Jullum 16 | hat_matrix_cpp <- function(X, mcov, S_scale_dist, h) { 17 | .Call(`_shapr_hat_matrix_cpp`, X, mcov, S_scale_dist, h) 18 | } 19 | 20 | #' sigma_hat_sq-function 21 | #' 22 | #' @param H Matrix. Output from \code{\link{hat_matrix_cpp}} 23 | #' @param y Vector representing the response variable 24 | #' 25 | #' @export 26 | #' @keywords internal 27 | #' 28 | #' @return Scalar 29 | #' 30 | #' @author Martin Jullum 31 | rss_cpp <- function(H, y) { 32 | .Call(`_shapr_rss_cpp`, H, y) 33 | } 34 | 35 | #' Correction term with trace input in the AICc formula 36 | #' 37 | #' @param tr_H numeric giving the trace of H 38 | #' @param n numeric giving the number of rows in H 39 | #' @export 40 | #' @keywords internal 41 | #' 42 | #' @return Scalar 43 | #' @author Martin Jullum 44 | correction_matrix_cpp <- function(tr_H, n) { 45 | .Call(`_shapr_correction_matrix_cpp`, tr_H, n) 46 | } 47 | 48 | #' Temporary function for computing the full AICc with several X matrices etc. 49 | #' 50 | #' @param X matrix with "covariates" 51 | #' @param mcov covariance matrix 52 | #' @param S_scale_dist logical indicating whether the Mahalanobis distance should be scaled with the number of variables 53 | #' @param h numeric specifying the scaling (sigma) 54 | #' @param y vector with the "response variable" 55 | #' 56 | #' @export 57 | #' @keywords internal 58 | #' 59 | #' @return Scalar with the numeric value of the AICc formula 60 | #' @author Martin Jullum 61 | aicc_full_single_cpp <- function(X, mcov, S_scale_dist, h, y) { 62 | .Call(`_shapr_aicc_full_single_cpp`, X, mcov, S_scale_dist, h, y) 63 | } 64 | 65 | #' AICc formula for several sets, alternative definition 66 | #' 67 | #' @param h Numeric. Specifies the scaling (sigma) 68 | #' @param X_list List 69 | #' @param mcov_list List 70 | #' @param S_scale_dist Logical.
Indicates whether Mahalanobis distance should be scaled with the 71 | #' number of variables 72 | #' @param y_list List. 73 | #' @param negative Logical. 74 | #' @keywords internal 75 | #' 76 | #' @return Scalar with the numeric value of the AICc formula 77 | #' 78 | #' @author Martin Jullum 79 | aicc_full_cpp <- function(h, X_list, mcov_list, S_scale_dist, y_list, negative) { 80 | .Call(`_shapr_aicc_full_cpp`, h, X_list, mcov_list, S_scale_dist, y_list, negative) 81 | } 82 | 83 | #' (Generalized) Mahalanobis distance 84 | #' 85 | #' Used to get the Euclidean distance as well by setting \code{mcov} = \code{diag(m)}. 86 | #' 87 | #' @param featureList List of vectors indicating all feature combinations that should be included in the computations. Assumes that the first one is empty. 88 | #' @param mcov Matrix. The Sigma-matrix in the Mahalanobis distance formula: \code{stats::cov(Xtrain_mat)} gives the Mahalanobis distance, 89 | #' \code{diag(m)} gives the Euclidean distance. 90 | #' @param S_scale_dist Logical indicating whether the Mahalanobis distance should be scaled with the number of variables 91 | #' @param Xtrain_mat Matrix 92 | #' @param Xtest_mat Matrix 93 | #' 94 | #' @export 95 | #' @keywords internal 96 | #' 97 | #' @return Array of three dimensions. Contains the squared distances between all training and test observations for all feature combinations passed to the function. 98 | #' @author Martin Jullum 99 | mahalanobis_distance_cpp <- function(featureList, Xtrain_mat, Xtest_mat, mcov, S_scale_dist) { 100 | .Call(`_shapr_mahalanobis_distance_cpp`, featureList, Xtrain_mat, Xtest_mat, mcov, S_scale_dist) 101 | } 102 | 103 | #' @keywords internal 104 | sample_features_cpp <- function(m, n_features) { 105 | .Call(`_shapr_sample_features_cpp`, m, n_features) 106 | } 107 | 108 | #' Get imputed data 109 | #' 110 | #' @param index_xtrain Positive integer. Represents a sequence of row indices from \code{xtrain}, 111 | #' i.e. \code{min(index_xtrain) >= 1} and \code{max(index_xtrain) <= nrow(xtrain)}. 112 | #' 113 | #' @param index_s Positive integer. Represents a sequence of row indices from \code{S}, 114 | #' i.e. \code{min(index_s) >= 1} and \code{max(index_s) <= nrow(S)}. 115 | #' 116 | #' @param xtrain Numeric matrix. 117 | #' 118 | #' @param xtest Numeric matrix. Represents a single test observation. 119 | #' 120 | #' @param S Integer matrix of dimension \code{n_combinations x m}, where \code{n_combinations} equals 121 | #' the total number of sampled/non-sampled feature combinations and \code{m} equals 122 | #' the total number of unique features. Note that \code{m = ncol(xtrain)}. See details 123 | #' for more information. 124 | #' 125 | #' @details \code{S(i, j) = 1} if and only if feature \code{j} is present in feature 126 | #' combination \code{i}, otherwise \code{S(i, j) = 0}. I.e. if \code{m = 3}, there 127 | #' are \code{2^3 = 8} unique ways to combine the features. In this case \code{dim(S) = c(8, 3)}. 128 | #' Let's call the features \code{x1, x2, x3} and take a closer look at the combination 129 | #' represented by \code{s = c(x1, x2)}. If this combination is represented by the second row, 130 | #' the following is true: \code{S[2, 1:3] = c(1, 1, 0)}. 131 | #' 132 | #' The returned object, \code{X}, is a numeric matrix where 133 | #' \code{dim(X) = c(length(index_xtrain), ncol(xtrain))}. If feature \code{j} is present in 134 | #' the k-th observation, that is \code{S[index_s[k], j] == 1}, then \code{X[k, j] = xtest[1, j]}. 135 | #' Otherwise \code{X[k, j] = xtrain[index_xtrain[k], j]}.
136 | #' 137 | #' @export 138 | #' @keywords internal 139 | #' 140 | #' @return Numeric matrix 141 | #' 142 | #' @author Nikolai Sellereite 143 | observation_impute_cpp <- function(index_xtrain, index_s, xtrain, xtest, S) { 144 | .Call(`_shapr_observation_impute_cpp`, index_xtrain, index_s, xtrain, xtest, S) 145 | } 146 | 147 | #' Calculate weight matrix 148 | #' 149 | #' @param subsets List. Each of the elements equals an integer 150 | #' vector representing a valid combination of features/feature groups. 151 | #' @param m Integer. Number of features/feature groups 152 | #' @param n Integer. Number of combinations 153 | #' @param w Numeric vector of length \code{n}, i.e. \code{w[i]} equals 154 | #' the Shapley weight of feature/feature group combination \code{i}, represented by 155 | #' \code{subsets[[i]]}. 156 | #' 157 | #' @export 158 | #' @keywords internal 159 | #' 160 | #' @return Matrix of dimension \code{n x (m + 1)} 161 | #' @author Nikolai Sellereite 162 | weight_matrix_cpp <- function(subsets, m, n, w) { 163 | .Call(`_shapr_weight_matrix_cpp`, subsets, m, n, w) 164 | } 165 | 166 | #' Get feature matrix 167 | #' 168 | #' @param features List 169 | #' @param m Positive integer. Total number of features 170 | #' 171 | #' @export 172 | #' @keywords internal 173 | #' 174 | #' @return Matrix 175 | #' @author Nikolai Sellereite 176 | feature_matrix_cpp <- function(features, m) { 177 | .Call(`_shapr_feature_matrix_cpp`, features, m) 178 | } 179 | 180 | -------------------------------------------------------------------------------- /inst/scripts/devel/compare_explain_batch.R: -------------------------------------------------------------------------------- 1 | 2 | library(xgboost) 3 | #library(shapr) 4 | library(data.table) 5 | 6 | data("Boston", package = "MASS") 7 | 8 | x_var <- c("lstat", "rm", "dis", "indus")#,"nox","age","tax","ptratio") 9 | y_var <- "medv" 10 | 11 | x_train <- as.matrix(Boston[-1:-6, x_var]) 12 | y_train <- Boston[-1:-6, y_var] 13 | x_test <- as.matrix(Boston[1:6, x_var]) 14 | 15 | # Fitting a basic xgboost model to the training data 16 | model <- xgboost( 17 | data = x_train, 18 | label = y_train, 19 | nround = 20, 20 | verbose = FALSE 21 | ) 22 | 23 | # THIS IS GENERATED FROM MASTER BRANCH 24 | # Prepare the data for explanation 25 | library(shapr) 26 | explainer <- shapr(x_train, model,n_combinations = 100) 27 | p = mean(y_train) 28 | gauss = explain(x_test, explainer, "gaussian", prediction_zero = p, n_samples = 10000) 29 | emp = explain(x_test, explainer, "empirical", prediction_zero = p, n_samples = 10000) 30 | copula = explain(x_test, explainer, "copula", prediction_zero = p, n_samples = 10000) 31 | indep = explain(x_test, explainer, "independence", prediction_zero = p, n_samples = 10000) 32 | comb = explain(x_test, explainer, c("gaussian", "gaussian", "empirical", "empirical"), prediction_zero = p, n_samples = 10000) 33 | ctree = explain(x_test, explainer, "ctree", mincriterion = 0.95, prediction_zero = p, n_samples = 10000) 34 | ctree2 = explain(x_test, explainer, "ctree", mincriterion = c(0.95, 0.95, 0.95, 0.95), prediction_zero = p, n_samples = 10000) 35 | #saveRDS(list(gauss = gauss, empirical = emp, copula = copula, indep = indep, comb = comb, ctree = ctree, ctree_comb = ctree2), file = "inst/scripts/devel/master_res2.rds") 36 | # saveRDS(list(ctree = ctree, ctree_comb = ctree2), file = "inst/scripts/devel/master_res_ctree.rds") 37 | 38 | 39 | detach("package:shapr", unload = TRUE) 40 | devtools::load_all() 41 | nobs = 6 42 | x_test <- as.matrix(Boston[1:nobs,
x_var]) 43 | explainer <- shapr(x_train, model,n_combinations = 100) 44 | p = mean(y_train) 45 | gauss = explain(x_test, explainer, "gaussian", prediction_zero = p, n_samples = 10000, n_batches = 1) 46 | emp = explain(x_test, explainer, "empirical", prediction_zero = p, n_samples = 10000, n_batches = 1) 47 | copula = explain(x_test, explainer, "copula", prediction_zero = p, n_samples = 10000, n_batches = 1) 48 | indep = explain(x_test, explainer, "independence", prediction_zero = p, n_samples = 10000, n_batches = 1) 49 | comb = explain(x_test, explainer, c("gaussian", "gaussian", "empirical", "empirical"), prediction_zero = p, n_samples = 10000, n_batches = 1) 50 | ctree = explain(x_test, explainer, "ctree", mincriterion = 0.95, prediction_zero = p, n_samples = 10000, n_batches = 1) 51 | ctree2 = explain(x_test, explainer, "ctree", mincriterion = c(0.95, 0.95, 0.95, 0.95), prediction_zero = p, n_samples = 10000, n_batches = 1) 52 | 53 | res = readRDS("inst/scripts/devel/master_res2.rds") 54 | 55 | # Compare res 56 | all.equal(res$gauss$dt, gauss$dt) # TRUE 57 | all.equal(res$empirical$dt, emp$dt) # TRUE 58 | 59 | res$comb$dt 60 | comb$dt 61 | 62 | # With batches 63 | gauss_b = explain(x_test, explainer, "gaussian", prediction_zero = p, n_samples = 10000, n_batches = 3) 64 | emp_b = explain(x_test, explainer, "empirical", prediction_zero = p, n_samples = 10000, n_batches = 3) 65 | 66 | gauss_b$dt 67 | res$gauss$dt 68 | 69 | emp_b$dt 70 | res$empirical$dt 71 | 72 | #### MJ stuff here: 73 | 74 | explain.independence2 <- function(x, explainer, approach, prediction_zero, 75 | n_samples = 1e3, n_batches = 1, seed = 1, only_return_contrib_dt = FALSE, ...) { 76 | 77 | 78 | if (!is.null(seed)) set.seed(seed) 79 | 80 | # Add arguments to explainer object 81 | explainer$x_test <- as.matrix(preprocess_data(x, explainer$feature_list)$x_dt) 82 | explainer$approach <- approach 83 | explainer$n_samples <- n_samples 84 | 85 | r <- prepare_and_predict(explainer, n_batches, prediction_zero, only_return_contrib_dt, ...) 86 | } 87 | 88 | 89 | prepare_data.independence2 <- function(x, index_features = NULL, ...) { 90 | id <- id_combination <- w <- NULL # due to NSE notes in R CMD check 91 | 92 | if (is.null(index_features)) { 93 | index_features <- x$X[, .I] 94 | } 95 | 96 | S <- x$S[index_features, ] 97 | x_train <- as.matrix(x$x_train) 98 | n_train <- nrow(x_train) 99 | n_samples <- min(x$n_samples, n_train) 100 | 101 | index_s <- rep(seq(nrow(S)), each = n_samples) 102 | w <- 1 / x$n_samples 103 | 104 | n_col <- nrow(x$x_test) 105 | 106 | dt_l <- list() 107 | for (i in seq(n_col)) { 108 | x_test <- x$x_test[i, , drop = FALSE] 109 | 110 | # sampling index_xtrain 111 | index_xtrain <- c(replicate(nrow(S), sample(x = seq(n_train), size = n_samples, replace = F))) 112 | 113 | # Generate data used for prediction 114 | dt_p <- observation_impute_cpp( 115 | index_xtrain = index_xtrain, 116 | index_s = index_s, 117 | xtrain = x_train, 118 | xtest = x_test, 119 | S = S 120 | ) 121 | 122 | # Add keys 123 | dt_l[[i]] <- data.table::as.data.table(dt_p) 124 | data.table::setnames(dt_l[[i]], colnames(x_train)) 125 | dt_l[[i]][, id_combination := index_s] 126 | dt_l[[i]][, w := w] # IS THIS NECESSARY? 
127 | dt_l[[i]][, id := i] 128 | } 129 | 130 | 131 | dt <- data.table::rbindlist(dt_l, use.names = TRUE, fill = TRUE) 132 | return(dt) 133 | } 134 | 135 | 136 | 137 | 138 | # Using independence with n_samples > nrow(x_train) such that no sampling is performed 139 | 140 | indep1 = explain(x_test, explainer, "independence", prediction_zero = p, n_samples = 10000, n_batches = 1) 141 | indep2 = explain(x_test, explainer, "independence2", prediction_zero = p, n_samples = 10000, n_batches = 1) 142 | 143 | all.equal(indep1,indep2) # TRUE 144 | 145 | indep1_batch_2 = explain(x_test, explainer, "independence", prediction_zero = p, n_samples = 10000, n_batches = 2) 146 | 147 | all.equal(indep1,indep1_batch_2) # TRUE 148 | 149 | indep1_batch_5 = explain(x_test, explainer, "independence", prediction_zero = p, n_samples = 10000, n_batches = 5) 150 | 151 | all.equal(indep1,indep1_batch_5) # TRUE 152 | 153 | comb_indep_1_batch_1 = explain(x_test, explainer, c("independence", "independence", "independence", "independence"), prediction_zero = p, n_samples = 10000, n_batches = 1) 154 | 155 | all.equal(indep1,comb_indep_1_batch_1) # TRUE 156 | 157 | comb_indep_1_batch_2 = explain(x_test, explainer, c("independence", "independence", "independence", "independence"), prediction_zero = p, n_samples = 10000, n_batches = 2) 158 | 159 | all.equal(indep1,comb_indep_1_batch_2) # TRUE 160 | 161 | comb_indep_1_2_batch_1 = explain(x_test, explainer, c("independence", "independence", "independence2", "independence2"), prediction_zero = p, n_samples = 10000, n_batches = 1) 162 | 163 | all.equal(indep1,comb_indep_1_2_batch_1) #TRUE 164 | 165 | comb_indep_1_2_batch_2 = explain(x_test, explainer, c("independence", "independence", "independence2", "independence2"), prediction_zero = p, n_samples = 10000, n_batches = 2) 166 | 167 | all.equal(indep1,comb_indep_1_2_batch_2) #TRUE 168 | 169 | comb_indep_1_2_batch_5 = explain(x_test, explainer, c("independence", "independence", "independence2", "independence2"), prediction_zero = p, n_samples = 10000, n_batches = 5) 170 | 171 | all.equal(indep1,comb_indep_1_2_batch_5) #TRUE 172 | 173 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | bibliography: ./inst/REFERENCES.bib 4 | --- 5 | 6 | 7 | 8 | ```{r setup, include = FALSE} 9 | knitr::opts_chunk$set( 10 | collapse = TRUE, 11 | comment = "#>", 12 | fig.path = "man/figures/README-", 13 | out.width = "100%", 14 | tidy = "styler" 15 | ) 16 | 17 | ``` 18 | 19 | # shapr 20 | 21 | 22 | [![CRAN_Status_Badge](https://www.r-pkg.org/badges/version-last-release/shapr)](https://cran.r-project.org/package=shapr) 23 | [![CRAN_Downloads_Badge](https://cranlogs.r-pkg.org/badges/grand-total/shapr)](https://cran.r-project.org/package=shapr) 24 | [![R build status](https://github.com/NorskRegnesentral/shapr/workflows/R-CMD-check/badge.svg)](https://github.com/NorskRegnesentral/shapr/actions?query=workflow%3AR-CMD-check) 25 | [![Lifecycle: experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://lifecycle.r-lib.org/articles/stages.html) 26 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 27 | [![DOI](https://joss.theoj.org/papers/10.21105/joss.02027/status.svg)](https://doi.org/10.21105/joss.02027) 28 | 29 | 30 | # NOTE: This package is undergoing severe restructuring. 
A new version (with breaking changes) will be available on GitHub soon. 31 | 32 | The most common machine learning task is to train a model which is able to predict an unknown outcome (response variable) based on a set of known input variables/features. 33 | When using such models for real-life applications, it is often crucial to understand why a certain set of features leads to exactly that prediction. 34 | However, explaining predictions from complex, or seemingly simple, machine learning models is a practical and ethical question, as well as a legal issue. Can I trust the model? Is it biased? Can I explain it to others? We want to explain individual predictions from a complex machine learning model by learning simple, interpretable explanations. 35 | 36 | Shapley values are the only prediction explanation framework with a solid theoretical foundation (@lundberg2017unified). Unless the true distribution of the features is known, and there are fewer than, say, 10-15 features, these Shapley values need to be estimated/approximated. 37 | Popular methods like Shapley Sampling Values (@vstrumbelj2014explaining), SHAP/Kernel SHAP (@lundberg2017unified), and to some extent TreeSHAP (@lundberg2018consistent), assume that the features are independent when approximating the Shapley values for prediction explanation. This may lead to very inaccurate Shapley values, and consequently wrong interpretations of the predictions. @aas2019explaining extends and improves the Kernel SHAP method of @lundberg2017unified to account for the dependence between the features, resulting in significantly more accurate approximations to the Shapley values. 38 | [See the paper for details](https://arxiv.org/abs/1903.10464). 39 | 40 | This package implements the methodology of @aas2019explaining. 41 | 42 | The following methodology/features are currently implemented: 43 | 44 | - Native support of explanation of predictions from models fitted with the following functions 45 | `stats::glm`, `stats::lm`, `ranger::ranger`, `xgboost::xgboost`/`xgboost::xgb.train` and `mgcv::gam`. 46 | - Accounting for feature dependence assuming the features are Gaussian (@aas2019explaining). 47 | - Accounting for feature dependence with a Gaussian copula (Gaussian dependence structure, any marginal) (@aas2019explaining). 48 | - Accounting for feature dependence using the Mahalanobis distance-based empirical (conditional) distribution approach of @aas2019explaining. 49 | - Accounting for feature dependence using conditional inference trees (@redelmeier2020explaining). 50 | - Combining any of the four methods. 51 | - Optional use of the AICc criterion of @hurvich1998smoothing when optimizing the bandwidth parameter in the empirical (conditional) approach of @aas2019explaining. 52 | - Functionality for visualizing the explanations. 53 | - Support for models not supported natively. 54 | 55 | 62 | 63 | Future releases will include: 64 | 65 | - Support for parallelization over explanations, Monte Carlo sampling and feature subsets for non-parallelizable prediction functions. 66 | - Computational improvement of the AICc optimization approach. 67 | - Adaptive selection of method to account for the feature dependence. 68 | 69 | Note that both the features and the prediction must be numeric. The approach is constructed for continuous features. Discrete features may also work just fine with the empirical (conditional) distribution approach. 70 | Unlike SHAP and TreeSHAP, we decompose probability predictions directly to ease the interpretability, i.e.
not via log odds transformations. 71 | The application programming interface (API) of `shapr` is inspired by @lime_api. 72 | 73 | ## Installation 74 | 75 | To install the current stable release from CRAN, use 76 | 77 | ```{r, eval = FALSE} 78 | install.packages("shapr") 79 | ``` 80 | 81 | To install the current development version, use 82 | 83 | ```{r, eval = FALSE} 84 | remotes::install_github("NorskRegnesentral/shapr") 85 | ``` 86 | 87 | If you would like to install all packages of the models we currently support, use 88 | 89 | ```{r, eval = FALSE} 90 | remotes::install_github("NorskRegnesentral/shapr", dependencies = TRUE) 91 | ``` 92 | 93 | 94 | If you would also like to build and view the vignette locally, use 95 | ```{r, eval = FALSE} 96 | remotes::install_github("NorskRegnesentral/shapr", dependencies = TRUE, build_vignettes = TRUE) 97 | vignette("understanding_shapr", "shapr") 98 | ``` 99 | 100 | You can always check out the latest version of the vignette [here](https://norskregnesentral.github.io/shapr/articles/understanding_shapr.html). 101 | 102 | ## Example 103 | `shapr` supports computation of Shapley values with any predictive model which takes a set of numeric features and produces a numeric outcome. 104 | 105 | The following example shows how a simple `xgboost` model is trained using the *Boston Housing Data*, and how `shapr` explains the individual predictions. 106 | 107 | 108 | ```{r basic_example, warning = FALSE} 109 | library(xgboost) 110 | library(shapr) 111 | 112 | data("Boston", package = "MASS") 113 | 114 | x_var <- c("lstat", "rm", "dis", "indus") 115 | y_var <- "medv" 116 | 117 | ind_x_test <- 1:6 118 | x_train <- as.matrix(Boston[-ind_x_test, x_var]) 119 | y_train <- Boston[-ind_x_test, y_var] 120 | x_test <- as.matrix(Boston[ind_x_test, x_var]) 121 | 122 | # Looking at the dependence between the features 123 | cor(x_train) 124 | 125 | # Fitting a basic xgboost model to the training data 126 | model <- xgboost( 127 | data = x_train, 128 | label = y_train, 129 | nround = 20, 130 | verbose = FALSE 131 | ) 132 | 133 | # Prepare the data for explanation 134 | explainer <- shapr(x_train, model) 135 | 136 | # Specifying the phi_0, i.e. the expected prediction without any features 137 | p <- mean(y_train) 138 | 139 | # Computing the actual Shapley values with kernelSHAP accounting for feature dependence using 140 | # the empirical (conditional) distribution approach with bandwidth parameter sigma = 0.1 (default) 141 | explanation <- explain( 142 | x_test, 143 | approach = "empirical", 144 | explainer = explainer, 145 | prediction_zero = p 146 | ) 147 | 148 | # Printing the Shapley values for the test data. 149 | # For more information about the interpretation of the values in the table, see ?shapr::explain. 150 | print(explanation$dt) 151 | 152 | # Finally we plot the resulting explanations 153 | plot(explanation) 154 | ``` 155 | 156 | ## Contribution 157 | 158 | All feedback and suggestions are very welcome. Details on how to contribute can be found 159 | [here](https://norskregnesentral.github.io/shapr/CONTRIBUTING.html). If you have any questions or comments, feel 160 | free to open an issue [here](https://github.com/NorskRegnesentral/shapr/issues). 161 | 162 | Please note that the 'shapr' project is released with a 163 | [Contributor Code of Conduct](https://norskregnesentral.github.io/shapr/CODE_OF_CONDUCT.html). 164 | By contributing to this project, you agree to abide by its terms. 
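## Custom models

As noted in the feature list above, models without native support can also be explained. The following is a minimal, hypothetical sketch of the pattern (the wrapper class `my_lm` and all object names are illustrative assumptions, and the exact specification format should be checked against the vignette, which is the authoritative description of the interface): you supply a `predict_model` method for your model's class, and preferably a `get_model_specs` method so `shapr` can validate the feature setup.

```{r, eval = FALSE}
# Hypothetical custom model class wrapping a fitted lm(); names are illustrative
model <- structure(
  list(
    fit = lm(medv ~ lstat + rm, data = head(MASS::Boston, -6)),
    labels = c("lstat", "rm")
  ),
  class = "my_lm"
)

# Mandatory: how to produce numeric predictions from the custom model
predict_model.my_lm <- function(x, newdata) {
  as.vector(predict(x$fit, newdata = as.data.frame(newdata)))
}

# Recommended: tell shapr which features the model uses, with classes and factor levels
get_model_specs.my_lm <- function(x) {
  m <- length(x$labels)
  list(
    labels = x$labels,
    classes = setNames(rep("numeric", m), x$labels),
    factor_levels = setNames(vector("list", m), x$labels)
  )
}

# With these two methods defined, shapr() and explain() are used exactly as above
```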
165 | 166 | ## References 167 | 168 | 169 | -------------------------------------------------------------------------------- /src/RcppExports.cpp: -------------------------------------------------------------------------------- 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #include 5 | #include 6 | 7 | using namespace Rcpp; 8 | 9 | #ifdef RCPP_USE_GLOBAL_ROSTREAM 10 | Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); 11 | Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); 12 | #endif 13 | 14 | // hat_matrix_cpp 15 | arma::mat hat_matrix_cpp(arma::mat X, arma::mat mcov, bool S_scale_dist, double h); 16 | RcppExport SEXP _shapr_hat_matrix_cpp(SEXP XSEXP, SEXP mcovSEXP, SEXP S_scale_distSEXP, SEXP hSEXP) { 17 | BEGIN_RCPP 18 | Rcpp::RObject rcpp_result_gen; 19 | Rcpp::RNGScope rcpp_rngScope_gen; 20 | Rcpp::traits::input_parameter< arma::mat >::type X(XSEXP); 21 | Rcpp::traits::input_parameter< arma::mat >::type mcov(mcovSEXP); 22 | Rcpp::traits::input_parameter< bool >::type S_scale_dist(S_scale_distSEXP); 23 | Rcpp::traits::input_parameter< double >::type h(hSEXP); 24 | rcpp_result_gen = Rcpp::wrap(hat_matrix_cpp(X, mcov, S_scale_dist, h)); 25 | return rcpp_result_gen; 26 | END_RCPP 27 | } 28 | // rss_cpp 29 | double rss_cpp(arma::mat H, arma::vec y); 30 | RcppExport SEXP _shapr_rss_cpp(SEXP HSEXP, SEXP ySEXP) { 31 | BEGIN_RCPP 32 | Rcpp::RObject rcpp_result_gen; 33 | Rcpp::RNGScope rcpp_rngScope_gen; 34 | Rcpp::traits::input_parameter< arma::mat >::type H(HSEXP); 35 | Rcpp::traits::input_parameter< arma::vec >::type y(ySEXP); 36 | rcpp_result_gen = Rcpp::wrap(rss_cpp(H, y)); 37 | return rcpp_result_gen; 38 | END_RCPP 39 | } 40 | // correction_matrix_cpp 41 | double correction_matrix_cpp(double tr_H, int n); 42 | RcppExport SEXP _shapr_correction_matrix_cpp(SEXP tr_HSEXP, SEXP nSEXP) { 43 | BEGIN_RCPP 44 | Rcpp::RObject rcpp_result_gen; 45 | Rcpp::RNGScope rcpp_rngScope_gen; 46 | Rcpp::traits::input_parameter< double >::type tr_H(tr_HSEXP); 47 | Rcpp::traits::input_parameter< int >::type n(nSEXP); 48 | rcpp_result_gen = Rcpp::wrap(correction_matrix_cpp(tr_H, n)); 49 | return rcpp_result_gen; 50 | END_RCPP 51 | } 52 | // aicc_full_single_cpp 53 | arma::vec aicc_full_single_cpp(arma::mat X, arma::mat mcov, bool S_scale_dist, double h, arma::vec y); 54 | RcppExport SEXP _shapr_aicc_full_single_cpp(SEXP XSEXP, SEXP mcovSEXP, SEXP S_scale_distSEXP, SEXP hSEXP, SEXP ySEXP) { 55 | BEGIN_RCPP 56 | Rcpp::RObject rcpp_result_gen; 57 | Rcpp::RNGScope rcpp_rngScope_gen; 58 | Rcpp::traits::input_parameter< arma::mat >::type X(XSEXP); 59 | Rcpp::traits::input_parameter< arma::mat >::type mcov(mcovSEXP); 60 | Rcpp::traits::input_parameter< bool >::type S_scale_dist(S_scale_distSEXP); 61 | Rcpp::traits::input_parameter< double >::type h(hSEXP); 62 | Rcpp::traits::input_parameter< arma::vec >::type y(ySEXP); 63 | rcpp_result_gen = Rcpp::wrap(aicc_full_single_cpp(X, mcov, S_scale_dist, h, y)); 64 | return rcpp_result_gen; 65 | END_RCPP 66 | } 67 | // aicc_full_cpp 68 | double aicc_full_cpp(double h, Rcpp::List X_list, Rcpp::List mcov_list, bool S_scale_dist, Rcpp::List y_list, bool negative); 69 | RcppExport SEXP _shapr_aicc_full_cpp(SEXP hSEXP, SEXP X_listSEXP, SEXP mcov_listSEXP, SEXP S_scale_distSEXP, SEXP y_listSEXP, SEXP negativeSEXP) { 70 | BEGIN_RCPP 71 | Rcpp::RObject rcpp_result_gen; 72 | Rcpp::RNGScope rcpp_rngScope_gen; 73 | Rcpp::traits::input_parameter< double >::type h(hSEXP); 74 | 
Rcpp::traits::input_parameter< Rcpp::List >::type X_list(X_listSEXP); 75 | Rcpp::traits::input_parameter< Rcpp::List >::type mcov_list(mcov_listSEXP); 76 | Rcpp::traits::input_parameter< bool >::type S_scale_dist(S_scale_distSEXP); 77 | Rcpp::traits::input_parameter< Rcpp::List >::type y_list(y_listSEXP); 78 | Rcpp::traits::input_parameter< bool >::type negative(negativeSEXP); 79 | rcpp_result_gen = Rcpp::wrap(aicc_full_cpp(h, X_list, mcov_list, S_scale_dist, y_list, negative)); 80 | return rcpp_result_gen; 81 | END_RCPP 82 | } 83 | // mahalanobis_distance_cpp 84 | arma::cube mahalanobis_distance_cpp(Rcpp::List featureList, arma::mat Xtrain_mat, arma::mat Xtest_mat, arma::mat mcov, bool S_scale_dist); 85 | RcppExport SEXP _shapr_mahalanobis_distance_cpp(SEXP featureListSEXP, SEXP Xtrain_matSEXP, SEXP Xtest_matSEXP, SEXP mcovSEXP, SEXP S_scale_distSEXP) { 86 | BEGIN_RCPP 87 | Rcpp::RObject rcpp_result_gen; 88 | Rcpp::RNGScope rcpp_rngScope_gen; 89 | Rcpp::traits::input_parameter< Rcpp::List >::type featureList(featureListSEXP); 90 | Rcpp::traits::input_parameter< arma::mat >::type Xtrain_mat(Xtrain_matSEXP); 91 | Rcpp::traits::input_parameter< arma::mat >::type Xtest_mat(Xtest_matSEXP); 92 | Rcpp::traits::input_parameter< arma::mat >::type mcov(mcovSEXP); 93 | Rcpp::traits::input_parameter< bool >::type S_scale_dist(S_scale_distSEXP); 94 | rcpp_result_gen = Rcpp::wrap(mahalanobis_distance_cpp(featureList, Xtrain_mat, Xtest_mat, mcov, S_scale_dist)); 95 | return rcpp_result_gen; 96 | END_RCPP 97 | } 98 | // sample_features_cpp 99 | List sample_features_cpp(int m, IntegerVector n_features); 100 | RcppExport SEXP _shapr_sample_features_cpp(SEXP mSEXP, SEXP n_featuresSEXP) { 101 | BEGIN_RCPP 102 | Rcpp::RObject rcpp_result_gen; 103 | Rcpp::RNGScope rcpp_rngScope_gen; 104 | Rcpp::traits::input_parameter< int >::type m(mSEXP); 105 | Rcpp::traits::input_parameter< IntegerVector >::type n_features(n_featuresSEXP); 106 | rcpp_result_gen = Rcpp::wrap(sample_features_cpp(m, n_features)); 107 | return rcpp_result_gen; 108 | END_RCPP 109 | } 110 | // observation_impute_cpp 111 | NumericMatrix observation_impute_cpp(IntegerVector index_xtrain, IntegerVector index_s, NumericMatrix xtrain, NumericMatrix xtest, IntegerMatrix S); 112 | RcppExport SEXP _shapr_observation_impute_cpp(SEXP index_xtrainSEXP, SEXP index_sSEXP, SEXP xtrainSEXP, SEXP xtestSEXP, SEXP SSEXP) { 113 | BEGIN_RCPP 114 | Rcpp::RObject rcpp_result_gen; 115 | Rcpp::RNGScope rcpp_rngScope_gen; 116 | Rcpp::traits::input_parameter< IntegerVector >::type index_xtrain(index_xtrainSEXP); 117 | Rcpp::traits::input_parameter< IntegerVector >::type index_s(index_sSEXP); 118 | Rcpp::traits::input_parameter< NumericMatrix >::type xtrain(xtrainSEXP); 119 | Rcpp::traits::input_parameter< NumericMatrix >::type xtest(xtestSEXP); 120 | Rcpp::traits::input_parameter< IntegerMatrix >::type S(SSEXP); 121 | rcpp_result_gen = Rcpp::wrap(observation_impute_cpp(index_xtrain, index_s, xtrain, xtest, S)); 122 | return rcpp_result_gen; 123 | END_RCPP 124 | } 125 | // weight_matrix_cpp 126 | arma::mat weight_matrix_cpp(List subsets, int m, int n, NumericVector w); 127 | RcppExport SEXP _shapr_weight_matrix_cpp(SEXP subsetsSEXP, SEXP mSEXP, SEXP nSEXP, SEXP wSEXP) { 128 | BEGIN_RCPP 129 | Rcpp::RObject rcpp_result_gen; 130 | Rcpp::RNGScope rcpp_rngScope_gen; 131 | Rcpp::traits::input_parameter< List >::type subsets(subsetsSEXP); 132 | Rcpp::traits::input_parameter< int >::type m(mSEXP); 133 | Rcpp::traits::input_parameter< int >::type n(nSEXP); 134 | 
Rcpp::traits::input_parameter< NumericVector >::type w(wSEXP); 135 | rcpp_result_gen = Rcpp::wrap(weight_matrix_cpp(subsets, m, n, w)); 136 | return rcpp_result_gen; 137 | END_RCPP 138 | } 139 | // feature_matrix_cpp 140 | NumericMatrix feature_matrix_cpp(List features, int m); 141 | RcppExport SEXP _shapr_feature_matrix_cpp(SEXP featuresSEXP, SEXP mSEXP) { 142 | BEGIN_RCPP 143 | Rcpp::RObject rcpp_result_gen; 144 | Rcpp::RNGScope rcpp_rngScope_gen; 145 | Rcpp::traits::input_parameter< List >::type features(featuresSEXP); 146 | Rcpp::traits::input_parameter< int >::type m(mSEXP); 147 | rcpp_result_gen = Rcpp::wrap(feature_matrix_cpp(features, m)); 148 | return rcpp_result_gen; 149 | END_RCPP 150 | } 151 | 152 | static const R_CallMethodDef CallEntries[] = { 153 | {"_shapr_hat_matrix_cpp", (DL_FUNC) &_shapr_hat_matrix_cpp, 4}, 154 | {"_shapr_rss_cpp", (DL_FUNC) &_shapr_rss_cpp, 2}, 155 | {"_shapr_correction_matrix_cpp", (DL_FUNC) &_shapr_correction_matrix_cpp, 2}, 156 | {"_shapr_aicc_full_single_cpp", (DL_FUNC) &_shapr_aicc_full_single_cpp, 5}, 157 | {"_shapr_aicc_full_cpp", (DL_FUNC) &_shapr_aicc_full_cpp, 6}, 158 | {"_shapr_mahalanobis_distance_cpp", (DL_FUNC) &_shapr_mahalanobis_distance_cpp, 5}, 159 | {"_shapr_sample_features_cpp", (DL_FUNC) &_shapr_sample_features_cpp, 2}, 160 | {"_shapr_observation_impute_cpp", (DL_FUNC) &_shapr_observation_impute_cpp, 5}, 161 | {"_shapr_weight_matrix_cpp", (DL_FUNC) &_shapr_weight_matrix_cpp, 4}, 162 | {"_shapr_feature_matrix_cpp", (DL_FUNC) &_shapr_feature_matrix_cpp, 2}, 163 | {NULL, NULL, 0} 164 | }; 165 | 166 | RcppExport void R_init_shapr(DllInfo *dll) { 167 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 168 | R_useDynamicSymbols(dll, FALSE); 169 | } 170 | -------------------------------------------------------------------------------- /tests/testthat/test-a-shapley.R: -------------------------------------------------------------------------------- 1 | context("test-shapley.R") 2 | 3 | RNGversion(vstr = "3.5.0") 4 | 5 | test_that("Basic test functions in shapley.R", { 6 | 7 | # Load data ----------- 8 | if (requireNamespace("MASS", quietly = TRUE)) { 9 | data("Boston", package = "MASS") 10 | x_var <- c("lstat", "rm", "dis", "indus") 11 | x_train <- tail(Boston[, x_var], 50) 12 | 13 | # Load premade lm model. 
Path needs to be relative to testthat directory in the package 14 | model <- readRDS("model_objects/lm_model_object.rds") 15 | 16 | # Prepare the data for explanation 17 | explainer <- shapr(x_train, model) 18 | 19 | expect_known_value(explainer, 20 | file = "test_objects/shapley_explainer_obj.rds", 21 | update = F 22 | ) 23 | } 24 | }) 25 | 26 | 27 | test_that("Testing data input to shapr in shapley.R", { 28 | if (requireNamespace("MASS", quietly = TRUE)) { 29 | data("Boston", package = "MASS") 30 | 31 | y_var <- "medv" 32 | x_train <- tail(Boston, -6) 33 | y_train <- tail(Boston[, y_var], -6) 34 | y_train_binary <- as.factor(tail((Boston[, y_var] > 20) * 1, -6)) 35 | 36 | # convert to factors for testing purposes 37 | x_train$rad <- factor(round(x_train$rad)) 38 | x_train$chas <- factor(round(x_train$chas)) 39 | 40 | train_df <- cbind(x_train, y_train, y_train_binary) 41 | 42 | 43 | x_var_numeric <- c("lstat", "rm", "dis", "indus") 44 | x_var_factor <- c("lstat", "rm", "dis", "indus", "rad", "chas") 45 | 46 | train_df_used_numeric <- x_train[, x_var_numeric] 47 | train_df_used_factor <- x_train[, x_var_factor] 48 | 49 | formula_numeric <- as.formula(paste0("y_train ~ ", paste0(x_var_numeric, collapse = "+"))) 50 | formula_factor <- as.formula(paste0("y_train ~ ", paste0(x_var_factor, collapse = "+"))) 51 | 52 | formula_binary_numeric <- as.formula(paste0("y_train_binary ~ ", paste0(x_var_numeric, collapse = "+"))) 53 | formula_binary_factor <- as.formula(paste0("y_train_binary ~ ", paste0(x_var_factor, collapse = "+"))) 54 | 55 | dummylist <- make_dummies(traindata = x_train[, x_var_factor], testdata = x_train[, x_var_factor]) 56 | 57 | # List of models to run silently 58 | l_numeric <- list( 59 | stats::lm(formula_numeric, data = train_df), 60 | stats::glm(formula_numeric, data = train_df) 61 | ) 62 | 63 | if (requireNamespace("mgcv", quietly = TRUE)) { 64 | l_numeric[[length(l_numeric) + 1]] <- mgcv::gam(formula_numeric, data = train_df) 65 | } 66 | 67 | l_factor <- list( 68 | stats::lm(formula_factor, data = train_df), 69 | stats::glm(formula_factor, data = train_df) 70 | ) 71 | 72 | if (requireNamespace("mgcv", quietly = TRUE)) { 73 | l_factor[[length(l_factor) + 1]] <- mgcv::gam(formula_factor, data = train_df) 74 | } 75 | 76 | if (requireNamespace("xgboost", quietly = TRUE)) { 77 | l_factor[[length(l_factor) + 1]] <- xgboost::xgboost( 78 | data = dummylist$train_dummies, 79 | label = y_train, 80 | nrounds = 3, verbose = FALSE 81 | ) 82 | l_factor[[length(l_factor)]]$feature_list <- dummylist$feature_list 83 | } 84 | 85 | 86 | for (i in seq_along(l_numeric)) { 87 | expect_silent(shapr(train_df_used_numeric, l_numeric[[i]])) # No modification 88 | expect_message(shapr(train_df, l_numeric[[i]])) # Features dropped 89 | } 90 | 91 | for (i in seq_along(l_factor)) { 92 | expect_silent(shapr(train_df_used_factor, l_factor[[i]])) # No modification 93 | expect_message(shapr(train_df, l_factor[[i]])) # Features dropped 94 | } 95 | 96 | 97 | # Testing errors on incompatible model and data 98 | # Missing features 99 | model <- stats::lm(formula_factor, data = train_df) 100 | data_error <- train_df[, -3] 101 | expect_error(shapr(data_error, model)) 102 | 103 | # Duplicated column names 104 | data_error <- train_df_used_factor 105 | data_error <- cbind(data_error, lstat = 1) 106 | expect_error(shapr(data_error, model)) 107 | 108 | # Empty column names in data 109 | data_error <- train_df 110 | colnames(data_error) <- NULL 111 | expect_error(shapr(data_error, model)) 112 | 113 | # Empty column 
names in model (ok if found in data -- and we trust it) 114 | if (requireNamespace("xgboost", quietly = TRUE)) { 115 | data_with_colnames <- data_without_colnames <- as.matrix(train_df_used_numeric) 116 | colnames(data_without_colnames) <- NULL 117 | 118 | model_xgb <- xgboost::xgboost( 119 | data = data_without_colnames, label = y_train, 120 | nrounds = 3, verbose = FALSE 121 | ) 122 | expect_message(shapr(data_with_colnames, model_xgb)) 123 | } 124 | 125 | # Data feature with incorrect class 126 | data_error <- train_df_used_factor 127 | data_error$lstat <- as.logical(data_error$lstat > 15) 128 | expect_error(shapr(data_error, model)) 129 | 130 | # non-matching factor levels 131 | data_error <- head(train_df_used_factor) 132 | data_error$rad <- droplevels(data_error$rad) 133 | expect_error(shapr(data_error, model)) 134 | } 135 | }) 136 | 137 | test_that("Basic test functions for grouping in shapley.R", { 138 | 139 | # Load data ----------- 140 | if (requireNamespace("MASS", quietly = TRUE)) { 141 | data("Boston", package = "MASS") 142 | x_var <- c("lstat", "rm", "dis", "indus") 143 | x_train <- tail(Boston[, x_var], 50) 144 | 145 | # Load premade lm model. Path needs to be relative to testthat directory in the package 146 | model <- readRDS("model_objects/lm_model_object.rds") 147 | 148 | group1_num <- list( 149 | c(1, 3), 150 | c(2, 4) 151 | ) 152 | 153 | group1 <- lapply(group1_num, function(x) { 154 | x_var[x] 155 | }) 156 | 157 | 158 | group2_num <- list( 159 | c(1), 160 | c(2), 161 | c(3), 162 | c(4) 163 | ) 164 | 165 | group2 <- lapply(group2_num, function(x) { 166 | x_var[x] 167 | }) 168 | 169 | # Prepare the data for explanation 170 | explainer1 <- shapr(x_train, model, group = group1) 171 | explainer2 <- shapr(x_train, model, group = group2) 172 | 173 | set.seed(123) 174 | explainer1_2 <- shapr(x_train, model, group = group1, n_combinations = 5) 175 | set.seed(1234) 176 | explainer2_2 <- shapr(x_train, model, group = group2, n_combinations = 5) 177 | 178 | expect_known_value(explainer1, 179 | file = "test_objects/shapley_explainer_group1_obj.rds", 180 | update = F 181 | ) 182 | expect_known_value(explainer2, 183 | file = "test_objects/shapley_explainer_group2_obj.rds", 184 | update = F 185 | ) 186 | expect_known_value(explainer1_2, 187 | file = "test_objects/shapley_explainer_group1_2_obj.rds", 188 | update = F 189 | ) 190 | expect_known_value(explainer2_2, 191 | file = "test_objects/shapley_explainer_group2_2_obj.rds", 192 | update = F 193 | ) 194 | 195 | } 196 | }) 197 | 198 | 199 | test_that("Testing data input to shapr for grouping in shapley.R", { 200 | if (requireNamespace("MASS", quietly = TRUE)) { 201 | data("Boston", package = "MASS") 202 | 203 | x_var <- c("lstat", "rm", "dis", "indus") 204 | not_x_var <- "crim" 205 | 206 | x_train <- as.matrix(tail(Boston[, x_var], -6)) 207 | xy_train <- tail(Boston, -6) 208 | group_num <- list( 209 | c(1, 3), 210 | c(2, 4) 211 | ) 212 | 213 | group <- lapply(group_num, function(x) { 214 | x_var[x] 215 | }) 216 | names(group) <- c("A", "B") 217 | 218 | group_no_names <- lapply(group_num, function(x) { 219 | x_var[x] 220 | }) 221 | 222 | group_error_1 <- list( 223 | c(x_var[1:2], not_x_var), 224 | x_var[3:4] 225 | ) 226 | 227 | group_error_2 <- list( 228 | x_var[1], 229 | x_var[3:4] 230 | ) 231 | 232 | group_error_3 <- list( 233 | x_var[c(1, 2)], 234 | x_var[c(1, 3, 4)] 235 | ) 236 | 237 | group_error_4 <- list( 238 | x_var[c(1, 2)], 239 | x_var[c(1, 3, 4)] 240 | ) 241 | 242 | 243 | # Fitting models 244 | formula <- 
as.formula(paste0("medv ~ ", paste0(x_var, collapse = "+"))) 245 | model <- stats::lm(formula = formula, data = xy_train) 246 | 247 | 248 | # Expect silent 249 | expect_silent(shapr(x = x_train, model = model, group = group)) 250 | 251 | # Expect message for missing names 252 | expect_message(shapr(x = x_train, model = model, group = group_no_names)) 253 | 254 | 255 | # Expect error when group is not a list 256 | expect_error(shapr(x_train, model, group = x_var)) 257 | 258 | 259 | # Expect error when group does not include the names of the features 260 | expect_error(shapr(x = x_train, model = model, group = group_num)) 261 | 262 | # Expect error when x_train/model does not use a feature mentioned in the group 263 | expect_error(shapr(x_train, model, group = group_error_1)) 264 | 265 | # Expect error when group does not contain a feature used by the model 266 | expect_error(shapr(x_train, model, group = group_error_2)) 267 | 268 | # Expect error when group contains duplicated features 269 | expect_error(shapr(x_train, model, group = group_error_3)) 270 | } 271 | }) 272 | --------------------------------------------------------------------------------