├── .gitignore ├── .gitattributes ├── _pkgdown.yml ├── data ├── HR_data.rda └── titanic_data.rda ├── docs ├── MI2logo.jpg ├── reference │ ├── lollipop-1.png │ ├── lollipop-2.png │ ├── lollipop-3.png │ ├── lollipop-4.png │ ├── lollipop-5.png │ ├── waterfall-1.png │ ├── importance-1.png │ ├── importance-2.png │ ├── importance-3.png │ ├── importance-4.png │ ├── importance-5.png │ ├── importance-6.png │ ├── EIX_lollipop-1.png │ ├── EIX_waterfall-1.png │ ├── interactions-1.png │ ├── interactions-2.png │ ├── interactions-3.png │ ├── interactions-4.png │ ├── plot.lollipop-1.png │ ├── plot.lollipop-2.png │ ├── importanceTable-1.png │ ├── importanceTable-2.png │ ├── importanceTable-3.png │ ├── importanceTable-4.png │ ├── plot.importance-1.png │ ├── plot.importance-2.png │ ├── plot.importance-3.png │ ├── plot.importance-4.png │ ├── plot.importance-5.png │ ├── plot.importance-6.png │ ├── interactionsTable-1.png │ ├── interactionsTable-2.png │ ├── plot.interactions-1.png │ ├── plot.interactions-2.png │ ├── plot.interactions-3.png │ ├── plot.interactions-4.png │ ├── plot.importanceTable-1.png │ ├── plot.importanceTable-2.png │ ├── plot.importanceTable-3.png │ ├── plot.importanceTable-4.png │ ├── plot.interactionsTable-1.png │ ├── plot.interactionsTable-2.png │ ├── tableOfTrees.html │ ├── calculateGain.html │ ├── HR_data.html │ ├── index.html │ ├── titanic.html │ ├── titanic_data.html │ ├── countPairs.html │ ├── lollipop.html │ ├── EIX_lollipop.html │ ├── plot.lollipop.html │ └── waterfall.html ├── pkgdown.yml ├── articles │ ├── EIX_files │ │ └── figure-html │ │ │ ├── unnamed-chunk-6-1.png │ │ │ ├── unnamed-chunk-7-1.png │ │ │ ├── unnamed-chunk-8-1.png │ │ │ ├── unnamed-chunk-9-1.png │ │ │ ├── unnamed-chunk-10-1.png │ │ │ ├── unnamed-chunk-11-1.png │ │ │ └── unnamed-chunk-11-2.png │ ├── titanic_data_files │ │ └── figure-html │ │ │ ├── unnamed-chunk-4-1.png │ │ │ ├── unnamed-chunk-5-1.png │ │ │ ├── unnamed-chunk-6-1.png │ │ │ ├── unnamed-chunk-7-1.png │ │ │ └── 
unnamed-chunk-8-1.png │ ├── vignette_titanic_files │ │ └── figure-html │ │ │ ├── unnamed-chunk-4-1.png │ │ │ ├── unnamed-chunk-5-1.png │ │ │ ├── unnamed-chunk-6-1.png │ │ │ ├── unnamed-chunk-7-1.png │ │ │ └── unnamed-chunk-8-1.png │ └── index.html ├── pkgdown.js ├── link.svg ├── docsearch.js ├── jquery.sticky-kit.min.js ├── authors.html ├── pkgdown.css └── index.html ├── cheatsheets ├── EIX.pdf ├── EIX.png └── EIX.pptx ├── .Rbuildignore ├── README.md ├── .travis.yml ├── EIX.Rproj ├── man ├── tableOfTrees.Rd ├── calculateGain.Rd ├── EIX-package.Rd ├── HR_data.Rd ├── lollipop.Rd ├── plot.interactions.Rd ├── plot.lollipop.Rd ├── waterfall.Rd ├── interactions.Rd ├── importance.Rd ├── titanic_data.Rd └── plot.importance.Rd ├── R ├── package.R ├── HR_data.R ├── lollipop.R ├── plot_interactions.R ├── titanic_data.R ├── plot_lollipop.R ├── interactions.R ├── calculateGain.R ├── plot_importance.R └── importance.R ├── NAMESPACE ├── DESCRIPTION └── vignettes ├── titanic_data.Rmd └── EIX.Rmd /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | inst/doc 5 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | template: 2 | package: MI2template 3 | default_assets: false 4 | 5 | -------------------------------------------------------------------------------- /data/HR_data.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/data/HR_data.rda -------------------------------------------------------------------------------- 
/docs/MI2logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/MI2logo.jpg -------------------------------------------------------------------------------- /cheatsheets/EIX.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/cheatsheets/EIX.pdf -------------------------------------------------------------------------------- /cheatsheets/EIX.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/cheatsheets/EIX.png -------------------------------------------------------------------------------- /cheatsheets/EIX.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/cheatsheets/EIX.pptx -------------------------------------------------------------------------------- /data/titanic_data.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/data/titanic_data.rda -------------------------------------------------------------------------------- /docs/reference/lollipop-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/reference/lollipop-1.png -------------------------------------------------------------------------------- /docs/reference/lollipop-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/reference/lollipop-2.png -------------------------------------------------------------------------------- /docs/reference/lollipop-3.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/reference/lollipop-3.png -------------------------------------------------------------------------------- /docs/reference/lollipop-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/reference/lollipop-4.png -------------------------------------------------------------------------------- /docs/reference/lollipop-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/reference/lollipop-5.png -------------------------------------------------------------------------------- /docs/reference/waterfall-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/reference/waterfall-1.png -------------------------------------------------------------------------------- /docs/reference/importance-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/reference/importance-1.png -------------------------------------------------------------------------------- /docs/reference/importance-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/reference/importance-2.png -------------------------------------------------------------------------------- /docs/reference/importance-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/reference/importance-3.png -------------------------------------------------------------------------------- /docs/reference/importance-4.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/reference/importance-4.png -------------------------------------------------------------------------------- /docs/reference/importance-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/reference/importance-5.png -------------------------------------------------------------------------------- /docs/reference/importance-6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/reference/importance-6.png -------------------------------------------------------------------------------- /docs/reference/EIX_lollipop-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/reference/EIX_lollipop-1.png -------------------------------------------------------------------------------- /docs/reference/EIX_waterfall-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/reference/EIX_waterfall-1.png -------------------------------------------------------------------------------- /docs/reference/interactions-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/reference/interactions-1.png -------------------------------------------------------------------------------- /docs/reference/interactions-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/reference/interactions-2.png -------------------------------------------------------------------------------- /docs/reference/interactions-3.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/reference/interactions-3.png -------------------------------------------------------------------------------- /docs/reference/interactions-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/reference/interactions-4.png -------------------------------------------------------------------------------- /docs/reference/plot.lollipop-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/reference/plot.lollipop-1.png -------------------------------------------------------------------------------- /docs/reference/plot.lollipop-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/reference/plot.lollipop-2.png -------------------------------------------------------------------------------- /docs/reference/importanceTable-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/reference/importanceTable-1.png -------------------------------------------------------------------------------- /docs/reference/importanceTable-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/reference/importanceTable-2.png -------------------------------------------------------------------------------- /docs/reference/importanceTable-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/reference/importanceTable-3.png 
-------------------------------------------------------------------------------- /docs/reference/importanceTable-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/reference/importanceTable-4.png -------------------------------------------------------------------------------- /docs/reference/plot.importance-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/reference/plot.importance-1.png -------------------------------------------------------------------------------- /docs/reference/plot.importance-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/reference/plot.importance-2.png -------------------------------------------------------------------------------- /docs/reference/plot.importance-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/reference/plot.importance-3.png -------------------------------------------------------------------------------- /docs/reference/plot.importance-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/reference/plot.importance-4.png -------------------------------------------------------------------------------- /docs/reference/plot.importance-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/reference/plot.importance-5.png -------------------------------------------------------------------------------- /docs/reference/plot.importance-6.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/reference/plot.importance-6.png -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^cheatsheets$ 4 | ^.*README.*$ 5 | ^.travis.yml$ 6 | ^_pkgdown\.yml$ 7 | ^docs$ 8 | -------------------------------------------------------------------------------- /docs/reference/interactionsTable-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/reference/interactionsTable-1.png -------------------------------------------------------------------------------- /docs/reference/interactionsTable-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/reference/interactionsTable-2.png -------------------------------------------------------------------------------- /docs/reference/plot.interactions-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/reference/plot.interactions-1.png -------------------------------------------------------------------------------- /docs/reference/plot.interactions-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/reference/plot.interactions-2.png -------------------------------------------------------------------------------- /docs/reference/plot.interactions-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/reference/plot.interactions-3.png -------------------------------------------------------------------------------- 
/docs/reference/plot.interactions-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/reference/plot.interactions-4.png -------------------------------------------------------------------------------- /docs/reference/plot.importanceTable-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/reference/plot.importanceTable-1.png -------------------------------------------------------------------------------- /docs/reference/plot.importanceTable-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/reference/plot.importanceTable-2.png -------------------------------------------------------------------------------- /docs/reference/plot.importanceTable-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/reference/plot.importanceTable-3.png -------------------------------------------------------------------------------- /docs/reference/plot.importanceTable-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/reference/plot.importanceTable-4.png -------------------------------------------------------------------------------- /docs/reference/plot.interactionsTable-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/reference/plot.interactionsTable-1.png -------------------------------------------------------------------------------- /docs/reference/plot.interactionsTable-2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/reference/plot.interactionsTable-2.png -------------------------------------------------------------------------------- /docs/pkgdown.yml: -------------------------------------------------------------------------------- 1 | pandoc: 1.19.2.1 2 | pkgdown: 1.3.0 3 | pkgdown_sha: ~ 4 | articles: 5 | EIX: EIX.html 6 | titanic_data: titanic_data.html 7 | 8 | -------------------------------------------------------------------------------- /docs/articles/EIX_files/figure-html/unnamed-chunk-6-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/articles/EIX_files/figure-html/unnamed-chunk-6-1.png -------------------------------------------------------------------------------- /docs/articles/EIX_files/figure-html/unnamed-chunk-7-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/articles/EIX_files/figure-html/unnamed-chunk-7-1.png -------------------------------------------------------------------------------- /docs/articles/EIX_files/figure-html/unnamed-chunk-8-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/articles/EIX_files/figure-html/unnamed-chunk-8-1.png -------------------------------------------------------------------------------- /docs/articles/EIX_files/figure-html/unnamed-chunk-9-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/articles/EIX_files/figure-html/unnamed-chunk-9-1.png -------------------------------------------------------------------------------- /docs/articles/EIX_files/figure-html/unnamed-chunk-10-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/articles/EIX_files/figure-html/unnamed-chunk-10-1.png -------------------------------------------------------------------------------- /docs/articles/EIX_files/figure-html/unnamed-chunk-11-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/articles/EIX_files/figure-html/unnamed-chunk-11-1.png -------------------------------------------------------------------------------- /docs/articles/EIX_files/figure-html/unnamed-chunk-11-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/articles/EIX_files/figure-html/unnamed-chunk-11-2.png -------------------------------------------------------------------------------- /docs/articles/titanic_data_files/figure-html/unnamed-chunk-4-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/articles/titanic_data_files/figure-html/unnamed-chunk-4-1.png -------------------------------------------------------------------------------- /docs/articles/titanic_data_files/figure-html/unnamed-chunk-5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/articles/titanic_data_files/figure-html/unnamed-chunk-5-1.png -------------------------------------------------------------------------------- /docs/articles/titanic_data_files/figure-html/unnamed-chunk-6-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/articles/titanic_data_files/figure-html/unnamed-chunk-6-1.png -------------------------------------------------------------------------------- 
/docs/articles/titanic_data_files/figure-html/unnamed-chunk-7-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/articles/titanic_data_files/figure-html/unnamed-chunk-7-1.png -------------------------------------------------------------------------------- /docs/articles/titanic_data_files/figure-html/unnamed-chunk-8-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/articles/titanic_data_files/figure-html/unnamed-chunk-8-1.png -------------------------------------------------------------------------------- /docs/articles/vignette_titanic_files/figure-html/unnamed-chunk-4-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/articles/vignette_titanic_files/figure-html/unnamed-chunk-4-1.png -------------------------------------------------------------------------------- /docs/articles/vignette_titanic_files/figure-html/unnamed-chunk-5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/articles/vignette_titanic_files/figure-html/unnamed-chunk-5-1.png -------------------------------------------------------------------------------- /docs/articles/vignette_titanic_files/figure-html/unnamed-chunk-6-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/articles/vignette_titanic_files/figure-html/unnamed-chunk-6-1.png -------------------------------------------------------------------------------- /docs/articles/vignette_titanic_files/figure-html/unnamed-chunk-7-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/articles/vignette_titanic_files/figure-html/unnamed-chunk-7-1.png -------------------------------------------------------------------------------- /docs/articles/vignette_titanic_files/figure-html/unnamed-chunk-8-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelOriented/EIX/HEAD/docs/articles/vignette_titanic_files/figure-html/unnamed-chunk-8-1.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## EIX - Explain Interactions in Xgboost 2 | A set of tools to explain XGBoost and LightGBM models. 3 | 4 | ## Installation 5 | 6 | Install from GitHub 7 | ```{r} 8 | devtools::install_github("ModelOriented/EIX") 9 | ``` 10 | 11 | ## Cheatsheets 12 | 13 | 14 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: R 2 | sudo: false 3 | cache: packages 4 | dist: trusty 5 | 6 | before_install: 7 | - sudo apt-get install --yes udunits-bin libproj-dev libgeos-dev libgdal-dev libgdal1-dev libudunits2-dev 8 | 9 | env: 10 | global: 11 | - R_CHECK_ARGS="--timings" 12 | 13 | notifications: 14 | email: false 15 | 16 | r_packages: 17 | - archivist 18 | - DALEX 19 | - ggplot2 20 | - covr 21 | 22 | after_success: 23 | - Rscript -e 'library(covr); codecov()' 24 | -------------------------------------------------------------------------------- /EIX.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | 
AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageRoxygenize: rd,collate,namespace 22 | -------------------------------------------------------------------------------- /man/tableOfTrees.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/calculateGain.R 3 | \name{tableOfTrees} 4 | \alias{tableOfTrees} 5 | \title{tableOfTrees} 6 | \usage{ 7 | tableOfTrees(model, data) 8 | } 9 | \arguments{ 10 | \item{model}{a xgboost or lightgbm model} 11 | 12 | \item{data}{a data table with data used to train the model} 13 | } 14 | \value{ 15 | a data table 16 | } 17 | \description{ 18 | tableOfTrees 19 | } 20 | \keyword{internal} 21 | -------------------------------------------------------------------------------- /man/calculateGain.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/calculateGain.R 3 | \name{calculateGain} 4 | \alias{calculateGain} 5 | \title{calculateGain} 6 | \usage{ 7 | calculateGain(xgb.model, data) 8 | } 9 | \arguments{ 10 | \item{xgb.model}{a xgboost or lightgbm model} 11 | 12 | \item{data}{a data table with data used to train the model} 13 | } 14 | \value{ 15 | a list 16 | } 17 | \description{ 18 | List of trees with pairs of variable and other needed fields 19 | } 20 | \keyword{internal} 21 | -------------------------------------------------------------------------------- /R/package.R: -------------------------------------------------------------------------------- 1 | #' EIX package 2 | #' 3 | #' Structure mining from 'XGBoost' and 'LightGBM' models. 
4 | #' Key functionalities of this package cover: visualisation of tree-based ensemble models, 5 | #' identification of interactions, measuring of variable importance, 6 | #' measuring of interaction importance, explanation of single prediction 7 | #' with break down plots (based on 'xgboostExplainer' and 'iBreakDown' packages). 8 | #' To download the 'LightGBM' use the following link: <https://github.com/Microsoft/LightGBM>. 9 | #' 'EIX' is a part of the 'DrWhy.AI' universe. 10 | #' @import MASS 11 | #' @import tidyr 12 | #' 13 | #' @name EIX-package 14 | NULL 15 | -------------------------------------------------------------------------------- /docs/pkgdown.js: -------------------------------------------------------------------------------- 1 | $(function() { 2 | $("#sidebar").stick_in_parent({offset_top: 40}); 3 | $('body').scrollspy({ 4 | target: '#sidebar', 5 | offset: 60 6 | }); 7 | 8 | var cur_path = location.href; 9 | $("#navbar ul li a").each(function(index, value) { 10 | if (value.text == "Home") 11 | return; 12 | if (value.getAttribute("href") === "#") 13 | return; 14 | 15 | var path = value.href; 16 | if (cur_path == path) { 17 | // Add class to parent <li>, and enclosing <li> if in dropdown 18 | var menu_anchor = $(value); 19 | menu_anchor.parent().addClass("active"); 20 | menu_anchor.closest("li.dropdown").addClass("active"); 21 | } 22 | }); 23 | }); 24 | -------------------------------------------------------------------------------- /man/EIX-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/package.R 3 | \name{EIX-package} 4 | \alias{EIX-package} 5 | \title{EIX package} 6 | \description{ 7 | Structure mining from 'XGBoost' and 'LightGBM' models. 8 | Key functionalities of this package cover: visualisation of tree-based ensemble models, 9 | identification of interactions, measuring of variable importance, 10 | measuring of interaction importance, explanation of single prediction 11 | with break down plots (based on 'xgboostExplainer' and 'iBreakDown' packages). 12 | To download the 'LightGBM' use the following link: <https://github.com/Microsoft/LightGBM>. 13 | 'EIX' is a part of the 'DrWhy.AI' universe.
14 | } 15 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(plot,importance) 4 | S3method(plot,interactions) 5 | S3method(plot,lollipop) 6 | export(importance) 7 | export(interactions) 8 | export(lollipop) 9 | export(waterfall) 10 | import(MASS) 11 | import(data.table) 12 | import(ggplot2) 13 | import(iBreakDown) 14 | import(tidyr) 15 | importFrom(DALEX,theme_drwhy) 16 | importFrom(ggiraphExtra,coord_radar) 17 | importFrom(ggrepel,geom_label_repel) 18 | importFrom(ggrepel,geom_text_repel) 19 | importFrom(purrr,map) 20 | importFrom(scales,pseudo_log_trans) 21 | importFrom(stats,frequency) 22 | importFrom(stats,predict) 23 | importFrom(stats,weighted.mean) 24 | importFrom(xgboost,slice) 25 | importFrom(xgboost,xgb.DMatrix) 26 | importFrom(xgboost,xgb.model.dt.tree) 27 | -------------------------------------------------------------------------------- /docs/link.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 8 | 12 | 13 | -------------------------------------------------------------------------------- /R/HR_data.R: -------------------------------------------------------------------------------- 1 | #' Why are our best and most experienced employees leaving prematurely? 2 | #' 3 | #' A dataset from Kaggle competition Human Resources Analytics. 4 | #' https://www.kaggle.com/ludobenistant/hr-analytics/data 5 | #' 6 | #' The description of the dataset was copied from the \code{breakDown} package. 
7 | #' 8 | #' \itemize{ 9 | #' \item satisfaction_level Level of satisfaction (0-1) 10 | #' \item last_evaluation Time since last performance evaluation (in Years) 11 | #' \item number_project Number of projects completed while at work 12 | #' \item average_montly_hours Average monthly hours at workplace 13 | #' \item time_spend_company Number of years spent in the company 14 | #' \item Work_accident Whether the employee had a workplace accident 15 | #' \item left Whether the employee left the workplace or not (1 or 0) Factor 16 | #' \item promotion_last_5years Whether the employee was promoted in the last five years 17 | #' \item sales Department in which they work 18 | #' \item salary Relative level of salary (high) 19 | #' } 20 | #' 21 | #' @name HR_data 22 | #' @format A data table with 14999 rows and 10 variables 23 | #' @source https://www.kaggle.com/ludobenistant/hr-analytics/data, \url{https://cran.r-project.org/package=breakDown} 24 | NULL 25 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: EIX 2 | Title: Explain Interactions in 'XGBoost' 3 | Version: 1.1 4 | Authors@R: c( 5 | person("Ewelina", "Karbowiak", email = "ewelina.karbowiak12@gmail.com", role = c("aut", "cre")), 6 | person("Przemyslaw", "Biecek", email = "przemyslaw.biecek@gmail.com", role = c("aut","ths")) 7 | ) 8 | Description: Structure mining from 'XGBoost' and 'LightGBM' models. 9 | Key functionalities of this package cover: visualisation of tree-based ensemble models, 10 | identification of interactions, measuring of variable importance, 11 | measuring of interaction importance, explanation of single prediction 12 | with break down plots (based on 'xgboostExplainer' and 'iBreakDown' packages). 13 | To download the 'LightGBM' use the following link: <https://github.com/Microsoft/LightGBM>. 14 | 'EIX' is a part of the 'DrWhy.AI' universe.
15 | Depends: R (>= 3.4.0) 16 | License: GPL-2 17 | Encoding: UTF-8 18 | LazyData: true 19 | Imports: 20 | MASS, 21 | ggplot2, 22 | data.table, 23 | purrr, 24 | xgboost, 25 | DALEX, 26 | ggrepel, 27 | ggiraphExtra, 28 | iBreakDown, 29 | tidyr, 30 | scales 31 | RoxygenNote: 7.1.1 32 | Suggests: 33 | Matrix, 34 | knitr, 35 | rmarkdown, 36 | lightgbm 37 | VignetteBuilder: knitr 38 | URL: https://github.com/ModelOriented/EIX 39 | BugReports: https://github.com/ModelOriented/EIX/issues 40 | -------------------------------------------------------------------------------- /man/HR_data.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/HR_data.R 3 | \name{HR_data} 4 | \alias{HR_data} 5 | \title{Why are our best and most experienced employees leaving prematurely?} 6 | \format{ 7 | A data table with 14999 rows and 10 variables 8 | } 9 | \source{ 10 | https://www.kaggle.com/ludobenistant/hr-analytics/data, \url{https://cran.r-project.org/package=breakDown} 11 | } 12 | \description{ 13 | A dataset from Kaggle competition Human Resources Analytics. 14 | https://www.kaggle.com/ludobenistant/hr-analytics/data 15 | } 16 | \details{ 17 | The description of the dataset was copied from the \code{breakDown} package. 
18 | 19 | \itemize{ 20 | \item satisfaction_level Level of satisfaction (0-1) 21 | \item last_evaluation Time since last performance evaluation (in Years) 22 | \item number_project Number of projects completed while at work 23 | \item average_montly_hours Average monthly hours at workplace 24 | \item time_spend_company Number of years spent in the company 25 | \item Work_accident Whether the employee had a workplace accident 26 | \item left Whether the employee left the workplace or not (1 or 0) Factor 27 | \item promotion_last_5years Whether the employee was promoted in the last five years 28 | \item sales Department in which they work for 29 | \item salary Relative level of salary (high) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /man/lollipop.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lollipop.R 3 | \name{lollipop} 4 | \alias{lollipop} 5 | \title{Tables needed for lollipop plot} 6 | \usage{ 7 | lollipop(xgb_model, data) 8 | } 9 | \arguments{ 10 | \item{xgb_model}{a xgboost or lightgbm model.} 11 | 12 | \item{data}{a data table with data used to train the model.} 13 | } 14 | \value{ 15 | an object of the lollipop class 16 | } 17 | \description{ 18 | This function calculates two tables needed to generate lollipop plot, which visualise the model. 19 | The first table contains information about all nodes in the trees forming a model. 20 | It includes gain value, depth and ID of each nodes. 21 | The second table contains similarly information about roots in the trees. 22 | } 23 | \examples{ 24 | library("EIX") 25 | library("Matrix") 26 | sm <- sparse.model.matrix(left ~ . 
- 1, data = HR_data) 27 | 28 | library("xgboost") 29 | param <- list(objective = "binary:logistic", max_depth = 2) 30 | xgb_model <- xgboost(sm, params = param, label = HR_data[, left] == 1, nrounds = 25, verbose = 0) 31 | 32 | lolli <- lollipop(xgb_model, sm) 33 | plot(lolli, labels = "topAll", log_scale = TRUE) 34 | 35 | \donttest{ 36 | library(lightgbm) 37 | train_data <- lgb.Dataset(sm, label = HR_data[, left] == 1) 38 | params <- list(objective = "binary", max_depth = 2) 39 | lgb_model <- lgb.train(params, train_data, 25) 40 | 41 | lolli <- lollipop(lgb_model, sm) 42 | plot(lolli, labels = "topAll", log_scale = TRUE) 43 | 44 | } 45 | 46 | } 47 | -------------------------------------------------------------------------------- /man/plot.interactions.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot_interactions.R 3 | \name{plot.interactions} 4 | \alias{plot.interactions} 5 | \title{Plot importance of interactions or pairs} 6 | \usage{ 7 | \method{plot}{interactions}(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{a result from the \code{interactions} function.} 11 | 12 | \item{...}{other parameters.} 13 | } 14 | \value{ 15 | a ggplot object 16 | } 17 | \description{ 18 | This function plots the importance ranking of interactions and pairs in the model. 19 | } 20 | \details{ 21 | NOTE: Be careful use of this function with \code{option="pairs"} parameter, 22 | because high gain of pair can be a result of high gain of child variable. 23 | As strong interactions should be considered only these pairs of variables, 24 | where variable on the bottom (child) has higher gain than variable on the top (parent). 25 | } 26 | \examples{ 27 | library("EIX") 28 | library("Matrix") 29 | sm <- sparse.model.matrix(left ~ . 
- 1, data = HR_data) 30 | 31 | library("xgboost") 32 | param <- list(objective = "binary:logistic", max_depth = 2) 33 | xgb_model <- xgboost(sm, params = param, label = HR_data[, left] == 1, nrounds = 25, verbose=0) 34 | 35 | inter <- interactions(xgb_model, sm, option = "interactions") 36 | inter 37 | plot(inter) 38 | 39 | inter <- interactions(xgb_model, sm, option = "pairs") 40 | inter 41 | plot(inter) 42 | 43 | \donttest{ 44 | library(lightgbm) 45 | train_data <- lgb.Dataset(sm, label = HR_data[, left] == 1) 46 | params <- list(objective = "binary", max_depth = 2) 47 | lgb_model <- lgb.train(params, train_data, 25) 48 | 49 | inter <- interactions(lgb_model, sm, option = "interactions") 50 | inter 51 | plot(inter) 52 | 53 | inter <- interactions(lgb_model, sm, option = "pairs") 54 | inter 55 | plot(inter) 56 | } 57 | 58 | } 59 | -------------------------------------------------------------------------------- /man/plot.lollipop.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot_lollipop.R 3 | \name{plot.lollipop} 4 | \alias{plot.lollipop} 5 | \title{Visualiation of the model} 6 | \usage{ 7 | \method{plot}{lollipop}(x, ..., labels = "topAll", log_scale = TRUE, threshold = 0.1) 8 | } 9 | \arguments{ 10 | \item{x}{a result from the \code{lollipop} function.} 11 | 12 | \item{...}{other parameters.} 13 | 14 | \item{labels}{if "topAll" then labels for the most important interactions (vertical label) 15 | and variables in the roots (horizontal label) will be displayed, 16 | if "interactions" then labels for all interactions, 17 | if "roots" then labels for all variables in the root.} 18 | 19 | \item{log_scale}{TRUE/FALSE logarithmic scale on the plot. Default TRUE.} 20 | 21 | \item{threshold}{on the plot will occur only labels with Gain higher than `threshold` of the max Gain value in the model. 
22 | The lower threshold, the more labels on the plot. Range from 0 to 1. Default 0.1.} 23 | } 24 | \value{ 25 | a ggplot object 26 | } 27 | \description{ 28 | The lollipop plots the model with the most important interactions and variables in the roots. 29 | } 30 | \examples{ 31 | library("EIX") 32 | library("Matrix") 33 | sm <- sparse.model.matrix(left ~ . - 1, data = HR_data) 34 | 35 | library("xgboost") 36 | param <- list(objective = "binary:logistic", max_depth = 2) 37 | xgb_model <- xgboost(sm, params = param, label = HR_data[, left] == 1, nrounds = 25, verbose = 0) 38 | 39 | lolli <- lollipop(xgb_model, sm) 40 | plot(lolli, labels = "topAll", log_scale = TRUE) 41 | 42 | \donttest{ 43 | library(lightgbm) 44 | train_data <- lgb.Dataset(sm, label = HR_data[, left] == 1) 45 | params <- list(objective = "binary", max_depth = 3) 46 | lgb_model <- lgb.train(params, train_data, 25) 47 | 48 | lolli <- lollipop(lgb_model, sm) 49 | plot(lolli, labels = "topAll", log_scale = TRUE) 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /man/waterfall.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/waterfall.R 3 | \name{waterfall} 4 | \alias{waterfall} 5 | \title{Explain prediction of a single observation} 6 | \usage{ 7 | waterfall( 8 | xgb_model, 9 | new_observation, 10 | data, 11 | type = "binary", 12 | option = "interactions", 13 | baseline = 0 14 | ) 15 | } 16 | \arguments{ 17 | \item{xgb_model}{a xgboost model.} 18 | 19 | \item{new_observation}{a new observation.} 20 | 21 | \item{data}{row from the original dataset with the new observation to explain (not one-hot-encoded). 22 | The param above has to be set to merge categorical features. 23 | If you dont wont to merge categorical features, set this parameter the same as \code{new_observation}.} 24 | 25 | \item{type}{the learning task of the model. 
Available tasks: "binary" for binary classification or "regression" for linear regression.} 26 | 27 | \item{option}{if "variables", the plot includes only single variables, 28 | if "interactions", then only interactions. 29 | Default "interaction".} 30 | 31 | \item{baseline}{a number or a character "Intercept" (for model intercept). 32 | The baseline for the plot, where the rectangles should start. 33 | Default 0.} 34 | } 35 | \value{ 36 | an object of the broken class 37 | } 38 | \description{ 39 | This function calculates a table with influence of variables and interactions 40 | on the prediction of a given observation. It supports only xgboost models. 41 | } 42 | \details{ 43 | The function contains code or pieces of code 44 | from \code{breakDown} code created by Przemysław Biecek 45 | and \code{xgboostExplainer} code created by David Foster. 46 | } 47 | \examples{ 48 | 49 | \donttest{ 50 | library("EIX") 51 | library("Matrix") 52 | sm <- sparse.model.matrix(left ~ . - 1, data = HR_data) 53 | 54 | library("xgboost") 55 | param <- list(objective = "binary:logistic", max_depth = 2) 56 | xgb_model <- xgboost(sm, params = param, label = HR_data[, left] == 1, nrounds = 25, verbose=0) 57 | 58 | data <- HR_data[9,-7] 59 | new_observation <- sm[9,] 60 | 61 | wf <- waterfall(xgb_model, new_observation, data, option = "interactions") 62 | wf 63 | 64 | plot(wf) 65 | } 66 | 67 | } 68 | -------------------------------------------------------------------------------- /man/interactions.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/interactions.R 3 | \name{interactions} 4 | \alias{interactions} 5 | \title{Importance of interactions and pairs in the model} 6 | \usage{ 7 | interactions(xgb_model, data, option = "interactions") 8 | } 9 | \arguments{ 10 | \item{xgb_model}{a xgboost or lightgbm model.} 11 | 12 | \item{data}{a data table with data used to 
train the model.} 13 | 14 | \item{option}{if "interactions", the table contains interactions, 15 | if "pairs", this table contains all the pairs in the model. 16 | Default "interactions".} 17 | } 18 | \value{ 19 | a data table 20 | } 21 | \description{ 22 | This function calculates a table with two measures of importance for interactions and pairs in the model. 23 | } 24 | \details{ 25 | Available measures: 26 | \itemize{ 27 | \item "sumGain" - sum of Gain value in all nodes, in which given variable occurs, 28 | \item "frequency" - number of occurrences in the nodes for given variable. 29 | } 30 | 31 | NOTE: Be careful when using this function with the \code{option="pairs"} parameter, 32 | because high gain of pair can be a result of high gain of child variable. 33 | Only those pairs of variables should be considered strong interactions, 34 | where variable on the bottom (child) has higher gain than variable on the top (parent). 35 | } 36 | \examples{ 37 | library("EIX") 38 | library("Matrix") 39 | sm <- sparse.model.matrix(left ~ .
- 1, data = HR_data) 40 | 41 | library("xgboost") 42 | param <- list(objective = "binary:logistic", max_depth = 2) 43 | xgb_model <- xgboost(sm, params = param, label = HR_data[, left] == 1, nrounds = 25, verbose=0) 44 | 45 | inter <- interactions(xgb_model, sm, option = "interactions") 46 | inter 47 | plot(inter) 48 | 49 | inter <- interactions(xgb_model, sm, option = "pairs") 50 | inter 51 | plot(inter) 52 | 53 | \donttest{ 54 | library(lightgbm) 55 | train_data <- lgb.Dataset(sm, label = HR_data[, left] == 1) 56 | params <- list(objective = "binary", max_depth = 2) 57 | lgb_model <- lgb.train(params, train_data, 25) 58 | 59 | inter <- interactions(lgb_model, sm, option = "interactions") 60 | inter 61 | plot(inter) 62 | 63 | inter <- interactions(lgb_model, sm, option = "pairs") 64 | inter 65 | plot(inter) 66 | } 67 | 68 | } 69 | -------------------------------------------------------------------------------- /R/lollipop.R: -------------------------------------------------------------------------------- 1 | #' Tables needed for lollipop plot 2 | #' 3 | #' This function calculates two tables needed to generate lollipop plot, which visualise the model. 4 | #' The first table contains information about all nodes in the trees forming a model. 5 | #' It includes gain value, depth and ID of each nodes. 6 | #' The second table contains similarly information about roots in the trees. 7 | #' 8 | #' @param xgb_model a xgboost or lightgbm model. 9 | #' @param data a data table with data used to train the model. 10 | #' 11 | #' @return an object of the lollipop class 12 | #' 13 | #' @import data.table 14 | #' 15 | #' @examples 16 | #' library("EIX") 17 | #' library("Matrix") 18 | #' sm <- sparse.model.matrix(left ~ . 
- 1, data = HR_data) 19 | #' 20 | #' library("xgboost") 21 | #' param <- list(objective = "binary:logistic", max_depth = 2) 22 | #' xgb_model <- xgboost(sm, params = param, label = HR_data[, left] == 1, nrounds = 25, verbose = 0) 23 | #' 24 | #' lolli <- lollipop(xgb_model, sm) 25 | #' plot(lolli, labels = "topAll", log_scale = TRUE) 26 | #' 27 | #'\donttest{ 28 | #'library(lightgbm) 29 | #'train_data <- lgb.Dataset(sm, label = HR_data[, left] == 1) 30 | #'params <- list(objective = "binary", max_depth = 2) 31 | #'lgb_model <- lgb.train(params, train_data, 25) 32 | #' 33 | #' lolli <- lollipop(lgb_model, sm) 34 | #' plot(lolli, labels = "topAll", log_scale = TRUE) 35 | #' 36 | #'} 37 | #' 38 | #' @export 39 |
lollipop <- function(xgb_model, data) {
  # Placeholders for the column names referenced inside the data.table
  # expressions below (presumably kept so R CMD check does not report them as
  # undefined globals -- confirm before removing).
  Feature <- Quality <- Node <- Tree <- ID <- depth <-
    interaction <- . <- parentsName <- name_pair <- NULL

  # One row per node, stacked over every tree returned by calculateGain().
  all_nodes <- rbindlist(calculateGain(xgb_model, data))

  # Roots are the rows with Node == 0.
  root_tbl <- all_nodes[Node == 0, .(Quality, Feature, Tree, ID, depth)]

  # Every non-leaf row, restricted to the columns stored in the result.
  node_cols <- c("Quality", "Feature", "Node", "Tree", "ID",
                 "interaction", "depth", "parentsName", "name_pair")
  node_tbl <- all_nodes[Feature != "Leaf", node_cols, with = FALSE]

  # Rows flagged as interactions are labelled with their pair name instead of
  # the single feature name (updated by reference on the fresh subset).
  node_tbl[interaction == TRUE, Feature := name_pair]

  # Unnamed two-element list: nodes first, roots second.
  structure(list(node_tbl, root_tbl), class = c("lollipop", "list"))
}
63 | -------------------------------------------------------------------------------- /docs/docsearch.js: -------------------------------------------------------------------------------- 1 | $(function() { 2 | 3 | // register a handler to move the focus to the search bar 4 | // upon pressing shift + "/" (i.e.
"?") 5 | $(document).on('keydown', function(e) { 6 | if (e.shiftKey && e.keyCode == 191) { 7 | e.preventDefault(); 8 | $("#search-input").focus(); 9 | } 10 | }); 11 | 12 | $(document).ready(function() { 13 | // do keyword highlighting 14 | /* modified from https://jsfiddle.net/julmot/bL6bb5oo/ */ 15 | var mark = function() { 16 | 17 | var referrer = document.URL ; 18 | var paramKey = "q" ; 19 | 20 | if (referrer.indexOf("?") !== -1) { 21 | var qs = referrer.substr(referrer.indexOf('?') + 1); 22 | var qs_noanchor = qs.split('#')[0]; 23 | var qsa = qs_noanchor.split('&'); 24 | var keyword = ""; 25 | 26 | for (var i = 0; i < qsa.length; i++) { 27 | var currentParam = qsa[i].split('='); 28 | 29 | if (currentParam.length !== 2) { 30 | continue; 31 | } 32 | 33 | if (currentParam[0] == paramKey) { 34 | keyword = decodeURIComponent(currentParam[1].replace(/\+/g, "%20")); 35 | } 36 | } 37 | 38 | if (keyword !== "") { 39 | $(".contents").unmark({ 40 | done: function() { 41 | $(".contents").mark(keyword); 42 | } 43 | }); 44 | } 45 | } 46 | }; 47 | 48 | mark(); 49 | }); 50 | }); 51 | 52 | /* Search term highlighting ------------------------------*/ 53 | 54 | function matchedWords(hit) { 55 | var words = []; 56 | 57 | var hierarchy = hit._highlightResult.hierarchy; 58 | // loop to fetch from lvl0, lvl1, etc. 
59 | for (var idx in hierarchy) { 60 | words = words.concat(hierarchy[idx].matchedWords); 61 | } 62 | 63 | var content = hit._highlightResult.content; 64 | if (content) { 65 | words = words.concat(content.matchedWords); 66 | } 67 | 68 | // return unique words 69 | var words_uniq = [...new Set(words)]; 70 | return words_uniq; 71 | } 72 | 73 | function updateHitURL(hit) { 74 | 75 | var words = matchedWords(hit); 76 | var url = ""; 77 | 78 | if (hit.anchor) { 79 | url = hit.url_without_anchor + '?q=' + escape(words.join(" ")) + '#' + hit.anchor; 80 | } else { 81 | url = hit.url + '?q=' + escape(words.join(" ")); 82 | } 83 | 84 | return url; 85 | } 86 | -------------------------------------------------------------------------------- /vignettes/titanic_data.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "EIX: Titanic data " 3 | author: "Ewelina Karbowiak" 4 | date: "2018-29-03" 5 | output: rmarkdown::html_vignette 6 | vignette: > 7 | %\VignetteIndexEntry{EIX for Titanic data} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | ```{r, echo=FALSE} 13 | knitr::opts_chunk$set(fig.height = 6, fig.width = 6, fig.align = "center") 14 | 15 | ``` 16 | 17 | ## Data Info 18 | This vignette shows usage of `EIX` package for titanic data. This dataset was copied from `stablelearner` package. With `EIX` package we explain XGBoost classification model concerning the survival problem. More details about `EIX` package [here](https://modeloriented.github.io/EIX/articles/EIX.html). 19 | 20 | 21 | ```{r} 22 | #devtools :: install_github("ModelOriented/EIX") 23 | library("EIX") 24 | library(data.table) 25 | set.seed(4) 26 | titanic_data<-data.table(na.omit(titanic_data)) 27 | knitr::kable(head(titanic_data)) 28 | 29 | library("Matrix") 30 | sparse_matrix <- sparse.model.matrix(survived ~ . 
- 1, data = titanic_data) 31 | ``` 32 | 33 | 34 | ## XGBoost model creation 35 | 36 | 37 | ```{r, warning=FALSE, message=FALSE} 38 | library("xgboost") 39 | param <- list(objective = "binary:logistic", max_depth = 2) 40 | xgb_model <- xgboost(sparse_matrix, params = param, label = titanic_data[, "survived"] == "yes", nrounds = 50, verbose = FALSE) 41 | ``` 42 | 43 | 44 | ## Model visualization 45 | 46 | First let's plot the model. 47 | 48 | ```{r} 49 | lolli<-lollipop(xgb_model,sparse_matrix) 50 | plot(lolli, threshold=0.02) 51 | ``` 52 | 53 | 54 | ## Interactions 55 | 56 | Next we explore interactions using the `interactions()` function and its plot. 57 | 58 | ```{r} 59 | interactions<-interactions(xgb_model, sparse_matrix, option = "interactions") 60 | head(interactions, 15) 61 | plot(interactions) 62 | ``` 63 | 64 | ## Variables' and interactions' importance 65 | 66 | ```{r} 67 | importance<-importance(xgb_model, sparse_matrix, option = "both") 68 | head(importance, 15) 69 | plot(importance, radar=FALSE) 70 | ``` 71 | 72 | ```{r} 73 | plot(importance) 74 | ``` 75 | 76 | ## Explanation of the single prediction including interactions 77 | 78 | Let's see an explanation of the prediction for an 18-year-old from England who traveled in 3rd class.
79 | 80 | ```{r} 81 | data <- titanic_data[27,] 82 | new_observation <- sparse_matrix[27,] 83 | wf<-waterfall(xgb_model, new_observation, data, option = "interactions") 84 | wf 85 | plot(wf) 86 | ``` 87 | -------------------------------------------------------------------------------- /docs/jquery.sticky-kit.min.js: -------------------------------------------------------------------------------- 1 | /* 2 | Sticky-kit v1.1.2 | WTFPL | Leaf Corcoran 2015 | http://leafo.net 3 | */ 4 | (function(){var b,f;b=this.jQuery||window.jQuery;f=b(window);b.fn.stick_in_parent=function(d){var A,w,J,n,B,K,p,q,k,E,t;null==d&&(d={});t=d.sticky_class;B=d.inner_scrolling;E=d.recalc_every;k=d.parent;q=d.offset_top;p=d.spacer;w=d.bottoming;null==q&&(q=0);null==k&&(k=void 0);null==B&&(B=!0);null==t&&(t="is_stuck");A=b(document);null==w&&(w=!0);J=function(a,d,n,C,F,u,r,G){var v,H,m,D,I,c,g,x,y,z,h,l;if(!a.data("sticky_kit")){a.data("sticky_kit",!0);I=A.height();g=a.parent();null!=k&&(g=g.closest(k)); 5 | if(!g.length)throw"failed to find stick parent";v=m=!1;(h=null!=p?p&&a.closest(p):b("
    "))&&h.css("position",a.css("position"));x=function(){var c,f,e;if(!G&&(I=A.height(),c=parseInt(g.css("border-top-width"),10),f=parseInt(g.css("padding-top"),10),d=parseInt(g.css("padding-bottom"),10),n=g.offset().top+c+f,C=g.height(),m&&(v=m=!1,null==p&&(a.insertAfter(h),h.detach()),a.css({position:"",top:"",width:"",bottom:""}).removeClass(t),e=!0),F=a.offset().top-(parseInt(a.css("margin-top"),10)||0)-q, 6 | u=a.outerHeight(!0),r=a.css("float"),h&&h.css({width:a.outerWidth(!0),height:u,display:a.css("display"),"vertical-align":a.css("vertical-align"),"float":r}),e))return l()};x();if(u!==C)return D=void 0,c=q,z=E,l=function(){var b,l,e,k;if(!G&&(e=!1,null!=z&&(--z,0>=z&&(z=E,x(),e=!0)),e||A.height()===I||x(),e=f.scrollTop(),null!=D&&(l=e-D),D=e,m?(w&&(k=e+u+c>C+n,v&&!k&&(v=!1,a.css({position:"fixed",bottom:"",top:c}).trigger("sticky_kit:unbottom"))),eb&&!v&&(c-=l,c=Math.max(b-u,c),c=Math.min(q,c),m&&a.css({top:c+"px"})))):e>F&&(m=!0,b={position:"fixed",top:c},b.width="border-box"===a.css("box-sizing")?a.outerWidth()+"px":a.width()+"px",a.css(b).addClass(t),null==p&&(a.after(h),"left"!==r&&"right"!==r||h.append(a)),a.trigger("sticky_kit:stick")),m&&w&&(null==k&&(k=e+u+c>C+n),!v&&k)))return v=!0,"static"===g.css("position")&&g.css({position:"relative"}), 8 | a.css({position:"absolute",bottom:d,top:"auto"}).trigger("sticky_kit:bottom")},y=function(){x();return l()},H=function(){G=!0;f.off("touchmove",l);f.off("scroll",l);f.off("resize",y);b(document.body).off("sticky_kit:recalc",y);a.off("sticky_kit:detach",H);a.removeData("sticky_kit");a.css({position:"",bottom:"",top:"",width:""});g.position("position","");if(m)return null==p&&("left"!==r&&"right"!==r||a.insertAfter(h),h.remove()),a.removeClass(t)},f.on("touchmove",l),f.on("scroll",l),f.on("resize", 9 | y),b(document.body).on("sticky_kit:recalc",y),a.on("sticky_kit:detach",H),setTimeout(l,0)}};n=0;for(K=this.length;n threshold * (max(nodes[, Quality]))),] 60 | roots_labels <- roots[Quality > threshold * 
(max(nodes[, Quality])),] 61 | 62 | p <- { 63 | switch(labels, 64 | topAll = { 65 | p + geom_text_repel(data = data.frame(nodes_labels), 66 | aes(label = Feature), 67 | angle = 90, nudge_y = 0.05, direction = "x", vjust = 0, segment.size = 0.2) + 68 | geom_label_repel(data = data.frame(roots_labels), 69 | aes(label = Feature)) 70 | }, 71 | interactions = { 72 | p + geom_text_repel(data = data.frame(nodes_labels), 73 | aes(label=Feature), 74 | angle = 90, nudge_y = 0.05, direction = "x", vjust = 0, segment.size = 0.2 ) 75 | }, 76 | roots = { 77 | p + geom_label_repel(data = data.frame(roots_labels), aes(label = Feature)) 78 | })} 79 | 80 | q <- p + theme_drwhy()+ ylab("Gain") + 81 | scale_shape_discrete("Depth") + 82 | scale_colour_discrete("Depth") + if (log_scale){scale_x_continuous(trans="pseudo_log")} 83 | q 84 | } 85 | 86 | -------------------------------------------------------------------------------- /man/plot.importance.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot_importance.R 3 | \name{plot.importance} 4 | \alias{plot.importance} 5 | \title{Plot importance measures} 6 | \usage{ 7 | \method{plot}{importance}( 8 | x, 9 | ..., 10 | top = 10, 11 | radar = TRUE, 12 | text_start_point = 0.5, 13 | text_size = 3.5, 14 | xmeasure = "sumCover", 15 | ymeasure = "sumGain" 16 | ) 17 | } 18 | \arguments{ 19 | \item{x}{a result from the \code{importance} function.} 20 | 21 | \item{...}{other parameters.} 22 | 23 | \item{top}{number of positions on the plot or NULL for all variable. Default 10.} 24 | 25 | \item{radar}{TRUE/FALSE. If TRUE the plot shows 26 | six measures of variables' or interactions' importance in the model. 27 | If FALSE the plot containing two chosen measures 28 | of variables' or interactions' importance in the model.} 29 | 30 | \item{text_start_point}{place, where the names of the particular feature start. 
Available for `radar=TRUE`. Range from 0 to 1. Default 0.5.} 31 | 32 | \item{text_size}{size of the text on the plot. Default 3.5.} 33 | 34 | \item{xmeasure}{measure on the x-axis. Available for `radar=FALSE`. Default "sumCover".} 35 | 36 | \item{ymeasure}{measure on the y-axis. Available for `radar=FALSE`. Default "sumGain".} 37 | } 38 | \value{ 39 | a ggplot object 40 | } 41 | \description{ 42 | This function plots selected measures of importance for variables and interactions. 43 | It is possible to visualise importance table in two ways: radar plot with six measures 44 | and scatter plot with two chosen measures. 45 | } 46 | \details{ 47 | Available measures: 48 | \itemize{ 49 | \item "sumGain" - sum of Gain value in all nodes, in which given variable occurs, 50 | \item "sumCover" - sum of Cover value in all nodes, in which given variable occurs; for LightGBM models: number of observations that pass through the node, 51 | \item "mean5Gain" - mean gain from 5 occurrences of given variable with the highest gain, 52 | \item "meanGain" - mean Gain value in all nodes, in which given variable occurs, 53 | \item "meanCover" - mean Cover value in all nodes, in which given variable occurs; for LightGBM models: mean number of observations that pass through the node, 54 | \item "frequency" - number of occurrences in the nodes for given variable. 55 | } 56 | 57 | Additionally for plots with single variables: 58 | \itemize{ 59 | \item "meanDepth" - mean depth weighted by gain, 60 | \item "numberOfRoots" - number of occurrences in the root, 61 | \item "weightedRoot" - mean number of occurrences in the root, which is weighted by gain. 62 | } 63 | } 64 | \examples{ 65 | library("EIX") 66 | library("Matrix") 67 | sm <- sparse.model.matrix(left ~ .
- 1, data = HR_data) 68 | 69 | library("xgboost") 70 | param <- list(objective = "binary:logistic", max_depth = 2) 71 | xgb_model <- xgboost(sm, params = param, label = HR_data[, left] == 1, nrounds = 25, verbose=0) 72 | 73 | imp <- importance(xgb_model, sm, option = "both") 74 | imp 75 | plot(imp, top = 10) 76 | 77 | imp <- importance(xgb_model, sm, option = "variables") 78 | imp 79 | plot(imp, top = nrow(imp)) 80 | 81 | imp <- importance(xgb_model, sm, option = "interactions") 82 | imp 83 | plot(imp, top = nrow(imp)) 84 | 85 | imp <- importance(xgb_model, sm, option = "variables") 86 | imp 87 | plot(imp, top = NULL, radar = FALSE, xmeasure = "sumCover", ymeasure = "sumGain") 88 | 89 | \donttest{ 90 | library(lightgbm) 91 | train_data <- lgb.Dataset(sm, label = HR_data[, left] == 1) 92 | params <- list(objective = "binary", max_depth = 2) 93 | lgb_model <- lgb.train(params, train_data, 25) 94 | 95 | imp <- importance(lgb_model, sm, option = "both") 96 | imp 97 | plot(imp, top = nrow(imp)) 98 | 99 | imp <- importance(lgb_model, sm, option = "variables") 100 | imp 101 | plot(imp, top = NULL, radar = FALSE, xmeasure = "sumCover", ymeasure = "sumGain") 102 | 103 | } 104 | 105 | } 106 | -------------------------------------------------------------------------------- /R/interactions.R: -------------------------------------------------------------------------------- 1 | #' Importance of interactions and pairs in the model 2 | #' 3 | #' This function calculates a table with two measures of importance for interactions and pairs in the model. 4 | #' 5 | #' Available measures: 6 | #'\itemize{ 7 | #'\item "sumGain" - sum of Gain value in all nodes, in which given variable occurs, 8 | #'\item "frequency" - number of occurrences in the nodes for given variable. 9 | #'} 10 | #' 11 | #' NOTE: Be careful when using this function with the \code{option="pairs"} parameter, 12 | #' because high gain of pair can be a result of high gain of child variable.
13 | #' As strong interactions should be considered only these pairs of variables, 14 | #' where variable on the bottom (child) has higher gain than variable on the top (parent). 15 | #' 16 | #' @param xgb_model a xgboost or lightgbm model. 17 | #' @param data a data table with data used to train the model. 18 | #' @param option if "interactions", the table contains interactions, 19 | #' if "pairs", this table contains all the pairs in the model. 20 | #' Default "interactions". 21 | #' 22 | #' @return a data table 23 | #' 24 | #' @import data.table 25 | #' @importFrom purrr map 26 | #' @importFrom stats frequency 27 | #' 28 | #' @examples 29 | #' library("EIX") 30 | #' library("Matrix") 31 | #' sm <- sparse.model.matrix(left ~ . - 1, data = HR_data) 32 | #' 33 | #' library("xgboost") 34 | #' param <- list(objective = "binary:logistic", max_depth = 2) 35 | #' xgb_model <- xgboost(sm, params = param, label = HR_data[, left] == 1, nrounds = 25, verbose=0) 36 | #' 37 | #' inter <- interactions(xgb_model, sm, option = "interactions") 38 | #' inter 39 | #' plot(inter) 40 | #' 41 | #' inter <- interactions(xgb_model, sm, option = "pairs") 42 | #' inter 43 | #' plot(inter) 44 | #' 45 | #' \donttest{ 46 | #'library(lightgbm) 47 | #'train_data <- lgb.Dataset(sm, label = HR_data[, left] == 1) 48 | #'params <- list(objective = "binary", max_depth = 2) 49 | #'lgb_model <- lgb.train(params, train_data, 25) 50 | #' 51 | #' inter <- interactions(lgb_model, sm, option = "interactions") 52 | #' inter 53 | #' plot(inter) 54 | #' 55 | #' inter <- interactions(lgb_model, sm, option = "pairs") 56 | #' inter 57 | #' plot(inter) 58 | #'} 59 | #' 60 | #' @export 61 | #' 62 | #' 63 |
interactions <- function(xgb_model, data, option = "interactions") {
  # Placeholders for column names used inside data.table expressions.
  Child <- Parent <- Feature <- sumGain <- . <- NULL

  # Fail fast on an unsupported `option`. Previously such a value skipped both
  # branches and surfaced later as "object 'gainTable' not found".
  option <- match.arg(option, c("interactions", "pairs"))

  if (option == "interactions") {
    gainTable <- importanceInteraction(xgb_model, data)[, .(Feature, sumGain, frequency)]

    # Feature holds "Parent:Child" names. Split once and extract each side with
    # a type-stable vapply(), which yields NA (rather than a shorter vector)
    # when a name has no second part and character(0) for an empty table.
    pairParts <- strsplit(gainTable[["Feature"]], "[:]")
    parentNames <- vapply(pairParts, `[`, character(1), 1L, USE.NAMES = FALSE)
    childNames <- vapply(pairParts, `[`, character(1), 2L, USE.NAMES = FALSE)

    gainTable[, `:=`(Parent = parentNames, Child = childNames)]
    gainTable <- gainTable[, .(Parent, Child, sumGain, frequency)]
  } else {
    gainTable <- calculatePairsGainTable(xgb_model, data)
  }

  # Tag the result so plot() dispatches to plot.interactions.
  class(gainTable) <- c("interactions", "data.table")
  gainTable
}

# calculatePairsGainTable: gain and frequency of every parent/child variable
# pair that occurs in the model.
#
# Stacks the per-tree tables from calculateGain(), counts how often each
# `name_pair` occurs, sums `childsGain` per pair, and returns a data.table with
# columns Parent, Child, sumGain and frequency, ordered by decreasing sumGain.
calculatePairsGainTable <- function(xgb_model, data) {
  # Placeholders for column names used inside data.table expressions.
  name_pair <- childsGain <- Parent <- Child <- sumGain <- N <- . <- NULL

  trees <- rbindlist(calculateGain(xgb_model, data))

  # Occurrence count (column N) and summed child gain per pair, joined on the
  # pair name.
  importanceCount <- data.table(table(trees[, "name_pair"], dnn = "name_pair"))
  importanceGain <- trees[, .(sumGain = sum(childsGain)), by = "name_pair"]
  importance <- merge(importanceCount, importanceGain, by = "name_pair")

  # name_pair holds "Parent:Child"; split once, extract each side type-stably.
  pairParts <- strsplit(importance[["name_pair"]], "[:]")
  parentNames <- vapply(pairParts, `[`, character(1), 1L, USE.NAMES = FALSE)
  childNames <- vapply(pairParts, `[`, character(1), 2L, USE.NAMES = FALSE)
  importance[, `:=`(Parent = parentNames, Child = childNames)]

  setorderv(importance, "sumGain", -1)

  importance[, .(Parent, Child, sumGain, frequency = N)]
}
100 | -------------------------------------------------------------------------------- /docs/authors.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Authors • EIX 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 33 | 34 | 35 | 36 | 37 | 38 |
    39 |
    40 | 92 | 93 | 94 |
    95 | 96 |
    97 |
    98 | 101 |
      102 |
    • 103 |

      Ewelina Karbowiak. Author, maintainer. 104 |

      105 |
    • 106 |
    • 107 |

      Przemyslaw Biecek. Author, thesis advisor. 108 |

      109 |
    • 110 |
    111 | 112 |
    113 | 114 |
    115 | 116 | 117 |
    118 | 121 | 122 |
    123 |

    Site built with pkgdown.

    124 |
    125 | 126 |
    127 |
    128 | 129 | 130 | 131 | 132 | -------------------------------------------------------------------------------- /docs/articles/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Articles • EIX 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 33 | 34 | 35 | 36 | 37 | 38 |
    39 |
    40 | 92 | 93 | 94 |
    95 | 96 |
    97 |
    98 | 101 | 102 |
    103 |

    All vignettes

    104 |

    105 | 106 | 110 |
    111 |
    112 |
    113 | 114 |
    115 | 118 | 119 |
    120 |

    Site built with pkgdown.

    121 |
    122 | 123 |
    124 |
    125 | 126 | 127 | 128 | 129 | -------------------------------------------------------------------------------- /docs/pkgdown.css: -------------------------------------------------------------------------------- 1 | /* Sticker footer */ 2 | body > .container { 3 | display: flex; 4 | padding-top: 60px; 5 | min-height: calc(100vh); 6 | flex-direction: column; 7 | } 8 | 9 | body > .container .row { 10 | flex: 1; 11 | } 12 | 13 | footer { 14 | margin-top: 45px; 15 | padding: 35px 0 36px; 16 | border-top: 1px solid #e5e5e5; 17 | color: #666; 18 | display: flex; 19 | } 20 | footer p { 21 | margin-bottom: 0; 22 | } 23 | footer div { 24 | flex: 1; 25 | } 26 | footer .pkgdown { 27 | text-align: right; 28 | } 29 | footer p { 30 | margin-bottom: 0; 31 | } 32 | 33 | img.icon { 34 | float: right; 35 | } 36 | 37 | img { 38 | max-width: 100%; 39 | } 40 | 41 | /* Section anchors ---------------------------------*/ 42 | 43 | a.anchor { 44 | margin-left: -30px; 45 | display:inline-block; 46 | width: 30px; 47 | height: 30px; 48 | visibility: hidden; 49 | 50 | background-image: url(./link.svg); 51 | background-repeat: no-repeat; 52 | background-size: 20px 20px; 53 | background-position: center center; 54 | } 55 | 56 | .hasAnchor:hover a.anchor { 57 | visibility: visible; 58 | } 59 | 60 | @media (max-width: 767px) { 61 | .hasAnchor:hover a.anchor { 62 | visibility: hidden; 63 | } 64 | } 65 | 66 | 67 | /* Fixes for fixed navbar --------------------------*/ 68 | 69 | .contents h1, .contents h2, .contents h3, .contents h4 { 70 | padding-top: 60px; 71 | margin-top: -60px; 72 | } 73 | 74 | /* Static header placement on mobile devices */ 75 | @media (max-width: 767px) { 76 | .navbar-fixed-top { 77 | position: absolute; 78 | } 79 | .navbar { 80 | padding: 0; 81 | } 82 | } 83 | 84 | 85 | /* Sidebar --------------------------*/ 86 | 87 | #sidebar { 88 | margin-top: 30px; 89 | } 90 | #sidebar h2 { 91 | font-size: 1.5em; 92 | margin-top: 1em; 93 | } 94 | 95 | #sidebar h2:first-child { 96 | 
margin-top: 0; 97 | } 98 | 99 | #sidebar .list-unstyled li { 100 | margin-bottom: 0.5em; 101 | } 102 | 103 | /* Reference index & topics ----------------------------------------------- */ 104 | 105 | .ref-index th {font-weight: normal;} 106 | .ref-index h2 {font-size: 20px;} 107 | 108 | .ref-index td {vertical-align: top;} 109 | .ref-index .alias {width: 40%;} 110 | .ref-index .title {width: 60%;} 111 | 112 | .ref-index .alias {width: 40%;} 113 | .ref-index .title {width: 60%;} 114 | 115 | .ref-arguments th {text-align: right; padding-right: 10px;} 116 | .ref-arguments th, .ref-arguments td {vertical-align: top;} 117 | .ref-arguments .name {width: 20%;} 118 | .ref-arguments .desc {width: 80%;} 119 | 120 | /* Nice scrolling for wide elements --------------------------------------- */ 121 | 122 | table { 123 | display: block; 124 | overflow: auto; 125 | } 126 | 127 | /* Syntax highlighting ---------------------------------------------------- */ 128 | 129 | pre { 130 | word-wrap: normal; 131 | word-break: normal; 132 | border: 1px solid #eee; 133 | } 134 | 135 | pre, code { 136 | background-color: #f8f8f8; 137 | color: #333; 138 | } 139 | 140 | pre .img { 141 | margin: 5px 0; 142 | } 143 | 144 | pre .img img { 145 | background-color: #fff; 146 | display: block; 147 | height: auto; 148 | } 149 | 150 | code a, pre a { 151 | color: #375f84; 152 | } 153 | table { 154 | display: block; 155 | overflow: auto; 156 | width: 100% !important; 157 | } 158 | 159 | .fl {color: #1514b5;} 160 | .fu {color: #000000;} /* function */ 161 | .ch,.st {color: #036a07;} /* string */ 162 | .kw {color: #264D66;} /* keyword */ 163 | .co {color: #888888;} /* comment */ 164 | 165 | .message { color: black; font-weight: bolder;} 166 | .error { color: orange; font-weight: bolder;} 167 | .warning { color: #6A0366; font-weight: bolder;} 168 | 169 | .navbar-mi2logo { 170 | float: left; 171 | margin-right: 15px; 172 | margin-top: 2px; 173 | } 174 | .navbar-mi2 { 175 | background-color: #4a3c89; 176 | 
color: #fff !important; 177 | margin-right: 0px; 178 | } 179 | .navbar-mi2 > li > a { 180 | color: #fff !important; 181 | } 182 | .navbar-mi2 > .active > a{ 183 | background-color: #370f54 !important; 184 | } 185 | .navbar-mi2 > .open > a:focus, .nav-pills> .open > a:focus{ 186 | background-color: #370f54 !important; 187 | } 188 | .dropdown-menu > .active > a, .dropdown-menu > .active > a:focus{ 189 | background-color: #370f54 !important; 190 | } 191 | 192 | .contents-mi2 > li > a:focus, .nav-pills > li > a:focus { 193 | background-color: #4a3c89 !important; 194 | color: #fff; 195 | } 196 | .contents-mi2 > li.active > a, .nav-pills > li.active > a{ 197 | background-color: #370f54 !important; 198 | } 199 | .contents-mi2 > li > a, .nav-pills > li > a{ 200 | background-color: #4a3c89 !important; 201 | color: #fff; 202 | } 203 | 204 | .sidebar-logo { 205 | display:block; 206 | margin-left:auto; 207 | margin-right:auto; 208 | text-align: justify; 209 | } -------------------------------------------------------------------------------- /docs/reference/tableOfTrees.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | tableOfTrees — tableOfTrees • EIX 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 33 | 34 | 35 | 36 | 37 | 38 |
    39 |
    40 | 92 | 93 | 94 |
    95 | 96 |
    97 |
    98 | 101 | 102 | 103 |

    tableOfTrees

    104 | 105 | 106 |
    tableOfTrees(model, data)
    107 | 108 |

    Arguments

    109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 |
    model

    a xgboost or lightgbm model

    data

    a data table with data used to train the model

    120 | 121 |

    Value

    122 | 123 |

    a data table

    124 | 125 | 126 |
    127 | 136 |
    137 | 138 |
    139 | 142 | 143 |
    144 |

    Site built with pkgdown.

    145 |
    146 | 147 |
    148 |
    149 | 150 | 151 | 152 | 153 | -------------------------------------------------------------------------------- /docs/reference/calculateGain.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | calculateGain — calculateGain • EIX 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 33 | 34 | 35 | 36 | 37 | 38 |
    39 |
    40 | 92 | 93 | 94 |
    95 | 96 |
    97 |
    98 | 101 | 102 | 103 |

    List of trees with pairs of variable and other needed fields

    104 | 105 | 106 |
    calculateGain(xgb.model, data)
    107 | 108 |

    Arguments

    109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 |
    xgb.model

    a xgboost or lightgbm model

    data

    a data table with data used to train the model

    120 | 121 |

    Value

    122 | 123 |

    a list

    124 | 125 | 126 |
    127 | 136 |
    137 | 138 |
    139 | 142 | 143 |
    144 |

    Site built with pkgdown.

    145 |
    146 | 147 |
    148 |
    149 | 150 | 151 | 152 | 153 | -------------------------------------------------------------------------------- /R/calculateGain.R: --------------------------------------------------------------------------------
#' calculateGain
#'
#' List of trees with pairs of variable and other needed fields
#'
#' Every non-leaf node whose parent is known is annotated with its parent's
#' gain and cover (\code{parentsGain}, \code{parentsCover}), the parent's
#' feature (\code{parentsName}), the pair label \code{name_pair}
#' ("parent:child"), its own gain (\code{childsGain}), its \code{depth} and
#' the logical flag \code{interaction}, which is TRUE when the child has a
#' higher gain than its parent and splits on a different feature.
#' All other nodes keep missing values in these columns (and depth 0).
#'
#' @param xgb.model a xgboost or lightgbm model
#' @param data a data table with data used to train the model
#'
#' @return a list of data tables, one per tree
#'
#' @import data.table
#'
#' @keywords internal
#'

# @import stats
# @import utils
calculateGain <- function(xgb.model, data) {

  leaf <- Feature <- Yes <- No <- ID <- parentsGain <- Quality <- parentsCover <-
    Cover <- name_pair <- childsGain <- depth <- parentsName <- NULL

  trees <- tableOfTrees(xgb.model, data)
  trees[, leaf := Feature == "Leaf"]

  # Create every derived column before splitting. Trees without any internal
  # child node then still have the same columns as the others, so callers can
  # rbindlist() the result, and the loop below only updates existing columns
  # in place.
  trees[, `:=`(depth = 0,
               parentsGain = NA_real_,
               parentsCover = NA_real_,
               name_pair = NA_character_,
               childsGain = NA_real_,
               parentsName = NA_character_,
               interaction = NA)]

  treeList <- split(trees, as.factor(trees$Tree))

  for (tree in treeList) {
    # `tree` is updated by reference, which also updates the element of
    # treeList. Rows are visited in table order so that a parent's depth is
    # set before its children read it.
    for (r in which(!tree[, leaf])) {
      for (childID in c(tree[r, Yes], tree[r, No])) {
        if (!tree[ID == childID, leaf]) {
          tree[ID == childID, `:=`(parentsGain = tree[r, Quality],
                                   parentsCover = tree[r, Cover],
                                   name_pair = paste(tree[r, Feature], Feature, sep = ":"),
                                   childsGain = Quality,
                                   depth = tree[r, depth] + 1,
                                   parentsName = tree[r, Feature])]
          tree[ID == childID,
               interaction := ((parentsGain < childsGain) & (Feature != parentsName))]
        }
      }
    }
  }

  return(treeList)
}

#'tableOfTrees
#'
#' Table of tree nodes of the model, one row per node.
#'
#' For xgboost models this is the result of \code{xgb.model.dt.tree}.
#' For lightgbm models the output of \code{lgb.model.dt.tree} is converted to
#' the columns Tree, Node, ID, Feature, Split, Yes, No, Missing, Quality,
#' Cover and Leaf_old_num.
#'
#' @param model a xgboost or lightgbm model
#' @param data a data table with data used to train the model
#'
#' @return a data table
#'
#' @import data.table
#' @importFrom xgboost xgb.model.dt.tree
#'
#' @keywords internal
#'
# @import lightgbm


tableOfTrees <- function(model, data){
  count <- split_feature <- leaf_count <- internal_count <-
    split_index <- tree_index <- leaf_index <- threshold <-
    leaf_value <- split_gain <- flag <- node_parent <- leaf_parent<-
    Node <- Feature <- . <- Cover <- Yes <- No <- ID <-
    Tree<- Quality <- Missing <-Leaf_old_num<- Split <- NULL

  if (inherits(model, "xgb.Booster")) {
    return(xgb.model.dt.tree(colnames(data), model)[])
  }
  if (!inherits(model, "lgb.Booster")) {
    # Previously this case printed a message and returned NULL, which made the
    # callers fail later with an unrelated error.
    stop("You should choose one of two available models: xgboost, lightgbm",
         call. = FALSE)
  }

  lgb.trees <- lightgbm::lgb.model.dt.tree(model)

  lgb.trees[, count := ifelse(is.na(split_feature), leaf_count, internal_count)]

  # Highest split index per tree; leaves are numbered after it (see Node).
  lgb.trees[, max := max(split_index, na.rm = TRUE), by = tree_index]

  trees <- lgb.trees[, `:=`(Tree = tree_index,
                            Node = ifelse(is.na(split_index), max + leaf_index + 1, split_index),
                            Feature = ifelse(is.na(split_feature), "Leaf", split_feature),
                            Split = threshold, Missing = NA, Quality = ifelse(is.na(split_feature), leaf_value, split_gain),
                            Cover = (1/4)*count,
                            Leaf_old_num=leaf_index)]

  # Yes/No are created here so that every tree has them, including a tree
  # that consists of a single node.
  trees[, `:=`(ID = paste(Tree, Node, sep = "-"),
               flag = FALSE,
               Yes = NA_character_,
               No = NA_character_)]
  treeList <- split(trees, as.factor(trees$Tree))

  # NOTE: the direction of the inequality does not matter here. We only need
  # to know which node is the parent, not whether a child goes left or right,
  # so the first child found becomes Yes and the second one No.
  for (tree in treeList) {
    for (i in seq_len(nrow(tree))) {
      # node_parent is used for internal nodes, leaf_parent for leaves.
      for (parentIdx in c(tree[i, node_parent], tree[i, leaf_parent])) {
        if (is.na(parentIdx)) {
          next
        }
        childID <- paste(tree[i, Tree], tree[i, Node], sep = "-")
        if (!tree[Node == parentIdx, flag]) {
          tree[Node == parentIdx, Yes := childID]
          tree[Node == parentIdx, flag := TRUE]
        } else {
          tree[Node == parentIdx, No := childID]
        }
      }
    }
  }

  trees <- rbindlist(treeList)
  return(trees[, .(Tree, Node, ID, Feature, Split, Yes, No, Missing, Quality, Cover,Leaf_old_num)][])
}

-------------------------------------------------------------------------------- /docs/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | Explain Interactions in 'XGBoost' • EIX 9 | 10 | 11 | 12 | 16 | 17 | 18 |
    19 |
    71 | 72 | 73 | 74 |
    75 |
    76 | 77 |
    78 |

    79 | EIX - Explain Interactions in Xgboost

    80 |

    A set of tools to explain XGBoost and LightGBM models.

    81 |
    82 |
    83 |

    84 | Installation

    85 |

    Install from GitHub

    86 |
    devtools::install_github("ModelOriented/EIX")
    87 |
    88 |
    89 |

    90 | Cheatsheets

    91 |

    92 |
    93 | 94 |
    95 | 96 | 126 |
    127 | 128 | 129 |
    132 | 133 |
    134 |

    Site built with pkgdown.

    135 |
    136 | 137 |
    138 |
    139 | 140 | 141 | 142 | -------------------------------------------------------------------------------- /docs/reference/HR_data.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Why are our best and most experienced employees leaving prematurely? — HR_data • EIX 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 33 | 34 | 35 | 36 | 37 | 38 |
    39 |
    40 | 92 | 93 | 94 |
    95 | 96 |
    97 |
    98 | 101 | 102 | 103 |

    A dataset from Kaggle competition Human Resources Analytics. 104 | https://www.kaggle.com/ludobenistant/hr-analytics/data

    105 | 106 | 107 | 108 |

    Format

    109 | 110 |

    A data table with 14999 rows and 10 variables

    111 | 112 |

    Source

    113 | 114 |

    https://www.kaggle.com/ludobenistant/hr-analytics/data, https://cran.r-project.org/package=breakDown

    115 | 116 |

    Details

    117 | 118 |

    The description of the dataset was copied from the breakDown package.

    119 |
      120 |
    • satisfaction_level Level of satisfaction (0-1)

    • 121 |
    • last_evaluation Time since last performance evaluation (in Years)

    • 122 |
    • number_project Number of projects completed while at work

    • 123 |
    • average_montly_hours Average monthly hours at workplace

    • 124 |
    • time_spend_company Number of years spent in the company

    • 125 |
    • Work_accident Whether the employee had a workplace accident

    • 126 |
    • left Whether the employee left the workplace or not (1 or 0) Factor

    • 127 |
    • promotion_last_5years Whether the employee was promoted in the last five years

    • 128 |
    • sales Department in which they work

    • 129 |
    • salary Relative level of salary (high)

    • 130 |
    131 | 132 | 133 |
    134 | 146 |
    147 | 148 |
    149 | 152 | 153 |
    154 |

    Site built with pkgdown.

    155 |
    156 | 157 |
    158 |
    159 | 160 | 161 | 162 | 163 | -------------------------------------------------------------------------------- /R/plot_importance.R: --------------------------------------------------------------------------------
#' Plot importance measures
#'
#' This function plots selected measures of importance for variables and interactions.
#' It is possible to visualise importance table in two ways: radar plot with six measures
#' and scatter plot with two chosen measures.
#'
#' Available measures:
#'\itemize{
#'\item "sumGain" - sum of Gain value in all nodes, in which given variable occurs,
#'\item "sumCover" - sum of Cover value in all nodes, in which given variable occurs; for LightGBM models: number of observation, which pass through the node,
#'\item "mean5Gain" - mean gain from 5 occurrences of given variable with the highest gain,
#'\item "meanGain" - mean Gain value in all nodes, in which given variable occurs,
#'\item "meanCover" - mean Cover value in all nodes, in which given variable occurs; for LightGBM models: mean number of observation, which pass through the node,
#'\item "frequency" - number of occurrences in the nodes for given variable.
#'}
#'
#' Additionally for plots with single variables:
#'\itemize{
#'\item "meanDepth" - mean depth weighted by gain,
#'\item "numberOfRoots" - number of occurrences in the root,
#'\item "weightedRoot" - mean number of occurrences in the root, which is weighted by gain.
#'}
#'
#' @param x a result from the \code{importance} function.
#' @param top number of positions on the plot or NULL for all variable. Default 10.
#' Values larger than the number of rows of \code{x} are reduced to that number.
#' @param radar TRUE/FALSE. If TRUE the plot shows
#' six measures of variables' or interactions' importance in the model.
#' If FALSE the plot containing two chosen measures
#' of variables' or interactions' importance in the model.
#' @param text_start_point place, where the names of the particular feature start. Available for `radar=TRUE`. Range from 0 to 1. Default 0.5.
#' @param text_size size of the text on the plot. Default 3.5.
#' @param xmeasure measure on the x-axis. Available for `radar=FALSE`. Default "sumCover".
#' @param ymeasure measure on the y-axis. Available for `radar=FALSE`. Default "sumGain".
#' @param ... other parameters.
#'
#' @return a ggplot object
#'
#' @import ggplot2
#' @import data.table
#' @importFrom DALEX theme_drwhy
#' @importFrom ggrepel geom_label_repel
#' @importFrom ggiraphExtra coord_radar
#'
#' @examples
#' library("EIX")
#' library("Matrix")
#' sm <- sparse.model.matrix(left ~ . - 1, data = HR_data)
#'
#' library("xgboost")
#' param <- list(objective = "binary:logistic", max_depth = 2)
#' xgb_model <- xgboost(sm, params = param, label = HR_data[, left] == 1, nrounds = 25, verbose=0)
#'
#' imp <- importance(xgb_model, sm, option = "both")
#' imp
#' plot(imp, top = 10)
#'
#' imp <- importance(xgb_model, sm, option = "variables")
#' imp
#' plot(imp, top = nrow(imp))
#'
#' imp <- importance(xgb_model, sm, option = "interactions")
#' imp
#' plot(imp, top = nrow(imp))
#'
#' imp <- importance(xgb_model, sm, option = "variables")
#' imp
#' plot(imp, top = NULL, radar = FALSE, xmeasure = "sumCover", ymeasure = "sumGain")
#'
#'\donttest{
#'library(lightgbm)
#'train_data <- lgb.Dataset(sm, label = HR_data[, left] == 1)
#'params <- list(objective = "binary", max_depth = 2)
#'lgb_model <- lgb.train(params, train_data, 25)
#'
#' imp <- importance(lgb_model, sm, option = "both")
#' imp
#' plot(imp, top = nrow(imp))
#'
#' imp <- importance(lgb_model, sm, option = "variables")
#' imp
#' plot(imp, top = NULL, radar = FALSE, xmeasure = "sumCover", ymeasure = "sumGain")
#'
#'}
#'
#' @export


plot.importance <- function(x, ..., top = 10, radar = TRUE, text_start_point = 0.5, text_size=3.5,
                            xmeasure = "sumCover", ymeasure = "sumGain"){

  Feature <- sumGain <- sumCover <- meanGain <- meanCover <-
    mean5Gain <- . <- value <- variable <- hjust <- angle <-
    label <- label_y <- NULL

  # Never index past the end of the table: x[1:top, ] would append all-NA rows
  # when the table has fewer than `top` rows (the default is 10).
  top <- if (is.null(top)) nrow(x) else min(top, nrow(x))
  shown <- x[seq_len(top), ]

  if (!radar) {
    ggplot(data.frame(shown),
           aes_string(x = xmeasure, y = ymeasure, label = "Feature")) +
      geom_point() +
      scale_size() + geom_label_repel() + theme_drwhy()

  } else {
    # Scale every measure by its maximum so that all six share one radial axis.
    import <- as.data.table(shown)
    import <- import[, .(Feature,
                         sumGain = sumGain / max(sumGain),
                         sumCover = sumCover / max(sumCover),
                         meanGain = meanGain / max(meanGain),
                         meanCover = meanCover / max(meanCover),
                         mean5Gain = mean5Gain / max(mean5Gain),
                         frequency = frequency / max(frequency))]

    # Long interaction names get a space after ":" so that strwrap() below has
    # a place to break the label.
    import[, Feature := ifelse(nchar(Feature) > 20, gsub(":", ": :", Feature), Feature)]
    import[, Feature := factor(Feature, levels = Feature[order(sumGain, decreasing = TRUE)])]

    # angles and hjust of labels: one slot per feature around the circle;
    # labels on the left half are flipped by 180 degrees and right-aligned.
    numberOfBars <- nrow(import)
    barAngle <- 90 - 360 * (seq_len(numberOfBars) - 0.5) / numberOfBars
    import[, `:=`(hjust = ifelse(barAngle < -90, 1, 0),
                  angle = ifelse(barAngle < -90, barAngle + 180, barAngle))]

    # The six measures are named explicitly. The former `measures = 2:6`
    # argument was not a melt() parameter and was silently ignored, so all
    # six non-id columns were melted; this keeps that behaviour.
    data_to_plot <- melt(import,
                         id.vars = c("Feature", "hjust", "angle"),
                         measure.vars = c("sumGain", "sumCover", "meanGain",
                                          "meanCover", "mean5Gain", "frequency"),
                         value.factor = FALSE)

    # One row per feature with the position and text of its label.
    label_data <- data_to_plot[, .(hjust = mean(hjust), angle = mean(angle)), by = Feature]
    label_data[, `:=`(label = vapply(strwrap(as.character(Feature), width = 10, simplify = FALSE),
                                     paste, character(1), collapse = "\n"),
                      label_y = text_start_point)]

    ggplot(data.frame(data_to_plot),
           aes(x = Feature, y = value, colour = variable, group = variable)) +
      geom_line(size = 1.5) +
      geom_point(size = 2.5) +
      theme_drwhy() +
      theme(axis.title.x = element_blank(),
            axis.title.y = element_blank(),
            legend.position = "bottom",
            panel.grid.major.y = element_line(colour = "gray68", linetype = "dashed", size = 0.4),
            axis.line = element_blank(),
            axis.text.x = element_blank()) +
      # NOTE(review): the layers map `colour`, not `fill`, so this title is
      # probably never shown - confirm whether `colour = "Measures"` was meant.
      labs(fill = "Measures") +
      coord_radar() +
      geom_text(data = label_data,
                aes(x = Feature, y = label_y, label = label, hjust = hjust),
                color = "#371ea3", fontface = "bold", alpha = 0.6, size = text_size,
                angle = label_data$angle, inherit.aes = FALSE)

  }
}

-------------------------------------------------------------------------------- /docs/reference/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Function reference • EIX 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 33 | 34 | 35 | 36 | 37 | 38 |
    39 |
    40 | 92 | 93 | 94 |
    95 | 96 |
    97 |
    98 | 104 | 105 |
    106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 119 | 120 | 121 | 122 | 125 | 126 | 127 | 128 | 131 | 132 | 133 | 134 | 137 | 138 | 139 | 140 | 143 | 144 | 145 | 146 | 149 | 150 | 151 | 152 | 155 | 156 | 157 | 158 | 161 | 162 | 163 | 164 | 167 | 168 | 169 | 170 | 173 | 174 | 175 | 176 |
    116 |

    All functions

    117 |

    118 |
    123 |

    HR_data

    124 |

    Why are our best and most experienced employees leaving prematurely?

    129 |

    importance()

    130 |

    Importance of variables and interactions in the model

    135 |

    interactions()

    136 |

    Importance of interactions and pairs in the model

    141 |

    lollipop()

    142 |

    Tables needed for lollipop plot

    147 |

    plot(<importance>)

    148 |

    Plot importance measures

    153 |

    plot(<interactions>)

    154 |

    Plot importance of interactions or pairs

    159 |

    plot(<lollipop>)

    160 |

    Visualisation of the model

    165 |

    titanic_data

    166 |

    Passengers and Crew on the RMS Titanic

    171 |

    waterfall()

    172 |

    Explain prediction of a single observation

    177 |
    178 |
    179 | 180 | 186 |
    187 | 188 |
    189 | 192 | 193 |
    194 |

    Site built with pkgdown.

    195 |
    196 | 197 |
    198 |
    199 | 200 | 201 | 202 | 203 | -------------------------------------------------------------------------------- /R/importance.R: --------------------------------------------------------------------------------
#' Importance of variables and interactions in the model
#'
#' This function calculates a table with selected measures of importance
#' for variables and interactions.
#'
#' Available measures:
#'\itemize{
#'\item "sumGain" - sum of Gain value in all nodes, in which given variable occurs,
#'\item "sumCover" - sum of Cover value in all nodes, in which given variable occurs; for LightGBM models: number of observation, which pass through the node,
#'\item "mean5Gain" - mean gain from 5 occurrences of given variable with the highest gain,
#'\item "meanGain" - mean Gain value in all nodes, in which given variable occurs,
#'\item "meanCover" - mean Cover value in all nodes, in which given variable occurs; for LightGBM models: mean number of observation, which pass through the node,
#'\item "frequency" - number of occurrences in the nodes for given variable.
#'}
#'
#' Additionally for table with single variables:
#'\itemize{
#'\item "meanDepth" - mean depth weighted by gain,
#'\item "numberOfRoots" - number of occurrences in the root,
#'\item "weightedRoot" - mean number of occurrences in the root, which is weighted by gain.
#'}
#'
#' @param xgb_model a xgboost or lightgbm model.
#' @param data a data table with data used to train the model.
#' @param option if "variables" then table includes only single variables,
#' if "interactions", then only interactions
#' if "both", then both single variable and interactions.
#' Default "both". Any other value raises an error.
#' @param digits number of significant digits that shall be returned. Will be passed to the signif() functions.
#'
#' @return a data table of class \code{c("importance", "data.table")},
#' ordered by decreasing \code{sumGain}.
#'
#' @import data.table
#' @importFrom stats frequency
#' @importFrom stats weighted.mean
#'
#' @examples
#' library("EIX")
#' library("Matrix")
#' sm <- sparse.model.matrix(left ~ . - 1, data = HR_data)
#'
#' library("xgboost")
#' param <- list(objective = "binary:logistic", max_depth = 2)
#' xgb_model <- xgboost(sm, params = param, label = HR_data[, left] == 1, nrounds = 25, verbose=0)
#'
#' imp <- importance(xgb_model, sm, option = "both")
#' imp
#' plot(imp, top = 10)
#'
#' imp <- importance(xgb_model, sm, option = "variables")
#' imp
#' plot(imp, top = nrow(imp))
#'
#' imp <- importance(xgb_model, sm, option = "interactions")
#' imp
#' plot(imp, top = nrow(imp))
#'
#' imp <- importance(xgb_model, sm, option = "variables")
#' imp
#' plot(imp, top = NULL, radar = FALSE, xmeasure = "sumCover", ymeasure = "sumGain")
#'
#'\donttest{
#'library(lightgbm)
#'train_data <- lgb.Dataset(sm, label = HR_data[, left] == 1)
#'params <- list(objective = "binary", max_depth = 2)
#'lgb_model <- lgb.train(params, train_data, 25)
#'
#' imp <- importance(lgb_model, sm, option = "both")
#' imp
#' plot(imp, top = nrow(imp))
#'
#' imp <- importance(lgb_model, sm, option = "variables")
#' imp
#' plot(imp, top = NULL, radar = FALSE, xmeasure = "sumCover", ymeasure = "sumGain")
#'
#'}
#'
#' @export

importance <- function(xgb_model, data, option = "both", digits = 4){
  # Reject unknown options up front; previously they failed later while
  # subsetting NULL.
  option <- match.arg(option, c("both", "variables", "interactions"))

  importanceTable <- switch(option,
                            both = importanceTableMixed(xgb_model, data),
                            variables = importanceSingleVariable(xgb_model, data),
                            interactions = importanceInteraction(xgb_model, data))

  # The first column is the Feature name; all remaining columns are rounded.
  importanceTable <- cbind(importanceTable[, 1],
                           signif(importanceTable[, -1], digits = digits))

  class(importanceTable) <- c("importance", "data.table")

  return(importanceTable[])

}

# Aggregates a table with the columns Feature, Gain and Cover into one row per
# Feature: sumGain, sumCover, meanGain, meanCover, frequency and mean5Gain.
summariseImportance <- function(gains) {
  Feature <- Gain <- Cover <- . <- NULL

  totals <- gains[, .(sumGain = sum(Gain),
                      sumCover = sum(Cover),
                      meanGain = mean(Gain),
                      meanCover = mean(Cover),
                      frequency = .N), by = Feature]

  merge(totals, mean5gain(gains), by = "Feature")
}

# Importance table in which a node flagged as an interaction is counted under
# its "parent:child" pair and every other node under its own feature.
importanceTableMixed <- function(xgb_model, data){
  childsGain <- name_pair <- Cover <- Feature <- . <- Quality <- NULL

  trees <- noLeavesGainTable(xgb_model, data)

  #single variables (nodes that are not flagged, or have no flag at all)
  singles <- trees[is.na(interaction) | !interaction, .(Feature, Gain = Quality, Cover)]

  #interactions
  pairs <- trees[interaction == TRUE, .(Feature = name_pair, Gain = childsGain, Cover)]

  result <- summariseImportance(rbind(singles, pairs))
  setorderv(result, "sumGain", -1)

  return(result[])

}


# Importance table of the interactions only; Feature holds "parent:child".
importanceInteraction <- function(xgb_model, data) {
  childsGain <- name_pair <- Cover <- . <- NULL

  trees <- noLeavesGainTable(xgb_model, data)
  pairs <- trees[interaction == TRUE, .(Feature = name_pair, Gain = childsGain, Cover)]

  result <- summariseImportance(pairs)
  setorderv(result, "sumGain", -1)

  return(result[])
}


# Importance table of single variables, extended with numberOfRoots,
# weightedRoot and meanDepth.
importanceSingleVariable <- function(xgb_model, data) {
  Feature <- Quality <- Cover <- . <- NULL

  trees <- noLeavesGainTable(xgb_model, data)

  rootsAndDepth <- merge(countRoots(trees), calculateWeightedDepth(trees),
                         by = "Feature", all = TRUE)[, -"sumGain"]

  gains <- trees[, .(Feature, Gain = Quality, Cover)]
  result <- merge(rootsAndDepth, summariseImportance(gains), by = "Feature")[, -"count"]

  setorderv(result, "sumGain", -1)
  # Features that never occur in a root have no root statistics; report 0.
  result[is.na(result)] <- 0

  return(result[])

}

#Table with number of roots and weightedRoot
#counts how many times each variable is in the root of the tree and calculates the weightedRoot - number of occurrences in root weighted by Gain.
countRoots <- function(trees) {
  Node <- Quality <- Feature <- sumGain <- . <-
    weightedRoot <- numberOfRoots <- NULL

  roots <- trees[Node == 0, .(sumGain = sum(Quality), numberOfRoots = .N), by = Feature]
  roots[, weightedRoot := round(sumGain * numberOfRoots / sum(sumGain), 4)]

  return(roots[])

}

#Mean from 5 nodes with the highest gain
#The input table is left untouched: sorting is done on a copy.
mean5gain <- function(trees) {
  Gain <- . <- Feature <- NULL

  ranked <- as.data.table(trees)[order(Feature, -Gain)]
  top5 <- ranked[, .(mean5Gain = mean(Gain[seq_len(min(.N, 5L))])), by = Feature]

  return(top5[])
}

#calculates depth mean for every variable weighted by Gain
calculateWeightedDepth <- function(trees) {
  Feature <- depth <- Quality <- . <- NULL

  depths <- trees[, .(meanDepth = weighted.mean(depth, Quality), count = .N), by = Feature]

  return(depths[])
}


#All non-leaf nodes of all trees in one table, with the fields added by calculateGain
noLeavesGainTable <- function(xgb_model, data) {
  parentsName <- Feature <- Tree <- name_pair <- parentsGain <- childsGain <-
    . <- Cover <- parentsCover <- interaction <- Node <- Quality <- depth <- NULL

  trees <- rbindlist(calculateGain(xgb_model, data))
  trees <- trees[Feature != "Leaf", .(Tree, Node, name_pair, parentsGain, childsGain, Cover,
                                      parentsCover, Feature, Quality, parentsName, interaction, depth)]

  return(trees[])
}

-------------------------------------------------------------------------------- /docs/reference/titanic.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Passengers and Crew on the RMS Titanic — titanic • EIX 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 33 | 34 | 35 | 36 | 37 | 38 |
    39 |
    40 | 87 | 88 | 89 |
    90 | 91 |
    92 |
    93 | 96 | 97 | 98 |

    The titanic data is a complete list of passengers and crew members on the RMS Titanic. 99 | It includes a variable indicating whether a person did survive the sinking of the RMS 100 | Titanic on April 15, 1912.

    101 | 102 | 103 |
    data(titanic)
    104 | 105 |

    Format

    106 | 107 |

    a data frame with 2207 rows and 11 columns

    108 | 109 |

    Source

    110 | 111 |

    The description of the dataset was copied from the DALEX package. 112 | This dataset was copied from the stablelearner package and went through a few variable 113 | transformations. The complete list of persons on the RMS Titanic was downloaded from 114 | https://www.encyclopedia-titanica.org on April 5, 2016. The information given 115 | in sibsp and parch was adopted from a data set obtained from http://biostat.mc.vanderbilt.edu/DataSets.

    116 | 117 |

    Details

    118 | 119 |

    The description of the dataset was copied from the DALEX package.

    120 |

    This dataset was copied from the stablelearner package and went through a few variable 121 | transformations. Levels in embarked were replaced with full names; sibsp, parch and fare 122 | were converted to numerical variables and values for crew were replaced with 0. 123 | If you use this dataset, please cite the original package.

    124 |

    From stablelearner: The website https://www.encyclopedia-titanica.org offers detailed information about passengers and crew 125 | members on the RMS Titanic. According to the website, 1317 passengers and 890 crew members were aboard. 126 | 8 musicians and 9 employees of the shipyard company are listed as passengers, but travelled with a 127 | free ticket, which is why they have NA values in fare. In addition to that, fare 128 | is truly missing for a few regular passengers.

    129 |
      130 |
    • gender a factor with levels male and female.

    • 131 |
    • age a numeric value with the persons age on the day of the sinking.

    • 132 |
    • class a factor specifying the class for passengers or the type of service aboard for crew members.

    • 133 |
    • embarked a factor with the person's place of embarkation (Belfast/Cherbourg/Queenstown/Southampton).

    • 134 |
    • country a factor with the persons home country.

    • 135 |
    • fare a numeric value with the ticket price (0 for crew members, musicians and employees of the shipyard company).

    • 136 |
    • sibsp an ordered factor specifying the number of siblings/spouses aboard; adopted from Vanderbilt data set (see below).

    • 137 |
    • parch an ordered factor specifying the number of parents/children aboard; adopted from Vanderbilt data set (see below).

    • 138 |
    • survived a factor with two levels (no and yes) specifying whether the person has survived the sinking.

    • 139 |
    140 | 141 |

    References

    142 | 143 |

    https://www.encyclopedia-titanica.org, http://biostat.mc.vanderbilt.edu/DataSets, 144 | https://CRAN.R-project.org/package=stablelearner, https://cran.r-project.org/web/packages/DALEX/index.html.

    145 | 146 | 147 |
    148 | 162 |
    163 | 164 |
    165 | 168 | 169 |
    170 |

    Site built with pkgdown.

    171 |
    172 | 173 |
    174 |
    175 | 176 | 177 | 178 | 179 | -------------------------------------------------------------------------------- /docs/reference/titanic_data.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Passengers and Crew on the RMS Titanic — titanic_data • EIX 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 33 | 34 | 35 | 36 | 37 | 38 |
    39 |
    40 | 92 | 93 | 94 |
    95 | 96 |
    97 |
    98 | 101 | 102 | 103 |

    The titanic data is a complete list of passengers and crew members on the RMS Titanic. 104 | It includes a variable indicating whether a person did survive the sinking of the RMS 105 | Titanic on April 15, 1912.

    106 | 107 | 108 |
    data(titanic_data)
    109 | 110 |

    Format

    111 | 112 |

    a data frame with 2207 rows and 11 columns

    113 | 114 |

    Source

    115 | 116 |

    The description of the dataset was copied from the DALEX package. 117 | This dataset was copied from the stablelearner package and went through a few variable 118 | transformations. The complete list of persons on the RMS Titanic was downloaded from 119 | https://www.encyclopedia-titanica.org on April 5, 2016. The information given 120 | in sibsp and parch was adopted from a data set obtained from http://biostat.mc.vanderbilt.edu/DataSets.

    121 | 122 |

    Details

    123 | 124 |

    The description of the dataset was copied from the DALEX package.

    125 |

    This dataset was copied from the stablelearner package and went through a few variable 126 | transformations. Levels in embarked were replaced with full names; sibsp, parch and fare 127 | were converted to numerical variables and values for crew were replaced with 0. 128 | If you use this dataset, please cite the original package.

    129 |

    From stablelearner: The website https://www.encyclopedia-titanica.org offers detailed information about passengers and crew 130 | members on the RMS Titanic. According to the website, 1317 passengers and 890 crew members were aboard. 131 | 8 musicians and 9 employees of the shipyard company are listed as passengers, but travelled with a 132 | free ticket, which is why they have NA values in fare. In addition to that, fare 133 | is truly missing for a few regular passengers.

    134 |
      135 |
    • gender a factor with levels male and female.

    • 136 |
    • age a numeric value with the persons age on the day of the sinking.

    • 137 |
    • class a factor specifying the class for passengers or the type of service aboard for crew members.

    • 138 |
    • embarked a factor with the person's place of embarkation (Belfast/Cherbourg/Queenstown/Southampton).

    • 139 |
    • country a factor with the persons home country.

    • 140 |
    • fare a numeric value with the ticket price (0 for crew members, musicians and employees of the shipyard company).

    • 141 |
    • sibsp an ordered factor specifying the number of siblings/spouses aboard; adopted from Vanderbilt data set (see below).

    • 142 |
    • parch an ordered factor specifying the number of parents/children aboard; adopted from Vanderbilt data set (see below).

    • 143 |
    • survived a factor with two levels (no and yes) specifying whether the person has survived the sinking.

    • 144 |
    145 | 146 |

    References

    147 | 148 |

    https://www.encyclopedia-titanica.org, http://biostat.mc.vanderbilt.edu/DataSets, 149 | https://CRAN.R-project.org/package=stablelearner, https://cran.r-project.org/package=DALEX.

    150 | 151 | 152 |
    153 | 167 |
    168 | 169 |
    170 | 173 | 174 |
    175 |

    Site built with pkgdown.

    176 |
    177 | 178 |
    179 |
    180 | 181 | 182 | 183 | 184 | -------------------------------------------------------------------------------- /docs/reference/countPairs.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Table of occurancess number — countPairs • EIX 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 57 | 58 | 59 | 60 | 61 | 62 |
    63 |
    64 | 99 | 100 | 101 |
    102 | 103 |
    104 |
    105 | 110 | 111 |
    112 | 113 |

    Table containing the number of occurrences of variable pairs in the model.

    114 | 115 |
    116 | 117 |
    countPairs(xgb.model, data)
    118 | 119 |

    Arguments

    120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 |
    xgb.model

    a xgboost or lightgbm model

    data

    a data table with data used to train the model

    131 | 132 |

    Value

    133 | 134 |

    a data table

    135 | 136 | 137 |

    Examples

    138 |
    library("EIX") 139 | library("Matrix")
    #> Warning: pakiet 'Matrix' został zbudowany w wersji R 3.4.4
    library("data.table")
    #> Warning: pakiet 'data.table' został zbudowany w wersji R 3.4.4
    library("xgboost")
    #> Warning: pakiet 'xgboost' został zbudowany w wersji R 3.4.4
    140 | dt_HR <- data.table(HR_data) 141 | sm <- sparse.model.matrix(left ~ . - 1, data = dt_HR) 142 | 143 | param <- list(objective = "binary:logistic", base_score = 0.5, max_depth = 2) 144 | xgb.model <- xgboost( param = param, data = sm, label = dt_HR[, left] == 1, nrounds = 50, verbose = FALSE) 145 | 146 | countPairs(xgb.model, sm)
    #> Error in countPairs(xgb.model, sm): nie udało się znaleźć funkcji 'countPairs'
    147 |
    148 |
    149 | 160 |
    161 | 162 |
    163 | 166 | 167 |
    168 |

    Site built with pkgdown 1.3.0.

    169 |
    170 |
    171 |
    172 | 173 | 174 | 175 | 176 | 177 | 178 | -------------------------------------------------------------------------------- /docs/reference/lollipop.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Tables needed for lollipop plot — lollipop • EIX 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 33 | 34 | 35 | 36 | 37 | 38 |
    39 |
    40 | 92 | 93 | 94 |
    95 | 96 |
    97 |
    98 | 101 | 102 | 103 |

    This function calculates two tables needed to generate the lollipop plot, which visualises the model. 104 | The first table contains information about all nodes in the trees forming the model. 105 | It includes the gain value, depth and ID of each node. 106 | The second table contains similar information about the roots of the trees.

    107 | 108 | 109 |
    lollipop(xgb_model, data)
    110 | 111 |

    Arguments

    112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 |
    xgb_model

    a xgboost or lightgbm model.

    data

    a data table with data used to train the model.

    123 | 124 |

    Value

    125 | 126 |

    an object of the lollipop class

    127 | 128 | 129 |

    Examples

    130 |
    library("EIX") 131 | library("Matrix") 132 | sm <- sparse.model.matrix(left ~ . - 1, data = HR_data) 133 | 134 | library("xgboost") 135 | param <- list(objective = "binary:logistic", max_depth = 2) 136 | xgb_model <- xgboost(sm, params = param, label = HR_data[, left] == 1, nrounds = 25, verbose = 0) 137 | 138 | lolli <- lollipop(xgb_model, sm) 139 | plot(lolli, labels = "topAll", log_scale = TRUE)
    140 |
    library(lightgbm) 141 | train_data <- lgb.Dataset(sm, label = HR_data[, left] == 1) 142 | params <- list(objective = "binary", max_depth = 2) 143 | lgb_model <- lgb.train(params, train_data, 25) 144 | 145 | lolli <- lollipop(lgb_model, sm) 146 | plot(lolli, labels = "topAll", log_scale = TRUE)
    147 |
    148 |
    149 |
    150 | 161 |
    162 | 163 |
    164 | 167 | 168 |
    169 |

    Site built with pkgdown.

    170 |
    171 | 172 |
    173 |
    174 | 175 | 176 | 177 | 178 | -------------------------------------------------------------------------------- /docs/reference/EIX_lollipop.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Tables needed for lollipop plot — EIX_lollipop • EIX 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 33 | 34 | 35 | 36 | 37 | 38 |
    39 |
    40 | 72 | 73 | 74 |
    75 | 76 |
    77 |
    78 | 81 | 82 | 83 |

    This function calculates two tables needed to generate the lollipop plot, which visualises the model. 84 | The first table contains information about all nodes in the trees forming the model. 85 | It includes the gain value, depth and ID of each node. 86 | The second table contains similar information about the roots of the trees.

    87 | 88 | 89 |
    EIX_lollipop(xgb.model, data)
    90 | 91 |

    Arguments

    92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 |
    xgb.model

    a xgboost or lightgbm model.

    data

    a data table with data used to train the model.

    103 | 104 |

    Value

    105 | 106 |

    an object of the lollipop class

    107 | 108 | 109 |

    Examples

    110 |
    library("EIX") 111 | library("Matrix")
    #> Warning: pakiet 'Matrix' został zbudowany w wersji R 3.4.4
    sm <- sparse.model.matrix(left ~ . - 1, data = HR_data) 112 | 113 | library("xgboost")
    #> Warning: pakiet 'xgboost' został zbudowany w wersji R 3.4.4
    param <- list(objective = "binary:logistic", max_depth = 2) 114 | xgb.model <- xgboost(sm, params = param, label = HR_data[, left] == 1, nrounds = 50)
    #> [1] train-error:0.150077 115 | #> [2] train-error:0.098007 116 | #> [3] train-error:0.098007 117 | #> [4] train-error:0.098007 118 | #> [5] train-error:0.098007 119 | #> [6] train-error:0.098007 120 | #> [7] train-error:0.098007 121 | #> [8] train-error:0.095873 122 | #> [9] train-error:0.095873 123 | #> [10] train-error:0.095606 124 | #> [11] train-error:0.095473 125 | #> [12] train-error:0.093406 126 | #> [13] train-error:0.061271 127 | #> [14] train-error:0.059404 128 | #> [15] train-error:0.055137 129 | #> [16] train-error:0.063271 130 | #> [17] train-error:0.043070 131 | #> [18] train-error:0.042670 132 | #> [19] train-error:0.039203 133 | #> [20] train-error:0.038536 134 | #> [21] train-error:0.037669 135 | #> [22] train-error:0.037869 136 | #> [23] train-error:0.036802 137 | #> [24] train-error:0.037336 138 | #> [25] train-error:0.036602 139 | #> [26] train-error:0.036402 140 | #> [27] train-error:0.036669 141 | #> [28] train-error:0.035802 142 | #> [29] train-error:0.035402 143 | #> [30] train-error:0.032202 144 | #> [31] train-error:0.031869 145 | #> [32] train-error:0.031469 146 | #> [33] train-error:0.030935 147 | #> [34] train-error:0.030602 148 | #> [35] train-error:0.030269 149 | #> [36] train-error:0.029402 150 | #> [37] train-error:0.029269 151 | #> [38] train-error:0.028802 152 | #> [39] train-error:0.028802 153 | #> [40] train-error:0.028535 154 | #> [41] train-error:0.028269 155 | #> [42] train-error:0.028202 156 | #> [43] train-error:0.027935 157 | #> [44] train-error:0.027669 158 | #> [45] train-error:0.027669 159 | #> [46] train-error:0.027402 160 | #> [47] train-error:0.028269 161 | #> [48] train-error:0.027268 162 | #> [49] train-error:0.026668 163 | #> [50] train-error:0.026335
    164 | lolli <- EIX_lollipop(xgb.model, sm) 165 | plot(lolli, labels = "topAll", log_scale = TRUE)
    #> Warning: Transformation introduced infinite values in continuous x-axis
    #> Warning: Transformation introduced infinite values in continuous x-axis
    #> Warning: Transformation introduced infinite values in continuous x-axis
    #> Warning: Transformation introduced infinite values in continuous x-axis
    #> Warning: Transformation introduced infinite values in continuous x-axis
    #> Warning: Transformation introduced infinite values in continuous x-axis
    #> Warning: Removed 7 rows containing missing values (geom_text_repel).
    166 |
    167 |
    168 | 179 |
    180 | 181 |
    182 | 185 | 186 |
    187 |

    Site built with pkgdown.

    188 |
    189 | 190 |
    191 |
    192 | 193 | 194 | 195 | 196 | -------------------------------------------------------------------------------- /vignettes/EIX.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "EIX: Explain Interactions in XGBoost" 3 | author: "Ewelina Karbowiak" 4 | date: "2018-12-07" 5 | output: rmarkdown::html_vignette 6 | vignette: > 7 | %\VignetteIndexEntry{EIX Explain Interactions in XGBoost} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | ```{r, echo=FALSE} 13 | knitr::opts_chunk$set(fig.height = 6, fig.width = 6, fig.align = "center") 14 | 15 | ``` 16 | 17 | Package `EIX` is the set of tools to explore the structure of XGBoost and lightGBM models. It includes functions finding strong interactions and also checking importance of single variables and interactions by usage different measures. `EIX` consists several functions to visualize results. 18 | 19 | Almost all `EIX` functions require only two parameters: a XGBoost or LightGBM model and data table used as training dataset. The exceptions are the `waterfall` function and its plot. The first one requires parameters: a XGBoost model and observation, which prediction has to be explained). These two functions support only XGBoost models. All plots are created with package `ggplot2`. Most of them use plot theme `theme_mi2` from `DALEX`. 20 | 21 | ## Data Info 22 | 23 | This vignette shows usage of `EIX` package. It lets to explain XGBoost prediction model concerning departures of employees from company using HR_data. Dataset was taken from kaggle and consists 14999 observations and 10 variables. The dataset is also available in package `EIX` and there it is described more precisely. 
24 | 25 | ```{r} 26 | #devtools :: install_github("ModelOriented/EIX") 27 | library("EIX") 28 | set.seed(4) 29 | knitr::kable(head(HR_data)) 30 | ``` 31 | 32 | To create correct XGBoost model, remember to change categorical features to factors and next change the data frame to sparse matrix. The categorical features are one-hot encoded. 33 | 34 | ```{r, warning=FALSE, message=FALSE} 35 | library("Matrix") 36 | sparse_matrix <- sparse.model.matrix(left ~ . - 1, data = HR_data) 37 | head(sparse_matrix) 38 | ``` 39 | 40 | 41 | ## Xgboost model creation 42 | 43 | Package `EIX` uses table, which was generated by `xgboost::xgb.model.dt.tree` with information about trees, their nodes and leaves. 44 | 45 | ```{r} 46 | library("xgboost") 47 | param <- list(objective = "binary:logistic", max_depth = 2) 48 | xgb_model <- xgboost(sparse_matrix, params = param, label = HR_data[, left] == 1, nrounds = 50, verbose = FALSE) 49 | knitr::kable(head(xgboost::xgb.model.dt.tree(colnames(sparse_matrix),xgb_model))) 50 | ``` 51 | 52 | Function `xgboost::xgb.importance` shows importance of single variables. `EIX` adds new measures of variables’ importance and shows also importance of interactions. 53 | 54 | ```{r} 55 | knitr::kable(head(xgboost::xgb.importance(colnames(sparse_matrix),xgb_model))) 56 | ``` 57 | 58 | ## Model visualization 59 | 60 | The `lollipop` plot is used to visualize the model in such way that the most important variables and interactions are visible. 61 | 62 | On the x-axis, there are tree numbers and on the y-axis there is **Gain** measure for each node. One segment is one tree in the model and each point is one node. On the plot there are all nodes, which are not leaves. Shape of points signifies depth of node. All roots on the plot are connected by a red line. If in the same segment there is a variable with a higher depth above the variable with a lower depth, it means that interaction occurs. 63 | 64 | There is opportunity to set a different way of labeling. 
On the plot we can see the most important variables in roots (horizontal labels) and interactions (vertical labels); this is the option `labels = "topAll"`, which is the default. Moreover, there are two additional options: `labels = "roots"` for variables in roots only, and `labels = "interactions"` for interactions only. You can change the number of labels visible on the plot with the parameter `threshold` (range from 0 to 1, default 0.1). 65 | The plot is on a logarithmic scale because the initial trees are usually the most important. You can change the scale of the plot by setting the parameter `log_scale = FALSE`. 66 | 67 | ```{r} 68 | lolli<-lollipop(xgb_model,sparse_matrix) 69 | plot(lolli) 70 | #plot(lolli, threshold=0.05) 71 | #plot(lolli, labels="roots") 72 | #plot(lolli, labels="interactions") 73 | #plot(lolli, labels="roots", threshold=0.05) 74 | #plot(lolli, labels="interactions",threshold=0.05) 75 | #plot(lolli, log_scale = FALSE) 76 | ``` 77 | 78 | 79 | ## Interactions 80 | 81 | We can consider interactions in two ways. In the first approach we explore all pairs of variables that occur in the model one above the other. This approach is not the best one, because we cannot distinguish whether a pair of variables is a real interaction or not. In this approach a high gain of a pair can be a result of a high gain of the lower variable (child). To explore pairs of variables you can generate a table with them using the function `interactions` with the parameter `option = "pairs"`. This table includes the **Gain** measure and the number of occurrences of pairs. You can also use the function `plot` to visualize the **Gain** measure. 82 | 83 | ```{r} 84 | pairs<-interactions(xgb_model, sparse_matrix, option = "pairs") 85 | head(pairs) 86 | plot(pairs) 87 | ``` 88 | 89 | The `interactions` plot is a matrix plot with the child from the pair on the x-axis and the parent on the y-axis. The color of the square at the intersection of two variables represents the value of the **sumGain** measure.
The darker the square, the higher the **sumGain** of the variable pair. The range of the **sumGain** measure is divided into four equal parts: `very low, low, medium, high`. 90 | 91 | In the second approach, to find strong interactions, we consider only those pairs of variables where the variable on the bottom (child) has a higher gain than the variable on the top (parent). We can also create a ranking of interactions using the function `importance` with the parameter `option = "interactions"`. More details are in the next section. 92 | 93 | ```{r, warning=FALSE, message=FALSE} 94 | interactions<-interactions(xgb_model, sparse_matrix, option = "interactions") 95 | head(interactions) 96 | plot(interactions) 97 | ``` 98 | 99 | 100 | ## Variables' and interactions’ importance 101 | 102 | For exploring variables’ and interactions’ importance the `EIX` package provides the function `importance` and its `plot` method, used with the parameter `radar = TRUE` or `radar = FALSE`. 103 | With the `EIX` package we can compare the importance of single variables and interactions. The function `importance` can return three kinds of output, depending on the `option` parameter: 104 | 105 | * `option = "variables"` - only single variables 106 | 107 | * `option = "interactions"` - only interactions 108 | 109 | * `option = "both"` - both single variables and interactions. 110 | 111 | NOTE: `option = "both"` is not a simple combination of `option = "variables"` and `option = "interactions"`, because the importance values of nodes that were counted as interactions are not included in the importance of the single variables.
112 | 113 | In `EIX` the following measures are available: 114 | 115 | * **sumGain** - sum of Gain values in all nodes in which the given variable occurs 116 | * **sumCover** - sum of Cover values in all nodes in which the given variable occurs; for LightGBM models: number of observations that pass through the node 117 | * **mean5Gain** - mean gain of the 5 occurrences of the given variable with the highest gain 118 | * **meanGain** - mean Gain value in all nodes in which the given variable occurs 119 | * **meanCover** - mean Cover value in all nodes in which the given variable occurs; for LightGBM models: mean number of observations that pass through the node 120 | * **frequency** - number of occurrences of the given variable in the nodes 121 | 122 | The `EIX` package additionally gives measures of importance for single variables: 123 | 124 | * **numberOfRoots** - number of occurrences in the root 125 | * **weightedRoot** - number of occurrences in the root, weighted by gain 126 | * **meanDepth** - mean depth weighted by gain 127 | 128 | The function `importance` returns a table with all available importance measures for the given option. 129 | The table is sorted by descending value of **sumGain**. 130 | 131 | The function `plot` with the parameter `radar = FALSE` and a result from the `importance` function as an argument shows two measures of importance, which can be chosen with the `xmeasure` and `ymeasure` parameters. With the parameter `top` we can decide how many positions will be included in the plot. 132 | 133 | ```{r} 134 | importance<-importance(xgb_model, sparse_matrix, option = "both") 135 | head(importance) 136 | plot(importance, radar=FALSE) 137 | #plot(importance, xmeasure = "mean5Gain", ymeasure = "sumGain", top = 15, radar=FALSE) 138 | ``` 139 | 140 | The function `plot` with the parameter `radar = TRUE` makes it possible to compare different measures of variable and interaction importance on a radar plot from the `ggiraphExtra` package.
141 | Below is an example of the radar plot. On the outside of the circle there are names of variables or interactions. Colored lines represent various measures of importance. The positions on the plot are sorted in decreasing order. The variable with the highest **sumGain** value is on the right of 12 o'clock. Then the **sumGain** value decreases in a clockwise direction. On the plot it is possible to change the place where the feature names start with the parameter `text_start_point` (range from 0 to 1, default 0.5), and the size of this text with the parameter `text_size`. 142 | 143 | ```{r} 144 | plot(importance) 145 | #plot(importance, text_start_point = 0.3) 146 | #plot(importance, text_size = 4) 147 | #plot(importance, top=15) 148 | ``` 149 | 150 | ## Explanation of single prediction including interactions 151 | 152 | For explaining a single prediction the package `EIX` uses two packages: `xgboostExplainer` and `breakDown`. The package `xgboostExplainer` is a tool for interpreting predictions of an xgboost model. The package `EIX` uses its code and modifies it to include interactions. The methodology of plot creation comes from the package `breakDown`. 153 | 154 | The function `waterfall` returns a table with the variables’ impact on the prediction of the model. Depending on the parameter `option`, the table includes interactions (`option = "interactions"` - default) or does not (`option = "variables"`). The function `plot` with a `waterfall` object as an argument visualizes this table. On the y-axis there are: the intercept (the probability that a random observation from the training dataset will be 1), the variables (which have an impact on the prediction) and the final prediction of the model. On the x-axis there is the impact of each variable in log-odds.
155 | 156 | ```{r} 157 | data <- HR_data[9,] 158 | new_observation <- sparse_matrix[9,] 159 | wf<-waterfall(xgb_model, new_observation, data, option = "interactions") 160 | wf 161 | plot(wf) 162 | #wf<-waterfall(xgb_model, new_observation, data, option = "interactions", baseline = "intercept") 163 | #wf 164 | #plot(wf) 165 | ``` 166 | -------------------------------------------------------------------------------- /docs/reference/plot.lollipop.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Visualiation of the model — plot.lollipop • EIX 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 33 | 34 | 35 | 36 | 37 | 38 |
    39 |
    40 | 92 | 93 | 94 |
    95 | 96 |
    97 |
    98 | 101 | 102 | 103 |

    The lollipop plots the model with the most important interactions and variables in the roots.

    104 | 105 | 106 |
    # S3 method for lollipop
    107 | plot(x, ..., labels = "topAll", log_scale = TRUE,
    108 |   threshold = 0.1)
    109 | 110 |

    Arguments

    111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 136 | 137 |
    x

    a result from the lollipop function.

    ...

    other parameters.

    labels

    if "topAll" then labels for the most important interactions (vertical label) 124 | and variables in the roots (horizontal label) will be displayed, 125 | if "interactions" then labels for all interactions, 126 | if "roots" then labels for all variables in the root.

    log_scale

    TRUE/FALSE logarithmic scale on the plot. Default TRUE.

    threshold

    only labels with Gain higher than `threshold` of the max Gain value in the model will appear on the plot. 135 | The lower the threshold, the more labels on the plot. Range from 0 to 1. Default 0.1.

    138 | 139 |

    Value

    140 | 141 |

    a ggplot object

    142 | 143 | 144 |

    Examples

    145 |
    library("EIX") 146 | library("Matrix") 147 | sm <- sparse.model.matrix(left ~ . - 1, data = HR_data) 148 | 149 | library("xgboost") 150 | param <- list(objective = "binary:logistic", max_depth = 2) 151 | xgb_model <- xgboost(sm, params = param, label = HR_data[, left] == 1, nrounds = 25, verbose = 0) 152 | 153 | lolli <- lollipop(xgb_model, sm) 154 | plot(lolli, labels = "topAll", log_scale = TRUE)
    155 |
    library(lightgbm) 156 | train_data <- lgb.Dataset(sm, label = HR_data[, left] == 1) 157 | params <- list(objective = "binary", max_depth = 3) 158 | lgb_model <- lgb.train(params, train_data, 25) 159 | 160 | lolli <- lollipop(lgb_model, sm) 161 | plot(lolli, labels = "topAll", log_scale = TRUE)
    162 |
    163 |
    164 | 175 |
    176 | 177 |
    178 | 181 | 182 |
    183 |

    Site built with pkgdown.

    184 |
    185 | 186 |
    187 |
    188 | 189 | 190 | 191 | 192 | -------------------------------------------------------------------------------- /docs/reference/waterfall.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Explain prediction of a single observation — waterfall • EIX 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 33 | 34 | 35 | 36 | 37 | 38 |
    39 |
    40 | 92 | 93 | 94 |
    95 | 96 |
    97 |
    98 | 101 | 102 | 103 |

    This function calculates a table with influence of variables and interactions 104 | on the prediction of a given observation. It supports only xgboost models.

    105 | 106 | 107 |
    waterfall(xgb_model, new_observation, data, type = "binary",
    108 |   option = "interactions", baseline = 0)
    109 | 110 |

    Arguments

    111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 136 | 137 | 138 | 139 | 142 | 143 |
    xgb_model

    a xgboost model.

    new_observation

    a new observation.

    data

    row from the original dataset with the new observation to explain (not one-hot-encoded). 124 | The param above has to be set to merge categorical features. 125 | If you don't want to merge categorical features, set this parameter to the same value as new_observation.

    type

    the learning task of the model. Available tasks: "binary" for binary classification or "regression" for linear regression.

    option

    if "variables", the plot includes only single variables, 134 | if "interactions", then interactions are included as well. 135 | Default "interactions".

    baseline

    a number or a character "Intercept" (for model intercept). 140 | The baseline for the plot, where the rectangles should start. 141 | Default 0.

    144 | 145 |

    Value

    146 | 147 |

    an object of the broken class

    148 | 149 |

    Details

    150 | 151 |

    The function contains code or pieces of code 152 | from breakDown code created by Przemysław Biecek 153 | and xgboostExplainer code created by David Foster.

    154 | 155 | 156 |

    Examples

    157 |
    158 |
    library("EIX") 159 | library("Matrix") 160 | sm <- sparse.model.matrix(left ~ . - 1, data = HR_data) 161 | 162 | library("xgboost") 163 | param <- list(objective = "binary:logistic", max_depth = 2) 164 | xgb_model <- xgboost(sm, params = param, label = HR_data[, left] == 1, nrounds = 25, verbose=0) 165 | 166 | data <- HR_data[9,-7] 167 | new_observation <- sm[9,] 168 | 169 | wf <- waterfall(xgb_model, new_observation, data, option = "interactions") 170 | wf
    #> contribution 171 | #> xgboost: intercept -1.492 172 | #> xgboost: time_spend_company = 5 1.360 173 | #> xgboost: last_evaluation = 1 1.093 174 | #> xgboost: Work_accident = 0 -0.423 175 | #> xgboost: satisfaction_level = 0.89 -0.390 176 | #> xgboost: last_evaluation:time_spend_company = 1:5 0.297 177 | #> xgboost: last_evaluation:average_montly_hours = 1:224 0.227 178 | #> xgboost: satisfaction_level:time_spend_company = 0.89:5 0.223 179 | #> xgboost: number_project = 5 -0.211 180 | #> xgboost: average_montly_hours:last_evaluation = 224:1 -0.156 181 | #> xgboost: average_montly_hours = 224 -0.096 182 | #> xgboost: time_spend_company:last_evaluation = 5:1 0.095 183 | #> xgboost: salary = 2 0.074 184 | #> xgboost: satisfaction_level:number_project = 0.89:5 -0.003 185 | #> xgboost: prediction 0.597
    186 | plot(wf)
    187 |
    188 |
    189 | 202 |
    203 | 204 | 214 |
    215 | 216 | 217 | 218 | 219 | --------------------------------------------------------------------------------