├── style.css
├── img
│   ├── rmse.jpg
│   ├── DALEX_intro.png
│   ├── ml_overview.png
│   └── cross_validation.png
├── setup.R
├── .gitignore
├── README.md
├── LICENSE
└── tutorial.Rmd
/style.css: -------------------------------------------------------------------------------- 1 | h1, .h1, h2, .h2, h3, .h3 { 2 | margin-top: 84px; 3 | } 4 | -------------------------------------------------------------------------------- /img/rmse.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/woobe/useR2019_h2o_tutorial/HEAD/img/rmse.jpg -------------------------------------------------------------------------------- /img/DALEX_intro.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/woobe/useR2019_h2o_tutorial/HEAD/img/DALEX_intro.png -------------------------------------------------------------------------------- /img/ml_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/woobe/useR2019_h2o_tutorial/HEAD/img/ml_overview.png -------------------------------------------------------------------------------- /img/cross_validation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/woobe/useR2019_h2o_tutorial/HEAD/img/cross_validation.png -------------------------------------------------------------------------------- /setup.R: -------------------------------------------------------------------------------- 1 | # Install R packages for this tutorial 2 | 3 | pkgs <- c("h2o", "DALEX", "breakDown", "pdp", 4 | "knitr", "rmdformats", "DT", "xgboost", "mlbench") 5 | for (pkg in pkgs) { 6 | if (! (pkg %in% rownames(installed.packages()))) { install.packages(pkg) } 7 | } 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | .Rapp.history 4 | 5 | # Session Data files 6 | .RData 7 | 8 | # Example code in package build process 9 | *-Ex.R 10 | 11 | # Output files from R CMD build 12 | /*.tar.gz 13 | 14 | # Output files from R CMD check 15 | /*.Rcheck/ 16 | 17 | # RStudio files 18 | .Rproj.user/ 19 | 20 | # produced vignettes 21 | vignettes/*.html 22 | vignettes/*.pdf 23 | 24 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 25 | .httr-oauth 26 | 27 | # knitr and R markdown default cache directories 28 | /*_cache/ 29 | /cache/ 30 | 31 | # Temporary files created by R markdown 32 | *.utf8.md 33 | *.knit.md 34 | 35 | # Shiny token, see https://shiny.rstudio.com/articles/shinyapps.html 36 | rsconnect/ 37 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # useR! 
2019 Tutorial: Automatic and Explainable Machine Learning with H2O in R 2 | 3 | http://www.user2019.fr/tutorials/ 4 | 5 | ## Key Files 6 | 7 | - `setup.R`: install packages required 8 | - `tutorial.Rmd`: the main RMarkdown file with code 9 | - `tutorial.html`: rendered RMarkdown result [view](https://nbviewer.jupyter.org/github/woobe/useR2019_h2o_tutorial/blob/master/tutorial.html) 10 | - `Introduction Slides`: [Google Drive Link](https://drive.google.com/file/d/1evXrshE4GDZT-z0c_LTYEm9Dkl4mRKn-/view?usp=sharing) 11 | 12 | 13 | ## Additional Info 14 | 15 | - H2O-3 user guide http://docs.h2o.ai/h2o/latest-stable/h2o-docs/index.html 16 | - H2O-3 tutorials (https://github.com/h2oai/h2o-tutorials) 17 | - Java debugging tips (https://twitter.com/ledell/status/1148512123083010048) 18 | - H2O-3 XGBoost on Windows (https://stackoverflow.com/questions/49752125/xgboost-h2o-error-on-windows-os) 19 | - About target encoding http://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-munging/target-encoding.html 20 | 21 | ## Thank you for coming to my workshop. Remember to get your hands dirty! 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Jo-fai Chow 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /tutorial.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "useR! 
2019 H2O Tutorial (bit.ly/useR2019_h2o_tutorial)" 3 | date: "`r Sys.Date()`" 4 | output: 5 | rmdformats::readthedown: 6 | number_sections: yes 7 | fig_height: 10 8 | fig_width: 14 9 | highlight: kate 10 | toc_depth: 3 11 | css: style.css 12 | 13 | --- 14 | 15 | 16 | ```{r knitr_init, echo=FALSE, cache=FALSE} 17 | library(knitr) 18 | library(rmdformats) 19 | library(DT) 20 | 21 | ## Global options 22 | options(max.print="75") 23 | opts_chunk$set(echo=TRUE, 24 | cache=FALSE, 25 | prompt=FALSE, 26 | tidy=TRUE, 27 | comment=NA, 28 | message=FALSE, 29 | warning=FALSE) 30 | opts_knit$set(width=75) 31 | ``` 32 | 33 | # Agenda 34 | 35 | - 09:00 to 09:30 Set Up & Introduction 36 | - 09:30 to 10:30 Regression Example 37 | - 10:30 to 11:00 Coffee Break 38 | - 11:00 to 11:30 Classification Example 39 | - 11:30 to 12:30 Bring Your Own Data + Q&A 40 | 41 | 42 | # Set Up 43 | 44 | ## Download -> bit.ly/useR2019_h2o_tutorial 45 | 46 | - `setup.R`: install packages required 47 | - `tutorial.Rmd`: the main RMarkdown file with code 48 | - `tutorial.html`: this webpage 49 | - Full URL https://github.com/woobe/useR2019_h2o_tutorial (if `bit.ly` doesn't work) 50 | 51 | 52 | ## R Packages 53 | 54 | - Check out `setup.R` 55 | - For this tutorial: 56 | - `h2o` for machine learning 57 | - `mlbench` for Boston Housing dataset 58 | - `DALEX`, `breakDown` & `pdp` for explaining model predictions 59 | - For RMarkdown 60 | - `knitr` for rendering this RMarkdown 61 | - `rmdformats` for `readthedown` RMarkdown template 62 | - `DT` for nice tables 63 | 64 | 65 | # Introduction 66 | 67 | General Data Protection Regulation (GDPR) is now in place. Are you ready to explain your models? This is a hands-on tutorial for R beginners. I will demonstrate the use of H2O and other R packages for automatic and interpretable machine learning. Participants will be able to follow and build regression and classification models quickly with H2O's AutoML. They will then be able to explain the model outcomes with various methods. 68 | 69 | It is a workshop for R beginners and anyone interested in machine learning. RMarkdown and the rendered HTML will be provided so everyone can follow without running the code. 70 | 71 | (Now go to slides ...) 72 | 73 | 74 | # Regression Part One: H2O AutoML 75 | 76 | ```{r, message=FALSE} 77 | # Let's go 78 | library(h2o) # for H2O Machine Learning 79 | library(mlbench) # for Datasets 80 | ``` 81 | 82 | ```{r} 83 | # Enter your lucky seed here ... 84 | n_seed <- 12345 85 | ``` 86 | 87 | ## Data - Boston Housing from `mlbench` 88 | 89 | ```{r} 90 | data("BostonHousing") 91 | datatable(head(BostonHousing), 92 | rownames = FALSE, options = list(pageLength = 6, scrollX = TRUE)) 93 | ``` 94 | 95 | **Source**: UCI Machine Learning Repository [Link](https://archive.ics.uci.edu/ml/machine-learning-databases/housing/) 96 | 97 | - **crim**: per capita crime rate by town. 98 | - **zn**: proportion of residential land zoned for lots over 25,000 sq.ft. 99 | - **indus**: proportion of non-retail business acres per town. 100 | - **chas**: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise). 101 | - **nox**: nitrogen oxides concentration (parts per 10 million). 102 | - **rm**: average number of rooms per dwelling. 103 | - **age**: proportion of owner-occupied units built prior to 1940. 104 | - **dis**: weighted mean of distances to five Boston employment centres. 105 | - **rad**: index of accessibility to radial highways. 106 | - **tax**: full-value property-tax rate per $10,000. 
107 | - **ptratio**: pupil-teacher ratio by town. 108 | - **b**: 1000(Bk - 0.63)^2 where Bk is the proportion of people of African American descent by town. 109 | - **lstat**: lower status of the population (percent). 110 | - **medv** (This is the **TARGET**): median value of owner-occupied homes in $1000s. 111 | 112 | 113 | 114 | ## Define Target and Features 115 | 116 | ```{r} 117 | target <- "medv" # Median House Value 118 | features <- setdiff(colnames(BostonHousing), target) 119 | print(features) 120 | ``` 121 | 122 |
123 | ![ml_overview](img/ml_overview.png) 124 |
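Before starting H2O, it can help to glance at the scale of the target, since the RMSE values reported later are in the same units ($1000s). A quick check (`BostonHousing` is already loaded above):

```{r}
# Distribution of the target: medv is in $1000s
summary(BostonHousing$medv)
```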
125 | 126 | ## Start a local H2O Cluster (JVM) 127 | 128 | ```{r} 129 | h2o.init() 130 | ``` 131 | 132 | ```{r} 133 | h2o.no_progress() # disable progress bar for RMarkdown 134 | h2o.removeAll() # Optional: remove anything from previous session 135 | ``` 136 | 137 | 138 | 139 | ## Convert R dataframe into H2O dataframe 140 | 141 | ```{r} 142 | # H2O dataframe 143 | h_boston <- as.h2o(BostonHousing) 144 | ``` 145 | 146 | 147 | 148 | ## Split Data into Train/Test 149 | 150 | ```{r} 151 | h_split <- h2o.splitFrame(h_boston, ratios = 0.8, seed = n_seed) 152 | h_train <- h_split[[1]] # 80% for modelling 153 | h_test <- h_split[[2]] # 20% for evaluation 154 | ``` 155 | 156 | ```{r} 157 | dim(h_train) 158 | dim(h_test) 159 | ``` 160 | 161 | ## Cross-Validation 162 | 163 |
164 | ![CV](img/cross_validation.png) 165 |
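All the baseline models below are trained with `nfolds = 5`, i.e. 5-fold cross-validation: the training data is split into five folds, and each fold is predicted by a model trained on the other four, giving an estimate of out-of-sample error without touching `h_test`. Once a model is trained with `nfolds`, its cross-validation results can be inspected directly. A minimal sketch, using the `model_glm` object created in the next section (not run here):

```{r, eval=FALSE}
# Not run: inspect cross-validation results of a trained model
model_glm@model$cross_validation_metrics_summary  # per-fold and aggregated CV metrics
h2o.cross_validation_models(model_glm)            # the five fold models themselves
```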
166 | 167 | 168 | ## Baseline Models 169 | 170 | - `h2o.glm()`: H2O Generalized Linear Model 171 | - `h2o.randomForest()`: H2O Random Forest Model 172 | - `h2o.gbm()`: H2O Gradient Boosting Model 173 | - `h2o.deeplearning()`: H2O Deep Neural Network Model 174 | - `h2o.xgboost()`: H2O wrapper for eXtreme Gradient Boosting Model from DMLC 175 | 176 | ### Baseline Generalized Linear Model (GLM) 177 | 178 | ```{r} 179 | model_glm <- h2o.glm(x = features, # All 13 features 180 | y = target, # medv (median value of owner-occupied homes in $1000s) 181 | training_frame = h_train, # H2O dataframe with training data 182 | model_id = "baseline_glm", # Give the model a name 183 | nfolds = 5, # Using 5-fold CV 184 | seed = n_seed) # Your lucky seed 185 | ``` 186 | 187 | ```{r} 188 | # Cross-Validation 189 | model_glm@model$cross_validation_metrics 190 | ``` 191 | 192 | 193 | ```{r} 194 | # Evaluate performance on test 195 | h2o.performance(model_glm, newdata = h_test) 196 | ``` 197 | 198 | Let's use RMSE 199 | 200 |
201 | ![RMSE](img/rmse.jpg) 202 |
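For reference, RMSE is the square root of the average squared difference between the observed values $y_i$ and the predictions $\hat{y}_i$:

$$\mathrm{RMSE} = \sqrt{\frac{1}{n} \sum_{i=1}^{n} \left( y_i - \hat{y}_i \right)^2}$$

Since `medv` is measured in $1000s, an RMSE of 3 means predictions are off by roughly $3,000 on average.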
203 | 204 | 205 | 206 | ### Build Other Baseline Models (DRF, GBM, DNN & XGB) 207 | 208 | ```{r} 209 | # Baseline Distributed Random Forest (DRF) 210 | model_drf <- h2o.randomForest(x = features, 211 | y = target, 212 | training_frame = h_train, 213 | model_id = "baseline_drf", 214 | nfolds = 5, 215 | seed = n_seed) 216 | ``` 217 | 218 | ```{r} 219 | # Baseline Gradient Boosting Model (GBM) 220 | model_gbm <- h2o.gbm(x = features, 221 | y = target, 222 | training_frame = h_train, 223 | model_id = "baseline_gbm", 224 | nfolds = 5, 225 | seed = n_seed) 226 | ``` 227 | 228 | ```{r} 229 | # Baseline Deep Neural Network (DNN) 230 | # By default, DNN is not reproducible with multi-core. You may get slightly different results here. 231 | # You can enable the `reproducible` option but it will run on a single core (very slow). 232 | model_dnn <- h2o.deeplearning(x = features, 233 | y = target, 234 | training_frame = h_train, 235 | model_id = "baseline_dnn", 236 | nfolds = 5, 237 | seed = n_seed) 238 | ``` 239 | 240 | ```{r} 241 | # Baseline eXtreme Gradient Boosting Model (XGBoost) 242 | model_xgb <- h2o.xgboost(x = features, 243 | y = target, 244 | training_frame = h_train, 245 | model_id = "baseline_xgb", 246 | nfolds = 5, 247 | seed = n_seed) 248 | ``` 249 | 250 | ### Comparison (RMSE: Lower = Better) 251 | 252 | ```{r} 253 | # Create a table to compare RMSE from different models 254 | d_eval <- data.frame(model = c("H2O GLM: Generalized Linear Model (Baseline)", 255 | "H2O DRF: Distributed Random Forest (Baseline)", 256 | "H2O GBM: Gradient Boosting Model (Baseline)", 257 | "H2O DNN: Deep Neural Network (Baseline)", 258 | "XGBoost: eXtreme Gradient Boosting Model (Baseline)"), 259 | stringsAsFactors = FALSE) 260 | d_eval$RMSE_cv <- NA 261 | d_eval$RMSE_test <- NA 262 | ``` 263 | 264 | ```{r} 265 | # Store RMSE values 266 | d_eval[1, ]$RMSE_cv <- model_glm@model$cross_validation_metrics@metrics$RMSE 267 | d_eval[2, ]$RMSE_cv <- model_drf@model$cross_validation_metrics@metrics$RMSE 268 | d_eval[3, ]$RMSE_cv <- model_gbm@model$cross_validation_metrics@metrics$RMSE 269 | d_eval[4, ]$RMSE_cv <- model_dnn@model$cross_validation_metrics@metrics$RMSE 270 | d_eval[5, ]$RMSE_cv <- model_xgb@model$cross_validation_metrics@metrics$RMSE 271 | 272 | d_eval[1, ]$RMSE_test <- h2o.rmse(h2o.performance(model_glm, newdata = h_test)) 273 | d_eval[2, ]$RMSE_test <- h2o.rmse(h2o.performance(model_drf, newdata = h_test)) 274 | d_eval[3, ]$RMSE_test <- h2o.rmse(h2o.performance(model_gbm, newdata = h_test)) 275 | d_eval[4, ]$RMSE_test <- h2o.rmse(h2o.performance(model_dnn, newdata = h_test)) 276 | d_eval[5, ]$RMSE_test <- h2o.rmse(h2o.performance(model_xgb, newdata = h_test)) 277 | ``` 278 | 279 | ```{r} 280 | # Show Comparison (RMSE: Lower = Better) 281 | datatable(d_eval, rownames = FALSE, options = list(pageLength = 10, scrollX = TRUE)) %>% 282 | formatRound(columns = -1, digits = 4) 283 | ``` 284 | 285 | 286 | 287 | ## Manual Tuning 288 | 289 | ### Check out the hyper-parameters for each algo 290 | 291 | ```{r, eval=FALSE} 292 | ?h2o.glm 293 | ?h2o.randomForest 294 | ?h2o.gbm 295 | ?h2o.deeplearning 296 | ?h2o.xgboost 297 | ``` 298 | 299 | ### Train an XGBoost model with manual settings 300 | 301 | ```{r} 302 | model_xgb_m <- h2o.xgboost(x = features, 303 | y = target, 304 | training_frame = h_train, 305 | model_id = "model_xgb_m", 306 | nfolds = 5, 307 | seed = n_seed, 308 | # Manual Settings based on experience 309 | learn_rate = 0.1, # use a lower rate (more conservative) 310 | ntrees = 100, # use more trees (due to lower learn_rate) 311 | sample_rate = 0.9, # use random n% of samples for each tree 312 | col_sample_rate = 0.9) # use random n% of features for each tree 313 | ``` 314 | 315 | ### Comparison (RMSE: Lower = Better) 316 | 317 | ```{r} 318 | d_eval_tmp <- data.frame(model = "XGBoost: eXtreme Gradient Boosting Model (Manual Settings)", 319 | RMSE_cv = model_xgb_m@model$cross_validation_metrics@metrics$RMSE, 320 | RMSE_test = h2o.rmse(h2o.performance(model_xgb_m, newdata = h_test))) 321 | d_eval <- rbind(d_eval, d_eval_tmp) 322 | 323 | datatable(d_eval, rownames = FALSE, options = list(pageLength = 10, scrollX = TRUE)) %>% 324 | formatRound(columns = -1, digits = 4) 325 | ``` 326 | 327 | 328 | 329 | ## H2O AutoML 330 | 331 | ```{r} 332 | # Run AutoML (try n different models) 333 | # Check out all options using ?h2o.automl 334 | automl <- h2o.automl(x = features, 335 | y = target, 336 | training_frame = h_train, 337 | nfolds = 5, # 5-fold Cross-Validation 338 | max_models = 20, # Max number of models 339 | stopping_metric = "RMSE", # Metric to optimize 340 | project_name = "automl_boston", # Specify a name so you can add more models later 341 | seed = n_seed) 342 | ``` 343 | 344 | ### Leaderboard 345 | 346 | ```{r} 347 | datatable(as.data.frame(automl@leaderboard), 348 | rownames = FALSE, options = list(pageLength = 10, scrollX = TRUE)) %>% 349 | formatRound(columns = -1, digits = 4) 350 | ``` 351 | 352 | ### Best Model (Leader) 353 | 354 | ```{r} 355 | automl@leader 356 | ``` 357 | 358 | ### Comparison (RMSE: Lower = Better) 359 | 360 | ```{r} 361 | d_eval_tmp <- data.frame(model = "Best Model from H2O AutoML", 362 | RMSE_cv = automl@leader@model$cross_validation_metrics@metrics$RMSE, 363 | RMSE_test = h2o.rmse(h2o.performance(automl@leader, newdata = h_test))) 364 | d_eval <- rbind(d_eval, d_eval_tmp) 365 | 366 | datatable(d_eval, rownames = FALSE, options = list(pageLength = 10, scrollX = TRUE)) %>% 367 | formatRound(columns = -1, digits = 4) 368 | ``` 369 | 370 | 371 | 372 | ## Make Predictions 373 | 374 | ```{r} 375 | yhat_test <- h2o.predict(automl@leader, newdata = h_test) 376 | head(yhat_test) 377 | ``` 378 | 379 | 380 | 381 | # Regression Part Two: XAI 382 | 383 | Let's look at the first house in `h_test`. 384 | 385 | ```{r} 386 | datatable(as.data.frame(h_test[1, ]), 387 | rownames = FALSE, options = list(pageLength = 10, scrollX = TRUE)) 388 | ``` 389 | 390 | 391 | ## Using functions in `h2o` 392 | 393 | - `h2o.varimp()` & `h2o.varimp_plot()`: Variable Importance (for GBM, DNN, GLM) 394 | - `h2o.partialPlot()`: Partial Dependence Plots 395 | - `h2o.predict_contributions()`: SHAP values (for GBM and XGBoost only) 396 | 397 | ```{r, eval=FALSE} 398 | # Look at the impact of feature `rm` (no. of rooms) 399 | # Not Run 400 | h2o.partialPlot(model_glm, data = h_test, cols = c("rm")) 401 | h2o.partialPlot(model_drf, data = h_test, cols = c("rm")) 402 | h2o.partialPlot(model_gbm, data = h_test, cols = c("rm")) 403 | h2o.partialPlot(model_dnn, data = h_test, cols = c("rm")) 404 | h2o.partialPlot(model_xgb, data = h_test, cols = c("rm")) 405 | h2o.partialPlot(automl@leader, data = h_test, cols = c("rm")) 406 | ``` 407 | 408 | 409 | ## Package `DALEX` 410 | 411 | ```{r} 412 | # Descriptive mAchine Learning EXplanations (DALEX) 413 | library(DALEX) 414 | ``` 415 | 416 | - Website: https://pbiecek.github.io/DALEX/ 417 | - Original DALEX-H2O Example: https://raw.githack.com/pbiecek/DALEX_docs/master/vignettes/DALEX_h2o.html 418 | 419 | 
420 | ![DALEX](img/DALEX_intro.png) 421 |
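The variable importance method used later in this section is model-agnostic and based on permutations: shuffle one feature, re-score the model, and see how much the loss grows. A minimal hand-rolled sketch of the idea for a single feature (not run; uses the AutoML leader and `h_test` from part one):

```{r, eval=FALSE}
# Not run: permutation importance for `rm`, by hand
rmse <- function(y, yhat) sqrt(mean((y - yhat)^2))
d <- as.data.frame(h_test)
pred <- function(d_in) as.data.frame(h2o.predict(automl@leader, as.h2o(d_in)))$predict
loss_original <- rmse(d$medv, pred(d))
d_perm <- d
d_perm$rm <- sample(d_perm$rm) # shuffle `rm` to break its link with the target
loss_permuted <- rmse(d$medv, pred(d_perm))
loss_permuted - loss_original  # compare with variable_importance(type = "difference") below
```

DALEX automates this kind of loop over all features and models.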
422 | 423 | ### The `explain()` Function 424 | 425 | The first step of using the `DALEX` package is to wrap the black-box model with meta-data that unifies model interfacing. 426 | 427 | To create an explainer, we use the `explain()` function. The validation dataset for the models is `h_test` from part one. For models created by the `h2o` package, we have to provide a custom predict function which takes two arguments, `model` and `newdata`, and returns a numeric vector of predictions. 428 | 429 | ```{r} 430 | # Custom Predict Function 431 | custom_predict <- function(model, newdata) { 432 | newdata_h2o <- as.h2o(newdata) 433 | res <- as.data.frame(h2o.predict(model, newdata_h2o)) 434 | return(as.numeric(res$predict)) 435 | } 436 | ``` 437 | 438 | ### Explainer for H2O Models 439 | 440 | ```{r} 441 | explainer_drf <- DALEX::explain(model = model_drf, 442 | data = as.data.frame(h_test)[, features], 443 | y = as.data.frame(h_test)[, target], 444 | predict_function = custom_predict, 445 | label = "Random Forest") 446 | 447 | explainer_dnn <- DALEX::explain(model = model_dnn, 448 | data = as.data.frame(h_test)[, features], 449 | y = as.data.frame(h_test)[, target], 450 | predict_function = custom_predict, 451 | label = "Deep Neural Networks") 452 | 453 | explainer_xgb <- DALEX::explain(model = model_xgb, 454 | data = as.data.frame(h_test)[, features], 455 | y = as.data.frame(h_test)[, target], 456 | predict_function = custom_predict, 457 | label = "XGBoost") 458 | 459 | explainer_automl <- DALEX::explain(model = automl@leader, 460 | data = as.data.frame(h_test)[, features], 461 | y = as.data.frame(h_test)[, target], 462 | predict_function = custom_predict, 463 | label = "H2O AutoML") 464 | ``` 465 | 466 | ### Variable importance 467 | 468 | Using the DALEX package, we are able to better understand which variables are important. 469 | 470 | Model-agnostic variable importance is calculated by means of permutations: we permute the values of a single variable in the validation dataset, recompute the loss function, and subtract the loss calculated on the original (unpermuted) validation dataset. The larger the difference, the more the model relies on that variable. 471 | 472 | This method is implemented in the `variable_importance()` function. 473 | 474 | ```{r} 475 | vi_drf <- variable_importance(explainer_drf, type="difference") 476 | vi_dnn <- variable_importance(explainer_dnn, type="difference") 477 | vi_xgb <- variable_importance(explainer_xgb, type="difference") 478 | vi_automl <- variable_importance(explainer_automl, type="difference") 479 | ``` 480 | 481 | ```{r} 482 | plot(vi_drf, vi_dnn, vi_xgb, vi_automl) 483 | ``` 484 | 485 | 486 | ### Partial Dependence Plots 487 | 488 | Partial Dependence Plots (PDP) are one of the most popular methods for exploring the relation between a continuous variable and the model outcome. The `variable_response()` function with the parameter `type = "pdp"` calls `pdp::partial()` to calculate the PDP response. 489 | 490 | Let's look at feature `rm` (no. of rooms). 491 | 492 | ```{r} 493 | pdp_drf_rm <- variable_response(explainer_drf, variable = "rm") 494 | pdp_dnn_rm <- variable_response(explainer_dnn, variable = "rm") 495 | pdp_xgb_rm <- variable_response(explainer_xgb, variable = "rm") 496 | pdp_automl_rm <- variable_response(explainer_automl, variable = "rm") 497 | plot(pdp_drf_rm, pdp_dnn_rm, pdp_xgb_rm, pdp_automl_rm) 498 | ``` 499 | 500 | 501 | ### Prediction Understanding 502 | 503 | ```{r} 504 | # Predictions from different models 505 | yhat <- data.frame(model = c("H2O DRF: Distributed Random Forest (Baseline)", 506 | "H2O DNN: Deep Neural Network (Baseline)", 507 | "XGBoost: eXtreme Gradient Boosting Model (Baseline)", 508 | "Best Model from H2O AutoML")) 509 | yhat$prediction <- NA 510 | yhat[1,]$prediction <- as.matrix(h2o.predict(model_drf, h_test[1,])) 511 | yhat[2,]$prediction <- as.matrix(h2o.predict(model_dnn, h_test[1,])) 512 | yhat[3,]$prediction <- as.matrix(h2o.predict(model_xgb, h_test[1,])) 513 | yhat[4,]$prediction <- as.matrix(h2o.predict(automl@leader, h_test[1,])) 514 | 515 | # Show the predictions 516 | datatable(yhat, rownames = FALSE, options = list(pageLength = 10, scrollX = TRUE)) %>% 517 | formatRound(columns = -1, digits = 3) 518 | ``` 519 | 520 | The function `prediction_breakdown()` is a wrapper around the `breakDown` package. The model prediction is visualized with Break Down Plots, which show the contribution of every variable present in the model. `prediction_breakdown()` generates variable attributions for a selected prediction, and the generic `plot()` function shows these attributions. 521 | 522 | ```{r} 523 | library(breakDown) 524 | obs <- as.data.frame(h_test)[1, ] # Use the first observation from h_test (avoid `sample` as a name: it masks base::sample) 525 | pb_drf <- prediction_breakdown(explainer_drf, observation = obs) 526 | pb_dnn <- prediction_breakdown(explainer_dnn, observation = obs) 527 | pb_xgb <- prediction_breakdown(explainer_xgb, observation = obs) 528 | pb_automl <- prediction_breakdown(explainer_automl, observation = obs) 529 | ``` 530 | 531 | 532 | ```{r} 533 | plot(pb_drf) 534 | plot(pb_dnn) 535 | plot(pb_xgb) 536 | plot(pb_automl) 537 | ``` 538 | 539 | 540 | 541 | # Coffee Break 10:30 - 11:00 542 | 543 | 
544 | ![coffee_break](https://media.giphy.com/media/U82ik0q8lJly8/giphy.gif) 545 |
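After the break we switch from regression to classification, and from RMSE to logloss, which heavily penalizes confident but wrong probability estimates. For binary labels $y_i \in \{0, 1\}$ and predicted probabilities $p_i$:

$$\mathrm{logloss} = -\frac{1}{n} \sum_{i=1}^{n} \left[ y_i \log(p_i) + (1 - y_i) \log(1 - p_i) \right]$$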
546 | 547 | 548 | 549 | 550 | # Classification Part One: H2O AutoML 551 | 552 | ```{r, message=FALSE} 553 | # Let's go 554 | library(h2o) # for H2O Machine Learning 555 | library(mlbench) # for Datasets 556 | ``` 557 | 558 | ```{r} 559 | # Enter your lucky seed here ... 560 | n_seed <- 12345 561 | ``` 562 | 563 | ## Data - Pima Indians Diabetes from `mlbench` 564 | 565 | ```{r} 566 | data("PimaIndiansDiabetes") 567 | datatable(head(PimaIndiansDiabetes), 568 | rownames = FALSE, options = list(pageLength = 6, scrollX = TRUE)) 569 | ``` 570 | 571 | 572 | ## Data Prep 573 | 574 | ```{r} 575 | # Convert pos and neg to 1 and 0 576 | d_new <- PimaIndiansDiabetes[, -ncol(PimaIndiansDiabetes)] 577 | d_new$diabetes <- 0 578 | d_new[which(PimaIndiansDiabetes$diabetes == "pos"), ]$diabetes <- 1 579 | PimaIndiansDiabetes <- d_new 580 | rm(d_new) 581 | ``` 582 | 583 | ```{r} 584 | target <- "diabetes" 585 | features <- setdiff(colnames(PimaIndiansDiabetes), target) 586 | print(features) 587 | ``` 588 | 589 | 590 | 591 | ## Start a local H2O Cluster (JVM) 592 | 593 | ```{r} 594 | h2o.init() 595 | ``` 596 | 597 | ```{r} 598 | h2o.no_progress() # disable progress bar for RMarkdown 599 | h2o.removeAll() # Optional: remove anything from previous session 600 | ``` 601 | 602 | 603 | 604 | ## Convert R dataframe into H2O dataframe 605 | 606 | ```{r} 607 | # H2O dataframe 608 | h_diabetes <- as.h2o(PimaIndiansDiabetes) 609 | 610 | # Make sure the target is a factor (for classification) 611 | h_diabetes$diabetes <- as.factor(h_diabetes$diabetes) 612 | ``` 613 | 614 | 615 | 616 | ## Split Data into Train/Test 617 | 618 | ```{r} 619 | h_split <- h2o.splitFrame(h_diabetes, ratios = 0.8, seed = n_seed) 620 | h_train <- h_split[[1]] # 80% for modelling 621 | h_test <- h_split[[2]] # 20% for evaluation 622 | ``` 623 | 624 | ```{r} 625 | dim(h_train) 626 | dim(h_test) 627 | ``` 628 | 629 | 630 | ## H2O AutoML 631 | 632 | ```{r} 633 | # Run AutoML (try n different models) 634 | # Check out all options using ?h2o.automl 635 | automl <- h2o.automl(x = features, 636 | y = target, 637 | training_frame = h_train, 638 | nfolds = 5, # 5-fold Cross-Validation 639 | max_models = 20, # Max number of models 640 | stopping_metric = "logloss", # Metric to optimize 641 | project_name = "automl_diabetes", # Specify a name so you can add more models later 642 | sort_metric = "logloss", 643 | seed = n_seed) 644 | ``` 645 | 646 | ### Leaderboard 647 | 648 | ```{r} 649 | datatable(as.data.frame(automl@leaderboard), 650 | rownames = FALSE, options = list(pageLength = 10, scrollX = TRUE)) %>% 651 | formatRound(columns = -1, digits = 4) 652 | ``` 653 | 654 | # Classification Part Two: XAI 655 | 656 | ## Package `DALEX` 657 | 658 | ```{r} 659 | # Descriptive mAchine Learning EXplanations (DALEX) 660 | library(DALEX) 661 | ``` 662 | 663 | 664 | ### The `explain()` Function 665 | 666 | ```{r} 667 | # Custom Predict Function 668 | custom_predict <- function(model, newdata) { 669 | newdata_h2o <- as.h2o(newdata) 670 | res <- as.data.frame(h2o.predict(model, newdata_h2o)) 671 | return(round(res$p1)) # round the probability (p1) to get the predicted class 0/1 672 | } 673 | ``` 674 | 675 | ### Explainer for H2O Models 676 | 677 | ```{r} 678 | explainer_automl <- DALEX::explain(model = automl@leader, 679 | data = as.data.frame(h_test)[, features], 680 | y = as.numeric(as.character(as.data.frame(h_test)[, target])), # 0/1 labels for the same rows as `data` 681 | predict_function = custom_predict, 682 | label = "H2O AutoML") 683 | ``` 684 | 685 | ### Variable importance 686 | 687 | ```{r} 688 | vi_automl <- variable_importance(explainer_automl, type = "difference") 
689 | plot(vi_automl) 690 | ``` 691 | 692 | 693 | ### Partial Dependence Plots 694 | 695 | Let's look at feature `age`. 696 | 697 | ```{r} 698 | pdp_automl_age <- variable_response(explainer_automl, variable = "age") 699 | plot(pdp_automl_age) 700 | ``` 701 | 702 | 703 | ### Prediction Understanding 704 | 705 | ```{r} 706 | library(breakDown) 707 | ``` 708 | 709 | ```{r} 710 | # Prediction: Diabetes = Negative (0) 711 | pb_automl <- prediction_breakdown(explainer_automl, observation = as.data.frame(h_test)[1, ]) 712 | plot(pb_automl) 713 | ``` 714 | 715 | 716 | ```{r} 717 | # Prediction: Diabetes = Positive (1) 718 | pb_automl <- prediction_breakdown(explainer_automl, observation = as.data.frame(h_test)[6, ]) 719 | plot(pb_automl) 720 | ``` 721 | 722 | 723 | # Bring Your Own Data + Q&A 724 | 725 | Get your hands dirty! 726 | 727 | --------------------------------------------------------------------------------