├── style.css
├── img
│   ├── rmse.jpg
│   ├── DALEX_intro.png
│   ├── ml_overview.png
│   └── cross_validation.png
├── setup.R
├── .gitignore
├── README.md
├── LICENSE
└── tutorial.Rmd
/style.css:
--------------------------------------------------------------------------------
1 | h1, .h1, h2, .h2, h3, .h3 {
2 | margin-top: 84px;
3 | }
4 |
--------------------------------------------------------------------------------
/img/rmse.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/woobe/useR2019_h2o_tutorial/HEAD/img/rmse.jpg
--------------------------------------------------------------------------------
/img/DALEX_intro.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/woobe/useR2019_h2o_tutorial/HEAD/img/DALEX_intro.png
--------------------------------------------------------------------------------
/img/ml_overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/woobe/useR2019_h2o_tutorial/HEAD/img/ml_overview.png
--------------------------------------------------------------------------------
/img/cross_validation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/woobe/useR2019_h2o_tutorial/HEAD/img/cross_validation.png
--------------------------------------------------------------------------------
/setup.R:
--------------------------------------------------------------------------------
1 | # Install R packages for this tutorial
2 |
3 | pkgs <- c("h2o", "DALEX", "breakDown", "pdp",
4 | "knitr", "rmdformats", "DT", "xgboost", "mlbench")
5 | for (pkg in pkgs) {
6 | if (! (pkg %in% rownames(installed.packages()))) { install.packages(pkg) }
7 | }
8 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # History files
2 | .Rhistory
3 | .Rapp.history
4 |
5 | # Session Data files
6 | .RData
7 |
8 | # Example code in package build process
9 | *-Ex.R
10 |
11 | # Output files from R CMD build
12 | /*.tar.gz
13 |
14 | # Output files from R CMD check
15 | /*.Rcheck/
16 |
17 | # RStudio files
18 | .Rproj.user/
19 |
20 | # produced vignettes
21 | vignettes/*.html
22 | vignettes/*.pdf
23 |
24 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
25 | .httr-oauth
26 |
27 | # knitr and R markdown default cache directories
28 | /*_cache/
29 | /cache/
30 |
31 | # Temporary files created by R markdown
32 | *.utf8.md
33 | *.knit.md
34 |
35 | # Shiny token, see https://shiny.rstudio.com/articles/shinyapps.html
36 | rsconnect/
37 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # useR! 2019 Tutorial: Automatic and Explainable Machine Learning with H2O in R
2 |
3 | http://www.user2019.fr/tutorials/
4 |
5 | ## Key Files
6 |
7 | - `setup.R`: installs the required packages
8 | - `tutorial.Rmd`: the main RMarkdown file with code
9 | - `tutorial.html`: rendered RMarkdown result [view](https://nbviewer.jupyter.org/github/woobe/useR2019_h2o_tutorial/blob/master/tutorial.html)
10 | - `Introduction Slides`: [Google Drive Link](https://drive.google.com/file/d/1evXrshE4GDZT-z0c_LTYEm9Dkl4mRKn-/view?usp=sharing)
11 |
12 |
13 | ## Additional Info
14 |
15 | - H2O-3 user guide: http://docs.h2o.ai/h2o/latest-stable/h2o-docs/index.html
16 | - H2O-3 tutorials: https://github.com/h2oai/h2o-tutorials
17 | - Java debugging tips: https://twitter.com/ledell/status/1148512123083010048
18 | - H2O-3 XGBoost on Windows: https://stackoverflow.com/questions/49752125/xgboost-h2o-error-on-windows-os
19 | - About target encoding: http://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-munging/target-encoding.html
20 |
21 | ## Thank you for coming to my workshop. Remember to get your hands dirty!
22 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Jo-fai Chow
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/tutorial.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "useR! 2019 H2O Tutorial (bit.ly/useR2019_h2o_tutorial)"
3 | date: "`r Sys.Date()`"
4 | output:
5 | rmdformats::readthedown:
6 | number_sections: yes
7 | fig_height: 10
8 | fig_width: 14
9 | highlight: kate
10 | toc_depth: 3
11 | css: style.css
12 |
13 | ---
14 |
15 |
16 | ```{r knitr_init, echo=FALSE, cache=FALSE}
17 | library(knitr)
18 | library(rmdformats)
19 | library(DT)
20 |
21 | ## Global options
22 | options(max.print="75")
23 | opts_chunk$set(echo=TRUE,
24 | cache=FALSE,
25 | prompt=FALSE,
26 | tidy=TRUE,
27 | comment=NA,
28 | message=FALSE,
29 | warning=FALSE)
30 | opts_knit$set(width=75)
31 | ```
32 |
33 | # Agenda
34 |
35 | - 09:00 to 09:30 Set Up & Introduction
36 | - 09:30 to 10:30 Regression Example
37 | - 10:30 to 11:00 Coffee Break
38 | - 11:00 to 11:30 Classification Example
39 | - 11:30 to 12:30 Bring Your Own Data + Q&A
40 |
41 |
42 | # Set Up
43 |
44 | ## Download -> bit.ly/useR2019_h2o_tutorial
45 |
46 | - `setup.R`: installs the required packages
47 | - `tutorial.Rmd`: the main RMarkdown file with code
48 | - `tutorial.html`: this webpage
49 | - Full URL https://github.com/woobe/useR2019_h2o_tutorial (if `bit.ly` doesn't work)
50 |
51 |
52 | ## R Packages
53 |
54 | - Check out `setup.R`
55 | - For this tutorial:
56 | - `h2o` for machine learning
57 | - `mlbench` for Boston Housing dataset
58 | - `DALEX`, `breakDown` & `pdp` for explaining model predictions
59 | - For RMarkdown
60 | - `knitr` for rendering this RMarkdown
61 | - `rmdformats` for `readthedown` RMarkdown template
62 | - `DT` for nice tables
63 |
64 |
65 | # Introduction
66 |
67 | The General Data Protection Regulation (GDPR) is now in place. Are you ready to explain your models? This is a hands-on tutorial for R beginners. I will demonstrate how to use H2O and other R packages for automatic and explainable machine learning. Participants will build regression and classification models quickly with H2O's AutoML and then explain the model outcomes with various methods.
68 | 
69 | This workshop is for R beginners and anyone interested in machine learning. The RMarkdown source and the rendered HTML are provided, so everyone can follow along without running the code.
70 |
71 | (Now go to slides ...)
72 |
73 |
74 | # Regression Part One: H2O AutoML
75 |
76 | ```{r, message=FALSE}
77 | # Let's go
78 | library(h2o) # for H2O Machine Learning
79 | library(mlbench) # for Datasets
80 | ```
81 |
82 | ```{r}
83 | # Enter your lucky seed here ...
84 | n_seed <- 12345
85 | ```
86 |
87 | ## Data - Boston Housing from `mlbench`
88 |
89 | ```{r}
90 | data("BostonHousing")
91 | datatable(head(BostonHousing),
92 | rownames = FALSE, options = list(pageLength = 6, scrollX = TRUE))
93 | ```
94 |
95 | **Source**: UCI Machine Learning Repository [Link](https://archive.ics.uci.edu/ml/machine-learning-databases/housing/)
96 |
97 | - **crim**: per capita crime rate by town.
98 | - **zn**: proportion of residential land zoned for lots over 25,000 sq.ft.
99 | - **indus**: proportion of non-retail business acres per town.
100 | - **chas**: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
101 | - **nox**: nitrogen oxides concentration (parts per 10 million).
102 | - **rm**: average number of rooms per dwelling.
103 | - **age**: proportion of owner-occupied units built prior to 1940.
104 | - **dis**: weighted mean of distances to five Boston employment centres.
105 | - **rad**: index of accessibility to radial highways.
106 | - **tax**: full-value property-tax rate per $10,000.
107 | - **ptratio**: pupil-teacher ratio by town.
108 | - **b**: 1000(Bk - 0.63)^2 where Bk is the proportion of people of African American descent by town.
109 | - **lstat**: lower status of the population (percent).
110 | - **medv** (This is the **TARGET**): median value of owner-occupied homes in $1000s.
111 |
112 |
113 |
114 | ## Define Target and Features
115 |
116 | ```{r}
117 | target <- "medv" # Median House Value
118 | features <- setdiff(colnames(BostonHousing), target)
119 | print(features)
120 | ```
121 |
122 |
123 | 
124 |
125 |
126 | ## Start a local H2O Cluster (JVM)
127 |
128 | ```{r}
129 | h2o.init()
130 | ```
131 |
132 | ```{r}
133 | h2o.no_progress() # disable progress bar for RMarkdown
134 | h2o.removeAll() # Optional: remove anything from previous session
135 | ```
136 |
137 |
138 |
139 | ## Convert R dataframe into H2O dataframe
140 |
141 | ```{r}
142 | # H2O dataframe
143 | h_boston <- as.h2o(BostonHousing)
144 | ```
145 |
146 |
147 |
148 | ## Split Data into Train/Test
149 |
150 | ```{r}
151 | h_split <- h2o.splitFrame(h_boston, ratios = 0.8, seed = n_seed)
152 | h_train <- h_split[[1]] # 80% for modelling
153 | h_test <- h_split[[2]] # 20% for evaluation
154 | ```
155 |
156 | ```{r}
157 | dim(h_train)
158 | dim(h_test)
159 | ```
160 |
161 | ## Cross-Validation
162 |
163 |
164 | 
165 |
166 |
167 |
168 | ## Baseline Models
169 |
170 | - `h2o.glm()`: H2O Generalized Linear Model
171 | - `h2o.randomForest()`: H2O Random Forest Model
172 | - `h2o.gbm()`: H2O Gradient Boosting Model
173 | - `h2o.deeplearning()`: H2O Deep Neural Network Model
174 | - `h2o.xgboost()`: H2O wrapper for eXtreme Gradient Boosting Model from DMLC
175 |
176 | ### Baseline Generalized Linear Model (GLM)
177 |
178 | ```{r}
179 | model_glm <- h2o.glm(x = features, # All 13 features
180 | y = target, # medv (median value of owner-occupied homes in $1000s)
181 | training_frame = h_train, # H2O dataframe with training data
182 | model_id = "baseline_glm", # Give the model a name
183 | nfolds = 5, # Using 5-fold CV
184 | seed = n_seed) # Your lucky seed
185 | ```
186 |
187 | ```{r}
188 | # Cross-Validation
189 | model_glm@model$cross_validation_metrics
190 | ```
191 |
192 |
193 | ```{r}
194 | # Evaluate performance on test
195 | h2o.performance(model_glm, newdata = h_test)
196 | ```
197 |
198 | Let's use RMSE (root mean square error) as the evaluation metric.
199 |
200 |
201 | 
202 |
203 |
204 |
205 |
206 | ### Build Other Baseline Models (DRF, GBM, DNN & XGB)
207 |
208 | ```{r}
209 | # Baseline Distributed Random Forest (DRF)
210 | model_drf <- h2o.randomForest(x = features,
211 | y = target,
212 | training_frame = h_train,
213 | model_id = "baseline_drf",
214 | nfolds = 5,
215 | seed = n_seed)
216 | ```
217 |
218 | ```{r}
219 | # Baseline Gradient Boosting Model (GBM)
220 | model_gbm <- h2o.gbm(x = features,
221 | y = target,
222 | training_frame = h_train,
223 | model_id = "baseline_gbm",
224 | nfolds = 5,
225 | seed = n_seed)
226 | ```
227 |
228 | ```{r}
229 | # Baseline Deep Neural Network (DNN)
230 | # By default, DNN is not reproducible with multi-core. You may get slightly different results here.
231 | # You can enable the `reproducible` option but it will run on a single core (very slow).
232 | model_dnn <- h2o.deeplearning(x = features,
233 | y = target,
234 | training_frame = h_train,
235 | model_id = "baseline_dnn",
236 | nfolds = 5,
237 | seed = n_seed)
238 | ```
239 |
240 | ```{r}
241 | # Baseline eXtreme Gradient Boosting Model (XGBoost)
242 | model_xgb <- h2o.xgboost(x = features,
243 | y = target,
244 | training_frame = h_train,
245 | model_id = "baseline_xgb",
246 | nfolds = 5,
247 | seed = n_seed)
248 | ```
249 |
250 | ### Comparison (RMSE: Lower = Better)
251 |
252 | ```{r}
253 | # Create a table to compare RMSE from different models
254 | d_eval <- data.frame(model = c("H2O GLM: Generalized Linear Model (Baseline)",
255 | "H2O DRF: Distributed Random Forest (Baseline)",
256 | "H2O GBM: Gradient Boosting Model (Baseline)",
257 | "H2O DNN: Deep Neural Network (Baseline)",
258 | "XGBoost: eXtreme Gradient Boosting Model (Baseline)"),
259 | stringsAsFactors = FALSE)
260 | d_eval$RMSE_cv <- NA
261 | d_eval$RMSE_test <- NA
262 | ```
263 |
264 | ```{r}
265 | # Store RMSE values
266 | d_eval[1, ]$RMSE_cv <- model_glm@model$cross_validation_metrics@metrics$RMSE
267 | d_eval[2, ]$RMSE_cv <- model_drf@model$cross_validation_metrics@metrics$RMSE
268 | d_eval[3, ]$RMSE_cv <- model_gbm@model$cross_validation_metrics@metrics$RMSE
269 | d_eval[4, ]$RMSE_cv <- model_dnn@model$cross_validation_metrics@metrics$RMSE
270 | d_eval[5, ]$RMSE_cv <- model_xgb@model$cross_validation_metrics@metrics$RMSE
271 |
272 | d_eval[1, ]$RMSE_test <- h2o.rmse(h2o.performance(model_glm, newdata = h_test))
273 | d_eval[2, ]$RMSE_test <- h2o.rmse(h2o.performance(model_drf, newdata = h_test))
274 | d_eval[3, ]$RMSE_test <- h2o.rmse(h2o.performance(model_gbm, newdata = h_test))
275 | d_eval[4, ]$RMSE_test <- h2o.rmse(h2o.performance(model_dnn, newdata = h_test))
276 | d_eval[5, ]$RMSE_test <- h2o.rmse(h2o.performance(model_xgb, newdata = h_test))
277 | ```
278 |
279 | ```{r}
280 | # Show Comparison (RMSE: Lower = Better)
281 | datatable(d_eval, rownames = FALSE, options = list(pageLength = 10, scrollX = TRUE)) %>%
282 | formatRound(columns = -1, digits = 4)
283 | ```
284 |
285 |
286 |
287 | ## Manual Tuning
288 |
289 | ### Check out the hyper-parameters for each algo
290 |
291 | ```{r, eval=FALSE}
292 | ?h2o.glm
293 | ?h2o.randomForest
294 | ?h2o.gbm
295 | ?h2o.deeplearning
296 | ?h2o.xgboost
297 | ```
298 |
299 | ### Train a xgboost model with manual settings
300 |
301 | ```{r}
302 | model_xgb_m <- h2o.xgboost(x = features,
303 | y = target,
304 | training_frame = h_train,
305 | model_id = "model_xgb_m",
306 | nfolds = 5,
307 | seed = n_seed,
308 | # Manual Settings based on experience
309 | learn_rate = 0.1, # use a lower rate (more conservative)
310 | ntrees = 100, # use more trees (due to lower learn_rate)
311 | sample_rate = 0.9, # use random n% of samples for each tree
312 | col_sample_rate = 0.9) # use random n% of features for each tree
313 | ```
314 |
315 | ### Comparison (RMSE: Lower = Better)
316 |
317 | ```{r}
318 | d_eval_tmp <- data.frame(model = "XGBoost: eXtreme Gradient Boosting Model (Manual Settings)",
319 | RMSE_cv = model_xgb_m@model$cross_validation_metrics@metrics$RMSE,
320 | RMSE_test = h2o.rmse(h2o.performance(model_xgb_m, newdata = h_test)))
321 | d_eval <- rbind(d_eval, d_eval_tmp)
322 |
323 | datatable(d_eval, rownames = FALSE, options = list(pageLength = 10, scrollX = TRUE)) %>%
324 | formatRound(columns = -1, digits = 4)
325 | ```
326 |
327 |
328 |
329 | ## H2O AutoML
330 |
331 | ```{r}
332 | # Run AutoML (try n different models)
333 | # Check out all options using ?h2o.automl
334 | automl <- h2o.automl(x = features,
335 | y = target,
336 | training_frame = h_train,
337 | nfolds = 5, # 5-fold Cross-Validation
338 | max_models = 20, # Max number of models
339 | stopping_metric = "RMSE", # Metric to optimize
340 | project_name = "automl_boston", # Specify a name so you can add more models later
341 | seed = n_seed)
342 | ```
343 |
344 | ### Leaderboard
345 |
346 | ```{r}
347 | datatable(as.data.frame(automl@leaderboard),
348 | rownames = FALSE, options = list(pageLength = 10, scrollX = TRUE)) %>%
349 | formatRound(columns = -1, digits = 4)
350 | ```
351 |
352 | ### Best Model (Leader)
353 |
354 | ```{r}
355 | automl@leader
356 | ```
357 |
358 | ### Comparison (RMSE: Lower = Better)
359 |
360 | ```{r}
361 | d_eval_tmp <- data.frame(model = "Best Model from H2O AutoML",
362 | RMSE_cv = automl@leader@model$cross_validation_metrics@metrics$RMSE,
363 | RMSE_test = h2o.rmse(h2o.performance(automl@leader, newdata = h_test)))
364 | d_eval <- rbind(d_eval, d_eval_tmp)
365 |
366 | datatable(d_eval, rownames = FALSE, options = list(pageLength = 10, scrollX = TRUE)) %>%
367 | formatRound(columns = -1, digits = 4)
368 | ```
369 |
370 |
371 |
372 | ## Make Predictions
373 |
374 | ```{r}
375 | yhat_test <- h2o.predict(automl@leader, newdata = h_test)
376 | head(yhat_test)
377 | ```
378 |
379 |
380 |
381 | # Regression Part Two: XAI
382 |
383 | Let's look at the first house in `h_test`
384 |
385 | ```{r}
386 | datatable(as.data.frame(h_test[1, ]),
387 | rownames = FALSE, options = list(pageLength = 10, scrollX = TRUE))
388 | ```
389 |
390 |
391 | ## Using functions in `h2o`
392 |
393 | - `h2o.varimp()` & `h2o.varimp_plot()`: Variable Importance (for GBM, DNN, GLM)
394 | - `h2o.partialPlot()`: Partial Dependence Plots
395 | - `h2o.predict_contributions()`: SHAP values (for GBM and XGBoost only)
396 |
397 | ```{r, eval=FALSE}
398 | # Look at the impact of feature `rm` (no. of rooms)
399 | # Not Run
400 | h2o.partialPlot(model_glm, data = h_test, cols = c("rm"))
401 | h2o.partialPlot(model_drf, data = h_test, cols = c("rm"))
402 | h2o.partialPlot(model_gbm, data = h_test, cols = c("rm"))
403 | h2o.partialPlot(model_dnn, data = h_test, cols = c("rm"))
404 | h2o.partialPlot(model_xgb, data = h_test, cols = c("rm"))
405 | h2o.partialPlot(automl@leader, data = h_test, cols = c("rm"))
406 | ```
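
For completeness, here is a sketch of the other two helpers (not run; note that `h2o.predict_contributions()` is only available for tree-based models such as GBM and XGBoost, and requires a recent H2O release):

```{r, eval=FALSE}
# Not Run: variable importance plot for a single model
h2o.varimp_plot(model_gbm)

# Not Run: SHAP values (one column per feature plus a bias term)
h2o.predict_contributions(model_gbm, newdata = h_test)
```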
407 |
408 |
409 | ## Package `DALEX`
410 |
411 | ```{r}
412 | # Descriptive mAchine Learning EXplanations (DALEX)
413 | library(DALEX)
414 | ```
415 |
416 | - Website: https://pbiecek.github.io/DALEX/
417 | - Original DALEX-H2O Example: https://raw.githack.com/pbiecek/DALEX_docs/master/vignettes/DALEX_h2o.html
418 |
419 |
420 | 
421 |
422 |
423 | ### The `explain()` Function
424 |
425 | The first step of using the `DALEX` package is to wrap up the black-box model with meta-data that unifies the model interface.
426 |
427 | To create an explainer, we use the `explain()` function. The validation dataset for the models is `h_test` from part one. For models created by the `h2o` package, we have to provide a custom predict function that takes two arguments, `model` and `newdata`, and returns a numeric vector of predictions.
428 |
429 | ```{r}
430 | # Custom Predict Function
431 | custom_predict <- function(model, newdata) {
432 | newdata_h2o <- as.h2o(newdata)
433 | res <- as.data.frame(h2o.predict(model, newdata_h2o))
434 | return(as.numeric(res$predict))
435 | }
436 | ```
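
A quick sanity check of the wrapper (not run): it should return one numeric prediction per row of `newdata`.

```{r, eval=FALSE}
# Not Run: expect a numeric vector of length 3
custom_predict(model_drf, as.data.frame(h_test)[1:3, features])
```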
437 |
438 | ### Explainer for H2O Models
439 |
440 | ```{r}
441 | explainer_drf <- DALEX::explain(model = model_drf,
442 | data = as.data.frame(h_test)[, features],
443 | y = as.data.frame(h_test)[, target],
444 | predict_function = custom_predict,
445 | label = "Random Forest")
446 |
447 | explainer_dnn <- DALEX::explain(model = model_dnn,
448 | data = as.data.frame(h_test)[, features],
449 | y = as.data.frame(h_test)[, target],
450 | predict_function = custom_predict,
451 | label = "Deep Neural Networks")
452 |
453 | explainer_xgb <- DALEX::explain(model = model_xgb,
454 | data = as.data.frame(h_test)[, features],
455 | y = as.data.frame(h_test)[, target],
456 | predict_function = custom_predict,
457 | label = "XGBoost")
458 |
459 | explainer_automl <- DALEX::explain(model = automl@leader,
460 | data = as.data.frame(h_test)[, features],
461 | y = as.data.frame(h_test)[, target],
462 | predict_function = custom_predict,
463 | label = "H2O AutoML")
464 | ```
465 |
466 | ### Variable importance
467 |
468 | Using the DALEX package, we can better understand which variables are important.
469 | 
470 | Model-agnostic variable importance is calculated by means of permutations: we permute the values of a single variable in the validation dataset, recompute the loss function, and subtract the loss calculated on the original (unpermuted) validation dataset. The larger the difference, the more important the variable.
471 | 
472 | This method is implemented in the `variable_importance()` function, using the formula shown below.
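
In symbols, for variable $j$, with loss function $L$, fitted model $f$, and validation data $X$ with labels $y$, where $X^{\mathrm{perm}(j)}$ denotes $X$ with the values of variable $j$ permuted (this is the `type = "difference"` scale):

$$\mathrm{VI}_j = L\left(y, f(X^{\mathrm{perm}(j)})\right) - L\left(y, f(X)\right)$$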
473 |
474 | ```{r}
475 | vi_drf <- variable_importance(explainer_drf, type="difference")
476 | vi_dnn <- variable_importance(explainer_dnn, type="difference")
477 | vi_xgb <- variable_importance(explainer_xgb, type="difference")
478 | vi_automl <- variable_importance(explainer_automl, type="difference")
479 | ```
480 |
481 | ```{r}
482 | plot(vi_drf, vi_dnn, vi_xgb, vi_automl)
483 | ```
484 |
485 |
486 | ### Partial Dependence Plots
487 |
488 | Partial Dependence Plots (PDP) are one of the most popular methods for exploring the relation between a continuous variable and the model outcome. The `variable_response()` function with parameter `type = "pdp"` calls `pdp::partial()` to calculate the PDP response.
489 |
490 | Let's look at feature `rm` (no. of rooms)
491 |
492 | ```{r}
493 | pdp_drf_rm <- variable_response(explainer_drf, variable = "rm")
494 | pdp_dnn_rm <- variable_response(explainer_dnn, variable = "rm")
495 | pdp_xgb_rm <- variable_response(explainer_xgb, variable = "rm")
496 | pdp_automl_rm <- variable_response(explainer_automl, variable = "rm")
497 | plot(pdp_drf_rm, pdp_dnn_rm, pdp_xgb_rm, pdp_automl_rm)
498 | ```
499 |
500 |
501 | ### Prediction Understanding
502 |
503 | ```{r}
504 | # Predictions from different models
505 | yhat <- data.frame(model = c("H2O DRF: Distributed Random Forest (Baseline)",
506 | "H2O DNN: Deep Neural Network (Baseline)",
507 | "XGBoost: eXtreme Gradient Boosting Model (Baseline)",
508 | "Best Model from H2O AutoML"))
509 | yhat$prediction <- NA
510 | yhat[1,]$prediction <- as.matrix(h2o.predict(model_drf, h_test[1,]))
511 | yhat[2,]$prediction <- as.matrix(h2o.predict(model_dnn, h_test[1,]))
512 | yhat[3,]$prediction <- as.matrix(h2o.predict(model_xgb, h_test[1,]))
513 | yhat[4,]$prediction <- as.matrix(h2o.predict(automl@leader, h_test[1,]))
514 |
515 | # Show the predictions
516 | datatable(yhat, rownames = FALSE, options = list(pageLength = 10, scrollX = TRUE)) %>%
517 | formatRound(columns = -1, digits = 3)
518 | ```
519 |
520 | The function `prediction_breakdown()` is a wrapper around the `breakDown` package. It generates variable attributions for a selected prediction, which are visualized with Break Down Plots showing the contribution of every variable present in the model. The generic `plot()` function displays these attributions.
521 |
522 | ```{r}
523 | library(breakDown)
524 | sample <- as.data.frame(h_test)[1, ] # Using the first sample from h_test
525 | pb_drf <- prediction_breakdown(explainer_drf, observation = sample)
526 | pb_dnn <- prediction_breakdown(explainer_dnn, observation = sample)
527 | pb_xgb <- prediction_breakdown(explainer_xgb, observation = sample)
528 | pb_automl <- prediction_breakdown(explainer_automl, observation = sample)
529 | ```
530 |
531 |
532 | ```{r}
533 | plot(pb_drf)
534 | plot(pb_dnn)
535 | plot(pb_xgb)
536 | plot(pb_automl)
537 | ```
538 |
539 |
540 |
541 | # Coffee Break 10:30 - 11:00
542 |
543 |
544 | 
545 |
546 |
547 |
548 |
549 |
550 | # Classification Part One: H2O AutoML
551 |
552 | ```{r, message=FALSE}
553 | # Let's go
554 | library(h2o) # for H2O Machine Learning
555 | library(mlbench) # for Datasets
556 | ```
557 |
558 | ```{r}
559 | # Enter your lucky seed here ...
560 | n_seed <- 12345
561 | ```
562 |
563 | ## Data - Pima Indians Diabetes from `mlbench`
564 |
565 | ```{r}
566 | data("PimaIndiansDiabetes")
567 | datatable(head(PimaIndiansDiabetes),
568 | rownames = FALSE, options = list(pageLength = 6, scrollX = TRUE))
569 | ```
570 |
571 |
572 | ## Data Prep
573 |
574 | ```{r}
575 | # Convert pos and neg to 1 and 0
576 | d_new <- PimaIndiansDiabetes[, -ncol(PimaIndiansDiabetes)]
577 | d_new$diabetes <- 0
578 | d_new[which(PimaIndiansDiabetes$diabetes == "pos"), ]$diabetes <- 1
579 | PimaIndiansDiabetes <- d_new
580 | rm(d_new)
581 | ```
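
Equivalently, the same conversion as a one-liner (not run):

```{r, eval=FALSE}
# Not Run: convert pos/neg to 1/0 in one step
PimaIndiansDiabetes$diabetes <- as.numeric(PimaIndiansDiabetes$diabetes == "pos")
```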
582 |
583 | ```{r}
584 | target <- "diabetes"
585 | features <- setdiff(colnames(PimaIndiansDiabetes), target)
586 | print(features)
587 | ```
588 |
589 |
590 |
591 | ## Start a local H2O Cluster (JVM)
592 |
593 | ```{r}
594 | h2o.init()
595 | ```
596 |
597 | ```{r}
598 | h2o.no_progress() # disable progress bar for RMarkdown
599 | h2o.removeAll() # Optional: remove anything from previous session
600 | ```
601 |
602 |
603 |
604 | ## Convert R dataframe into H2O dataframe
605 |
606 | ```{r}
607 | # H2O dataframe
608 | h_diabetes <- as.h2o(PimaIndiansDiabetes)
609 |
610 | # Make sure the target is a factor (for classification)
611 | h_diabetes$diabetes <- as.factor(h_diabetes$diabetes)
612 | ```
613 |
614 |
615 |
616 | ## Split Data into Train/Test
617 |
618 | ```{r}
619 | h_split <- h2o.splitFrame(h_diabetes, ratios = 0.8, seed = n_seed)
620 | h_train <- h_split[[1]] # 80% for modelling
621 | h_test <- h_split[[2]] # 20% for evaluation
622 | ```
623 |
624 | ```{r}
625 | dim(h_train)
626 | dim(h_test)
627 | ```
628 |
629 |
630 | ## H2O AutoML
631 |
632 | ```{r}
633 | # Run AutoML (try n different models)
634 | # Check out all options using ?h2o.automl
635 | automl <- h2o.automl(x = features,
636 | y = target,
637 | training_frame = h_train,
638 | nfolds = 5, # 5-fold Cross-Validation
639 | max_models = 20, # Max number of models
640 | stopping_metric = "logloss", # Metric to optimize
641 | project_name = "automl_diabetes", # Specify a name so you can add more models later
642 | sort_metric = "logloss",
643 | seed = n_seed)
644 | ```
645 |
646 | ### Leaderboard
647 |
648 | ```{r}
649 | datatable(as.data.frame(automl@leaderboard),
650 | rownames = FALSE, options = list(pageLength = 10, scrollX = TRUE)) %>%
651 | formatRound(columns = -1, digits = 4)
652 | ```
653 |
654 | # Classification Part Two: XAI
655 |
656 | ## Package `DALEX`
657 |
658 | ```{r}
659 | # Descriptive mAchine Learning EXplanations (DALEX)
660 | library(DALEX)
661 | ```
662 |
663 |
664 | ### The `explain()` Function
665 |
666 | ```{r}
667 | # Custom Predict Function
668 | custom_predict <- function(model, newdata) {
669 | newdata_h2o <- as.h2o(newdata)
670 | res <- as.data.frame(h2o.predict(model, newdata_h2o))
671 | return(round(res$p1)) # round the probability of class 1 to a 0/1 label
672 | }
673 | ```
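
Note: rounding at 0.5 returns hard 0/1 class labels. Returning the raw probability `res$p1` instead is also a valid choice and yields explanations on the probability scale.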
674 |
675 | ### Explainer for H2O Models
676 |
677 | ```{r}
678 | explainer_automl <- DALEX::explain(model = automl@leader,
679 | data = as.data.frame(h_test)[, features],
680 | y = as.numeric(as.character(as.data.frame(h_test)[, target])),
681 | predict_function = custom_predict,
682 | label = "H2O AutoML")
683 | ```
684 |
685 | ### Variable importance
686 |
687 | ```{r}
688 | vi_automl <- variable_importance(explainer_automl, type="difference")
689 | plot(vi_automl)
690 | ```
691 |
692 |
693 | ### Partial Dependence Plots
694 |
695 | Let's look at feature `age`
696 |
697 | ```{r}
698 | pdp_automl_rm <- variable_response(explainer_automl, variable = "age")
699 | plot(pdp_automl_rm)
700 | ```
701 |
702 |
703 | ## Prediction Understanding
704 |
705 | ```{r}
706 | library(breakDown)
707 | ```
708 |
709 | ```{r}
710 | # Prediction: Diabetes = Negative (0)
711 | pb_automl <- prediction_breakdown(explainer_automl, observation = as.data.frame(h_test)[1, ])
712 | plot(pb_automl)
713 | ```
714 |
715 |
716 | ```{r}
717 | # Prediction: Diabetes = Positive (1)
718 | pb_automl <- prediction_breakdown(explainer_automl, observation = as.data.frame(h_test)[6, ])
719 | plot(pb_automl)
720 | ```
721 |
722 |
723 | # Bring Your Own Data + Q&A
724 |
725 | Get your hands dirty!
726 |
727 |
--------------------------------------------------------------------------------