├── .Rbuildignore ├── .gitignore ├── DESCRIPTION ├── LICENSE ├── LICENSE.md ├── NAMESPACE ├── R ├── automl.R ├── boost_tree.R ├── control_h2o.R ├── glm_wrapper.R ├── imports.R ├── linear_reg.R ├── logistic_reg.R ├── make_call.R ├── metrics.R ├── mlp.R ├── multi_predicts.R ├── multinomial_reg.R ├── naive_Bayes.R ├── rand_forest.R ├── read_h2o.R ├── rule_fit.R ├── translate_args.R ├── tune_grid_h2o.R ├── utils-pipe.R ├── utils.R └── zzz.R ├── README.md ├── _pkgdown.yml ├── h2oparsnip.Rproj ├── man ├── automl.Rd ├── control_h2o.Rd ├── extract_h2o_models.Rd ├── extract_h2o_preds.Rd ├── extract_h2o_scores.Rd ├── h2o_automl_train.Rd ├── h2o_gbm_train.Rd ├── h2o_glm_train.Rd ├── h2o_mlp_train.Rd ├── h2o_naiveBayes_train.Rd ├── h2o_rf_train.Rd ├── h2o_rulefit_train.Rd ├── model_spec_to_algorithm.Rd ├── mse_vec.Rd ├── pipe.Rd ├── remove_h2o_models.Rd ├── tune_grid_h2o.Rd └── write_h2o.Rd └── tests ├── testthat.R └── testthat ├── test_boost_tree_h2o.R ├── test_control_h2o.R ├── test_linear_reg_h2o.R ├── test_logistic_reg_h2o.R ├── test_metrics.R ├── test_mlp_h2o.R ├── test_multinom_reg_h2o.R ├── test_naive_Bayes.R ├── test_rand_forest_h2o.R ├── test_rule_fit.R └── test_save.R /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^LICENSE\.md$ 4 | ^_pkgdown\.yml$ 5 | ^docs$ 6 | ^pkgdown$ 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | docs 6 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: h2oparsnip 2 | Type: Package 3 | Title: Model Wrappers for H2O models 4 | Version: 0.1.0 5 | Author: Steven Pawley 6 | Maintainer: Steven Pawley 7 | Description: Bindings for additional models for use with the 'parsnip' package. 8 | Models include learning algorithms available in the R H2O package. 9 | License: MIT + file LICENSE 10 | URL: https://github.com/stevenpawley/h2oparsnip 11 | Suggests: 12 | testthat, 13 | roxygen2, 14 | covr 15 | Encoding: UTF-8 16 | LazyData: true 17 | Depends: 18 | R (>= 2.10), 19 | parsnip 20 | Imports: 21 | rlang, 22 | tibble, 23 | magrittr, 24 | h2o, 25 | dials, 26 | tune, 27 | yardstick, 28 | rsample, 29 | discrim 30 | BugReports: https://github.com/stevenpawley/h2oparsnip/issues 31 | RoxygenNote: 7.1.2 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2020 2 | COPYRIGHT HOLDER: Steven Pawley 3 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2020 Steven Pawley 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(mse,data.frame) 4 | S3method(multi_predict,"_H2OMultinomialModel") 5 | S3method(multi_predict,"_H2ORegressionModel") 6 | S3method(print,automl) 7 | S3method(read_h2o,model_fit) 8 | S3method(read_h2o,workflow) 9 | S3method(write_h2o,model_fit) 10 | S3method(write_h2o,workflow) 11 | export("%>%") 12 | export(automl) 13 | export(control_h2o) 14 | export(h2o_automl_train) 15 | export(h2o_gbm_train) 16 | export(h2o_glm_train) 17 | export(h2o_mlp_train) 18 | export(h2o_naiveBayes_train) 19 | export(h2o_rf_train) 20 | export(h2o_rulefit_train) 21 | export(mse) 22 | export(mse_vec) 23 | export(read_h2o) 24 | export(tune_grid_h2o) 25 | export(write_h2o) 26 | importFrom(magrittr,"%>%") 27 | importFrom(parsnip,multi_predict) 28 | importFrom(tibble,as_tibble) 29 | importFrom(tibble,tibble) 30 | importFrom(yardstick,accuracy) 31 | importFrom(yardstick,mn_log_loss) 32 | importFrom(yardstick,pr_auc) 33 | importFrom(yardstick,rmse) 34 | importFrom(yardstick,roc_auc) 35 | importFrom(yardstick,rsq) 36 | -------------------------------------------------------------------------------- /R/automl.R: -------------------------------------------------------------------------------- 1 | #' General interface for automl models 2 | #' 3 | #' @param mode A single character string for the type of model. 4 | #' 5 | #' @return A model_spec 6 | #' @export 7 | automl <- function(mode = "classification") { 8 | args <- list() 9 | 10 | parsnip::new_model_spec( 11 | "automl", 12 | args = args, 13 | eng_args = NULL, 14 | mode = mode, 15 | method = NULL, 16 | engine = "h2o" 17 | ) 18 | } 19 | 20 | #' @export 21 | print.automl <- function(x, ...) { 22 | cat("Automl Model Specification (", x$mode, ")\n\n", sep = "") 23 | parsnip::model_printer(x, ...) 24 | 25 | if (!is.null(x$method$fit$args)) { 26 | cat("Model fit template:\n") 27 | print(parsnip::show_call(x)) 28 | } 29 | 30 | invisible(x) 31 | } 32 | 33 | 34 | add_automl <- function() { 35 | # define model 36 | parsnip::set_new_model("automl") 37 | 38 | # define model modes 39 | parsnip::set_model_mode(model = "automl", mode = "classification") 40 | parsnip::set_model_engine("automl", mode = "classification", eng = "h2o") 41 | 42 | parsnip::set_model_mode(model = "automl", mode = "regression") 43 | parsnip::set_model_engine("automl", mode = "regression", eng = "h2o") 44 | 45 | # define dependencies for each mode 46 | parsnip::set_dependency("automl", "h2o", "h2o") 47 | 48 | # define fit methods 49 | parsnip::set_fit( 50 | model = "automl", 51 | eng = "h2o", 52 | mode = "regression", 53 | value = list( 54 | interface = "formula", 55 | protect = c("formula", "x", "y", "training_frame"), 56 | func = c(fun = "h2o_automl_train"), 57 | defaults = list() 58 | ) 59 | ) 60 | parsnip::set_fit( 61 | model = "automl", 62 | eng = "h2o", 63 | mode = "classification", 64 | value = list( 65 | interface = "formula", 66 | protect = c("formula", "x", "y", "training_frame"), 67 | func = c(fun = "h2o_automl_train"), 68 | defaults = list() 69 | ) 70 | ) 71 | parsnip::set_encoding( 72 | model = "automl", 73 | eng = "h2o", 74 | mode = "regression", 75 | options = list( 76 | predictor_indicators = "none", 77 | compute_intercept = FALSE, 78 | remove_intercept = FALSE, 79 | allow_sparse_x = FALSE 80 | ) 81 | ) 82 | parsnip::set_encoding( 83 | model = "automl", 84 | eng = "h2o", 85 | mode = "classification", 86 | options = list( 87 | predictor_indicators = "none", 88 | compute_intercept = FALSE, 89 | remove_intercept = FALSE, 90 | allow_sparse_x = FALSE 91 | ) 92 | ) 93 | 94 | # regression predict 95 | parsnip::set_pred( 96 | model = "automl", 97 | eng = "h2o", 98 | mode = "regression", 99 | type = "numeric", 100 | value = list( 101 | pre = function(x, object) h2o::as.h2o(x), 102 | post = function(x, object) as.data.frame(x)$predict, 103 | func = c(pkg = "h2o", fun = "h2o.predict"), 104 | args = list( 105 | object = quote(object$fit), 106 | newdata = quote(new_data) 107 | ) 108 | ) 109 | ) 110 | parsnip::set_pred( 111 | model = "automl", 112 | eng = "h2o", 113 | mode = "regression", 114 | type = "raw", 115 | value = list( 116 | pre = function(x, object) h2o::as.h2o(x), 117 | post = function(x, object) as.data.frame(x), 118 | func = c(pkg = "h2o", fun = "h2o.predict"), 119 | args = list( 120 | object = quote(object$fit), 121 | newdata = quote(new_data) 122 | ) 123 | ) 124 | ) 125 | 126 | # classification predict 127 | parsnip::set_pred( 128 | model = "automl", 129 | eng = "h2o", 130 | mode = "classification", 131 | type = "class", 132 | value = list( 133 | pre = function(x, object) h2o::as.h2o(x), 134 | post = function(x, object) as.data.frame(x)$predict, 135 | func = c(pkg = "h2o", fun = "h2o.predict"), 136 | args = list( 137 | object = quote(object$fit), 138 | newdata = quote(new_data) 139 | ) 140 | ) 141 | ) 142 | parsnip::set_pred( 143 | model = "automl", 144 | eng = "h2o", 145 | mode = "classification", 146 | type = "prob", 147 | value = list( 148 | pre = function(x, object) h2o::as.h2o(x), 149 | post = function(x, object) as.data.frame(x[, 2:ncol(x)]), 150 | func = c(pkg = "h2o", fun = "h2o.predict"), 151 | args = list( 152 | object = quote(object$fit), 153 | newdata = quote(new_data) 154 | ) 155 | ) 156 | ) 157 | parsnip::set_pred( 158 | model = "automl", 159 | eng = "h2o", 160 | mode = "classification", 161 | type = "raw", 162 | value = list( 163 | pre = function(x, object) h2o::as.h2o(x), 164 | post = function(x, object) as.data.frame(x), 165 | func = c(pkg = "h2o", fun = "h2o.predict"), 166 | args = list( 167 | object = quote(object$fit), 168 | newdata = quote(new_data) 169 | ) 170 | ) 171 | ) 172 | } 173 | 174 | 175 | #' Wrapper for training a h2o.automl model 176 | #' 177 | #' @param formula formula 178 | #' @param data data.frame of training data 179 | #' @param ... Other arguments to pass the h2o.automl 180 | #' 181 | #' @return evaluated h2o model call 182 | #' @export 183 | h2o_automl_train <- function(formula, data, ...) { 184 | others <- list(...) 185 | 186 | # get term names 187 | X <- attr(stats::terms(formula, data = data), "term.labels") 188 | y <- all.vars(formula)[1] 189 | 190 | # convert to H2OFrame (although parsnip doesn't support H2OFrames right now) 191 | if (!inherits(data, "H2OFrame")) { 192 | data <- h2o::as.h2o(data) 193 | } 194 | 195 | # define arguments 196 | args <- list( 197 | x = X, 198 | y = y, 199 | training_frame = data 200 | ) 201 | 202 | others <- list(...) 203 | res <- make_h2o_call("h2o.automl", args, others) 204 | 205 | res 206 | } 207 | -------------------------------------------------------------------------------- /R/boost_tree.R: -------------------------------------------------------------------------------- 1 | add_boost_tree_h2o <- function() { 2 | parsnip::set_model_engine("boost_tree", "classification", "h2o") 3 | parsnip::set_model_engine("boost_tree", "regression", "h2o") 4 | parsnip::set_dependency("boost_tree", "h2o", "h2o") 5 | 6 | parsnip::set_model_arg( 7 | model = "boost_tree", 8 | eng = "h2o", 9 | parsnip = "trees", 10 | original = "ntrees", 11 | func = list(pkg = "dials", fun = "trees"), 12 | has_submodel = TRUE 13 | ) 14 | parsnip::set_model_arg( 15 | model = "boost_tree", 16 | eng = "h2o", 17 | parsnip = "tree_depth", 18 | original = "max_depth", 19 | func = list(pkg = "dials", fun = "tree_depth"), 20 | has_submodel = FALSE 21 | ) 22 | parsnip::set_model_arg( 23 | model = "boost_tree", 24 | eng = "h2o", 25 | parsnip = "min_n", 26 | original = "min_rows", 27 | func = list(pkg = "dials", fun = "min_n"), 28 | has_submodel = FALSE 29 | ) 30 | parsnip::set_model_arg( 31 | model = "boost_tree", 32 | eng = "h2o", 33 | parsnip = "learn_rate", 34 | original = "learn_rate", 35 | func = list(pkg = "dials", fun = "learn_rate"), 36 | has_submodel = FALSE 37 | ) 38 | parsnip::set_model_arg( 39 | model = "boost_tree", 40 | eng = "h2o", 41 | parsnip = "sample_size", 42 | original = "sample_rate", 43 | func = list(pkg = "dials", fun = "sample_size"), 44 | has_submodel = FALSE 45 | ) 46 | parsnip::set_model_arg( 47 | model = "boost_tree", 48 | eng = "h2o", 49 | parsnip = "mtry", 50 | original = "col_sample_rate", 51 | func = list(pkg = "dials", fun = "mtry"), 52 | has_submodel = FALSE 53 | ) 54 | parsnip::set_model_arg( 55 | model = "boost_tree", 56 | eng = "h2o", 57 | parsnip = "loss_reduction", 58 | original = "min_split_improvement", 59 | func = list(pkg = "dials", fun = "loss_reduction"), 60 | has_submodel = FALSE 61 | ) 62 | parsnip::set_model_arg( 63 | model = "boost_tree", 64 | eng = "h2o", 65 | parsnip = "stop_iter", 66 | original = "stopping_rounds", 67 | func = list(pkg = "dials", fun = "stop_iter"), 68 | has_submodel = FALSE 69 | ) 70 | parsnip::set_fit( 71 | model = "boost_tree", 72 | eng = "h2o", 73 | mode = "regression", 74 | value = list( 75 | interface = "formula", 76 | protect = c("formula", "x", "y", "training_frame"), 77 | func = c(fun = "h2o_gbm_train"), 78 | defaults = list() 79 | ) 80 | ) 81 | parsnip::set_fit( 82 | model = "boost_tree", 83 | eng = "h2o", 84 | mode = "classification", 85 | value = list( 86 | interface = "formula", 87 | protect = c("formula", "x", "y", "training_frame"), 88 | func = c(fun = "h2o_gbm_train"), 89 | defaults = list() 90 | ) 91 | ) 92 | parsnip::set_encoding( 93 | model = "boost_tree", 94 | eng = "h2o", 95 | mode = "classification", 96 | options = list( 97 | predictor_indicators = "none", 98 | compute_intercept = FALSE, 99 | remove_intercept = FALSE, 100 | allow_sparse_x = FALSE 101 | ) 102 | ) 103 | parsnip::set_encoding( 104 | model = "boost_tree", 105 | eng = "h2o", 106 | mode = "regression", 107 | options = list( 108 | predictor_indicators = "none", 109 | compute_intercept = FALSE, 110 | remove_intercept = FALSE, 111 | allow_sparse_x = FALSE 112 | ) 113 | ) 114 | 115 | # regression predict 116 | parsnip::set_pred( 117 | model = "boost_tree", 118 | eng = "h2o", 119 | mode = "regression", 120 | type = "numeric", 121 | value = list( 122 | pre = function(x, object) h2o::as.h2o(x), 123 | post = function(x, object) as.data.frame(x)$predict, 124 | func = c(pkg = "h2o", fun = "h2o.predict"), 125 | args = list( 126 | object = quote(object$fit), 127 | newdata = quote(new_data) 128 | ) 129 | ) 130 | ) 131 | parsnip::set_pred( 132 | model = "boost_tree", 133 | eng = "h2o", 134 | mode = "regression", 135 | type = "raw", 136 | value = list( 137 | pre = function(x, object) h2o::as.h2o(x), 138 | post = function(x, object) as.data.frame(x), 139 | func = c(pkg = "h2o", fun = "h2o.predict"), 140 | args = list( 141 | object = quote(object$fit), 142 | newdata = quote(new_data) 143 | ) 144 | ) 145 | ) 146 | 147 | # classification predict 148 | parsnip::set_pred( 149 | model = "boost_tree", 150 | eng = "h2o", 151 | mode = "classification", 152 | type = "class", 153 | value = list( 154 | pre = function(x, object) h2o::as.h2o(x), 155 | post = function(x, object) as.data.frame(x)$predict, 156 | func = c(pkg = "h2o", fun = "h2o.predict"), 157 | args = list( 158 | object = quote(object$fit), 159 | newdata = quote(new_data) 160 | ) 161 | ) 162 | ) 163 | parsnip::set_pred( 164 | model = "boost_tree", 165 | eng = "h2o", 166 | mode = "classification", 167 | type = "prob", 168 | value = list( 169 | pre = function(x, object) h2o::as.h2o(x), 170 | post = function(x, object) as.data.frame(x[, 2:ncol(x)]), 171 | func = c(pkg = "h2o", fun = "h2o.predict"), 172 | args = list( 173 | object = quote(object$fit), 174 | newdata = quote(new_data) 175 | ) 176 | ) 177 | ) 178 | parsnip::set_pred( 179 | model = "boost_tree", 180 | eng = "h2o", 181 | mode = "classification", 182 | type = "raw", 183 | value = list( 184 | pre = function(x, object) h2o::as.h2o(x), 185 | post = function(x, object) as.data.frame(x), 186 | func = c(pkg = "h2o", fun = "h2o.predict"), 187 | args = list( 188 | object = quote(object$fit), 189 | newdata = quote(new_data) 190 | ) 191 | ) 192 | ) 193 | } 194 | 195 | #' Wrapper for training a h2o.gbm model as part of a parsnip `boost_tree` 196 | #' h2o engine 197 | #' 198 | #' @param formula formula 199 | #' @param data data.frame of training data 200 | #' @param ntrees integer, the number of trees to build (default = 50). 201 | #' @param max_depth integer, the maximum tree depth (default = 10). 202 | #' @param min_rows integer, the minimum number of observations for a leaf 203 | #' (default = 10). 204 | #' @param learn_rate numeric, the learning rate (default = 0.1, range is from 205 | #' 0.0 to 1.0). 206 | #' @param sample_rate numeric, the proportion of samples to use to build each 207 | #' tree (default = 1.0). 208 | #' @param col_sample_rate numeric, the proportion of features available during 209 | #' each node split (default = 1.0). 210 | #' @param min_split_improvement numeric, minimum relative improvement in 211 | #' squared error reduction in order for a split to happen (default = 1e-05) 212 | #' @param stopping_rounds An integer specifying the number of training 213 | #' iterations without improvement before stopping. If `stopping_rounds = 0` 214 | #' (the default) then early stopping is disabled. If `validation` is used, 215 | #' performance is base on the validation set; otherwise the training set is 216 | #' used. 217 | #' @param validation A positive number. If on `[0, 1)` the value, `validation` 218 | #' is a random proportion of data in `x` and `y` that are used for performance 219 | #' assessment and potential early stopping. If 1 or greater, it is the _number_ 220 | #' of training set samples use for these purposes. 221 | #' @param algorithm Whether to use the default h2o 'h2o.gbm' algorithm or use 222 | #' 'h2o.xgboost' via h2o. 223 | #' @param ... other arguments passed to the h2o engine. 224 | #' 225 | #' @return evaluated h2o model call 226 | #' @export 227 | h2o_gbm_train <- 228 | function(formula, 229 | data, 230 | ntrees = 50, 231 | max_depth = 5, 232 | min_rows = 10, 233 | learn_rate = 0.1, 234 | sample_rate = 1.0, 235 | col_sample_rate = 1.0, 236 | min_split_improvement = 1e-05, 237 | stopping_rounds = 0, 238 | validation = 0, 239 | algorithm = "h2o.gbm", 240 | ...) { 241 | others <- list(...) 242 | 243 | # get term names and convert to h2o 244 | X <- attr(stats::terms(formula, data = data), "term.labels") 245 | y <- all.vars(formula)[1] 246 | 247 | # early stopping 248 | if (validation > 1) { 249 | validation <- validation / nrow(data) 250 | } 251 | 252 | if (stopping_rounds > 0 & validation > 0) { 253 | n <- nrow(data) 254 | trn_index <- sample(1:n, size = floor(n * validation) + 1) 255 | valid <- data[-trn_index, ] 256 | data <- data[trn_index, ] 257 | } else { 258 | valid <- NULL 259 | } 260 | 261 | # convert to H2OFrame (although parsnip doesn't support H2OFrames right now) 262 | if (!inherits(data, "H2OFrame")) { 263 | data <- h2o::as.h2o(data) 264 | } 265 | 266 | if (!is.null(valid)) { 267 | valid <- h2o::as.h2o(valid) 268 | } 269 | 270 | # convert mtry (number of features) to proportions 271 | if (col_sample_rate > 1) { 272 | col_sample_rate <- col_sample_rate / length(X) 273 | } 274 | 275 | # define arguments 276 | args <- list( 277 | x = X, 278 | y = y, 279 | training_frame = data, 280 | validation_frame = valid, 281 | ntrees = ntrees, 282 | max_depth = max_depth, 283 | min_rows = min_rows, 284 | learn_rate = learn_rate, 285 | sample_rate = sample_rate, 286 | min_split_improvement = min_split_improvement, 287 | stopping_rounds = stopping_rounds 288 | ) 289 | 290 | if (algorithm == "h2o.gbm") { 291 | args$col_sample_rate <- col_sample_rate 292 | } else if (algorithm == "h2o.xgboost") { 293 | args$colsample_bynode <- col_sample_rate 294 | } 295 | 296 | res <- make_h2o_call(algorithm, args, others) 297 | 298 | res 299 | } 300 | -------------------------------------------------------------------------------- /R/control_h2o.R: -------------------------------------------------------------------------------- 1 | #' Control aspects of the grid search process 2 | #' 3 | #' `control_h2o` provides a function to set various aspects of the grid search process. 4 | #' By default during tuning, the resampling predictions are stored within the h2o 5 | #' cluster. To save memory and space, use 'save_pred = FALSE'. 6 | #' `` 7 | #' 8 | #' @param verbose A logical for logging results as they are generated. 9 | #' @param save_pred A logical for whether the out-of-sample predictions should be saved 10 | #' for each model evaluated. 11 | #' @param save_models A logical for whether to retain the models associated with the 12 | #' tuning and resampling iterations within the h2o cluster and append their h2o model 13 | #' ids to the resamples object as a '.models' column. 14 | #' 15 | #' @return An object of `control_grid` and `control_resamples` class. 16 | #' @export 17 | #' 18 | #' @examples 19 | #' # to save space in the cluster use these settings (the defaults) 20 | #' control_h2o(verbose = TRUE, save_pred = FALSE) 21 | control_h2o <- function(verbose = FALSE, save_pred = FALSE, save_models = FALSE, 22 | event_level = "first") { 23 | res <- list( 24 | verbose = verbose, 25 | save_pred = save_pred, 26 | save_models = save_models 27 | ) 28 | 29 | class(res) <- c("control_grid", "control_resamples") 30 | res 31 | } 32 | -------------------------------------------------------------------------------- /R/glm_wrapper.R: -------------------------------------------------------------------------------- 1 | #' Wrapper for training a h2o.glm model as part of a parsnip 2 | #' 3 | #' @param formula formula 4 | #' @param data data.frame of training data 5 | #' @param alpha numeric, Distribution of regularization between the L1 (Lasso) 6 | #' and L2 (Ridge) penalties. A value of 1 for alpha represents Lasso 7 | #' regression, a value of 0 produces Ridge regression. 8 | #' @param lambda numeric, regularization strength 9 | #' @param family character, one of c("gaussian", "binomial", "quasibinomial", 10 | #' "ordinal", "multinomial", "poisson", "gamma", "tweedie", 11 | #' "negativebinomial") 12 | #' @param ... other arguments passed to the h2o engine. 13 | #' 14 | #' @return evaluated h2o model call 15 | #' @export 16 | h2o_glm_train <- 17 | function(formula, 18 | data, 19 | alpha = NULL, 20 | lambda = NULL, 21 | family = NULL, 22 | ...) { 23 | others <- list(...) 24 | 25 | # get term names and convert to h2o 26 | X <- attr(stats::terms(formula, data = data), "term.labels") 27 | y <- all.vars(formula)[1] 28 | 29 | # convert to H2OFrame (although parsnip doesn't support H2OFrames right now) 30 | if (!inherits(data, "H2OFrame")) { 31 | data <- h2o::as.h2o(data) 32 | } 33 | 34 | # define arguments 35 | args <- list( 36 | x = X, 37 | y = y, 38 | training_frame = data, 39 | alpha = alpha, 40 | lambda = lambda, 41 | family = family 42 | ) 43 | 44 | res <- make_h2o_call("h2o.glm", args, others) 45 | 46 | if (!"alpha" %in% names(res@parameters)) { 47 | res@parameters$alpha <- alpha 48 | } 49 | 50 | if (!"lambda" %in% names(res@parameters)) { 51 | res@parameters$lambda <- lambda 52 | } 53 | 54 | if (!"family" %in% names(res@parameters)) { 55 | res@parameters$family <- family 56 | } 57 | 58 | res 59 | } 60 | -------------------------------------------------------------------------------- /R/imports.R: -------------------------------------------------------------------------------- 1 | ## usethis namespace: start 2 | #' @importFrom tibble tibble as_tibble 3 | #' @importFrom parsnip multi_predict 4 | ## usethis namespace: end 5 | NULL 6 | -------------------------------------------------------------------------------- /R/linear_reg.R: -------------------------------------------------------------------------------- 1 | add_linear_reg_h2o <- function() { 2 | parsnip::set_model_engine("linear_reg", "regression", "h2o") 3 | parsnip::set_dependency("linear_reg", "h2o", "h2o") 4 | 5 | parsnip::set_model_arg( 6 | model = "linear_reg", 7 | eng = "h2o", 8 | parsnip = "mixture", 9 | original = "alpha", 10 | func = list(pkg = "dials", fun = "mixture"), 11 | has_submodel = FALSE 12 | ) 13 | parsnip::set_model_arg( 14 | model = "linear_reg", 15 | eng = "h2o", 16 | parsnip = "penalty", 17 | original = "lambda", 18 | func = list(pkg = "dials", fun = "penalty"), 19 | has_submodel = FALSE 20 | ) 21 | parsnip::set_fit( 22 | model = "linear_reg", 23 | eng = "h2o", 24 | mode = "regression", 25 | value = list( 26 | interface = "formula", 27 | protect = c("formula", "x", "y", "training_frame"), 28 | func = c(fun = "h2o_glm_train"), 29 | defaults = list( 30 | family = "gaussian" 31 | ) 32 | ) 33 | ) 34 | parsnip::set_encoding( 35 | model = "linear_reg", 36 | eng = "h2o", 37 | mode = "regression", 38 | options = list( 39 | predictor_indicators = "none", 40 | compute_intercept = FALSE, 41 | remove_intercept = FALSE, 42 | allow_sparse_x = FALSE 43 | ) 44 | ) 45 | 46 | # regression predict 47 | parsnip::set_pred( 48 | model = "linear_reg", 49 | eng = "h2o", 50 | mode = "regression", 51 | type = "numeric", 52 | value = list( 53 | pre = function(x, object) h2o::as.h2o(x), 54 | post = function(x, object) as.data.frame(x)$predict, 55 | func = c(pkg = "h2o", fun = "h2o.predict"), 56 | args = list( 57 | object = quote(object$fit), 58 | newdata = quote(new_data) 59 | ) 60 | ) 61 | ) 62 | parsnip::set_pred( 63 | model = "linear_reg", 64 | eng = "h2o", 65 | mode = "regression", 66 | type = "raw", 67 | value = list( 68 | pre = function(x, object) h2o::as.h2o(x), 69 | post = function(x, object) as.data.frame(x), 70 | func = c(pkg = "h2o", fun = "h2o.predict"), 71 | args = list( 72 | object = quote(object$fit), 73 | newdata = quote(new_data) 74 | ) 75 | ) 76 | ) 77 | } 78 | -------------------------------------------------------------------------------- /R/logistic_reg.R: -------------------------------------------------------------------------------- 1 | add_logistic_reg_h2o <- function() { 2 | parsnip::set_model_engine("logistic_reg", "classification", "h2o") 3 | parsnip::set_dependency("logistic_reg", "h2o", "h2o") 4 | 5 | parsnip::set_model_arg( 6 | model = "logistic_reg", 7 | eng = "h2o", 8 | parsnip = "mixture", 9 | original = "alpha", 10 | func = list(pkg = "dials", fun = "mixture"), 11 | has_submodel = FALSE 12 | ) 13 | parsnip::set_model_arg( 14 | model = "logistic_reg", 15 | eng = "h2o", 16 | parsnip = "penalty", 17 | original = "lambda", 18 | func = list(pkg = "dials", fun = "penalty"), 19 | has_submodel = FALSE 20 | ) 21 | parsnip::set_fit( 22 | model = "logistic_reg", 23 | eng = "h2o", 24 | mode = "classification", 25 | value = list( 26 | interface = "formula", 27 | protect = c("formula", "x", "y", "training_frame", "family"), 28 | func = c(fun = "h2o_glm_train"), 29 | defaults = list( 30 | family = "binomial" 31 | ) 32 | ) 33 | ) 34 | parsnip::set_encoding( 35 | model = "logistic_reg", 36 | eng = "h2o", 37 | mode = "classification", 38 | options = list( 39 | predictor_indicators = "none", 40 | compute_intercept = FALSE, 41 | remove_intercept = FALSE, 42 | allow_sparse_x = FALSE 43 | ) 44 | ) 45 | 46 | # classification predict 47 | parsnip::set_pred( 48 | model = "logistic_reg", 49 | eng = "h2o", 50 | mode = "classification", 51 | type = "class", 52 | value = list( 53 | pre = function(x, object) h2o::as.h2o(x), 54 | post = function(x, object) as.data.frame(x)$predict, 55 | func = c(pkg = "h2o", fun = "h2o.predict"), 56 | args = list( 57 | object = quote(object$fit), 58 | newdata = quote(new_data) 59 | ) 60 | ) 61 | ) 62 | parsnip::set_pred( 63 | model = "logistic_reg", 64 | eng = "h2o", 65 | mode = "classification", 66 | type = "prob", 67 | value = list( 68 | pre = function(x, object) h2o::as.h2o(x), 69 | post = function(x, object) as.data.frame(x[, 2:ncol(x)]), 70 | func = c(pkg = "h2o", fun = "h2o.predict"), 71 | args = list( 72 | object = quote(object$fit), 73 | newdata = quote(new_data) 74 | ) 75 | ) 76 | ) 77 | parsnip::set_pred( 78 | model = "logistic_reg", 79 | eng = "h2o", 80 | mode = "classification", 81 | type = "raw", 82 | value = list( 83 | pre = function(x, object) h2o::as.h2o(x), 84 | post = function(x, object) as.data.frame(x), 85 | func = c(pkg = "h2o", fun = "h2o.predict"), 86 | args = list( 87 | object = quote(object$fit), 88 | newdata = quote(new_data) 89 | ) 90 | ) 91 | ) 92 | } 93 | -------------------------------------------------------------------------------- /R/make_call.R: -------------------------------------------------------------------------------- 1 | make_h2o_call <- function(.fn, args, others) { 2 | # remove args with NULLs 3 | args <- args[lengths(args) != 0] 4 | 5 | # create unevaluated model call 6 | model_call <- rlang::call2(.fn = .fn, !!!args, .ns = "h2o") 7 | 8 | # add others if not NULL 9 | if (length(others) > 0) { 10 | model_call <- rlang::call_standardise(model_call) 11 | model_call <- rlang::call_modify(model_call, !!!others) 12 | } 13 | 14 | rlang::eval_tidy(model_call) 15 | } 16 | -------------------------------------------------------------------------------- /R/metrics.R: -------------------------------------------------------------------------------- 1 | #' Mean squared error 2 | #' 3 | #' Calculate the mean squared error. This metric is in squared units of the original data. 4 | #' 5 | #' @param data A `data.frame` containing the `truth` and `estimate` columns. 6 | #' @param truth The column identifier for the true class results (that is a 7 | #' factor). This should be an unquoted column name although this argument is 8 | #' passed by expression and supports quasiquotation (you can unquote column 9 | #' names). For _vec() functions, a factor vector. 10 | #' @param estimate The column identifier for the predicted class results (that 11 | #' is also factor). As with truth this can be specified different ways but the 12 | #' primary method is to use an unquoted variable name. For _vec() functions, a 13 | #' factor vector. 14 | #' @param na_rm A logical value indicating whether NA values should be stripped 15 | #' before the computation proceeds. 16 | #' @param ... Not currently used. 17 | #' 18 | #' @return A `tibble` with columns `.metric`, `.estimator`, and `.estimate` and 1 row of values. 19 | #' @export 20 | mse_vec <- function(truth, estimate, na_rm = TRUE, ...) { 21 | mse_impl <- function(truth, estimate) { 22 | mean((estimate - truth)^2) 23 | } 24 | 25 | yardstick::metric_vec_template( 26 | metric_impl = mse_impl, 27 | truth = truth, 28 | estimate = estimate, 29 | na_rm = na_rm, 30 | cls = "numeric", 31 | ... 32 | ) 33 | } 34 | 35 | #' @export 36 | #' @rdname mse_vec 37 | mse <- function(data, ...) { 38 | UseMethod("mse") 39 | } 40 | 41 | class(mse) <- c("numeric_metric", "function") 42 | attr(mse, "direction") <- "minimize" 43 | 44 | 45 | #' @export 46 | #' @rdname mse_vec 47 | mse.data.frame <- function(data, truth, estimate, na_rm = TRUE, ...) { 48 | yardstick::metric_summarizer( 49 | metric_nm = "mse", 50 | metric_fn = mse_vec, 51 | data = data, 52 | truth = !!rlang::enquo(truth), 53 | estimate = !!rlang::enquo(estimate), 54 | na_rm = na_rm, 55 | ... 56 | ) 57 | } 58 | 59 | #' @importFrom yardstick rsq rmse accuracy mn_log_loss pr_auc roc_auc 60 | convert_h2o_metrics <- function(metrics) { 61 | allowed_metrics <- c( 62 | # regression 63 | "yardstick::rsq", 64 | "yardstick::rmse", 65 | "h2oparsnip::mse", 66 | 67 | # classification 68 | "yardstick::accuracy", 69 | "yardstick::mn_log_loss", 70 | "yardstick::pr_auc", 71 | "yardstick::roc_auc" 72 | ) 73 | 74 | allowed_metrics <- 75 | c(allowed_metrics, gsub("yardstick::", "", allowed_metrics)) 76 | allowed_metrics <- 77 | c(allowed_metrics, gsub("h2oparsnip::", "", allowed_metrics)) 78 | 79 | if (any(!names(attributes(metrics)$metrics) %in% allowed_metrics)) { 80 | msg <- "`metrics` argument must contain a `yardstick::metric_set` with one or 81 | several of the following metrics:" 82 | rlang::abort(paste(msg, paste(allowed_metrics, collapse = ", "))) 83 | } 84 | 85 | metric_names <- names(attributes(metrics)$metric) 86 | metric_names <- gsub("yardstick::", "", metric_names) 87 | metric_names <- gsub("h2oparsnip::", "", metric_names) 88 | 89 | convert_metric <- function(yardstick_name) { 90 | switch(yardstick_name, 91 | # regression 92 | rsq = "r2", 93 | rmse = "rmse", 94 | mse = "mse", 95 | 96 | # classification 97 | accuracy = "accuracy", 98 | mn_log_loss = "logloss", 99 | roc_auc = "auc", 100 | pr_auc = "aucpr" 101 | ) 102 | } 103 | 104 | sapply(metric_names, convert_metric) 105 | } 106 | -------------------------------------------------------------------------------- /R/mlp.R: -------------------------------------------------------------------------------- 1 | add_mlp_h2o <- function() { 2 | parsnip::set_model_engine("mlp", "classification", "h2o") 3 | parsnip::set_model_engine("mlp", "regression", "h2o") 4 | parsnip::set_dependency("mlp", "h2o", "h2o") 5 | 6 | parsnip::set_model_arg( 7 | model = "mlp", 8 | eng = "h2o", 9 | parsnip = "cost", 10 | original = "l2", 11 | func = list(pkg = "dials", fun = "cost"), 12 | has_submodel = FALSE 13 | ) 14 | parsnip::set_model_arg( 15 | model = "mlp", 16 | eng = "h2o", 17 | parsnip = "dropout", 18 | original = "hidden_dropout_ratios", 19 | func = list(pkg = "dials", fun = "dropout"), 20 | has_submodel = FALSE 21 | ) 22 | parsnip::set_model_arg( 23 | model = "mlp", 24 | eng = "h2o", 25 | parsnip = "hidden_units", 26 | original = "hidden", 27 | func = list(pkg = "dials", fun = "hidden_units"), 28 | has_submodel = FALSE 29 | ) 30 | parsnip::set_model_arg( 31 | model = "mlp", 32 | eng = "h2o", 33 | parsnip = "epochs", 34 | original = "epochs", 35 | func = list(pkg = "dials", fun = "epochs"), 36 | has_submodel = FALSE 37 | ) 38 | parsnip::set_model_arg( 39 | model = "mlp", 40 | eng = "h2o", 41 | parsnip = "activation", 42 | original = "activation", 43 | func = list(pkg = "dials", fun = "activation"), 44 | has_submodel = FALSE 45 | ) 46 | parsnip::set_fit( 47 | model = "mlp", 48 | eng = "h2o", 49 | mode = "regression", 50 | value = list( 51 | interface = "formula", 52 | protect = c("formula", "x", "y", "training_frame"), 53 | func = c(fun = "h2o_mlp_train"), 54 | defaults = list() 55 | ) 56 | ) 57 | parsnip::set_fit( 58 | model = "mlp", 59 | eng = "h2o", 60 | mode = "classification", 61 | value = list( 62 | interface = "formula", 63 | protect = c("formula", "x", "y", "training_frame"), 64 | func = c(fun = "h2o_mlp_train"), 65 | defaults = list() 66 | ) 67 | ) 68 | parsnip::set_encoding( 69 | model = "mlp", 70 | eng = "h2o", 71 | mode = "classification", 72 | options = list( 73 | predictor_indicators = "none", 74 | compute_intercept = FALSE, 75 | remove_intercept = FALSE, 76 | allow_sparse_x = FALSE 77 | ) 78 | ) 79 | parsnip::set_encoding( 80 | model = "mlp", 81 | eng = "h2o", 82 | mode = "regression", 83 | options = list( 84 | predictor_indicators = "none", 85 | compute_intercept = FALSE, 86 | remove_intercept = FALSE, 87 | allow_sparse_x = FALSE 88 | ) 89 | ) 90 | 91 | # regression predict 92 | parsnip::set_pred( 93 | model = "mlp", 94 | eng = "h2o", 95 | mode = "regression", 96 | type = "numeric", 97 | value = list( 98 | pre = function(x, object) h2o::as.h2o(x), 99 | post = function(x, object) as.data.frame(x)$predict, 100 | func = c(pkg = "h2o", fun = "h2o.predict"), 101 | args = list( 102 | object = quote(object$fit), 103 | newdata = quote(new_data) 104 | ) 105 | ) 106 | ) 107 | parsnip::set_pred( 108 | model = "mlp", 109 | eng = "h2o", 110 | mode = "regression", 111 | type = "raw", 112 | value = list( 113 | pre = function(x, object) h2o::as.h2o(x), 114 | post = function(x, object) as.data.frame(x), 115 | func = c(pkg = "h2o", fun = "h2o.predict"), 116 | args = list( 117 | object = quote(object$fit), 118 | newdata = quote(new_data) 119 | ) 120 | ) 121 | ) 122 | 123 | # classification predict 124 | parsnip::set_pred( 125 | model = "mlp", 126 | eng = "h2o", 127 | mode = "classification", 128 | type = "class", 129 | value = list( 130 | pre = function(x, object) h2o::as.h2o(x), 131 | post = function(x, object) as.data.frame(x)$predict, 132 | func = c(pkg = "h2o", fun = "h2o.predict"), 133 | args = list( 134 | object = quote(object$fit), 135 | newdata = quote(new_data) 136 | ) 137 | ) 138 | ) 139 | parsnip::set_pred( 140 | model = "mlp", 141 | eng = "h2o", 142 | mode = "classification", 143 | type = "prob", 144 | value = list( 145 | pre = function(x, object) h2o::as.h2o(x), 146 | post = function(x, object) as.data.frame(x[, 2:ncol(x)]), 147 | func = c(pkg = "h2o", fun = "h2o.predict"), 148 | args = list( 149 | object = quote(object$fit), 150 | newdata = quote(new_data) 151 | ) 152 | ) 153 | ) 154 | parsnip::set_pred( 155 | model = "mlp", 156 | eng = "h2o", 157 | mode = "classification", 158 | type = "raw", 159 | value = list( 160 | pre = function(x, object) h2o::as.h2o(x), 161 | post = function(x, object) as.data.frame(x), 162 | func = c(pkg = "h2o", fun = "h2o.predict"), 163 | args = list( 164 | object = quote(object$fit), 165 | newdata = quote(new_data) 166 | ) 167 | ) 168 | ) 169 | } 170 | 171 | #' Wrapper for training a h2o.deeplearning model as part of a parsnip `mlp` 172 | #' h2o engine 173 | #' 174 | #' @param formula formula 175 | #' @param data data.frame of training data 176 | #' @param l2 numeric, l2 regulation parameter, default = 0 177 | #' @param hidden_dropout_ratios dropout ratio for a single hidden layer (default 178 | #' = 0) 179 | #' @param hidden integer, number of neurons in the hidden layer (default = c(200, 200)) 180 | #' @param epochs integer, number of epochs (default = 10) 181 | #' @param activation character, activation function. Must be one of: "Tanh", 182 | #' "TanhWithDropout", "Rectifier", "RectifierWithDropout", "Maxout", 183 | #' "MaxoutWithDropout". Defaults to "Rectifier. If `hidden_dropout_ratios` > 0 184 | #' then the equivalent activation function with dropout is used. 185 | #' @param stopping_rounds An integer specifying the number of training 186 | #' iterations without improvement before stopping. If `stopping_rounds = 0` 187 | #' (the default) then early stopping is disabled. If `validation` is used, 188 | #' performance is base on the validation set; otherwise the training set is 189 | #' used. 190 | #' @param validation A positive number. If on `[0, 1)` the value, `validation` 191 | #' is a random proportion of data in `x` and `y` that are used for performance 192 | #' assessment and potential early stopping. If 1 or greater, it is the _number_ 193 | #' of training set samples use for these purposes. 194 | #' @param ... other arguments not currently used 195 | #' 196 | #' @return evaluated h2o model call 197 | #' @export 198 | h2o_mlp_train <- 199 | function(formula, 200 | data, 201 | l2 = 0, 202 | hidden_dropout_ratios = 0, 203 | hidden = 100, 204 | epochs = 10, 205 | activation = "Rectifier", 206 | stopping_rounds = 0, 207 | validation = 0, 208 | ...) { 209 | others <- list(...) 210 | 211 | # get term names and convert to h2o 212 | X <- attr(stats::terms(formula, data = data), "term.labels") 213 | y <- all.vars(formula)[1] 214 | 215 | # early stopping 216 | if (validation > 1) { 217 | validation <- validation / nrow(data) 218 | } 219 | 220 | if (stopping_rounds > 0 & validation > 0) { 221 | n <- nrow(data) 222 | trn_index <- sample(1:n, size = floor(n * validation) + 1) 223 | valid <- data[-trn_index, ] 224 | data <- data[trn_index, ] 225 | } else { 226 | valid <- NULL 227 | } 228 | 229 | # convert to H2OFrame (although parsnip doesn't support H2OFrames right now) 230 | if (!inherits(data, "H2OFrame")) { 231 | data <- h2o::as.h2o(data) 232 | } 233 | 234 | if (!is.null(valid)) { 235 | valid <- h2o::as.h2o(valid) 236 | } 237 | 238 | # remap dials::values_activation to permissible h2o activation values 239 | if (activation %in% c("linear", "elu", "softmax")) { 240 | stop( 241 | paste( 242 | activation, 243 | "activation function is not available when using the h2o engine." 244 | ) 245 | ) 246 | } 247 | 248 | activation <- switch(activation, 249 | relu = "Rectifier", 250 | tanh = "Tanh", 251 | maxout = "Maxout", 252 | activation 253 | ) 254 | 255 | if (activation == "Rectifier" & hidden_dropout_ratios > 0) { 256 | activation <- "RectifierWithDropout" 257 | } else if (activation == "Tanh" & hidden_dropout_ratios > 0) { 258 | activation <- "TanhWithDropout" 259 | } else if (activation == "Maxout" & hidden_dropout_ratios > 0) { 260 | activation <- "MaxoutWithDropout" 261 | } 262 | 263 | if (hidden_dropout_ratios == 0) { 264 | hidden_dropout_ratios <- NULL 265 | } 266 | 267 | # define arguments 268 | args <- list( 269 | x = X, 270 | y = y, 271 | training_frame = data, 272 | validation_frame = valid, 273 | l2 = l2, 274 | hidden_dropout_ratios = hidden_dropout_ratios, 275 | hidden = hidden, 276 | epochs = epochs, 277 | activation = activation, 278 | stopping_rounds = stopping_rounds 279 | ) 280 | 281 | make_h2o_call("h2o.deeplearning", args, others) 282 | } 283 | -------------------------------------------------------------------------------- /R/multi_predicts.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | multi_predict._H2OMultinomialModel <- 3 | function(object, new_data, type = "class", ...) { 4 | args <- list(...) 5 | 6 | if (any(names(rlang::enquos(...)) == "newdata")) { 7 | rlang::abort("Did you mean to use `new_data` instead of `newdata`?") 8 | } 9 | 10 | res <- switch(object$fit@algorithm, 11 | gbm = gbm_multi_predict(object, new_data, type, args$trees), 12 | deeplearning = mlp_multi_predict(object, new_data, type, args$epochs) 13 | ) 14 | 15 | res 16 | } 17 | 18 | #' @export 19 | multi_predict._H2ORegressionModel <- 20 | function(object, new_data, type = "numeric", ...) { 21 | args <- list(...) 22 | 23 | if (any(names(rlang::enquos(...)) == "newdata")) { 24 | rlang::abort("Did you mean to use `new_data` instead of `newdata`?") 25 | } 26 | 27 | res <- switch(object$fit@algorithm, 28 | gbm = gbm_multi_predict(object, new_data, type, args$trees), 29 | deeplearning = mlp_multi_predict(object, new_data, type, args$epochs) 30 | ) 31 | 32 | res 33 | } 34 | 35 | 36 | gbm_multi_predict <- function(object, new_data, type, trees) { 37 | trees <- sort(trees) 38 | 39 | preds <- h2o::staged_predict_proba.H2OModel( 40 | object = object$fit, 41 | newdata = h2o::as.h2o(new_data) 42 | ) 43 | 44 | preds <- as.data.frame(preds) 45 | 46 | if (type %in% c("class", "prob")) { 47 | n_classes <- length(object$lvl) 48 | 49 | res <- purrr::map(trees, function(tree) { 50 | pred_class_names <- paste0("C", seq_len(n_classes)) 51 | tree_names <- paste0("T", tree) 52 | x <- preds[, paste(tree_names, pred_class_names, sep = ".")] 53 | colnames(x) <- object$lvl 54 | x <- as.data.frame(x) 55 | x$predict <- object$lvl[apply(x, 1, which.max)] 56 | x$trees <- tree 57 | x 58 | }) 59 | } else if (type == "numeric") { 60 | res <- purrr::map(trees, function(tree) { 61 | tibble(trees = tree, .pred = preds[, tree]) 62 | }) 63 | } 64 | 65 | # prepare predictions in parsnip format 66 | if (type == "class") { 67 | res <- purrr::map( 68 | res, 69 | ~ dplyr::select(.x, trees, predict) %>% 70 | dplyr::rename(.pred_class = predict) %>% 71 | dplyr::mutate( 72 | .pred_class = factor(.pred_class, levels = object$lvl), 73 | .row = 1:max(dplyr::row_number()) 74 | ) 75 | ) 76 | } else if (type == "numeric") { 77 | res <- purrr::map( 78 | res, 79 | ~ dplyr::mutate(.x, .row = 1:max(dplyr::row_number())) 80 | ) 81 | } else if (type == "prob") { 82 | new_names <- object$lvl 83 | names(new_names) <- paste(".pred", object$lvl, sep = "_") 84 | 85 | res <- purrr::map( 86 | res, 87 | ~ dplyr::select(.x, trees, !!!object$lvl) %>% 88 | dplyr::rename(!!new_names) 89 | ) 90 | 91 | res <- res %>% 92 | dplyr::mutate(.row = 1:max(dplyr::row_number())) 93 | } 94 | 95 | res <- dplyr::bind_rows(res) 96 | res <- dplyr::arrange(res, .row, trees) 97 | res <- split(res[, -ncol(res)], res$.row) 98 | 99 | tibble(.pred = res) 100 | } 101 | 102 | 103 | mlp_multi_predict <- function(object, new_data, type, epochs) { 104 | epochs <- sort(epochs) 105 | new_data <- h2o::as.h2o(new_data) 106 | 107 | preds <- purrr::map(epochs, function(epoch) { 108 | model <- h2o::h2o.deeplearning( 109 | x = object$fit@parameters$x, 110 | y = object$fit@parameters$y, 111 | training_frame = object$fit@parameters$training_frame, 112 | checkpoint = object$fit@model_id, 113 | epochs = epoch, 114 | hidden = object$fit@parameters$hidden, 115 | l2 = object$fit@parameters$l2, 116 | hidden_dropout_ratios = object$fit@parameters$hidden_dropout_ratios, 117 | activation = object$fit@parameters$activation 118 | ) 119 | 120 | predict(model, new_data) %>% 121 | as.data.frame() 122 | }) 123 | 124 | # prepare predictions in parsnip format 125 | if (type == "class") { 126 | res <- purrr::map2( 127 | preds, 128 | epochs, 129 | ~ dplyr::select(.x, predict) %>% 130 | dplyr::rename(.pred_class = predict) %>% 131 | dplyr::mutate( 132 | epochs = .y, 133 | .pred_class = factor(.pred_class, levels = object$lvl), 134 | .row = 1:max(dplyr::row_number()) 135 | ) 136 | ) 137 | } else if (type == "prob") { 138 | new_names <- object$lvl 139 | names(new_names) <- paste(".pred", object$lvl, sep = "_") 140 | 141 | res <- purrr::map2( 142 | preds, 143 | epochs, 144 | ~ dplyr::select(.x, !!!object$lvl) %>% 145 | dplyr::rename(!!!new_names) %>% 146 | dplyr::mutate(epochs = .y, .row = 1:max(dplyr::row_number())) 147 | ) 148 | } else if (type == "numeric") { 149 | res <- purrr::map2(preds, epochs, function(x, epoch) { 150 | tibble(epochs = epoch, .pred = x$predict) 151 | }) 152 | 153 | res <- purrr::map(res, ~ dplyr::mutate(.x, .row = 1:max(dplyr::row_number()))) 154 | } 155 | 156 | res <- dplyr::bind_rows(res) 157 | res <- dplyr::arrange(res, .row, epochs) 158 | res <- split(res[, -ncol(res)], res$.row) 159 | 160 | tibble(.pred = res) 161 | } 162 | -------------------------------------------------------------------------------- /R/multinomial_reg.R: -------------------------------------------------------------------------------- 1 | add_multinom_reg_h2o <- function() { 2 | parsnip::set_model_engine("multinom_reg", "classification", "h2o") 3 | parsnip::set_dependency("multinom_reg", "h2o", "h2o") 4 | 5 | parsnip::set_model_arg( 6 | model = "multinom_reg", 7 | eng = "h2o", 8 | parsnip = "mixture", 9 | original = "alpha", 10 | func = list(pkg = "dials", fun = "mixture"), 11 | has_submodel = FALSE 12 | ) 13 | parsnip::set_model_arg( 14 | model = "multinom_reg", 15 | eng = "h2o", 16 | parsnip = "penalty", 17 | original = "lambda", 18 | func = list(pkg = "dials", fun = "penalty"), 19 | has_submodel = FALSE 20 | ) 21 | parsnip::set_fit( 22 | model = "multinom_reg", 23 | eng = "h2o", 24 | mode = "classification", 25 | value = list( 26 | interface = "formula", 27 | protect = c("formula", "x", "y", "training_frame", "family"), 28 | func = c(fun = "h2o_glm_train"), 29 | defaults = list( 30 | family = "multinomial" 31 | ) 32 | ) 33 | ) 34 | parsnip::set_encoding( 35 | model = "multinom_reg", 36 | eng = "h2o", 37 | mode = "classification", 38 | options = list( 39 | predictor_indicators = "none", 40 | compute_intercept = FALSE, 41 | remove_intercept = FALSE, 42 | allow_sparse_x = FALSE 43 | ) 44 | ) 45 | 46 | # classification predict 47 | parsnip::set_pred( 48 | model = "multinom_reg", 49 | eng = "h2o", 50 | mode = "classification", 51 | type = "class", 52 | value = list( 53 | pre = function(x, object) h2o::as.h2o(x), 54 | post = function(x, object) as.data.frame(x)$predict, 55 | func = c(pkg = "h2o", fun = "h2o.predict"), 56 | args = list( 57 | object = quote(object$fit), 58 | newdata = quote(new_data) 59 | ) 60 | ) 61 | ) 62 | parsnip::set_pred( 63 | model = "multinom_reg", 64 | eng = "h2o", 65 | mode = "classification", 66 | type = "prob", 67 | value = list( 68 | pre = function(x, object) h2o::as.h2o(x), 69 | post = function(x, object) as.data.frame(x[, 2:ncol(x)]), 70 | func = c(pkg = "h2o", fun = "h2o.predict"), 71 | args = list( 72 | object = quote(object$fit), 73 | newdata = quote(new_data) 74 | ) 75 | ) 76 | ) 77 | parsnip::set_pred( 78 | model = "multinom_reg", 79 | eng = "h2o", 80 | mode = "classification", 81 | type = "raw", 82 | value = list( 83 | pre = function(x, object) h2o::as.h2o(x), 84 | post = function(x, object) as.data.frame(x), 85 | func = c(pkg = "h2o", fun = "h2o.predict"), 86 | args = list( 87 | object = quote(object$fit), 88 | newdata = quote(new_data) 89 | ) 90 | ) 91 | ) 92 | } 93 | -------------------------------------------------------------------------------- /R/naive_Bayes.R: -------------------------------------------------------------------------------- 1 | add_naive_Bayes_h2o <- function() { 2 | parsnip::set_model_engine("naive_Bayes", "classification", "h2o") 3 | parsnip::set_dependency("naive_Bayes", "h2o", "h2o") 4 | 5 | parsnip::set_model_arg( 6 | model = "naive_Bayes", 7 | eng = "h2o", 8 | parsnip = "Laplace", 9 | original = "laplace", 10 | func = list(pkg = "dials", fun = "Laplace"), 11 | has_submodel = FALSE 12 | ) 13 | 14 | # fit 15 | parsnip::set_fit( 16 | model = "naive_Bayes", 17 | eng = "h2o", 18 | mode = "classification", 19 | value = list( 20 | interface = "formula", 21 | protect = c("formula", "x", "y", "training_frame"), 22 | func = c(fun = "h2o_naiveBayes_train"), 23 | defaults = list() 24 | ) 25 | ) 26 | parsnip::set_encoding( 27 | model = "naive_Bayes", 28 | eng = "h2o", 29 | mode = "classification", 30 | options = list( 31 | predictor_indicators = "none", 32 | compute_intercept = FALSE, 33 | remove_intercept = FALSE, 34 | allow_sparse_x = FALSE 35 | ) 36 | ) 37 | 38 | # classification predict 39 | parsnip::set_pred( 40 | model = "naive_Bayes", 41 | eng = "h2o", 42 | mode = "classification", 43 | type = "class", 44 | value = list( 45 | pre = function(x, object) h2o::as.h2o(x), 46 | post = function(x, object) as.data.frame(x)$predict, 47 | func = c(pkg = "h2o", fun = "h2o.predict"), 48 | args = list( 49 | object = quote(object$fit), 50 | newdata = quote(new_data) 51 | ) 52 | ) 53 | ) 54 | parsnip::set_pred( 55 | model = "naive_Bayes", 56 | eng = "h2o", 57 | mode = "classification", 58 | type = "prob", 59 | value = list( 60 | pre = function(x, object) h2o::as.h2o(x), 61 | post = function(x, object) as.data.frame(x[, 2:ncol(x)]), 62 | func = c(pkg = "h2o", fun = "h2o.predict"), 63 | args = list( 64 | object = quote(object$fit), 65 | newdata = quote(new_data) 66 | ) 67 | ) 68 | ) 69 | parsnip::set_pred( 70 | model = "naive_Bayes", 71 | eng = "h2o", 72 | mode = "classification", 73 | type = "raw", 74 | value = list( 75 | pre = function(x, object) h2o::as.h2o(x), 76 | post = function(x, object) as.data.frame(x), 77 | func = c(pkg = "h2o", fun = "h2o.predict"), 78 | args = list( 79 | object = quote(object$fit), 80 | newdata = quote(new_data) 81 | ) 82 | ) 83 | ) 84 | } 85 | 86 | #' Wrapper for training a h2o.naiveBayes model as part of a discrim `naive_Bayes` 87 | #' h2o engine 88 | #' 89 | #' @param formula formula 90 | #' @param data data.frame of training data 91 | #' @param laplace numeric, the Laplace smoothing parameter, must be >= 0. 92 | #' @param ... other arguments passed to the h2o engine. 93 | #' 94 | #' @return a fitted h2o model. 95 | #' @export 96 | h2o_naiveBayes_train <- 97 | function(formula, 98 | data, 99 | laplace = 0, 100 | ...) { 101 | others <- list(...) 102 | 103 | # get term names and convert to h2o 104 | X <- attr(stats::terms(formula, data = data), "term.labels") 105 | y <- all.vars(formula)[1] 106 | 107 | # convert to H2OFrame (although parsnip doesn't support H2OFrames right now) 108 | if (!inherits(data, "H2OFrame")) { 109 | data <- h2o::as.h2o(data) 110 | } 111 | 112 | # check arguments 113 | if (laplace < 0) { 114 | laplace <- 0 115 | } 116 | 117 | # define arguments 118 | args <- list( 119 | x = X, 120 | y = y, 121 | training_frame = data, 122 | laplace = laplace 123 | ) 124 | 125 | res <- make_h2o_call("h2o.naiveBayes", args, others) 126 | 127 | res 128 | } 129 | -------------------------------------------------------------------------------- /R/rand_forest.R: -------------------------------------------------------------------------------- 1 | add_rand_forest_h2o <- function() { 2 | parsnip::set_model_engine("rand_forest", "classification", "h2o") 3 | parsnip::set_model_engine("rand_forest", "regression", "h2o") 4 | parsnip::set_dependency("rand_forest", "h2o", "h2o") 5 | 6 | parsnip::set_model_arg( 7 | model = "rand_forest", 8 | eng = "h2o", 9 | parsnip = "trees", 10 | original = "ntrees", 11 | func = list(pkg = "dials", fun = "trees"), 12 | has_submodel = FALSE 13 | ) 14 | parsnip::set_model_arg( 15 | model = "rand_forest", 16 | eng = "h2o", 17 | parsnip = "min_n", 18 | original = "min_rows", 19 | func = list(pkg = "dials", fun = "min_n"), 20 | has_submodel = FALSE 21 | ) 22 | parsnip::set_model_arg( 23 | model = "rand_forest", 24 | eng = "h2o", 25 | parsnip = "mtry", 26 | original = "mtries", 27 | func = list(pkg = "dials", fun = "mtry"), 28 | has_submodel = FALSE 29 | ) 30 | parsnip::set_fit( 31 | model = "rand_forest", 32 | eng = "h2o", 33 | mode = "regression", 34 | value = list( 35 | interface = "formula", 36 | protect = c("formula", "x", "y", "training_frame"), 37 | func = c(fun = "h2o_rf_train"), 38 | defaults = list() 39 | ) 40 | ) 41 | parsnip::set_fit( 42 | model = "rand_forest", 43 | eng = "h2o", 44 | mode = "classification", 45 | value = list( 46 | interface = "formula", 47 | protect = c("formula", "x", "y", "training_frame"), 48 | func = c(fun = "h2o_rf_train"), 49 | defaults = list() 50 | ) 51 | ) 52 | parsnip::set_encoding( 53 | model = "rand_forest", 54 | eng = "h2o", 55 | mode = "classification", 56 | options = list( 57 | predictor_indicators = "none", 58 | compute_intercept = FALSE, 59 | remove_intercept = FALSE, 60 | allow_sparse_x = FALSE 61 | ) 62 | ) 63 | parsnip::set_encoding( 64 | model = "rand_forest", 65 | eng = "h2o", 66 | mode = "regression", 67 | options = list( 68 | predictor_indicators = "none", 69 | compute_intercept = FALSE, 70 | remove_intercept = FALSE, 71 | allow_sparse_x = FALSE 72 | ) 73 | ) 74 | 75 | # regression predict 76 | parsnip::set_pred( 77 | model = "rand_forest", 78 | eng = "h2o", 79 | mode = "regression", 80 | type = "numeric", 81 | value = list( 82 | pre = function(x, object) h2o::as.h2o(x), 83 | post = function(x, object) as.data.frame(x)$predict, 84 | func = c(pkg = "h2o", fun = "h2o.predict"), 85 | args = list( 86 | object = quote(object$fit), 87 | newdata = quote(new_data) 88 | ) 89 | ) 90 | ) 91 | parsnip::set_pred( 92 | model = "rand_forest", 93 | eng = "h2o", 94 | mode = "regression", 95 | type = "raw", 96 | value = list( 97 | pre = function(x, object) h2o::as.h2o(x), 98 | post = function(x, object) as.data.frame(x), 99 | func = c(pkg = "h2o", fun = "h2o.predict"), 100 | args = list( 101 | object = quote(object$fit), 102 | newdata = quote(new_data) 103 | ) 104 | ) 105 | ) 106 | 107 | # classification predict 108 | parsnip::set_pred( 109 | model = "rand_forest", 110 | eng = "h2o", 111 | mode = "classification", 112 | type = "class", 113 | value = list( 114 | pre = function(x, object) h2o::as.h2o(x), 115 | post = function(x, object) as.data.frame(x)$predict, 116 | func = c(pkg = "h2o", fun = "h2o.predict"), 117 | args = list( 118 | object = quote(object$fit), 119 | newdata = quote(new_data) 120 | ) 121 | ) 122 | ) 123 | parsnip::set_pred( 124 | model = "rand_forest", 125 | eng = "h2o", 126 | mode = "classification", 127 | type = "prob", 128 | value = list( 129 | pre = function(x, object) h2o::as.h2o(x), 130 | post = function(x, object) as.data.frame(x[, 2:ncol(x)]), 131 | func = c(pkg = "h2o", fun = "h2o.predict"), 132 | args = list( 133 | object = quote(object$fit), 134 | newdata = quote(new_data) 135 | ) 136 | ) 137 | ) 138 | parsnip::set_pred( 139 | model = "rand_forest", 140 | eng = "h2o", 141 | mode = "classification", 142 | type = "raw", 143 | value = list( 144 | pre = function(x, object) h2o::as.h2o(x), 145 | post = function(x, object) as.data.frame(x), 146 | func = c(pkg = "h2o", fun = "h2o.predict"), 147 | args = list( 148 | object = quote(object$fit), 149 | newdata = quote(new_data) 150 | ) 151 | ) 152 | ) 153 | } 154 | 155 | #' Wrapper for training a h2o.randomForest model as part of a parsnip 156 | #' `rand_forest` h2o engine 157 | #' 158 | #' @param formula formula 159 | #' @param data data.frame of training data 160 | #' @param ntrees integer, the number of trees to build (default = 50) 161 | #' @param min_rows integer, the minimum number of observations for a leaf 162 | #' (default = 10) 163 | #' @param mtries integer, the number of columns to randomly select at each 164 | #' level. Default of -1 is sqrt(p) for classification and (p/3) for regression. 165 | #' @param stopping_rounds An integer specifying the number of training 166 | #' iterations without improvement before stopping. If `stopping_rounds = 0` 167 | #' (the default) then early stopping is disabled. If `validation` is used, 168 | #' performance is base on the validation set; otherwise the training set is 169 | #' used. 170 | #' @param validation A positive number. If on `[0, 1)` the value, `validation` 171 | #' is a random proportion of data in `x` and `y` that are used for performance 172 | #' assessment and potential early stopping. If 1 or greater, it is the _number_ 173 | #' of training set samples use for these purposes. 174 | #' @param ... other arguments not currently used 175 | #' 176 | #' @return evaluated h2o model call 177 | #' @export 178 | h2o_rf_train <- 179 | function(formula, 180 | data, 181 | ntrees = 50, 182 | min_rows = 10, 183 | mtries = -1, 184 | stopping_rounds = 0, 185 | validation = 0, 186 | ...) { 187 | others <- list(...) 188 | 189 | # get term names and convert to h2o 190 | X <- attr(stats::terms(formula, data = data), "term.labels") 191 | y <- all.vars(formula)[1] 192 | 193 | # early stopping 194 | if (validation > 1) { 195 | validation <- validation / nrow(data) 196 | } 197 | 198 | if (stopping_rounds > 0 & validation > 0) { 199 | n <- nrow(data) 200 | trn_index <- sample(1:n, size = floor(n * validation) + 1) 201 | valid <- data[-trn_index, ] 202 | data <- data[trn_index, ] 203 | } else { 204 | valid <- NULL 205 | } 206 | 207 | # convert to H2OFrame (although parsnip doesn't support H2OFrames right now) 208 | if (!inherits(data, "H2OFrame")) { 209 | data <- h2o::as.h2o(data) 210 | } 211 | 212 | if (!is.null(valid)) { 213 | valid <- h2o::as.h2o(valid) 214 | } 215 | 216 | # check for valid mtries 217 | if (mtries > length(X)) { 218 | mtries <- length(X) 219 | } 220 | 221 | # define arguments 222 | args <- list( 223 | x = X, 224 | y = y, 225 | training_frame = data, 226 | validation_frame = valid, 227 | ntrees = ntrees, 228 | min_rows = min_rows, 229 | mtries = mtries, 230 | stopping_rounds = stopping_rounds 231 | ) 232 | 233 | res <- make_h2o_call("h2o.randomForest", args, others) 234 | 235 | res 236 | } 237 | -------------------------------------------------------------------------------- /R/read_h2o.R: -------------------------------------------------------------------------------- 1 | #' Saves an H2O model to file that is contained within a fitted parsnip model 2 | #' specification or contained within a workflow 3 | #' 4 | #' H2O models cannot be saved using the typical R approaches such as saveRDS 5 | #' because the actual H2O model is contained within a Java virtual machine. H2O 6 | #' models need to be saved and restored using the `h2o::h2o.saveModel` and 7 | #' `h2o::h2o.loadModel` functions. This is inconvenient for using H2O models 8 | #' contained within parsnip model specifications or workflow objects. 9 | #' 10 | #' The `write_h2o` function extracts the H2O model from within a parsnip or 11 | #' workflow fitted model and saves it to file using the `h2o::h2o.saveModel` 12 | #' function. To restore a model and insert it back into a previously fitted 13 | #' model use the `read_h2o` function. 14 | #' 15 | #' @param object Either a `workflows::workflow()` object contained a fitted 16 | #' model when using the workflows package, or a `model_spec` object from a 17 | #' fitted model when using the parsnip package directly. 18 | #' @param filename A `character` specifying the file path used to save the 19 | #' model. H2O models do not require a specific file extension. 20 | #' @param ... Currently not used. 21 | #' 22 | #' @return The file path used to save the model. 23 | #' @export 24 | #' 25 | #' @examples 26 | #' library(parsnip) 27 | #' library(h2o) 28 | #' 29 | #' # start a h2o session 30 | #' h2o.init() 31 | #' 32 | #' # fit a parsnip model using the h2o engine 33 | #' clf <- mlp(mode = "classification") %>% 34 | #' set_engine("h2o") 35 | #' 36 | #' model_fit <- clf %>% fit(Species ~ ., iris) 37 | #' 38 | #' # save the parsnip model 39 | #' saveRDS(model_fit, file.path(tempdir(), "my_model.rds")) 40 | #' 41 | #' # save the h2o component of the model 42 | #' write_h2o(object = model_fit, filename = file.path(tempdir(), "my_h2o_model.mod")) 43 | #' h2o.shutdown(prompt = FALSE) 44 | #' 45 | #' # restore a model 46 | #' h2o.init() 47 | #' model_fit <- readRDS(file.path(tempdir(), "my_model.rds")) 48 | #' 49 | #' # read and insert the H2O model back into the parsnip object 50 | #' model_fit <- read_h2o(model_fit, file.path(tempdir(), "my_h2o_model.mod")) 51 | write_h2o <- function(object, filename, ...) { 52 | UseMethod("write_h2o", object) 53 | } 54 | 55 | 56 | #' @export 57 | #' @rdname write_h2o 58 | read_h2o <- function(object, filename, ...) { 59 | UseMethod("read_h2o", object) 60 | } 61 | 62 | 63 | #' @export 64 | #' @rdname write_h2o 65 | write_h2o.workflow <- function(object, filename, ...) { 66 | # extract model from workflow 67 | parsnip_model_spec <- workflows::pull_workflow_fit(object) 68 | 69 | # save H2O model which uses the model ID as the filename 70 | directory <- dirname(filename) 71 | fileout <- h2o::h2o.saveModel( 72 | object = parsnip_model_spec$fit, 73 | path = directory, 74 | force = TRUE 75 | ) 76 | 77 | # rename the file 78 | file.rename(fileout, filename) 79 | 80 | filename 81 | } 82 | 83 | 84 | #' @export 85 | #' @rdname write_h2o 86 | write_h2o.model_fit <- function(object, filename, ...) { 87 | # extract the h2o model 88 | fit_obj <- object$fit 89 | 90 | # save H2O model which uses the model ID as the filename 91 | directory <- dirname(filename) 92 | fileout <- h2o::h2o.saveModel( 93 | object = fit_obj, 94 | path = directory, 95 | force = TRUE 96 | ) 97 | 98 | # rename the file 99 | file.rename(fileout, filename) 100 | 101 | filename 102 | } 103 | 104 | 105 | #' @export 106 | #' @rdname write_h2o 107 | read_h2o.workflow <- function(object, filename, ...) { 108 | h2o_model <- h2o::h2o.loadModel(filename) 109 | object$fit$fit$fit <- h2o_model 110 | object 111 | } 112 | 113 | 114 | #' @export 115 | #' @rdname write_h2o 116 | read_h2o.model_fit <- function(object, filename, ...) { 117 | h2o_model <- h2o::h2o.loadModel(filename) 118 | object$fit <- h2o_model 119 | object 120 | } 121 | -------------------------------------------------------------------------------- /R/rule_fit.R: -------------------------------------------------------------------------------- 1 | add_rule_fit_h2o <- function() { 2 | parsnip::set_model_engine("rule_fit", "classification", "h2o") 3 | parsnip::set_model_engine("rule_fit", "regression", "h2o") 4 | parsnip::set_dependency("rule_fit", "h2o", "h2o") 5 | 6 | parsnip::set_model_arg( 7 | model = "rule_fit", 8 | eng = "h2o", 9 | parsnip = "trees", 10 | original = "rule_generation_ntrees", 11 | func = list(pkg = "dials", fun = "trees"), 12 | has_submodel = FALSE 13 | ) 14 | parsnip::set_model_arg( 15 | model = "rule_fit", 16 | eng = "h2o", 17 | parsnip = "tree_depth", 18 | original = "max_rule_length", 19 | func = list(pkg = "dials", fun = "tree_depth"), 20 | has_submodel = FALSE 21 | ) 22 | parsnip::set_model_arg( 23 | model = "rule_fit", 24 | eng = "h2o", 25 | parsnip = "penalty", 26 | original = "lambda", 27 | func = list(pkg = "dials", fun = "penalty"), 28 | has_submodel = FALSE 29 | ) 30 | parsnip::set_fit( 31 | model = "rule_fit", 32 | eng = "h2o", 33 | mode = "regression", 34 | value = list( 35 | interface = "formula", 36 | protect = c("formula", "x", "y", "training_frame"), 37 | func = c(fun = "h2o_rulefit_train"), 38 | defaults = list() 39 | ) 40 | ) 41 | parsnip::set_fit( 42 | model = "rule_fit", 43 | eng = "h2o", 44 | mode = "classification", 45 | value = list( 46 | interface = "formula", 47 | protect = c("formula", "x", "y", "training_frame"), 48 | func = c(fun = "h2o_rulefit_train"), 49 | defaults = list() 50 | ) 51 | ) 52 | parsnip::set_encoding( 53 | model = "rule_fit", 54 | eng = "h2o", 55 | mode = "classification", 56 | options = list( 57 | predictor_indicators = "none", 58 | compute_intercept = FALSE, 59 | remove_intercept = FALSE, 60 | allow_sparse_x = FALSE 61 | ) 62 | ) 63 | parsnip::set_encoding( 64 | model = "rule_fit", 65 | eng = "h2o", 66 | mode = "regression", 67 | options = list( 68 | predictor_indicators = "none", 69 | compute_intercept = FALSE, 70 | remove_intercept = FALSE, 71 | allow_sparse_x = FALSE 72 | ) 73 | ) 74 | 75 | # regression predict 76 | parsnip::set_pred( 77 | model = "rule_fit", 78 | eng = "h2o", 79 | mode = "regression", 80 | type = "numeric", 81 | value = list( 82 | pre = function(x, object) h2o::as.h2o(x), 83 | post = function(x, object) as.data.frame(x)$predict, 84 | func = c(pkg = "h2o", fun = "h2o.predict"), 85 | args = list( 86 | object = quote(object$fit), 87 | newdata = quote(new_data) 88 | ) 89 | ) 90 | ) 91 | parsnip::set_pred( 92 | model = "rule_fit", 93 | eng = "h2o", 94 | mode = "regression", 95 | type = "raw", 96 | value = list( 97 | pre = function(x, object) h2o::as.h2o(x), 98 | post = function(x, object) as.data.frame(x), 99 | func = c(pkg = "h2o", fun = "h2o.predict"), 100 | args = list( 101 | object = quote(object$fit), 102 | newdata = quote(new_data) 103 | ) 104 | ) 105 | ) 106 | 107 | # classification predict 108 | parsnip::set_pred( 109 | model = "rule_fit", 110 | eng = "h2o", 111 | mode = "classification", 112 | type = "class", 113 | value = list( 114 | pre = function(x, object) h2o::as.h2o(x), 115 | post = function(x, object) as.data.frame(x)$predict, 116 | func = c(pkg = "h2o", fun = "h2o.predict"), 117 | args = list( 118 | object = quote(object$fit), 119 | newdata = quote(new_data) 120 | ) 121 | ) 122 | ) 123 | parsnip::set_pred( 124 | model = "rule_fit", 125 | eng = "h2o", 126 | mode = "classification", 127 | type = "prob", 128 | value = list( 129 | pre = function(x, object) h2o::as.h2o(x), 130 | post = function(x, object) as.data.frame(x[, 2:ncol(x)]), 131 | func = c(pkg = "h2o", fun = "h2o.predict"), 132 | args = list( 133 | object = quote(object$fit), 134 | newdata = quote(new_data) 135 | ) 136 | ) 137 | ) 138 | parsnip::set_pred( 139 | model = "rule_fit", 140 | eng = "h2o", 141 | mode = "classification", 142 | type = "raw", 143 | value = list( 144 | pre = function(x, object) h2o::as.h2o(x), 145 | post = function(x, object) as.data.frame(x), 146 | func = c(pkg = "h2o", fun = "h2o.predict"), 147 | args = list( 148 | object = quote(object$fit), 149 | newdata = quote(new_data) 150 | ) 151 | ) 152 | ) 153 | } 154 | 155 | #' Wrapper for training a h2o.rulefit model as part of a parsnip 156 | #' `rule_fit` h2o engine 157 | #' 158 | #' @param formula formula 159 | #' @param data data.frame of training data 160 | #' @param rule_generation_ntrees integer, the number of trees to build (default 161 | #' = 50) 162 | #' @param max_rule_length integer, the maximum tree depth (default = 3). 163 | #' @param lambda Specify the regularization strength for LASSO regressor. 164 | #' @param ... other arguments that are passed to the h2o model 165 | #' 166 | #' @return evaluated h2o model call 167 | #' @export 168 | h2o_rulefit_train <- 169 | function(formula, 170 | data, 171 | rule_generation_ntrees = 50, 172 | max_rule_length = 3, 173 | lambda = 0, 174 | ...) { 175 | others <- list(...) 176 | 177 | # get term names and convert to h2o 178 | X <- attr(stats::terms(formula, data = data), "term.labels") 179 | y <- all.vars(formula)[1] 180 | 181 | # convert to H2OFrame (although parsnip doesn't support H2OFrames right now) 182 | if (!inherits(data, "H2OFrame")) { 183 | data <- h2o::as.h2o(data) 184 | } 185 | 186 | # define arguments 187 | args <- list( 188 | x = X, 189 | y = y, 190 | training_frame = data, 191 | rule_generation_ntrees = rule_generation_ntrees, 192 | max_rule_length = max_rule_length, 193 | lambda = lambda 194 | ) 195 | 196 | res <- make_h2o_call("h2o.rulefit", args, others) 197 | 198 | res 199 | } 200 | -------------------------------------------------------------------------------- /R/translate_args.R: -------------------------------------------------------------------------------- 1 | translate_args <- function(model_name) { 2 | envir <- parsnip::get_model_env() 3 | 4 | args <- tibble::tibble(ls(envir)) 5 | args <- rlang::set_names(args, "name") 6 | args <- args[grepl("args", args$name), ] 7 | 8 | args$model <- sub("_args", "", args$name) 9 | args$args <- lapply(args$name, function(x) envir[[x]]) 10 | 11 | args <- args %>% 12 | tidyr::unnest("args") %>% 13 | dplyr::select(!!rlang::sym("model"):!!rlang::sym("original")) 14 | 15 | args <- args[args$model == model_name, ] 16 | 17 | args %>% 18 | dplyr::select(-dplyr::one_of("model")) %>% 19 | tidyr::pivot_wider( 20 | names_from = !!rlang::sym("engine"), 21 | values_from = !!rlang::sym("original") 22 | ) 23 | } 24 | 25 | rename_list <- function(x, new_names) { 26 | rename_args <- 27 | sapply(names(x), function(nm) { 28 | if (nm %in% names(new_names)) { 29 | new_names[names(new_names) == nm] 30 | } else { 31 | nm 32 | } 33 | }, 34 | USE.NAMES = FALSE 35 | ) 36 | names(x) <- rename_args 37 | x 38 | } 39 | -------------------------------------------------------------------------------- /R/tune_grid_h2o.R: -------------------------------------------------------------------------------- 1 | #' Tune h2o models 2 | #' 3 | #' This is a prototype of a version of tune_grid that uses h2o.grid to perform 4 | #' hyperparameter tuning. 5 | #' 6 | #' @section Limitations: 7 | #' - Only model arguments can be tuned, not arguments in the preprocessing 8 | #' recipes. 9 | #' 10 | #' - Parsnip only allows `data.frame` and `tbl_spark` objects to be passed to 11 | #' the `fit` method, not `H2OFrame` objects. 12 | #' 13 | #' @param object A parsnip `model_spec` object. 14 | #' @param preprocessor A `recipe` object. 15 | #' @param resamples An `rset` object. 16 | #' @param param_info A `dials::parameters()` object or NULL. If none is given, a 17 | #' parameters set is derived from other arguments. Passing this argument can 18 | #' be useful when parameter ranges need to be customized. 19 | #' @param grid A `data.frame` of tuning combinations or a positive integer. The 20 | #' data frame should have columns for each parameter being tuned and rows for 21 | #' tuning parameter candidates. An integer denotes the number of candidate 22 | #' parameter sets to be created automatically. If a positive integer is used 23 | #' or no tuning grid is supplied, then a semi-random grid via 24 | #' `dials::grid_latin_hypercube` is created based on the specified number of 25 | #' tuning iterations (default size = 10). 26 | #' @param metrics A `yardstick::metric_set` or NULL. Note that not all yardstick 27 | #' metrics can be used with `tune_grid_h2o`. The metrics must be one of 28 | #' `yardstick::rsq`, `yardstick::rmse` or `h2oparsnip::mse` for regression 29 | #' models, and `yardstick::accuracy`, `yardstick::mn_log_loss`, 30 | #' `yardstick::roc_auc` or `yardstick::pr_auc` for classification models. If 31 | #' NULL then the default is `yardstick::rsq` for regression models and 32 | #' `yardstick::mn_log_loss` for classification models. 33 | #' @param control An object used to modify the tuning process. 34 | #' @param ... Not currently used. 35 | #' 36 | #' @return 37 | #' @export 38 | tune_grid_h2o <- 39 | function(object, 40 | preprocessor = NULL, 41 | resamples, 42 | param_info = NULL, 43 | grid = 10, 44 | metrics = NULL, 45 | control = control_h2o(), 46 | ...) { 47 | 48 | # check parameters 49 | if (inherits(object, "workflow")) { 50 | preprocessor <- workflows::pull_workflow_preprocessor(object) 51 | object <- workflows::pull_workflow_spec(object) 52 | } 53 | 54 | if (is.null(param_info)) { 55 | param_info <- tune::parameters(object) 56 | } 57 | 58 | if (inherits(grid, "numeric")) { 59 | grid <- dials::grid_latin_hypercube(param_info, size = grid) 60 | } 61 | 62 | if (!inherits(metrics, "metric_set")) { 63 | rlang::abort("argument `metrics` must be a `yardstick::metric_set`") 64 | } 65 | 66 | # check for supported scoring metrics 67 | metric_attrs <- attributes(metrics) 68 | metric_names <- names(metric_attrs$metrics) 69 | 70 | # tuning control options 71 | if (isFALSE(control$verbose)) { 72 | h2o::h2o.no_progress() 73 | } 74 | 75 | # get model mode 76 | model_mode <- object$mode 77 | 78 | # get model specification arguments 79 | model_args <- object$args 80 | 81 | # process scoring metric 82 | if (is.null(metrics) & model_mode == "classification") { 83 | metrics <- metric_set(yardstick::mn_log_loss) 84 | } 85 | if (is.null(metrics) & model_mode == "regression") { 86 | metrics <- metric_set(yardstick::rsq) 87 | } 88 | h2o_metrics <- convert_h2o_metrics(metrics) 89 | 90 | # extract complete dataset from resamples 91 | data_train <- rsample::training(resamples$splits[[1]]) 92 | data_test <- rsample::testing(resamples$splits[[1]]) 93 | full_data <- dplyr::bind_rows(data_train, data_test) 94 | row_order <- c(as.numeric(rownames(data_train)), as.numeric(rownames(data_test))) 95 | full_data <- as_tibble(full_data[order(row_order), ]) 96 | 97 | # prep the recipe 98 | prepped_recipe <- preprocessor %>% 99 | recipes::prep(training = full_data, retain = TRUE) 100 | 101 | full_data <- prepped_recipe %>% 102 | recipes::bake(new_data = NULL) 103 | 104 | # get predictor and outcome terms 105 | outcome <- prepped_recipe$term_info %>% 106 | dplyr::filter(!!rlang::sym("role") == "outcome") %>% 107 | dplyr::pull(!!rlang::sym("variable")) 108 | 109 | predictors <- prepped_recipe$term_info %>% 110 | dplyr::filter(!!rlang::sym("role") == "predictor") %>% 111 | dplyr::pull(!!rlang::sym("variable")) 112 | 113 | form <- paste(outcome, "~", paste(predictors, collapse = " + ")) 114 | form <- stats::as.formula(form) 115 | 116 | # replace descriptors 117 | if (parsnip:::requires_descrs(object)) { 118 | data_stats <- parsnip:::get_descr_form(form, full_data) 119 | parsnip:::scoped_descrs(data_stats) 120 | } 121 | 122 | # get row indexes of assessment set data for each rsplit 123 | assessment_indices <- purrr::map(resamples$splits, rsample::complement) 124 | 125 | # pass full data to h2o 126 | full_data_h2o <- h2o::as.h2o(full_data, destination_frame = "grid_data") 127 | 128 | # translate parsnip arguments to h2o 129 | alg <- model_spec_to_algorithm(object, model_args) 130 | algorithm <- alg[[1]] 131 | model_name <- alg[[2]] 132 | model_args <- alg[[3]] 133 | 134 | original_names <- translate_args(model_name) %>% 135 | dplyr::select(dplyr::all_of(c("parsnip", "h2o"))) %>% 136 | tidyr::drop_na() 137 | 138 | nm <- original_names$h2o %>% 139 | rlang::set_names(original_names$parsnip) 140 | 141 | model_args <- rename_list(model_args, nm) 142 | tuning_args <- as.character(nm[tune::tune_args(object)$name]) 143 | rename_args <- nm[nm %in% tuning_args] 144 | 145 | # convert tuning values to a named list of hyperparameters and values 146 | # e.g. list(mtries = c(3, 5, 7), min_rows = c(1, 5, 10)) 147 | params <- as.list(grid) 148 | params <- purrr::map(params, ~ .x[!duplicated(.x)]) 149 | params <- rename_list(params, nm) 150 | 151 | # remove arguments that are not set 152 | null_args <- sapply(model_args, function(x) is.null(rlang::eval_tidy(x))) 153 | model_args <- model_args[!null_args] 154 | 155 | # check tunable 156 | if (!any(tuning_args %in% names(params))) { 157 | missing_tune <- tuning_args[!tuning_args %in% names(params)] 158 | rlang::abort(paste("missing arguments", missing_tune)) 159 | } 160 | 161 | model_args <- model_args[!names(model_args) %in% tuning_args] 162 | model_args <- append(model_args, object$eng_args) 163 | 164 | if (length(model_args) == 0) { 165 | model_args <- NULL 166 | } 167 | 168 | # fit h2o.grid on each resample 169 | grid_ids <- replicate( 170 | length(assessment_indices), 171 | generate_random_id(glue::glue("{algorithm}_grid")) 172 | ) 173 | 174 | search_criteria <- 175 | list(strategy = "RandomDiscrete", max_models = nrow(grid)) 176 | 177 | resamples$.metrics <- 178 | purrr::map2(assessment_indices, grid_ids, function(ids, grid_id) { 179 | grid_args <- list( 180 | grid_id = grid_id, 181 | algorithm = algorithm, 182 | x = predictors, 183 | y = outcome, 184 | training_frame = full_data_h2o[-ids, ], 185 | validation_frame = full_data_h2o[ids, ], 186 | hyper_params = params, 187 | keep_cross_validation_predictions = FALSE, 188 | keep_cross_validation_models = FALSE, 189 | search_criteria = search_criteria, 190 | parallelism = 0 191 | ) 192 | 193 | # set control options 194 | if (control$save_pred) { 195 | grid_args$keep_cross_validation_predictions <- TRUE 196 | } 197 | 198 | # call h2o.grid 199 | res <- make_h2o_call("h2o.grid", grid_args, model_args) 200 | 201 | # extract the scores from the cross-validation predictions 202 | purrr::map2_dfr( 203 | .x = h2o_metrics, .y = names(h2o_metrics), 204 | .f = extract_h2o_scores, grid_args$grid_id, 205 | params, rename_args, model_mode 206 | ) 207 | }) 208 | 209 | # optionally extract the predictions 210 | if (control$save_pred) { 211 | resamples$.predictions <- extract_h2o_preds( 212 | assessment_indices, grid_ids, 213 | full_data_h2o, rename_args, model_mode 214 | ) 215 | } 216 | 217 | # optionally store/remove the models from the cluster 218 | if (control$save_models) { 219 | resamples$.models <- extract_h2o_models(grid_ids, rename_args) 220 | } else { 221 | remove_h2o_models(grid_ids) 222 | } 223 | 224 | # add the .notes column (empty for now) 225 | resamples$.notes <- purrr::map(nrow(resamples), ~ tibble::as_tibble_col(character())) 226 | 227 | # create a `tune_results` class 228 | class(resamples) <- c("tune_results", class(resamples)) 229 | 230 | arg_names <- names(grid) 231 | param_list <- purrr::map( 232 | arg_names, 233 | ~ glue::glue("dials::{.x}()") %>% 234 | rlang::parse_expr() %>% 235 | rlang::eval_tidy() 236 | ) 237 | attr(resamples, "parameters") <- dials::parameters(param_list) 238 | 239 | names(attributes(metrics)$metrics) <- 240 | gsub("yardstick::", "", names(attributes(metrics)$metrics)) 241 | names(attributes(metrics)$metrics) <- 242 | gsub("h2oparsnip::", "", names(attributes(metrics)$metrics)) 243 | attr(resamples, "metrics") <- metrics 244 | 245 | resamples 246 | } 247 | 248 | 249 | #' Removes the models creating during tuning resampling 250 | #' 251 | #' @param grid_ids A character vector of h2o grid ids for each tuning resample. 252 | #' 253 | #' @return 254 | #' @keywords internal 255 | remove_h2o_models <- function(grid_ids) { 256 | for (grid_id in grid_ids) { 257 | grid <- h2o::h2o.getGrid(grid_id) 258 | 259 | for (model_id in as.character(grid@model_ids)) { 260 | h2o::h2o.rm(model_id) 261 | } 262 | } 263 | } 264 | 265 | 266 | #' Extract the h2o model ids for all of the tuning grids and return them as list 267 | #' of tibbles. 268 | #' 269 | #' @param grid_ids A character vector of the h2o ids for the tuning grids. 270 | #' @param rename_args A named character vector used to remap the h2o 271 | #' hyperparameter names to tidymodels nomenclature. The names of the vector 272 | #' are the tidymodels nomenclature and the values are the h2o nomenclature, 273 | #' e.g. c(mtry = "mtries", min_n = "min_rows") 274 | #' 275 | #' @return a list of tibbles 276 | #' 277 | #' @keywords internal 278 | extract_h2o_models <- function(grid_ids, rename_args) { 279 | purrr::map(grid_ids, function(grid_id) { 280 | grid <- h2o::h2o.getGrid(grid_id) 281 | model_ids <- as.character(grid@model_ids) 282 | df <- grid@summary_table[rename_args] 283 | df <- dplyr::rename(df, dplyr::all_of(rename_args)) 284 | df$.model_ids <- as.character(grid@model_ids) 285 | df 286 | }) 287 | } 288 | 289 | 290 | #' Extract the predictions for each tuning grid from the h2o cluster 291 | #' 292 | #' @param test_indices A numeric vector if the row numbers of the H2OFrame that 293 | #' represent the assessment samples. 294 | #' @param grid_ids A character vector of the h2o ids for the tuning grids. 295 | #' @param data A H2OFrame object. 296 | #' @param rename_args A named character vector used to remap the h2o 297 | #' hyperparameter names to tidymodels nomenclature. The names of the vector 298 | #' are the tidymodels nomenclature and the values are the h2o nomenclature, 299 | #' e.g. c(mtry = "mtries", min_n = "min_rows") 300 | #' 301 | #' @return A tibble 302 | #' 303 | #' @keywords internal 304 | extract_h2o_preds <- function(test_indices, grid_ids, data, rename_args, model_mode) { 305 | predictions <- purrr::map2(test_indices, grid_ids, function(ids, grid_id) { 306 | grid <- h2o::h2o.getGrid(grid_id) 307 | model_ids <- as.character(grid@model_ids) 308 | grid_args <- grid@summary_table[rename_args] 309 | grid_args <- dplyr::rename(grid_args, dplyr::all_of(rename_args)) 310 | 311 | purrr::map_dfr(seq_along(model_ids), function(i) { 312 | model <- h2o::h2o.getModel(model_ids[[i]]) 313 | args <- grid_args[i, ] 314 | preds <- tibble::as_tibble(predict(model, data[ids, ])) 315 | 316 | if (model_mode == "classification") { 317 | names(preds) <- ".pred_class" 318 | } else { 319 | names(preds) <- ".pred" 320 | } 321 | dplyr::bind_cols(preds, args, .row = ids) 322 | }) 323 | }) 324 | 325 | return(predictions) 326 | } 327 | 328 | 329 | #' Score the tuning results 330 | #' 331 | #' @param h2o_metric_name A character with the name of the h2o metric used to 332 | #' score the tuning resamples. 333 | #' @param yardstick_metric_name A character with the name of the equivalent 334 | #' yardstick metric used to score the tuning resamples. 335 | #' @param grid_id The h2o id for the tuning grid. 336 | #' @param params A named list of hyperparameters and their values. 337 | #' @param rename_args A named character vector used to remap the h2o 338 | #' hyperparameter names to tidymodels nomenclature. The names of the vector 339 | #' are the tidymodels nomenclature and the values are the h2o nomenclature, 340 | #' e.g. c(mtry = "mtries", min_n = "min_rows") 341 | #' @param model_mode The mode of the model, either "classification", 342 | #' "regression", or "multiclass". 343 | #' 344 | #' @return 345 | #' 346 | #' @keywords internal 347 | extract_h2o_scores <- 348 | function(h2o_metric_name, 349 | yardstick_metric_name, 350 | grid_id, 351 | params, 352 | rename_args, 353 | model_mode) { 354 | tuning_args <- names(params) 355 | 356 | grid <- h2o::h2o.getGrid( 357 | grid_id = grid_id, 358 | sort_by = h2o_metric_name, 359 | decreasing = FALSE 360 | ) 361 | 362 | scores <- as.data.frame(grid@summary_table) 363 | scores[, ncol(scores)] <- 364 | as.numeric(scores[, ncol(scores)]) 365 | 366 | for (x in names(params)) { 367 | scores[[x]] <- gsub("\\[", "", scores[[x]]) 368 | scores[[x]] <- gsub("\\]", "", scores[[x]]) 369 | } 370 | 371 | # create the tune-like resamples object 372 | scores <- scores %>% 373 | as_tibble() %>% 374 | dplyr::select(-dplyr::one_of("model_ids")) %>% 375 | dplyr::mutate_at(tuning_args, as.numeric) %>% 376 | dplyr::mutate( 377 | .metric = yardstick_metric_name, 378 | .estimator = dplyr::if_else(model_mode == "classification", "multiclass", "standard") 379 | ) %>% 380 | dplyr::rename(.estimate = !!h2o_metric_name) %>% 381 | dplyr::rename(dplyr::all_of(rename_args)) 382 | 383 | return(scores) 384 | } 385 | 386 | 387 | #' Return the equivalent h2o algorithm name for a parsnip `model_spec` object. 388 | #' 389 | #' @param object A parsnip `model_spec` object. 390 | #' @param model_args A list of model arguments. 391 | #' 392 | #' @return A list with algorithm, the model name and the arguments with the 393 | #' family attribute set for specific models (e.g. glm). 394 | #' 395 | #' @keywords internal 396 | model_spec_to_algorithm <- function(object, model_args) { 397 | if (inherits(object, "boost_tree")) { 398 | model_name <- "boost_tree" 399 | algorithm <- "gbm" 400 | } 401 | if (inherits(object, "rand_forest")) { 402 | model_name <- "rand_forest" 403 | algorithm <- "randomForest" 404 | } 405 | if (inherits(object, "linear_reg")) { 406 | model_name <- "linear_reg" 407 | algorithm <- "glm" 408 | model_args$family <- "gaussian" 409 | } 410 | if (inherits(object, "logistic_reg")) { 411 | model_name <- "logistic_reg" 412 | algorithm <- "glm" 413 | model_args$family <- "binomial" 414 | } 415 | if (inherits(object, "multinom_reg")) { 416 | model_name <- "multinom_reg" 417 | algorithm <- "glm" 418 | model_args$family <- "multinomial" 419 | } 420 | if (inherits(object, "mlp")) { 421 | model_name <- "mlp" 422 | algorithm <- "deeplearning" 423 | } 424 | if (inherits(object, "naive_Bayes")) { 425 | model_name <- "naive_Bayes" 426 | algorithm <- "naiveBayes" 427 | } 428 | 429 | return(list(algorithm, model_name, model_args)) 430 | } 431 | -------------------------------------------------------------------------------- /R/utils-pipe.R: -------------------------------------------------------------------------------- 1 | #' Pipe operator 2 | #' 3 | #' See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details. 4 | #' 5 | #' @name %>% 6 | #' @rdname pipe 7 | #' @keywords internal 8 | #' @export 9 | #' @importFrom magrittr %>% 10 | #' @usage lhs \%>\% rhs 11 | NULL 12 | -------------------------------------------------------------------------------- /R/utils.R: -------------------------------------------------------------------------------- 1 | generate_random_id <- function(prefix) { 2 | paste(prefix, as.integer(stats::runif(1, 0, 1e9)), sep = "_") 3 | } 4 | -------------------------------------------------------------------------------- /R/zzz.R: -------------------------------------------------------------------------------- 1 | .onLoad <- function(libname, pkgname) { 2 | 3 | # if (any(loadedNamespaces() == "discrim")) { 4 | # add_naive_Bayes_h2o() 5 | # } 6 | # 7 | # if (any(loadedNamespaces() == "rules")) { 8 | # add_rule_fit_h2o() 9 | # } 10 | 11 | add_naive_Bayes_h2o() 12 | add_rule_fit_h2o() 13 | 14 | add_mlp_h2o() 15 | add_boost_tree_h2o() 16 | add_rand_forest_h2o() 17 | add_multinom_reg_h2o() 18 | add_logistic_reg_h2o() 19 | add_linear_reg_h2o() 20 | add_automl() 21 | } 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # h2oparsnip 2 | 3 | 4 | [![CRAN status](https://www.r-pkg.org/badges/version/h2oparsnip)](https://CRAN.R-project.org/package=h2oparsnip) 5 | [![Lifecycle: experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://www.tidyverse.org/lifecycle/#experimental) 6 | 7 | 8 | **NOTE the development of h2oparsnip is being integrated within tidymodels as the [agua](https://github.com/tidymodels/agua) package. New issues and feature requests should be made in the agua repository**. 9 | 10 | ```h2oparsnip``` provides a set of wrappers to bind h2o algorthms with the 11 | 'parsnip' package. 12 | 13 | This package is early in development. Currently the following h2o algorithms 14 | are implemented: 15 | 16 | - h2o.deeplearning engine added to parsnip::mlp model specification 17 | - h2o.gbm engine added to parsnip::boost_tree model specification 18 | - h2o.randomForest engine added to parsnip::rand_forest model specification 19 | - h2o.glm engine added to multinom_reg, logistic_reg and linear_reg model 20 | specifications 21 | - h2o.naiveBayes engine added to naive_Bayes specification 22 | - a new model, automl 23 | - h2o.rulefit engine added to parsnip::rule_fit 24 | 25 | ## Installation 26 | 27 | The package is not yet on CRAN and can be installed with: 28 | 29 | ``` r 30 | devtools::install_github("stevenpawley/h2oparsnip") 31 | ``` 32 | 33 | ## Notes 34 | 35 | The package currently is based on the concept of using h2o as a disposable backend, using h2o as a drop-in replacement for the traditionally used 'engines' within the parsnip package. However, performing tasks such as hyperparameter tuning via the 'tune' packge will be less efficient if working on a remote cluster than using h2o directly because data is being sent back and forth. 36 | 37 | h2oparsnip also does not provide any management of the h2o cluster. If lots of models are being run then available memory within the cluster may be exhausted. Currently this has to be managed using the commands in the h2o package. 38 | 39 | ## Basic Usage 40 | 41 | An example of using the h2o engine for boosted trees. For boosted trees, hyperparameter tuning of the number of trees is performed using a multi_predict method, i.e. the algorithm fits the maximum number of trees in the parameter grid and can be tuned using fewer trees without retraining. 42 | 43 | ``` 44 | library(tidymodels) 45 | library(h2oparsnip) 46 | library(h2o) 47 | 48 | h2o.init() 49 | 50 | gbm <- boost_tree(mode = "regression", trees = tune(), min_n = tune()) %>% 51 | set_engine("h2o") 52 | 53 | rec <- iris %>% 54 | recipe(Petal.Length ~ .) 55 | 56 | params <- expand.grid(trees = c(10, 20, 30), min_n = c(1, 5, 10)) 57 | resamples = mc_cv(iris, times = 1) 58 | tune_results <- tune_grid( 59 | object = gbm, 60 | preprocessor = rec, 61 | resamples = resamples, 62 | grid = params, 63 | metrics = metric_set(rmse) 64 | ) 65 | ``` 66 | 67 | ## Tuning (alternative) 68 | 69 | A problem with using `tune::tune_grid` is that performance is reduced because the data for every tuning hyperparameter iteration and resampling is moved from R to the h2o cluster. To minimize this, the `tune_grid_h2o` function can be used to tune model arguments, as a near drop-in replacement: 70 | 71 | ``` 72 | tune_results <- tune_grid_h2o( 73 | object = gbm, 74 | preprocessor = rec, 75 | resamples = resamples, 76 | grid = params, 77 | metrics = metric_set(rmse) 78 | ) 79 | ``` 80 | 81 | Currently, `tune_grid_h2o` can only tune model parameters and does not handle recipes with tunable parameters. `tune_grid_h2o` moves the data to the h2o cluster only once, i.e. the complete dataset specified by the `resamples` argument is moved to the cluster, and then the equivalent h2o.frame is split based on the row indices in the resampling object, and the `h2o::h2o.grid` function is used for tuning on the h2o frames. To avoid repeatedly moving predictions back from h2o to R, all metrics are also calculated on the cluster. This restricts the range of metrics to what is available in h2o (`tune_grid_h2o` maps **yardstick** metrics to their h2o equivalents). The available metrics are listed in the `tune_grid_h2o` help documentation. However, hyperparameter tuning using `tune_grid_h2o` should be similarly performant as when using h2o directly. 82 | 83 | ### Control 84 | 85 | Similar to `tune::control_grid`, details of `tune_grid_h2o` can be configured using `tune_grid_h2o(control = control_h2o())`. This allows the predictions and/or models to be saved (the default is that they are removed after tuning to avoid clutter in the cluster). 86 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stevenpawley/h2oparsnip/913d60e9ea72cb8e0e12edca4cf998de46cd52ae/_pkgdown.yml -------------------------------------------------------------------------------- /h2oparsnip.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageCheckArgs: --as-cran 22 | PackageRoxygenize: rd,collate,namespace 23 | -------------------------------------------------------------------------------- /man/automl.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/automl.R 3 | \name{automl} 4 | \alias{automl} 5 | \title{General interface for automl models} 6 | \usage{ 7 | automl(mode = "classification") 8 | } 9 | \arguments{ 10 | \item{mode}{A single character string for the type of model.} 11 | } 12 | \value{ 13 | A model_spec 14 | } 15 | \description{ 16 | General interface for automl models 17 | } 18 | -------------------------------------------------------------------------------- /man/control_h2o.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/control_h2o.R 3 | \name{control_h2o} 4 | \alias{control_h2o} 5 | \title{Control aspects of the grid search process} 6 | \usage{ 7 | control_h2o( 8 | verbose = FALSE, 9 | save_pred = FALSE, 10 | save_models = FALSE, 11 | event_level = "first" 12 | ) 13 | } 14 | \arguments{ 15 | \item{verbose}{A logical for logging results as they are generated.} 16 | 17 | \item{save_pred}{A logical for whether the out-of-sample predictions should be saved 18 | for each model evaluated.} 19 | 20 | \item{save_models}{A logical for whether to retain the models associated with the 21 | tuning and resampling iterations within the h2o cluster and append their h2o model 22 | ids to the resamples object as a '.models' column.} 23 | } 24 | \value{ 25 | An object of `control_grid` and `control_resamples` class. 26 | } 27 | \description{ 28 | `control_h2o` provides a function to set various aspects of the grid search process. 29 | By default during tuning, the resampling predictions are stored within the h2o 30 | cluster. To save memory and space, use 'save_pred = FALSE'. 31 | `` 32 | } 33 | \examples{ 34 | # to save space in the cluster use these settings (the defaults) 35 | control_h2o(verbose = TRUE, save_pred = FALSE) 36 | } 37 | -------------------------------------------------------------------------------- /man/extract_h2o_models.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tune_grid_h2o.R 3 | \name{extract_h2o_models} 4 | \alias{extract_h2o_models} 5 | \title{Extract the h2o model ids for all of the tuning grids and return them as list 6 | of tibbles.} 7 | \usage{ 8 | extract_h2o_models(grid_ids, rename_args) 9 | } 10 | \arguments{ 11 | \item{grid_ids}{A character vector of the h2o ids for the tuning grids.} 12 | 13 | \item{rename_args}{A named character vector used to remap the h2o 14 | hyperparameter names to tidymodels nomenclature. The names of the vector 15 | are the tidymodels nomenclature and the values are the h2o nomenclature, 16 | e.g. c(mtry = "mtries", min_n = "min_rows")} 17 | } 18 | \value{ 19 | a list of tibbles 20 | } 21 | \description{ 22 | Extract the h2o model ids for all of the tuning grids and return them as list 23 | of tibbles. 24 | } 25 | \keyword{internal} 26 | -------------------------------------------------------------------------------- /man/extract_h2o_preds.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tune_grid_h2o.R 3 | \name{extract_h2o_preds} 4 | \alias{extract_h2o_preds} 5 | \title{Extract the predictions for each tuning grid from the h2o cluster} 6 | \usage{ 7 | extract_h2o_preds(test_indices, grid_ids, data, rename_args, model_mode) 8 | } 9 | \arguments{ 10 | \item{test_indices}{A numeric vector if the row numbers of the H2OFrame that 11 | represent the assessment samples.} 12 | 13 | \item{grid_ids}{A character vector of the h2o ids for the tuning grids.} 14 | 15 | \item{data}{A H2OFrame object.} 16 | 17 | \item{rename_args}{A named character vector used to remap the h2o 18 | hyperparameter names to tidymodels nomenclature. The names of the vector 19 | are the tidymodels nomenclature and the values are the h2o nomenclature, 20 | e.g. c(mtry = "mtries", min_n = "min_rows")} 21 | } 22 | \value{ 23 | A tibble 24 | } 25 | \description{ 26 | Extract the predictions for each tuning grid from the h2o cluster 27 | } 28 | \keyword{internal} 29 | -------------------------------------------------------------------------------- /man/extract_h2o_scores.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tune_grid_h2o.R 3 | \name{extract_h2o_scores} 4 | \alias{extract_h2o_scores} 5 | \title{Score the tuning results} 6 | \usage{ 7 | extract_h2o_scores( 8 | h2o_metric_name, 9 | yardstick_metric_name, 10 | grid_id, 11 | params, 12 | rename_args, 13 | model_mode 14 | ) 15 | } 16 | \arguments{ 17 | \item{h2o_metric_name}{A character with the name of the h2o metric used to 18 | score the tuning resamples.} 19 | 20 | \item{yardstick_metric_name}{A character with the name of the equivalent 21 | yardstick metric used to score the tuning resamples.} 22 | 23 | \item{grid_id}{The h2o id for the tuning grid.} 24 | 25 | \item{params}{A named list of hyperparameters and their values.} 26 | 27 | \item{rename_args}{A named character vector used to remap the h2o 28 | hyperparameter names to tidymodels nomenclature. The names of the vector 29 | are the tidymodels nomenclature and the values are the h2o nomenclature, 30 | e.g. c(mtry = "mtries", min_n = "min_rows")} 31 | 32 | \item{model_mode}{The mode of the model, either "classification", 33 | "regression", or "multiclass".} 34 | } 35 | \value{ 36 | 37 | } 38 | \description{ 39 | Score the tuning results 40 | } 41 | \keyword{internal} 42 | -------------------------------------------------------------------------------- /man/h2o_automl_train.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/automl.R 3 | \name{h2o_automl_train} 4 | \alias{h2o_automl_train} 5 | \title{Wrapper for training a h2o.automl model} 6 | \usage{ 7 | h2o_automl_train(formula, data, ...) 8 | } 9 | \arguments{ 10 | \item{formula}{formula} 11 | 12 | \item{data}{data.frame of training data} 13 | 14 | \item{...}{Other arguments to pass the h2o.automl} 15 | } 16 | \value{ 17 | evaluated h2o model call 18 | } 19 | \description{ 20 | Wrapper for training a h2o.automl model 21 | } 22 | -------------------------------------------------------------------------------- /man/h2o_gbm_train.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/boost_tree.R 3 | \name{h2o_gbm_train} 4 | \alias{h2o_gbm_train} 5 | \title{Wrapper for training a h2o.gbm model as part of a parsnip `boost_tree` 6 | h2o engine} 7 | \usage{ 8 | h2o_gbm_train( 9 | formula, 10 | data, 11 | ntrees = 50, 12 | max_depth = 5, 13 | min_rows = 10, 14 | learn_rate = 0.1, 15 | sample_rate = 1, 16 | col_sample_rate = 1, 17 | min_split_improvement = 1e-05, 18 | stopping_rounds = 0, 19 | validation = 0, 20 | algorithm = "h2o.gbm", 21 | ... 22 | ) 23 | } 24 | \arguments{ 25 | \item{formula}{formula} 26 | 27 | \item{data}{data.frame of training data} 28 | 29 | \item{ntrees}{integer, the number of trees to build (default = 50).} 30 | 31 | \item{max_depth}{integer, the maximum tree depth (default = 10).} 32 | 33 | \item{min_rows}{integer, the minimum number of observations for a leaf 34 | (default = 10).} 35 | 36 | \item{learn_rate}{numeric, the learning rate (default = 0.1, range is from 37 | 0.0 to 1.0).} 38 | 39 | \item{sample_rate}{numeric, the proportion of samples to use to build each 40 | tree (default = 1.0).} 41 | 42 | \item{col_sample_rate}{numeric, the proportion of features available during 43 | each node split (default = 1.0).} 44 | 45 | \item{min_split_improvement}{numeric, minimum relative improvement in 46 | squared error reduction in order for a split to happen (default = 1e-05)} 47 | 48 | \item{stopping_rounds}{An integer specifying the number of training 49 | iterations without improvement before stopping. If `stopping_rounds = 0` 50 | (the default) then early stopping is disabled. If `validation` is used, 51 | performance is base on the validation set; otherwise the training set is 52 | used.} 53 | 54 | \item{validation}{A positive number. If on `[0, 1)` the value, `validation` 55 | is a random proportion of data in `x` and `y` that are used for performance 56 | assessment and potential early stopping. If 1 or greater, it is the _number_ 57 | of training set samples use for these purposes.} 58 | 59 | \item{algorithm}{Whether to use the default h2o 'h2o.gbm' algorithm or use 60 | 'h2o.xgboost' via h2o.} 61 | 62 | \item{...}{other arguments passed to the h2o engine.} 63 | } 64 | \value{ 65 | evaluated h2o model call 66 | } 67 | \description{ 68 | Wrapper for training a h2o.gbm model as part of a parsnip `boost_tree` 69 | h2o engine 70 | } 71 | -------------------------------------------------------------------------------- /man/h2o_glm_train.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/glm_wrapper.R 3 | \name{h2o_glm_train} 4 | \alias{h2o_glm_train} 5 | \title{Wrapper for training a h2o.glm model as part of a parsnip} 6 | \usage{ 7 | h2o_glm_train(formula, data, alpha = NULL, lambda = NULL, family = NULL, ...) 8 | } 9 | \arguments{ 10 | \item{formula}{formula} 11 | 12 | \item{data}{data.frame of training data} 13 | 14 | \item{alpha}{numeric, Distribution of regularization between the L1 (Lasso) 15 | and L2 (Ridge) penalties. A value of 1 for alpha represents Lasso 16 | regression, a value of 0 produces Ridge regression.} 17 | 18 | \item{lambda}{numeric, regularization strength} 19 | 20 | \item{family}{character, one of c("gaussian", "binomial", "quasibinomial", 21 | "ordinal", "multinomial", "poisson", "gamma", "tweedie", 22 | "negativebinomial")} 23 | 24 | \item{...}{other arguments passed to the h2o engine.} 25 | } 26 | \value{ 27 | evaluated h2o model call 28 | } 29 | \description{ 30 | Wrapper for training a h2o.glm model as part of a parsnip 31 | } 32 | -------------------------------------------------------------------------------- /man/h2o_mlp_train.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/mlp.R 3 | \name{h2o_mlp_train} 4 | \alias{h2o_mlp_train} 5 | \title{Wrapper for training a h2o.deeplearning model as part of a parsnip `mlp` 6 | h2o engine} 7 | \usage{ 8 | h2o_mlp_train( 9 | formula, 10 | data, 11 | l2 = 0, 12 | hidden_dropout_ratios = 0, 13 | hidden = 100, 14 | epochs = 10, 15 | activation = "Rectifier", 16 | stopping_rounds = 0, 17 | validation = 0, 18 | ... 19 | ) 20 | } 21 | \arguments{ 22 | \item{formula}{formula} 23 | 24 | \item{data}{data.frame of training data} 25 | 26 | \item{l2}{numeric, l2 regulation parameter, default = 0} 27 | 28 | \item{hidden_dropout_ratios}{dropout ratio for a single hidden layer (default 29 | = 0)} 30 | 31 | \item{hidden}{integer, number of neurons in the hidden layer (default = c(200, 200))} 32 | 33 | \item{epochs}{integer, number of epochs (default = 10)} 34 | 35 | \item{activation}{character, activation function. Must be one of: "Tanh", 36 | "TanhWithDropout", "Rectifier", "RectifierWithDropout", "Maxout", 37 | "MaxoutWithDropout". Defaults to "Rectifier. If `hidden_dropout_ratios` > 0 38 | then the equivalent activation function with dropout is used.} 39 | 40 | \item{stopping_rounds}{An integer specifying the number of training 41 | iterations without improvement before stopping. If `stopping_rounds = 0` 42 | (the default) then early stopping is disabled. If `validation` is used, 43 | performance is base on the validation set; otherwise the training set is 44 | used.} 45 | 46 | \item{validation}{A positive number. If on `[0, 1)` the value, `validation` 47 | is a random proportion of data in `x` and `y` that are used for performance 48 | assessment and potential early stopping. If 1 or greater, it is the _number_ 49 | of training set samples use for these purposes.} 50 | 51 | \item{...}{other arguments not currently used} 52 | } 53 | \value{ 54 | evaluated h2o model call 55 | } 56 | \description{ 57 | Wrapper for training a h2o.deeplearning model as part of a parsnip `mlp` 58 | h2o engine 59 | } 60 | -------------------------------------------------------------------------------- /man/h2o_naiveBayes_train.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/naive_Bayes.R 3 | \name{h2o_naiveBayes_train} 4 | \alias{h2o_naiveBayes_train} 5 | \title{Wrapper for training a h2o.naiveBayes model as part of a discrim `naive_Bayes` 6 | h2o engine} 7 | \usage{ 8 | h2o_naiveBayes_train(formula, data, laplace = 0, ...) 9 | } 10 | \arguments{ 11 | \item{formula}{formula} 12 | 13 | \item{data}{data.frame of training data} 14 | 15 | \item{laplace}{numeric, the Laplace smoothing parameter, must be >= 0.} 16 | 17 | \item{...}{other arguments passed to the h2o engine.} 18 | } 19 | \value{ 20 | a fitted h2o model. 21 | } 22 | \description{ 23 | Wrapper for training a h2o.naiveBayes model as part of a discrim `naive_Bayes` 24 | h2o engine 25 | } 26 | -------------------------------------------------------------------------------- /man/h2o_rf_train.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rand_forest.R 3 | \name{h2o_rf_train} 4 | \alias{h2o_rf_train} 5 | \title{Wrapper for training a h2o.randomForest model as part of a parsnip 6 | `rand_forest` h2o engine} 7 | \usage{ 8 | h2o_rf_train( 9 | formula, 10 | data, 11 | ntrees = 50, 12 | min_rows = 10, 13 | mtries = -1, 14 | stopping_rounds = 0, 15 | validation = 0, 16 | ... 17 | ) 18 | } 19 | \arguments{ 20 | \item{formula}{formula} 21 | 22 | \item{data}{data.frame of training data} 23 | 24 | \item{ntrees}{integer, the number of trees to build (default = 50)} 25 | 26 | \item{min_rows}{integer, the minimum number of observations for a leaf 27 | (default = 10)} 28 | 29 | \item{mtries}{integer, the number of columns to randomly select at each 30 | level. Default of -1 is sqrt(p) for classification and (p/3) for regression.} 31 | 32 | \item{stopping_rounds}{An integer specifying the number of training 33 | iterations without improvement before stopping. If `stopping_rounds = 0` 34 | (the default) then early stopping is disabled. If `validation` is used, 35 | performance is base on the validation set; otherwise the training set is 36 | used.} 37 | 38 | \item{validation}{A positive number. If on `[0, 1)` the value, `validation` 39 | is a random proportion of data in `x` and `y` that are used for performance 40 | assessment and potential early stopping. If 1 or greater, it is the _number_ 41 | of training set samples use for these purposes.} 42 | 43 | \item{...}{other arguments not currently used} 44 | } 45 | \value{ 46 | evaluated h2o model call 47 | } 48 | \description{ 49 | Wrapper for training a h2o.randomForest model as part of a parsnip 50 | `rand_forest` h2o engine 51 | } 52 | -------------------------------------------------------------------------------- /man/h2o_rulefit_train.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rule_fit.R 3 | \name{h2o_rulefit_train} 4 | \alias{h2o_rulefit_train} 5 | \title{Wrapper for training a h2o.rulefit model as part of a parsnip 6 | `rule_fit` h2o engine} 7 | \usage{ 8 | h2o_rulefit_train( 9 | formula, 10 | data, 11 | rule_generation_ntrees = 50, 12 | max_rule_length = 3, 13 | lambda = 0, 14 | ... 15 | ) 16 | } 17 | \arguments{ 18 | \item{formula}{formula} 19 | 20 | \item{data}{data.frame of training data} 21 | 22 | \item{rule_generation_ntrees}{integer, the number of trees to build (default 23 | = 50)} 24 | 25 | \item{max_rule_length}{integer, the maximum tree depth (default = 3).} 26 | 27 | \item{lambda}{Specify the regularization strength for LASSO regressor.} 28 | 29 | \item{...}{other arguments that are passed to the h2o model} 30 | } 31 | \value{ 32 | evaluated h2o model call 33 | } 34 | \description{ 35 | Wrapper for training a h2o.rulefit model as part of a parsnip 36 | `rule_fit` h2o engine 37 | } 38 | -------------------------------------------------------------------------------- /man/model_spec_to_algorithm.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tune_grid_h2o.R 3 | \name{model_spec_to_algorithm} 4 | \alias{model_spec_to_algorithm} 5 | \title{Return the equivalent h2o algorithm name for a parsnip `model_spec` object.} 6 | \usage{ 7 | model_spec_to_algorithm(object, model_args) 8 | } 9 | \arguments{ 10 | \item{object}{A parsnip `model_spec` object.} 11 | 12 | \item{model_args}{A list of model arguments.} 13 | } 14 | \value{ 15 | A list with algorithm, the model name and the arguments with the 16 | family attribute set for specific models (e.g. glm). 17 | } 18 | \description{ 19 | Return the equivalent h2o algorithm name for a parsnip `model_spec` object. 20 | } 21 | \keyword{internal} 22 | -------------------------------------------------------------------------------- /man/mse_vec.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/metrics.R 3 | \name{mse_vec} 4 | \alias{mse_vec} 5 | \alias{mse} 6 | \alias{mse.data.frame} 7 | \title{Mean squared error} 8 | \usage{ 9 | mse_vec(truth, estimate, na_rm = TRUE, ...) 10 | 11 | mse(data, ...) 12 | 13 | \method{mse}{data.frame}(data, truth, estimate, na_rm = TRUE, ...) 14 | } 15 | \arguments{ 16 | \item{truth}{The column identifier for the true class results (that is a 17 | factor). This should be an unquoted column name although this argument is 18 | passed by expression and supports quasiquotation (you can unquote column 19 | names). For _vec() functions, a factor vector.} 20 | 21 | \item{estimate}{The column identifier for the predicted class results (that 22 | is also factor). As with truth this can be specified different ways but the 23 | primary method is to use an unquoted variable name. For _vec() functions, a 24 | factor vector.} 25 | 26 | \item{na_rm}{A logical value indicating whether NA values should be stripped 27 | before the computation proceeds.} 28 | 29 | \item{...}{Not currently used.} 30 | 31 | \item{data}{A `data.frame` containing the `truth` and `estimate` columns.} 32 | } 33 | \value{ 34 | A `tibble` with columns `.metric`, `.estimator`, and `.estimate` and 1 row of values. 35 | } 36 | \description{ 37 | Calculate the mean squared error. This metric is in squared units of the original data. 38 | } 39 | -------------------------------------------------------------------------------- /man/pipe.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils-pipe.R 3 | \name{\%>\%} 4 | \alias{\%>\%} 5 | \title{Pipe operator} 6 | \usage{ 7 | lhs \%>\% rhs 8 | } 9 | \description{ 10 | See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details. 11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /man/remove_h2o_models.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tune_grid_h2o.R 3 | \name{remove_h2o_models} 4 | \alias{remove_h2o_models} 5 | \title{Removes the models creating during tuning resampling} 6 | \usage{ 7 | remove_h2o_models(grid_ids) 8 | } 9 | \arguments{ 10 | \item{grid_ids}{A character vector of h2o grid ids for each tuning resample.} 11 | } 12 | \value{ 13 | 14 | } 15 | \description{ 16 | Removes the models creating during tuning resampling 17 | } 18 | \keyword{internal} 19 | -------------------------------------------------------------------------------- /man/tune_grid_h2o.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tune_grid_h2o.R 3 | \name{tune_grid_h2o} 4 | \alias{tune_grid_h2o} 5 | \title{Tune h2o models} 6 | \usage{ 7 | tune_grid_h2o( 8 | object, 9 | preprocessor = NULL, 10 | resamples, 11 | param_info = NULL, 12 | grid = 10, 13 | metrics = NULL, 14 | control = control_h2o(), 15 | ... 16 | ) 17 | } 18 | \arguments{ 19 | \item{object}{A parsnip `model_spec` object.} 20 | 21 | \item{preprocessor}{A `recipe` object.} 22 | 23 | \item{resamples}{An `rset` object.} 24 | 25 | \item{param_info}{A `dials::parameters()` object or NULL. If none is given, a 26 | parameters set is derived from other arguments. Passing this argument can 27 | be useful when parameter ranges need to be customized.} 28 | 29 | \item{grid}{A `data.frame` of tuning combinations or a positive integer. The 30 | data frame should have columns for each parameter being tuned and rows for 31 | tuning parameter candidates. An integer denotes the number of candidate 32 | parameter sets to be created automatically. If a positive integer is used 33 | or no tuning grid is supplied, then a semi-random grid via 34 | `dials::grid_latin_hypercube` is created based on the specified number of 35 | tuning iterations (default size = 10).} 36 | 37 | \item{metrics}{A `yardstick::metric_set` or NULL. Note that not all yardstick 38 | metrics can be used with `tune_grid_h2o`. The metrics must be one of 39 | `yardstick::rsq`, `yardstick::rmse` or `h2oparsnip::mse` for regression 40 | models, and `yardstick::accuracy`, `yardstick::mn_log_loss`, 41 | `yardstick::roc_auc` or `yardstick::pr_auc` for classification models. If 42 | NULL then the default is `yardstick::rsq` for regression models and 43 | `yardstick::mn_log_loss` for classification models.} 44 | 45 | \item{control}{An object used to modify the tuning process.} 46 | 47 | \item{...}{Not currently used.} 48 | } 49 | \value{ 50 | 51 | } 52 | \description{ 53 | This is a prototype of a version of tune_grid that uses h2o.grid to perform 54 | hyperparameter tuning. 55 | } 56 | \section{Limitations}{ 57 | 58 | - Only model arguments can be tuned, not arguments in the preprocessing 59 | recipes. 60 | 61 | - Parsnip only allows `data.frame` and `tbl_spark` objects to be passed to 62 | the `fit` method, not `H2OFrame` objects. 63 | } 64 | 65 | -------------------------------------------------------------------------------- /man/write_h2o.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/read_h2o.R 3 | \name{write_h2o} 4 | \alias{write_h2o} 5 | \alias{read_h2o} 6 | \alias{write_h2o.workflow} 7 | \alias{write_h2o.model_fit} 8 | \alias{read_h2o.workflow} 9 | \alias{read_h2o.model_fit} 10 | \title{Saves an H2O model to file that is contained within a fitted parsnip model 11 | specification or contained within a workflow} 12 | \usage{ 13 | write_h2o(object, filename, ...) 14 | 15 | read_h2o(object, filename, ...) 16 | 17 | \method{write_h2o}{workflow}(object, filename, ...) 18 | 19 | \method{write_h2o}{model_fit}(object, filename, ...) 20 | 21 | \method{read_h2o}{workflow}(object, filename, ...) 22 | 23 | \method{read_h2o}{model_fit}(object, filename, ...) 24 | } 25 | \arguments{ 26 | \item{object}{Either a `workflows::workflow()` object contained a fitted 27 | model when using the workflows package, or a `model_spec` object from a 28 | fitted model when using the parsnip package directly.} 29 | 30 | \item{filename}{A `character` specifying the file path used to save the 31 | model. H2O models do not require a specific file extension.} 32 | 33 | \item{...}{Currently not used.} 34 | } 35 | \value{ 36 | The file path used to save the model. 37 | } 38 | \description{ 39 | H2O models cannot be saved using the typical R approaches such as saveRDS 40 | because the actual H2O model is contained within a Java virtual machine. H2O 41 | models need to be saved and restored using the `h2o::h2o.saveModel` and 42 | `h2o::h2o.loadModel` functions. This is inconvenient for using H2O models 43 | contained within parsnip model specifications or workflow objects. 44 | } 45 | \details{ 46 | The `write_h2o` function extracts the H2O model from within a parsnip or 47 | workflow fitted model and saves it to file using the `h2o::h2o.saveModel` 48 | function. To restore a model and insert it back into a previously fitted 49 | model use the `read_h2o` function. 50 | } 51 | \examples{ 52 | library(parsnip) 53 | library(h2o) 54 | 55 | # start a h2o session 56 | h2o.init() 57 | 58 | # fit a parsnip model using the h2o engine 59 | clf <- mlp(mode = "classification") \%>\% 60 | set_engine("h2o") 61 | 62 | model_fit <- clf \%>\% fit(Species ~ ., iris) 63 | 64 | # save the parsnip model 65 | saveRDS(model_fit, file.path(tempdir(), "my_model.rds")) 66 | 67 | # save the h2o component of the model 68 | write_h2o(object = model_fit, filename = file.path(tempdir(), "my_h2o_model.mod")) 69 | h2o.shutdown(prompt = FALSE) 70 | 71 | # restore a model 72 | h2o.init() 73 | model_fit <- readRDS(file.path(tempdir(), "my_model.rds")) 74 | 75 | # read and insert the H2O model back into the parsnip object 76 | model_fit <- read_h2o(model_fit, file.path(tempdir(), "my_h2o_model.mod")) 77 | } 78 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(h2oparsnip) 3 | 4 | test_check("h2oparsnip") 5 | -------------------------------------------------------------------------------- /tests/testthat/test_boost_tree_h2o.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(parsnip) 3 | library(tibble) 4 | library(magrittr) 5 | library(h2o) 6 | 7 | 8 | test_that("boost_tree h2o formula method", { 9 | skip_on_cran() 10 | 11 | h2o.init(nthreads = 1) 12 | iris_df <- as_tibble(iris) 13 | 14 | # classfication 15 | h2o_clf_fitted <- 16 | h2o.gbm( 17 | x = 1:4, 18 | y = 5, 19 | training_frame = as.h2o(iris_df), 20 | ntrees = 50, 21 | max_depth = 10, 22 | min_rows = 1, 23 | learn_rate = 0.1, 24 | sample_rate = 0.8, 25 | col_sample_rate = 0.3, 26 | seed = 1234 27 | ) 28 | h2o_clf_preds <- predict(h2o_clf_fitted, as.h2o(iris_df)) 29 | h2o_clf_preds <- as_tibble(h2o_clf_preds) 30 | 31 | clf <- 32 | boost_tree( 33 | mode = "classification", 34 | trees = 50, 35 | tree_depth = 10, 36 | min_n = 1, 37 | learn_rate = 0.1, 38 | sample_size = 0.8, 39 | mtry = 0.3 40 | ) %>% 41 | set_engine("h2o", seed = 1234) 42 | 43 | fitted_clf <- clf %>% fit(Species ~ ., iris_df) 44 | 45 | clf_preds <- predict(fitted_clf, iris_df) 46 | clf_probs <- predict(fitted_clf, iris_df, type = "prob") 47 | 48 | expect_equal(clf_preds[[1]], h2o_clf_preds$predict) 49 | expect_equal(clf_probs[[1]], h2o_clf_preds$setosa) 50 | expect_equal(clf_probs[[2]], h2o_clf_preds$versicolor) 51 | expect_equal(clf_probs[[3]], h2o_clf_preds$virginica) 52 | 53 | # regression 54 | h2o_regr_fitted <- 55 | h2o.gbm( 56 | x = 2:5, 57 | y = 1, 58 | training_frame = as.h2o(iris_df), 59 | ntrees = 50, 60 | max_depth = 10, 61 | min_rows = 1, 62 | learn_rate = 0.1, 63 | sample_rate = 0.8, 64 | col_sample_rate = 0.3, 65 | seed = 1234 66 | ) 67 | h2o_regr_preds <- predict(h2o_regr_fitted, as.h2o(iris_df)) 68 | h2o_regr_preds <- as_tibble(h2o_regr_preds) 69 | 70 | regr <- boost_tree( 71 | mode = "regression", 72 | trees = 50, 73 | tree_depth = 10, 74 | min_n = 1, 75 | learn_rate = 0.1, 76 | sample_size = 0.8, 77 | mtry = 0.3 78 | ) %>% 79 | set_engine("h2o", seed = 1234) 80 | 81 | fitted_regr <- regr %>% fit(Sepal.Length ~ ., iris_df) 82 | regr_preds <- predict(fitted_regr, iris_df) 83 | 84 | expect_equal(h2o_regr_preds$predict, regr_preds$.pred) 85 | }) 86 | 87 | test_that("boost_tree h2o multi_predict", { 88 | skip_on_cran() 89 | 90 | h2o.init(nthreads = 1) 91 | iris_df <- as_tibble(iris) 92 | 93 | clf <- 94 | boost_tree( 95 | mode = "classification", 96 | trees = 50, 97 | tree_depth = 10, 98 | min_n = 1, 99 | learn_rate = 0.1, 100 | sample_size = 0.8, 101 | mtry = 0.3 102 | ) %>% 103 | set_engine("h2o", seed = 1234) 104 | 105 | fitted_clf <- clf %>% fit(Species ~ ., iris_df) 106 | 107 | expect_true(parsnip::has_multi_predict(fitted_clf)) 108 | expect_equal(parsnip::multi_predict_args(fitted_clf), "trees") 109 | 110 | expect_equal( 111 | nrow(multi_predict(fitted_clf, iris_df, type = "class", trees = 10)), 112 | 150 113 | ) 114 | }) 115 | -------------------------------------------------------------------------------- /tests/testthat/test_control_h2o.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(rsample) 3 | library(h2o) 4 | library(h2oparsnip) 5 | library(modeldata) 6 | library(recipes) 7 | library(dials) 8 | library(yardstick) 9 | library(tune) 10 | 11 | 12 | test_that("test for control_h2o options", { 13 | skip_on_cran() 14 | h2o.init(nthreads = 1) 15 | data(ames) 16 | 17 | spec <- rand_forest(mode = "regression", mtry = tune(), min_n = tune()) %>% 18 | set_engine("h2o") 19 | 20 | rec <- ames %>% 21 | recipe(Sale_Price ~ .) 22 | 23 | params <- list(mtry = mtry(c(5, 10)), min_n = min_n(c(1, 10))) %>% 24 | grid_regular() 25 | 26 | resamples <- bootstraps(ames, times = 1) 27 | 28 | # test defaults 29 | res <- 30 | tune_grid_h2o( 31 | object = spec, 32 | preprocessor = rec, 33 | resamples = resamples, 34 | grid = params, 35 | metrics = metric_set(rmse), 36 | control = control_h2o(save_pred = FALSE) 37 | ) 38 | expect_s3_class(res, "tune_results") 39 | expect_equal(names(res), c("splits", "id", ".metrics", ".notes")) 40 | 41 | # test for returning predictions for numeric variable 42 | res <- 43 | tune_grid_h2o( 44 | object = spec, 45 | preprocessor = rec, 46 | resamples = resamples, 47 | grid = params, 48 | metrics = metric_set(rmse), 49 | control = control_h2o(save_pred = TRUE) 50 | ) 51 | expect_s3_class(res, "tune_results") 52 | expect_equal(names(res), c("splits", "id", ".metrics", ".predictions", ".notes")) 53 | expect_equal(names(res$.predictions[[1]])[1], ".pred") 54 | 55 | # test for storing models 56 | res <- 57 | tune_grid_h2o( 58 | object = spec, 59 | preprocessor = rec, 60 | resamples = resamples, 61 | grid = params, 62 | metrics = metric_set(rmse), 63 | control = control_h2o(save_pred = TRUE, save_models = TRUE) 64 | ) 65 | expect_s3_class(res, "tune_results") 66 | expect_equal(names(res), c("splits", "id", ".metrics", ".predictions", ".models", ".notes")) 67 | 68 | h2o.shutdown(prompt = FALSE) 69 | }) 70 | -------------------------------------------------------------------------------- /tests/testthat/test_linear_reg_h2o.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(parsnip) 3 | library(tibble) 4 | library(magrittr) 5 | library(h2o) 6 | 7 | test_that("linear_reg h2o formula method", { 8 | skip_on_cran() 9 | 10 | h2o.init(nthreads = 1) 11 | iris_df <- as_tibble(iris) 12 | 13 | # classfication 14 | h2o_regr_fitted <- 15 | h2o.glm( 16 | x = 2:4, 17 | y = 1, 18 | training_frame = as.h2o(iris_df), 19 | family = "gaussian", 20 | seed = 1234, 21 | ) 22 | h2o_regr_preds <- predict(h2o_regr_fitted, as.h2o(iris_df)) 23 | h2o_regr_preds <- as_tibble(h2o_regr_preds) 24 | 25 | regr <- linear_reg(mode = "regression") %>% 26 | set_engine("h2o", seed = 1234) 27 | 28 | fitted_regr <- regr %>% 29 | fit(Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width, iris_df) 30 | 31 | regr_preds <- predict(fitted_regr, iris_df) 32 | 33 | expect_equal(regr_preds[[1]], h2o_regr_preds$predict) 34 | }) 35 | -------------------------------------------------------------------------------- /tests/testthat/test_logistic_reg_h2o.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(parsnip) 3 | library(tibble) 4 | library(magrittr) 5 | library(h2o) 6 | 7 | 8 | test_that("logistic_reg h2o formula method", { 9 | skip_on_cran() 10 | 11 | h2o.init(nthreads = 1) 12 | iris_df <- as_tibble(iris) 13 | iris_df <- iris_df[which(iris_df$Species %in% c("setosa", "versicolor")), ] 14 | iris_df$Species <- droplevels(iris_df$Species) 15 | 16 | # classfication 17 | h2o_clf_fitted <- 18 | h2o.glm( 19 | x = 1:4, 20 | y = 5, 21 | training_frame = as.h2o(iris_df), 22 | family = "binomial", 23 | seed = 1234, 24 | ) 25 | h2o_clf_preds <- predict(h2o_clf_fitted, as.h2o(iris_df)) 26 | h2o_clf_preds <- as_tibble(h2o_clf_preds) 27 | 28 | clf <- logistic_reg(mode = "classification") %>% 29 | set_engine("h2o", seed = 1234) 30 | 31 | fitted_clf <- clf %>% fit(Species ~ ., iris_df) 32 | 33 | clf_preds <- predict(fitted_clf, iris_df) 34 | clf_probs <- predict(fitted_clf, iris_df, type = "prob") 35 | 36 | expect_equal(clf_preds[[1]], h2o_clf_preds$predict) 37 | expect_equal(clf_probs[[1]], h2o_clf_preds$setosa) 38 | expect_equal(clf_probs[[2]], h2o_clf_preds$versicolor) 39 | }) 40 | -------------------------------------------------------------------------------- /tests/testthat/test_metrics.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(rsample) 3 | library(h2o) 4 | library(h2oparsnip) 5 | library(modeldata) 6 | library(recipes) 7 | library(dials) 8 | library(yardstick) 9 | library(tune) 10 | 11 | 12 | test_that("test regression metrics", { 13 | skip_on_cran() 14 | h2o.init(nthreads = 1) 15 | data(ames) 16 | 17 | spec <- rand_forest(mode = "regression", mtry = tune(), min_n = tune()) %>% 18 | set_engine("h2o") 19 | 20 | rec <- ames %>% 21 | recipe(Sale_Price ~ .) 22 | 23 | params <- list(mtry = mtry(c(5, 10)), min_n = min_n(c(1, 10))) %>% 24 | grid_regular() 25 | 26 | resamples <- bootstraps(ames, times = 1) 27 | 28 | regression_metrics <- metric_set(rmse, rsq) 29 | 30 | res <- 31 | tune_grid_h2o( 32 | object = spec, 33 | preprocessor = rec, 34 | resamples = resamples, 35 | grid = params, 36 | metrics = regression_metrics, 37 | control = control_h2o(save_pred = FALSE, save_models = FALSE) 38 | ) 39 | expect_equal(unique(collect_metrics(res) %>% pull(.metric)), c("rmse", "rsq")) 40 | h2o.shutdown(prompt = FALSE) 41 | }) 42 | 43 | 44 | test_that("test classification metrics multiclass", { 45 | skip_on_cran() 46 | h2o.init(nthreads = 1) 47 | data(iris) 48 | 49 | spec <- rand_forest(mode = "classification", min_n = tune()) %>% 50 | set_engine("h2o") 51 | 52 | rec <- iris %>% 53 | recipe(Species ~ .) 54 | 55 | params <- list(min_n = min_n(c(1, 10))) %>% 56 | grid_regular() 57 | 58 | resamples <- bootstraps(iris, times = 1) 59 | 60 | clf_metrics <- metric_set(accuracy, mn_log_loss) 61 | 62 | res <- 63 | tune_grid_h2o( 64 | object = spec, 65 | preprocessor = rec, 66 | resamples = resamples, 67 | grid = params, 68 | metrics = clf_metrics, 69 | control = control_h2o(save_pred = FALSE, save_models = FALSE) 70 | ) 71 | expect_equal(unique(collect_metrics(res) %>% pull(.metric)), c("accuracy", "mn_log_loss")) 72 | 73 | h2o.shutdown(prompt = FALSE) 74 | }) 75 | 76 | 77 | test_that("test classification metrics binary", { 78 | skip_on_cran() 79 | h2o.init(nthreads = 1) 80 | data(two_class_dat) 81 | 82 | spec <- rand_forest(mode = "classification", min_n = tune()) %>% 83 | set_engine("h2o") 84 | 85 | rec <- two_class_dat %>% 86 | recipe(Class ~ .) 87 | 88 | params <- list(min_n = min_n(c(1, 10))) %>% 89 | grid_regular() 90 | 91 | resamples <- bootstraps(two_class_dat, times = 1) 92 | 93 | clf_metrics <- metric_set(roc_auc, pr_auc) 94 | 95 | res <- 96 | tune_grid_h2o( 97 | object = spec, 98 | preprocessor = rec, 99 | resamples = resamples, 100 | grid = params, 101 | metrics = clf_metrics, 102 | control = control_h2o(save_pred = FALSE, save_models = FALSE) 103 | ) 104 | expect_equal( 105 | unique(collect_metrics(res) %>% pull(.metric)), 106 | c("pr_auc", "roc_auc") 107 | ) 108 | 109 | h2o.shutdown(prompt = FALSE) 110 | }) 111 | -------------------------------------------------------------------------------- /tests/testthat/test_mlp_h2o.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(parsnip) 3 | library(tibble) 4 | library(magrittr) 5 | library(h2o) 6 | 7 | 8 | test_that("mlp h2o formula method", { 9 | skip_on_cran() 10 | 11 | h2o.init(nthreads = 1) 12 | iris_df <- as_tibble(iris) 13 | 14 | # classfication 15 | h2o_clf_fitted <- 16 | h2o.deeplearning( 17 | x = 1:4, 18 | y = 5, 19 | training_frame = as.h2o(iris_df), 20 | hidden = 100, 21 | activation = "Rectifier", 22 | seed = 1234, 23 | reproducible = TRUE 24 | ) 25 | h2o_clf_preds <- predict(h2o_clf_fitted, as.h2o(iris_df)) 26 | h2o_clf_preds <- as_tibble(h2o_clf_preds) 27 | 28 | mlp_clf <- mlp(mode = "classification", hidden_units = 100) %>% 29 | set_engine("h2o", seed = 1234, reproducible = TRUE) 30 | 31 | fitted_clf <- mlp_clf %>% fit(Species ~ ., iris_df) 32 | 33 | clf_preds <- predict(fitted_clf, iris_df) 34 | clf_probs <- predict(fitted_clf, iris_df, type = "prob") 35 | 36 | expect_equal(clf_preds[[1]], h2o_clf_preds$predict) 37 | expect_equal(clf_probs[[1]], h2o_clf_preds$setosa) 38 | expect_equal(clf_probs[[2]], h2o_clf_preds$versicolor) 39 | expect_equal(clf_probs[[3]], h2o_clf_preds$virginica) 40 | 41 | # regression 42 | h2o_regr_fitted <- 43 | h2o::h2o.deeplearning( 44 | x = 2:5, 45 | y = 1, 46 | training_frame = as.h2o(iris_df), 47 | hidden = 100, 48 | activation = "Rectifier", 49 | seed = 1234, 50 | reproducible = TRUE 51 | ) 52 | h2o_regr_preds <- predict(h2o_regr_fitted, as.h2o(iris_df)) 53 | h2o_regr_preds <- as_tibble(h2o_regr_preds) 54 | 55 | mlp_regr <- mlp(mode = "regression", hidden_units = 100) %>% 56 | set_engine("h2o", seed = 1234, reproducible = TRUE) 57 | 58 | fitted_regr <- mlp_regr %>% fit(Sepal.Length ~ ., iris_df) 59 | regr_preds <- predict(fitted_regr, iris_df) 60 | 61 | expect_equal(h2o_regr_preds$predict, regr_preds$.pred) 62 | }) 63 | 64 | 65 | test_that("mlp h2o non-formula method", { 66 | skip_on_cran() 67 | 68 | h2o.init(nthreads = 1) 69 | iris_df <- as_tibble(iris) 70 | 71 | h2o_clf_fitted <- 72 | h2o::h2o.deeplearning( 73 | x = 1:4, 74 | y = 5, 75 | training_frame = as.h2o(iris_df), 76 | hidden = 100, 77 | activation = "Rectifier", 78 | seed = 1234, 79 | reproducible = TRUE 80 | ) 81 | h2o_clf_preds <- predict(h2o_clf_fitted, as.h2o(iris_df)) 82 | h2o_clf_preds <- as_tibble(h2o_clf_preds) 83 | 84 | mlp_clf <- mlp(mode = "classification", hidden_units = 100) %>% 85 | set_engine("h2o", seed = 1234, reproducible = TRUE) 86 | 87 | fitted_clf <- mlp_clf %>% fit_xy(iris_df[, -5], iris_df$Species) 88 | 89 | clf_preds <- predict(fitted_clf, iris_df) 90 | clf_probs <- predict(fitted_clf, iris_df, type = "prob") 91 | 92 | expect_equal(clf_preds[[1]], h2o_clf_preds$predict) 93 | expect_equal(clf_probs[[1]], h2o_clf_preds$setosa) 94 | expect_equal(clf_probs[[2]], h2o_clf_preds$versicolor) 95 | expect_equal(clf_probs[[3]], h2o_clf_preds$virginica) 96 | }) 97 | 98 | 99 | test_that("mlp h2o automatic use of activation function with dropout", { 100 | skip_on_cran() 101 | 102 | h2o.init(nthreads = 1) 103 | iris_df <- as_tibble(iris) 104 | 105 | # rectifier with dropout 106 | h2o_clf_fitted <- 107 | h2o::h2o.deeplearning( 108 | x = 1:4, 109 | y = 5, 110 | training_frame = as.h2o(iris_df), 111 | hidden = 100, 112 | activation = "RectifierWithDropout", 113 | hidden_dropout_ratios = 0.2, 114 | seed = 1234, 115 | reproducible = TRUE 116 | ) 117 | h2o_clf_preds <- predict(h2o_clf_fitted, as.h2o(iris_df)) 118 | h2o_clf_preds <- as_tibble(h2o_clf_preds) 119 | 120 | mlp_clf <- 121 | mlp( 122 | mode = "classification", 123 | hidden_units = 100, 124 | activation = "relu", 125 | dropout = 0.2 126 | ) %>% 127 | set_engine("h2o", seed = 1234, reproducible = TRUE) 128 | 129 | fitted_clf <- mlp_clf %>% fit_xy(iris_df[, -5], iris_df$Species) 130 | clf_preds <- predict(fitted_clf, iris_df) 131 | 132 | expect_equal(clf_preds[[1]], h2o_clf_preds$predict) 133 | }) 134 | 135 | 136 | test_that("mlp h2o multi_predict", { 137 | mlp_clf <- 138 | mlp(mode = "classification", hidden_units = 100, epochs = 100) %>% 139 | set_engine("h2o", seed = 1234, reproducible = TRUE) 140 | 141 | fitted_clf <- mlp_clf %>% 142 | fit(Species ~ ., iris_df) 143 | 144 | expect_true(parsnip::has_multi_predict(fitted_clf)) 145 | expect_equal(parsnip::multi_predict_args(fitted_clf), "epochs") 146 | 147 | expect_equal( 148 | nrow(multi_predict(fitted_clf, iris_df, type = "class", epochs = 200)), 149 | 150 150 | ) 151 | }) 152 | 153 | 154 | test_that("mlp h2o early stopping", { 155 | mlp_clf <- 156 | mlp(mode = "classification", hidden_units = 100, epochs = 100) %>% 157 | set_engine("h2o", 158 | seed = 1234, reproducible = TRUE, stopping_rounds = 10, 159 | validation = 0.1 160 | ) 161 | 162 | fitted_clf <- mlp_clf %>% 163 | fit(Species ~ ., iris_df) 164 | }) 165 | -------------------------------------------------------------------------------- /tests/testthat/test_multinom_reg_h2o.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(parsnip) 3 | library(tibble) 4 | library(magrittr) 5 | library(h2o) 6 | 7 | 8 | test_that("multinom_reg h2o formula method", { 9 | skip_on_cran() 10 | 11 | h2o.init(nthreads = 1) 12 | iris_df <- as_tibble(iris) 13 | 14 | # classfication 15 | h2o_clf_fitted <- 16 | h2o.glm( 17 | x = 1:4, 18 | y = 5, 19 | training_frame = as.h2o(iris_df), 20 | family = "multinomial", 21 | seed = 1234, 22 | ) 23 | h2o_clf_preds <- predict(h2o_clf_fitted, as.h2o(iris_df)) 24 | h2o_clf_preds <- as_tibble(h2o_clf_preds) 25 | 26 | clf <- multinom_reg(mode = "classification") %>% 27 | set_engine("h2o", seed = 1234) 28 | 29 | fitted_clf <- clf %>% fit(Species ~ ., iris_df) 30 | 31 | clf_preds <- predict(fitted_clf, iris_df) 32 | clf_probs <- predict(fitted_clf, iris_df, type = "prob") 33 | 34 | expect_equal(clf_preds[[1]], h2o_clf_preds$predict) 35 | expect_equal(clf_probs[[1]], h2o_clf_preds$setosa) 36 | expect_equal(clf_probs[[2]], h2o_clf_preds$versicolor) 37 | expect_equal(clf_probs[[3]], h2o_clf_preds$virginica) 38 | }) 39 | -------------------------------------------------------------------------------- /tests/testthat/test_naive_Bayes.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(parsnip) 3 | library(discrim) 4 | library(tibble) 5 | library(magrittr) 6 | library(h2o) 7 | 8 | test_that("naive_Bayes h2o formula method", { 9 | skip_on_cran() 10 | 11 | h2o.init(nthreads = 1) 12 | iris_df <- as_tibble(iris) 13 | 14 | # classfication 15 | h2o_clf_fitted <- 16 | h2o.naiveBayes( 17 | x = 1:4, 18 | y = 5, 19 | training_frame = as.h2o(iris_df), 20 | seed = 1234 21 | ) 22 | h2o_clf_preds <- predict(h2o_clf_fitted, as.h2o(iris_df)) 23 | h2o_clf_preds <- as_tibble(h2o_clf_preds) 24 | 25 | clf <- naive_Bayes(mode = "classification") %>% 26 | set_engine("h2o", seed = 1234) 27 | 28 | fitted_clf <- clf %>% fit(Species ~ ., iris_df) 29 | 30 | clf_preds <- predict(fitted_clf, iris_df) 31 | clf_probs <- predict(fitted_clf, iris_df, type = "prob") 32 | 33 | expect_equal(clf_preds[[1]], h2o_clf_preds$predict) 34 | expect_equal(clf_probs[[1]], h2o_clf_preds$setosa) 35 | expect_equal(clf_probs[[2]], h2o_clf_preds$versicolor) 36 | }) 37 | -------------------------------------------------------------------------------- /tests/testthat/test_rand_forest_h2o.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(parsnip) 3 | library(tibble) 4 | library(magrittr) 5 | library(h2o) 6 | 7 | 8 | test_that("boost_tree h2o formula method", { 9 | skip_on_cran() 10 | h2o.init(nthreads = 1) 11 | iris_df <- as_tibble(iris) 12 | 13 | # classfication 14 | h2o_clf_fitted <- 15 | h2o.randomForest( 16 | x = 1:4, 17 | y = 5, 18 | training_frame = as.h2o(iris_df), 19 | ntrees = 50, 20 | min_rows = 1, 21 | mtries = -1, 22 | seed = 1234, 23 | ) 24 | h2o_clf_preds <- predict(h2o_clf_fitted, as.h2o(iris_df)) 25 | h2o_clf_preds <- as_tibble(h2o_clf_preds) 26 | 27 | clf <- 28 | rand_forest( 29 | mode = "classification", 30 | trees = 50, 31 | min_n = 1, 32 | mtry = -1 33 | ) %>% 34 | set_engine("h2o", seed = 1234) 35 | 36 | fitted_clf <- clf %>% fit(Species ~ ., iris_df) 37 | 38 | clf_preds <- predict(fitted_clf, iris_df) 39 | clf_probs <- predict(fitted_clf, iris_df, type = "prob") 40 | 41 | expect_equal(clf_preds[[1]], h2o_clf_preds$predict) 42 | expect_equal(clf_probs[[1]], h2o_clf_preds$setosa) 43 | expect_equal(clf_probs[[2]], h2o_clf_preds$versicolor) 44 | expect_equal(clf_probs[[3]], h2o_clf_preds$virginica) 45 | 46 | # regression 47 | h2o_regr_fitted <- 48 | h2o.randomForest( 49 | x = 2:5, 50 | y = 1, 51 | training_frame = as.h2o(iris_df), 52 | ntrees = 50, 53 | min_rows = 1, 54 | mtries = -1, 55 | seed = 1234 56 | ) 57 | h2o_regr_preds <- predict(h2o_regr_fitted, as.h2o(iris_df)) 58 | h2o_regr_preds <- as_tibble(h2o_regr_preds) 59 | 60 | regr <- rand_forest( 61 | mode = "regression", 62 | trees = 50, 63 | min_n = 1, 64 | mtry = -1 65 | ) %>% 66 | set_engine("h2o", seed = 1234) 67 | 68 | fitted_regr <- regr %>% fit(Sepal.Length ~ ., iris_df) 69 | regr_preds <- predict(fitted_regr, iris_df) 70 | 71 | expect_equal(h2o_regr_preds$predict, regr_preds$.pred) 72 | }) 73 | -------------------------------------------------------------------------------- /tests/testthat/test_rule_fit.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(parsnip) 3 | library(tibble) 4 | library(magrittr) 5 | library(h2o) 6 | library(rules) 7 | 8 | 9 | test_that("boost_tree h2o formula method", { 10 | skip_on_cran() 11 | h2o.init(nthreads = 1) 12 | iris_df <- as_tibble(iris) 13 | 14 | # classfication 15 | h2o_clf_fitted <- 16 | h2o.rulefit( 17 | x = 1:4, 18 | y = 5, 19 | training_frame = as.h2o(iris_df), 20 | seed = 1234, 21 | ) 22 | h2o_clf_preds <- predict(h2o_clf_fitted, as.h2o(iris_df)) 23 | h2o_clf_preds <- as_tibble(h2o_clf_preds) 24 | 25 | clf <- 26 | rule_fit(mode = "classification") %>% 27 | set_engine("h2o", seed = 1234) 28 | 29 | fitted_clf <- clf %>% fit(Species ~ ., iris_df) 30 | 31 | clf_preds <- predict(fitted_clf, iris_df) 32 | clf_probs <- predict(fitted_clf, iris_df, type = "prob") 33 | 34 | expect_equal(clf_preds[[1]], h2o_clf_preds$predict) 35 | expect_equal(clf_probs[[1]], h2o_clf_preds$setosa) 36 | expect_equal(clf_probs[[2]], h2o_clf_preds$versicolor) 37 | expect_equal(clf_probs[[3]], h2o_clf_preds$virginica) 38 | 39 | # regression 40 | h2o_regr_fitted <- 41 | h2o.rulefit( 42 | x = 2:5, 43 | y = 1, 44 | training_frame = as.h2o(iris_df), 45 | seed = 1234 46 | ) 47 | h2o_regr_preds <- predict(h2o_regr_fitted, as.h2o(iris_df)) 48 | h2o_regr_preds <- as_tibble(h2o_regr_preds) 49 | 50 | regr <- rule_fit(mode = "regression", ) %>% 51 | set_engine("h2o", seed = 1234) 52 | 53 | fitted_regr <- regr %>% fit(Sepal.Length ~ ., iris_df) 54 | regr_preds <- predict(fitted_regr, iris_df) 55 | 56 | expect_equal(h2o_regr_preds$predict, regr_preds$.pred) 57 | }) 58 | -------------------------------------------------------------------------------- /tests/testthat/test_save.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(parsnip) 3 | library(tibble) 4 | library(magrittr) 5 | library(h2o) 6 | 7 | 8 | test_that("model persistence, model spec", { 9 | skip_on_cran() 10 | 11 | h2o.init(nthreads = 1) 12 | iris_df <- as_tibble(iris) 13 | 14 | clf <- 15 | mlp(mode = "classification") %>% 16 | set_engine("h2o", seed = 1234) 17 | 18 | fitted_clf <- clf %>% fit(Species ~ ., iris_df) 19 | preds <- predict(fitted_clf, iris_df) 20 | 21 | # save model 22 | model_file <- tempfile(fileext = ".mod") 23 | parsnip_file <- tempfile(fileext = ".rds") 24 | 25 | saveRDS(fitted_clf, parsnip_file) 26 | write_h2o(fitted_clf, model_file) 27 | 28 | # remove model 29 | h2o.removeAll() 30 | remove(fitted_clf) 31 | 32 | # restore model 33 | restored_clf <- readRDS(parsnip_file) 34 | restored_clf <- read_h2o(restored_clf, model_file) 35 | preds_post <- predict(restored_clf, iris_df) 36 | 37 | # compare 38 | expect_equal(preds, preds_post) 39 | }) 40 | --------------------------------------------------------------------------------