├── .Rbuildignore
├── .gitignore
├── DESCRIPTION
├── LICENSE
├── LICENSE.md
├── NAMESPACE
├── R
    ├── automl.R
    ├── boost_tree.R
    ├── control_h2o.R
    ├── glm_wrapper.R
    ├── imports.R
    ├── linear_reg.R
    ├── logistic_reg.R
    ├── make_call.R
    ├── metrics.R
    ├── mlp.R
    ├── multi_predicts.R
    ├── multinomial_reg.R
    ├── naive_Bayes.R
    ├── rand_forest.R
    ├── read_h2o.R
    ├── rule_fit.R
    ├── translate_args.R
    ├── tune_grid_h2o.R
    ├── utils-pipe.R
    ├── utils.R
    └── zzz.R
├── README.md
├── _pkgdown.yml
├── h2oparsnip.Rproj
├── man
    ├── automl.Rd
    ├── control_h2o.Rd
    ├── extract_h2o_models.Rd
    ├── extract_h2o_preds.Rd
    ├── extract_h2o_scores.Rd
    ├── h2o_automl_train.Rd
    ├── h2o_gbm_train.Rd
    ├── h2o_glm_train.Rd
    ├── h2o_mlp_train.Rd
    ├── h2o_naiveBayes_train.Rd
    ├── h2o_rf_train.Rd
    ├── h2o_rulefit_train.Rd
    ├── model_spec_to_algorithm.Rd
    ├── mse_vec.Rd
    ├── pipe.Rd
    ├── remove_h2o_models.Rd
    ├── tune_grid_h2o.Rd
    └── write_h2o.Rd
└── tests
    ├── testthat.R
    └── testthat
        ├── test_boost_tree_h2o.R
        ├── test_control_h2o.R
        ├── test_linear_reg_h2o.R
        ├── test_logistic_reg_h2o.R
        ├── test_metrics.R
        ├── test_mlp_h2o.R
        ├── test_multinom_reg_h2o.R
        ├── test_naive_Bayes.R
        ├── test_rand_forest_h2o.R
        ├── test_rule_fit.R
        └── test_save.R


/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | ^LICENSE\.md$
4 | ^_pkgdown\.yml$
5 | ^docs$
6 | ^pkgdown$
7 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | docs
6 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: h2oparsnip
 2 | Type: Package
 3 | Title: Model Wrappers for H2O models
 4 | Version: 0.1.0
 5 | Author: Steven Pawley
 6 | Maintainer: Steven Pawley <dr.stevenpawley@gmail.com>
 7 | Description: Bindings for additional models for use with the 'parsnip' package.
 8 |     Models include learning algorithms available in the R H2O package.
 9 | License: MIT + file LICENSE
10 | URL: https://github.com/stevenpawley/h2oparsnip
11 | Suggests:
12 |     testthat,
13 |     roxygen2,
14 |     covr
15 | Encoding: UTF-8
16 | LazyData: true
17 | Depends:
18 |     R (>= 2.10),
19 |     parsnip
20 | Imports:
21 |     rlang,
22 |     tibble,
23 |     magrittr,
24 |     h2o,
25 |     dials,
26 |     tune,
27 |     yardstick,
28 |     rsample,
29 |     discrim
30 | BugReports: https://github.com/stevenpawley/h2oparsnip/issues
31 | RoxygenNote: 7.1.2
32 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2020
2 | COPYRIGHT HOLDER: Steven Pawley
3 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | 
 3 | Copyright (c) 2020 Steven Pawley
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | S3method(mse,data.frame)
 4 | S3method(multi_predict,"_H2OMultinomialModel")
 5 | S3method(multi_predict,"_H2ORegressionModel")
 6 | S3method(print,automl)
 7 | S3method(read_h2o,model_fit)
 8 | S3method(read_h2o,workflow)
 9 | S3method(write_h2o,model_fit)
10 | S3method(write_h2o,workflow)
11 | export("%>%")
12 | export(automl)
13 | export(control_h2o)
14 | export(h2o_automl_train)
15 | export(h2o_gbm_train)
16 | export(h2o_glm_train)
17 | export(h2o_mlp_train)
18 | export(h2o_naiveBayes_train)
19 | export(h2o_rf_train)
20 | export(h2o_rulefit_train)
21 | export(mse)
22 | export(mse_vec)
23 | export(read_h2o)
24 | export(tune_grid_h2o)
25 | export(write_h2o)
26 | importFrom(magrittr,"%>%")
27 | importFrom(parsnip,multi_predict)
28 | importFrom(tibble,as_tibble)
29 | importFrom(tibble,tibble)
30 | importFrom(yardstick,accuracy)
31 | importFrom(yardstick,mn_log_loss)
32 | importFrom(yardstick,pr_auc)
33 | importFrom(yardstick,rmse)
34 | importFrom(yardstick,roc_auc)
35 | importFrom(yardstick,rsq)
36 | 


--------------------------------------------------------------------------------
/R/automl.R:
--------------------------------------------------------------------------------
  1 | #' General interface for automl models
  2 | #'
  3 | #' @param mode A single character string for the type of model.
  4 | #'
  5 | #' @return A model_spec
  6 | #' @export
  7 | automl <- function(mode = "classification") {
  8 |   args <- list()
  9 | 
 10 |   parsnip::new_model_spec(
 11 |     "automl",
 12 |     args = args,
 13 |     eng_args = NULL,
 14 |     mode = mode,
 15 |     method = NULL,
 16 |     engine = "h2o"
 17 |   )
 18 | }
 19 | 
 20 | #' @export
 21 | print.automl <- function(x, ...) {
 22 |   cat("Automl Model Specification (", x$mode, ")\n\n", sep = "")
 23 |   parsnip::model_printer(x, ...)
 24 | 
 25 |   if (!is.null(x$method$fit$args)) {
 26 |     cat("Model fit template:\n")
 27 |     print(parsnip::show_call(x))
 28 |   }
 29 | 
 30 |   invisible(x)
 31 | }
 32 | 
 33 | 
 34 | add_automl <- function() {
 35 |   # define model
 36 |   parsnip::set_new_model("automl")
 37 | 
 38 |   # define model modes
 39 |   parsnip::set_model_mode(model = "automl", mode = "classification")
 40 |   parsnip::set_model_engine("automl", mode = "classification", eng = "h2o")
 41 | 
 42 |   parsnip::set_model_mode(model = "automl", mode = "regression")
 43 |   parsnip::set_model_engine("automl", mode = "regression", eng = "h2o")
 44 | 
 45 |   # define dependencies for each mode
 46 |   parsnip::set_dependency("automl", "h2o", "h2o")
 47 | 
 48 |   # define fit methods
 49 |   parsnip::set_fit(
 50 |     model = "automl",
 51 |     eng = "h2o",
 52 |     mode = "regression",
 53 |     value = list(
 54 |       interface = "formula",
 55 |       protect = c("formula", "x", "y", "training_frame"),
 56 |       func = c(fun = "h2o_automl_train"),
 57 |       defaults = list()
 58 |     )
 59 |   )
 60 |   parsnip::set_fit(
 61 |     model = "automl",
 62 |     eng = "h2o",
 63 |     mode = "classification",
 64 |     value = list(
 65 |       interface = "formula",
 66 |       protect = c("formula", "x", "y", "training_frame"),
 67 |       func = c(fun = "h2o_automl_train"),
 68 |       defaults = list()
 69 |     )
 70 |   )
 71 |   parsnip::set_encoding(
 72 |     model = "automl",
 73 |     eng = "h2o",
 74 |     mode = "regression",
 75 |     options = list(
 76 |       predictor_indicators = "none",
 77 |       compute_intercept = FALSE,
 78 |       remove_intercept = FALSE,
 79 |       allow_sparse_x = FALSE
 80 |     )
 81 |   )
 82 |   parsnip::set_encoding(
 83 |     model = "automl",
 84 |     eng = "h2o",
 85 |     mode = "classification",
 86 |     options = list(
 87 |       predictor_indicators = "none",
 88 |       compute_intercept = FALSE,
 89 |       remove_intercept = FALSE,
 90 |       allow_sparse_x = FALSE
 91 |     )
 92 |   )
 93 | 
 94 |   # regression predict
 95 |   parsnip::set_pred(
 96 |     model = "automl",
 97 |     eng = "h2o",
 98 |     mode = "regression",
 99 |     type = "numeric",
100 |     value = list(
101 |       pre = function(x, object) h2o::as.h2o(x),
102 |       post = function(x, object) as.data.frame(x)$predict,
103 |       func = c(pkg = "h2o", fun = "h2o.predict"),
104 |       args = list(
105 |         object = quote(object$fit),
106 |         newdata = quote(new_data)
107 |       )
108 |     )
109 |   )
110 |   parsnip::set_pred(
111 |     model = "automl",
112 |     eng = "h2o",
113 |     mode = "regression",
114 |     type = "raw",
115 |     value = list(
116 |       pre = function(x, object) h2o::as.h2o(x),
117 |       post = function(x, object) as.data.frame(x),
118 |       func = c(pkg = "h2o", fun = "h2o.predict"),
119 |       args = list(
120 |         object = quote(object$fit),
121 |         newdata = quote(new_data)
122 |       )
123 |     )
124 |   )
125 | 
126 |   # classification predict
127 |   parsnip::set_pred(
128 |     model = "automl",
129 |     eng = "h2o",
130 |     mode = "classification",
131 |     type = "class",
132 |     value = list(
133 |       pre = function(x, object) h2o::as.h2o(x),
134 |       post = function(x, object) as.data.frame(x)$predict,
135 |       func = c(pkg = "h2o", fun = "h2o.predict"),
136 |       args = list(
137 |         object = quote(object$fit),
138 |         newdata = quote(new_data)
139 |       )
140 |     )
141 |   )
142 |   parsnip::set_pred(
143 |     model = "automl",
144 |     eng = "h2o",
145 |     mode = "classification",
146 |     type = "prob",
147 |     value = list(
148 |       pre = function(x, object) h2o::as.h2o(x),
149 |       post = function(x, object) as.data.frame(x[, 2:ncol(x)]),
150 |       func = c(pkg = "h2o", fun = "h2o.predict"),
151 |       args = list(
152 |         object = quote(object$fit),
153 |         newdata = quote(new_data)
154 |       )
155 |     )
156 |   )
157 |   parsnip::set_pred(
158 |     model = "automl",
159 |     eng = "h2o",
160 |     mode = "classification",
161 |     type = "raw",
162 |     value = list(
163 |       pre = function(x, object) h2o::as.h2o(x),
164 |       post = function(x, object) as.data.frame(x),
165 |       func = c(pkg = "h2o", fun = "h2o.predict"),
166 |       args = list(
167 |         object = quote(object$fit),
168 |         newdata = quote(new_data)
169 |       )
170 |     )
171 |   )
172 | }
173 | 
174 | 
175 | #' Wrapper for training a h2o.automl model
176 | #'
177 | #' @param formula formula
178 | #' @param data data.frame of training data
179 | #' @param ... Other arguments to pass the h2o.automl
180 | #'
181 | #' @return evaluated h2o model call
182 | #' @export
183 | h2o_automl_train <- function(formula, data, ...) {
184 |   others <- list(...)
185 | 
186 |   # get term names
187 |   X <- attr(stats::terms(formula, data = data), "term.labels")
188 |   y <- all.vars(formula)[1]
189 | 
190 |   # convert to H2OFrame (although parsnip doesn't support H2OFrames right now)
191 |   if (!inherits(data, "H2OFrame")) {
192 |     data <- h2o::as.h2o(data)
193 |   }
194 | 
195 |   # define arguments
196 |   args <- list(
197 |     x = X,
198 |     y = y,
199 |     training_frame = data
200 |   )
201 | 
202 |   others <- list(...)
203 |   res <- make_h2o_call("h2o.automl", args, others)
204 | 
205 |   res
206 | }
207 | 


--------------------------------------------------------------------------------
/R/boost_tree.R:
--------------------------------------------------------------------------------
  1 | add_boost_tree_h2o <- function() {
  2 |   parsnip::set_model_engine("boost_tree", "classification", "h2o")
  3 |   parsnip::set_model_engine("boost_tree", "regression", "h2o")
  4 |   parsnip::set_dependency("boost_tree", "h2o", "h2o")
  5 | 
  6 |   parsnip::set_model_arg(
  7 |     model = "boost_tree",
  8 |     eng = "h2o",
  9 |     parsnip = "trees",
 10 |     original = "ntrees",
 11 |     func = list(pkg = "dials", fun = "trees"),
 12 |     has_submodel = TRUE
 13 |   )
 14 |   parsnip::set_model_arg(
 15 |     model = "boost_tree",
 16 |     eng = "h2o",
 17 |     parsnip = "tree_depth",
 18 |     original = "max_depth",
 19 |     func = list(pkg = "dials", fun = "tree_depth"),
 20 |     has_submodel = FALSE
 21 |   )
 22 |   parsnip::set_model_arg(
 23 |     model = "boost_tree",
 24 |     eng = "h2o",
 25 |     parsnip = "min_n",
 26 |     original = "min_rows",
 27 |     func = list(pkg = "dials", fun = "min_n"),
 28 |     has_submodel = FALSE
 29 |   )
 30 |   parsnip::set_model_arg(
 31 |     model = "boost_tree",
 32 |     eng = "h2o",
 33 |     parsnip = "learn_rate",
 34 |     original = "learn_rate",
 35 |     func = list(pkg = "dials", fun = "learn_rate"),
 36 |     has_submodel = FALSE
 37 |   )
 38 |   parsnip::set_model_arg(
 39 |     model = "boost_tree",
 40 |     eng = "h2o",
 41 |     parsnip = "sample_size",
 42 |     original = "sample_rate",
 43 |     func = list(pkg = "dials", fun = "sample_size"),
 44 |     has_submodel = FALSE
 45 |   )
 46 |   parsnip::set_model_arg(
 47 |     model = "boost_tree",
 48 |     eng = "h2o",
 49 |     parsnip = "mtry",
 50 |     original = "col_sample_rate",
 51 |     func = list(pkg = "dials", fun = "mtry"),
 52 |     has_submodel = FALSE
 53 |   )
 54 |   parsnip::set_model_arg(
 55 |     model = "boost_tree",
 56 |     eng = "h2o",
 57 |     parsnip = "loss_reduction",
 58 |     original = "min_split_improvement",
 59 |     func = list(pkg = "dials", fun = "loss_reduction"),
 60 |     has_submodel = FALSE
 61 |   )
 62 |   parsnip::set_model_arg(
 63 |     model = "boost_tree",
 64 |     eng = "h2o",
 65 |     parsnip = "stop_iter",
 66 |     original = "stopping_rounds",
 67 |     func = list(pkg = "dials", fun = "stop_iter"),
 68 |     has_submodel = FALSE
 69 |   )
 70 |   parsnip::set_fit(
 71 |     model = "boost_tree",
 72 |     eng = "h2o",
 73 |     mode = "regression",
 74 |     value = list(
 75 |       interface = "formula",
 76 |       protect = c("formula", "x", "y", "training_frame"),
 77 |       func = c(fun = "h2o_gbm_train"),
 78 |       defaults = list()
 79 |     )
 80 |   )
 81 |   parsnip::set_fit(
 82 |     model = "boost_tree",
 83 |     eng = "h2o",
 84 |     mode = "classification",
 85 |     value = list(
 86 |       interface = "formula",
 87 |       protect = c("formula", "x", "y", "training_frame"),
 88 |       func = c(fun = "h2o_gbm_train"),
 89 |       defaults = list()
 90 |     )
 91 |   )
 92 |   parsnip::set_encoding(
 93 |     model = "boost_tree",
 94 |     eng = "h2o",
 95 |     mode = "classification",
 96 |     options = list(
 97 |       predictor_indicators = "none",
 98 |       compute_intercept = FALSE,
 99 |       remove_intercept = FALSE,
100 |       allow_sparse_x = FALSE
101 |     )
102 |   )
103 |   parsnip::set_encoding(
104 |     model = "boost_tree",
105 |     eng = "h2o",
106 |     mode = "regression",
107 |     options = list(
108 |       predictor_indicators = "none",
109 |       compute_intercept = FALSE,
110 |       remove_intercept = FALSE,
111 |       allow_sparse_x = FALSE
112 |     )
113 |   )
114 | 
115 |   # regression predict
116 |   parsnip::set_pred(
117 |     model = "boost_tree",
118 |     eng = "h2o",
119 |     mode = "regression",
120 |     type = "numeric",
121 |     value = list(
122 |       pre = function(x, object) h2o::as.h2o(x),
123 |       post = function(x, object) as.data.frame(x)$predict,
124 |       func = c(pkg = "h2o", fun = "h2o.predict"),
125 |       args = list(
126 |         object = quote(object$fit),
127 |         newdata = quote(new_data)
128 |       )
129 |     )
130 |   )
131 |   parsnip::set_pred(
132 |     model = "boost_tree",
133 |     eng = "h2o",
134 |     mode = "regression",
135 |     type = "raw",
136 |     value = list(
137 |       pre = function(x, object) h2o::as.h2o(x),
138 |       post = function(x, object) as.data.frame(x),
139 |       func = c(pkg = "h2o", fun = "h2o.predict"),
140 |       args = list(
141 |         object = quote(object$fit),
142 |         newdata = quote(new_data)
143 |       )
144 |     )
145 |   )
146 | 
147 |   # classification predict
148 |   parsnip::set_pred(
149 |     model = "boost_tree",
150 |     eng = "h2o",
151 |     mode = "classification",
152 |     type = "class",
153 |     value = list(
154 |       pre = function(x, object) h2o::as.h2o(x),
155 |       post = function(x, object) as.data.frame(x)$predict,
156 |       func = c(pkg = "h2o", fun = "h2o.predict"),
157 |       args = list(
158 |         object = quote(object$fit),
159 |         newdata = quote(new_data)
160 |       )
161 |     )
162 |   )
163 |   parsnip::set_pred(
164 |     model = "boost_tree",
165 |     eng = "h2o",
166 |     mode = "classification",
167 |     type = "prob",
168 |     value = list(
169 |       pre = function(x, object) h2o::as.h2o(x),
170 |       post = function(x, object) as.data.frame(x[, 2:ncol(x)]),
171 |       func = c(pkg = "h2o", fun = "h2o.predict"),
172 |       args = list(
173 |         object = quote(object$fit),
174 |         newdata = quote(new_data)
175 |       )
176 |     )
177 |   )
178 |   parsnip::set_pred(
179 |     model = "boost_tree",
180 |     eng = "h2o",
181 |     mode = "classification",
182 |     type = "raw",
183 |     value = list(
184 |       pre = function(x, object) h2o::as.h2o(x),
185 |       post = function(x, object) as.data.frame(x),
186 |       func = c(pkg = "h2o", fun = "h2o.predict"),
187 |       args = list(
188 |         object = quote(object$fit),
189 |         newdata = quote(new_data)
190 |       )
191 |     )
192 |   )
193 | }
194 | 
195 | #' Wrapper for training a h2o.gbm model as part of a parsnip `boost_tree`
196 | #' h2o engine
197 | #'
198 | #' @param formula formula
199 | #' @param data data.frame of training data
200 | #' @param ntrees integer, the number of trees to build (default = 50).
201 | #' @param max_depth integer, the maximum tree depth (default = 10).
202 | #' @param min_rows integer, the minimum number of observations for a leaf
203 | #'   (default = 10).
204 | #' @param learn_rate numeric, the learning rate (default = 0.1, range is from
205 | #'   0.0 to 1.0).
206 | #' @param sample_rate numeric, the proportion of samples to use to build each
207 | #'   tree (default = 1.0).
208 | #' @param col_sample_rate numeric, the proportion of features available during
209 | #'   each node split (default = 1.0).
210 | #' @param min_split_improvement numeric,  minimum relative improvement in
211 | #'   squared error reduction in order for a split to happen (default = 1e-05)
212 | #' @param stopping_rounds An integer specifying the number of training
213 | #'   iterations without improvement before stopping. If `stopping_rounds = 0`
214 | #'   (the default) then early stopping is disabled.  If `validation` is used,
215 | #'   performance is base on the validation set; otherwise the training set is
216 | #'   used.
217 | #' @param validation A positive number. If on `[0, 1)` the value, `validation`
218 | #' is a random proportion of data in `x` and `y` that are used for performance
219 | #' assessment and potential early stopping. If 1 or greater, it is the _number_
220 | #' of training set samples use for these purposes.
221 | #' @param algorithm Whether to use the default h2o 'h2o.gbm' algorithm or use
222 | #'   'h2o.xgboost' via h2o.
223 | #' @param ... other arguments passed to the h2o engine.
224 | #'
225 | #' @return evaluated h2o model call
226 | #' @export
227 | h2o_gbm_train <-
228 |   function(formula,
229 |            data,
230 |            ntrees = 50,
231 |            max_depth = 5,
232 |            min_rows = 10,
233 |            learn_rate = 0.1,
234 |            sample_rate = 1.0,
235 |            col_sample_rate = 1.0,
236 |            min_split_improvement = 1e-05,
237 |            stopping_rounds = 0,
238 |            validation = 0,
239 |            algorithm = "h2o.gbm",
240 |            ...) {
241 |     others <- list(...)
242 | 
243 |     # get term names and convert to h2o
244 |     X <- attr(stats::terms(formula, data = data), "term.labels")
245 |     y <- all.vars(formula)[1]
246 | 
247 |     # early stopping
248 |     if (validation > 1) {
249 |       validation <- validation / nrow(data)
250 |     }
251 | 
252 |     if (stopping_rounds > 0 & validation > 0) {
253 |       n <- nrow(data)
254 |       trn_index <- sample(1:n, size = floor(n * validation) + 1)
255 |       valid <- data[-trn_index, ]
256 |       data <- data[trn_index, ]
257 |     } else {
258 |       valid <- NULL
259 |     }
260 | 
261 |     # convert to H2OFrame (although parsnip doesn't support H2OFrames right now)
262 |     if (!inherits(data, "H2OFrame")) {
263 |       data <- h2o::as.h2o(data)
264 |     }
265 | 
266 |     if (!is.null(valid)) {
267 |       valid <- h2o::as.h2o(valid)
268 |     }
269 | 
270 |     # convert mtry (number of features) to proportions
271 |     if (col_sample_rate > 1) {
272 |       col_sample_rate <- col_sample_rate / length(X)
273 |     }
274 | 
275 |     # define arguments
276 |     args <- list(
277 |       x = X,
278 |       y = y,
279 |       training_frame = data,
280 |       validation_frame = valid,
281 |       ntrees = ntrees,
282 |       max_depth = max_depth,
283 |       min_rows = min_rows,
284 |       learn_rate = learn_rate,
285 |       sample_rate = sample_rate,
286 |       min_split_improvement = min_split_improvement,
287 |       stopping_rounds = stopping_rounds
288 |     )
289 | 
290 |     if (algorithm == "h2o.gbm") {
291 |       args$col_sample_rate <- col_sample_rate
292 |     } else if (algorithm == "h2o.xgboost") {
293 |       args$colsample_bynode <- col_sample_rate
294 |     }
295 | 
296 |     res <- make_h2o_call(algorithm, args, others)
297 | 
298 |     res
299 |   }
300 | 


--------------------------------------------------------------------------------
/R/control_h2o.R:
--------------------------------------------------------------------------------
 1 | #' Control aspects of the grid search process
 2 | #'
 3 | #' `control_h2o` provides a function to set various aspects of the grid search process.
 4 | #' By default during tuning, the resampling predictions are stored within the h2o
 5 | #' cluster. To save memory and space, use 'save_pred = FALSE'.
 6 | #' ``
 7 | #'
 8 | #' @param verbose A logical for logging results as they are generated.
 9 | #' @param save_pred A logical for whether the out-of-sample predictions should be saved
10 | #'   for each model evaluated.
11 | #' @param save_models A logical for whether to retain the models associated with the
12 | #'   tuning and resampling iterations within the h2o cluster and append their h2o model
13 | #'   ids to the resamples object as a '.models' column.
14 | #'
15 | #' @return An object of `control_grid` and `control_resamples` class.
16 | #' @export
17 | #'
18 | #' @examples
19 | #' # to save space in the cluster use these settings (the defaults)
20 | #' control_h2o(verbose = TRUE, save_pred = FALSE)
21 | control_h2o <- function(verbose = FALSE, save_pred = FALSE, save_models = FALSE,
22 |                         event_level = "first") {
23 |   res <- list(
24 |     verbose = verbose,
25 |     save_pred = save_pred,
26 |     save_models = save_models
27 |   )
28 | 
29 |   class(res) <- c("control_grid", "control_resamples")
30 |   res
31 | }
32 | 


--------------------------------------------------------------------------------
/R/glm_wrapper.R:
--------------------------------------------------------------------------------
 1 | #' Wrapper for training a h2o.glm model as part of a parsnip
 2 | #'
 3 | #' @param formula formula
 4 | #' @param data data.frame of training data
 5 | #' @param alpha numeric, Distribution of regularization between the L1 (Lasso)
 6 | #'   and L2 (Ridge) penalties. A value of 1 for alpha represents Lasso
 7 | #'   regression, a value of 0 produces Ridge regression.
 8 | #' @param lambda numeric, regularization strength
 9 | #' @param family character, one of c("gaussian", "binomial", "quasibinomial",
10 | #'   "ordinal", "multinomial", "poisson", "gamma", "tweedie",
11 | #'   "negativebinomial")
12 | #' @param ... other arguments passed to the h2o engine.
13 | #'
14 | #' @return evaluated h2o model call
15 | #' @export
16 | h2o_glm_train <-
17 |   function(formula,
18 |            data,
19 |            alpha = NULL,
20 |            lambda = NULL,
21 |            family = NULL,
22 |            ...) {
23 |     others <- list(...)
24 | 
25 |     # get term names and convert to h2o
26 |     X <- attr(stats::terms(formula, data = data), "term.labels")
27 |     y <- all.vars(formula)[1]
28 | 
29 |     # convert to H2OFrame (although parsnip doesn't support H2OFrames right now)
30 |     if (!inherits(data, "H2OFrame")) {
31 |       data <- h2o::as.h2o(data)
32 |     }
33 | 
34 |     # define arguments
35 |     args <- list(
36 |       x = X,
37 |       y = y,
38 |       training_frame = data,
39 |       alpha = alpha,
40 |       lambda = lambda,
41 |       family = family
42 |     )
43 | 
44 |     res <- make_h2o_call("h2o.glm", args, others)
45 | 
46 |     if (!"alpha" %in% names(res@parameters)) {
47 |       res@parameters$alpha <- alpha
48 |     }
49 | 
50 |     if (!"lambda" %in% names(res@parameters)) {
51 |       res@parameters$lambda <- lambda
52 |     }
53 | 
54 |     if (!"family" %in% names(res@parameters)) {
55 |       res@parameters$family <- family
56 |     }
57 | 
58 |     res
59 |   }
60 | 


--------------------------------------------------------------------------------
/R/imports.R:
--------------------------------------------------------------------------------
1 | ## usethis namespace: start
2 | #' @importFrom tibble tibble as_tibble
3 | #' @importFrom parsnip multi_predict
4 | ## usethis namespace: end
5 | NULL
6 | 


--------------------------------------------------------------------------------
/R/linear_reg.R:
--------------------------------------------------------------------------------
 1 | add_linear_reg_h2o <- function() {
 2 |   parsnip::set_model_engine("linear_reg", "regression", "h2o")
 3 |   parsnip::set_dependency("linear_reg", "h2o", "h2o")
 4 | 
 5 |   parsnip::set_model_arg(
 6 |     model = "linear_reg",
 7 |     eng = "h2o",
 8 |     parsnip = "mixture",
 9 |     original = "alpha",
10 |     func = list(pkg = "dials", fun = "mixture"),
11 |     has_submodel = FALSE
12 |   )
13 |   parsnip::set_model_arg(
14 |     model = "linear_reg",
15 |     eng = "h2o",
16 |     parsnip = "penalty",
17 |     original = "lambda",
18 |     func = list(pkg = "dials", fun = "penalty"),
19 |     has_submodel = FALSE
20 |   )
21 |   parsnip::set_fit(
22 |     model = "linear_reg",
23 |     eng = "h2o",
24 |     mode = "regression",
25 |     value = list(
26 |       interface = "formula",
27 |       protect = c("formula", "x", "y", "training_frame"),
28 |       func = c(fun = "h2o_glm_train"),
29 |       defaults = list(
30 |         family = "gaussian"
31 |       )
32 |     )
33 |   )
34 |   parsnip::set_encoding(
35 |     model = "linear_reg",
36 |     eng = "h2o",
37 |     mode = "regression",
38 |     options = list(
39 |       predictor_indicators = "none",
40 |       compute_intercept = FALSE,
41 |       remove_intercept = FALSE,
42 |       allow_sparse_x = FALSE
43 |     )
44 |   )
45 | 
46 |   # regression predict
47 |   parsnip::set_pred(
48 |     model = "linear_reg",
49 |     eng = "h2o",
50 |     mode = "regression",
51 |     type = "numeric",
52 |     value = list(
53 |       pre = function(x, object) h2o::as.h2o(x),
54 |       post = function(x, object) as.data.frame(x)$predict,
55 |       func = c(pkg = "h2o", fun = "h2o.predict"),
56 |       args = list(
57 |         object = quote(object$fit),
58 |         newdata = quote(new_data)
59 |       )
60 |     )
61 |   )
62 |   parsnip::set_pred(
63 |     model = "linear_reg",
64 |     eng = "h2o",
65 |     mode = "regression",
66 |     type = "raw",
67 |     value = list(
68 |       pre = function(x, object) h2o::as.h2o(x),
69 |       post = function(x, object) as.data.frame(x),
70 |       func = c(pkg = "h2o", fun = "h2o.predict"),
71 |       args = list(
72 |         object = quote(object$fit),
73 |         newdata = quote(new_data)
74 |       )
75 |     )
76 |   )
77 | }
78 | 


--------------------------------------------------------------------------------
/R/logistic_reg.R:
--------------------------------------------------------------------------------
 1 | add_logistic_reg_h2o <- function() {
 2 |   parsnip::set_model_engine("logistic_reg", "classification", "h2o")
 3 |   parsnip::set_dependency("logistic_reg", "h2o", "h2o")
 4 | 
 5 |   parsnip::set_model_arg(
 6 |     model = "logistic_reg",
 7 |     eng = "h2o",
 8 |     parsnip = "mixture",
 9 |     original = "alpha",
10 |     func = list(pkg = "dials", fun = "mixture"),
11 |     has_submodel = FALSE
12 |   )
13 |   parsnip::set_model_arg(
14 |     model = "logistic_reg",
15 |     eng = "h2o",
16 |     parsnip = "penalty",
17 |     original = "lambda",
18 |     func = list(pkg = "dials", fun = "penalty"),
19 |     has_submodel = FALSE
20 |   )
21 |   parsnip::set_fit(
22 |     model = "logistic_reg",
23 |     eng = "h2o",
24 |     mode = "classification",
25 |     value = list(
26 |       interface = "formula",
27 |       protect = c("formula", "x", "y", "training_frame", "family"),
28 |       func = c(fun = "h2o_glm_train"),
29 |       defaults = list(
30 |         family = "binomial"
31 |       )
32 |     )
33 |   )
34 |   parsnip::set_encoding(
35 |     model = "logistic_reg",
36 |     eng = "h2o",
37 |     mode = "classification",
38 |     options = list(
39 |       predictor_indicators = "none",
40 |       compute_intercept = FALSE,
41 |       remove_intercept = FALSE,
42 |       allow_sparse_x = FALSE
43 |     )
44 |   )
45 | 
46 |   # classification predict
47 |   parsnip::set_pred(
48 |     model = "logistic_reg",
49 |     eng = "h2o",
50 |     mode = "classification",
51 |     type = "class",
52 |     value = list(
53 |       pre = function(x, object) h2o::as.h2o(x),
54 |       post = function(x, object) as.data.frame(x)$predict,
55 |       func = c(pkg = "h2o", fun = "h2o.predict"),
56 |       args = list(
57 |         object = quote(object$fit),
58 |         newdata = quote(new_data)
59 |       )
60 |     )
61 |   )
62 |   parsnip::set_pred(
63 |     model = "logistic_reg",
64 |     eng = "h2o",
65 |     mode = "classification",
66 |     type = "prob",
67 |     value = list(
68 |       pre = function(x, object) h2o::as.h2o(x),
69 |       post = function(x, object) as.data.frame(x[, 2:ncol(x)]),
70 |       func = c(pkg = "h2o", fun = "h2o.predict"),
71 |       args = list(
72 |         object = quote(object$fit),
73 |         newdata = quote(new_data)
74 |       )
75 |     )
76 |   )
77 |   parsnip::set_pred(
78 |     model = "logistic_reg",
79 |     eng = "h2o",
80 |     mode = "classification",
81 |     type = "raw",
82 |     value = list(
83 |       pre = function(x, object) h2o::as.h2o(x),
84 |       post = function(x, object) as.data.frame(x),
85 |       func = c(pkg = "h2o", fun = "h2o.predict"),
86 |       args = list(
87 |         object = quote(object$fit),
88 |         newdata = quote(new_data)
89 |       )
90 |     )
91 |   )
92 | }
93 | 


--------------------------------------------------------------------------------
/R/make_call.R:
--------------------------------------------------------------------------------
 1 | make_h2o_call <- function(.fn, args, others) {
 2 |   # remove args with NULLs
 3 |   args <- args[lengths(args) != 0]
 4 | 
 5 |   # create unevaluated model call
 6 |   model_call <- rlang::call2(.fn = .fn, !!!args, .ns = "h2o")
 7 | 
 8 |   # add others if not NULL
 9 |   if (length(others) > 0) {
10 |     model_call <- rlang::call_standardise(model_call)
11 |     model_call <- rlang::call_modify(model_call, !!!others)
12 |   }
13 | 
14 |   rlang::eval_tidy(model_call)
15 | }
16 | 


--------------------------------------------------------------------------------
/R/metrics.R:
--------------------------------------------------------------------------------
  1 | #' Mean squared error
  2 | #'
  3 | #' Calculate the mean squared error. This metric is in squared units of the original data.
  4 | #'
  5 | #' @param data A `data.frame` containing the `truth` and `estimate` columns.
  6 | #' @param truth The column identifier for the true class results (that is a
  7 | #'   factor). This should be an unquoted column name although this argument is
  8 | #'   passed by expression and supports quasiquotation (you can unquote column
  9 | #'   names). For _vec() functions, a factor vector.
 10 | #' @param estimate The column identifier for the predicted class results (that
 11 | #'   is also factor). As with truth this can be specified different ways but the
 12 | #'   primary method is to use an unquoted variable name. For _vec() functions, a
 13 | #'   factor vector.
 14 | #' @param na_rm A logical value indicating whether NA values should be stripped
 15 | #'   before the computation proceeds.
 16 | #' @param ... Not currently used.
 17 | #'
 18 | #' @return A `tibble` with columns `.metric`, `.estimator`, and `.estimate` and 1 row of values.
 19 | #' @export
 20 | mse_vec <- function(truth, estimate, na_rm = TRUE, ...) {
 21 |   mse_impl <- function(truth, estimate) {
 22 |     mean((estimate - truth)^2)
 23 |   }
 24 | 
 25 |   yardstick::metric_vec_template(
 26 |     metric_impl = mse_impl,
 27 |     truth = truth,
 28 |     estimate = estimate,
 29 |     na_rm = na_rm,
 30 |     cls = "numeric",
 31 |     ...
 32 |   )
 33 | }
 34 | 
 35 | #' @export
 36 | #' @rdname mse_vec
 37 | mse <- function(data, ...) {
 38 |   UseMethod("mse")
 39 | }
 40 | 
 41 | class(mse) <- c("numeric_metric", "function")
 42 | attr(mse, "direction") <- "minimize"
 43 | 
 44 | 
 45 | #' @export
 46 | #' @rdname mse_vec
 47 | mse.data.frame <- function(data, truth, estimate, na_rm = TRUE, ...) {
 48 |   yardstick::metric_summarizer(
 49 |     metric_nm = "mse",
 50 |     metric_fn = mse_vec,
 51 |     data = data,
 52 |     truth = !!rlang::enquo(truth),
 53 |     estimate = !!rlang::enquo(estimate),
 54 |     na_rm = na_rm,
 55 |     ...
 56 |   )
 57 | }
 58 | 
 59 | #' @importFrom yardstick rsq rmse accuracy mn_log_loss pr_auc roc_auc
 60 | convert_h2o_metrics <- function(metrics) {
 61 |   allowed_metrics <- c(
 62 |     # regression
 63 |     "yardstick::rsq",
 64 |     "yardstick::rmse",
 65 |     "h2oparsnip::mse",
 66 | 
 67 |     # classification
 68 |     "yardstick::accuracy",
 69 |     "yardstick::mn_log_loss",
 70 |     "yardstick::pr_auc",
 71 |     "yardstick::roc_auc"
 72 |   )
 73 | 
 74 |   allowed_metrics <-
 75 |     c(allowed_metrics, gsub("yardstick::", "", allowed_metrics))
 76 |   allowed_metrics <-
 77 |     c(allowed_metrics, gsub("h2oparsnip::", "", allowed_metrics))
 78 | 
 79 |   if (any(!names(attributes(metrics)$metrics) %in% allowed_metrics)) {
 80 |     msg <- "`metrics` argument must contain a `yardstick::metric_set` with one or
 81 |       several of the following metrics:"
 82 |     rlang::abort(paste(msg, paste(allowed_metrics, collapse = ", ")))
 83 |   }
 84 | 
 85 |   metric_names <- names(attributes(metrics)$metric)
 86 |   metric_names <- gsub("yardstick::", "", metric_names)
 87 |   metric_names <- gsub("h2oparsnip::", "", metric_names)
 88 | 
 89 |   convert_metric <- function(yardstick_name) {
 90 |     switch(yardstick_name,
 91 |       # regression
 92 |       rsq = "r2",
 93 |       rmse = "rmse",
 94 |       mse = "mse",
 95 | 
 96 |       # classification
 97 |       accuracy = "accuracy",
 98 |       mn_log_loss = "logloss",
 99 |       roc_auc = "auc",
100 |       pr_auc = "aucpr"
101 |     )
102 |   }
103 | 
104 |   sapply(metric_names, convert_metric)
105 | }
106 | 


--------------------------------------------------------------------------------
/R/mlp.R:
--------------------------------------------------------------------------------
  1 | add_mlp_h2o <- function() {
  2 |   parsnip::set_model_engine("mlp", "classification", "h2o")
  3 |   parsnip::set_model_engine("mlp", "regression", "h2o")
  4 |   parsnip::set_dependency("mlp", "h2o", "h2o")
  5 | 
  6 |   parsnip::set_model_arg(
  7 |     model = "mlp",
  8 |     eng = "h2o",
  9 |     parsnip = "cost",
 10 |     original = "l2",
 11 |     func = list(pkg = "dials", fun = "cost"),
 12 |     has_submodel = FALSE
 13 |   )
 14 |   parsnip::set_model_arg(
 15 |     model = "mlp",
 16 |     eng = "h2o",
 17 |     parsnip = "dropout",
 18 |     original = "hidden_dropout_ratios",
 19 |     func = list(pkg = "dials", fun = "dropout"),
 20 |     has_submodel = FALSE
 21 |   )
 22 |   parsnip::set_model_arg(
 23 |     model = "mlp",
 24 |     eng = "h2o",
 25 |     parsnip = "hidden_units",
 26 |     original = "hidden",
 27 |     func = list(pkg = "dials", fun = "hidden_units"),
 28 |     has_submodel = FALSE
 29 |   )
 30 |   parsnip::set_model_arg(
 31 |     model = "mlp",
 32 |     eng = "h2o",
 33 |     parsnip = "epochs",
 34 |     original = "epochs",
 35 |     func = list(pkg = "dials", fun = "epochs"),
 36 |     has_submodel = FALSE
 37 |   )
 38 |   parsnip::set_model_arg(
 39 |     model = "mlp",
 40 |     eng = "h2o",
 41 |     parsnip = "activation",
 42 |     original = "activation",
 43 |     func = list(pkg = "dials", fun = "activation"),
 44 |     has_submodel = FALSE
 45 |   )
 46 |   parsnip::set_fit(
 47 |     model = "mlp",
 48 |     eng = "h2o",
 49 |     mode = "regression",
 50 |     value = list(
 51 |       interface = "formula",
 52 |       protect = c("formula", "x", "y", "training_frame"),
 53 |       func = c(fun = "h2o_mlp_train"),
 54 |       defaults = list()
 55 |     )
 56 |   )
 57 |   parsnip::set_fit(
 58 |     model = "mlp",
 59 |     eng = "h2o",
 60 |     mode = "classification",
 61 |     value = list(
 62 |       interface = "formula",
 63 |       protect = c("formula", "x", "y", "training_frame"),
 64 |       func = c(fun = "h2o_mlp_train"),
 65 |       defaults = list()
 66 |     )
 67 |   )
 68 |   parsnip::set_encoding(
 69 |     model = "mlp",
 70 |     eng = "h2o",
 71 |     mode = "classification",
 72 |     options = list(
 73 |       predictor_indicators = "none",
 74 |       compute_intercept = FALSE,
 75 |       remove_intercept = FALSE,
 76 |       allow_sparse_x = FALSE
 77 |     )
 78 |   )
 79 |   parsnip::set_encoding(
 80 |     model = "mlp",
 81 |     eng = "h2o",
 82 |     mode = "regression",
 83 |     options = list(
 84 |       predictor_indicators = "none",
 85 |       compute_intercept = FALSE,
 86 |       remove_intercept = FALSE,
 87 |       allow_sparse_x = FALSE
 88 |     )
 89 |   )
 90 | 
 91 |   # regression predict
 92 |   parsnip::set_pred(
 93 |     model = "mlp",
 94 |     eng = "h2o",
 95 |     mode = "regression",
 96 |     type = "numeric",
 97 |     value = list(
 98 |       pre = function(x, object) h2o::as.h2o(x),
 99 |       post = function(x, object) as.data.frame(x)$predict,
100 |       func = c(pkg = "h2o", fun = "h2o.predict"),
101 |       args = list(
102 |         object = quote(object$fit),
103 |         newdata = quote(new_data)
104 |       )
105 |     )
106 |   )
107 |   parsnip::set_pred(
108 |     model = "mlp",
109 |     eng = "h2o",
110 |     mode = "regression",
111 |     type = "raw",
112 |     value = list(
113 |       pre = function(x, object) h2o::as.h2o(x),
114 |       post = function(x, object) as.data.frame(x),
115 |       func = c(pkg = "h2o", fun = "h2o.predict"),
116 |       args = list(
117 |         object = quote(object$fit),
118 |         newdata = quote(new_data)
119 |       )
120 |     )
121 |   )
122 | 
123 |   # classification predict
124 |   parsnip::set_pred(
125 |     model = "mlp",
126 |     eng = "h2o",
127 |     mode = "classification",
128 |     type = "class",
129 |     value = list(
130 |       pre = function(x, object) h2o::as.h2o(x),
131 |       post = function(x, object) as.data.frame(x)$predict,
132 |       func = c(pkg = "h2o", fun = "h2o.predict"),
133 |       args = list(
134 |         object = quote(object$fit),
135 |         newdata = quote(new_data)
136 |       )
137 |     )
138 |   )
139 |   parsnip::set_pred(
140 |     model = "mlp",
141 |     eng = "h2o",
142 |     mode = "classification",
143 |     type = "prob",
144 |     value = list(
145 |       pre = function(x, object) h2o::as.h2o(x),
146 |       post = function(x, object) as.data.frame(x[, 2:ncol(x)]),
147 |       func = c(pkg = "h2o", fun = "h2o.predict"),
148 |       args = list(
149 |         object = quote(object$fit),
150 |         newdata = quote(new_data)
151 |       )
152 |     )
153 |   )
154 |   parsnip::set_pred(
155 |     model = "mlp",
156 |     eng = "h2o",
157 |     mode = "classification",
158 |     type = "raw",
159 |     value = list(
160 |       pre = function(x, object) h2o::as.h2o(x),
161 |       post = function(x, object) as.data.frame(x),
162 |       func = c(pkg = "h2o", fun = "h2o.predict"),
163 |       args = list(
164 |         object = quote(object$fit),
165 |         newdata = quote(new_data)
166 |       )
167 |     )
168 |   )
169 | }
170 | 
171 | #' Wrapper for training a h2o.deeplearning model as part of a parsnip `mlp`
172 | #' h2o engine
173 | #'
174 | #' @param formula formula
175 | #' @param data data.frame of training data
176 | #' @param l2 numeric, l2 regulation parameter, default = 0
177 | #' @param hidden_dropout_ratios dropout ratio for a single hidden layer (default
178 | #'   = 0)
179 | #' @param hidden integer, number of neurons in the hidden layer (default = c(200, 200))
180 | #' @param epochs integer, number of epochs (default = 10)
181 | #' @param activation character, activation function. Must be one of: "Tanh",
182 | #'   "TanhWithDropout", "Rectifier", "RectifierWithDropout", "Maxout",
183 | #'   "MaxoutWithDropout". Defaults to "Rectifier. If `hidden_dropout_ratios` > 0
184 | #'   then the equivalent activation function with dropout is used.
185 | #' @param stopping_rounds An integer specifying the number of training
186 | #'   iterations without improvement before stopping. If `stopping_rounds = 0`
187 | #'   (the default) then early stopping is disabled.  If `validation` is used,
188 | #'   performance is base on the validation set; otherwise the training set is
189 | #'   used.
190 | #' @param validation A positive number. If on `[0, 1)` the value, `validation`
191 | #' is a random proportion of data in `x` and `y` that are used for performance
192 | #' assessment and potential early stopping. If 1 or greater, it is the _number_
193 | #' of training set samples use for these purposes.
194 | #' @param ... other arguments not currently used
195 | #'
196 | #' @return evaluated h2o model call
197 | #' @export
198 | h2o_mlp_train <-
199 |   function(formula,
200 |            data,
201 |            l2 = 0,
202 |            hidden_dropout_ratios = 0,
203 |            hidden = 100,
204 |            epochs = 10,
205 |            activation = "Rectifier",
206 |            stopping_rounds = 0,
207 |            validation = 0,
208 |            ...) {
209 |     others <- list(...)
210 | 
211 |     # get term names and convert to h2o
212 |     X <- attr(stats::terms(formula, data = data), "term.labels")
213 |     y <- all.vars(formula)[1]
214 | 
215 |     # early stopping
216 |     if (validation > 1) {
217 |       validation <- validation / nrow(data)
218 |     }
219 | 
220 |     if (stopping_rounds > 0 & validation > 0) {
221 |       n <- nrow(data)
222 |       trn_index <- sample(1:n, size = floor(n * validation) + 1)
223 |       valid <- data[-trn_index, ]
224 |       data <- data[trn_index, ]
225 |     } else {
226 |       valid <- NULL
227 |     }
228 | 
229 |     # convert to H2OFrame (although parsnip doesn't support H2OFrames right now)
230 |     if (!inherits(data, "H2OFrame")) {
231 |       data <- h2o::as.h2o(data)
232 |     }
233 | 
234 |     if (!is.null(valid)) {
235 |       valid <- h2o::as.h2o(valid)
236 |     }
237 | 
238 |     # remap dials::values_activation to permissible h2o activation values
239 |     if (activation %in% c("linear", "elu", "softmax")) {
240 |       stop(
241 |         paste(
242 |           activation,
243 |           "activation function is not available when using the h2o engine."
244 |         )
245 |       )
246 |     }
247 | 
248 |     activation <- switch(activation,
249 |       relu = "Rectifier",
250 |       tanh = "Tanh",
251 |       maxout = "Maxout",
252 |       activation
253 |     )
254 | 
255 |     if (activation == "Rectifier" & hidden_dropout_ratios > 0) {
256 |       activation <- "RectifierWithDropout"
257 |     } else if (activation == "Tanh" & hidden_dropout_ratios > 0) {
258 |       activation <- "TanhWithDropout"
259 |     } else if (activation == "Maxout" & hidden_dropout_ratios > 0) {
260 |       activation <- "MaxoutWithDropout"
261 |     }
262 | 
263 |     if (hidden_dropout_ratios == 0) {
264 |       hidden_dropout_ratios <- NULL
265 |     }
266 | 
267 |     # define arguments
268 |     args <- list(
269 |       x = X,
270 |       y = y,
271 |       training_frame = data,
272 |       validation_frame = valid,
273 |       l2 = l2,
274 |       hidden_dropout_ratios = hidden_dropout_ratios,
275 |       hidden = hidden,
276 |       epochs = epochs,
277 |       activation = activation,
278 |       stopping_rounds = stopping_rounds
279 |     )
280 | 
281 |     make_h2o_call("h2o.deeplearning", args, others)
282 |   }
283 | 


--------------------------------------------------------------------------------
/R/multi_predicts.R:
--------------------------------------------------------------------------------
  1 | #' @export
  2 | multi_predict._H2OMultinomialModel <-
  3 |   function(object, new_data, type = "class", ...) {
  4 |     args <- list(...)
  5 | 
  6 |     if (any(names(rlang::enquos(...)) == "newdata")) {
  7 |       rlang::abort("Did you mean to use `new_data` instead of `newdata`?")
  8 |     }
  9 | 
 10 |     res <- switch(object$fit@algorithm,
 11 |       gbm = gbm_multi_predict(object, new_data, type, args$trees),
 12 |       deeplearning = mlp_multi_predict(object, new_data, type, args$epochs)
 13 |     )
 14 | 
 15 |     res
 16 |   }
 17 | 
 18 | #' @export
 19 | multi_predict._H2ORegressionModel <-
 20 |   function(object, new_data, type = "numeric", ...) {
 21 |     args <- list(...)
 22 | 
 23 |     if (any(names(rlang::enquos(...)) == "newdata")) {
 24 |       rlang::abort("Did you mean to use `new_data` instead of `newdata`?")
 25 |     }
 26 | 
 27 |     res <- switch(object$fit@algorithm,
 28 |       gbm = gbm_multi_predict(object, new_data, type, args$trees),
 29 |       deeplearning = mlp_multi_predict(object, new_data, type, args$epochs)
 30 |     )
 31 | 
 32 |     res
 33 |   }
 34 | 
 35 | 
 36 | gbm_multi_predict <- function(object, new_data, type, trees) {
 37 |   trees <- sort(trees)
 38 | 
 39 |   preds <- h2o::staged_predict_proba.H2OModel(
 40 |     object = object$fit,
 41 |     newdata = h2o::as.h2o(new_data)
 42 |   )
 43 | 
 44 |   preds <- as.data.frame(preds)
 45 | 
 46 |   if (type %in% c("class", "prob")) {
 47 |     n_classes <- length(object$lvl)
 48 | 
 49 |     res <- purrr::map(trees, function(tree) {
 50 |       pred_class_names <- paste0("C", seq_len(n_classes))
 51 |       tree_names <- paste0("T", tree)
 52 |       x <- preds[, paste(tree_names, pred_class_names, sep = ".")]
 53 |       colnames(x) <- object$lvl
 54 |       x <- as.data.frame(x)
 55 |       x$predict <- object$lvl[apply(x, 1, which.max)]
 56 |       x$trees <- tree
 57 |       x
 58 |     })
 59 |   } else if (type == "numeric") {
 60 |     res <- purrr::map(trees, function(tree) {
 61 |       tibble(trees = tree, .pred = preds[, tree])
 62 |     })
 63 |   }
 64 | 
 65 |   # prepare predictions in parsnip format
 66 |   if (type == "class") {
 67 |     res <- purrr::map(
 68 |       res,
 69 |       ~ dplyr::select(.x, trees, predict) %>%
 70 |         dplyr::rename(.pred_class = predict) %>%
 71 |         dplyr::mutate(
 72 |           .pred_class = factor(.pred_class, levels = object$lvl),
 73 |           .row = 1:max(dplyr::row_number())
 74 |         )
 75 |     )
 76 |   } else if (type == "numeric") {
 77 |     res <- purrr::map(
 78 |       res,
 79 |       ~ dplyr::mutate(.x, .row = 1:max(dplyr::row_number()))
 80 |     )
 81 |   } else if (type == "prob") {
 82 |     new_names <- object$lvl
 83 |     names(new_names) <- paste(".pred", object$lvl, sep = "_")
 84 | 
 85 |     res <- purrr::map(
 86 |       res,
 87 |       ~ dplyr::select(.x, trees, !!!object$lvl) %>%
 88 |         dplyr::rename(!!new_names)
 89 |     )
 90 | 
 91 |     res <- res %>%
 92 |       dplyr::mutate(.row = 1:max(dplyr::row_number()))
 93 |   }
 94 | 
 95 |   res <- dplyr::bind_rows(res)
 96 |   res <- dplyr::arrange(res, .row, trees)
 97 |   res <- split(res[, -ncol(res)], res$.row)
 98 | 
 99 |   tibble(.pred = res)
100 | }
101 | 
102 | 
103 | mlp_multi_predict <- function(object, new_data, type, epochs) {
104 |   epochs <- sort(epochs)
105 |   new_data <- h2o::as.h2o(new_data)
106 | 
107 |   preds <- purrr::map(epochs, function(epoch) {
108 |     model <- h2o::h2o.deeplearning(
109 |       x = object$fit@parameters$x,
110 |       y = object$fit@parameters$y,
111 |       training_frame = object$fit@parameters$training_frame,
112 |       checkpoint = object$fit@model_id,
113 |       epochs = epoch,
114 |       hidden = object$fit@parameters$hidden,
115 |       l2 = object$fit@parameters$l2,
116 |       hidden_dropout_ratios = object$fit@parameters$hidden_dropout_ratios,
117 |       activation = object$fit@parameters$activation
118 |     )
119 | 
120 |     predict(model, new_data) %>%
121 |       as.data.frame()
122 |   })
123 | 
124 |   # prepare predictions in parsnip format
125 |   if (type == "class") {
126 |     res <- purrr::map2(
127 |       preds,
128 |       epochs,
129 |       ~ dplyr::select(.x, predict) %>%
130 |         dplyr::rename(.pred_class = predict) %>%
131 |         dplyr::mutate(
132 |           epochs = .y,
133 |           .pred_class = factor(.pred_class, levels = object$lvl),
134 |           .row = 1:max(dplyr::row_number())
135 |         )
136 |     )
137 |   } else if (type == "prob") {
138 |     new_names <- object$lvl
139 |     names(new_names) <- paste(".pred", object$lvl, sep = "_")
140 | 
141 |     res <- purrr::map2(
142 |       preds,
143 |       epochs,
144 |       ~ dplyr::select(.x, !!!object$lvl) %>%
145 |         dplyr::rename(!!!new_names) %>%
146 |         dplyr::mutate(epochs = .y, .row = 1:max(dplyr::row_number()))
147 |     )
148 |   } else if (type == "numeric") {
149 |     res <- purrr::map2(preds, epochs, function(x, epoch) {
150 |       tibble(epochs = epoch, .pred = x$predict)
151 |     })
152 | 
153 |     res <- purrr::map(res, ~ dplyr::mutate(.x, .row = 1:max(dplyr::row_number())))
154 |   }
155 | 
156 |   res <- dplyr::bind_rows(res)
157 |   res <- dplyr::arrange(res, .row, epochs)
158 |   res <- split(res[, -ncol(res)], res$.row)
159 | 
160 |   tibble(.pred = res)
161 | }
162 | 


--------------------------------------------------------------------------------
/R/multinomial_reg.R:
--------------------------------------------------------------------------------
 1 | add_multinom_reg_h2o <- function() {
 2 |   parsnip::set_model_engine("multinom_reg", "classification", "h2o")
 3 |   parsnip::set_dependency("multinom_reg", "h2o", "h2o")
 4 | 
 5 |   parsnip::set_model_arg(
 6 |     model = "multinom_reg",
 7 |     eng = "h2o",
 8 |     parsnip = "mixture",
 9 |     original = "alpha",
10 |     func = list(pkg = "dials", fun = "mixture"),
11 |     has_submodel = FALSE
12 |   )
13 |   parsnip::set_model_arg(
14 |     model = "multinom_reg",
15 |     eng = "h2o",
16 |     parsnip = "penalty",
17 |     original = "lambda",
18 |     func = list(pkg = "dials", fun = "penalty"),
19 |     has_submodel = FALSE
20 |   )
21 |   parsnip::set_fit(
22 |     model = "multinom_reg",
23 |     eng = "h2o",
24 |     mode = "classification",
25 |     value = list(
26 |       interface = "formula",
27 |       protect = c("formula", "x", "y", "training_frame", "family"),
28 |       func = c(fun = "h2o_glm_train"),
29 |       defaults = list(
30 |         family = "multinomial"
31 |       )
32 |     )
33 |   )
34 |   parsnip::set_encoding(
35 |     model = "multinom_reg",
36 |     eng = "h2o",
37 |     mode = "classification",
38 |     options = list(
39 |       predictor_indicators = "none",
40 |       compute_intercept = FALSE,
41 |       remove_intercept = FALSE,
42 |       allow_sparse_x = FALSE
43 |     )
44 |   )
45 | 
46 |   # classification predict
47 |   parsnip::set_pred(
48 |     model = "multinom_reg",
49 |     eng = "h2o",
50 |     mode = "classification",
51 |     type = "class",
52 |     value = list(
53 |       pre = function(x, object) h2o::as.h2o(x),
54 |       post = function(x, object) as.data.frame(x)$predict,
55 |       func = c(pkg = "h2o", fun = "h2o.predict"),
56 |       args = list(
57 |         object = quote(object$fit),
58 |         newdata = quote(new_data)
59 |       )
60 |     )
61 |   )
62 |   parsnip::set_pred(
63 |     model = "multinom_reg",
64 |     eng = "h2o",
65 |     mode = "classification",
66 |     type = "prob",
67 |     value = list(
68 |       pre = function(x, object) h2o::as.h2o(x),
69 |       post = function(x, object) as.data.frame(x[, 2:ncol(x)]),
70 |       func = c(pkg = "h2o", fun = "h2o.predict"),
71 |       args = list(
72 |         object = quote(object$fit),
73 |         newdata = quote(new_data)
74 |       )
75 |     )
76 |   )
77 |   parsnip::set_pred(
78 |     model = "multinom_reg",
79 |     eng = "h2o",
80 |     mode = "classification",
81 |     type = "raw",
82 |     value = list(
83 |       pre = function(x, object) h2o::as.h2o(x),
84 |       post = function(x, object) as.data.frame(x),
85 |       func = c(pkg = "h2o", fun = "h2o.predict"),
86 |       args = list(
87 |         object = quote(object$fit),
88 |         newdata = quote(new_data)
89 |       )
90 |     )
91 |   )
92 | }
93 | 


--------------------------------------------------------------------------------
/R/naive_Bayes.R:
--------------------------------------------------------------------------------
  1 | add_naive_Bayes_h2o <- function() {
  2 |   parsnip::set_model_engine("naive_Bayes", "classification", "h2o")
  3 |   parsnip::set_dependency("naive_Bayes", "h2o", "h2o")
  4 | 
  5 |   parsnip::set_model_arg(
  6 |     model = "naive_Bayes",
  7 |     eng = "h2o",
  8 |     parsnip = "Laplace",
  9 |     original = "laplace",
 10 |     func = list(pkg = "dials", fun = "Laplace"),
 11 |     has_submodel = FALSE
 12 |   )
 13 | 
 14 |   # fit
 15 |   parsnip::set_fit(
 16 |     model = "naive_Bayes",
 17 |     eng = "h2o",
 18 |     mode = "classification",
 19 |     value = list(
 20 |       interface = "formula",
 21 |       protect = c("formula", "x", "y", "training_frame"),
 22 |       func = c(fun = "h2o_naiveBayes_train"),
 23 |       defaults = list()
 24 |     )
 25 |   )
 26 |   parsnip::set_encoding(
 27 |     model = "naive_Bayes",
 28 |     eng = "h2o",
 29 |     mode = "classification",
 30 |     options = list(
 31 |       predictor_indicators = "none",
 32 |       compute_intercept = FALSE,
 33 |       remove_intercept = FALSE,
 34 |       allow_sparse_x = FALSE
 35 |     )
 36 |   )
 37 | 
 38 |   # classification predict
 39 |   parsnip::set_pred(
 40 |     model = "naive_Bayes",
 41 |     eng = "h2o",
 42 |     mode = "classification",
 43 |     type = "class",
 44 |     value = list(
 45 |       pre = function(x, object) h2o::as.h2o(x),
 46 |       post = function(x, object) as.data.frame(x)$predict,
 47 |       func = c(pkg = "h2o", fun = "h2o.predict"),
 48 |       args = list(
 49 |         object = quote(object$fit),
 50 |         newdata = quote(new_data)
 51 |       )
 52 |     )
 53 |   )
 54 |   parsnip::set_pred(
 55 |     model = "naive_Bayes",
 56 |     eng = "h2o",
 57 |     mode = "classification",
 58 |     type = "prob",
 59 |     value = list(
 60 |       pre = function(x, object) h2o::as.h2o(x),
 61 |       post = function(x, object) as.data.frame(x[, 2:ncol(x)]),
 62 |       func = c(pkg = "h2o", fun = "h2o.predict"),
 63 |       args = list(
 64 |         object = quote(object$fit),
 65 |         newdata = quote(new_data)
 66 |       )
 67 |     )
 68 |   )
 69 |   parsnip::set_pred(
 70 |     model = "naive_Bayes",
 71 |     eng = "h2o",
 72 |     mode = "classification",
 73 |     type = "raw",
 74 |     value = list(
 75 |       pre = function(x, object) h2o::as.h2o(x),
 76 |       post = function(x, object) as.data.frame(x),
 77 |       func = c(pkg = "h2o", fun = "h2o.predict"),
 78 |       args = list(
 79 |         object = quote(object$fit),
 80 |         newdata = quote(new_data)
 81 |       )
 82 |     )
 83 |   )
 84 | }
 85 | 
 86 | #' Wrapper for training a h2o.naiveBayes model as part of a discrim `naive_Bayes`
 87 | #' h2o engine
 88 | #'
 89 | #' @param formula formula
 90 | #' @param data data.frame of training data
 91 | #' @param laplace numeric, the Laplace smoothing parameter, must be >= 0.
 92 | #' @param ... other arguments passed to the h2o engine.
 93 | #'
 94 | #' @return a fitted h2o model.
 95 | #' @export
 96 | h2o_naiveBayes_train <-
 97 |   function(formula,
 98 |            data,
 99 |            laplace = 0,
100 |            ...) {
101 |     others <- list(...)
102 | 
103 |     # get term names and convert to h2o
104 |     X <- attr(stats::terms(formula, data = data), "term.labels")
105 |     y <- all.vars(formula)[1]
106 | 
107 |     # convert to H2OFrame (although parsnip doesn't support H2OFrames right now)
108 |     if (!inherits(data, "H2OFrame")) {
109 |       data <- h2o::as.h2o(data)
110 |     }
111 | 
112 |     # check arguments
113 |     if (laplace < 0) {
114 |       laplace <- 0
115 |     }
116 | 
117 |     # define arguments
118 |     args <- list(
119 |       x = X,
120 |       y = y,
121 |       training_frame = data,
122 |       laplace = laplace
123 |     )
124 | 
125 |     res <- make_h2o_call("h2o.naiveBayes", args, others)
126 | 
127 |     res
128 |   }
129 | 


--------------------------------------------------------------------------------
/R/rand_forest.R:
--------------------------------------------------------------------------------
  1 | add_rand_forest_h2o <- function() {
  2 |   parsnip::set_model_engine("rand_forest", "classification", "h2o")
  3 |   parsnip::set_model_engine("rand_forest", "regression", "h2o")
  4 |   parsnip::set_dependency("rand_forest", "h2o", "h2o")
  5 | 
  6 |   parsnip::set_model_arg(
  7 |     model = "rand_forest",
  8 |     eng = "h2o",
  9 |     parsnip = "trees",
 10 |     original = "ntrees",
 11 |     func = list(pkg = "dials", fun = "trees"),
 12 |     has_submodel = FALSE
 13 |   )
 14 |   parsnip::set_model_arg(
 15 |     model = "rand_forest",
 16 |     eng = "h2o",
 17 |     parsnip = "min_n",
 18 |     original = "min_rows",
 19 |     func = list(pkg = "dials", fun = "min_n"),
 20 |     has_submodel = FALSE
 21 |   )
 22 |   parsnip::set_model_arg(
 23 |     model = "rand_forest",
 24 |     eng = "h2o",
 25 |     parsnip = "mtry",
 26 |     original = "mtries",
 27 |     func = list(pkg = "dials", fun = "mtry"),
 28 |     has_submodel = FALSE
 29 |   )
 30 |   parsnip::set_fit(
 31 |     model = "rand_forest",
 32 |     eng = "h2o",
 33 |     mode = "regression",
 34 |     value = list(
 35 |       interface = "formula",
 36 |       protect = c("formula", "x", "y", "training_frame"),
 37 |       func = c(fun = "h2o_rf_train"),
 38 |       defaults = list()
 39 |     )
 40 |   )
 41 |   parsnip::set_fit(
 42 |     model = "rand_forest",
 43 |     eng = "h2o",
 44 |     mode = "classification",
 45 |     value = list(
 46 |       interface = "formula",
 47 |       protect = c("formula", "x", "y", "training_frame"),
 48 |       func = c(fun = "h2o_rf_train"),
 49 |       defaults = list()
 50 |     )
 51 |   )
 52 |   parsnip::set_encoding(
 53 |     model = "rand_forest",
 54 |     eng = "h2o",
 55 |     mode = "classification",
 56 |     options = list(
 57 |       predictor_indicators = "none",
 58 |       compute_intercept = FALSE,
 59 |       remove_intercept = FALSE,
 60 |       allow_sparse_x = FALSE
 61 |     )
 62 |   )
 63 |   parsnip::set_encoding(
 64 |     model = "rand_forest",
 65 |     eng = "h2o",
 66 |     mode = "regression",
 67 |     options = list(
 68 |       predictor_indicators = "none",
 69 |       compute_intercept = FALSE,
 70 |       remove_intercept = FALSE,
 71 |       allow_sparse_x = FALSE
 72 |     )
 73 |   )
 74 | 
 75 |   # regression predict
 76 |   parsnip::set_pred(
 77 |     model = "rand_forest",
 78 |     eng = "h2o",
 79 |     mode = "regression",
 80 |     type = "numeric",
 81 |     value = list(
 82 |       pre = function(x, object) h2o::as.h2o(x),
 83 |       post = function(x, object) as.data.frame(x)$predict,
 84 |       func = c(pkg = "h2o", fun = "h2o.predict"),
 85 |       args = list(
 86 |         object = quote(object$fit),
 87 |         newdata = quote(new_data)
 88 |       )
 89 |     )
 90 |   )
 91 |   parsnip::set_pred(
 92 |     model = "rand_forest",
 93 |     eng = "h2o",
 94 |     mode = "regression",
 95 |     type = "raw",
 96 |     value = list(
 97 |       pre = function(x, object) h2o::as.h2o(x),
 98 |       post = function(x, object) as.data.frame(x),
 99 |       func = c(pkg = "h2o", fun = "h2o.predict"),
100 |       args = list(
101 |         object = quote(object$fit),
102 |         newdata = quote(new_data)
103 |       )
104 |     )
105 |   )
106 | 
107 |   # classification predict
108 |   parsnip::set_pred(
109 |     model = "rand_forest",
110 |     eng = "h2o",
111 |     mode = "classification",
112 |     type = "class",
113 |     value = list(
114 |       pre = function(x, object) h2o::as.h2o(x),
115 |       post = function(x, object) as.data.frame(x)$predict,
116 |       func = c(pkg = "h2o", fun = "h2o.predict"),
117 |       args = list(
118 |         object = quote(object$fit),
119 |         newdata = quote(new_data)
120 |       )
121 |     )
122 |   )
123 |   parsnip::set_pred(
124 |     model = "rand_forest",
125 |     eng = "h2o",
126 |     mode = "classification",
127 |     type = "prob",
128 |     value = list(
129 |       pre = function(x, object) h2o::as.h2o(x),
130 |       post = function(x, object) as.data.frame(x[, 2:ncol(x)]),
131 |       func = c(pkg = "h2o", fun = "h2o.predict"),
132 |       args = list(
133 |         object = quote(object$fit),
134 |         newdata = quote(new_data)
135 |       )
136 |     )
137 |   )
138 |   parsnip::set_pred(
139 |     model = "rand_forest",
140 |     eng = "h2o",
141 |     mode = "classification",
142 |     type = "raw",
143 |     value = list(
144 |       pre = function(x, object) h2o::as.h2o(x),
145 |       post = function(x, object) as.data.frame(x),
146 |       func = c(pkg = "h2o", fun = "h2o.predict"),
147 |       args = list(
148 |         object = quote(object$fit),
149 |         newdata = quote(new_data)
150 |       )
151 |     )
152 |   )
153 | }
154 | 
155 | #' Wrapper for training a h2o.randomForest model as part of a parsnip
156 | #' `rand_forest` h2o engine
157 | #'
158 | #' @param formula formula
159 | #' @param data data.frame of training data
160 | #' @param ntrees integer, the number of trees to build (default = 50)
161 | #' @param min_rows integer, the minimum number of observations for a leaf
162 | #'   (default = 10)
163 | #' @param mtries integer, the number of columns to randomly select at each
164 | #' level. Default of -1 is sqrt(p) for classification and (p/3) for regression.
165 | #' @param stopping_rounds An integer specifying the number of training
166 | #'   iterations without improvement before stopping. If `stopping_rounds = 0`
167 | #'   (the default) then early stopping is disabled.  If `validation` is used,
168 | #'   performance is base on the validation set; otherwise the training set is
169 | #'   used.
170 | #' @param validation A positive number. If on `[0, 1)` the value, `validation`
171 | #' is a random proportion of data in `x` and `y` that are used for performance
172 | #' assessment and potential early stopping. If 1 or greater, it is the _number_
173 | #' of training set samples use for these purposes.
174 | #' @param ... other arguments not currently used
175 | #'
176 | #' @return evaluated h2o model call
177 | #' @export
178 | h2o_rf_train <-
179 |   function(formula,
180 |            data,
181 |            ntrees = 50,
182 |            min_rows = 10,
183 |            mtries = -1,
184 |            stopping_rounds = 0,
185 |            validation = 0,
186 |            ...) {
187 |     others <- list(...)
188 | 
189 |     # get term names and convert to h2o
190 |     X <- attr(stats::terms(formula, data = data), "term.labels")
191 |     y <- all.vars(formula)[1]
192 | 
193 |     # early stopping
194 |     if (validation > 1) {
195 |       validation <- validation / nrow(data)
196 |     }
197 | 
198 |     if (stopping_rounds > 0 & validation > 0) {
199 |       n <- nrow(data)
200 |       trn_index <- sample(1:n, size = floor(n * validation) + 1)
201 |       valid <- data[-trn_index, ]
202 |       data <- data[trn_index, ]
203 |     } else {
204 |       valid <- NULL
205 |     }
206 | 
207 |     # convert to H2OFrame (although parsnip doesn't support H2OFrames right now)
208 |     if (!inherits(data, "H2OFrame")) {
209 |       data <- h2o::as.h2o(data)
210 |     }
211 | 
212 |     if (!is.null(valid)) {
213 |       valid <- h2o::as.h2o(valid)
214 |     }
215 | 
216 |     # check for valid mtries
217 |     if (mtries > length(X)) {
218 |       mtries <- length(X)
219 |     }
220 | 
221 |     # define arguments
222 |     args <- list(
223 |       x = X,
224 |       y = y,
225 |       training_frame = data,
226 |       validation_frame = valid,
227 |       ntrees = ntrees,
228 |       min_rows = min_rows,
229 |       mtries = mtries,
230 |       stopping_rounds = stopping_rounds
231 |     )
232 | 
233 |     res <- make_h2o_call("h2o.randomForest", args, others)
234 | 
235 |     res
236 |   }
237 | 


--------------------------------------------------------------------------------
/R/read_h2o.R:
--------------------------------------------------------------------------------
  1 | #' Saves an H2O model to file that is contained within a fitted parsnip model
  2 | #' specification or contained within a workflow
  3 | #'
  4 | #' H2O models cannot be saved using the typical R approaches such as saveRDS
  5 | #' because the actual H2O model is contained within a Java virtual machine. H2O
  6 | #' models need to be saved and restored using the `h2o::h2o.saveModel` and
  7 | #' `h2o::h2o.loadModel` functions. This is inconvenient for using H2O models
  8 | #' contained within parsnip model specifications or workflow objects.
  9 | #'
 10 | #' The `write_h2o` function extracts the H2O model from within a parsnip or
 11 | #' workflow fitted model and saves it to file using the `h2o::h2o.saveModel`
 12 | #' function. To restore a model and insert it back into a previously fitted
 13 | #' model use the `read_h2o` function.
 14 | #'
 15 | #' @param object Either a `workflows::workflow()` object contained a fitted
 16 | #'   model when using the workflows package, or a `model_spec` object from a
 17 | #'   fitted model when using the parsnip package directly.
 18 | #' @param filename A `character` specifying the file path used to save the
 19 | #'   model. H2O models do not require a specific file extension.
 20 | #' @param ... Currently not used.
 21 | #'
 22 | #' @return The file path used to save the model.
 23 | #' @export
 24 | #'
 25 | #' @examples
 26 | #' library(parsnip)
 27 | #' library(h2o)
 28 | #'
 29 | #' # start a h2o session
 30 | #' h2o.init()
 31 | #'
 32 | #' # fit a parsnip model using the h2o engine
 33 | #' clf <- mlp(mode = "classification") %>%
 34 | #'   set_engine("h2o")
 35 | #'
 36 | #' model_fit <- clf %>% fit(Species ~ ., iris)
 37 | #'
 38 | #' # save the parsnip model
 39 | #' saveRDS(model_fit, file.path(tempdir(), "my_model.rds"))
 40 | #'
 41 | #' # save the h2o component of the model
 42 | #' write_h2o(object = model_fit, filename = file.path(tempdir(), "my_h2o_model.mod"))
 43 | #' h2o.shutdown(prompt = FALSE)
 44 | #'
 45 | #' # restore a model
 46 | #' h2o.init()
 47 | #' model_fit <- readRDS(file.path(tempdir(), "my_model.rds"))
 48 | #'
 49 | #' # read and insert the H2O model back into the parsnip object
 50 | #' model_fit <- read_h2o(model_fit, file.path(tempdir(), "my_h2o_model.mod"))
 51 | write_h2o <- function(object, filename, ...) {
 52 |   UseMethod("write_h2o", object)
 53 | }
 54 | 
 55 | 
 56 | #' @export
 57 | #' @rdname write_h2o
 58 | read_h2o <- function(object, filename, ...) {
 59 |   UseMethod("read_h2o", object)
 60 | }
 61 | 
 62 | 
 63 | #' @export
 64 | #' @rdname write_h2o
 65 | write_h2o.workflow <- function(object, filename, ...) {
 66 |   # extract model from workflow
 67 |   parsnip_model_spec <- workflows::pull_workflow_fit(object)
 68 | 
 69 |   # save H2O model which uses the model ID as the filename
 70 |   directory <- dirname(filename)
 71 |   fileout <- h2o::h2o.saveModel(
 72 |     object = parsnip_model_spec$fit,
 73 |     path = directory,
 74 |     force = TRUE
 75 |   )
 76 | 
 77 |   # rename the file
 78 |   file.rename(fileout, filename)
 79 | 
 80 |   filename
 81 | }
 82 | 
 83 | 
 84 | #' @export
 85 | #' @rdname write_h2o
 86 | write_h2o.model_fit <- function(object, filename, ...) {
 87 |   # extract the h2o model
 88 |   fit_obj <- object$fit
 89 | 
 90 |   # save H2O model which uses the model ID as the filename
 91 |   directory <- dirname(filename)
 92 |   fileout <- h2o::h2o.saveModel(
 93 |     object = fit_obj,
 94 |     path = directory,
 95 |     force = TRUE
 96 |   )
 97 | 
 98 |   # rename the file
 99 |   file.rename(fileout, filename)
100 | 
101 |   filename
102 | }
103 | 
104 | 
105 | #' @export
106 | #' @rdname write_h2o
107 | read_h2o.workflow <- function(object, filename, ...) {
108 |   h2o_model <- h2o::h2o.loadModel(filename)
109 |   object$fit$fit$fit <- h2o_model
110 |   object
111 | }
112 | 
113 | 
114 | #' @export
115 | #' @rdname write_h2o
116 | read_h2o.model_fit <- function(object, filename, ...) {
117 |   h2o_model <- h2o::h2o.loadModel(filename)
118 |   object$fit <- h2o_model
119 |   object
120 | }
121 | 


--------------------------------------------------------------------------------
/R/rule_fit.R:
--------------------------------------------------------------------------------
  1 | add_rule_fit_h2o <- function() {
  2 |   parsnip::set_model_engine("rule_fit", "classification", "h2o")
  3 |   parsnip::set_model_engine("rule_fit", "regression", "h2o")
  4 |   parsnip::set_dependency("rule_fit", "h2o", "h2o")
  5 | 
  6 |   parsnip::set_model_arg(
  7 |     model = "rule_fit",
  8 |     eng = "h2o",
  9 |     parsnip = "trees",
 10 |     original = "rule_generation_ntrees",
 11 |     func = list(pkg = "dials", fun = "trees"),
 12 |     has_submodel = FALSE
 13 |   )
 14 |   parsnip::set_model_arg(
 15 |     model = "rule_fit",
 16 |     eng = "h2o",
 17 |     parsnip = "tree_depth",
 18 |     original = "max_rule_length",
 19 |     func = list(pkg = "dials", fun = "tree_depth"),
 20 |     has_submodel = FALSE
 21 |   )
 22 |   parsnip::set_model_arg(
 23 |     model = "rule_fit",
 24 |     eng = "h2o",
 25 |     parsnip = "penalty",
 26 |     original = "lambda",
 27 |     func = list(pkg = "dials", fun = "penalty"),
 28 |     has_submodel = FALSE
 29 |   )
 30 |   parsnip::set_fit(
 31 |     model = "rule_fit",
 32 |     eng = "h2o",
 33 |     mode = "regression",
 34 |     value = list(
 35 |       interface = "formula",
 36 |       protect = c("formula", "x", "y", "training_frame"),
 37 |       func = c(fun = "h2o_rulefit_train"),
 38 |       defaults = list()
 39 |     )
 40 |   )
 41 |   parsnip::set_fit(
 42 |     model = "rule_fit",
 43 |     eng = "h2o",
 44 |     mode = "classification",
 45 |     value = list(
 46 |       interface = "formula",
 47 |       protect = c("formula", "x", "y", "training_frame"),
 48 |       func = c(fun = "h2o_rulefit_train"),
 49 |       defaults = list()
 50 |     )
 51 |   )
 52 |   parsnip::set_encoding(
 53 |     model = "rule_fit",
 54 |     eng = "h2o",
 55 |     mode = "classification",
 56 |     options = list(
 57 |       predictor_indicators = "none",
 58 |       compute_intercept = FALSE,
 59 |       remove_intercept = FALSE,
 60 |       allow_sparse_x = FALSE
 61 |     )
 62 |   )
 63 |   parsnip::set_encoding(
 64 |     model = "rule_fit",
 65 |     eng = "h2o",
 66 |     mode = "regression",
 67 |     options = list(
 68 |       predictor_indicators = "none",
 69 |       compute_intercept = FALSE,
 70 |       remove_intercept = FALSE,
 71 |       allow_sparse_x = FALSE
 72 |     )
 73 |   )
 74 | 
 75 |   # regression predict
 76 |   parsnip::set_pred(
 77 |     model = "rule_fit",
 78 |     eng = "h2o",
 79 |     mode = "regression",
 80 |     type = "numeric",
 81 |     value = list(
 82 |       pre = function(x, object) h2o::as.h2o(x),
 83 |       post = function(x, object) as.data.frame(x)$predict,
 84 |       func = c(pkg = "h2o", fun = "h2o.predict"),
 85 |       args = list(
 86 |         object = quote(object$fit),
 87 |         newdata = quote(new_data)
 88 |       )
 89 |     )
 90 |   )
 91 |   parsnip::set_pred(
 92 |     model = "rule_fit",
 93 |     eng = "h2o",
 94 |     mode = "regression",
 95 |     type = "raw",
 96 |     value = list(
 97 |       pre = function(x, object) h2o::as.h2o(x),
 98 |       post = function(x, object) as.data.frame(x),
 99 |       func = c(pkg = "h2o", fun = "h2o.predict"),
100 |       args = list(
101 |         object = quote(object$fit),
102 |         newdata = quote(new_data)
103 |       )
104 |     )
105 |   )
106 | 
107 |   # classification predict
108 |   parsnip::set_pred(
109 |     model = "rule_fit",
110 |     eng = "h2o",
111 |     mode = "classification",
112 |     type = "class",
113 |     value = list(
114 |       pre = function(x, object) h2o::as.h2o(x),
115 |       post = function(x, object) as.data.frame(x)$predict,
116 |       func = c(pkg = "h2o", fun = "h2o.predict"),
117 |       args = list(
118 |         object = quote(object$fit),
119 |         newdata = quote(new_data)
120 |       )
121 |     )
122 |   )
123 |   parsnip::set_pred(
124 |     model = "rule_fit",
125 |     eng = "h2o",
126 |     mode = "classification",
127 |     type = "prob",
128 |     value = list(
129 |       pre = function(x, object) h2o::as.h2o(x),
130 |       post = function(x, object) as.data.frame(x[, 2:ncol(x)]),
131 |       func = c(pkg = "h2o", fun = "h2o.predict"),
132 |       args = list(
133 |         object = quote(object$fit),
134 |         newdata = quote(new_data)
135 |       )
136 |     )
137 |   )
138 |   parsnip::set_pred(
139 |     model = "rule_fit",
140 |     eng = "h2o",
141 |     mode = "classification",
142 |     type = "raw",
143 |     value = list(
144 |       pre = function(x, object) h2o::as.h2o(x),
145 |       post = function(x, object) as.data.frame(x),
146 |       func = c(pkg = "h2o", fun = "h2o.predict"),
147 |       args = list(
148 |         object = quote(object$fit),
149 |         newdata = quote(new_data)
150 |       )
151 |     )
152 |   )
153 | }
154 | 
155 | #' Wrapper for training a h2o.rulefit model as part of a parsnip
156 | #' `rule_fit` h2o engine
157 | #'
158 | #' @param formula formula
159 | #' @param data data.frame of training data
160 | #' @param rule_generation_ntrees integer, the number of trees to build (default
161 | #'   = 50)
162 | #' @param max_rule_length integer, the maximum tree depth (default = 3).
163 | #' @param lambda Specify the regularization strength for LASSO regressor.
164 | #' @param ... other arguments that are passed to the h2o model
165 | #'
166 | #' @return evaluated h2o model call
167 | #' @export
168 | h2o_rulefit_train <-
169 |   function(formula,
170 |            data,
171 |            rule_generation_ntrees = 50,
172 |            max_rule_length = 3,
173 |            lambda = 0,
174 |            ...) {
175 |     others <- list(...)
176 | 
177 |     # get term names and convert to h2o
178 |     X <- attr(stats::terms(formula, data = data), "term.labels")
179 |     y <- all.vars(formula)[1]
180 | 
181 |     # convert to H2OFrame (although parsnip doesn't support H2OFrames right now)
182 |     if (!inherits(data, "H2OFrame")) {
183 |       data <- h2o::as.h2o(data)
184 |     }
185 | 
186 |     # define arguments
187 |     args <- list(
188 |       x = X,
189 |       y = y,
190 |       training_frame = data,
191 |       rule_generation_ntrees = rule_generation_ntrees,
192 |       max_rule_length = max_rule_length,
193 |       lambda = lambda
194 |     )
195 | 
196 |     res <- make_h2o_call("h2o.rulefit", args, others)
197 | 
198 |     res
199 |   }
200 | 


--------------------------------------------------------------------------------
/R/translate_args.R:
--------------------------------------------------------------------------------
 1 | translate_args <- function(model_name) {
 2 |   envir <- parsnip::get_model_env()
 3 | 
 4 |   args <- tibble::tibble(ls(envir))
 5 |   args <- rlang::set_names(args, "name")
 6 |   args <- args[grepl("args", args$name), ]
 7 | 
 8 |   args$model <- sub("_args", "", args$name)
 9 |   args$args <- lapply(args$name, function(x) envir[[x]])
10 | 
11 |   args <- args %>%
12 |     tidyr::unnest("args") %>%
13 |     dplyr::select(!!rlang::sym("model"):!!rlang::sym("original"))
14 | 
15 |   args <- args[args$model == model_name, ]
16 | 
17 |   args %>%
18 |     dplyr::select(-dplyr::one_of("model")) %>%
19 |     tidyr::pivot_wider(
20 |       names_from = !!rlang::sym("engine"),
21 |       values_from = !!rlang::sym("original")
22 |     )
23 | }
24 | 
25 | rename_list <- function(x, new_names) {
26 |   rename_args <-
27 |     sapply(names(x), function(nm) {
28 |       if (nm %in% names(new_names)) {
29 |         new_names[names(new_names) == nm]
30 |       } else {
31 |         nm
32 |       }
33 |     },
34 |     USE.NAMES = FALSE
35 |     )
36 |   names(x) <- rename_args
37 |   x
38 | }
39 | 


--------------------------------------------------------------------------------
/R/tune_grid_h2o.R:
--------------------------------------------------------------------------------
  1 | #' Tune h2o models
  2 | #'
  3 | #' This is a prototype of a version of tune_grid that uses h2o.grid to perform
  4 | #' hyperparameter tuning.
  5 | #'
  6 | #' @section Limitations:
  7 | #' - Only model arguments can be tuned, not arguments in the preprocessing
  8 | #' recipes.
  9 | #'
 10 | #' - Parsnip only allows `data.frame` and `tbl_spark` objects to be passed to
 11 | #' the `fit` method, not `H2OFrame` objects.
 12 | #'
 13 | #' @param object A parsnip `model_spec` object.
 14 | #' @param preprocessor A `recipe` object.
 15 | #' @param resamples An `rset` object.
 16 | #' @param param_info A `dials::parameters()` object or NULL. If none is given, a
 17 | #'   parameters set is derived from other arguments. Passing this argument can
 18 | #'   be useful when parameter ranges need to be customized.
 19 | #' @param grid A `data.frame` of tuning combinations or a positive integer. The
 20 | #'   data frame should have columns for each parameter being tuned and rows for
 21 | #'   tuning parameter candidates. An integer denotes the number of candidate
 22 | #'   parameter sets to be created automatically. If a positive integer is used
 23 | #'   or no tuning grid is supplied, then a semi-random grid via
 24 | #'   `dials::grid_latin_hypercube` is created based on the specified number of
 25 | #'   tuning iterations (default size = 10).
 26 | #' @param metrics A `yardstick::metric_set` or NULL. Note that not all yardstick
 27 | #'   metrics can be used with `tune_grid_h2o`. The metrics must be one of
 28 | #'   `yardstick::rsq`, `yardstick::rmse` or `h2oparsnip::mse` for regression
 29 | #'   models, and `yardstick::accuracy`, `yardstick::mn_log_loss`,
 30 | #'   `yardstick::roc_auc` or `yardstick::pr_auc` for classification models. If
 31 | #'   NULL then the default is `yardstick::rsq` for regression models and
 32 | #'   `yardstick::mn_log_loss` for classification models.
 33 | #' @param control An object used to modify the tuning process.
 34 | #' @param ... Not currently used.
 35 | #'
 36 | #' @return
 37 | #' @export
 38 | tune_grid_h2o <-
 39 |   function(object,
 40 |            preprocessor = NULL,
 41 |            resamples,
 42 |            param_info = NULL,
 43 |            grid = 10,
 44 |            metrics = NULL,
 45 |            control = control_h2o(),
 46 |            ...) {
 47 | 
 48 |     # check parameters
 49 |     if (inherits(object, "workflow")) {
 50 |       preprocessor <- workflows::pull_workflow_preprocessor(object)
 51 |       object <- workflows::pull_workflow_spec(object)
 52 |     }
 53 | 
 54 |     if (is.null(param_info)) {
 55 |       param_info <- tune::parameters(object)
 56 |     }
 57 | 
 58 |     if (inherits(grid, "numeric")) {
 59 |       grid <- dials::grid_latin_hypercube(param_info, size = grid)
 60 |     }
 61 | 
 62 |     if (!inherits(metrics, "metric_set")) {
 63 |       rlang::abort("argument `metrics` must be a `yardstick::metric_set`")
 64 |     }
 65 | 
 66 |     # check for supported scoring metrics
 67 |     metric_attrs <- attributes(metrics)
 68 |     metric_names <- names(metric_attrs$metrics)
 69 | 
 70 |     # tuning control options
 71 |     if (isFALSE(control$verbose)) {
 72 |       h2o::h2o.no_progress()
 73 |     }
 74 | 
 75 |     # get model mode
 76 |     model_mode <- object$mode
 77 | 
 78 |     # get model specification arguments
 79 |     model_args <- object$args
 80 | 
 81 |     # process scoring metric
 82 |     if (is.null(metrics) & model_mode == "classification") {
 83 |       metrics <- metric_set(yardstick::mn_log_loss)
 84 |     }
 85 |     if (is.null(metrics) & model_mode == "regression") {
 86 |       metrics <- metric_set(yardstick::rsq)
 87 |     }
 88 |     h2o_metrics <- convert_h2o_metrics(metrics)
 89 | 
 90 |     # extract complete dataset from resamples
 91 |     data_train <- rsample::training(resamples$splits[[1]])
 92 |     data_test <- rsample::testing(resamples$splits[[1]])
 93 |     full_data <- dplyr::bind_rows(data_train, data_test)
 94 |     row_order <- c(as.numeric(rownames(data_train)), as.numeric(rownames(data_test)))
 95 |     full_data <- as_tibble(full_data[order(row_order), ])
 96 | 
 97 |     # prep the recipe
 98 |     prepped_recipe <- preprocessor %>%
 99 |       recipes::prep(training = full_data, retain = TRUE)
100 | 
101 |     full_data <- prepped_recipe %>%
102 |       recipes::bake(new_data = NULL)
103 | 
104 |     # get predictor and outcome terms
105 |     outcome <- prepped_recipe$term_info %>%
106 |       dplyr::filter(!!rlang::sym("role") == "outcome") %>%
107 |       dplyr::pull(!!rlang::sym("variable"))
108 | 
109 |     predictors <- prepped_recipe$term_info %>%
110 |       dplyr::filter(!!rlang::sym("role") == "predictor") %>%
111 |       dplyr::pull(!!rlang::sym("variable"))
112 | 
113 |     form <- paste(outcome, "~", paste(predictors, collapse = " + "))
114 |     form <- stats::as.formula(form)
115 | 
116 |     # replace descriptors
117 |     if (parsnip:::requires_descrs(object)) {
118 |       data_stats <- parsnip:::get_descr_form(form, full_data)
119 |       parsnip:::scoped_descrs(data_stats)
120 |     }
121 | 
122 |     # get row indexes of assessment set data for each rsplit
123 |     assessment_indices <- purrr::map(resamples$splits, rsample::complement)
124 | 
125 |     # pass full data to h2o
126 |     full_data_h2o <- h2o::as.h2o(full_data, destination_frame = "grid_data")
127 | 
128 |     # translate parsnip arguments to h2o
129 |     alg <- model_spec_to_algorithm(object, model_args)
130 |     algorithm <- alg[[1]]
131 |     model_name <- alg[[2]]
132 |     model_args <- alg[[3]]
133 | 
134 |     original_names <- translate_args(model_name) %>%
135 |       dplyr::select(dplyr::all_of(c("parsnip", "h2o"))) %>%
136 |       tidyr::drop_na()
137 | 
138 |     nm <- original_names$h2o %>%
139 |       rlang::set_names(original_names$parsnip)
140 | 
141 |     model_args <- rename_list(model_args, nm)
142 |     tuning_args <- as.character(nm[tune::tune_args(object)$name])
143 |     rename_args <- nm[nm %in% tuning_args]
144 | 
145 |     # convert tuning values to a named list of hyperparameters and values
146 |     # e.g. list(mtries = c(3, 5, 7), min_rows = c(1, 5, 10))
147 |     params <- as.list(grid)
148 |     params <- purrr::map(params, ~ .x[!duplicated(.x)])
149 |     params <- rename_list(params, nm)
150 | 
151 |     # remove arguments that are not set
152 |     null_args <- sapply(model_args, function(x) is.null(rlang::eval_tidy(x)))
153 |     model_args <- model_args[!null_args]
154 | 
155 |     # check tunable
156 |     if (!any(tuning_args %in% names(params))) {
157 |       missing_tune <- tuning_args[!tuning_args %in% names(params)]
158 |       rlang::abort(paste("missing arguments", missing_tune))
159 |     }
160 | 
161 |     model_args <- model_args[!names(model_args) %in% tuning_args]
162 |     model_args <- append(model_args, object$eng_args)
163 | 
164 |     if (length(model_args) == 0) {
165 |       model_args <- NULL
166 |     }
167 | 
168 |     # fit h2o.grid on each resample
169 |     grid_ids <- replicate(
170 |       length(assessment_indices),
171 |       generate_random_id(glue::glue("{algorithm}_grid"))
172 |     )
173 | 
174 |     search_criteria <-
175 |       list(strategy = "RandomDiscrete", max_models = nrow(grid))
176 | 
177 |     resamples$.metrics <-
178 |       purrr::map2(assessment_indices, grid_ids, function(ids, grid_id) {
179 |         grid_args <- list(
180 |           grid_id = grid_id,
181 |           algorithm = algorithm,
182 |           x = predictors,
183 |           y = outcome,
184 |           training_frame = full_data_h2o[-ids, ],
185 |           validation_frame = full_data_h2o[ids, ],
186 |           hyper_params = params,
187 |           keep_cross_validation_predictions = FALSE,
188 |           keep_cross_validation_models = FALSE,
189 |           search_criteria = search_criteria,
190 |           parallelism = 0
191 |         )
192 | 
193 |         # set control options
194 |         if (control$save_pred) {
195 |           grid_args$keep_cross_validation_predictions <- TRUE
196 |         }
197 | 
198 |         # call h2o.grid
199 |         res <- make_h2o_call("h2o.grid", grid_args, model_args)
200 | 
201 |         # extract the scores from the cross-validation predictions
202 |         purrr::map2_dfr(
203 |           .x = h2o_metrics, .y = names(h2o_metrics),
204 |           .f = extract_h2o_scores, grid_args$grid_id,
205 |           params, rename_args, model_mode
206 |         )
207 |       })
208 | 
209 |     # optionally extract the predictions
210 |     if (control$save_pred) {
211 |       resamples$.predictions <- extract_h2o_preds(
212 |         assessment_indices, grid_ids,
213 |         full_data_h2o, rename_args, model_mode
214 |       )
215 |     }
216 | 
217 |     # optionally store/remove the models from the cluster
218 |     if (control$save_models) {
219 |       resamples$.models <- extract_h2o_models(grid_ids, rename_args)
220 |     } else {
221 |       remove_h2o_models(grid_ids)
222 |     }
223 | 
224 |     # add the .notes column (empty for now)
225 |     resamples$.notes <- purrr::map(nrow(resamples), ~ tibble::as_tibble_col(character()))
226 | 
227 |     # create a `tune_results` class
228 |     class(resamples) <- c("tune_results", class(resamples))
229 | 
230 |     arg_names <- names(grid)
231 |     param_list <- purrr::map(
232 |       arg_names,
233 |       ~ glue::glue("dials::{.x}()") %>%
234 |         rlang::parse_expr() %>%
235 |         rlang::eval_tidy()
236 |     )
237 |     attr(resamples, "parameters") <- dials::parameters(param_list)
238 | 
239 |     names(attributes(metrics)$metrics) <-
240 |       gsub("yardstick::", "", names(attributes(metrics)$metrics))
241 |     names(attributes(metrics)$metrics) <-
242 |       gsub("h2oparsnip::", "", names(attributes(metrics)$metrics))
243 |     attr(resamples, "metrics") <- metrics
244 | 
245 |     resamples
246 |   }
247 | 
248 | 
249 | #' Removes the models creating during tuning resampling
250 | #'
251 | #' @param grid_ids A character vector of h2o grid ids for each tuning resample.
252 | #'
253 | #' @return
254 | #' @keywords internal
255 | remove_h2o_models <- function(grid_ids) {
256 |   for (grid_id in grid_ids) {
257 |     grid <- h2o::h2o.getGrid(grid_id)
258 | 
259 |     for (model_id in as.character(grid@model_ids)) {
260 |       h2o::h2o.rm(model_id)
261 |     }
262 |   }
263 | }
264 | 
265 | 
266 | #' Extract the h2o model ids for all of the tuning grids and return them as list
267 | #' of tibbles.
268 | #'
269 | #' @param grid_ids A character vector of the h2o ids for the tuning grids.
270 | #' @param rename_args A named character vector used to remap the h2o
271 | #'   hyperparameter names to tidymodels nomenclature. The names of the vector
272 | #'   are the tidymodels nomenclature and the values are the h2o nomenclature,
273 | #'   e.g. c(mtry = "mtries", min_n = "min_rows")
274 | #'
275 | #' @return a list of tibbles
276 | #'
277 | #' @keywords internal
278 | extract_h2o_models <- function(grid_ids, rename_args) {
279 |   purrr::map(grid_ids, function(grid_id) {
280 |     grid <- h2o::h2o.getGrid(grid_id)
281 |     model_ids <- as.character(grid@model_ids)
282 |     df <- grid@summary_table[rename_args]
283 |     df <- dplyr::rename(df, dplyr::all_of(rename_args))
284 |     df$.model_ids <- as.character(grid@model_ids)
285 |     df
286 |   })
287 | }
288 | 
289 | 
290 | #' Extract the predictions for each tuning grid from the h2o cluster
291 | #'
292 | #' @param test_indices A numeric vector if the row numbers of the H2OFrame that
293 | #'   represent the assessment samples.
294 | #' @param grid_ids A character vector of the h2o ids for the tuning grids.
295 | #' @param data A H2OFrame object.
296 | #' @param rename_args A named character vector used to remap the h2o
297 | #'   hyperparameter names to tidymodels nomenclature. The names of the vector
298 | #'   are the tidymodels nomenclature and the values are the h2o nomenclature,
299 | #'   e.g. c(mtry = "mtries", min_n = "min_rows")
300 | #'
301 | #' @return A tibble
302 | #'
303 | #' @keywords internal
304 | extract_h2o_preds <- function(test_indices, grid_ids, data, rename_args, model_mode) {
305 |   predictions <- purrr::map2(test_indices, grid_ids, function(ids, grid_id) {
306 |     grid <- h2o::h2o.getGrid(grid_id)
307 |     model_ids <- as.character(grid@model_ids)
308 |     grid_args <- grid@summary_table[rename_args]
309 |     grid_args <- dplyr::rename(grid_args, dplyr::all_of(rename_args))
310 | 
311 |     purrr::map_dfr(seq_along(model_ids), function(i) {
312 |       model <- h2o::h2o.getModel(model_ids[[i]])
313 |       args <- grid_args[i, ]
314 |       preds <- tibble::as_tibble(predict(model, data[ids, ]))
315 | 
316 |       if (model_mode == "classification") {
317 |         names(preds) <- ".pred_class"
318 |       } else {
319 |         names(preds) <- ".pred"
320 |       }
321 |       dplyr::bind_cols(preds, args, .row = ids)
322 |     })
323 |   })
324 | 
325 |   return(predictions)
326 | }
327 | 
328 | 
329 | #' Score the tuning results
330 | #'
331 | #' @param h2o_metric_name A character with the name of the h2o metric used to
332 | #'   score the tuning resamples.
333 | #' @param yardstick_metric_name A character with the name of the equivalent
334 | #'   yardstick metric used to score the tuning resamples.
335 | #' @param grid_id The h2o id for the tuning grid.
336 | #' @param params A named list of hyperparameters and their values.
337 | #' @param rename_args A named character vector used to remap the h2o
338 | #'   hyperparameter names to tidymodels nomenclature. The names of the vector
339 | #'   are the tidymodels nomenclature and the values are the h2o nomenclature,
340 | #'   e.g. c(mtry = "mtries", min_n = "min_rows")
341 | #' @param model_mode The mode of the model, either "classification",
342 | #'   "regression", or "multiclass".
343 | #'
344 | #' @return
345 | #'
346 | #' @keywords internal
347 | extract_h2o_scores <-
348 |   function(h2o_metric_name,
349 |            yardstick_metric_name,
350 |            grid_id,
351 |            params,
352 |            rename_args,
353 |            model_mode) {
354 |     tuning_args <- names(params)
355 | 
356 |     grid <- h2o::h2o.getGrid(
357 |       grid_id = grid_id,
358 |       sort_by = h2o_metric_name,
359 |       decreasing = FALSE
360 |     )
361 | 
362 |     scores <- as.data.frame(grid@summary_table)
363 |     scores[, ncol(scores)] <-
364 |       as.numeric(scores[, ncol(scores)])
365 | 
366 |     for (x in names(params)) {
367 |       scores[[x]] <- gsub("\\[", "", scores[[x]])
368 |       scores[[x]] <- gsub("\\]", "", scores[[x]])
369 |     }
370 | 
371 |     # create the tune-like resamples object
372 |     scores <- scores %>%
373 |       as_tibble() %>%
374 |       dplyr::select(-dplyr::one_of("model_ids")) %>%
375 |       dplyr::mutate_at(tuning_args, as.numeric) %>%
376 |       dplyr::mutate(
377 |         .metric = yardstick_metric_name,
378 |         .estimator = dplyr::if_else(model_mode == "classification", "multiclass", "standard")
379 |       ) %>%
380 |       dplyr::rename(.estimate = !!h2o_metric_name) %>%
381 |       dplyr::rename(dplyr::all_of(rename_args))
382 | 
383 |     return(scores)
384 |   }
385 | 
386 | 
387 | #' Return the equivalent h2o algorithm name for a parsnip `model_spec` object.
388 | #'
389 | #' @param object A parsnip `model_spec` object.
390 | #' @param model_args A list of model arguments.
391 | #'
392 | #' @return A list with algorithm, the model name and the arguments with the
393 | #'   family attribute set for specific models (e.g. glm).
394 | #'
395 | #' @keywords internal
396 | model_spec_to_algorithm <- function(object, model_args) {
397 |   if (inherits(object, "boost_tree")) {
398 |     model_name <- "boost_tree"
399 |     algorithm <- "gbm"
400 |   }
401 |   if (inherits(object, "rand_forest")) {
402 |     model_name <- "rand_forest"
403 |     algorithm <- "randomForest"
404 |   }
405 |   if (inherits(object, "linear_reg")) {
406 |     model_name <- "linear_reg"
407 |     algorithm <- "glm"
408 |     model_args$family <- "gaussian"
409 |   }
410 |   if (inherits(object, "logistic_reg")) {
411 |     model_name <- "logistic_reg"
412 |     algorithm <- "glm"
413 |     model_args$family <- "binomial"
414 |   }
415 |   if (inherits(object, "multinom_reg")) {
416 |     model_name <- "multinom_reg"
417 |     algorithm <- "glm"
418 |     model_args$family <- "multinomial"
419 |   }
420 |   if (inherits(object, "mlp")) {
421 |     model_name <- "mlp"
422 |     algorithm <- "deeplearning"
423 |   }
424 |   if (inherits(object, "naive_Bayes")) {
425 |     model_name <- "naive_Bayes"
426 |     algorithm <- "naiveBayes"
427 |   }
428 | 
429 |   return(list(algorithm, model_name, model_args))
430 | }
431 | 


--------------------------------------------------------------------------------
/R/utils-pipe.R:
--------------------------------------------------------------------------------
 1 | #' Pipe operator
 2 | #'
 3 | #' See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details.
 4 | #'
 5 | #' @name %>%
 6 | #' @rdname pipe
 7 | #' @keywords internal
 8 | #' @export
 9 | #' @importFrom magrittr %>%
10 | #' @usage lhs \%>\% rhs
11 | NULL
12 | 


--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
1 | generate_random_id <- function(prefix) {
2 |   paste(prefix, as.integer(stats::runif(1, 0, 1e9)), sep = "_")
3 | }
4 | 


--------------------------------------------------------------------------------
/R/zzz.R:
--------------------------------------------------------------------------------
 1 | .onLoad <- function(libname, pkgname) {
 2 | 
 3 |   # if (any(loadedNamespaces() == "discrim")) {
 4 |   #   add_naive_Bayes_h2o()
 5 |   # }
 6 |   #
 7 |   # if (any(loadedNamespaces() == "rules")) {
 8 |   #   add_rule_fit_h2o()
 9 |   # }
10 | 
11 |   add_naive_Bayes_h2o()
12 |   add_rule_fit_h2o()
13 | 
14 |   add_mlp_h2o()
15 |   add_boost_tree_h2o()
16 |   add_rand_forest_h2o()
17 |   add_multinom_reg_h2o()
18 |   add_logistic_reg_h2o()
19 |   add_linear_reg_h2o()
20 |   add_automl()
21 | }
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # h2oparsnip
 2 | 
 3 | <!-- badges: start -->
 4 | [![CRAN status](https://www.r-pkg.org/badges/version/h2oparsnip)](https://CRAN.R-project.org/package=h2oparsnip)
 5 | [![Lifecycle: experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://www.tidyverse.org/lifecycle/#experimental)
 6 | <!-- badges: end -->
 7 | 
 8 | **NOTE the development of h2oparsnip is being integrated within tidymodels as the [agua](https://github.com/tidymodels/agua) package. New issues and feature requests should be made in the agua repository**.
 9 | 
10 | ```h2oparsnip``` provides a set of wrappers to bind h2o algorthms with the
11 | 'parsnip' package.
12 | 
13 | This package is early in development. Currently the following h2o algorithms
14 | are implemented:
15 | 
16 | - h2o.deeplearning engine added to parsnip::mlp model specification
17 | - h2o.gbm engine added to parsnip::boost_tree model specification
18 | - h2o.randomForest engine added to parsnip::rand_forest model specification
19 | - h2o.glm engine added to multinom_reg, logistic_reg and linear_reg model
20 | specifications
21 | - h2o.naiveBayes engine added to naive_Bayes specification
22 | - a new model, automl
23 | - h2o.rulefit engine added to parsnip::rule_fit
24 | 
25 | ## Installation
26 | 
27 | The package is not yet on CRAN and can be installed with:
28 | 
29 | ``` r
30 | devtools::install_github("stevenpawley/h2oparsnip")
31 | ```
32 | 
33 | ## Notes
34 | 
35 | The package currently is based on the concept of using h2o as a disposable backend, using h2o as a drop-in replacement for the traditionally used 'engines' within the parsnip package. However, performing tasks such as hyperparameter tuning via the 'tune' packge will be less efficient if working on a remote cluster than using h2o directly because data is being sent back and forth.
36 | 
37 | h2oparsnip also does not provide any management of the h2o cluster. If lots of models are being run then available memory within the cluster may be exhausted. Currently this has to be managed using the commands in the h2o package.
38 | 
39 | ## Basic Usage
40 | 
41 | An example of using the h2o engine for boosted trees. For boosted trees, hyperparameter tuning of the number of trees is performed using a multi_predict method, i.e. the algorithm fits the maximum number of trees in the parameter grid and can be tuned using fewer trees without retraining.
42 | 
43 | ```
44 | library(tidymodels)
45 | library(h2oparsnip)
46 | library(h2o)
47 | 
48 | h2o.init()
49 | 
50 | gbm <- boost_tree(mode = "regression", trees = tune(), min_n = tune()) %>%
51 |   set_engine("h2o")
52 | 
53 | rec <- iris %>%
54 |   recipe(Petal.Length ~ .)
55 | 
56 | params <- expand.grid(trees = c(10, 20, 30), min_n = c(1, 5, 10))
57 | resamples = mc_cv(iris, times = 1)
58 | tune_results <- tune_grid(
59 |   object = gbm,
60 |   preprocessor = rec,
61 |   resamples = resamples,
62 |   grid = params,
63 |   metrics = metric_set(rmse)
64 | )
65 | ```
66 | 
67 | ## Tuning (alternative)
68 | 
69 | A problem with using `tune::tune_grid` is that performance is reduced because the data for every tuning hyperparameter iteration and resampling is moved from R to the h2o cluster. To minimize this, the `tune_grid_h2o` function can be used to tune model arguments, as a near drop-in replacement:
70 | 
71 | ```
72 | tune_results <- tune_grid_h2o(
73 |   object = gbm,
74 |   preprocessor = rec,
75 |   resamples = resamples,
76 |   grid = params,
77 |   metrics = metric_set(rmse)
78 | )
79 | ```
80 | 
81 | Currently, `tune_grid_h2o` can only tune model parameters and does not handle recipes with tunable parameters. `tune_grid_h2o` moves the data to the h2o cluster only once, i.e. the complete dataset specified by the `resamples` argument is moved to the cluster, and then the equivalent h2o.frame is split based on the row indices in the resampling object, and the `h2o::h2o.grid` function is used for tuning on the h2o frames. To avoid repeatedly moving predictions back from h2o to R, all metrics are also calculated on the cluster. This restricts the range of metrics to what is available in h2o (`tune_grid_h2o` maps **yardstick** metrics to their h2o equivalents). The available metrics are listed in the `tune_grid_h2o` help documentation. However, hyperparameter tuning using `tune_grid_h2o` should be similarly performant as when using h2o directly.
82 | 
83 | ### Control
84 | 
85 | Similar to `tune::control_grid`, details of `tune_grid_h2o` can be configured using `tune_grid_h2o(control = control_h2o())`. This allows the predictions and/or models to be saved (the default is that they are removed after tuning to avoid clutter in the cluster).
86 | 


--------------------------------------------------------------------------------
/_pkgdown.yml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stevenpawley/h2oparsnip/913d60e9ea72cb8e0e12edca4cf998de46cd52ae/_pkgdown.yml


--------------------------------------------------------------------------------
/h2oparsnip.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 | 
18 | BuildType: Package
19 | PackageUseDevtools: Yes
20 | PackageInstallArgs: --no-multiarch --with-keep.source
21 | PackageCheckArgs: --as-cran
22 | PackageRoxygenize: rd,collate,namespace
23 | 


--------------------------------------------------------------------------------
/man/automl.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/automl.R
 3 | \name{automl}
 4 | \alias{automl}
 5 | \title{General interface for automl models}
 6 | \usage{
 7 | automl(mode = "classification")
 8 | }
 9 | \arguments{
10 | \item{mode}{A single character string for the type of model.}
11 | }
12 | \value{
13 | A model_spec
14 | }
15 | \description{
16 | General interface for automl models
17 | }
18 | 


--------------------------------------------------------------------------------
/man/control_h2o.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/control_h2o.R
 3 | \name{control_h2o}
 4 | \alias{control_h2o}
 5 | \title{Control aspects of the grid search process}
 6 | \usage{
 7 | control_h2o(
 8 |   verbose = FALSE,
 9 |   save_pred = FALSE,
10 |   save_models = FALSE,
11 |   event_level = "first"
12 | )
13 | }
14 | \arguments{
15 | \item{verbose}{A logical for logging results as they are generated.}
16 | 
17 | \item{save_pred}{A logical for whether the out-of-sample predictions should be saved
18 | for each model evaluated.}
19 | 
20 | \item{save_models}{A logical for whether to retain the models associated with the
21 | tuning and resampling iterations within the h2o cluster and append their h2o model
22 | ids to the resamples object as a '.models' column.}
23 | }
24 | \value{
25 | An object of `control_grid` and `control_resamples` class.
26 | }
27 | \description{
28 | `control_h2o` provides a function to set various aspects of the grid search process.
29 | By default during tuning, the resampling predictions are stored within the h2o
30 | cluster. To save memory and space, use 'save_pred = FALSE'.
31 | ``
32 | }
33 | \examples{
34 | # to save space in the cluster use these settings (the defaults)
35 | control_h2o(verbose = TRUE, save_pred = FALSE)
36 | }
37 | 


--------------------------------------------------------------------------------
/man/extract_h2o_models.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/tune_grid_h2o.R
 3 | \name{extract_h2o_models}
 4 | \alias{extract_h2o_models}
 5 | \title{Extract the h2o model ids for all of the tuning grids and return them as list
 6 | of tibbles.}
 7 | \usage{
 8 | extract_h2o_models(grid_ids, rename_args)
 9 | }
10 | \arguments{
11 | \item{grid_ids}{A character vector of the h2o ids for the tuning grids.}
12 | 
13 | \item{rename_args}{A named character vector used to remap the h2o
14 | hyperparameter names to tidymodels nomenclature. The names of the vector
15 | are the tidymodels nomenclature and the values are the h2o nomenclature,
16 | e.g. c(mtry = "mtries", min_n = "min_rows")}
17 | }
18 | \value{
19 | a list of tibbles
20 | }
21 | \description{
22 | Extract the h2o model ids for all of the tuning grids and return them as list
23 | of tibbles.
24 | }
25 | \keyword{internal}
26 | 


--------------------------------------------------------------------------------
/man/extract_h2o_preds.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/tune_grid_h2o.R
 3 | \name{extract_h2o_preds}
 4 | \alias{extract_h2o_preds}
 5 | \title{Extract the predictions for each tuning grid from the h2o cluster}
 6 | \usage{
 7 | extract_h2o_preds(test_indices, grid_ids, data, rename_args, model_mode)
 8 | }
 9 | \arguments{
10 | \item{test_indices}{A numeric vector if the row numbers of the H2OFrame that
11 | represent the assessment samples.}
12 | 
13 | \item{grid_ids}{A character vector of the h2o ids for the tuning grids.}
14 | 
15 | \item{data}{A H2OFrame object.}
16 | 
17 | \item{rename_args}{A named character vector used to remap the h2o
18 | hyperparameter names to tidymodels nomenclature. The names of the vector
19 | are the tidymodels nomenclature and the values are the h2o nomenclature,
20 | e.g. c(mtry = "mtries", min_n = "min_rows")}
21 | }
22 | \value{
23 | A tibble
24 | }
25 | \description{
26 | Extract the predictions for each tuning grid from the h2o cluster
27 | }
28 | \keyword{internal}
29 | 


--------------------------------------------------------------------------------
/man/extract_h2o_scores.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/tune_grid_h2o.R
 3 | \name{extract_h2o_scores}
 4 | \alias{extract_h2o_scores}
 5 | \title{Score the tuning results}
 6 | \usage{
 7 | extract_h2o_scores(
 8 |   h2o_metric_name,
 9 |   yardstick_metric_name,
10 |   grid_id,
11 |   params,
12 |   rename_args,
13 |   model_mode
14 | )
15 | }
16 | \arguments{
17 | \item{h2o_metric_name}{A character with the name of the h2o metric used to
18 | score the tuning resamples.}
19 | 
20 | \item{yardstick_metric_name}{A character with the name of the equivalent
21 | yardstick metric used to score the tuning resamples.}
22 | 
23 | \item{grid_id}{The h2o id for the tuning grid.}
24 | 
25 | \item{params}{A named list of hyperparameters and their values.}
26 | 
27 | \item{rename_args}{A named character vector used to remap the h2o
28 | hyperparameter names to tidymodels nomenclature. The names of the vector
29 | are the tidymodels nomenclature and the values are the h2o nomenclature,
30 | e.g. c(mtry = "mtries", min_n = "min_rows")}
31 | 
32 | \item{model_mode}{The mode of the model, either "classification",
33 | "regression", or "multiclass".}
34 | }
35 | \value{
36 | 
37 | }
38 | \description{
39 | Score the tuning results
40 | }
41 | \keyword{internal}
42 | 


--------------------------------------------------------------------------------
/man/h2o_automl_train.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/automl.R
 3 | \name{h2o_automl_train}
 4 | \alias{h2o_automl_train}
 5 | \title{Wrapper for training a h2o.automl model}
 6 | \usage{
 7 | h2o_automl_train(formula, data, ...)
 8 | }
 9 | \arguments{
10 | \item{formula}{formula}
11 | 
12 | \item{data}{data.frame of training data}
13 | 
14 | \item{...}{Other arguments to pass the h2o.automl}
15 | }
16 | \value{
17 | evaluated h2o model call
18 | }
19 | \description{
20 | Wrapper for training a h2o.automl model
21 | }
22 | 


--------------------------------------------------------------------------------
/man/h2o_gbm_train.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/boost_tree.R
 3 | \name{h2o_gbm_train}
 4 | \alias{h2o_gbm_train}
 5 | \title{Wrapper for training a h2o.gbm model as part of a parsnip `boost_tree`
 6 | h2o engine}
 7 | \usage{
 8 | h2o_gbm_train(
 9 |   formula,
10 |   data,
11 |   ntrees = 50,
12 |   max_depth = 5,
13 |   min_rows = 10,
14 |   learn_rate = 0.1,
15 |   sample_rate = 1,
16 |   col_sample_rate = 1,
17 |   min_split_improvement = 1e-05,
18 |   stopping_rounds = 0,
19 |   validation = 0,
20 |   algorithm = "h2o.gbm",
21 |   ...
22 | )
23 | }
24 | \arguments{
25 | \item{formula}{formula}
26 | 
27 | \item{data}{data.frame of training data}
28 | 
29 | \item{ntrees}{integer, the number of trees to build (default = 50).}
30 | 
31 | \item{max_depth}{integer, the maximum tree depth (default = 10).}
32 | 
33 | \item{min_rows}{integer, the minimum number of observations for a leaf
34 | (default = 10).}
35 | 
36 | \item{learn_rate}{numeric, the learning rate (default = 0.1, range is from
37 | 0.0 to 1.0).}
38 | 
39 | \item{sample_rate}{numeric, the proportion of samples to use to build each
40 | tree (default = 1.0).}
41 | 
42 | \item{col_sample_rate}{numeric, the proportion of features available during
43 | each node split (default = 1.0).}
44 | 
45 | \item{min_split_improvement}{numeric,  minimum relative improvement in
46 | squared error reduction in order for a split to happen (default = 1e-05)}
47 | 
48 | \item{stopping_rounds}{An integer specifying the number of training
49 | iterations without improvement before stopping. If `stopping_rounds = 0`
50 | (the default) then early stopping is disabled.  If `validation` is used,
51 | performance is base on the validation set; otherwise the training set is
52 | used.}
53 | 
54 | \item{validation}{A positive number. If on `[0, 1)` the value, `validation`
55 | is a random proportion of data in `x` and `y` that are used for performance
56 | assessment and potential early stopping. If 1 or greater, it is the _number_
57 | of training set samples use for these purposes.}
58 | 
59 | \item{algorithm}{Whether to use the default h2o 'h2o.gbm' algorithm or use
60 | 'h2o.xgboost' via h2o.}
61 | 
62 | \item{...}{other arguments passed to the h2o engine.}
63 | }
64 | \value{
65 | evaluated h2o model call
66 | }
67 | \description{
68 | Wrapper for training a h2o.gbm model as part of a parsnip `boost_tree`
69 | h2o engine
70 | }
71 | 


--------------------------------------------------------------------------------
/man/h2o_glm_train.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/glm_wrapper.R
 3 | \name{h2o_glm_train}
 4 | \alias{h2o_glm_train}
 5 | \title{Wrapper for training a h2o.glm model as part of a parsnip}
 6 | \usage{
 7 | h2o_glm_train(formula, data, alpha = NULL, lambda = NULL, family = NULL, ...)
 8 | }
 9 | \arguments{
10 | \item{formula}{formula}
11 | 
12 | \item{data}{data.frame of training data}
13 | 
14 | \item{alpha}{numeric, Distribution of regularization between the L1 (Lasso)
15 | and L2 (Ridge) penalties. A value of 1 for alpha represents Lasso
16 | regression, a value of 0 produces Ridge regression.}
17 | 
18 | \item{lambda}{numeric, regularization strength}
19 | 
20 | \item{family}{character, one of c("gaussian", "binomial", "quasibinomial",
21 | "ordinal", "multinomial", "poisson", "gamma", "tweedie",
22 | "negativebinomial")}
23 | 
24 | \item{...}{other arguments passed to the h2o engine.}
25 | }
26 | \value{
27 | evaluated h2o model call
28 | }
29 | \description{
30 | Wrapper for training a h2o.glm model as part of a parsnip
31 | }
32 | 


--------------------------------------------------------------------------------
/man/h2o_mlp_train.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/mlp.R
 3 | \name{h2o_mlp_train}
 4 | \alias{h2o_mlp_train}
 5 | \title{Wrapper for training a h2o.deeplearning model as part of a parsnip `mlp`
 6 | h2o engine}
 7 | \usage{
 8 | h2o_mlp_train(
 9 |   formula,
10 |   data,
11 |   l2 = 0,
12 |   hidden_dropout_ratios = 0,
13 |   hidden = 100,
14 |   epochs = 10,
15 |   activation = "Rectifier",
16 |   stopping_rounds = 0,
17 |   validation = 0,
18 |   ...
19 | )
20 | }
21 | \arguments{
22 | \item{formula}{formula}
23 | 
24 | \item{data}{data.frame of training data}
25 | 
26 | \item{l2}{numeric, l2 regulation parameter, default = 0}
27 | 
28 | \item{hidden_dropout_ratios}{dropout ratio for a single hidden layer (default
29 | = 0)}
30 | 
31 | \item{hidden}{integer, number of neurons in the hidden layer (default = c(200, 200))}
32 | 
33 | \item{epochs}{integer, number of epochs (default = 10)}
34 | 
35 | \item{activation}{character, activation function. Must be one of: "Tanh",
36 | "TanhWithDropout", "Rectifier", "RectifierWithDropout", "Maxout",
37 | "MaxoutWithDropout". Defaults to "Rectifier. If `hidden_dropout_ratios` > 0
38 | then the equivalent activation function with dropout is used.}
39 | 
40 | \item{stopping_rounds}{An integer specifying the number of training
41 | iterations without improvement before stopping. If `stopping_rounds = 0`
42 | (the default) then early stopping is disabled.  If `validation` is used,
43 | performance is base on the validation set; otherwise the training set is
44 | used.}
45 | 
46 | \item{validation}{A positive number. If on `[0, 1)` the value, `validation`
47 | is a random proportion of data in `x` and `y` that are used for performance
48 | assessment and potential early stopping. If 1 or greater, it is the _number_
49 | of training set samples use for these purposes.}
50 | 
51 | \item{...}{other arguments not currently used}
52 | }
53 | \value{
54 | evaluated h2o model call
55 | }
56 | \description{
57 | Wrapper for training a h2o.deeplearning model as part of a parsnip `mlp`
58 | h2o engine
59 | }
60 | 


--------------------------------------------------------------------------------
/man/h2o_naiveBayes_train.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/naive_Bayes.R
 3 | \name{h2o_naiveBayes_train}
 4 | \alias{h2o_naiveBayes_train}
 5 | \title{Wrapper for training a h2o.naiveBayes model as part of a discrim `naive_Bayes`
 6 | h2o engine}
 7 | \usage{
 8 | h2o_naiveBayes_train(formula, data, laplace = 0, ...)
 9 | }
10 | \arguments{
11 | \item{formula}{formula}
12 | 
13 | \item{data}{data.frame of training data}
14 | 
15 | \item{laplace}{numeric, the Laplace smoothing parameter, must be >= 0.}
16 | 
17 | \item{...}{other arguments passed to the h2o engine.}
18 | }
19 | \value{
20 | a fitted h2o model.
21 | }
22 | \description{
23 | Wrapper for training a h2o.naiveBayes model as part of a discrim `naive_Bayes`
24 | h2o engine
25 | }
26 | 


--------------------------------------------------------------------------------
/man/h2o_rf_train.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/rand_forest.R
 3 | \name{h2o_rf_train}
 4 | \alias{h2o_rf_train}
 5 | \title{Wrapper for training a h2o.randomForest model as part of a parsnip
 6 | `rand_forest` h2o engine}
 7 | \usage{
 8 | h2o_rf_train(
 9 |   formula,
10 |   data,
11 |   ntrees = 50,
12 |   min_rows = 10,
13 |   mtries = -1,
14 |   stopping_rounds = 0,
15 |   validation = 0,
16 |   ...
17 | )
18 | }
19 | \arguments{
20 | \item{formula}{formula}
21 | 
22 | \item{data}{data.frame of training data}
23 | 
24 | \item{ntrees}{integer, the number of trees to build (default = 50)}
25 | 
26 | \item{min_rows}{integer, the minimum number of observations for a leaf
27 | (default = 10)}
28 | 
29 | \item{mtries}{integer, the number of columns to randomly select at each
30 | level. Default of -1 is sqrt(p) for classification and (p/3) for regression.}
31 | 
32 | \item{stopping_rounds}{An integer specifying the number of training
33 | iterations without improvement before stopping. If `stopping_rounds = 0`
34 | (the default) then early stopping is disabled.  If `validation` is used,
35 | performance is base on the validation set; otherwise the training set is
36 | used.}
37 | 
38 | \item{validation}{A positive number. If on `[0, 1)` the value, `validation`
39 | is a random proportion of data in `x` and `y` that are used for performance
40 | assessment and potential early stopping. If 1 or greater, it is the _number_
41 | of training set samples use for these purposes.}
42 | 
43 | \item{...}{other arguments not currently used}
44 | }
45 | \value{
46 | evaluated h2o model call
47 | }
48 | \description{
49 | Wrapper for training a h2o.randomForest model as part of a parsnip
50 | `rand_forest` h2o engine
51 | }
52 | 


--------------------------------------------------------------------------------
/man/h2o_rulefit_train.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/rule_fit.R
 3 | \name{h2o_rulefit_train}
 4 | \alias{h2o_rulefit_train}
 5 | \title{Wrapper for training a h2o.rulefit model as part of a parsnip
 6 | `rule_fit` h2o engine}
 7 | \usage{
 8 | h2o_rulefit_train(
 9 |   formula,
10 |   data,
11 |   rule_generation_ntrees = 50,
12 |   max_rule_length = 3,
13 |   lambda = 0,
14 |   ...
15 | )
16 | }
17 | \arguments{
18 | \item{formula}{formula}
19 | 
20 | \item{data}{data.frame of training data}
21 | 
22 | \item{rule_generation_ntrees}{integer, the number of trees to build (default
23 | = 50)}
24 | 
25 | \item{max_rule_length}{integer, the maximum tree depth (default = 3).}
26 | 
27 | \item{lambda}{Specify the regularization strength for LASSO regressor.}
28 | 
29 | \item{...}{other arguments that are passed to the h2o model}
30 | }
31 | \value{
32 | evaluated h2o model call
33 | }
34 | \description{
35 | Wrapper for training a h2o.rulefit model as part of a parsnip
36 | `rule_fit` h2o engine
37 | }
38 | 


--------------------------------------------------------------------------------
/man/model_spec_to_algorithm.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/tune_grid_h2o.R
 3 | \name{model_spec_to_algorithm}
 4 | \alias{model_spec_to_algorithm}
 5 | \title{Return the equivalent h2o algorithm name for a parsnip `model_spec` object.}
 6 | \usage{
 7 | model_spec_to_algorithm(object, model_args)
 8 | }
 9 | \arguments{
10 | \item{object}{A parsnip `model_spec` object.}
11 | 
12 | \item{model_args}{A list of model arguments.}
13 | }
14 | \value{
15 | A list with algorithm, the model name and the arguments with the
16 |   family attribute set for specific models (e.g. glm).
17 | }
18 | \description{
19 | Return the equivalent h2o algorithm name for a parsnip `model_spec` object.
20 | }
21 | \keyword{internal}
22 | 


--------------------------------------------------------------------------------
/man/mse_vec.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/metrics.R
 3 | \name{mse_vec}
 4 | \alias{mse_vec}
 5 | \alias{mse}
 6 | \alias{mse.data.frame}
 7 | \title{Mean squared error}
 8 | \usage{
 9 | mse_vec(truth, estimate, na_rm = TRUE, ...)
10 | 
11 | mse(data, ...)
12 | 
13 | \method{mse}{data.frame}(data, truth, estimate, na_rm = TRUE, ...)
14 | }
15 | \arguments{
16 | \item{truth}{The column identifier for the true class results (that is a
17 | factor). This should be an unquoted column name although this argument is
18 | passed by expression and supports quasiquotation (you can unquote column
19 | names). For _vec() functions, a factor vector.}
20 | 
21 | \item{estimate}{The column identifier for the predicted class results (that
22 | is also factor). As with truth this can be specified different ways but the
23 | primary method is to use an unquoted variable name. For _vec() functions, a
24 | factor vector.}
25 | 
26 | \item{na_rm}{A logical value indicating whether NA values should be stripped
27 | before the computation proceeds.}
28 | 
29 | \item{...}{Not currently used.}
30 | 
31 | \item{data}{A `data.frame` containing the `truth` and `estimate` columns.}
32 | }
33 | \value{
34 | A `tibble` with columns `.metric`, `.estimator`, and `.estimate` and 1 row of values.
35 | }
36 | \description{
37 | Calculate the mean squared error. This metric is in squared units of the original data.
38 | }
39 | 


--------------------------------------------------------------------------------
/man/pipe.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils-pipe.R
 3 | \name{\%>\%}
 4 | \alias{\%>\%}
 5 | \title{Pipe operator}
 6 | \usage{
 7 | lhs \%>\% rhs
 8 | }
 9 | \description{
10 | See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details.
11 | }
12 | \keyword{internal}
13 | 


--------------------------------------------------------------------------------
/man/remove_h2o_models.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/tune_grid_h2o.R
 3 | \name{remove_h2o_models}
 4 | \alias{remove_h2o_models}
 5 | \title{Removes the models creating during tuning resampling}
 6 | \usage{
 7 | remove_h2o_models(grid_ids)
 8 | }
 9 | \arguments{
10 | \item{grid_ids}{A character vector of h2o grid ids for each tuning resample.}
11 | }
12 | \value{
13 | 
14 | }
15 | \description{
16 | Removes the models creating during tuning resampling
17 | }
18 | \keyword{internal}
19 | 


--------------------------------------------------------------------------------
/man/tune_grid_h2o.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/tune_grid_h2o.R
 3 | \name{tune_grid_h2o}
 4 | \alias{tune_grid_h2o}
 5 | \title{Tune h2o models}
 6 | \usage{
 7 | tune_grid_h2o(
 8 |   object,
 9 |   preprocessor = NULL,
10 |   resamples,
11 |   param_info = NULL,
12 |   grid = 10,
13 |   metrics = NULL,
14 |   control = control_h2o(),
15 |   ...
16 | )
17 | }
18 | \arguments{
19 | \item{object}{A parsnip `model_spec` object.}
20 | 
21 | \item{preprocessor}{A `recipe` object.}
22 | 
23 | \item{resamples}{An `rset` object.}
24 | 
25 | \item{param_info}{A `dials::parameters()` object or NULL. If none is given, a
26 | parameters set is derived from other arguments. Passing this argument can
27 | be useful when parameter ranges need to be customized.}
28 | 
29 | \item{grid}{A `data.frame` of tuning combinations or a positive integer. The
30 | data frame should have columns for each parameter being tuned and rows for
31 | tuning parameter candidates. An integer denotes the number of candidate
32 | parameter sets to be created automatically. If a positive integer is used
33 | or no tuning grid is supplied, then a semi-random grid via
34 | `dials::grid_latin_hypercube` is created based on the specified number of
35 | tuning iterations (default size = 10).}
36 | 
37 | \item{metrics}{A `yardstick::metric_set` or NULL. Note that not all yardstick
38 | metrics can be used with `tune_grid_h2o`. The metrics must be one of
39 | `yardstick::rsq`, `yardstick::rmse` or `h2oparsnip::mse` for regression
40 | models, and `yardstick::accuracy`, `yardstick::mn_log_loss`,
41 | `yardstick::roc_auc` or `yardstick::pr_auc` for classification models. If
42 | NULL then the default is `yardstick::rsq` for regression models and
43 | `yardstick::mn_log_loss` for classification models.}
44 | 
45 | \item{control}{An object used to modify the tuning process.}
46 | 
47 | \item{...}{Not currently used.}
48 | }
49 | \value{
50 | 
51 | }
52 | \description{
53 | This is a prototype of a version of tune_grid that uses h2o.grid to perform
54 | hyperparameter tuning.
55 | }
56 | \section{Limitations}{
57 | 
58 | - Only model arguments can be tuned, not arguments in the preprocessing
59 | recipes.
60 | 
61 | - Parsnip only allows `data.frame` and `tbl_spark` objects to be passed to
62 | the `fit` method, not `H2OFrame` objects.
63 | }
64 | 
65 | 


--------------------------------------------------------------------------------
/man/write_h2o.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/read_h2o.R
 3 | \name{write_h2o}
 4 | \alias{write_h2o}
 5 | \alias{read_h2o}
 6 | \alias{write_h2o.workflow}
 7 | \alias{write_h2o.model_fit}
 8 | \alias{read_h2o.workflow}
 9 | \alias{read_h2o.model_fit}
10 | \title{Saves an H2O model to file that is contained within a fitted parsnip model
11 | specification or contained within a workflow}
12 | \usage{
13 | write_h2o(object, filename, ...)
14 | 
15 | read_h2o(object, filename, ...)
16 | 
17 | \method{write_h2o}{workflow}(object, filename, ...)
18 | 
19 | \method{write_h2o}{model_fit}(object, filename, ...)
20 | 
21 | \method{read_h2o}{workflow}(object, filename, ...)
22 | 
23 | \method{read_h2o}{model_fit}(object, filename, ...)
24 | }
25 | \arguments{
26 | \item{object}{Either a `workflows::workflow()` object contained a fitted
27 | model when using the workflows package, or a `model_spec` object from a
28 | fitted model when using the parsnip package directly.}
29 | 
30 | \item{filename}{A `character` specifying the file path used to save the
31 | model. H2O models do not require a specific file extension.}
32 | 
33 | \item{...}{Currently not used.}
34 | }
35 | \value{
36 | The file path used to save the model.
37 | }
38 | \description{
39 | H2O models cannot be saved using the typical R approaches such as saveRDS
40 | because the actual H2O model is contained within a Java virtual machine. H2O
41 | models need to be saved and restored using the `h2o::h2o.saveModel` and
42 | `h2o::h2o.loadModel` functions. This is inconvenient for using H2O models
43 | contained within parsnip model specifications or workflow objects.
44 | }
45 | \details{
46 | The `write_h2o` function extracts the H2O model from within a parsnip or
47 | workflow fitted model and saves it to file using the `h2o::h2o.saveModel`
48 | function. To restore a model and insert it back into a previously fitted
49 | model use the `read_h2o` function.
50 | }
51 | \examples{
52 | library(parsnip)
53 | library(h2o)
54 | 
55 | # start a h2o session
56 | h2o.init()
57 | 
58 | # fit a parsnip model using the h2o engine
59 | clf <- mlp(mode = "classification") \%>\%
60 |   set_engine("h2o")
61 | 
62 | model_fit <- clf \%>\% fit(Species ~ ., iris)
63 | 
64 | # save the parsnip model
65 | saveRDS(model_fit, file.path(tempdir(), "my_model.rds"))
66 | 
67 | # save the h2o component of the model
68 | write_h2o(object = model_fit, filename = file.path(tempdir(), "my_h2o_model.mod"))
69 | h2o.shutdown(prompt = FALSE)
70 | 
71 | # restore a model
72 | h2o.init()
73 | model_fit <- readRDS(file.path(tempdir(), "my_model.rds"))
74 | 
75 | # read and insert the H2O model back into the parsnip object
76 | model_fit <- read_h2o(model_fit, file.path(tempdir(), "my_h2o_model.mod"))
77 | }
78 | 


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(h2oparsnip)
3 | 
4 | test_check("h2oparsnip")
5 | 


--------------------------------------------------------------------------------
/tests/testthat/test_boost_tree_h2o.R:
--------------------------------------------------------------------------------
  1 | library(testthat)
  2 | library(parsnip)
  3 | library(tibble)
  4 | library(magrittr)
  5 | library(h2o)
  6 | 
  7 | 
  8 | test_that("boost_tree h2o formula method", {
  9 |   skip_on_cran()
 10 | 
 11 |   h2o.init(nthreads = 1)
 12 |   iris_df <- as_tibble(iris)
 13 | 
 14 |   # classfication
 15 |   h2o_clf_fitted <-
 16 |     h2o.gbm(
 17 |       x = 1:4,
 18 |       y = 5,
 19 |       training_frame = as.h2o(iris_df),
 20 |       ntrees = 50,
 21 |       max_depth = 10,
 22 |       min_rows = 1,
 23 |       learn_rate = 0.1,
 24 |       sample_rate = 0.8,
 25 |       col_sample_rate = 0.3,
 26 |       seed = 1234
 27 |     )
 28 |   h2o_clf_preds <- predict(h2o_clf_fitted, as.h2o(iris_df))
 29 |   h2o_clf_preds <- as_tibble(h2o_clf_preds)
 30 | 
 31 |   clf <-
 32 |     boost_tree(
 33 |       mode = "classification",
 34 |       trees = 50,
 35 |       tree_depth = 10,
 36 |       min_n = 1,
 37 |       learn_rate = 0.1,
 38 |       sample_size = 0.8,
 39 |       mtry = 0.3
 40 |     ) %>%
 41 |     set_engine("h2o", seed = 1234)
 42 | 
 43 |   fitted_clf <- clf %>% fit(Species ~ ., iris_df)
 44 | 
 45 |   clf_preds <- predict(fitted_clf, iris_df)
 46 |   clf_probs <- predict(fitted_clf, iris_df, type = "prob")
 47 | 
 48 |   expect_equal(clf_preds[[1]], h2o_clf_preds$predict)
 49 |   expect_equal(clf_probs[[1]], h2o_clf_preds$setosa)
 50 |   expect_equal(clf_probs[[2]], h2o_clf_preds$versicolor)
 51 |   expect_equal(clf_probs[[3]], h2o_clf_preds$virginica)
 52 | 
 53 |   # regression
 54 |   h2o_regr_fitted <-
 55 |     h2o.gbm(
 56 |       x = 2:5,
 57 |       y = 1,
 58 |       training_frame = as.h2o(iris_df),
 59 |       ntrees = 50,
 60 |       max_depth = 10,
 61 |       min_rows = 1,
 62 |       learn_rate = 0.1,
 63 |       sample_rate = 0.8,
 64 |       col_sample_rate = 0.3,
 65 |       seed = 1234
 66 |     )
 67 |   h2o_regr_preds <- predict(h2o_regr_fitted, as.h2o(iris_df))
 68 |   h2o_regr_preds <- as_tibble(h2o_regr_preds)
 69 | 
 70 |   regr <- boost_tree(
 71 |     mode = "regression",
 72 |     trees = 50,
 73 |     tree_depth = 10,
 74 |     min_n = 1,
 75 |     learn_rate = 0.1,
 76 |     sample_size = 0.8,
 77 |     mtry = 0.3
 78 |   ) %>%
 79 |     set_engine("h2o", seed = 1234)
 80 | 
 81 |   fitted_regr <- regr %>% fit(Sepal.Length ~ ., iris_df)
 82 |   regr_preds <- predict(fitted_regr, iris_df)
 83 | 
 84 |   expect_equal(h2o_regr_preds$predict, regr_preds$.pred)
 85 | })
 86 | 
 87 | test_that("boost_tree h2o multi_predict", {
 88 |   skip_on_cran()
 89 | 
 90 |   h2o.init(nthreads = 1)
 91 |   iris_df <- as_tibble(iris)
 92 | 
 93 |   clf <-
 94 |     boost_tree(
 95 |       mode = "classification",
 96 |       trees = 50,
 97 |       tree_depth = 10,
 98 |       min_n = 1,
 99 |       learn_rate = 0.1,
100 |       sample_size = 0.8,
101 |       mtry = 0.3
102 |     ) %>%
103 |     set_engine("h2o", seed = 1234)
104 | 
105 |   fitted_clf <- clf %>% fit(Species ~ ., iris_df)
106 | 
107 |   expect_true(parsnip::has_multi_predict(fitted_clf))
108 |   expect_equal(parsnip::multi_predict_args(fitted_clf), "trees")
109 | 
110 |   expect_equal(
111 |     nrow(multi_predict(fitted_clf, iris_df, type = "class", trees = 10)),
112 |     150
113 |   )
114 | })
115 | 


--------------------------------------------------------------------------------
/tests/testthat/test_control_h2o.R:
--------------------------------------------------------------------------------
 1 | library(testthat)
 2 | library(rsample)
 3 | library(h2o)
 4 | library(h2oparsnip)
 5 | library(modeldata)
 6 | library(recipes)
 7 | library(dials)
 8 | library(yardstick)
 9 | library(tune)
10 | 
11 | 
12 | test_that("test for control_h2o options", {
13 |   skip_on_cran()
14 |   h2o.init(nthreads = 1)
15 |   data(ames)
16 | 
17 |   spec <- rand_forest(mode = "regression", mtry = tune(), min_n = tune()) %>%
18 |     set_engine("h2o")
19 | 
20 |   rec <- ames %>%
21 |     recipe(Sale_Price ~ .)
22 | 
23 |   params <- list(mtry = mtry(c(5, 10)), min_n = min_n(c(1, 10))) %>%
24 |     grid_regular()
25 | 
26 |   resamples <- bootstraps(ames, times = 1)
27 | 
28 |   # test defaults
29 |   res <-
30 |     tune_grid_h2o(
31 |       object = spec,
32 |       preprocessor = rec,
33 |       resamples = resamples,
34 |       grid = params,
35 |       metrics = metric_set(rmse),
36 |       control = control_h2o(save_pred = FALSE)
37 |     )
38 |   expect_s3_class(res, "tune_results")
39 |   expect_equal(names(res), c("splits", "id", ".metrics", ".notes"))
40 | 
41 |   # test for returning predictions for numeric variable
42 |   res <-
43 |     tune_grid_h2o(
44 |       object = spec,
45 |       preprocessor = rec,
46 |       resamples = resamples,
47 |       grid = params,
48 |       metrics = metric_set(rmse),
49 |       control = control_h2o(save_pred = TRUE)
50 |     )
51 |   expect_s3_class(res, "tune_results")
52 |   expect_equal(names(res), c("splits", "id", ".metrics", ".predictions", ".notes"))
53 |   expect_equal(names(res$.predictions[[1]])[1], ".pred")
54 | 
55 |   # test for storing models
56 |   res <-
57 |     tune_grid_h2o(
58 |       object = spec,
59 |       preprocessor = rec,
60 |       resamples = resamples,
61 |       grid = params,
62 |       metrics = metric_set(rmse),
63 |       control = control_h2o(save_pred = TRUE, save_models = TRUE)
64 |     )
65 |   expect_s3_class(res, "tune_results")
66 |   expect_equal(names(res), c("splits", "id", ".metrics", ".predictions", ".models", ".notes"))
67 | 
68 |   h2o.shutdown(prompt = FALSE)
69 | })
70 | 


--------------------------------------------------------------------------------
/tests/testthat/test_linear_reg_h2o.R:
--------------------------------------------------------------------------------
 1 | library(testthat)
 2 | library(parsnip)
 3 | library(tibble)
 4 | library(magrittr)
 5 | library(h2o)
 6 | 
 7 | test_that("linear_reg h2o formula method", {
 8 |   skip_on_cran()
 9 | 
10 |   h2o.init(nthreads = 1)
11 |   iris_df <- as_tibble(iris)
12 | 
13 |   # classfication
14 |   h2o_regr_fitted <-
15 |     h2o.glm(
16 |       x = 2:4,
17 |       y = 1,
18 |       training_frame = as.h2o(iris_df),
19 |       family = "gaussian",
20 |       seed = 1234,
21 |     )
22 |   h2o_regr_preds <- predict(h2o_regr_fitted, as.h2o(iris_df))
23 |   h2o_regr_preds <- as_tibble(h2o_regr_preds)
24 | 
25 |   regr <- linear_reg(mode = "regression") %>%
26 |     set_engine("h2o", seed = 1234)
27 | 
28 |   fitted_regr <- regr %>%
29 |     fit(Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width, iris_df)
30 | 
31 |   regr_preds <- predict(fitted_regr, iris_df)
32 | 
33 |   expect_equal(regr_preds[[1]], h2o_regr_preds$predict)
34 | })
35 | 


--------------------------------------------------------------------------------
/tests/testthat/test_logistic_reg_h2o.R:
--------------------------------------------------------------------------------
 1 | library(testthat)
 2 | library(parsnip)
 3 | library(tibble)
 4 | library(magrittr)
 5 | library(h2o)
 6 | 
 7 | 
 8 | test_that("logistic_reg h2o formula method", {
 9 |   skip_on_cran()
10 | 
11 |   h2o.init(nthreads = 1)
12 |   iris_df <- as_tibble(iris)
13 |   iris_df <- iris_df[which(iris_df$Species %in% c("setosa", "versicolor")), ]
14 |   iris_df$Species <- droplevels(iris_df$Species)
15 | 
16 |   # classfication
17 |   h2o_clf_fitted <-
18 |     h2o.glm(
19 |       x = 1:4,
20 |       y = 5,
21 |       training_frame = as.h2o(iris_df),
22 |       family = "binomial",
23 |       seed = 1234,
24 |     )
25 |   h2o_clf_preds <- predict(h2o_clf_fitted, as.h2o(iris_df))
26 |   h2o_clf_preds <- as_tibble(h2o_clf_preds)
27 | 
28 |   clf <- logistic_reg(mode = "classification") %>%
29 |     set_engine("h2o", seed = 1234)
30 | 
31 |   fitted_clf <- clf %>% fit(Species ~ ., iris_df)
32 | 
33 |   clf_preds <- predict(fitted_clf, iris_df)
34 |   clf_probs <- predict(fitted_clf, iris_df, type = "prob")
35 | 
36 |   expect_equal(clf_preds[[1]], h2o_clf_preds$predict)
37 |   expect_equal(clf_probs[[1]], h2o_clf_preds$setosa)
38 |   expect_equal(clf_probs[[2]], h2o_clf_preds$versicolor)
39 | })
40 | 


--------------------------------------------------------------------------------
/tests/testthat/test_metrics.R:
--------------------------------------------------------------------------------
  1 | library(testthat)
  2 | library(rsample)
  3 | library(h2o)
  4 | library(h2oparsnip)
  5 | library(modeldata)
  6 | library(recipes)
  7 | library(dials)
  8 | library(yardstick)
  9 | library(tune)
 10 | 
 11 | 
 12 | test_that("test regression metrics", {
 13 |   skip_on_cran()
 14 |   h2o.init(nthreads = 1)
 15 |   data(ames)
 16 | 
 17 |   spec <- rand_forest(mode = "regression", mtry = tune(), min_n = tune()) %>%
 18 |     set_engine("h2o")
 19 | 
 20 |   rec <- ames %>%
 21 |     recipe(Sale_Price ~ .)
 22 | 
 23 |   params <- list(mtry = mtry(c(5, 10)), min_n = min_n(c(1, 10))) %>%
 24 |     grid_regular()
 25 | 
 26 |   resamples <- bootstraps(ames, times = 1)
 27 | 
 28 |   regression_metrics <- metric_set(rmse, rsq)
 29 | 
 30 |   res <-
 31 |     tune_grid_h2o(
 32 |       object = spec,
 33 |       preprocessor = rec,
 34 |       resamples = resamples,
 35 |       grid = params,
 36 |       metrics = regression_metrics,
 37 |       control = control_h2o(save_pred = FALSE, save_models = FALSE)
 38 |     )
 39 |   expect_equal(unique(collect_metrics(res) %>% pull(.metric)), c("rmse", "rsq"))
 40 |   h2o.shutdown(prompt = FALSE)
 41 | })
 42 | 
 43 | 
 44 | test_that("test classification metrics multiclass", {
 45 |   skip_on_cran()
 46 |   h2o.init(nthreads = 1)
 47 |   data(iris)
 48 | 
 49 |   spec <- rand_forest(mode = "classification", min_n = tune()) %>%
 50 |     set_engine("h2o")
 51 | 
 52 |   rec <- iris %>%
 53 |     recipe(Species ~ .)
 54 | 
 55 |   params <- list(min_n = min_n(c(1, 10))) %>%
 56 |     grid_regular()
 57 | 
 58 |   resamples <- bootstraps(iris, times = 1)
 59 | 
 60 |   clf_metrics <- metric_set(accuracy, mn_log_loss)
 61 | 
 62 |   res <-
 63 |     tune_grid_h2o(
 64 |       object = spec,
 65 |       preprocessor = rec,
 66 |       resamples = resamples,
 67 |       grid = params,
 68 |       metrics = clf_metrics,
 69 |       control = control_h2o(save_pred = FALSE, save_models = FALSE)
 70 |     )
 71 |   expect_equal(unique(collect_metrics(res) %>% pull(.metric)), c("accuracy", "mn_log_loss"))
 72 | 
 73 |   h2o.shutdown(prompt = FALSE)
 74 | })
 75 | 
 76 | 
 77 | test_that("test classification metrics binary", {
 78 |   skip_on_cran()
 79 |   h2o.init(nthreads = 1)
 80 |   data(two_class_dat)
 81 | 
 82 |   spec <- rand_forest(mode = "classification", min_n = tune()) %>%
 83 |     set_engine("h2o")
 84 | 
 85 |   rec <- two_class_dat %>%
 86 |     recipe(Class ~ .)
 87 | 
 88 |   params <- list(min_n = min_n(c(1, 10))) %>%
 89 |     grid_regular()
 90 | 
 91 |   resamples <- bootstraps(two_class_dat, times = 1)
 92 | 
 93 |   clf_metrics <- metric_set(roc_auc, pr_auc)
 94 | 
 95 |   res <-
 96 |     tune_grid_h2o(
 97 |       object = spec,
 98 |       preprocessor = rec,
 99 |       resamples = resamples,
100 |       grid = params,
101 |       metrics = clf_metrics,
102 |       control = control_h2o(save_pred = FALSE, save_models = FALSE)
103 |     )
104 |   expect_equal(
105 |     unique(collect_metrics(res) %>% pull(.metric)),
106 |     c("pr_auc", "roc_auc")
107 |   )
108 | 
109 |   h2o.shutdown(prompt = FALSE)
110 | })
111 | 


--------------------------------------------------------------------------------
/tests/testthat/test_mlp_h2o.R:
--------------------------------------------------------------------------------
  1 | library(testthat)
  2 | library(parsnip)
  3 | library(tibble)
  4 | library(magrittr)
  5 | library(h2o)
  6 | 
  7 | 
  8 | test_that("mlp h2o formula method", {
  9 |   skip_on_cran()
 10 | 
 11 |   h2o.init(nthreads = 1)
 12 |   iris_df <- as_tibble(iris)
 13 | 
 14 |   # classfication
 15 |   h2o_clf_fitted <-
 16 |     h2o.deeplearning(
 17 |       x = 1:4,
 18 |       y = 5,
 19 |       training_frame = as.h2o(iris_df),
 20 |       hidden = 100,
 21 |       activation = "Rectifier",
 22 |       seed = 1234,
 23 |       reproducible = TRUE
 24 |     )
 25 |   h2o_clf_preds <- predict(h2o_clf_fitted, as.h2o(iris_df))
 26 |   h2o_clf_preds <- as_tibble(h2o_clf_preds)
 27 | 
 28 |   mlp_clf <- mlp(mode = "classification", hidden_units = 100) %>%
 29 |     set_engine("h2o", seed = 1234, reproducible = TRUE)
 30 | 
 31 |   fitted_clf <- mlp_clf %>% fit(Species ~ ., iris_df)
 32 | 
 33 |   clf_preds <- predict(fitted_clf, iris_df)
 34 |   clf_probs <- predict(fitted_clf, iris_df, type = "prob")
 35 | 
 36 |   expect_equal(clf_preds[[1]], h2o_clf_preds$predict)
 37 |   expect_equal(clf_probs[[1]], h2o_clf_preds$setosa)
 38 |   expect_equal(clf_probs[[2]], h2o_clf_preds$versicolor)
 39 |   expect_equal(clf_probs[[3]], h2o_clf_preds$virginica)
 40 | 
 41 |   # regression
 42 |   h2o_regr_fitted <-
 43 |     h2o::h2o.deeplearning(
 44 |       x = 2:5,
 45 |       y = 1,
 46 |       training_frame = as.h2o(iris_df),
 47 |       hidden = 100,
 48 |       activation = "Rectifier",
 49 |       seed = 1234,
 50 |       reproducible = TRUE
 51 |     )
 52 |   h2o_regr_preds <- predict(h2o_regr_fitted, as.h2o(iris_df))
 53 |   h2o_regr_preds <- as_tibble(h2o_regr_preds)
 54 | 
 55 |   mlp_regr <- mlp(mode = "regression", hidden_units = 100) %>%
 56 |     set_engine("h2o", seed = 1234, reproducible = TRUE)
 57 | 
 58 |   fitted_regr <- mlp_regr %>% fit(Sepal.Length ~ ., iris_df)
 59 |   regr_preds <- predict(fitted_regr, iris_df)
 60 | 
 61 |   expect_equal(h2o_regr_preds$predict, regr_preds$.pred)
 62 | })
 63 | 
 64 | 
 65 | test_that("mlp h2o non-formula method", {
 66 |   skip_on_cran()
 67 | 
 68 |   h2o.init(nthreads = 1)
 69 |   iris_df <- as_tibble(iris)
 70 | 
 71 |   h2o_clf_fitted <-
 72 |     h2o::h2o.deeplearning(
 73 |       x = 1:4,
 74 |       y = 5,
 75 |       training_frame = as.h2o(iris_df),
 76 |       hidden = 100,
 77 |       activation = "Rectifier",
 78 |       seed = 1234,
 79 |       reproducible = TRUE
 80 |     )
 81 |   h2o_clf_preds <- predict(h2o_clf_fitted, as.h2o(iris_df))
 82 |   h2o_clf_preds <- as_tibble(h2o_clf_preds)
 83 | 
 84 |   mlp_clf <- mlp(mode = "classification", hidden_units = 100) %>%
 85 |     set_engine("h2o", seed = 1234, reproducible = TRUE)
 86 | 
 87 |   fitted_clf <- mlp_clf %>% fit_xy(iris_df[, -5], iris_df$Species)
 88 | 
 89 |   clf_preds <- predict(fitted_clf, iris_df)
 90 |   clf_probs <- predict(fitted_clf, iris_df, type = "prob")
 91 | 
 92 |   expect_equal(clf_preds[[1]], h2o_clf_preds$predict)
 93 |   expect_equal(clf_probs[[1]], h2o_clf_preds$setosa)
 94 |   expect_equal(clf_probs[[2]], h2o_clf_preds$versicolor)
 95 |   expect_equal(clf_probs[[3]], h2o_clf_preds$virginica)
 96 | })
 97 | 
 98 | 
 99 | test_that("mlp h2o automatic use of activation function with dropout", {
100 |   skip_on_cran()
101 | 
102 |   h2o.init(nthreads = 1)
103 |   iris_df <- as_tibble(iris)
104 | 
105 |   # rectifier with dropout
106 |   h2o_clf_fitted <-
107 |     h2o::h2o.deeplearning(
108 |       x = 1:4,
109 |       y = 5,
110 |       training_frame = as.h2o(iris_df),
111 |       hidden = 100,
112 |       activation = "RectifierWithDropout",
113 |       hidden_dropout_ratios = 0.2,
114 |       seed = 1234,
115 |       reproducible = TRUE
116 |     )
117 |   h2o_clf_preds <- predict(h2o_clf_fitted, as.h2o(iris_df))
118 |   h2o_clf_preds <- as_tibble(h2o_clf_preds)
119 | 
120 |   mlp_clf <-
121 |     mlp(
122 |       mode = "classification",
123 |       hidden_units = 100,
124 |       activation = "relu",
125 |       dropout = 0.2
126 |     ) %>%
127 |     set_engine("h2o", seed = 1234, reproducible = TRUE)
128 | 
129 |   fitted_clf <- mlp_clf %>% fit_xy(iris_df[, -5], iris_df$Species)
130 |   clf_preds <- predict(fitted_clf, iris_df)
131 | 
132 |   expect_equal(clf_preds[[1]], h2o_clf_preds$predict)
133 | })
134 | 
135 | 
136 | test_that("mlp h2o multi_predict", {
137 |   mlp_clf <-
138 |     mlp(mode = "classification", hidden_units = 100, epochs = 100) %>%
139 |     set_engine("h2o", seed = 1234, reproducible = TRUE)
140 | 
141 |   fitted_clf <- mlp_clf %>%
142 |     fit(Species ~ ., iris_df)
143 | 
144 |   expect_true(parsnip::has_multi_predict(fitted_clf))
145 |   expect_equal(parsnip::multi_predict_args(fitted_clf), "epochs")
146 | 
147 |   expect_equal(
148 |     nrow(multi_predict(fitted_clf, iris_df, type = "class", epochs = 200)),
149 |     150
150 |   )
151 | })
152 | 
153 | 
154 | test_that("mlp h2o early stopping", {
155 |   mlp_clf <-
156 |     mlp(mode = "classification", hidden_units = 100, epochs = 100) %>%
157 |     set_engine("h2o",
158 |       seed = 1234, reproducible = TRUE, stopping_rounds = 10,
159 |       validation = 0.1
160 |     )
161 | 
162 |   fitted_clf <- mlp_clf %>%
163 |     fit(Species ~ ., iris_df)
164 | })
165 | 


--------------------------------------------------------------------------------
/tests/testthat/test_multinom_reg_h2o.R:
--------------------------------------------------------------------------------
 1 | library(testthat)
 2 | library(parsnip)
 3 | library(tibble)
 4 | library(magrittr)
 5 | library(h2o)
 6 | 
 7 | 
 8 | test_that("multinom_reg h2o formula method", {
 9 |   skip_on_cran()
10 | 
11 |   h2o.init(nthreads = 1)
12 |   iris_df <- as_tibble(iris)
13 | 
14 |   # classfication
15 |   h2o_clf_fitted <-
16 |     h2o.glm(
17 |       x = 1:4,
18 |       y = 5,
19 |       training_frame = as.h2o(iris_df),
20 |       family = "multinomial",
21 |       seed = 1234,
22 |     )
23 |   h2o_clf_preds <- predict(h2o_clf_fitted, as.h2o(iris_df))
24 |   h2o_clf_preds <- as_tibble(h2o_clf_preds)
25 | 
26 |   clf <- multinom_reg(mode = "classification") %>%
27 |     set_engine("h2o", seed = 1234)
28 | 
29 |   fitted_clf <- clf %>% fit(Species ~ ., iris_df)
30 | 
31 |   clf_preds <- predict(fitted_clf, iris_df)
32 |   clf_probs <- predict(fitted_clf, iris_df, type = "prob")
33 | 
34 |   expect_equal(clf_preds[[1]], h2o_clf_preds$predict)
35 |   expect_equal(clf_probs[[1]], h2o_clf_preds$setosa)
36 |   expect_equal(clf_probs[[2]], h2o_clf_preds$versicolor)
37 |   expect_equal(clf_probs[[3]], h2o_clf_preds$virginica)
38 | })
39 | 


--------------------------------------------------------------------------------
/tests/testthat/test_naive_Bayes.R:
--------------------------------------------------------------------------------
 1 | library(testthat)
 2 | library(parsnip)
 3 | library(discrim)
 4 | library(tibble)
 5 | library(magrittr)
 6 | library(h2o)
 7 | 
 8 | test_that("naive_Bayes h2o formula method", {
 9 |   skip_on_cran()
10 | 
11 |   h2o.init(nthreads = 1)
12 |   iris_df <- as_tibble(iris)
13 | 
14 |   # classfication
15 |   h2o_clf_fitted <-
16 |     h2o.naiveBayes(
17 |       x = 1:4,
18 |       y = 5,
19 |       training_frame = as.h2o(iris_df),
20 |       seed = 1234
21 |     )
22 |   h2o_clf_preds <- predict(h2o_clf_fitted, as.h2o(iris_df))
23 |   h2o_clf_preds <- as_tibble(h2o_clf_preds)
24 | 
25 |   clf <- naive_Bayes(mode = "classification") %>%
26 |     set_engine("h2o", seed = 1234)
27 | 
28 |   fitted_clf <- clf %>% fit(Species ~ ., iris_df)
29 | 
30 |   clf_preds <- predict(fitted_clf, iris_df)
31 |   clf_probs <- predict(fitted_clf, iris_df, type = "prob")
32 | 
33 |   expect_equal(clf_preds[[1]], h2o_clf_preds$predict)
34 |   expect_equal(clf_probs[[1]], h2o_clf_preds$setosa)
35 |   expect_equal(clf_probs[[2]], h2o_clf_preds$versicolor)
36 | })
37 | 


--------------------------------------------------------------------------------
/tests/testthat/test_rand_forest_h2o.R:
--------------------------------------------------------------------------------
 1 | library(testthat)
 2 | library(parsnip)
 3 | library(tibble)
 4 | library(magrittr)
 5 | library(h2o)
 6 | 
 7 | 
 8 | test_that("boost_tree h2o formula method", {
 9 |   skip_on_cran()
10 |   h2o.init(nthreads = 1)
11 |   iris_df <- as_tibble(iris)
12 | 
13 |   # classfication
14 |   h2o_clf_fitted <-
15 |     h2o.randomForest(
16 |       x = 1:4,
17 |       y = 5,
18 |       training_frame = as.h2o(iris_df),
19 |       ntrees = 50,
20 |       min_rows = 1,
21 |       mtries = -1,
22 |       seed = 1234,
23 |     )
24 |   h2o_clf_preds <- predict(h2o_clf_fitted, as.h2o(iris_df))
25 |   h2o_clf_preds <- as_tibble(h2o_clf_preds)
26 | 
27 |   clf <-
28 |     rand_forest(
29 |       mode = "classification",
30 |       trees = 50,
31 |       min_n = 1,
32 |       mtry = -1
33 |     ) %>%
34 |     set_engine("h2o", seed = 1234)
35 | 
36 |   fitted_clf <- clf %>% fit(Species ~ ., iris_df)
37 | 
38 |   clf_preds <- predict(fitted_clf, iris_df)
39 |   clf_probs <- predict(fitted_clf, iris_df, type = "prob")
40 | 
41 |   expect_equal(clf_preds[[1]], h2o_clf_preds$predict)
42 |   expect_equal(clf_probs[[1]], h2o_clf_preds$setosa)
43 |   expect_equal(clf_probs[[2]], h2o_clf_preds$versicolor)
44 |   expect_equal(clf_probs[[3]], h2o_clf_preds$virginica)
45 | 
46 |   # regression
47 |   h2o_regr_fitted <-
48 |     h2o.randomForest(
49 |       x = 2:5,
50 |       y = 1,
51 |       training_frame = as.h2o(iris_df),
52 |       ntrees = 50,
53 |       min_rows = 1,
54 |       mtries = -1,
55 |       seed = 1234
56 |     )
57 |   h2o_regr_preds <- predict(h2o_regr_fitted, as.h2o(iris_df))
58 |   h2o_regr_preds <- as_tibble(h2o_regr_preds)
59 | 
60 |   regr <- rand_forest(
61 |     mode = "regression",
62 |     trees = 50,
63 |     min_n = 1,
64 |     mtry = -1
65 |   ) %>%
66 |     set_engine("h2o", seed = 1234)
67 | 
68 |   fitted_regr <- regr %>% fit(Sepal.Length ~ ., iris_df)
69 |   regr_preds <- predict(fitted_regr, iris_df)
70 | 
71 |   expect_equal(h2o_regr_preds$predict, regr_preds$.pred)
72 | })
73 | 


--------------------------------------------------------------------------------
/tests/testthat/test_rule_fit.R:
--------------------------------------------------------------------------------
 1 | library(testthat)
 2 | library(parsnip)
 3 | library(tibble)
 4 | library(magrittr)
 5 | library(h2o)
 6 | library(rules)
 7 | 
 8 | 
 9 | test_that("boost_tree h2o formula method", {
10 |   skip_on_cran()
11 |   h2o.init(nthreads = 1)
12 |   iris_df <- as_tibble(iris)
13 | 
14 |   # classfication
15 |   h2o_clf_fitted <-
16 |     h2o.rulefit(
17 |       x = 1:4,
18 |       y = 5,
19 |       training_frame = as.h2o(iris_df),
20 |       seed = 1234,
21 |     )
22 |   h2o_clf_preds <- predict(h2o_clf_fitted, as.h2o(iris_df))
23 |   h2o_clf_preds <- as_tibble(h2o_clf_preds)
24 | 
25 |   clf <-
26 |     rule_fit(mode = "classification") %>%
27 |     set_engine("h2o", seed = 1234)
28 | 
29 |   fitted_clf <- clf %>% fit(Species ~ ., iris_df)
30 | 
31 |   clf_preds <- predict(fitted_clf, iris_df)
32 |   clf_probs <- predict(fitted_clf, iris_df, type = "prob")
33 | 
34 |   expect_equal(clf_preds[[1]], h2o_clf_preds$predict)
35 |   expect_equal(clf_probs[[1]], h2o_clf_preds$setosa)
36 |   expect_equal(clf_probs[[2]], h2o_clf_preds$versicolor)
37 |   expect_equal(clf_probs[[3]], h2o_clf_preds$virginica)
38 | 
39 |   # regression
40 |   h2o_regr_fitted <-
41 |     h2o.rulefit(
42 |       x = 2:5,
43 |       y = 1,
44 |       training_frame = as.h2o(iris_df),
45 |       seed = 1234
46 |     )
47 |   h2o_regr_preds <- predict(h2o_regr_fitted, as.h2o(iris_df))
48 |   h2o_regr_preds <- as_tibble(h2o_regr_preds)
49 | 
50 |   regr <- rule_fit(mode = "regression", ) %>%
51 |     set_engine("h2o", seed = 1234)
52 | 
53 |   fitted_regr <- regr %>% fit(Sepal.Length ~ ., iris_df)
54 |   regr_preds <- predict(fitted_regr, iris_df)
55 | 
56 |   expect_equal(h2o_regr_preds$predict, regr_preds$.pred)
57 | })
58 | 


--------------------------------------------------------------------------------
/tests/testthat/test_save.R:
--------------------------------------------------------------------------------
 1 | library(testthat)
 2 | library(parsnip)
 3 | library(tibble)
 4 | library(magrittr)
 5 | library(h2o)
 6 | 
 7 | 
 8 | test_that("model persistence, model spec", {
 9 |   skip_on_cran()
10 | 
11 |   h2o.init(nthreads = 1)
12 |   iris_df <- as_tibble(iris)
13 | 
14 |   clf <-
15 |     mlp(mode = "classification") %>%
16 |     set_engine("h2o", seed = 1234)
17 | 
18 |   fitted_clf <- clf %>% fit(Species ~ ., iris_df)
19 |   preds <- predict(fitted_clf, iris_df)
20 | 
21 |   # save model
22 |   model_file <- tempfile(fileext = ".mod")
23 |   parsnip_file <- tempfile(fileext = ".rds")
24 | 
25 |   saveRDS(fitted_clf, parsnip_file)
26 |   write_h2o(fitted_clf, model_file)
27 | 
28 |   # remove model
29 |   h2o.removeAll()
30 |   remove(fitted_clf)
31 | 
32 |   # restore model
33 |   restored_clf <- readRDS(parsnip_file)
34 |   restored_clf <- read_h2o(restored_clf, model_file)
35 |   preds_post <- predict(restored_clf, iris_df)
36 | 
37 |   # compare
38 |   expect_equal(preds, preds_post)
39 | })
40 | 


--------------------------------------------------------------------------------