├── _pkgdown.yml
├── LICENSE
├── .gitignore
├── tests
    ├── testthat.R
    └── testthat
    │   ├── test_step_select_mrmr.R
    │   ├── test_step_select_infgain.R
    │   ├── test_step_select_linear.R
    │   ├── test_step_select_tree.R
    │   ├── test_step_select_forests.R
    │   ├── test_step_select_boruta.R
    │   └── test_step_select_vip.R
├── docs
    ├── reference
    │   ├── Rplot001.png
    │   ├── pipe.html
    │   ├── top_p.html
    │   └── index.html
    ├── pkgdown.yml
    ├── link.svg
    ├── sitemap.xml
    ├── bootstrap-toc.css
    ├── docsearch.js
    ├── pkgdown.js
    ├── LICENSE-text.html
    ├── 404.html
    ├── bootstrap-toc.js
    ├── authors.html
    ├── LICENSE.html
    └── pkgdown.css
├── .Rbuildignore
├── R
    ├── imports.R
    ├── utils-pipe.R
    ├── parameters.R
    ├── recipeselectors.R
    ├── misc.R
    ├── step_select_boruta.R
    ├── step_select_mrmr.R
    ├── step_select_vip.R
    ├── step_select_roc.R
    ├── step_select_carscore.R
    ├── step_select_xtab.R
    ├── step_select_infgain.R
    ├── step_select_tree.R
    ├── step_select_forests.R
    └── pull_importances.R
├── man
    ├── pipe.Rd
    ├── top_p.Rd
    ├── recipeselectors.Rd
    ├── pull_importances.Rd
    ├── step_select_boruta.Rd
    ├── step_select_mrmr.Rd
    ├── step_select_roc.Rd
    ├── step_select_xtab.Rd
    ├── step_select_vip.Rd
    ├── step_select_carscore.Rd
    ├── step_select_tree.Rd
    ├── step_select_infgain.Rd
    ├── step_select_linear.Rd
    └── step_select_forests.Rd
├── recipeselectors.Rproj
├── LICENSE.md
├── DESCRIPTION
├── NAMESPACE
└── README.md


/_pkgdown.yml:
--------------------------------------------------------------------------------
1 | destination: docs
2 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2019
2 | COPYRIGHT HOLDER: Steven Pawley
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | .DS_Store
6 | 


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(recipeselectors)
3 | 
4 | test_check("recipeselectors")
5 | 


--------------------------------------------------------------------------------
/docs/reference/Rplot001.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stevenpawley/recipeselectors/HEAD/docs/reference/Rplot001.png


--------------------------------------------------------------------------------
/docs/pkgdown.yml:
--------------------------------------------------------------------------------
1 | pandoc: 2.17.1.1
2 | pkgdown: 2.0.2
3 | pkgdown_sha: ~
4 | articles: {}
5 | last_built: 2022-03-23T20:12Z
6 | 
7 | 


--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
 1 | ^.*\.Rproj$
 2 | ^\.Rproj\.user$
 3 | ^LICENSE\.md$
 4 | ^README\.Rmd$
 5 | ^\.travis\.yml$
 6 | ^codecov\.yml$
 7 | ^_pkgdown\.yml$
 8 | ^docs$
 9 | ^pkgdown$
10 | 


--------------------------------------------------------------------------------
/R/imports.R:
--------------------------------------------------------------------------------
1 | ## usethis namespace: start
2 | #' @importFrom tibble tibble as_tibble
3 | #' @importFrom recipes prep bake
4 | #' @importFrom generics tidy
5 | #' @importFrom tune tunable
6 | ## usethis namespace: end
7 | NULL
8 | 


--------------------------------------------------------------------------------
/R/utils-pipe.R:
--------------------------------------------------------------------------------
 1 | #' Pipe operator
 2 | #'
 3 | #' See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details.
 4 | #'
 5 | #' @name %>%
 6 | #' @rdname pipe
 7 | #' @keywords internal
 8 | #' @export
 9 | #' @importFrom magrittr %>%
10 | #' @usage lhs \%>\% rhs
11 | NULL
12 | 


--------------------------------------------------------------------------------
/man/pipe.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils-pipe.R
 3 | \name{\%>\%}
 4 | \alias{\%>\%}
 5 | \title{Pipe operator}
 6 | \usage{
 7 | lhs \%>\% rhs
 8 | }
 9 | \description{
10 | See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details.
11 | }
12 | \keyword{internal}
13 | 


--------------------------------------------------------------------------------
/recipeselectors.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 | 
18 | BuildType: Package
19 | PackageUseDevtools: Yes
20 | PackageInstallArgs: --no-multiarch --with-keep.source
21 | PackageCheckArgs: --as-cran
22 | PackageRoxygenize: rd,collate,namespace
23 | 


--------------------------------------------------------------------------------
/tests/testthat/test_step_select_mrmr.R:
--------------------------------------------------------------------------------
 1 | library(testthat)
 2 | library(recipes)
 3 | library(tibble)
 4 | data("iris")
 5 | 
 6 | test_that("step_select_mrmr, execution", {
 7 |   skip_if_not_installed("praznik")
 8 | 
 9 |   # reference scores computed directly with praznik on all four predictors
10 |   reference <- praznik::MRMR(X = iris[-5], Y = iris$Species, k = 4)
11 | 
12 |   expected <- tibble(
13 |     variable = names(reference$score),
14 |     scores = reference$score
15 |   )
16 | 
17 |   # the same selection expressed as a recipe step keeping two predictors
18 |   mrmr_rec <- recipe(Species ~ ., data = iris) %>%
19 |     step_select_mrmr(all_predictors(), outcome = "Species", top_p = 2) %>%
20 |     prep()
21 |   mrmr_pred <- juice(mrmr_rec)
22 | 
23 |   # the first two baked columns must be the first two reference entries
24 |   kept <- names(mrmr_pred)[1:2]
25 |   expect_true(all(kept %in% expected$variable[1:2]))
26 | 
27 |   # the scores stored on the step must equal the reference scores
28 |   expect_equal(expected$scores, mrmr_rec$steps[[1]]$scores$score)
29 | })
30 | 
31 | 
32 | 


--------------------------------------------------------------------------------
/docs/link.svg:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="utf-8"?>
 2 | <!-- Generator: Adobe Illustrator 19.2.1, SVG Export Plug-In . SVG Version: 6.00 Build 0)  -->
 3 | <svg version="1.1" id="Layer_1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px"
 4 | 	 viewBox="0 0 20 20" style="enable-background:new 0 0 20 20;" xml:space="preserve">
 5 | <style type="text/css">
 6 | 	.st0{fill:#75AADB;}
 7 | </style>
 8 | <path class="st0" d="M4,11.3h1.3v1.3H4c-2,0-4-2.3-4-4.7s2.1-4.7,4-4.7h5.3c1.9,0,4,2.3,4,4.7c0,1.9-1.2,3.6-2.7,4.3v-1.5
 9 | 	C11.4,10.2,12,9.1,12,8c0-1.7-1.4-3.3-2.7-3.3H4C2.7,4.7,1.3,6.3,1.3,8S2.7,11.3,4,11.3z M16,7.3h-1.3v1.3H16c1.3,0,2.7,1.6,2.7,3.3
10 | 	s-1.4,3.3-2.7,3.3h-5.3C9.4,15.3,8,13.7,8,12c0-1.1,0.6-2.2,1.3-2.8V7.7C7.9,8.4,6.7,10.1,6.7,12c0,2.4,2.1,4.7,4,4.7H16
11 | 	c1.9,0,4-2.3,4-4.7S18,7.3,16,7.3z"/>
12 | </svg>
13 | 


--------------------------------------------------------------------------------
/tests/testthat/test_step_select_infgain.R:
--------------------------------------------------------------------------------
 1 | library(testthat)
 2 | library(recipes)
 3 | library(tibble)
 4 | data("iris")
 5 | 
 6 | test_that("step_select_infgain, execution", {
 7 |   skip_if_not_installed("FSelectorRcpp")
 8 | 
 9 |   irisX <- iris[-5]
10 |   y <- iris$Species
11 | 
12 |   # reference ranking: information gain computed directly, best feature first.
13 |   # A single descending sort is sufficient; only the `attributes` column is
14 |   # compared below, so the scores do not need names.
15 |   ig_scores <- as_tibble(FSelectorRcpp::information_gain(x = irisX, y = y))
16 |   ig_scores <- ig_scores[order(ig_scores$importance, decreasing = TRUE), ]
17 | 
18 |   ig_rec <- recipe(Species ~ ., data = iris) %>%
19 |     step_select_infgain(
20 |       all_predictors(), outcome = "Species", type = "infogain", top_p = 2) %>%
21 |     prep()
22 | 
23 |   # the two predictors kept by the step must be the two top-ranked features
24 |   ig_pred <- juice(ig_rec)
25 |   expect_true(all(names(ig_pred)[1:2] %in% ig_scores$attributes[1:2]))
26 | })
27 | 
28 | 
29 | 


--------------------------------------------------------------------------------
/man/top_p.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/parameters.R
 3 | \name{top_p}
 4 | \alias{top_p}
 5 | \title{Parameter functions for feature selection recipes}
 6 | \usage{
 7 | top_p(range = c(1L, 4L), trans = NULL)
 8 | }
 9 | \arguments{
10 | \item{range}{A two-element vector holding the _defaults_ for the smallest and
11 | largest possible values, respectively.}
12 | 
13 | \item{trans}{A `trans` object from the `scales` package, such as
14 | `scales::log10_trans()` or `scales::reciprocal_trans()`. If not provided,
15 | the default is used which matches the units used in `range`. If no
16 | transformation, `NULL`.}
17 | }
18 | \value{
19 | A function with classes "quant_param" and "param"
20 | }
21 | \description{
22 | Feature selection recipes allow the top-performing features to be selected
23 | using two parameters. `top_p` is for specifying the number of the
24 | top-performing features.
25 | }
26 | \examples{
27 | top_p(c(3, 10))
28 | }
29 | 


--------------------------------------------------------------------------------
/R/parameters.R:
--------------------------------------------------------------------------------
 1 | #' Parameter functions for feature selection recipes
 2 | #'
 3 | #' Feature selection recipes allow the top-performing features to be selected
 4 | #' using two parameters (`top_p` and `threshold`). `top_p` is for specifying
 5 | #' the number of the top-performing features.
 6 | #'
 7 | #' @param range A two-element vector holding the _defaults_ for the smallest and
 8 | #'   largest possible values, respectively.
 9 | #' @param trans A `trans` object from the `scales` package, such as
10 | #'   `scales::log10_trans()` or `scales::reciprocal_trans()`. If not provided,
11 | #'   the default is used which matches the units used in `range`. If no
12 | #'   transformation, `NULL`.
13 | #'
14 | #' @return A parameter object with classes "quant_param" and "param"
15 | #' @export
16 | #'
17 | #' @examples
18 | #' top_p(c(3, 10))
19 | top_p <- function(range = c(1L, 4L), trans = NULL) {
20 |   dials::new_quant_param(  # the value of this call is what top_p() returns
21 |     type = "integer",
22 |     range = range,
23 |     inclusive = c(TRUE, TRUE),  # both ends of `range` are allowed values
24 |     trans = trans,
25 |     label = c(top_p = "# Selected Predictors"),
26 |     finalize = dials::get_p  # presumably caps the range at the predictor count; confirm in dials
27 |   )
28 | }
29 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | 
 3 | Copyright (c) 2019 Steven Pawley
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: recipeselectors
 2 | Type: Package
 3 | Title: Extra Recipes Steps for Supervised Feature Selection
 4 | Version: 0.0.1
 5 | Authors@R: 
 6 |     person(given = "Steven",
 7 |            family = "Pawley",
 8 |            role = c("aut", "cre"),
 9 |            email = "dr.stevenpawley@gmail.com")
10 | Maintainer: Steven Pawley <dr.stevenpawley@gmail.com>
11 | Description: Provides additional steps for supervised feature selection to be 
12 |     used with the 'recipes' package.
13 | License: MIT + file LICENSE
14 | Encoding: UTF-8
15 | LazyData: true
16 | URL: https://github.com/stevenpawley/recipeselectors
17 | BugReports: https://github.com/stevenpawley/recipeselectors/issues
18 | Depends:
19 |      R (>= 2.10),
20 |      recipes
21 | Imports:
22 |     generics,
23 |     tibble,
24 |     parsnip,
25 |     tune,
26 |     dials,
27 |     purrr,
28 |     rlang (>= 0.1.2),
29 |     magrittr,
30 |     dplyr,
31 |     scales,
32 |     pROC,
33 |     stats
34 | RoxygenNote: 7.1.2
35 | Suggests: 
36 |     testthat,
37 |     roxygen2,
38 |     FSelectorRcpp,
39 |     praznik,
40 |     ranger,
41 |     Boruta,
42 |     care,
43 |     modeldata,
44 |     covr
45 | 


--------------------------------------------------------------------------------
/tests/testthat/test_step_select_linear.R:
--------------------------------------------------------------------------------
 1 | library(testthat)
 2 | library(recipes)
 3 | library(tibble)
 4 | library(parsnip)
 5 | library(modeldata)
 6 | 
 7 | data("cells")
 8 | 
 9 | test_that("step_select_linear, execution using top_p on binary case", {
10 |   # normalize the numeric predictors, then keep the two top-scoring ones
11 |   cell_data <- select(cells, -case)
12 |   base_rec <- recipe(cell_data, class ~ .)
13 |   base_rec <- step_normalize(base_rec, all_numeric_predictors())
14 |   rec <- step_select_linear(
15 |     base_rec,
16 |     all_predictors(),
17 |     outcome = "class",
18 |     top_p = 2
19 |   )
20 | 
21 |   selected <- bake(prep(rec), new_data = NULL)
22 | 
23 |   expect_length(names(selected), 3)  # 2 predictors + the outcome column
24 | })
25 | 
26 | 
27 | test_that("step_select_linear, execution using threshold on binary case", {
28 |   # test selection by retaining features with scores in the 99th percentile
29 |   rec <- cells %>%
30 |     select(-case) %>%
31 |     recipe(class ~ .) %>%
32 |     step_normalize(all_numeric_predictors()) %>%
33 |     step_select_linear(
34 |       all_predictors(),
35 |       outcome = "class",
36 |       threshold = 0.99
37 |     )
38 | 
39 |   prepped <- prep(rec)
40 |   selected <- juice(prepped)
41 | 
42 |   expect_length(names(selected), 2)  # 1 predictor + the outcome column
43 | })
44 | 


--------------------------------------------------------------------------------
/R/recipeselectors.R:
--------------------------------------------------------------------------------
 1 | #' recipeselectors: A collection of steps for feature selection to use with the
 2 | #' 'recipes' package
 3 | #'
 4 | #' \pkg{recipeselectors} provides a collection of additional step objects
 5 | #' related to feature selection to be used with the 'recipes' package.
 6 | #'
 7 | #' @examples
 8 | #' library(parsnip)
 9 | #' library(recipes)
10 | #' library(magrittr)
11 | #'
12 | #' # load the example iris dataset
13 | #' data(iris)
14 | #'
15 | #' # define a base model to use for feature importances
16 | #' base_model <- rand_forest(mode = "classification") %>%
17 | #'     set_engine("ranger", importance = "permutation")
18 | #'
19 | #' # create a preprocessing recipe
20 | #' rec <- iris %>%
21 | #'  recipe(Species ~ .) %>%
22 | #'  step_select_vip(all_predictors(), model = base_model, top_p = 2,
23 | #'                  outcome = "Species")
24 | #'
25 | #' prepped <- prep(rec)
26 | #'
27 | #' # create a model specification
28 | #' clf <- decision_tree(mode = "classification") %>%
29 | #'     set_engine("rpart")
30 | #'
31 | #' clf_fitted <- clf %>%
32 | #'     fit(Species ~ ., juice(prepped))
33 | #'
34 | #' @author Steven Pawley, \email{dr.stevenpawley@@gmail.com}
35 | 
36 | #' @docType package
37 | #' @name recipeselectors
38 | NULL
39 | 


--------------------------------------------------------------------------------
/man/recipeselectors.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/recipeselectors.R
 3 | \docType{package}
 4 | \name{recipeselectors}
 5 | \alias{recipeselectors}
 6 | \title{recipeselectors: A collection of steps for feature selection to use with the
 7 | 'recipes' package}
 8 | \description{
 9 | \pkg{recipeselectors} provides a collection of additional step objects
10 | related to feature selection to be used with the 'recipes' package.
11 | }
12 | \examples{
13 | library(parsnip)
14 | library(recipes)
15 | library(magrittr)
16 | 
17 | # load the example iris dataset
18 | data(iris)
19 | 
20 | # define a base model to use for feature importances
21 | base_model <- rand_forest(mode = "classification") \%>\%
22 |     set_engine("ranger", importance = "permutation")
23 | 
24 | # create a preprocessing recipe
25 | rec <- iris \%>\%
26 |  recipe(Species ~ .) \%>\%
27 |  step_select_vip(all_predictors(), model = base_model, top_p = 2,
28 |                  outcome = "Species")
29 | 
30 | prepped <- prep(rec)
31 | 
32 | # create a model specification
33 | clf <- decision_tree(mode = "classification") \%>\%
34 |     set_engine("rpart")
35 | 
36 | clf_fitted <- clf \%>\%
37 |     fit(Species ~ ., juice(prepped))
38 | 
39 | }
40 | \author{
41 | Steven Pawley, \email{dr.stevenpawley@gmail.com}
42 | }
43 | 


--------------------------------------------------------------------------------
/tests/testthat/test_step_select_tree.R:
--------------------------------------------------------------------------------
 1 | library(testthat)
 2 | library(recipes)
 3 | library(tibble)
 4 | library(parsnip)
 5 | data("iris")
 6 | 
 7 | test_that("step_select_tree, execution using top_p", {
 8 |   skip_if_not_installed("rpart")
 9 | 
10 |   # keep the two highest-scoring predictors using the rpart engine
11 |   rec <- iris %>%
12 |     recipe(Species ~ .) %>%
13 |     step_select_tree(
14 |       all_predictors(),
15 |       outcome = "Species",
16 |       engine = "rpart",
17 |       top_p = 2
18 |     )
19 | 
20 |   # train the step and extract the processed training set
21 |   prepped <- prep(rec)
22 |   selected <- juice(prepped)
23 | 
24 |   # 2 selected predictors + the outcome column
25 |   expect_length(names(selected), 3)
26 | })
27 | 
28 | 
29 | test_that("step_select_tree, execution using threshold", {
30 |   skip_if_not_installed("rpart")
31 | 
32 |   # `engine` is left at the step's default in both recipes below
33 |   # test selection by retaining features with scores >= 50th percentile
34 |   rec <- iris %>%
35 |     recipe(Species ~ .) %>%
36 |     step_select_tree(
37 |       all_predictors(),
38 |       outcome = "Species",
39 |       threshold = 0.5
40 |     )
41 | 
42 |   prepped <- prep(rec)
43 |   selected <- juice(prepped)
44 | 
45 |   # 2 of the 4 predictors + the outcome column
46 |   expect_length(names(selected), 3)
47 | 
48 |   # test selection by retaining features with scores in 90th percentile
49 |   rec <- iris %>%
50 |     recipe(Species ~ .) %>%
51 |     step_select_tree(
52 |       all_predictors(),
53 |       outcome = "Species",
54 |       threshold = 0.9
55 |     )
56 | 
57 |   prepped <- prep(rec)
58 |   selected <- juice(prepped)
59 | 
60 |   # 1 predictor + the outcome column
61 |   expect_length(names(selected), 2)
62 | })
63 | 


--------------------------------------------------------------------------------
/tests/testthat/test_step_select_forests.R:
--------------------------------------------------------------------------------
 1 | library(testthat)
 2 | library(recipes)
 3 | library(tibble)
 4 | library(parsnip)
 5 | 
 6 | data("iris")
 7 | 
 8 | test_that("step_select_forests, execution using top_p", {
 9 |   skip_if_not_installed("ranger")
10 | 
11 |   rec <- iris %>%
12 |     recipe(Species ~.) %>%
13 |     step_select_forests(
14 |       all_predictors(),
15 |       outcome = "Species",
16 |       engine = "ranger",
17 |       top_p = 2
18 |     )
19 | 
20 |   prepped <- prep(rec)
21 |   tidy(rec, number = 1)  # NOTE(review): tidies the untrained `rec` and discards the result; was `prepped` intended? confirm
22 |   selected <- juice(prepped)
23 | 
24 |   expect_length(names(selected), 3)  # 2 selected predictors + the outcome column
25 | })
26 | 
27 | 
28 | test_that("step_select_forests, execution using threshold", {
29 |   skip_if_not_installed("ranger")
30 | 
31 |   # `engine` is left at the step's default in both recipes below
32 |   # test selection by retaining features with scores >= 50th percentile
33 |   rec <- iris %>%
34 |     recipe(Species ~ .) %>%
35 |     step_select_forests(
36 |       all_predictors(),
37 |       outcome = "Species",
38 |       threshold = 0.5
39 |     )
40 | 
41 |   prepped <- prep(rec)
42 |   selected <- juice(prepped)
43 | 
44 |   # 2 of the 4 predictors + the outcome column
45 |   expect_length(names(selected), 3)
46 | 
47 |   # test selection by retaining features with scores in 90th percentile
48 |   rec <- iris %>%
49 |     recipe(Species ~ .) %>%
50 |     step_select_forests(
51 |       all_predictors(),
52 |       outcome = "Species",
53 |       threshold = 0.9
54 |     )
55 | 
56 |   prepped <- prep(rec)
57 |   selected <- juice(prepped)
58 | 
59 |   # 1 predictor + the outcome column
60 |   expect_length(names(selected), 2)
61 | })
62 | 
63 | 


--------------------------------------------------------------------------------
/tests/testthat/test_step_select_boruta.R:
--------------------------------------------------------------------------------
 1 | library(testthat)
 2 | library(recipes)
 3 | library(tibble)
 4 | library(modeldata)
 5 | 
 6 | data("lending_club")
 7 | 
 8 | test_that("step_select_boruta, execution", {
 9 |   skip_if_not_installed("Boruta")
10 | 
11 |   # reference: run Boruta directly, seeded immediately before the call
12 |   set.seed(1234)
13 |   boruta_mod <- Boruta::Boruta(
14 |     x = lending_club[, -23],  # drops column 23, presumably the outcome `Class`; confirm
15 |     y = lending_club$Class
16 |   )
17 |   excluded <- names(
18 |     boruta_mod$finalDecision[boruta_mod$finalDecision == "Rejected"]
19 |   )
20 | 
21 |   # same selection via step_select_boruta; the seed is reset right before prep()
22 |   rec <- recipe(Class ~ ., data = lending_club) %>%
23 |     step_select_boruta(all_predictors(), outcome = "Class")
24 |   set.seed(1234)
25 |   prepped <- rec %>% prep()
26 | 
27 |   # rejected features and importance history must match the direct run
28 |   expect_equal(excluded, prepped$steps[[1]]$exclude)
29 |   expect_equal(boruta_mod$ImpHistory, prepped$steps[[1]]$res$ImpHistory)
30 | })
31 | 
32 | 
33 | test_that("step_select_boruta, options", {
34 |   skip_if_not_installed("Boruta")
35 | 
36 |   # reference: run Boruta directly with a non-default importance function
37 |   set.seed(1234)
38 |   boruta_mod <- Boruta::Boruta(
39 |     x = lending_club[, -23],  # drops column 23, presumably the outcome `Class`; confirm
40 |     y = lending_club$Class,
41 |     getImp = Boruta::getImpRfGini
42 |   )
43 |   excluded <- names(
44 |     boruta_mod$finalDecision[boruta_mod$finalDecision == "Rejected"]
45 |   )
46 | 
47 |   # same selection via the step, passing the same `getImp` through `options`
48 |   rec <- recipe(Class ~ ., data = lending_club) %>%
49 |     step_select_boruta(all_predictors(), outcome = "Class",
50 |                        options = list(getImp = Boruta::getImpRfGini))
51 |   set.seed(1234)
52 |   prepped <- rec %>% prep()
53 | 
54 |   # rejected features and importance history must match the direct run
55 |   expect_equal(excluded, prepped$steps[[1]]$exclude)
56 |   expect_equal(boruta_mod$ImpHistory, prepped$steps[[1]]$res$ImpHistory)
57 | })
58 | 
59 | 


--------------------------------------------------------------------------------
/tests/testthat/test_step_select_vip.R:
--------------------------------------------------------------------------------
 1 | library(testthat)
 2 | library(recipes)
 3 | library(tibble)
 4 | library(parsnip)
 5 | data("iris")
 6 | 
 7 | test_that("step_select_vip, execution using top_p", {
 8 |   skip_if_not_installed("ranger")
 9 | 
10 |   # model specification handed to the step via `model`
11 |   base_model <- rand_forest(mode = "classification") %>%
12 |     set_engine("ranger", importance = "permutation")
13 | 
14 |   # keep the two highest-ranked predictors
15 |   rec <- iris %>%
16 |     recipe(Species ~ .) %>%
17 |     step_select_vip(
18 |       all_predictors(),
19 |       outcome = "Species",
20 |       model = base_model,
21 |       top_p = 2
22 |     )
23 | 
24 |   prepped <- prep(rec)
25 |   selected <- juice(prepped)
26 | 
27 |   # 2 selected predictors + the outcome column
28 |   expect_length(names(selected), 3)
29 | })
30 | 
31 | 
32 | test_that("step_select_vip, execution using threshold", {
33 |   skip_if_not_installed("ranger")
34 | 
35 |   # model specification handed to the step via `model`
36 |   base_model <- rand_forest(mode = "classification") %>%
37 |     set_engine("ranger", importance = "permutation")
38 | 
39 |   # test selection by retaining features with scores >= 50th percentile
40 |   rec <- iris %>%
41 |     recipe(Species ~ .) %>%
42 |     step_select_vip(
43 |       all_predictors(),
44 |       outcome = "Species",
45 |       model = base_model,
46 |       threshold = 0.5
47 |     )
48 | 
49 |   prepped <- prep(rec)
50 |   selected <- juice(prepped)
51 | 
52 |   # 2 of the 4 predictors + the outcome column
53 |   expect_length(names(selected), 3)
54 | 
55 |   # test selection by retaining features with scores in 90th percentile
56 |   rec <- iris %>%
57 |     recipe(Species ~ .) %>%
58 |     step_select_vip(
59 |       all_predictors(),
60 |       outcome = "Species",
61 |       model = base_model,
62 |       threshold = 0.9
63 |     )
64 | 
65 |   prepped <- prep(rec)
66 |   selected <- juice(prepped)
67 | 
68 |   # 1 predictor + the outcome column
69 |   expect_length(names(selected), 2)
70 | })
71 | 
72 | 


--------------------------------------------------------------------------------
/docs/sitemap.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
 3 |   <url>
 4 |     <loc>/404.html</loc>
 5 |   </url>
 6 |   <url>
 7 |     <loc>/LICENSE-text.html</loc>
 8 |   </url>
 9 |   <url>
10 |     <loc>/LICENSE.html</loc>
11 |   </url>
12 |   <url>
13 |     <loc>/authors.html</loc>
14 |   </url>
15 |   <url>
16 |     <loc>/index.html</loc>
17 |   </url>
18 |   <url>
19 |     <loc>/reference/index.html</loc>
20 |   </url>
21 |   <url>
22 |     <loc>/reference/pipe.html</loc>
23 |   </url>
24 |   <url>
25 |     <loc>/reference/pull_importances.html</loc>
26 |   </url>
27 |   <url>
28 |     <loc>/reference/recipeselectors.html</loc>
29 |   </url>
30 |   <url>
31 |     <loc>/reference/step_boruta.html</loc>
32 |   </url>
33 |   <url>
34 |     <loc>/reference/step_idw.html</loc>
35 |   </url>
36 |   <url>
37 |     <loc>/reference/step_importance.html</loc>
38 |   </url>
39 |   <url>
40 |     <loc>/reference/step_infgain.html</loc>
41 |   </url>
42 |   <url>
43 |     <loc>/reference/step_mrmr.html</loc>
44 |   </url>
45 |   <url>
46 |     <loc>/reference/step_select_boruta.html</loc>
47 |   </url>
48 |   <url>
49 |     <loc>/reference/step_select_carscore.html</loc>
50 |   </url>
51 |   <url>
52 |     <loc>/reference/step_select_forests.html</loc>
53 |   </url>
54 |   <url>
55 |     <loc>/reference/step_select_infgain.html</loc>
56 |   </url>
57 |   <url>
58 |     <loc>/reference/step_select_linear.html</loc>
59 |   </url>
60 |   <url>
61 |     <loc>/reference/step_select_mrmr.html</loc>
62 |   </url>
63 |   <url>
64 |     <loc>/reference/step_select_roc.html</loc>
65 |   </url>
66 |   <url>
67 |     <loc>/reference/step_select_tree.html</loc>
68 |   </url>
69 |   <url>
70 |     <loc>/reference/step_select_vip.html</loc>
71 |   </url>
72 |   <url>
73 |     <loc>/reference/step_select_xtab.html</loc>
74 |   </url>
75 |   <url>
76 |     <loc>/reference/tidyeval.html</loc>
77 |   </url>
78 |   <url>
79 |     <loc>/reference/top_p.html</loc>
80 |   </url>
81 | </urlset>
82 | 


--------------------------------------------------------------------------------
/docs/bootstrap-toc.css:
--------------------------------------------------------------------------------
 1 | /*!
 2 |  * Bootstrap Table of Contents v0.4.1 (http://afeld.github.io/bootstrap-toc/)
 3 |  * Copyright 2015 Aidan Feldman
 4 |  * Licensed under MIT (https://github.com/afeld/bootstrap-toc/blob/gh-pages/LICENSE.md) */
 5 | 
 6 | /* modified from https://github.com/twbs/bootstrap/blob/94b4076dd2efba9af71f0b18d4ee4b163aa9e0dd/docs/assets/css/src/docs.css#L548-L601 */
 7 | 
 8 | /* All levels of nav */
 9 | nav[data-toggle='toc'] .nav > li > a {
10 |   display: block;
11 |   padding: 4px 20px;
12 |   font-size: 13px;
13 |   font-weight: 500;
14 |   color: #767676;
15 | }
16 | nav[data-toggle='toc'] .nav > li > a:hover,
17 | nav[data-toggle='toc'] .nav > li > a:focus {
18 |   padding-left: 19px;
19 |   color: #563d7c;
20 |   text-decoration: none;
21 |   background-color: transparent;
22 |   border-left: 1px solid #563d7c;
23 | }
24 | nav[data-toggle='toc'] .nav > .active > a,
25 | nav[data-toggle='toc'] .nav > .active:hover > a,
26 | nav[data-toggle='toc'] .nav > .active:focus > a {
27 |   padding-left: 18px;
28 |   font-weight: bold;
29 |   color: #563d7c;
30 |   background-color: transparent;
31 |   border-left: 2px solid #563d7c;
32 | }
33 | 
34 | /* Nav: second level (shown on .active) */
35 | nav[data-toggle='toc'] .nav .nav {
36 |   display: none; /* Hide by default, but at >768px, show it */
37 |   padding-bottom: 10px;
38 | }
39 | nav[data-toggle='toc'] .nav .nav > li > a {
40 |   padding-top: 1px;
41 |   padding-bottom: 1px;
42 |   padding-left: 30px;
43 |   font-size: 12px;
44 |   font-weight: normal;
45 | }
46 | nav[data-toggle='toc'] .nav .nav > li > a:hover,
47 | nav[data-toggle='toc'] .nav .nav > li > a:focus {
48 |   padding-left: 29px;
49 | }
50 | nav[data-toggle='toc'] .nav .nav > .active > a,
51 | nav[data-toggle='toc'] .nav .nav > .active:hover > a,
52 | nav[data-toggle='toc'] .nav .nav > .active:focus > a {
53 |   padding-left: 28px;
54 |   font-weight: 500;
55 | }
56 | 
57 | /* from https://github.com/twbs/bootstrap/blob/e38f066d8c203c3e032da0ff23cd2d6098ee2dd6/docs/assets/css/src/docs.css#L631-L634 */
58 | nav[data-toggle='toc'] .nav > .active > ul {
59 |   display: block;
60 | }
61 | 


--------------------------------------------------------------------------------
/man/pull_importances.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/pull_importances.R
 3 | \name{pull_importances}
 4 | \alias{pull_importances}
 5 | \title{Pull feature importances from a parsnip fitted model}
 6 | \usage{
 7 | pull_importances(object, scaled = TRUE, ...)
 8 | }
 9 | \arguments{
10 | \item{object}{A `model_fit` object.}
11 | 
12 | \item{scaled}{A logical indicating whether to rescale the importances between
13 | 0 and 1. Default is TRUE.}
14 | 
15 | \item{...}{A list of other parameters passed to the feature importance
16 | method.}
17 | }
18 | \value{
19 | tibble
20 | }
21 | \description{
22 | `pull_importances` is a generic function to extract feature importance scores
23 | or coefficients from a parsnip `model_fit` object and return them as a tibble
24 | with a 'feature' and 'importance' column. This is designed to support the
25 | `step_select_vip` recipe step.
26 | }
27 | \details{
28 | Most of the basic models within the parsnip package that support feature
29 | importances are implemented (call `methods(pull_importances)` to list models
30 | that are currently implemented). If you need to pull the feature importance
31 | scores from a model that is not currently supported in this package, you can
32 | add a method to the `pull_importances` generic that returns a two-column
33 | tibble with 'feature' and 'importance' columns (see the examples).
34 | }
35 | \examples{
36 | library(parsnip)
37 | 
38 | # pull feature importances from a model_fit object
39 | model <- boost_tree(mode = "classification") \%>\%
40 |     set_engine("xgboost")
41 | model_fit <- model \%>\% fit(Species ~., iris)
42 | pull_importances(model_fit)
43 | 
44 | # create a new pull_importances method
45 | pull_importances._ranger <- function(object, scaled = FALSE, ...) {
46 |     # create a call to the ranger::importance function avoiding having to use
47 |     # ranger as a dependency
48 |     call <- rlang::call2(.fn = "importance", .ns = "ranger", x = object$fit)
49 |     scores <- rlang::eval_tidy(call)
50 | 
51 |     # create a tibble with 'feature' and 'importance' columns
52 |     scores <- tibble::tibble(
53 |       feature = names(scores),
54 |       importance = as.numeric(scores)
55 |     )
56 |     # optionally rescale the importance scores
57 |     if (isTRUE(scaled))
58 |       scores$importance <- rescale(scores$importance)
59 | 
60 |     scores
61 | }
62 | }
63 | 


--------------------------------------------------------------------------------
/docs/docsearch.js:
--------------------------------------------------------------------------------
 1 | $(function() {
 2 | 
 3 |   // register a handler to move the focus to the search bar
 4 |   // upon pressing shift + "/" (i.e. "?")
 5 |   $(document).on('keydown', function(e) {
 6 |     if (e.shiftKey && e.keyCode == 191) {
 7 |       e.preventDefault();
 8 |       $("#search-input").focus();
 9 |     }
10 |   });
11 | 
12 |   $(document).ready(function() {
13 |     // do keyword highlighting
14 |     /* modified from https://jsfiddle.net/julmot/bL6bb5oo/ */
15 |     var mark = function() {
16 | 
17 |       var referrer = document.URL ;
18 |       var paramKey = "q" ;
19 | 
20 |       if (referrer.indexOf("?") !== -1) {
21 |         var qs = referrer.substr(referrer.indexOf('?') + 1);
22 |         var qs_noanchor = qs.split('#')[0];
23 |         var qsa = qs_noanchor.split('&');
24 |         var keyword = "";
25 | 
26 |         for (var i = 0; i < qsa.length; i++) {
27 |           var currentParam = qsa[i].split('=');
28 | 
29 |           if (currentParam.length !== 2) {
30 |             continue;
31 |           }
32 | 
33 |           if (currentParam[0] == paramKey) {
34 |             keyword = decodeURIComponent(currentParam[1].replace(/\+/g, "%20"));
35 |           }
36 |         }
37 | 
38 |         if (keyword !== "") {
39 |           $(".contents").unmark({
40 |             done: function() {
41 |               $(".contents").mark(keyword);
42 |             }
43 |           });
44 |         }
45 |       }
46 |     };
47 | 
48 |     mark();
49 |   });
50 | });
51 | 
52 | /* Search term highlighting ------------------------------*/
53 | 
54 | function matchedWords(hit) {
55 |   var words = [];
56 | 
57 |   var hierarchy = hit._highlightResult.hierarchy;
58 |   // loop to fetch from lvl0, lvl1, etc.
59 |   for (var idx in hierarchy) {
60 |     words = words.concat(hierarchy[idx].matchedWords);
61 |   }
62 | 
63 |   var content = hit._highlightResult.content;
64 |   if (content) {
65 |     words = words.concat(content.matchedWords);
66 |   }
67 | 
68 |   // return unique words
69 |   var words_uniq = [...new Set(words)];
70 |   return words_uniq;
71 | }
72 | 
73 | function updateHitURL(hit) {
74 | 
75 |   var words = matchedWords(hit);
76 |   var url = "";
77 | 
78 |   if (hit.anchor) {
79 |     url = hit.url_without_anchor + '?q=' + escape(words.join(" ")) + '#' + hit.anchor;
80 |   } else {
81 |     url = hit.url + '?q=' + escape(words.join(" "));
82 |   }
83 | 
84 |   return url;
85 | }
86 | 


--------------------------------------------------------------------------------
/R/misc.R:
--------------------------------------------------------------------------------
  1 | # Validate a `threshold` value: NA passes through untouched, anything else
  2 | # must be a number strictly between 0 and 1, otherwise the call aborts.
  3 | check_zero_one <- function(x) {
  4 |   if (is.na(x)) {
  5 |     return(x)
  6 |   }
  7 |   if (!is.numeric(x)) {
  8 |     rlang::abort("`threshold` should be numeric.")
  9 |   }
 10 |   if (x <= 0 || x >= 1) {
 11 |     rlang::abort("`threshold` should be on (0, 1).")
 12 |   }
 13 |   x
 14 | }
 15 | 
 16 | # Validate `top_p` against `n`, the number of candidate features. NA passes
 17 | # through; numeric values are truncated to integer. Values outside (0, n)
 18 | # trigger a warning and are clamped into [1, n - 1]; previously only the upper
 19 | # bound was enforced, so zero or negative input slipped through unchanged.
 20 | check_top_p <- function(x, n) {
 21 |   if (is.na(x)) {
 22 |     return(x)
 23 |   }
 24 |   if (!is.numeric(x)) {
 25 |     rlang::abort("`top_p` should be numeric.")
 26 |   }
 27 |   x <- as.integer(x)
 28 |   if (x >= n || x <= 0) {
 29 |     rlang::warn(paste0("`top_p` should be on (0, ", n, ")."))
 30 |     # never ask for fewer than one feature, even when `n` is 1
 31 |     x <- max(1L, min(n - 1, x))
 32 |   }
 33 |   x
 34 | }
 35 | 
 36 | # Abort when both selection criteria are NA; `cl[[1]]` names the step.
 37 | check_criteria <- function(top_p, threshold, cl) {
 38 |   if (is.na(top_p) && is.na(threshold)) {
 39 |     # deparse language objects so `pkg::step()` yields one string, not three
 40 |     step <- if (is.character(cl[[1]])) cl[[1]] else deparse(cl[[1]])
 41 |     rlang::abort(
 42 |       paste0("For `", step, "`, `top_p` and `threshold` cannot both be missing.")
 43 |     )
 44 |   }
 45 |   invisible(NULL)
 46 | }
 47 | 
 48 | # Decide which features to drop given a named score vector `x`.
 49 | # Features with NA scores are always dropped. The rest are ranked (best first:
 50 | # largest when `maximize` is TRUE, smallest otherwise) and a feature is kept
 51 | # when it is within the first `top_p` ranks OR meets `threshold`; an NA
 52 | # criterion is ignored. Returns the names of the features to remove.
 53 | dual_filter <- function(x, top_p, threshold, maximize) {
 54 |   missing_scores <- is.na(x)
 55 |   dropped_na <- names(x[missing_scores])
 56 | 
 57 |   # rev(sort()) is used as-is: sort(decreasing = TRUE) may order ties differently
 58 |   ranked <- sort(x[!missing_scores])
 59 |   if (maximize) {
 60 |     ranked <- rev(ranked)
 61 |   }
 62 | 
 63 |   keep <- rep(FALSE, length(ranked))
 64 |   if (!is.na(top_p)) {
 65 |     keep <- keep | seq_along(ranked) <= top_p
 66 |   }
 67 |   if (!is.na(threshold)) {
 68 |     passes <- if (maximize) ranked >= threshold else ranked <= threshold
 69 |     keep <- keep | passes
 70 |   }
 71 | 
 72 |   # features failing both criteria, followed by the NA-scored features
 73 |   c(names(ranked)[!keep], dropped_na)
 74 | }
 75 | 
 76 | # Choose the features to drop from a named score vector `x`, using either
 77 | # a percentile `threshold` or the `top_p` best scores. NA scores are
 78 | # discarded first and are not part of the result. A non-NA `threshold`
 79 | # takes precedence over `top_p`. Returns the names of features to remove.
 80 | select_percentile <- function(x, top_p, threshold, maximize) {
 81 |   x <- x[!is.na(x)]
 82 | 
 83 |   if (!is.na(threshold)) {
 84 |     # score sitting at the requested quantile of the non-missing scores
 85 |     p_to_exceed <- stats::quantile(x, threshold)
 86 | 
 87 |     if (maximize) {
 88 |       removals <- x < p_to_exceed
 89 |     } else {
 90 |       removals <- x >= p_to_exceed
 91 |     }
 92 | 
 93 |     return(names(removals[removals]))
 94 |   }
 95 | 
 96 |   # rank best-first
 97 |   x <- if (maximize) sort(x, decreasing = TRUE) else sort(x)
 98 | 
 99 |   # Drop every rank after the first `top_p`. Positive indexing replaces
100 |   # `x[-seq_len(top_p)]`: for `top_p = 0` that negative index is empty, which
101 |   # selects nothing, so no feature was dropped where all of them should be.
102 |   drop_idx <- setdiff(seq_along(x), seq_len(top_p))
103 |   names(x)[drop_idx]
104 | }
105 | 


--------------------------------------------------------------------------------
/man/step_select_boruta.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/step_select_boruta.R
 3 | \name{step_select_boruta}
 4 | \alias{step_select_boruta}
 5 | \alias{tidy.step_select_boruta}
 6 | \title{Feature selection step using Boruta}
 7 | \usage{
 8 | step_select_boruta(
 9 |   recipe,
10 |   ...,
11 |   outcome = NULL,
12 |   role = "predictor",
13 |   trained = FALSE,
14 |   exclude = NULL,
15 |   options = list(pValue = 0.01, mcAdj = TRUE, maxRuns = 100),
16 |   res = NULL,
17 |   skip = FALSE,
18 |   id = recipes::rand_id("select_boruta")
19 | )
20 | 
21 | \method{tidy}{step_select_boruta}(x, ...)
22 | }
23 | \arguments{
24 | \item{recipe}{A recipe object. The step will be added to the sequence of
25 | operations for this recipe.}
26 | 
27 | \item{...}{One or more selector functions to choose which variables are
28 | affected by the step. See selections() for more details. For the tidy
29 | method, these are not currently used.}
30 | 
31 | \item{outcome}{A character string with the name of the response variable to
32 | use to calculate the feature importance scores.}
33 | 
34 | \item{role}{Not used by this step since no new variables are created.}
35 | 
36 | \item{trained}{A logical to indicate if the quantities for preprocessing have
37 | been estimated.}
38 | 
39 | \item{exclude}{A character vector of predictor names that will be removed
40 | from the data. This will be set when `prep()` is used on the recipe and
41 | should not be set by the user.}
42 | 
43 | \item{options}{A list of options to pass to `Boruta::Boruta()`. The defaults
44 | use Boruta's defaults. *Note* that `x` and `y` should not be passed here.}
45 | 
46 | \item{res}{The `Boruta::Boruta` object is stored here once this preprocessing
47 | step has been trained by `prep.recipe()`.}
48 | 
49 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by
50 | bake.recipe()? While all operations are baked when prep.recipe() is run,
51 | some operations may not be able to be conducted on new data (e.g.
52 | processing the outcome variable(s)). Care should be taken when using skip =
53 | TRUE as it may affect the computations for subsequent operations.}
54 | 
55 | \item{id}{A character string that is unique to this step to identify it.}
56 | 
57 | \item{x}{A `step_select_boruta` object.}
58 | }
59 | \value{
60 | a `step_select_boruta` object.
61 | }
62 | \description{
63 | `step_select_boruta` creates a *specification* of a recipe step that selects a
64 | subset of predictors using the Boruta feature selection approach.
65 | }
66 | \examples{
67 | library(recipes)
68 | library(parsnip)
69 | 
70 | # load the example cells dataset
71 | data(cells, package = "modeldata")
72 | 
73 | # create a preprocessing recipe
74 | rec <-
75 |  recipe(class ~ ., data = cells[, -1]) \%>\%
76 |  step_select_boruta(all_predictors(), outcome = "class")
77 | 
78 | prepped <- prep(rec)
79 | 
80 | preproc_data <- juice(prepped)
81 | prepped
82 | }
83 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | S3method(bake,step_select_boruta)
 4 | S3method(bake,step_select_carscore)
 5 | S3method(bake,step_select_forests)
 6 | S3method(bake,step_select_infgain)
 7 | S3method(bake,step_select_linear)
 8 | S3method(bake,step_select_mrmr)
 9 | S3method(bake,step_select_roc)
10 | S3method(bake,step_select_tree)
11 | S3method(bake,step_select_vip)
12 | S3method(bake,step_select_xtab)
13 | S3method(prep,step_select_boruta)
14 | S3method(prep,step_select_carscore)
15 | S3method(prep,step_select_forests)
16 | S3method(prep,step_select_infgain)
17 | S3method(prep,step_select_linear)
18 | S3method(prep,step_select_mrmr)
19 | S3method(prep,step_select_roc)
20 | S3method(prep,step_select_tree)
21 | S3method(prep,step_select_vip)
22 | S3method(prep,step_select_xtab)
23 | S3method(print,step_select_boruta)
24 | S3method(print,step_select_carscore)
25 | S3method(print,step_select_forests)
26 | S3method(print,step_select_infgain)
27 | S3method(print,step_select_linear)
28 | S3method(print,step_select_mrmr)
29 | S3method(print,step_select_roc)
30 | S3method(print,step_select_tree)
31 | S3method(print,step_select_vip)
32 | S3method(print,step_select_xtab)
33 | S3method(pull_importances,"_C5.0")
34 | S3method(pull_importances,"_H2OMultinomialModel")
35 | S3method(pull_importances,"_H2ORegressionModel")
36 | S3method(pull_importances,"_cubist")
37 | S3method(pull_importances,"_earth")
38 | S3method(pull_importances,"_elnet")
39 | S3method(pull_importances,"_glm")
40 | S3method(pull_importances,"_lm")
41 | S3method(pull_importances,"_lognet")
42 | S3method(pull_importances,"_randomForest")
43 | S3method(pull_importances,"_ranger")
44 | S3method(pull_importances,"_rpart")
45 | S3method(pull_importances,"_xgb.Booster")
46 | S3method(pull_importances,default)
47 | S3method(tidy,step_select_boruta)
48 | S3method(tidy,step_select_carscore)
49 | S3method(tidy,step_select_forests)
50 | S3method(tidy,step_select_infgain)
51 | S3method(tidy,step_select_linear)
52 | S3method(tidy,step_select_mrmr)
53 | S3method(tidy,step_select_roc)
54 | S3method(tidy,step_select_tree)
55 | S3method(tidy,step_select_vip)
56 | S3method(tidy,step_select_xtab)
57 | S3method(tunable,step_select_carscore)
58 | S3method(tunable,step_select_forests)
59 | S3method(tunable,step_select_infgain)
60 | S3method(tunable,step_select_linear)
61 | S3method(tunable,step_select_mrmr)
62 | S3method(tunable,step_select_roc)
63 | S3method(tunable,step_select_tree)
64 | S3method(tunable,step_select_vip)
65 | S3method(tunable,step_select_xtab)
66 | export("%>%")
67 | export(pull_importances)
68 | export(step_select_boruta)
69 | export(step_select_carscore)
70 | export(step_select_forests)
71 | export(step_select_infgain)
72 | export(step_select_linear)
73 | export(step_select_mrmr)
74 | export(step_select_roc)
75 | export(step_select_tree)
76 | export(step_select_vip)
77 | export(step_select_xtab)
78 | export(top_p)
79 | importFrom(generics,tidy)
80 | importFrom(magrittr,"%>%")
81 | importFrom(recipes,bake)
82 | importFrom(recipes,prep)
83 | importFrom(recipes,step)
84 | importFrom(tibble,as_tibble)
85 | importFrom(tibble,tibble)
86 | importFrom(tune,tunable)
87 | 


--------------------------------------------------------------------------------
/man/step_select_mrmr.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/step_select_mrmr.R
  3 | \name{step_select_mrmr}
  4 | \alias{step_select_mrmr}
  5 | \alias{tidy.step_select_mrmr}
  6 | \title{Apply minimum Redundancy Maximum Relevance Feature Selection (mRMR)}
  7 | \usage{
  8 | step_select_mrmr(
  9 |   recipe,
 10 |   ...,
 11 |   outcome = NULL,
 12 |   role = NA,
 13 |   trained = FALSE,
 14 |   top_p = NA,
 15 |   threshold = NA,
 16 |   threads = 0,
 17 |   exclude = NULL,
 18 |   scores = NULL,
 19 |   skip = FALSE,
 20 |   id = recipes::rand_id("select_mrmr")
 21 | )
 22 | 
 23 | \method{tidy}{step_select_mrmr}(x, ...)
 24 | }
 25 | \arguments{
 26 | \item{recipe}{A recipe object. The step will be added to the sequence of
 27 | operations for this recipe}
 28 | 
 29 | \item{...}{One or more selector functions to choose which variables are
 30 | affected by the step. See selections() for more details. For the tidy
 31 | method, these are not currently used}
 32 | 
 33 | \item{outcome}{A character string specifying the name of response variable
 34 | used to evaluate mRMR.}
 35 | 
 36 | \item{role}{Not used by this step since no new variables are created}
 37 | 
 38 | \item{trained}{A logical to indicate if the quantities for preprocessing have
 39 | been estimated}
 40 | 
 41 | \item{top_p}{An integer that will be used to select the number of best
 42 | scoring features.}
 43 | 
 44 | \item{threshold}{A numeric value between 0 and 1 representing the percentile
 45 | of best scoring features to select. Features with scores that are _larger_
 46 | than the specified threshold will be retained, for example `threshold =
 47 | 0.9` will retain only predictors with scores in the top 90th percentile.
 48 | Note that this overrides `top_p`.}
 49 | 
 50 | \item{threads}{An integer specifying the number of threads to use for
 51 | processing. The default = 0 uses all available threads.}
 52 | 
 53 | \item{exclude}{A character vector of predictor names that will be removed
 54 | from the data. This will be set when `prep()` is used on the recipe and
 55 | should not be set by the user.}
 56 | 
 57 | \item{scores}{A tibble with 'variable' and 'scores' columns containing the
 58 | names of the variables and their mRMR scores. This parameter is only
 59 | produced after the recipe has been trained.}
 60 | 
 61 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by
 62 | bake.recipe()? While all operations are baked when prep.recipe() is run,
 63 | some operations may not be able to be conducted on new data (e.g.
 64 | processing the outcome variable(s)). Care should be taken when using skip =
 65 | TRUE as it may affect the computations for subsequent operations.}
 66 | 
 67 | \item{id}{A character string that is unique to this step to identify it.}
 68 | 
 69 | \item{x}{A `step_select_mrmr` object.}
 70 | }
 71 | \value{
 72 | A step_select_mrmr object.
 73 | }
 74 | \description{
 75 | `step_select_mrmr` creates a *specification* of a recipe step that will apply
 76 | minimum Redundancy Maximum Relevance Feature Selection (mRMR) to numeric
 77 | data. The top `top_p` scoring features, or features whose scores occur in
 78 | the top percentile `threshold` will be retained as new predictors.
 79 | }
 80 | \details{
 81 | The recipe will stop if both `top_p` and `threshold` are left unspecified.
 82 | }
 83 | \examples{
 84 | library(recipes)
 85 | 
 86 | data(cells, package = "modeldata")
 87 | 
 88 | rec <-
 89 |  recipe(class ~ ., data = cells[, -1]) \%>\%
 90 |  step_select_mrmr(all_predictors(), outcome = "class", top_p = 10, threshold = 0.9)
 91 | 
 92 | prepped <- prep(rec)
 93 | 
 94 | new_data <- juice(prepped)
 95 | prepped
 96 | }
 97 | \concept{preprocessing}
 98 | \concept{supervised_filter}
 99 | \keyword{datagen}
100 | 


--------------------------------------------------------------------------------
/docs/pkgdown.js:
--------------------------------------------------------------------------------
  1 | /* http://gregfranko.com/blog/jquery-best-practices/ */
  2 | (function($) {
  3 |   $(function() {
  4 | 
  5 |     $('.navbar-fixed-top').headroom();
  6 | 
  7 |     $('body').css('padding-top', $('.navbar').height() + 10);
  8 |     $(window).resize(function(){
  9 |       $('body').css('padding-top', $('.navbar').height() + 10);
 10 |     });
 11 | 
 12 |     $('[data-toggle="tooltip"]').tooltip();
 13 | 
 14 |     var cur_path = paths(location.pathname);
 15 |     var links = $("#navbar ul li a");
 16 |     var max_length = -1;
 17 |     var pos = -1;
 18 |     for (var i = 0; i < links.length; i++) {
 19 |       if (links[i].getAttribute("href") === "#")
 20 |         continue;
 21 |       // Ignore external links
 22 |       if (links[i].host !== location.host)
 23 |         continue;
 24 | 
 25 |       var nav_path = paths(links[i].pathname);
 26 | 
 27 |       var length = prefix_length(nav_path, cur_path);
 28 |       if (length > max_length) {
 29 |         max_length = length;
 30 |         pos = i;
 31 |       }
 32 |     }
 33 | 
 34 |     // Add class to parent <li>, and enclosing <li> if in dropdown
 35 |     if (pos >= 0) {
 36 |       var menu_anchor = $(links[pos]);
 37 |       menu_anchor.parent().addClass("active");
 38 |       menu_anchor.closest("li.dropdown").addClass("active");
 39 |     }
 40 |   });
 41 | 
 42 |   function paths(pathname) {
 43 |     var pieces = pathname.split("/");
 44 |     pieces.shift(); // always starts with /
 45 | 
 46 |     var end = pieces[pieces.length - 1];
 47 |     if (end === "index.html" || end === "")
 48 |       pieces.pop();
 49 |     return(pieces);
 50 |   }
 51 | 
 52 |   // Returns -1 if not found
 53 |   function prefix_length(needle, haystack) {
 54 |     if (needle.length > haystack.length)
 55 |       return(-1);
 56 | 
 57 |     // Special case for length-0 haystack, since for loop won't run
 58 |     if (haystack.length === 0) {
 59 |       return(needle.length === 0 ? 0 : -1);
 60 |     }
 61 | 
 62 |     for (var i = 0; i < haystack.length; i++) {
 63 |       if (needle[i] != haystack[i])
 64 |         return(i);
 65 |     }
 66 | 
 67 |     return(haystack.length);
 68 |   }
 69 | 
 70 |   /* Clipboard --------------------------*/
 71 | 
 72 |   function changeTooltipMessage(element, msg) {
 73 |     var tooltipOriginalTitle=element.getAttribute('data-original-title');
 74 |     element.setAttribute('data-original-title', msg);
 75 |     $(element).tooltip('show');
 76 |     element.setAttribute('data-original-title', tooltipOriginalTitle);
 77 |   }
 78 | 
 79 |   if(ClipboardJS.isSupported()) {
 80 |     $(document).ready(function() {
 81 |       var copyButton = "<button type='button' class='btn btn-primary btn-copy-ex' type = 'submit' title='Copy to clipboard' aria-label='Copy to clipboard' data-toggle='tooltip' data-placement='left auto' data-trigger='hover' data-clipboard-copy><i class='fa fa-copy'></i></button>";
 82 | 
 83 |       $("div.sourceCode").addClass("hasCopyButton");
 84 | 
 85 |       // Insert copy buttons:
 86 |       $(copyButton).prependTo(".hasCopyButton");
 87 | 
 88 |       // Initialize tooltips:
 89 |       $('.btn-copy-ex').tooltip({container: 'body'});
 90 | 
 91 |       // Initialize clipboard:
 92 |       var clipboardBtnCopies = new ClipboardJS('[data-clipboard-copy]', {
 93 |         text: function(trigger) {
 94 |           return trigger.parentNode.textContent.replace(/\n#>[^\n]*/g, "");
 95 |         }
 96 |       });
 97 | 
 98 |       clipboardBtnCopies.on('success', function(e) {
 99 |         changeTooltipMessage(e.trigger, 'Copied!');
100 |         e.clearSelection();
101 |       });
102 | 
103 |       clipboardBtnCopies.on('error', function() {
104 |         changeTooltipMessage(e.trigger,'Press Ctrl+C or Command+C to copy');
105 |       });
106 |     });
107 |   }
108 | })(window.jQuery || window.$)
109 | 


--------------------------------------------------------------------------------
/man/step_select_roc.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/step_select_roc.R
  3 | \name{step_select_roc}
  4 | \alias{step_select_roc}
  5 | \alias{tidy.step_select_roc}
  6 | \title{Filter Numeric Predictors using ROC Curve}
  7 | \usage{
  8 | step_select_roc(
  9 |   recipe,
 10 |   ...,
 11 |   outcome,
 12 |   role = "predictor",
 13 |   trained = FALSE,
 14 |   threshold = NA,
 15 |   top_p = NA,
 16 |   exclude = NULL,
 17 |   skip = FALSE,
 18 |   id = recipes::rand_id("select_roc")
 19 | )
 20 | 
 21 | \method{tidy}{step_select_roc}(x, ...)
 22 | }
 23 | \arguments{
 24 | \item{recipe}{A recipe object. The step will be added to the sequence of
 25 | operations for this recipe.}
 26 | 
 27 | \item{...}{One or more selector functions to choose which predictors are
 28 | affected by the step. See [selections()] for more details. For the `tidy`
 29 | method, these are not currently used.}
 30 | 
 31 | \item{outcome}{A single character string that specifies a single categorical
 32 | variable to be used as the class.}
 33 | 
 34 | \item{role}{For model terms created by this step, what analysis role should
 35 | they be assigned? By default, the function assumes that resulting distances
 36 | will be used as predictors in a model.}
 37 | 
 38 | \item{trained}{A logical to indicate if the quantities for preprocessing have
 39 | been estimated.}
 40 | 
 41 | \item{threshold}{A numeric value, in AUC units, where predictors with ROC
 42 | AUC values _larger_ than the threshold will be retained. A value of `NA`
 43 | implies that this criterion will be ignored.}
 44 | 
 45 | \item{top_p}{An integer that will be used to select the predictors with the
 46 | largest ROC AUC values. A value of `NA` implies that this criterion will be
 47 | ignored.}
 48 | 
 49 | \item{exclude}{A character vector of predictor names that will be removed
 50 | from the data. This will be set when `prep()` is used on the recipe and
 51 | should not be set by the user.}
 52 | 
 53 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by
 54 | bake.recipe()? While all operations are baked when prep.recipe() is run,
 55 | some operations may not be able to be conducted on new data (e.g.
 56 | processing the outcome variable(s)). Care should be taken when using skip =
 57 | TRUE as it may affect the computations for subsequent operations.}
 58 | 
 59 | \item{id}{A character string that is unique to this step to identify it.}
 60 | 
 61 | \item{x}{A `step_select_roc` object.}
 62 | }
 63 | \value{
 64 | An updated version of `recipe` with the new step
 65 |  added to the sequence of existing steps (if any). For the
 66 |  `tidy` method, a tibble with a `terms` column for which predictors were
 67 | removed.
 68 | }
 69 | \description{
 70 | `step_select_roc` creates a *specification* of a recipe step that will
 71 |  filter predictors using their relationship with the outcome as measured
 72 |  using a Receiver Operating Characteristic curve.
 73 | }
 74 | \details{
 75 | The recipe will stop if both `top_p` and `threshold` are left unspecified.
 76 | 
 77 | The ROC AUC will be set to be 1 - AUC if the value is less than 0.50.
 78 | }
 79 | \examples{
 80 | data(cells, package = "modeldata")
 81 | 
 82 | rec <-
 83 |   recipe(class ~ ., data = cells[, -1]) \%>\%
 84 |   step_select_roc(all_predictors(), outcome = "class", top_p = 10, threshold = 0.9) \%>\%
 85 |   prep()
 86 | 
 87 | rec \%>\% juice(all_predictors()) \%>\% names()
 88 | 
 89 | # Use ROC values to select but always keep at least one:
 90 | rec <-
 91 |   recipe(class ~ ., data = cells[, -1]) \%>\%
 92 |   step_select_roc(all_predictors(), outcome = "class", top_p = 1, threshold = 0.99) \%>\%
 93 |   prep()
 94 | 
 95 | rec \%>\% juice(all_predictors()) \%>\% names()
 96 | 
 97 | # in case of missing data...
 98 | }
 99 | \concept{preprocessing}
100 | \concept{supervised_filter}
101 | \keyword{datagen}
102 | 


--------------------------------------------------------------------------------
/man/step_select_xtab.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/step_select_xtab.R
  3 | \name{step_select_xtab}
  4 | \alias{step_select_xtab}
  5 | \alias{tidy.step_select_xtab}
  6 | \title{Filter Categorical Predictors using Contingency Tables}
  7 | \usage{
  8 | step_select_xtab(
  9 |   recipe,
 10 |   ...,
 11 |   outcome,
 12 |   role = "predictor",
 13 |   trained = FALSE,
 14 |   threshold = NA,
 15 |   top_p = NA,
 16 |   exact = FALSE,
 17 |   fdr = TRUE,
 18 |   exclude = NULL,
 19 |   skip = FALSE,
 20 |   id = recipes::rand_id("select_xtab")
 21 | )
 22 | 
 23 | \method{tidy}{step_select_xtab}(x, ...)
 24 | }
 25 | \arguments{
 26 | \item{recipe}{A recipe object. The step will be added to the sequence of
 27 | operations for this recipe.}
 28 | 
 29 | \item{...}{One or more selector functions to choose which predictors are
 30 | affected by the step. See [selections()] for more details. For the `tidy`
 31 | method, these are not currently used.}
 32 | 
 33 | \item{outcome}{A single character string that specifies a single categorical
 34 | variable to be used as the class.}
 35 | 
 36 | \item{role}{For model terms created by this step, what analysis role should
 37 | they be assigned? By default, the function assumes that resulting distances
 38 | will be used as predictors in a model.}
 39 | 
 40 | \item{trained}{A logical to indicate if the quantities for preprocessing have
 41 | been estimated.}
 42 | 
 43 | \item{threshold}{A numeric value, in p-value/FDR units, where predictors with
 44 | values _smaller_ than the threshold will be retained. A value of `NA`
 45 | implies that this criterion will be ignored.}
 46 | 
 47 | \item{top_p}{An integer that will be used to select the predictors with the
 48 | smallest p/FDR values. A value of `NA` implies that this criterion will be
 49 | ignored.}
 50 | 
 51 | \item{exact}{Should an exact test be used?}
 52 | 
 53 | \item{fdr}{Should false discovery rates (FDR) be used instead of p-values?}
 54 | 
 55 | \item{exclude}{A character vector of predictor names that will be removed
 56 | from the data. This will be set when `prep()` is used on the recipe and
 57 | should not be set by the user.}
 58 | 
 59 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by
 60 | bake.recipe()? While all operations are baked when prep.recipe() is run,
 61 | some operations may not be able to be conducted on new data (e.g.
 62 | processing the outcome variable(s)). Care should be taken when using skip =
 63 | TRUE as it may affect the computations for subsequent operations.}
 64 | 
 65 | \item{id}{A character string that is unique to this step to identify it.}
 66 | 
 67 | \item{x}{A `step_select_xtab` object.}
 68 | }
 69 | \value{
 70 | An updated version of `recipe` with the new step added to the
 71 |  sequence of existing steps (if any). For the `tidy` method, a tibble with a
 72 |  `terms` column for which predictors were removed.
 73 | }
 74 | \description{
 75 | `step_select_xtab` creates a *specification* of a recipe step that will
 76 |  filter predictors using their relationship with the outcome as measured
 77 |  using statistical tests for association.
 78 | }
 79 | \details{
 80 | The recipe will stop if both `top_p` and `threshold` are left unspecified. If
 81 | both are used, they are combined via 'or'.
 82 | 
 83 | The Benjamini-Hochberg FDR correction is used (see [stats::p.adjust()]).
 84 | 
 85 | Warnings from [stats::chisq.test()] and [stats::fisher.test()] are suppressed.
 86 | }
 87 | \examples{
 88 | data(attrition, package = "modeldata")
 89 | 
 90 | rec <-
 91 |   recipe(Attrition ~ ., data = attrition) \%>\%
 92 |   step_select_xtab(all_nominal(), -all_outcomes(), outcome = "Attrition",
 93 |                    top_p = 1, threshold = 0.001, exact = TRUE) \%>\%
 94 |   prep()
 95 | 
 96 | rec \%>\% juice(all_nominal(), -all_outcomes()) \%>\% names()
 97 | 
 98 | tidy(rec, number = 1)
 99 | 
100 | }
101 | \concept{preprocessing}
102 | \concept{supervised_filter}
103 | \keyword{datagen}
104 | 


--------------------------------------------------------------------------------
/man/step_select_vip.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/step_select_vip.R
  3 | \name{step_select_vip}
  4 | \alias{step_select_vip}
  5 | \alias{tidy.step_select_vip}
  6 | \title{Feature selection step using a model's feature importance scores or
  7 | coefficients}
  8 | \usage{
  9 | step_select_vip(
 10 |   recipe,
 11 |   ...,
 12 |   outcome = NULL,
 13 |   role = "predictor",
 14 |   trained = FALSE,
 15 |   model = NULL,
 16 |   top_p = NA,
 17 |   threshold = NA,
 18 |   exclude = NULL,
 19 |   scores = NULL,
 20 |   skip = FALSE,
 21 |   id = recipes::rand_id("select_vip")
 22 | )
 23 | 
 24 | \method{tidy}{step_select_vip}(x, ...)
 25 | }
 26 | \arguments{
 27 | \item{recipe}{A recipe object. The step will be added to the sequence of
 28 | operations for this recipe.}
 29 | 
 30 | \item{...}{One or more selector functions to choose which variables are
 31 | affected by the step. See selections() for more details. For the tidy
 32 | method, these are not currently used.}
 33 | 
 34 | \item{outcome}{A character string with the name of the response variable to
 35 | use to calculate the feature importance scores.}
 36 | 
 37 | \item{role}{Not used by this step since no new variables are created.}
 38 | 
 39 | \item{trained}{A logical to indicate if the quantities for preprocessing have
 40 | been estimated.}
 41 | 
 42 | \item{model}{A `model_spec` object from `parsnip` that has a feature
 43 | importances or coefficients method. The model needs to have an equivalent
 44 | `pull_importances` method defined. See `?pull_importances` for how to
 45 | define methods for models that are not currently supported.}
 46 | 
 47 | \item{top_p}{An integer with the number of best scoring features to
 48 | select.}
 49 | 
 50 | \item{threshold}{A numeric value between 0 and 1 representing the percentile
 51 | of best scoring features to select. Features with scores that are _larger_
 52 | than the specified threshold will be retained, for example `threshold =
 53 | 0.9` will retain only predictors with scores in the top 90th percentile.
 54 | Note that this overrides `top_p`.}
 55 | 
 56 | \item{exclude}{A character vector of predictor names that will be removed
 57 | from the data. This will be set when `prep()` is used on the recipe and
 58 | should not be set by the user.}
 59 | 
 60 | \item{scores}{A tibble with 'variable' and 'scores' columns containing the
 61 | names of the variables and their feature importance scores. This parameter
 62 | is only produced after the recipe has been trained.}
 63 | 
 64 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by
 65 | bake.recipe()? While all operations are baked when prep.recipe() is run,
 66 | some operations may not be able to be conducted on new data (e.g.
 67 | processing the outcome variable(s)). Care should be taken when using skip =
 68 | TRUE as it may affect the computations for subsequent operations.}
 69 | 
 70 | \item{id}{A character string that is unique to this step to identify it.}
 71 | 
 72 | \item{x}{A `step_select_vip` object.}
 73 | }
 74 | \value{
 75 | a `step_select_vip` object.
 76 | }
 77 | \description{
 78 | `step_select_vip` creates a *specification* of a recipe step that selects a
 79 | subset of predictors based on the ranking of variable importance provided by
 80 | a `parsnip` model specification supplied through the `model` parameter.
 81 | }
 82 | \examples{
 83 | library(recipes)
 84 | library(parsnip)
 85 | 
 86 | # load the example cells dataset
 87 | data(cells, package = "modeldata")
 88 | 
 89 | # define a base model to use for feature importances
 90 | base_model <- rand_forest(mode = "classification") \%>\%
 91 |     set_engine("ranger", importance = "permutation")
 92 | 
 93 | # create a preprocessing recipe
 94 | rec <-
 95 |  recipe(class ~ ., data = cells[, -1]) \%>\%
 96 |  step_select_vip(all_predictors(), outcome = "class", model = base_model, top_p = 10, threshold = 0.9)
 97 | 
 98 | prepped <- prep(rec)
 99 | 
100 | preproc_data <- juice(prepped)
101 | prepped
102 | }
103 | 


--------------------------------------------------------------------------------
/man/step_select_carscore.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/step_select_carscore.R
  3 | \name{step_select_carscore}
  4 | \alias{step_select_carscore}
  5 | \alias{tidy.step_select_carscore}
  6 | \title{CAR score feature selection step}
  7 | \usage{
  8 | step_select_carscore(
  9 |   recipe,
 10 |   ...,
 11 |   outcome = NULL,
 12 |   role = NA,
 13 |   trained = FALSE,
 14 |   top_p = NA,
 15 |   threshold = NA,
 16 |   lambda = NA,
 17 |   diagonal = FALSE,
 18 |   exclude = NULL,
 19 |   scores = NULL,
 20 |   skip = FALSE,
 21 |   id = recipes::rand_id("select_carscore")
 22 | )
 23 | 
 24 | \method{tidy}{step_select_carscore}(x, ...)
 25 | }
 26 | \arguments{
 27 | \item{recipe}{A recipe object. The step will be added to the sequence of
 28 | operations for this recipe.}
 29 | 
 30 | \item{...}{One or more selector functions to choose which variables are
 31 | affected by the step. See selections() for more details. For the tidy
 32 | method, these are not currently used.}
 33 | 
 34 | \item{outcome}{A character string with the name of the response variable.
 35 | This must refer to a numeric feature for regression.}
 36 | 
 37 | \item{role}{Not used by this step since no new variables are created.}
 38 | 
 39 | \item{trained}{A logical to indicate if the quantities for preprocessing have
 40 | been estimated.}
 41 | 
 42 | \item{top_p}{An integer with the number of best scoring features to
 43 | select.}
 44 | 
 45 | \item{threshold}{A numeric value between 0 and 1 representing the percentile
 46 | of best scoring features to select. Features with scores that are _larger_
 47 | than the specified threshold will be retained, for example `threshold =
 48 | 0.9` will retain only predictors with scores in the top 90th percentile.
 49 | Note that this overrides `top_p`.}
 50 | 
 51 | \item{lambda}{The correlation shrinkage intensity (range 0-1).}
 52 | 
 53 | \item{diagonal}{For diagonal = FALSE (the default) CAR scores are computed;
 54 | otherwise with diagonal = TRUE marginal correlations.}
 55 | 
 56 | \item{exclude}{A character vector of predictor names that will be removed
 57 | from the data. This will be set when `prep()` is used on the recipe and
 58 | should not be set by the user.}
 59 | 
 60 | \item{scores}{A tibble with 'variable' and 'scores' columns containing the
 61 | names of the variables and the absolute values of the calculated CAR
 62 | scores. This parameter is only produced after the recipe has been trained.}
 63 | 
 64 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by
 65 | bake.recipe()? While all operations are baked when prep.recipe() is run,
 66 | some operations may not be able to be conducted on new data (e.g.
 67 | processing the outcome variable(s)). Care should be taken when using skip =
 68 | TRUE as it may affect the computations for subsequent operations.}
 69 | 
 70 | \item{id}{A character string that is unique to this step to identify it.}
 71 | 
 72 | \item{x}{A `step_select_carscore` object.}
 73 | }
 74 | \value{
 75 | A step_select_carscore object.
 76 | }
 77 | \description{
 78 | `step_select_carscore` creates a *specification* of a recipe step that
 79 | selects a subset of predictors as part of a regression model based on the
 80 | scores of the CAR score algorithm. This step requires the `care` package to be
 81 | installed. The top `top_p` scoring features, or features whose scores occur
 82 | in the top percentile `threshold` will be retained as new predictors.
 83 | }
 84 | \details{
 85 | The recipe will stop if both `top_p` and `threshold` are left unspecified.
 86 | }
 87 | \examples{
 88 | library(recipes)
 89 | 
 90 | data(car_prices, package = "modeldata")
 91 | 
 92 | rec <-
 93 |  recipe(Price ~ ., data = car_prices) \%>\%
 94 |  step_select_carscore(all_predictors(), outcome = "Price", top_p = 5, threshold = 0.7)
 95 | 
 96 | prepped <- prep(rec)
 97 | 
 98 | new_data <- juice(prepped)
 99 | prepped
100 | }
101 | \concept{preprocessing}
102 | \concept{supervised_filter}
103 | \keyword{datagen}
104 | 


--------------------------------------------------------------------------------
/man/step_select_tree.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/step_select_tree.R
  3 | \name{step_select_tree}
  4 | \alias{step_select_tree}
  5 | \alias{tidy.step_select_tree}
  6 | \title{Feature selection step using decision tree importance scores}
  7 | \usage{
  8 | step_select_tree(
  9 |   recipe,
 10 |   ...,
 11 |   outcome = NULL,
 12 |   role = "predictor",
 13 |   trained = FALSE,
 14 |   engine = "rpart",
 15 |   cost_complexity = NULL,
 16 |   tree_depth = NULL,
 17 |   min_n = NULL,
 18 |   top_p = NA,
 19 |   threshold = NA,
 20 |   exclude = NULL,
 21 |   scores = NULL,
 22 |   skip = FALSE,
 23 |   id = recipes::rand_id("select_tree")
 24 | )
 25 | 
 26 | \method{tidy}{step_select_tree}(x, ...)
 27 | }
 28 | \arguments{
 29 | \item{recipe}{A recipe object. The step will be added to the sequence of
 30 | operations for this recipe.}
 31 | 
 32 | \item{...}{One or more selector functions to choose which variables are
 33 | affected by the step. See selections() for more details. For the tidy
 34 | method, these are not currently used.}
 35 | 
 36 | \item{outcome}{A character string with the name of the response variable to
 37 | use to calculate the feature importance scores.}
 38 | 
 39 | \item{role}{Not used by this step since no new variables are created.}
 40 | 
 41 | \item{trained}{A logical to indicate if the quantities for preprocessing have
 42 | been estimated.}
 43 | 
 44 | \item{engine}{A decision_tree engine that is supported by parsnip.
 45 | The default is "rpart".}
 46 | 
 47 | \item{cost_complexity}{A positive number for the cost/complexity
 48 | parameter (a.k.a. Cp) used by CART models (specific engines only).}
 49 | 
 50 | \item{tree_depth}{An integer for maximum depth of the tree.}
 51 | 
 52 | \item{min_n}{An integer for the minimum number of data points in a node that
 53 | are required for the node to be split further.}
 54 | 
 55 | \item{top_p}{An integer with the number of best scoring features to
 56 | select.}
 57 | 
 58 | \item{threshold}{A numeric value between 0 and 1 representing the percentile
 59 | of best scoring features to select. Features with scores that are _larger_
 60 | than the specified threshold will be retained, for example `threshold =
 61 | 0.9` will retain only predictors with scores in the top 90th percentile.
 62 | Note that this overrides `top_p`.}
 63 | 
 64 | \item{exclude}{A character vector of predictor names that will be removed
 65 | from the data. This will be set when `prep()` is used on the recipe and
 66 | should not be set by the user.}
 67 | 
 68 | \item{scores}{A tibble with 'variable' and 'scores' columns containing the
 69 | names of the variables and their feature importance scores. This parameter
 70 | is only produced after the recipe has been trained.}
 71 | 
 72 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by
 73 | bake.recipe()? While all operations are baked when prep.recipe() is run,
 74 | some operations may not be able to be conducted on new data (e.g.
 75 | processing the outcome variable(s)). Care should be taken when using skip =
 76 | TRUE as it may affect the computations for subsequent operations.}
 77 | 
 78 | \item{id}{A character string that is unique to this step to identify it.}
 79 | 
 80 | \item{x}{A `step_select_tree` object.}
 81 | }
 82 | \value{
 83 | a `step_select_tree` object.
 84 | }
 85 | \description{
 86 | `step_select_tree` creates a *specification* of a recipe step that selects a
 87 | subset of predictors based on the ranking of variable importance provided by
 88 | a `parsnip::decision_tree` supported model.
 89 | }
 90 | \examples{
 91 | library(recipes)
 92 | library(parsnip)
 93 | 
 94 | # load the example cells dataset
 95 | data(cells, package = "modeldata")
 96 | 
 97 | # create a preprocessing recipe
 98 | rec <-
 99 |  recipe(class ~ ., data = cells[, -1]) \%>\%
100 |  step_select_tree(all_predictors(), outcome = "class", top_p = 10,
101 |                      threshold = 0.9)
102 | 
103 | prepped <- prep(rec)
104 | 
105 | preproc_data <- juice(prepped)
106 | prepped
107 | }
108 | 


--------------------------------------------------------------------------------
/man/step_select_infgain.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/step_select_infgain.R
  3 | \name{step_select_infgain}
  4 | \alias{step_select_infgain}
  5 | \alias{tidy.step_select_infgain}
  6 | \title{Information gain feature selection step}
  7 | \usage{
  8 | step_select_infgain(
  9 |   recipe,
 10 |   ...,
 11 |   outcome = NULL,
 12 |   role = NA,
 13 |   trained = FALSE,
 14 |   top_p = NA,
 15 |   threshold = NA,
 16 |   type = "infogain",
 17 |   threads = 1,
 18 |   exclude = NULL,
 19 |   scores = NULL,
 20 |   skip = FALSE,
 21 |   id = recipes::rand_id("select_infgain")
 22 | )
 23 | 
 24 | \method{tidy}{step_select_infgain}(x, ...)
 25 | }
 26 | \arguments{
 27 | \item{recipe}{A recipe object. The step will be added to the sequence of
 28 | operations for this recipe.}
 29 | 
 30 | \item{...}{One or more selector functions to choose which variables are
 31 | affected by the step. See selections() for more details. For the tidy
 32 | method, these are not currently used.}
 33 | 
 34 | \item{outcome}{A character string with the name of the response variable to
 35 | use to evaluate information gain value against the predictors.}
 36 | 
 37 | \item{role}{Not used by this step since no new variables are created.}
 38 | 
 39 | \item{trained}{A logical to indicate if the quantities for preprocessing have
 40 | been estimated.}
 41 | 
 42 | \item{top_p}{An integer with the number of best scoring features to
 43 | select.}
 44 | 
 45 | \item{threshold}{A numeric value between 0 and 1 representing the percentile
 46 | of best scoring features to select. Features with scores that are _larger_
 47 | than the specified threshold will be retained, for example `threshold =
 48 | 0.9` will retain only predictors with scores in the top 90th percentile.
 49 | Note that this overrides `top_p`.}
 50 | 
 51 | \item{type}{A character string specifying the information gain method to use.
 52 | One of "infogain", "gainratio", "symuncert". The default is 'infogain'.}
 53 | 
 54 | \item{threads}{An integer specifying the number of threads to use for
 55 | processing. The default is 1; setting `threads = 0` uses all available threads.}
 56 | 
 57 | \item{exclude}{A character vector of predictor names that will be removed
 58 | from the data. This will be set when `prep()` is used on the recipe and
 59 | should not be set by the user.}
 60 | 
 61 | \item{scores}{A tibble with 'variable' and 'scores' columns containing the
 62 | names of the variables and their information gain scores. This parameter is
 63 | only produced after the recipe has been trained.}
 64 | 
 65 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by
 66 | bake.recipe()? While all operations are baked when prep.recipe() is run,
 67 | some operations may not be able to be conducted on new data (e.g.
 68 | processing the outcome variable(s)). Care should be taken when using skip =
 69 | TRUE as it may affect the computations for subsequent operations.}
 70 | 
 71 | \item{id}{A character string that is unique to this step to identify it.}
 72 | 
 73 | \item{x}{A `step_select_infgain` object.}
 74 | }
 75 | \value{
 76 | A step_select_infgain object.
 77 | }
 78 | \description{
 79 | `step_select_infgain` creates a *specification* of a recipe step that selects a
 80 | subset of predictors based on the scores of the information gain algorithm.
 81 | This step requires the FSelectorRcpp package to be installed. The top
 82 | `top_p` scoring features, or features whose scores occur in the top
 83 | percentile `threshold` will be retained as new predictors.
 84 | }
 85 | \details{
 86 | The recipe will stop if both `top_p` and `threshold` are left unspecified.
 87 | }
 88 | \examples{
 89 | library(recipes)
 90 | 
 91 | data(cells, package = "modeldata")
 92 | 
 93 | rec <-
 94 |  recipe(class ~ ., data = cells[, -1]) \%>\%
 95 |  step_select_infgain(all_predictors(), outcome = "class", top_p = 10, threshold = 0.9)
 96 | 
 97 | prepped <- prep(rec)
 98 | 
 99 | new_data <- juice(prepped)
100 | prepped
101 | }
102 | \concept{preprocessing}
103 | \concept{supervised_filter}
104 | \keyword{datagen}
105 | 


--------------------------------------------------------------------------------
/man/step_select_linear.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/step_select_linear.R
  3 | \name{step_select_linear}
  4 | \alias{step_select_linear}
  5 | \alias{tidy.step_select_linear}
  6 | \title{Feature selection step using the magnitude of a linear model's coefficients}
  7 | \usage{
  8 | step_select_linear(
  9 |   recipe,
 10 |   ...,
 11 |   outcome = NULL,
 12 |   role = "predictor",
 13 |   trained = FALSE,
 14 |   engine = "glm",
 15 |   penalty = NULL,
 16 |   mixture = NULL,
 17 |   top_p = NA,
 18 |   threshold = NA,
 19 |   exclude = NULL,
 20 |   scores = NULL,
 21 |   skip = FALSE,
 22 |   id = recipes::rand_id("select_linear")
 23 | )
 24 | 
 25 | \method{tidy}{step_select_linear}(x, ...)
 26 | }
 27 | \arguments{
 28 | \item{recipe}{A recipe object. The step will be added to the sequence of
 29 | operations for this recipe.}
 30 | 
 31 | \item{...}{One or more selector functions to choose which variables are
 32 | affected by the step. See selections() for more details. For the tidy
 33 | method, these are not currently used.}
 34 | 
 35 | \item{outcome}{A character string with the name of the response variable to
 36 | use to calculate the feature importance scores.}
 37 | 
 38 | \item{role}{Not used by this step since no new variables are created.}
 39 | 
 40 | \item{trained}{A logical to indicate if the quantities for preprocessing have
 41 | been estimated.}
 42 | 
 43 | \item{engine}{A linear_reg or logistic_reg engine that is supported by parsnip.
 44 | The default is "glm".}
 45 | 
 46 | \item{penalty}{A non-negative number representing the total amount of
 47 | regularization (specific engines only).}
 48 | 
 49 | \item{mixture}{A number between zero and one (inclusive) that is the
 50 | proportion of L1 regularization (i.e. lasso) in the model. When mixture =
 51 | 1, it is a pure lasso model while mixture = 0 indicates that ridge
 52 | regression is being used (specific engines only).}
 53 | 
 54 | \item{top_p}{An integer with the number of best scoring features to
 55 | select.}
 56 | 
 57 | \item{threshold}{A numeric value between 0 and 1 representing the percentile
 58 | of best scoring features to select. Features with scores that are _larger_
 59 | than the specified threshold will be retained, for example `threshold =
 60 | 0.9` will retain only predictors with scores in the top 90th percentile.
 61 | Note that this overrides `top_p`.}
 62 | 
 63 | \item{exclude}{A character vector of predictor names that will be removed
 64 | from the data. This will be set when `prep()` is used on the recipe and
 65 | should not be set by the user.}
 66 | 
 67 | \item{scores}{A tibble with 'variable' and 'scores' columns containing the
 68 | names of the variables and their feature importance scores. This parameter
 69 | is only produced after the recipe has been trained.}
 70 | 
 71 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by
 72 | bake.recipe()? While all operations are baked when prep.recipe() is run,
 73 | some operations may not be able to be conducted on new data (e.g.
 74 | processing the outcome variable(s)). Care should be taken when using skip =
 75 | TRUE as it may affect the computations for subsequent operations.}
 76 | 
 77 | \item{id}{A character string that is unique to this step to identify it.}
 78 | 
 79 | \item{x}{A `step_select_linear` object.}
 80 | }
 81 | \value{
 82 | a `step_select_linear` object.
 83 | }
 84 | \description{
 85 | `step_select_linear` creates a *specification* of a recipe step that selects
 86 | a subset of predictors based on the ranking of the magnitude of coefficients
 87 | provided by a `parsnip::linear_reg` or `parsnip::logistic_reg` model.
 88 | }
 89 | \examples{
 90 | library(recipes)
 91 | library(parsnip)
 92 | 
 93 | # load the example cells dataset
 94 | data(cells, package = "modeldata")
 95 | 
 96 | # create a preprocessing recipe
 97 | rec <-
 98 |  recipe(class ~ ., data = cells[, -1]) \%>\%
 99 |  step_select_linear(all_predictors(), outcome = "class", top_p = 10,
100 |                      threshold = 0.9)
101 | 
102 | prepped <- prep(rec)
103 | 
104 | preproc_data <- juice(prepped)
105 | prepped
106 | }
107 | 


--------------------------------------------------------------------------------
/man/step_select_forests.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/step_select_forests.R
  3 | \name{step_select_forests}
  4 | \alias{step_select_forests}
  5 | \title{Feature selection step using random forest feature importance scores}
  6 | \usage{
  7 | step_select_forests(
  8 |   recipe,
  9 |   ...,
 10 |   outcome = NULL,
 11 |   role = "predictor",
 12 |   trained = FALSE,
 13 |   engine = "ranger",
 14 |   options = list(importance = "permutation"),
 15 |   mtry = NULL,
 16 |   trees = NULL,
 17 |   min_n = NULL,
 18 |   top_p = NA,
 19 |   threshold = NA,
 20 |   exclude = NULL,
 21 |   scores = NULL,
 22 |   skip = FALSE,
 23 |   id = recipes::rand_id("select_forests")
 24 | )
 25 | }
 26 | \arguments{
 27 | \item{recipe}{A recipe object. The step will be added to the sequence of
 28 | operations for this recipe.}
 29 | 
 30 | \item{...}{One or more selector functions to choose which variables are
 31 | affected by the step. See selections() for more details. For the tidy
 32 | method, these are not currently used.}
 33 | 
 34 | \item{outcome}{A character string with the name of the response variable to
 35 | use to calculate the feature importance scores.}
 36 | 
 37 | \item{role}{Not used by this step since no new variables are created.}
 38 | 
 39 | \item{trained}{A logical to indicate if the quantities for preprocessing have
 40 | been estimated.}
 41 | 
 42 | \item{engine}{A supported rand_forest engine that is supported by parsnip.
 43 | The default is "ranger".}
 44 | 
 45 | \item{options}{A named list of options to pass to the rand_forest engine. For
 46 | example, if `engine = 'ranger'` (the default) then options could be
 47 | `list(importance = 'permutation')` because a feature importance method needs
 48 | to be specified for this engine. This is the default.}
 49 | 
 50 | \item{mtry}{An integer for the number of predictors that will be randomly
 51 | sampled at each split when creating the tree models.}
 52 | 
 53 | \item{trees}{An integer for the number of trees contained in the ensemble.}
 54 | 
 55 | \item{min_n}{An integer for the minimum number of data points in a node that
 56 | are required for the node to be split further.}
 57 | 
 58 | \item{top_p}{An integer with the number of best scoring features to
 59 | select.}
 60 | 
 61 | \item{threshold}{A numeric value between 0 and 1 representing the percentile
 62 | of best scoring features to select. Features with scores that are _larger_
 63 | than the specified threshold will be retained, for example `threshold =
 64 | 0.9` will retain only predictors with scores in the top 90th percentile.
 65 | Note that this overrides `top_p`.}
 66 | 
 67 | \item{exclude}{A character vector of predictor names that will be removed
 68 | from the data. This will be set when `prep()` is used on the recipe and
 69 | should not be set by the user.}
 70 | 
 71 | \item{scores}{A tibble with 'variable' and 'scores' columns containing the
 72 | names of the variables and their feature importance scores. This parameter
 73 | is only produced after the recipe has been trained.}
 74 | 
 75 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by
 76 | bake.recipe()? While all operations are baked when prep.recipe() is run,
 77 | some operations may not be able to be conducted on new data (e.g.
 78 | processing the outcome variable(s)). Care should be taken when using skip =
 79 | TRUE as it may affect the computations for subsequent operations.}
 80 | 
 81 | \item{id}{A character string that is unique to this step to identify it.}
 82 | }
 83 | \value{
 84 | a `step_select_forests` object.
 85 | }
 86 | \description{
 87 | `step_select_forests` creates a *specification* of a recipe step that selects
 88 | a subset of predictors based on the ranking of variable importance using
 89 | a `parsnip::rand_forest` supported model.
 90 | }
 91 | \examples{
 92 | library(recipes)
 93 | library(parsnip)
 94 | 
 95 | # load the example cells dataset
 96 | data(cells, package = "modeldata")
 97 | 
 98 | # create a preprocessing recipe
 99 | rec <-
100 |  recipe(class ~ ., data = cells[, -1]) \%>\%
101 |  step_select_forests(all_predictors(), outcome = "class", top_p = 10,
102 |                      threshold = 0.9)
103 | 
104 | prepped <- prep(rec)
105 | 
106 | preproc_data <- juice(prepped)
107 | prepped
108 | }
109 | 


--------------------------------------------------------------------------------
/docs/LICENSE-text.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <!-- Generated by pkgdown: do not edit by hand --><html lang="en"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"><meta charset="utf-8"><meta http-equiv="X-UA-Compatible" content="IE=edge"><meta name="viewport" content="width=device-width, initial-scale=1.0"><title>License • recipeselectors</title><!-- jquery --><script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.4.1/jquery.min.js" integrity="sha256-CSXorXvZcTkaix6Yvo6HppcZGetbYMGWSFlBw8HfCJo=" crossorigin="anonymous"></script><!-- Bootstrap --><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/3.4.1/css/bootstrap.min.css" integrity="sha256-bZLfwXAP04zRMK2BjiO8iu9pf4FbLqX6zitd+tIvLhE=" crossorigin="anonymous"><script src="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/3.4.1/js/bootstrap.min.js" integrity="sha256-nuL8/2cJ5NDSSwnKD8VqreErSWHtnEP9E7AySL+1ev4=" crossorigin="anonymous"></script><!-- bootstrap-toc --><link rel="stylesheet" href="bootstrap-toc.css"><script src="bootstrap-toc.js"></script><!-- Font Awesome icons --><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/all.min.css" integrity="sha256-mmgLkCYLUQbXn0B1SRqzHar6dCnv9oZFPEC1g1cwlkk=" crossorigin="anonymous"><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/v4-shims.min.css" integrity="sha256-wZjR52fzng1pJHwx4aV2AO3yyTOXrcDW7jBpJtTwVxw=" crossorigin="anonymous"><!-- clipboard.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.6/clipboard.min.js" integrity="sha256-inc5kl9MA1hkeYUt+EC3BhlIgyp/2jDIyBLS6k3UxPI=" crossorigin="anonymous"></script><!-- headroom.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/headroom.min.js" integrity="sha256-AsUX4SJE1+yuDu5+mAVzJbuYNPHj/WroHuZ8Ir/CkE0=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/jQuery.headroom.min.js" 
integrity="sha256-ZX/yNShbjqsohH1k95liqY9Gd8uOiE1S4vZc+9KQ1K4=" crossorigin="anonymous"></script><!-- pkgdown --><link href="pkgdown.css" rel="stylesheet"><script src="pkgdown.js"></script><meta property="og:title" content="License"><!-- mathjax --><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js" integrity="sha256-nvJJv9wWKEm88qvoQl9ekL2J+k/RWIsaSScxxlsrv8k=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/config/TeX-AMS-MML_HTMLorMML.js" integrity="sha256-84DKXVJXs0/F8OTMzX4UR909+jtl4G7SPypPavF+GfA=" crossorigin="anonymous"></script><!--[if lt IE 9]>
 3 | <script src="https://oss.maxcdn.com/html5shiv/3.7.3/html5shiv.min.js"></script>
 4 | <script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
 5 | <![endif]--></head><body data-spy="scroll" data-target="#toc">
 6 |     
 7 | 
 8 |     <div class="container template-title-body">
 9 |       <header><div class="navbar navbar-default navbar-fixed-top" role="navigation">
10 |   <div class="container">
11 |     <div class="navbar-header">
12 |       <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar" aria-expanded="false">
13 |         <span class="sr-only">Toggle navigation</span>
14 |         <span class="icon-bar"></span>
15 |         <span class="icon-bar"></span>
16 |         <span class="icon-bar"></span>
17 |       </button>
18 |       <span class="navbar-brand">
19 |         <a class="navbar-link" href="index.html">recipeselectors</a>
20 |         <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="">0.0.1</span>
21 |       </span>
22 |     </div>
23 | 
24 |     <div id="navbar" class="navbar-collapse collapse">
25 |       <ul class="nav navbar-nav"><li>
26 |   <a href="reference/index.html">Reference</a>
27 | </li>
28 |       </ul><ul class="nav navbar-nav navbar-right"><li>
29 |   <a href="https://github.com/stevenpawley/recipeselectors/" class="external-link">
30 |     <span class="fab fa-github fa-lg"></span>
31 |      
32 |   </a>
33 | </li>
34 |       </ul></div><!--/.nav-collapse -->
35 |   </div><!--/.container -->
36 | </div><!--/.navbar -->
37 | 
38 |       
39 | 
40 |       </header><div class="row">
41 |   <div class="contents col-md-9">
42 |     <div class="page-header">
43 |       <h1>License</h1>
44 |     </div>
45 | 
46 | <pre>YEAR: 2019
47 | COPYRIGHT HOLDER: Steven Pawley
48 | </pre>
49 | 
50 |   </div>
51 | 
52 |   <div class="col-md-3 hidden-xs hidden-sm" id="pkgdown-sidebar">
53 |     <nav id="toc" data-toggle="toc" class="sticky-top"><h2 data-toc-skip>Contents</h2>
54 |     </nav></div>
55 | 
56 | </div>
57 | 
58 | 
59 | 
60 |       <footer><div class="copyright">
61 |   <p></p><p>Developed by Steven Pawley.</p>
62 | </div>
63 | 
64 | <div class="pkgdown">
65 |   <p></p><p>Site built with <a href="https://pkgdown.r-lib.org/" class="external-link">pkgdown</a>
66 | 2.0.2.</p>
67 | </div>
68 | 
69 |       </footer></div>
70 | 
71 |   
72 | 
73 | 
74 |   
75 | 
76 |   </body></html>
77 | 
78 | 


--------------------------------------------------------------------------------
/docs/404.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <!-- Generated by pkgdown: do not edit by hand --><html lang="en">
  3 | <head>
  4 | <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
  5 | <meta charset="utf-8">
  6 | <meta http-equiv="X-UA-Compatible" content="IE=edge">
  7 | <meta name="viewport" content="width=device-width, initial-scale=1.0">
  8 | <title>Page not found (404) • recipeselectors</title>
  9 | <!-- jquery --><script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.4.1/jquery.min.js" integrity="sha256-CSXorXvZcTkaix6Yvo6HppcZGetbYMGWSFlBw8HfCJo=" crossorigin="anonymous"></script><!-- Bootstrap --><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/3.4.1/css/bootstrap.min.css" integrity="sha256-bZLfwXAP04zRMK2BjiO8iu9pf4FbLqX6zitd+tIvLhE=" crossorigin="anonymous">
 10 | <script src="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/3.4.1/js/bootstrap.min.js" integrity="sha256-nuL8/2cJ5NDSSwnKD8VqreErSWHtnEP9E7AySL+1ev4=" crossorigin="anonymous"></script><!-- bootstrap-toc --><link rel="stylesheet" href="bootstrap-toc.css">
 11 | <script src="bootstrap-toc.js"></script><!-- Font Awesome icons --><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/all.min.css" integrity="sha256-mmgLkCYLUQbXn0B1SRqzHar6dCnv9oZFPEC1g1cwlkk=" crossorigin="anonymous">
 12 | <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/v4-shims.min.css" integrity="sha256-wZjR52fzng1pJHwx4aV2AO3yyTOXrcDW7jBpJtTwVxw=" crossorigin="anonymous">
 13 | <!-- clipboard.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.6/clipboard.min.js" integrity="sha256-inc5kl9MA1hkeYUt+EC3BhlIgyp/2jDIyBLS6k3UxPI=" crossorigin="anonymous"></script><!-- headroom.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/headroom.min.js" integrity="sha256-AsUX4SJE1+yuDu5+mAVzJbuYNPHj/WroHuZ8Ir/CkE0=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/jQuery.headroom.min.js" integrity="sha256-ZX/yNShbjqsohH1k95liqY9Gd8uOiE1S4vZc+9KQ1K4=" crossorigin="anonymous"></script><!-- pkgdown --><link href="pkgdown.css" rel="stylesheet">
 14 | <script src="pkgdown.js"></script><meta property="og:title" content="Page not found (404)">
 15 | <!-- mathjax --><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js" integrity="sha256-nvJJv9wWKEm88qvoQl9ekL2J+k/RWIsaSScxxlsrv8k=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/config/TeX-AMS-MML_HTMLorMML.js" integrity="sha256-84DKXVJXs0/F8OTMzX4UR909+jtl4G7SPypPavF+GfA=" crossorigin="anonymous"></script><!--[if lt IE 9]>
 16 | <script src="https://oss.maxcdn.com/html5shiv/3.7.3/html5shiv.min.js"></script>
 17 | <script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
 18 | <![endif]-->
 19 | </head>
 20 | <body data-spy="scroll" data-target="#toc">
 21 |     
 22 | 
 23 |     <div class="container template-title-body">
 24 |       <header><div class="navbar navbar-default navbar-fixed-top" role="navigation">
 25 |   <div class="container">
 26 |     <div class="navbar-header">
 27 |       <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar" aria-expanded="false">
 28 |         <span class="sr-only">Toggle navigation</span>
 29 |         <span class="icon-bar"></span>
 30 |         <span class="icon-bar"></span>
 31 |         <span class="icon-bar"></span>
 32 |       </button>
 33 |       <span class="navbar-brand">
 34 |         <a class="navbar-link" href="index.html">recipeselectors</a>
 35 |         <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="">0.0.1</span>
 36 |       </span>
 37 |     </div>
 38 | 
 39 |     <div id="navbar" class="navbar-collapse collapse">
 40 |       <ul class="nav navbar-nav">
 41 | <li>
 42 |   <a href="reference/index.html">Reference</a>
 43 | </li>
 44 |       </ul>
 45 | <ul class="nav navbar-nav navbar-right">
 46 | <li>
 47 |   <a href="https://github.com/stevenpawley/recipeselectors/" class="external-link">
 48 |     <span class="fab fa-github fa-lg"></span>
 49 |      
 50 |   </a>
 51 | </li>
 52 |       </ul>
 53 | </div>
 54 | <!--/.nav-collapse -->
 55 |   </div>
 56 | <!--/.container -->
 57 | </div>
 58 | <!--/.navbar -->
 59 | 
 60 |       
 61 | 
 62 |       </header><div class="row">
 63 |   <div class="contents col-md-9">
 64 |     <div class="page-header">
 65 |       <h1>Page not found (404)</h1>
 66 |     </div>
 67 | 
 68 | Content not found. Please use links in the navbar.
 69 | 
 70 |   </div>
 71 | 
 72 |   <div class="col-md-3 hidden-xs hidden-sm" id="pkgdown-sidebar">
 73 |     <nav id="toc" data-toggle="toc" class="sticky-top"><h2 data-toc-skip>Contents</h2>
 74 |     </nav>
 75 | </div>
 76 | 
 77 | </div>
 78 | 
 79 | 
 80 | 
 81 |       <footer><div class="copyright">
 82 |   <p></p>
 83 | <p>Developed by Steven Pawley.</p>
 84 | </div>
 85 | 
 86 | <div class="pkgdown">
 87 |   <p></p>
 88 | <p>Site built with <a href="https://pkgdown.r-lib.org/" class="external-link">pkgdown</a>
 89 | 2.0.2.</p>
 90 | </div>
 91 | 
 92 |       </footer>
 93 | </div>
 94 | 
 95 |   
 96 | 
 97 | 
 98 |   
 99 | 
100 |   </body>
101 | </html>
102 | 


--------------------------------------------------------------------------------
/docs/bootstrap-toc.js:
--------------------------------------------------------------------------------
  1 | /*!
  2 |  * Bootstrap Table of Contents v0.4.1 (http://afeld.github.io/bootstrap-toc/)
  3 |  * Copyright 2015 Aidan Feldman
  4 |  * Licensed under MIT (https://github.com/afeld/bootstrap-toc/blob/gh-pages/LICENSE.md) */
// Defines the global `window.Toc` object (helpers + `init`) and, on DOM ready,
// builds a table of contents inside every `nav[data-toggle="toc"]` element.
// Relies on a global jQuery `$` being loaded before this script.
(function() {
  'use strict';

  window.Toc = {
    helpers: {
      // return all matching elements in the set, or their descendants
      // (elements carrying a `data-toc-skip` attribute are filtered out)
      findOrFilter: function($el, selector) {
        // http://danielnouri.org/notes/2011/03/14/a-jquery-find-that-also-finds-the-root-element/
        // http://stackoverflow.com/a/12731439/358804
        var $descendants = $el.find(selector);
        return $el.filter(selector).add($descendants).filter(':not([data-toc-skip])');
      },

      // Derive an ID candidate from the element's text: trimmed, lower-cased,
      // with each run of non-alphanumeric characters replaced by a single '-'.
      // Falls back to the lower-cased tag name when that yields an empty string.
      generateUniqueIdBase: function(el) {
        var text = $(el).text();
        var anchor = text.trim().toLowerCase().replace(/[^A-Za-z0-9]+/g, '-');
        return anchor || el.tagName.toLowerCase();
      },

      // Return the base ID, or the base ID with a '-1', '-2', ... suffix,
      // choosing the first candidate that no element in the document uses yet.
      generateUniqueId: function(el) {
        var anchorBase = this.generateUniqueIdBase(el);
        for (var i = 0; ; i++) {
          var anchor = anchorBase;
          if (i > 0) {
            // add suffix
            anchor += '-' + i;
          }
          // check if ID already exists
          if (!document.getElementById(anchor)) {
            return anchor;
          }
        }
      },

      // Return the element's existing id; if it has none, generate a unique
      // one, assign it to the element (side effect), and return it.
      generateAnchor: function(el) {
        if (el.id) {
          return el.id;
        } else {
          var anchor = this.generateUniqueId(el);
          el.id = anchor;
          return anchor;
        }
      },

      // Create a new, detached `<ul class="nav">` as a jQuery object.
      createNavList: function() {
        return $('<ul class="nav"></ul>');
      },

      // Create a new nav list, append it to $parent, and return the new list.
      createChildNavList: function($parent) {
        var $childList = this.createNavList();
        $parent.append($childList);
        return $childList;
      },

      // Build `<li><a href="#anchor">text</a></li>`; the label is set with
      // `.text()`, so it is inserted as plain text rather than as markup.
      generateNavEl: function(anchor, text) {
        var $a = $('<a></a>');
        $a.attr('href', '#' + anchor);
        $a.text(text);
        var $li = $('<li></li>');
        $li.append($a);
        return $li;
      },

      // Build the nav entry for one heading. The label comes from the
      // heading's `data-toc-text` attribute when set, otherwise from its text.
      generateNavItem: function(headingEl) {
        var anchor = this.generateAnchor(headingEl);
        var $heading = $(headingEl);
        var text = $heading.data('toc-text') || $heading.text();
        return this.generateNavEl(anchor, text);
      },

      // Find the first heading level (`<h1>`, then `<h2>`, etc.) that has more than one element. Defaults to 1 (for `<h1>`).
      getTopLevel: function($scope) {
        for (var i = 1; i <= 6; i++) {
          var $headings = this.findOrFilter($scope, 'h' + i);
          if ($headings.length > 1) {
            return i;
          }
        }

        return 1;
      },

      // returns the elements for the top level, and the next below it
      getHeadings: function($scope, topLevel) {
        var topSelector = 'h' + topLevel;

        var secondaryLevel = topLevel + 1;
        var secondarySelector = 'h' + secondaryLevel;

        return this.findOrFilter($scope, topSelector + ',' + secondarySelector);
      },

      // Parse the heading level from the second character of the tag name
      // (e.g. 'H2' -> 2).
      getNavLevel: function(el) {
        return parseInt(el.tagName.charAt(1), 10);
      },

      // Append one nav item per heading. Top-level headings go directly into
      // $topContext; other headings go into a child list that is created under
      // the most recently added item the first time one is needed.
      populateNav: function($topContext, topLevel, $headings) {
        var $context = $topContext;
        var $prevNav;

        // keep a reference to the helpers object for use inside the callback
        var helpers = this;
        $headings.each(function(i, el) {
          var $newNav = helpers.generateNavItem(el);
          var navLevel = helpers.getNavLevel(el);

          // determine the proper $context
          if (navLevel === topLevel) {
            // use top level
            $context = $topContext;
          } else if ($prevNav && $context === $topContext) {
            // create a new level of the tree and switch to it
            $context = helpers.createChildNavList($prevNav);
          } // else use the current $context

          $context.append($newNav);

          $prevNav = $newNav;
        });
      },

      // Normalise the argument of `init` into an options object. A jQuery
      // object (detected via its `jquery` property) is wrapped as `{ $nav: arg }`;
      // anything else is used as the options object itself. `$scope` defaults
      // to the document body. Note that a passed-in options object is mutated.
      parseOps: function(arg) {
        var opts;
        if (arg.jquery) {
          opts = {
            $nav: arg
          };
        } else {
          opts = arg;
        }
        opts.$scope = opts.$scope || $(document.body);
        return opts;
      }
    },

    // accepts a jQuery object, or an options object
    // (options read here: `$nav`, the container to fill, and `$scope`, the
    // element whose headings are collected)
    init: function(opts) {
      opts = this.helpers.parseOps(opts);

      // ensure that the data attribute is in place for styling
      opts.$nav.attr('data-toggle', 'toc');

      var $topContext = this.helpers.createChildNavList(opts.$nav);
      var topLevel = this.helpers.getTopLevel(opts.$scope);
      var $headings = this.helpers.getHeadings(opts.$scope, topLevel);
      this.helpers.populateNav($topContext, topLevel, $headings);
    }
  };

  // On DOM ready, initialise a table of contents in every matching <nav>.
  $(function() {
    $('nav[data-toggle="toc"]').each(function(i, el) {
      var $nav = $(el);
      Toc.init($nav);
    });
  });
})();
160 | 


--------------------------------------------------------------------------------
/docs/reference/pipe.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <!-- Generated by pkgdown: do not edit by hand --><html lang="en"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"><meta charset="utf-8"><meta http-equiv="X-UA-Compatible" content="IE=edge"><meta name="viewport" content="width=device-width, initial-scale=1.0"><title>Pipe operator — %&gt;% • recipeselectors</title><!-- jquery --><script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.4.1/jquery.min.js" integrity="sha256-CSXorXvZcTkaix6Yvo6HppcZGetbYMGWSFlBw8HfCJo=" crossorigin="anonymous"></script><!-- Bootstrap --><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/3.4.1/css/bootstrap.min.css" integrity="sha256-bZLfwXAP04zRMK2BjiO8iu9pf4FbLqX6zitd+tIvLhE=" crossorigin="anonymous"><script src="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/3.4.1/js/bootstrap.min.js" integrity="sha256-nuL8/2cJ5NDSSwnKD8VqreErSWHtnEP9E7AySL+1ev4=" crossorigin="anonymous"></script><!-- bootstrap-toc --><link rel="stylesheet" href="../bootstrap-toc.css"><script src="../bootstrap-toc.js"></script><!-- Font Awesome icons --><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/all.min.css" integrity="sha256-mmgLkCYLUQbXn0B1SRqzHar6dCnv9oZFPEC1g1cwlkk=" crossorigin="anonymous"><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/v4-shims.min.css" integrity="sha256-wZjR52fzng1pJHwx4aV2AO3yyTOXrcDW7jBpJtTwVxw=" crossorigin="anonymous"><!-- clipboard.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.6/clipboard.min.js" integrity="sha256-inc5kl9MA1hkeYUt+EC3BhlIgyp/2jDIyBLS6k3UxPI=" crossorigin="anonymous"></script><!-- headroom.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/headroom.min.js" integrity="sha256-AsUX4SJE1+yuDu5+mAVzJbuYNPHj/WroHuZ8Ir/CkE0=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/jQuery.headroom.min.js" 
integrity="sha256-ZX/yNShbjqsohH1k95liqY9Gd8uOiE1S4vZc+9KQ1K4=" crossorigin="anonymous"></script><!-- pkgdown --><link href="../pkgdown.css" rel="stylesheet"><script src="../pkgdown.js"></script><meta property="og:title" content="Pipe operator — %&gt;%"><meta property="og:description" content="See magrittr::%&amp;gt;% for details."><!-- mathjax --><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js" integrity="sha256-nvJJv9wWKEm88qvoQl9ekL2J+k/RWIsaSScxxlsrv8k=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/config/TeX-AMS-MML_HTMLorMML.js" integrity="sha256-84DKXVJXs0/F8OTMzX4UR909+jtl4G7SPypPavF+GfA=" crossorigin="anonymous"></script><!--[if lt IE 9]>
 3 | <script src="https://oss.maxcdn.com/html5shiv/3.7.3/html5shiv.min.js"></script>
 4 | <script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
 5 | <![endif]--></head><body data-spy="scroll" data-target="#toc">
 6 |     
 7 | 
 8 |     <div class="container template-reference-topic">
 9 |       <header><div class="navbar navbar-default navbar-fixed-top" role="navigation">
10 |   <div class="container">
11 |     <div class="navbar-header">
12 |       <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar" aria-expanded="false">
13 |         <span class="sr-only">Toggle navigation</span>
14 |         <span class="icon-bar"></span>
15 |         <span class="icon-bar"></span>
16 |         <span class="icon-bar"></span>
17 |       </button>
18 |       <span class="navbar-brand">
19 |         <a class="navbar-link" href="../index.html">recipeselectors</a>
20 |         <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="">0.0.1</span>
21 |       </span>
22 |     </div>
23 | 
24 |     <div id="navbar" class="navbar-collapse collapse">
25 |       <ul class="nav navbar-nav"><li>
26 |   <a href="../reference/index.html">Reference</a>
27 | </li>
28 |       </ul><ul class="nav navbar-nav navbar-right"><li>
29 |   <a href="https://github.com/stevenpawley/recipeselectors/" class="external-link">
30 |     <span class="fab fa-github fa-lg"></span>
31 |      
32 |   </a>
33 | </li>
34 |       </ul></div><!--/.nav-collapse -->
35 |   </div><!--/.container -->
36 | </div><!--/.navbar -->
37 | 
38 |       
39 | 
40 |       </header><div class="row">
41 |   <div class="col-md-9 contents">
42 |     <div class="page-header">
43 |     <h1>Pipe operator</h1>
44 |     <small class="dont-index">Source: <a href="https://github.com/stevenpawley/recipeselectors/blob/HEAD/R/utils-pipe.R" class="external-link"><code>R/utils-pipe.R</code></a></small>
45 |     <div class="hidden name"><code>pipe.Rd</code></div>
46 |     </div>
47 | 
48 |     <div class="ref-description">
49 |     <p>See <code>magrittr::%&gt;%</code> for details.</p>
50 |     </div>
51 | 
52 |     <div id="ref-usage">
53 |     <div class="sourceCode"><pre class="sourceCode r"><code><span class="va">lhs</span> <span class="op">%&gt;%</span> <span class="va">rhs</span></code></pre></div>
54 |     </div>
55 | 
56 | 
57 |   </div>
58 |   <div class="col-md-3 hidden-xs hidden-sm" id="pkgdown-sidebar">
59 |     <nav id="toc" data-toggle="toc" class="sticky-top"><h2 data-toc-skip>Contents</h2>
60 |     </nav></div>
61 | </div>
62 | 
63 | 
64 |       <footer><div class="copyright">
65 |   <p></p><p>Developed by Steven Pawley.</p>
66 | </div>
67 | 
68 | <div class="pkgdown">
69 |   <p></p><p>Site built with <a href="https://pkgdown.r-lib.org/" class="external-link">pkgdown</a>
70 | 2.0.2.</p>
71 | </div>
72 | 
73 |       </footer></div>
74 | 
75 |   
76 | 
77 | 
78 |   
79 | 
80 |   </body></html>
81 | 
82 | 


--------------------------------------------------------------------------------
/R/step_select_boruta.R:
--------------------------------------------------------------------------------
  1 | #' Feature selection step using Boruta
  2 | #'
  3 | #' `step_select_boruta` creates a *specification* of a recipe step that selects a
  4 | #' subset of predictors using the Boruta feature selection approach.
  5 | #'
  6 | #' @param recipe A recipe object. The step will be added to the sequence of
  7 | #'   operations for this recipe.
  8 | #' @param ... One or more selector functions to choose which variables are
  9 | #'   affected by the step. See selections() for more details. For the tidy
 10 | #'   method, these are not currently used.
 11 | #' @param outcome A character string with the name of the response variable to
 12 | #'   use to calculate the feature importance scores.
 13 | #' @param role Not used by this step since no new variables are created.
 14 | #' @param trained A logical to indicate if the quantities for preprocessing have
 15 | #'   been estimated.
 16 | #' @param exclude A character vector of predictor names that will be removed
 17 | #'  from the data. This will be set when `prep()` is used on the recipe and
 18 | #'  should not be set by the user.
 19 | #' @param options A list of options to pass to `Boruta::Boruta()`. The defaults
 20 | #'   use Boruta's defaults. *Note* that `x` and `y` should not be passed here.
 21 | #' @param res The `Boruta::Boruta` object is stored here once this preprocessing
 22 | #'   step has been trained by `prep.recipe()`.
 23 | #' @param skip A logical. Should the step be skipped when the recipe is baked by
 24 | #'   bake.recipe()? While all operations are baked when prep.recipe() is run,
 25 | #'   some operations may not be able to be conducted on new data (e.g.
 26 | #'   processing the outcome variable(s)). Care should be taken when using skip =
 27 | #'   TRUE as it may affect the computations for subsequent operations.
 28 | #' @param id A character string that is unique to this step to identify it.
 29 | #'
 30 | #' @return a `step_select_boruta` object.
 31 | #' @export
 32 | #' @examples
 33 | #' library(recipes)
 34 | #' library(parsnip)
 35 | #'
 36 | #' # load the example cells dataset
 37 | #' data(cells, package = "modeldata")
 38 | #'
 39 | #' # create a preprocessing recipe
 40 | #' rec <-
 41 | #'  recipe(class ~ ., data = cells[, -1]) %>%
 42 | #'  step_select_boruta(all_predictors(), outcome = "class")
 43 | #'
 44 | #' prepped <- prep(rec)
 45 | #'
 46 | #' preproc_data <- juice(prepped)
 47 | #' prepped
 48 | step_select_boruta <- function(
 49 |   recipe,
 50 |   ...,
 51 |   outcome = NULL,
 52 |   role = "predictor",
 53 |   trained = FALSE,
 54 |   exclude = NULL,
 55 |   options = list(pValue = 0.01, mcAdj = TRUE, maxRuns = 100),
 56 |   res = NULL,
 57 |   skip = FALSE,
 58 |   id = recipes::rand_id("select_boruta")) {
 59 | 
 60 |   recipes::recipes_pkg_check("Boruta")
 61 | 
 62 |   recipes::add_step(
 63 |     recipe,
 64 |     step_select_boruta_new(
 65 |       terms = recipes::ellipse_check(...),
 66 |       trained = trained,
 67 |       outcome = outcome,
 68 |       role = role,
 69 |       exclude = exclude,
 70 |       options = options,
 71 |       res = res,
 72 |       skip = skip,
 73 |       id = id
 74 |     )
 75 |   )
 76 | }
 77 | 
 78 | # wrapper around 'step' function that sets the class of new step objects
 79 | #' @importFrom recipes step
 80 | step_select_boruta_new <- function(terms, role, trained, outcome, exclude,
 81 |                                    options, res, skip, id) {
 82 |   recipes::step(
 83 |     subclass = "select_boruta",
 84 |     terms = terms,
 85 |     role = role,
 86 |     trained = trained,
 87 |     outcome = outcome,
 88 |     exclude = exclude,
 89 |     options = options,
 90 |     res = res,
 91 |     skip = skip,
 92 |     id = id
 93 |   )
 94 | }
 95 | 
 96 | #' @export
 97 | prep.step_select_boruta <- function(x, training, info = NULL, ...) {
 98 | 
 99 |   # translate the terms arguments
100 |   x_names <- recipes::terms_select(terms = x$terms, info = info)
101 |   y_name <- recipes::terms_select(x$outcome, info = info)
102 |   y_name <- y_name[1]
103 | 
104 |   if (length(x_names) > 0) {
105 | 
106 |     call <- rlang::call2(
107 |       .fn = "Boruta",
108 |       .ns = "Boruta",
109 |       x = rlang::quo(training[, x_names]),
110 |       y = rlang::quo(training[[y_name]]),
111 |       !!!x$options
112 |     )
113 | 
114 |     res <- rlang::eval_tidy(call)
115 | 
116 |     exclude <- names(res$finalDecision[res$finalDecision == "Rejected"])
117 | 
118 |   } else {
119 |     exclude <- character()
120 |   }
121 | 
122 |   step_select_boruta_new(
123 |     terms = x$terms,
124 |     trained = TRUE,
125 |     role = x$role,
126 |     outcome = y_name,
127 |     exclude = exclude,
128 |     options = x$options,
129 |     res = res,
130 |     skip = x$skip,
131 |     id = x$id
132 |   )
133 | }
134 | 
135 | #' @export
# Apply the trained Boruta selection step to new data.
#
# Removes every column of `new_data` whose name is listed in `object$exclude`
# and returns the result via `as_tibble()`.
#
# @param object A trained `step_select_boruta` object.
# @param new_data A data frame or tibble to process.
# @param ... Not used.
# @return The result of `as_tibble()` on `new_data` without the excluded
#   columns.
bake.step_select_boruta <- function(object, new_data, ...) {
  if (length(object$exclude) > 0) {
    keep <- !colnames(new_data) %in% object$exclude
    # drop = FALSE keeps a plain data.frame two-dimensional even when only a
    # single column survives (tibbles never drop, so this is a no-op for them)
    new_data <- new_data[, keep, drop = FALSE]
  }
  # NOTE(review): as_tibble() is called unqualified, so it is assumed to be
  # imported into the package namespace (presumably from tibble) - confirm.
  as_tibble(new_data)
}
142 | 
143 | #' @export
# Print a one-line summary of the step; once the step is trained the number of
# excluded predictors is appended. `width` is accepted but not used here.
# Returns `x` invisibly.
print.step_select_boruta <- function(x, width = max(20, options()$width - 30), ...) {
  cat("Boruta feature selection")

  step_is_trained <- recipes::is_trained(x)
  if (step_is_trained) {
    cat(" (", length(x$exclude), " excluded)", sep = "")
  }

  cat("\n")
  invisible(x)
}
155 | 
156 | #' @rdname step_select_boruta
157 | #' @param x A `step_select_boruta` object.
158 | #' @export
tidy.step_select_boruta <- function(x, ...) {
  # Trained step: one row per excluded (rejected) predictor.
  # Untrained step: a single row with a missing `terms` value. The selector
  # names were previously computed here via recipes::sel2char() but never
  # used, so that dead computation has been removed.
  if (recipes::is_trained(x)) {
    res <- tibble(terms = x$exclude)
  } else {
    res <- tibble(terms = rlang::na_chr)
  }
  # tag the row(s) with the id of the step they belong to
  res$id <- x$id
  res
}
169 | 


--------------------------------------------------------------------------------
/docs/authors.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <!-- Generated by pkgdown: do not edit by hand --><html lang="en"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"><meta charset="utf-8"><meta http-equiv="X-UA-Compatible" content="IE=edge"><meta name="viewport" content="width=device-width, initial-scale=1.0"><title>Authors and Citation • recipeselectors</title><!-- jquery --><script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.4.1/jquery.min.js" integrity="sha256-CSXorXvZcTkaix6Yvo6HppcZGetbYMGWSFlBw8HfCJo=" crossorigin="anonymous"></script><!-- Bootstrap --><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/3.4.1/css/bootstrap.min.css" integrity="sha256-bZLfwXAP04zRMK2BjiO8iu9pf4FbLqX6zitd+tIvLhE=" crossorigin="anonymous"><script src="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/3.4.1/js/bootstrap.min.js" integrity="sha256-nuL8/2cJ5NDSSwnKD8VqreErSWHtnEP9E7AySL+1ev4=" crossorigin="anonymous"></script><!-- bootstrap-toc --><link rel="stylesheet" href="bootstrap-toc.css"><script src="bootstrap-toc.js"></script><!-- Font Awesome icons --><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/all.min.css" integrity="sha256-mmgLkCYLUQbXn0B1SRqzHar6dCnv9oZFPEC1g1cwlkk=" crossorigin="anonymous"><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/v4-shims.min.css" integrity="sha256-wZjR52fzng1pJHwx4aV2AO3yyTOXrcDW7jBpJtTwVxw=" crossorigin="anonymous"><!-- clipboard.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.6/clipboard.min.js" integrity="sha256-inc5kl9MA1hkeYUt+EC3BhlIgyp/2jDIyBLS6k3UxPI=" crossorigin="anonymous"></script><!-- headroom.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/headroom.min.js" integrity="sha256-AsUX4SJE1+yuDu5+mAVzJbuYNPHj/WroHuZ8Ir/CkE0=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/jQuery.headroom.min.js" 
integrity="sha256-ZX/yNShbjqsohH1k95liqY9Gd8uOiE1S4vZc+9KQ1K4=" crossorigin="anonymous"></script><!-- pkgdown --><link href="pkgdown.css" rel="stylesheet"><script src="pkgdown.js"></script><meta property="og:title" content="Authors and Citation"><!-- mathjax --><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js" integrity="sha256-nvJJv9wWKEm88qvoQl9ekL2J+k/RWIsaSScxxlsrv8k=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/config/TeX-AMS-MML_HTMLorMML.js" integrity="sha256-84DKXVJXs0/F8OTMzX4UR909+jtl4G7SPypPavF+GfA=" crossorigin="anonymous"></script><!--[if lt IE 9]>
 3 | <script src="https://oss.maxcdn.com/html5shiv/3.7.3/html5shiv.min.js"></script>
 4 | <script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
 5 | <![endif]--></head><body data-spy="scroll" data-target="#toc">
 6 |     
 7 | 
 8 |     <div class="container template-citation-authors">
 9 |       <header><div class="navbar navbar-default navbar-fixed-top" role="navigation">
10 |   <div class="container">
11 |     <div class="navbar-header">
12 |       <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar" aria-expanded="false">
13 |         <span class="sr-only">Toggle navigation</span>
14 |         <span class="icon-bar"></span>
15 |         <span class="icon-bar"></span>
16 |         <span class="icon-bar"></span>
17 |       </button>
18 |       <span class="navbar-brand">
19 |         <a class="navbar-link" href="index.html">recipeselectors</a>
20 |         <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="">0.0.1</span>
21 |       </span>
22 |     </div>
23 | 
24 |     <div id="navbar" class="navbar-collapse collapse">
25 |       <ul class="nav navbar-nav"><li>
26 |   <a href="reference/index.html">Reference</a>
27 | </li>
28 |       </ul><ul class="nav navbar-nav navbar-right"><li>
29 |   <a href="https://github.com/stevenpawley/recipeselectors/" class="external-link">
30 |     <span class="fab fa-github fa-lg"></span>
31 |      
32 |   </a>
33 | </li>
34 |       </ul></div><!--/.nav-collapse -->
35 |   </div><!--/.container -->
36 | </div><!--/.navbar -->
37 | 
38 |       
39 | 
40 |       </header><div class="row">
41 |   <div class="contents col-md-9">
42 |     <div class="section level2 authors-section">
43 |       <div class="page-header">
44 |         <h1>Authors</h1>
45 |       </div>
46 | 
47 |       
48 |       <ul class="list-unstyled"><li>
49 |           <p><strong>Steven Pawley</strong>. Author, maintainer. 
50 |           </p>
51 |         </li>
52 |       </ul></div>
53 |     <div class="section level2 citation-section">
54 |     <div>
55 |       <h1 id="citation">Citation</h1>
56 |       <small class="dont-index">Source: <a href="https://github.com/stevenpawley/recipeselectors/blob/HEAD/DESCRIPTION" class="external-link"><code>DESCRIPTION</code></a></small>
57 |     </div>
58 |     </div>
59 | 
60 | 
61 |     <p>Pawley S (2022).
62 | <em>recipeselectors: Extra Recipes Steps for Supervised Feature Selection</em>.
63 | R package version 0.0.1, <a href="https://github.com/stevenpawley/recipeselectors" class="external-link">https://github.com/stevenpawley/recipeselectors</a>. 
64 | </p>
65 |     <pre>@Manual{,
66 |   title = {recipeselectors: Extra Recipes Steps for Supervised Feature Selection},
67 |   author = {Steven Pawley},
68 |   year = {2022},
69 |   note = {R package version 0.0.1},
70 |   url = {https://github.com/stevenpawley/recipeselectors},
71 | }</pre>
72 | 
73 |   </div>
74 | 
75 | </div>
76 | 
77 | 
78 | 
79 |       <footer><div class="copyright">
80 |   <p></p><p>Developed by Steven Pawley.</p>
81 | </div>
82 | 
83 | <div class="pkgdown">
84 |   <p></p><p>Site built with <a href="https://pkgdown.r-lib.org/" class="external-link">pkgdown</a>
85 | 2.0.2.</p>
86 | </div>
87 | 
88 |       </footer></div>
89 | 
90 |   
91 | 
92 | 
93 |   
94 | 
95 |   </body></html>
96 | 
97 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Note
  2 | 
  3 | The package recipeselectors is changing its name to 'colino'. Continued package development and eventual release to CRAN will occur from the 'https://github.com/stevenpawley/colino' repository.
  4 | 
  5 | # recipeselectors
  6 | 
  7 | The goal of recipeselectors is to provide extra supervised feature selection
  8 | steps to be used with the tidymodels recipes package.
  9 | 
 10 | The package is under development.
 11 | 
 12 | ## Installation
 13 | 
 14 | ``` r
 15 | devtools::install_github("stevenpawley/recipeselectors")
 16 | ```
 17 | 
 18 | ## Feature Selection Methods
 19 | 
 20 | The following feature selection methods are implemented:
 21 | 
 22 | - `step_select_infgain` provides Information Gain feature selection. This step
 23 | requires the `FSelectorRcpp` package to be installed.
 24 | 
 25 | - `step_select_mrmr` provides maximum Relevancy Minimum Redundancy feature
 26 | selection. This step requires the `praznik` package to be installed.
 27 | 
 28 | - `step_select_roc` provides ROC-based feature selection based on each
 29 | predictor's relationship with the response outcome as measured using a Receiver
 30 | Operating Characteristic curve. Thanks to Max Kuhn, along with many other useful
 31 | suggestions.
 32 | 
 33 | - `step_select_xtab` provides feature selection using statistical association
 34 | (also thanks to Max Kuhn).
 35 | 
 36 | - `step_select_vip` provides model-based selection using feature importance
 37 | scores or coefficients. This method allows a `parsnip` model specification to be
 38 | used to select a subset of features based on the models' feature importances or
 39 | coefficients. See below for details. Note, that this step will eventually be
 40 | deprecated in favor of separate steps that contain the specific models that are
 41 | most commonly used for feature selection such as `step_select_forests`, 
 42 | `step_select_tree` and `step_select_linear`.
 43 | 
 44 | - `step_select_boruta` provides a Boruta feature selection step.
 45 | 
 46 | - `step_select_carscore` provides a CAR score feature selection step for
 47 | regression models. This step requires the `care` package to be installed.
 48 | 
 49 | - `step_select_forests`, `step_select_tree`, and `step_select_linear` provide
 50 | model-based methods of selecting a subset of features based on the model's
 51 | feature importance scores or coefficients. These steps, and potential
 52 | `step_select_rules`, `step_select_boost` will replace the `step_select_vip`
 53 | method.
 54 | 
 55 | ## Under Development
 56 | 
 57 | Methods that are planned to be added:
 58 | 
 59 | - Relief-based methods (CORElearn package)
 60 | 
 61 | - Ensemble feature selection (EFS package)
 62 | 
 63 | ## Notes on Wrapper Feature Selection Methods
 64 | 
 65 | The focus of `recipeselectors` is to provide extra recipes for filter-based 
 66 | feature selection. A single wrapper method is also included using the variable
 67 | importance scores of selected algorithms for feature selection.
 68 | 
 69 | The `step_select_vip` is designed to work with the `parsnip` package and
 70 | requires a base model specification that provides a method of ranking the
 71 | importance of features, such as feature importance scores or coefficients, with
 72 | one score per feature. The base model is specified in the step using the `model`
 73 | parameter.
 74 | 
 75 | A limitation is that the model used in the `step_select_vip` cannot be tuned.
 76 | This step will be replaced by a more appropriate structure that allows both
 77 | variable selection and tuning for specific model types.
 78 | 
 79 | The parsnip package does not currently contain a method of pulling feature 
 80 | importance scores from models that support them. The `recipeselectors` package
 81 | provides a generic function `pull_importances` for this purpose that accepts
 82 | a fitted parsnip model, and returns a tibble with two columns 'feature' and
 83 | 'importance':
 84 | 
 85 | ```
 86 | model <- boost_tree(mode = "classification") %>%
 87 |   set_engine("xgboost")
 88 | 
 89 | model_fit <- model %>% 
 90 |   fit(Species ~., iris)
 91 | 
 92 | pull_importances(model_fit)
 93 | ```
 94 | 
 95 | Most of the models and 'engines' that provide feature importances are
 96 | implemented. In addition, `h2o` models are supported using the `h2oparsnip`
 97 | package. Use `methods(pull_importances)` to list models that are currently
 98 | implemented. If you need to pull the feature importance scores from a model that is
 99 | not currently supported in this package, then you can add a class to the
100 | pull_importances generic function which returns a two-column tibble:
101 | 
102 | ```
103 | pull_importances._ranger <- function(object, scaled = FALSE, ...) {
104 |   scores <- ranger::importance(object$fit)
105 | 
106 |   # create a tibble with 'feature' and 'importance' columns
107 |   scores <- tibble::tibble(
108 |     feature = names(scores),
109 |     importance = as.numeric(scores)
110 |   )
111 | 
112 |   # optionally rescale the importance scores
113 |   if (scaled)
114 |     scores$importance <- scales::rescale(scores$importance)
115 |   scores
116 | }
117 | ```
118 | 
119 | An example of using the `step_select_vip` function:
120 | 
121 | ```
122 | library(parsnip)
123 | library(recipes)
124 | library(magrittr)
125 | 
126 | # load the example iris dataset
127 | data(iris)
128 | 
129 | # define a base model to use for feature importances
130 | base_model <- rand_forest(mode = "classification") %>%
131 |   set_engine("ranger", importance = "permutation")
132 | 
133 | # create a preprocessing recipe
134 | rec <- iris %>%
135 | recipe(Species ~ .) %>%
136 | step_select_vip(all_predictors(), model = base_model, top_p = 2,
137 |                 outcome = "Species")
138 | 
139 | prepped <- prep(rec)
140 | 
141 | # create a model specification
142 | clf <- decision_tree(mode = "classification") %>%
143 | set_engine("rpart")
144 | 
145 | clf_fitted <- clf %>%
146 |   fit(Species ~ ., juice(prepped))
147 | ```
148 | 


--------------------------------------------------------------------------------
/docs/LICENSE.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <!-- Generated by pkgdown: do not edit by hand --><html lang="en"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"><meta charset="utf-8"><meta http-equiv="X-UA-Compatible" content="IE=edge"><meta name="viewport" content="width=device-width, initial-scale=1.0"><title>MIT License • recipeselectors</title><!-- jquery --><script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.4.1/jquery.min.js" integrity="sha256-CSXorXvZcTkaix6Yvo6HppcZGetbYMGWSFlBw8HfCJo=" crossorigin="anonymous"></script><!-- Bootstrap --><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/3.4.1/css/bootstrap.min.css" integrity="sha256-bZLfwXAP04zRMK2BjiO8iu9pf4FbLqX6zitd+tIvLhE=" crossorigin="anonymous"><script src="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/3.4.1/js/bootstrap.min.js" integrity="sha256-nuL8/2cJ5NDSSwnKD8VqreErSWHtnEP9E7AySL+1ev4=" crossorigin="anonymous"></script><!-- bootstrap-toc --><link rel="stylesheet" href="bootstrap-toc.css"><script src="bootstrap-toc.js"></script><!-- Font Awesome icons --><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/all.min.css" integrity="sha256-mmgLkCYLUQbXn0B1SRqzHar6dCnv9oZFPEC1g1cwlkk=" crossorigin="anonymous"><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/v4-shims.min.css" integrity="sha256-wZjR52fzng1pJHwx4aV2AO3yyTOXrcDW7jBpJtTwVxw=" crossorigin="anonymous"><!-- clipboard.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.6/clipboard.min.js" integrity="sha256-inc5kl9MA1hkeYUt+EC3BhlIgyp/2jDIyBLS6k3UxPI=" crossorigin="anonymous"></script><!-- headroom.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/headroom.min.js" integrity="sha256-AsUX4SJE1+yuDu5+mAVzJbuYNPHj/WroHuZ8Ir/CkE0=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/jQuery.headroom.min.js" 
integrity="sha256-ZX/yNShbjqsohH1k95liqY9Gd8uOiE1S4vZc+9KQ1K4=" crossorigin="anonymous"></script><!-- pkgdown --><link href="pkgdown.css" rel="stylesheet"><script src="pkgdown.js"></script><meta property="og:title" content="MIT License"><!-- mathjax --><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js" integrity="sha256-nvJJv9wWKEm88qvoQl9ekL2J+k/RWIsaSScxxlsrv8k=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/config/TeX-AMS-MML_HTMLorMML.js" integrity="sha256-84DKXVJXs0/F8OTMzX4UR909+jtl4G7SPypPavF+GfA=" crossorigin="anonymous"></script><!--[if lt IE 9]>
 3 | <script src="https://oss.maxcdn.com/html5shiv/3.7.3/html5shiv.min.js"></script>
 4 | <script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
 5 | <![endif]--></head><body data-spy="scroll" data-target="#toc">
 6 |     
 7 | 
 8 |     <div class="container template-title-body">
 9 |       <header><div class="navbar navbar-default navbar-fixed-top" role="navigation">
10 |   <div class="container">
11 |     <div class="navbar-header">
12 |       <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar" aria-expanded="false">
13 |         <span class="sr-only">Toggle navigation</span>
14 |         <span class="icon-bar"></span>
15 |         <span class="icon-bar"></span>
16 |         <span class="icon-bar"></span>
17 |       </button>
18 |       <span class="navbar-brand">
19 |         <a class="navbar-link" href="index.html">recipeselectors</a>
20 |         <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="">0.0.1</span>
21 |       </span>
22 |     </div>
23 | 
24 |     <div id="navbar" class="navbar-collapse collapse">
25 |       <ul class="nav navbar-nav"><li>
26 |   <a href="reference/index.html">Reference</a>
27 | </li>
28 |       </ul><ul class="nav navbar-nav navbar-right"><li>
29 |   <a href="https://github.com/stevenpawley/recipeselectors/" class="external-link">
30 |     <span class="fab fa-github fa-lg"></span>
31 |      
32 |   </a>
33 | </li>
34 |       </ul></div><!--/.nav-collapse -->
35 |   </div><!--/.container -->
36 | </div><!--/.navbar -->
37 | 
38 |       
39 | 
40 |       </header><div class="row">
41 |   <div class="contents col-md-9">
42 |     <div class="page-header">
43 |       <h1>MIT License</h1>
44 |     </div>
45 | 
46 | <div id="mit-license" class="section level1">
47 | 
48 | <p>Copyright (c) 2019 Steven Pawley</p>
49 | <p>Permission is hereby granted, free of charge, to any person obtaining
50 | a copy of this software and associated documentation files (the
51 | “Software”), to deal in the Software without restriction, including
52 | without limitation the rights to use, copy, modify, merge, publish,
53 | distribute, sublicense, and/or sell copies of the Software, and to
54 | permit persons to whom the Software is furnished to do so, subject to
55 | the following conditions:</p>
56 | <p>The above copyright notice and this permission notice shall be
57 | included in all copies or substantial portions of the Software.</p>
58 | <p>THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND,
59 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
60 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
61 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
62 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
63 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
64 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.</p>
65 | </div>
66 | 
67 |   </div>
68 | 
69 |   <div class="col-md-3 hidden-xs hidden-sm" id="pkgdown-sidebar">
70 |     <nav id="toc" data-toggle="toc" class="sticky-top"><h2 data-toc-skip>Contents</h2>
71 |     </nav></div>
72 | 
73 | </div>
74 | 
75 | 
76 | 
77 |       <footer><div class="copyright">
78 |   <p></p><p>Developed by Steven Pawley.</p>
79 | </div>
80 | 
81 | <div class="pkgdown">
82 |   <p></p><p>Site built with <a href="https://pkgdown.r-lib.org/" class="external-link">pkgdown</a>
83 | 2.0.2.</p>
84 | </div>
85 | 
86 |       </footer></div>
87 | 
88 |   
89 | 
90 | 
91 |   
92 | 
93 |   </body></html>
94 | 
95 | 


--------------------------------------------------------------------------------
/R/step_select_mrmr.R:
--------------------------------------------------------------------------------
  1 | #' Apply minimum Redundancy Maximum Relevance Feature Selection (mRMR)
  2 | #'
  3 | #' `step_select_mrmr` creates a *specification* of a recipe step that will apply
  4 | #' minimum Redundancy Maximum Relevance Feature Selection (mRMR) to numeric
  5 | #' data. The top `top_p` scoring features, or features whose scores occur in
  6 | #' the top percentile `threshold` will be retained as new predictors.
  7 | #'
  8 | #' @param recipe 	A recipe object. The step will be added to the sequence of
  9 | #'   operations for this recipe
 10 | #' @param ... One or more selector functions to choose which variables are
 11 | #'   affected by the step. See selections() for more details. For the tidy
 12 | #'   method, these are not currently used
 13 | #' @param role Not used by this step since no new variables are created
 14 | #' @param trained A logical to indicate if the quantities for preprocessing have
 15 | #'   been estimated
 16 | #' @param outcome A character string specifying the name of response variable
 17 | #'   used to evaluate mRMR.
 18 | #' @param top_p An integer that will be used to select the number of best
 19 | #'   scoring features.
 20 | #' @param threshold A numeric value between 0 and 1 representing the percentile
 21 | #'   of best scoring features to select. Features with scores that are _larger_
 22 | #'   than the specified threshold will be retained, for example `threshold =
 23 | #'   0.9` will retain only predictors with scores in the top 90th percentile.
 24 | #'   Note that this overrides `top_p`.
 25 | #' @param threads An integer specifying the number of threads to use for
 26 | #'   processing. The default = 0 uses all available threads.
 27 | #' @param exclude A character vector of predictor names that will be removed
 28 | #'  from the data. This will be set when `prep()` is used on the recipe and
 29 | #'  should not be set by the user.
 30 | #' @param scores A tibble with 'variable' and 'score' columns containing the
 31 | #'   names of the variables and their mRMR scores. This parameter is only
 32 | #'   produced after the recipe has been trained.
 33 | #' @param skip A logical. Should the step be skipped when the recipe is baked by
 34 | #'   bake.recipe()? While all operations are baked when prep.recipe() is run,
 35 | #'   some operations may not be able to be conducted on new data (e.g.
 36 | #'   processing the outcome variable(s)). Care should be taken when using skip =
 37 | #'   TRUE as it may affect the computations for subsequent operations.
 38 | #' @param id 	A character string that is unique to this step to identify it.
 39 | #' @return A step_select_mrmr object.
 40 | #' @keywords datagen
 41 | #' @concept preprocessing
 42 | #' @concept supervised_filter
 43 | #' @export
 44 | #' @details
 45 | #'
 46 | #' The recipe will stop if both `top_p` and `threshold` are left unspecified.
 47 | #'
 48 | #' @examples
 49 | #' library(recipes)
 50 | #'
 51 | #' data(cells, package = "modeldata")
 52 | #'
 53 | #' rec <-
 54 | #'  recipe(class ~ ., data = cells[, -1]) %>%
 55 | #'  step_select_mrmr(all_predictors(), outcome = "class", top_p = 10, threshold = 0.9)
 56 | #'
 57 | #' prepped <- prep(rec)
 58 | #'
 59 | #' new_data <- juice(prepped)
 60 | #' prepped
 61 | step_select_mrmr <- function(
 62 |   recipe, ...,
 63 |   outcome = NULL,
 64 |   role = NA,
 65 |   trained = FALSE,
 66 |   top_p = NA,
 67 |   threshold = NA,
 68 |   threads = 0,
 69 |   exclude = NULL,
 70 |   scores = NULL,
 71 |   skip = FALSE,
 72 |   id = recipes::rand_id("select_mrmr")) {
 73 |   # the scores are computed with praznik::MRMR() when the recipe is prepped,
 74 |   # so run the recipes package check for praznik before building the step
 75 |   recipes::recipes_pkg_check("praznik")
 76 | 
 77 |   # assemble the untrained step from the captured selectors and the options
 78 |   new_step <- step_select_mrmr_new(
 79 |     terms = recipes::ellipse_check(...),
 80 |     trained = trained,
 81 |     outcome = outcome,
 82 |     role = role,
 83 |     top_p = top_p,
 84 |     threshold = threshold,
 85 |     threads = threads,
 86 |     exclude = exclude,
 87 |     scores = scores,
 88 |     skip = skip,
 89 |     id = id
 90 |   )
 91 | 
 92 |   # append the step to the recipe's sequence of operations
 93 |   recipes::add_step(recipe, new_step)
 94 | }
 95 | 
 96 | step_select_mrmr_new <- function(terms, role, trained, outcome, top_p,
 97 |                                  threshold, threads, exclude, scores, skip,
 98 |                                  id) {
 99 |   # constructor: hand every field, unchanged, to recipes::step() together
100 |   # with the "select_mrmr" subclass label
101 |   recipes::step(
102 |     subclass = "select_mrmr",
103 |     terms = terms,
104 |     role = role,
105 |     trained = trained,
106 |     outcome = outcome,
107 |     top_p = top_p, threshold = threshold,
108 |     threads = threads,
109 |     exclude = exclude, scores = scores,
110 |     skip = skip,
111 |     id = id
112 |   )
113 | }
114 | 
115 | #' @export
116 | prep.step_select_mrmr <- function(x, training, info = NULL, ...) {
117 |   # train the step: score the predictors with mRMR, pick the exclusions
118 |   # extract response and predictor names; only the first outcome is used
119 |   y_name <- recipes::terms_select(x$outcome, info = info)[1]
120 |   x_names <- recipes::terms_select(terms = x$terms, info = info)
121 | 
122 |   # check criteria
123 |   check_criteria(x$top_p, x$threshold, match.call())
124 |   check_zero_one(x$threshold)
125 |   x$top_p <- check_top_p(x$top_p, length(x_names))
126 | 
127 |   if (length(x_names) > 0) {
128 |     # build and evaluate a call to praznik::MRMR() on the selected columns,
129 |     # with k set to the number of selected predictors
130 |     call <- rlang::call2(
131 |       .fn = "MRMR",
132 |       .ns = "praznik",
133 |       X = rlang::quo(training[, x_names]),
134 |       Y = rlang::quo(training[[y_name]]),
135 |       k = length(x_names),
136 |       threads = x$threads
137 |     )
138 |     mrmr <- rlang::eval_tidy(call)
139 |     # pair each variable name in the MRMR selection with its score
140 |     res <- tibble(
141 |       variable = names(mrmr$selection),
142 |       score = mrmr$score
143 |     )
144 |     exclude <-
145 |       select_percentile(res$score, x$top_p, x$threshold, maximize = TRUE)
146 |   } else {
147 |     # no predictors: still define `res`, which is passed on as `scores` below
148 |     res <- tibble(variable = character(), score = numeric())
149 |     exclude <- character()
150 |   }
151 | 
152 |   step_select_mrmr_new(
153 |     terms = x$terms,
154 |     trained = TRUE,
155 |     role = x$role,
156 |     outcome = y_name,
157 |     top_p = x$top_p,
158 |     threshold = x$threshold,
159 |     threads = x$threads,
160 |     exclude = exclude,
161 |     scores = res,
162 |     skip = x$skip,
163 |     id = x$id
164 |   )
165 | }
166 | 
167 | #' @export
168 | bake.step_select_mrmr <- function(object, new_data, ...) {
169 |   # drop the columns rejected at prep(); drop = FALSE keeps a one-column
170 |   # result as a data frame instead of collapsing it to a bare vector
171 |   keep <- !(colnames(new_data) %in% object$exclude)
172 |   as_tibble(new_data[, keep, drop = FALSE])
173 | }
174 | 
175 | #' @export
176 | print.step_select_mrmr <- function(x, width = max(20, options()$width - 30), ...) {
177 |   # `width` is part of the signature but is not used by this method
178 |   # a trained step also reports how many predictors it excludes
179 |   suffix <- ""
180 |   if (recipes::is_trained(x)) {
181 |     suffix <- paste0(" (", length(x$exclude), " excluded)")
182 |   }
183 | 
184 |   cat("mRMR feature selection", suffix, "\n", sep = "")
185 |   invisible(x)
186 | }
187 | 
188 | #' @rdname step_select_mrmr
189 | #' @param x A `step_select_mrmr` object.
190 | #' @export
191 | tidy.step_select_mrmr <- function(x, ...) {
192 |   # trained: one row per excluded predictor; untrained: the exclusions are
193 |   # not known yet, so a single NA placeholder row is returned
194 |   terms <- if (recipes::is_trained(x)) x$exclude else rlang::na_chr
195 | 
196 |   # every row carries the id of the step it came from
197 |   res <- tibble(terms = terms)
198 |   res$id <- x$id
199 |   res
200 | }
201 | 
202 | #' @export
203 | tunable.step_select_mrmr <- function(x, ...) {
204 |   # where the parameter object for each tunable argument is defined
205 |   top_p_info <- list(pkg = "recipeselectors", fun = "top_p")
206 |   threshold_info <- list(pkg = "dials", fun = "threshold", range = c(0, 1))
207 |   tibble(
208 |     name = c("top_p", "threshold"),
209 |     call_info = list(top_p_info, threshold_info),
210 |     source = "recipe",
211 |     component = "step_select_mrmr",
212 |     component_id = x$id
213 |   )
214 | }
215 | 


--------------------------------------------------------------------------------
/docs/reference/top_p.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <!-- Generated by pkgdown: do not edit by hand --><html lang="en"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"><meta charset="utf-8"><meta http-equiv="X-UA-Compatible" content="IE=edge"><meta name="viewport" content="width=device-width, initial-scale=1.0"><title>Parameter functions for feature selection recipes — top_p • recipeselectors</title><!-- jquery --><script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.4.1/jquery.min.js" integrity="sha256-CSXorXvZcTkaix6Yvo6HppcZGetbYMGWSFlBw8HfCJo=" crossorigin="anonymous"></script><!-- Bootstrap --><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/3.4.1/css/bootstrap.min.css" integrity="sha256-bZLfwXAP04zRMK2BjiO8iu9pf4FbLqX6zitd+tIvLhE=" crossorigin="anonymous"><script src="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/3.4.1/js/bootstrap.min.js" integrity="sha256-nuL8/2cJ5NDSSwnKD8VqreErSWHtnEP9E7AySL+1ev4=" crossorigin="anonymous"></script><!-- bootstrap-toc --><link rel="stylesheet" href="../bootstrap-toc.css"><script src="../bootstrap-toc.js"></script><!-- Font Awesome icons --><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/all.min.css" integrity="sha256-mmgLkCYLUQbXn0B1SRqzHar6dCnv9oZFPEC1g1cwlkk=" crossorigin="anonymous"><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/v4-shims.min.css" integrity="sha256-wZjR52fzng1pJHwx4aV2AO3yyTOXrcDW7jBpJtTwVxw=" crossorigin="anonymous"><!-- clipboard.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.6/clipboard.min.js" integrity="sha256-inc5kl9MA1hkeYUt+EC3BhlIgyp/2jDIyBLS6k3UxPI=" crossorigin="anonymous"></script><!-- headroom.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/headroom.min.js" integrity="sha256-AsUX4SJE1+yuDu5+mAVzJbuYNPHj/WroHuZ8Ir/CkE0=" crossorigin="anonymous"></script><script 
src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/jQuery.headroom.min.js" integrity="sha256-ZX/yNShbjqsohH1k95liqY9Gd8uOiE1S4vZc+9KQ1K4=" crossorigin="anonymous"></script><!-- pkgdown --><link href="../pkgdown.css" rel="stylesheet"><script src="../pkgdown.js"></script><meta property="og:title" content="Parameter functions for feature selection recipes — top_p"><meta property="og:description" content="Feature selection recipes allow the top-performing features to be selected
  3 | using two parameters. `top_p` is for specifying the number of the
  4 | top-performing features."><!-- mathjax --><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js" integrity="sha256-nvJJv9wWKEm88qvoQl9ekL2J+k/RWIsaSScxxlsrv8k=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/config/TeX-AMS-MML_HTMLorMML.js" integrity="sha256-84DKXVJXs0/F8OTMzX4UR909+jtl4G7SPypPavF+GfA=" crossorigin="anonymous"></script><!--[if lt IE 9]>
  5 | <script src="https://oss.maxcdn.com/html5shiv/3.7.3/html5shiv.min.js"></script>
  6 | <script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
  7 | <![endif]--></head><body data-spy="scroll" data-target="#toc">
  8 |     
  9 | 
 10 |     <div class="container template-reference-topic">
 11 |       <header><div class="navbar navbar-default navbar-fixed-top" role="navigation">
 12 |   <div class="container">
 13 |     <div class="navbar-header">
 14 |       <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar" aria-expanded="false">
 15 |         <span class="sr-only">Toggle navigation</span>
 16 |         <span class="icon-bar"></span>
 17 |         <span class="icon-bar"></span>
 18 |         <span class="icon-bar"></span>
 19 |       </button>
 20 |       <span class="navbar-brand">
 21 |         <a class="navbar-link" href="../index.html">recipeselectors</a>
 22 |         <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="">0.0.1</span>
 23 |       </span>
 24 |     </div>
 25 | 
 26 |     <div id="navbar" class="navbar-collapse collapse">
 27 |       <ul class="nav navbar-nav"><li>
 28 |   <a href="../reference/index.html">Reference</a>
 29 | </li>
 30 |       </ul><ul class="nav navbar-nav navbar-right"><li>
 31 |   <a href="https://github.com/stevenpawley/recipeselectors/" class="external-link">
 32 |     <span class="fab fa-github fa-lg"></span>
 33 |      
 34 |   </a>
 35 | </li>
 36 |       </ul></div><!--/.nav-collapse -->
 37 |   </div><!--/.container -->
 38 | </div><!--/.navbar -->
 39 | 
 40 |       
 41 | 
 42 |       </header><div class="row">
 43 |   <div class="col-md-9 contents">
 44 |     <div class="page-header">
 45 |     <h1>Parameter functions for feature selection recipes</h1>
 46 |     <small class="dont-index">Source: <a href="https://github.com/stevenpawley/recipeselectors/blob/HEAD/R/parameters.R" class="external-link"><code>R/parameters.R</code></a></small>
 47 |     <div class="hidden name"><code>top_p.Rd</code></div>
 48 |     </div>
 49 | 
 50 |     <div class="ref-description">
 51 |     <p>Feature selection recipes allow the top-performing features to be selected
 52 | using two parameters. `top_p` is for specifying the number of the
 53 | top-performing features.</p>
 54 |     </div>
 55 | 
 56 |     <div id="ref-usage">
 57 |     <div class="sourceCode"><pre class="sourceCode r"><code><span class="fu">top_p</span><span class="op">(</span>range <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fl">1L</span>, <span class="fl">4L</span><span class="op">)</span>, trans <span class="op">=</span> <span class="cn">NULL</span><span class="op">)</span></code></pre></div>
 58 |     </div>
 59 | 
 60 |     <div id="arguments">
 61 |     <h2>Arguments</h2>
 62 |     <dl><dt>range</dt>
 63 | <dd><p>A two-element vector holding the _defaults_ for the smallest and
 64 | largest possible values, respectively.</p></dd>
 65 | <dt>trans</dt>
 66 | <dd><p>A `trans` object from the `scales` package, such as
 67 | `scales::log10_trans()` or `scales::reciprocal_trans()`. If not provided,
 68 | the default is used which matches the units used in `range`. If no
 69 | transformation, `NULL`.</p></dd>
 70 | </dl></div>
 71 |     <div id="value">
 72 |     <h2>Value</h2>
 73 |     <p>A function with classes "quant_param" and "param"</p>
 74 |     </div>
 75 | 
 76 |     <div id="ref-examples">
 77 |     <h2>Examples</h2>
 78 |     <div class="sourceCode"><pre class="sourceCode r"><code><span class="r-in"><span class="fu">top_p</span><span class="op">(</span><span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="fl">3</span>, <span class="fl">10</span><span class="op">)</span><span class="op">)</span></span>
 79 | <span class="r-out co"><span class="r-pr">#&gt;</span> # Selected Predictors (quantitative)</span>
 80 | <span class="r-out co"><span class="r-pr">#&gt;</span> Range: [3, 10]</span>
 81 | </code></pre></div>
 82 |     </div>
 83 |   </div>
 84 |   <div class="col-md-3 hidden-xs hidden-sm" id="pkgdown-sidebar">
 85 |     <nav id="toc" data-toggle="toc" class="sticky-top"><h2 data-toc-skip>Contents</h2>
 86 |     </nav></div>
 87 | </div>
 88 | 
 89 | 
 90 |       <footer><div class="copyright">
 91 |   <p></p><p>Developed by Steven Pawley.</p>
 92 | </div>
 93 | 
 94 | <div class="pkgdown">
 95 |   <p></p><p>Site built with <a href="https://pkgdown.r-lib.org/" class="external-link">pkgdown</a>
 96 | 2.0.2.</p>
 97 | </div>
 98 | 
 99 |       </footer></div>
100 | 
101 |   
102 | 
103 | 
104 |   
105 | 
106 |   </body></html>
107 | 
108 | 


--------------------------------------------------------------------------------
/R/step_select_vip.R:
--------------------------------------------------------------------------------
  1 | #' Feature selection step using a model's feature importance scores or
  2 | #' coefficients
  3 | #'
  4 | #' `step_select_vip` creates a *specification* of a recipe step that selects a
  5 | #' subset of predictors based on the ranking of variable importance provided by
  6 | #' a `parsnip` model specification supplied through the `model` parameter.
  7 | #'
  8 | #' @param recipe A recipe object. The step will be added to the sequence of
  9 | #'   operations for this recipe.
 10 | #' @param ... One or more selector functions to choose which variables are
 11 | #'   affected by the step. See selections() for more details. For the tidy
 12 | #'   method, these are not currently used.
 13 | #' @param outcome A character string with the name of the response variable to
 14 | #'   use to calculate the feature importance scores.
 15 | #' @param role Not used by this step since no new variables are created.
 16 | #' @param trained A logical to indicate if the quantities for preprocessing have
 17 | #'   been estimated.
 18 | #' @param model A `model_spec` object from `parsnip` that has a feature
 19 | #'   importances or coefficients method. The model needs to have an equivalent
 20 | #'   `pull_importances` method defined. See `?pull_importances` for how to
 21 | #'   define methods for models that are not currently supported.
 22 | #' @param top_p An integer with the number of best scoring features to
 23 | #'   select.
 24 | #' @param threshold A numeric value between 0 and 1 representing the percentile
 25 | #'   of best scoring features to select. Features with scores that are _larger_
 26 | #'   than the specified threshold will be retained, for example `threshold =
 27 | #'   0.9` will retain only predictors with scores in the top 90th percentile.
 28 | #'   Note that this overrides `top_p`.
 29 | #' @param exclude A character vector of predictor names that will be removed
 30 | #'  from the data. This will be set when `prep()` is used on the recipe and
 31 | #'  should not be set by the user.
 32 | #' @param scores A tibble with 'variable' and 'score' columns containing the
 33 | #'   names of the variables and their feature importance scores. This parameter
 34 | #'   is only produced after the recipe has been trained.
 35 | #' @param skip A logical. Should the step be skipped when the recipe is baked by
 36 | #'   bake.recipe()? While all operations are baked when prep.recipe() is run,
 37 | #'   some operations may not be able to be conducted on new data (e.g.
 38 | #'   processing the outcome variable(s)). Care should be taken when using skip =
 39 | #'   TRUE as it may affect the computations for subsequent operations.
 40 | #' @param id A character string that is unique to this step to identify it.
 41 | #'
 42 | #' @return a `step_select_vip` object.
 43 | #' @export
 44 | #' @examples
 45 | #' library(recipes)
 46 | #' library(parsnip)
 47 | #'
 48 | #' # load the example cells dataset
 49 | #' data(cells, package = "modeldata")
 50 | #'
 51 | #' # define a base model to use for feature importances
 52 | #' base_model <- rand_forest(mode = "classification") %>%
 53 | #'     set_engine("ranger", importance = "permutation")
 54 | #'
 55 | #' # create a preprocessing recipe
 56 | #' rec <-
 57 | #'  recipe(class ~ ., data = cells[, -1]) %>%
 58 | #'  step_select_vip(all_predictors(), outcome = "class", model = base_model, top_p = 10, threshold = 0.9)
 59 | #'
 60 | #' prepped <- prep(rec)
 61 | #'
 62 | #' preproc_data <- juice(prepped)
 63 | #' prepped
 64 | step_select_vip <- function(
 65 |   recipe, ...,
 66 |   outcome = NULL,
 67 |   role = "predictor",
 68 |   trained = FALSE,
 69 |   model = NULL,
 70 |   top_p = NA,
 71 |   threshold = NA,
 72 |   exclude = NULL,
 73 |   scores = NULL,
 74 |   skip = FALSE,
 75 |   id = recipes::rand_id("select_vip")) {
 76 |   # `model` defaults to NULL, so is.null() rejects both an omitted argument
 77 |   # and an explicit `model = NULL`, which missing() alone would let through
 78 |   if (is.null(model)) {
 79 |     rlang::abort("Model argument should be a `parsnip` model specification")
 80 |   }
 81 |   recipes::add_step(
 82 |     recipe,
 83 |     step_select_vip_new(
 84 |       terms = recipes::ellipse_check(...),
 85 |       trained = trained,
 86 |       outcome = outcome,
 87 |       role = role,
 88 |       model = model,
 89 |       top_p = top_p,
 90 |       threshold = threshold,
 91 |       exclude = exclude,
 92 |       scores = scores,
 93 |       skip = skip,
 94 |       id = id
 95 |     )
 96 |   )
 97 | }
 98 | 
 99 | # wrapper around 'step' function that sets the class of new step objects
100 | #' @importFrom recipes step
101 | step_select_vip_new <- function(terms, role, trained, outcome, model, top_p,
102 |                                 threshold, exclude, scores, skip, id) {
103 |   # constructor: hand every field, unchanged, to recipes::step() together
104 |   # with the "select_vip" subclass label
105 |   recipes::step(
106 |     subclass = "select_vip",
107 |     terms = terms,
108 |     role = role,
109 |     trained = trained,
110 |     outcome = outcome,
111 |     model = model,
112 |     top_p = top_p, threshold = threshold,
113 |     exclude = exclude, scores = scores,
114 |     skip = skip,
115 |     id = id
116 |   )
117 | }
118 | 
119 | #' @export
120 | prep.step_select_vip <- function(x, training, info = NULL, ...) {
121 |   # train the step: rank the predictors with `x$model`, pick the exclusions
122 |   # translate the terms arguments; only the first outcome is used
123 |   x_names <- recipes::terms_select(terms = x$terms, info = info)
124 |   y_name <- recipes::terms_select(x$outcome, info = info)[1]
125 | 
126 |   # check criteria
127 |   check_criteria(x$top_p, x$threshold, match.call())
128 |   check_zero_one(x$threshold)
129 |   x$top_p <- check_top_p(x$top_p, length(x_names))
130 | 
131 |   if (length(x_names) > 0) {
132 |     # fit initial model on the selected predictors
133 |     X <- training[, x_names]
134 |     y <- training[[y_name]]
135 |     initial_model <- parsnip::fit_xy(x$model, X, y)
136 |     # standardise the column names and label each score with its variable
137 |     res <- pull_importances(initial_model)
138 |     names(res) <- c("variable", "score")
139 |     res$score <- rlang::set_names(res$score, res$variable)
140 | 
141 |     exclude <-
142 |       select_percentile(res$score, x$top_p, x$threshold, maximize = TRUE)
143 |   } else {
144 |     # no predictors: still define `res`, which is passed on as `scores` below
145 |     res <- tibble(variable = character(), score = numeric())
146 |     exclude <- character()
147 |   }
148 | 
149 |   step_select_vip_new(
150 |     terms = x$terms,
151 |     trained = TRUE,
152 |     role = x$role,
153 |     outcome = y_name,
154 |     model = x$model,
155 |     top_p = x$top_p,
156 |     threshold = x$threshold,
157 |     exclude = exclude,
158 |     scores = res,
159 |     skip = x$skip,
160 |     id = x$id
161 |   )
162 | }
163 | 
164 | #' @export
165 | bake.step_select_vip <- function(object, new_data, ...) {
166 |   # drop the columns rejected at prep(); drop = FALSE keeps a one-column
167 |   # result as a data frame instead of collapsing it to a bare vector
168 |   keep <- !(colnames(new_data) %in% object$exclude)
169 |   as_tibble(new_data[, keep, drop = FALSE])
170 | }
171 | 
172 | #' @export
173 | print.step_select_vip <- function(x, width = max(20, options()$width - 30), ...) {
174 |   # `width` is part of the signature but is not used by this method
175 |   # a trained step also reports how many predictors it excludes
176 |   suffix <- ""
177 |   if (recipes::is_trained(x)) {
178 |     suffix <- paste0(" (", length(x$exclude), " excluded)")
179 |   }
180 | 
181 |   cat("Variable importance feature selection", suffix, "\n", sep = "")
182 |   invisible(x)
183 | }
184 | 
185 | #' @rdname step_select_vip
186 | #' @param x A `step_select_vip` object.
187 | #' @export
188 | tidy.step_select_vip <- function(x, ...) {
189 |   # trained: one row per excluded predictor; untrained: the exclusions are
190 |   # not known yet, so a single NA placeholder row is returned
191 |   terms <- if (recipes::is_trained(x)) x$exclude else rlang::na_chr
192 | 
193 |   # every row carries the id of the step it came from
194 |   res <- tibble(terms = terms)
195 |   res$id <- x$id
196 |   res
197 | }
198 | 
199 | #' @export
200 | tunable.step_select_vip <- function(x, ...) {
201 |   # where the parameter object for each tunable argument is defined
202 |   top_p_info <- list(pkg = "recipeselectors", fun = "top_p")
203 |   threshold_info <- list(pkg = "dials", fun = "threshold", range = c(0, 1))
204 |   tibble(
205 |     name = c("top_p", "threshold"),
206 |     call_info = list(top_p_info, threshold_info),
207 |     source = "recipe",
208 |     component = "step_select_vip",
209 |     component_id = x$id
210 |   )
211 | }
212 | 


--------------------------------------------------------------------------------
/R/step_select_roc.R:
--------------------------------------------------------------------------------
  1 | #' Filter Numeric Predictors using ROC Curve
  2 | #'
  3 | #' `step_select_roc` creates a *specification* of a recipe step that will
  4 | #'  filter predictors using their relationship with the outcome as measured
  5 | #'  using a Receiver Operating Characteristic curve.
  6 | #'
  7 | #' @param recipe 	A recipe object. The step will be added to the sequence of
  8 | #'   operations for this recipe.
  9 | #' @param ... One or more selector functions to choose which predictors are
 10 | #'  affected by the step. See [selections()] for more details. For the `tidy`
 11 | #'  method, these are not currently used.
 12 | #' @param outcome A single character string that specifies a single categorical
 13 | #'  variable to be used as the class.
 14 | #' @param role For model terms created by this step, what analysis role should
 15 | #'  they be assigned?. By default, the function assumes that resulting distances
 16 | #'  will be used as predictors in a model.
 17 | #' @param threshold A numeric value, in AUC units, where predictors with ROC
 18 | #'  AUC values _larger_ than the threshold will be retained. A value of `NA`
 19 | #'  implies that this criterion will be ignored.
 20 | #' @param top_p An integer that will be used to select the predictors with the
 21 | #'  largest ROC AUC values. A value of `NA` implies that this criterion will be
 22 | #'  ignored.
 23 | #' @param exclude A character vector of predictor names that will be removed
 24 | #'  from the data. This will be set when `prep()` is used on the recipe and
 25 | #'  should not be set by the user.
 26 | #' @param trained A logical to indicate if the quantities for preprocessing have
 27 | #'   been estimated.
 28 | #' @param skip A logical. Should the step be skipped when the recipe is baked by
 29 | #'   bake.recipe()? While all operations are baked when prep.recipe() is run,
 30 | #'   some operations may not be able to be conducted on new data (e.g.
 31 | #'   processing the outcome variable(s)). Care should be taken when using skip =
 32 | #'   TRUE as it may affect the computations for subsequent operations.
 33 | #' @param id 	A character string that is unique to this step to identify it.
 34 | #' @return An updated version of `recipe` with the new step
 35 | #'  added to the sequence of existing steps (if any). For the
 36 | #'  `tidy` method, a tibble with a `terms` column for which predictors were
 37 | #' removed.
 38 | #' @keywords datagen
 39 | #' @concept preprocessing
 40 | #' @concept supervised_filter
 41 | #' @export
 42 | #' @details
 43 | #'
 44 | #' The recipe will stop if both `top_p` and `threshold` are left unspecified.
 45 | #'
 46 | #' The ROC AUC will be set to be 1 - AUC if the value is less than 0.50.
 47 | #' @examples
 48 | #' data(cells, package = "modeldata")
 49 | #'
 50 | #' rec <-
 51 | #'   recipe(class ~ ., data = cells[, -1]) %>%
 52 | #'   step_select_roc(all_predictors(), outcome = "class", top_p = 10, threshold = 0.9) %>%
 53 | #'   prep()
 54 | #'
 55 | #' rec %>% juice(all_predictors()) %>% names()
 56 | #'
 57 | #' # Use ROC values to select but always keep at least one:
 58 | #' rec <-
 59 | #'   recipe(class ~ ., data = cells[, -1]) %>%
 60 | #'   step_select_roc(all_predictors(), outcome = "class", top_p = 1, threshold = 0.99) %>%
 61 | #'   prep()
 62 | #'
 63 | #' rec %>% juice(all_predictors()) %>% names()
 64 | #'
 65 | #' # in case of missing data...
 66 | step_select_roc <- function(recipe,
 67 |                            ...,
 68 |                            outcome,
 69 |                            role = "predictor",
 70 |                            trained = FALSE,
 71 |                            threshold = NA,
 72 |                            top_p = NA,
 73 |                            exclude = NULL,
 74 |                            skip = FALSE,
 75 |                            id = recipes::rand_id("select_roc")) {
 76 |   recipes::add_step(
 77 |     recipe,
 78 |     step_select_roc_new(
 79 |       terms = recipes::ellipse_check(...),
 80 |       outcome = outcome,
 81 |       role = role,
 82 |       trained = trained,
 83 |       threshold = threshold,
 84 |       top_p = top_p,
 85 |       exclude = exclude,
 86 |       skip = skip,
 87 |       id = id
 88 |     )
 89 |   )
 90 | }
 91 | 
 92 | step_select_roc_new <-
 93 |   function(terms, outcome, role, trained, threshold, top_p, exclude, skip, id) {
 94 |     recipes::step(
 95 |       subclass = "select_roc",
 96 |       terms = terms,
 97 |       outcome = outcome,
 98 |       role = role,
 99 |       trained = trained,
100 |       threshold = threshold,
101 |       top_p = top_p,
102 |       exclude = exclude,
103 |       skip = skip,
104 |       id = id
105 |     )
106 |   }
107 | 
# Compute the ROC AUC of a single predictor `x` against the factor outcome `y`.
#
# A two-level outcome uses pROC::roc(); any other number of levels uses
# pROC::multiclass.roc(). Messages and warnings raised while fitting are
# suppressed, and a fit that errors yields NA_real_.
#
# Returns a bare length-one double. The attributes and class of the pROC
# `auc` object are stripped so that callers collecting results into a numeric
# vector receive a plain number.
roc_calc <- function(x, y) {
  is_binary <- length(levels(y)) == 2

  fit <- suppressMessages(
    suppressWarnings(
      try(
        if (is_binary) {
          pROC::roc(y, x, direction = "auto")
        } else {
          pROC::multiclass.roc(y, x, direction = "auto")
        },
        silent = TRUE
      )
    )
  )

  if (inherits(fit, "try-error")) {
    return(NA_real_)
  }

  as.numeric(pROC::auc(fit))
}
128 | 
129 | #' @export
prep.step_select_roc <- function(x, training, info = NULL, ...) {
  # The outcome is run through the selector machinery, but its return value is
  # not used; the outcome column is taken from the stored string itself.
  recipes::terms_select(x$outcome, info = info)
  outcome_col <- x$outcome[1]
  recipes::check_type(training[, outcome_col], quant = FALSE)

  # `empty_fun = I` is passed so that the zero-predictor case below can be
  # reached.
  predictor_cols <- recipes::terms_select(x$terms, info = info, empty_fun = I)

  removed <- character()

  if (length(predictor_cols) > 0) {
    recipes::check_type(training[, predictor_cols])

    # validate the selection criteria
    check_criteria(x$top_p, x$threshold, match.call())
    check_zero_one(x$threshold)
    x$top_p <- check_top_p(x$top_p, length(predictor_cols))

    # score every predictor by its AUC, then decide which ones to drop
    outcome_values <- training[[outcome_col]]
    auc_values <- purrr::map_dbl(
      training[, predictor_cols],
      ~ roc_calc(.x, outcome_values)
    )
    removed <- dual_filter(auc_values, x$top_p, x$threshold, maximize = TRUE)
  }

  step_select_roc_new(
    terms = x$terms,
    outcome = x$outcome,
    role = x$role,
    trained = TRUE,
    threshold = x$threshold,
    top_p = x$top_p,
    exclude = removed,
    skip = x$skip,
    id = x$id
  )
}
164 | 
165 | #' @export
bake.step_select_roc <- function(object, new_data, ...) {
  # Nothing was flagged for removal: hand the data back untouched.
  if (length(object$exclude) == 0) {
    return(new_data)
  }

  # Drop the columns recorded in `object$exclude`.
  dplyr::select(new_data, -dplyr::one_of(object$exclude))
}
172 | 
173 | #' @export
print.step_select_roc <- function(x, width = max(20, options()$width - 30), ...) {
  # One-line summary of the step; once trained, the number of excluded
  # predictors is appended. `width` is accepted but not used here.
  cat("ROC curve feature selection")

  suffix <- ""
  if (recipes::is_trained(x)) {
    suffix <- paste0(" (", length(x$exclude), " excluded)")
  }
  cat(suffix, "\n", sep = "")

  # Return the step invisibly so the method can be used in a pipeline.
  invisible(x)
}
185 | 
186 | #' @rdname step_select_roc
187 | #' @param x A `step_select_roc` object.
188 | #' @export
tidy.step_select_roc <- function(x, ...) {
  # Trained steps report the predictors stored in `x$exclude`; untrained steps
  # report a single NA placeholder. The previously computed-but-unused
  # `term_names` local has been dropped.
  excluded <- if (recipes::is_trained(x)) x$exclude else rlang::na_chr

  res <- tibble(terms = excluded)
  res$id <- x$id
  res
}
199 | 
200 | #' @export
tunable.step_select_roc <- function(x, ...) {
  # Describe the two tunable arguments of the step and where their parameter
  # objects are defined.
  top_p_info <- list(pkg = "recipeselectors", fun = "top_p")
  threshold_info <- list(pkg = "dials", fun = "threshold", range = c(0, 1))

  tibble::tibble(
    name = c("top_p", "threshold"),
    call_info = list(top_p_info, threshold_info),
    source = "recipe",
    component = "step_select_roc",
    component_id = x$id
  )
}
213 | 


--------------------------------------------------------------------------------
/R/step_select_carscore.R:
--------------------------------------------------------------------------------
  1 | #' CAR score feature selection step
  2 | #'
  3 | #' `step_select_carscore` creates a *specification* of a recipe step that
  4 | #' selects a subset of predictors as part of a regression model based on the
  5 | #' scores of the CAR score algorithm. This step requires the `care` package to be
  6 | #' installed. The top `top_p` scoring features, or features whose scores occur
  7 | #' in the top percentile `threshold` will be retained as new predictors.
  8 | #'
  9 | #' @param recipe 	A recipe object. The step will be added to the sequence of
 10 | #'   operations for this recipe.
 11 | #' @param ... One or more selector functions to choose which variables are
 12 | #'   affected by the step. See selections() for more details. For the tidy
 13 | #'   method, these are not currently used.
 14 | #' @param role Not used by this step since no new variables are created.
 15 | #' @param trained A logical to indicate if the quantities for preprocessing have
 16 | #'   been estimated.
 17 | #' @param lambda The correlation shrinkage intensity (range 0-1).
 18 | #' @param diagonal For diagonal = FALSE (the default) CAR scores are computed;
 19 | #'   otherwise with diagonal = TRUE marginal correlations.
 20 | #' @param outcome A character string with the name of the response variable.
 21 | #'   This must refer to a numeric feature for regression.
 22 | #' @param top_p An integer with the number of best scoring features to
 23 | #'   select.
 24 | #' @param threshold A numeric value between 0 and 1 representing the percentile
 25 | #'   of best scoring features to select. Features with scores that are _larger_
 26 | #'   than the specified threshold will be retained, for example `threshold =
 27 | #'   0.9` will retain only predictors with scores in the top 90th percentile.
 28 | #'   Note that this overrides `top_p`.
 29 | #' @param exclude A character vector of predictor names that will be removed
 30 | #'  from the data. This will be set when `prep()` is used on the recipe and
 31 | #'  should not be set by the user.
 32 | #' @param scores A tibble with 'variable' and 'scores' columns containing the
 33 | #'   names of the variables and the absolute values of the calculated CAR
 34 | #'   scores. This parameter is only produced after the recipe has been trained.
 35 | #' @param skip A logical. Should the step be skipped when the recipe is baked by
 36 | #'   bake.recipe()? While all operations are baked when prep.recipe() is run,
 37 | #'   some operations may not be able to be conducted on new data (e.g.
 38 | #'   processing the outcome variable(s)). Care should be taken when using skip =
 39 | #'   TRUE as it may affect the computations for subsequent operations.
 40 | #' @param id 	A character string that is unique to this step to identify it.
 41 | #' @return A step_select_carscore object.
 42 | #' @export
 43 | #' @keywords datagen
 44 | #' @concept preprocessing
 45 | #' @concept supervised_filter
 46 | #' @export
 47 | #' @details
 48 | #'
 49 | #' The recipe will stop if both `top_p` and `threshold` are left unspecified.
 50 | #'
 51 | #' @examples
 52 | #' library(recipes)
 53 | #'
 54 | #' data(car_prices, package = "modeldata")
 55 | #'
 56 | #' rec <-
 57 | #'  recipe(Price ~ ., data = car_prices) %>%
 58 | #'  step_select_carscore(all_predictors(), outcome = "Price", top_p = 5, threshold = 0.7)
 59 | #'
 60 | #' prepped <- prep(rec)
 61 | #'
 62 | #' new_data <- juice(prepped)
 63 | #' prepped
 64 | step_select_carscore <- function(
 65 |   recipe, ...,
 66 |   outcome = NULL,
 67 |   role = NA,
 68 |   trained = FALSE,
 69 |   top_p = NA,
 70 |   threshold = NA,
 71 |   lambda = NA,
 72 |   diagonal = FALSE,
 73 |   exclude = NULL,
 74 |   scores = NULL,
 75 |   skip = FALSE,
 76 |   id = recipes::rand_id("select_carscore")) {
 77 | 
 78 |   recipes::recipes_pkg_check("care")
 79 | 
 80 |   terms <- recipes::ellipse_check(...)
 81 | 
 82 |   recipes::add_step(
 83 |     recipe,
 84 |     step_select_carscore_new(
 85 |       terms = terms,
 86 |       trained = trained,
 87 |       outcome = outcome,
 88 |       role = role,
 89 |       top_p = top_p,
 90 |       threshold = threshold,
 91 |       lambda = lambda,
 92 |       diagonal = diagonal,
 93 |       exclude = exclude,
 94 |       scores = scores,
 95 |       skip = skip,
 96 |       id = id
 97 |     )
 98 |   )
 99 | }
100 | 
101 | 
# Internal constructor: forwards every field to recipes::step() with the
# "select_carscore" subclass.
step_select_carscore_new <- function(terms,
                                     role,
                                     trained,
                                     outcome,
                                     top_p,
                                     threshold,
                                     lambda,
                                     diagonal,
                                     exclude,
                                     scores,
                                     skip,
                                     id) {
  recipes::step(
    subclass = "select_carscore",
    terms = terms,
    role = role,
    trained = trained,
    outcome = outcome,
    top_p = top_p,
    threshold = threshold,
    lambda = lambda,
    diagonal = diagonal,
    exclude = exclude,
    scores = scores,
    skip = skip,
    id = id
  )
}
122 | 
123 | 
124 | #' @export
prep.step_select_carscore <- function(x, training, info = NULL, ...) {

  # extract response and predictor names
  x_names <- recipes::terms_select(terms = x$terms, info = info)
  y_name <- recipes::terms_select(x$outcome, info = info)
  y_name <- y_name[1]

  # check criteria
  recipes::check_type(training[, y_name], quant = TRUE)
  check_criteria(x$top_p, x$threshold, match.call())
  check_zero_one(x$threshold)
  x$top_p <- check_top_p(x$top_p, length(x_names))

  # Defaults used when there are no predictors to score. `res` must exist in
  # that case too; otherwise building the trained step below fails with
  # "object 'res' not found".
  exclude <- character()
  res <- NULL

  # CAR scores
  if (length(x_names) > 0) {

    # `lambda` is only forwarded when the user supplied a value, so that
    # care::carscore() otherwise uses its own default.
    args <- list()

    if (!is.na(x$lambda)) {
      args$lambda <- x$lambda
    }

    call <- rlang::call2(
      .fn = "carscore",
      .ns = "care",
      Xtrain = training[, x_names],
      Ytrain = training[, y_name],
      diagonal = x$diagonal,
      !!!args
    )

    res <- rlang::eval_tidy(call)

    # Features are ranked on the absolute value of their score.
    res <- tibble(
      variable = names(res),
      score = abs(res)
    )

    exclude <-
      select_percentile(res$score, x$top_p, x$threshold, maximize = TRUE)
  }

  step_select_carscore_new(
    terms = x$terms,
    trained = TRUE,
    role = x$role,
    outcome = y_name,
    top_p = x$top_p,
    threshold = x$threshold,
    lambda = x$lambda,
    diagonal = x$diagonal,
    exclude = exclude,
    scores = res,
    skip = x$skip,
    id = x$id
  )
}
184 | 
185 | #' @export
bake.step_select_carscore <- function(object, new_data, ...) {
  # Remove the columns recorded in `object$exclude` (if any) and return the
  # result as a tibble.
  #
  # The length test is applied to the vector itself; the original
  # `length(object$exclude > 0)` only worked by accident. `drop = FALSE`
  # keeps a plain data.frame from collapsing to a vector when one column
  # remains.
  if (length(object$exclude) > 0) {
    keep <- !(colnames(new_data) %in% object$exclude)
    new_data <- new_data[, keep, drop = FALSE]
  }
  as_tibble(new_data)
}
192 | 
193 | #' @export
print.step_select_carscore <- function(x, width = max(20, options()$width - 30), ...) {
  # One-line summary of the step; once trained, the number of excluded
  # predictors is appended. `width` is accepted but not used here.
  cat("Carscore feature selection")

  suffix <- ""
  if (recipes::is_trained(x)) {
    suffix <- paste0(" (", length(x$exclude), " excluded)")
  }
  cat(suffix, "\n", sep = "")

  # Return the step invisibly so the method can be used in a pipeline.
  invisible(x)
}
205 | 
206 | #' @rdname step_select_carscore
207 | #' @param x A `step_select_carscore` object.
208 | #' @export
tidy.step_select_carscore <- function(x, ...) {
  # Trained steps report the predictors stored in `x$exclude`; untrained steps
  # report a single NA placeholder. The previously computed-but-unused
  # `term_names` local has been dropped.
  excluded <- if (recipes::is_trained(x)) x$exclude else rlang::na_chr

  res <- tibble(terms = excluded)
  res$id <- x$id
  res
}
219 | 
220 | #' @export
tunable.step_select_carscore <- function(x, ...) {
  # Describe the two tunable arguments of the step and where their parameter
  # objects are defined.
  top_p_info <- list(pkg = "recipeselectors", fun = "top_p")
  threshold_info <- list(pkg = "dials", fun = "threshold", range = c(0, 1))

  tibble::tibble(
    name = c("top_p", "threshold"),
    call_info = list(top_p_info, threshold_info),
    source = "recipe",
    component = "step_select_carscore",
    component_id = x$id
  )
}
233 | 


--------------------------------------------------------------------------------
/R/step_select_xtab.R:
--------------------------------------------------------------------------------
  1 | #' Filter Categorical Predictors using Contingency Tables
  2 | #'
  3 | #' `step_select_xtab` creates a *specification* of a recipe step that will
  4 | #'  filter predictors using their relationship with the outcome as measured
  5 | #'  using statistical tests for association.
  6 | #'
  7 | #' @param recipe 	A recipe object. The step will be added to the sequence of
  8 | #'   operations for this recipe.
  9 | #' @param ... One or more selector functions to choose which predictors are
 10 | #'  affected by the step. See [selections()] for more details. For the `tidy`
 11 | #'  method, these are not currently used.
 12 | #' @param outcome A single character string that specifies a single categorical
 13 | #'  variable to be used as the class.
 14 | #' @param role For model terms created by this step, what analysis role should
 15 | #'  they be assigned?. By default, the function assumes that resulting distances
 16 | #'  will be used as predictors in a model.
 17 | #' @param threshold A numeric value, in p-value/FDR units, where predictors with
 18 | #'  values _smaller_ than the threshold will be retained. A value of `NA`
 19 | #'  implies that this criterion will be ignored.
 20 | #' @param top_p An integer that will be used to select the predictors with the
 21 | #'  smallest p/FDR values. A value of `NA` implies that this criterion will be
 22 | #'  ignored.
 23 | #' @param exact Should an exact test be used?
 24 | #' @param fdr Should false discovery rates (FDR) be used instead of p-values?
 25 | #' @param exclude A character vector of predictor names that will be removed
 26 | #'  from the data. This will be set when `prep()` is used on the recipe and
 27 | #'  should not be set by the user.
 28 | #' @param trained A logical to indicate if the quantities for preprocessing have
 29 | #'   been estimated.
 30 | #' @param skip A logical. Should the step be skipped when the recipe is baked by
 31 | #'   bake.recipe()? While all operations are baked when prep.recipe() is run,
 32 | #'   some operations may not be able to be conducted on new data (e.g.
 33 | #'   processing the outcome variable(s)). Care should be taken when using skip =
 34 | #'   TRUE as it may affect the computations for subsequent operations.
 35 | #' @param id 	A character string that is unique to this step to identify it.
 36 | #' @return An updated version of `recipe` with the new step added to the
 37 | #'  sequence of existing steps (if any). For the `tidy` method, a tibble with a
 38 | #'  `terms` column for which predictors were removed.
 39 | #' @keywords datagen
 40 | #' @concept preprocessing
 41 | #' @concept supervised_filter
 42 | #' @export
 43 | #' @details
 44 | #'
 45 | #' The recipe will stop if both `top_p` and `threshold` are left unspecified. If
 46 | #' both are used, they are combined via 'or'.
 47 | #'
 48 | #' The Benjamini-Hochberg FDR correction is used (see [stats::p.adjust()]).
 49 | #'
 50 | #' Warnings from [stats::chisq.test()] and [stats::fisher.test()] are suppressed.
 51 | #' @examples
 52 | #' data(attrition, package = "modeldata")
 53 | #'
 54 | #' rec <-
 55 | #'   recipe(Attrition ~ ., data = attrition) %>%
 56 | #'   step_select_xtab(all_nominal(), -all_outcomes(), outcome = "Attrition",
 57 | #'                    top_p = 1, threshold = 0.001, exact = TRUE) %>%
 58 | #'   prep()
 59 | #'
 60 | #' rec %>% juice(all_nominal(), -all_outcomes()) %>% names()
 61 | #'
 62 | #' tidy(rec, number = 1)
 63 | #'
 64 | step_select_xtab <- function(recipe,
 65 |                              ...,
 66 |                              outcome,
 67 |                              role = "predictor",
 68 |                              trained = FALSE,
 69 |                              threshold = NA,
 70 |                              top_p = NA,
 71 |                              exact = FALSE,
 72 |                              fdr = TRUE,
 73 |                              exclude = NULL,
 74 |                              skip = FALSE,
 75 |                              id = recipes::rand_id("select_xtab")) {
 76 |   recipes::add_step(
 77 |     recipe,
 78 |     step_select_xtab_new(
 79 |       terms = recipes::ellipse_check(...),
 80 |       outcome = outcome,
 81 |       role = role,
 82 |       trained = trained,
 83 |       threshold = threshold,
 84 |       top_p = top_p,
 85 |       exact = exact,
 86 |       fdr = fdr,
 87 |       exclude = exclude,
 88 |       skip = skip,
 89 |       id = id
 90 |     )
 91 |   )
 92 | }
 93 | 
 94 | step_select_xtab_new <-
 95 |   function(terms, outcome, role, trained, threshold, top_p, exact, fdr,
 96 |            exclude, skip, id) {
 97 |     recipes::step(
 98 |       subclass = "select_xtab",
 99 |       terms = terms,
100 |       outcome = outcome,
101 |       role = role,
102 |       trained = trained,
103 |       threshold = threshold,
104 |       top_p = top_p,
105 |       exact = exact,
106 |       fdr = fdr,
107 |       exclude = exclude,
108 |       skip = skip,
109 |       id = id
110 |     )
111 |   }
112 | 
# P-value of an association test between two categorical vectors.
#
# `x` and `y` are cross-tabulated; `exact = TRUE` applies Fisher's exact test,
# otherwise the chi-squared test is used. Warnings from the test are
# suppressed, and a test that errors yields NA_real_.
tbl_calc <- function(x, y, exact) {
  counts <- table(x, y)
  assoc_test <- if (exact) stats::fisher.test else stats::chisq.test

  tryCatch(
    suppressWarnings(assoc_test(counts)$p.value),
    error = function(e) NA_real_
  )
}
125 | 
126 | #' @export
prep.step_select_xtab <- function(x, training, info = NULL, ...) {
  # The outcome is run through the selector machinery, but its return value is
  # not used; the outcome column is taken from the stored string itself.
  recipes::terms_select(x$outcome, info = info)
  outcome_col <- x$outcome[1]
  recipes::check_type(training[, outcome_col], quant = FALSE)

  # `empty_fun = I` is passed so that the zero-predictor case below can be
  # reached.
  predictor_cols <- recipes::terms_select(x$terms, info = info, empty_fun = I)

  removed <- character()

  if (length(predictor_cols) > 0) {
    recipes::check_type(training[, predictor_cols], quant = FALSE)

    # validate the selection criteria
    check_criteria(x$top_p, x$threshold, match.call())
    check_zero_one(x$threshold)
    x$top_p <- check_top_p(x$top_p, length(predictor_cols))

    # one p-value per predictor, sorted ascending with NAs last
    outcome_values <- training[[outcome_col]]
    p_values <- purrr::map_dbl(
      training[, predictor_cols],
      ~ tbl_calc(.x, outcome_values, exact = x$exact)
    )
    p_values <- sort(p_values, na.last = TRUE)

    # optionally convert to Benjamini-Hochberg adjusted values
    if (x$fdr) {
      p_values <- stats::p.adjust(p_values, method = "BH")
    }

    removed <- dual_filter(p_values, x$top_p, x$threshold, maximize = FALSE)
  }

  step_select_xtab_new(
    terms = x$terms,
    outcome = x$outcome,
    role = x$role,
    trained = TRUE,
    threshold = x$threshold,
    top_p = x$top_p,
    exact = x$exact,
    fdr = x$fdr,
    exclude = removed,
    skip = x$skip,
    id = x$id
  )
}
169 | 
170 | #' @export
bake.step_select_xtab <- function(object, new_data, ...) {
  # Nothing was flagged for removal: hand the data back untouched.
  if (length(object$exclude) == 0) {
    return(new_data)
  }

  # Drop the columns recorded in `object$exclude`.
  dplyr::select(new_data, -dplyr::one_of(object$exclude))
}
177 | 
178 | #' @export
print.step_select_xtab <- function(x, width = max(20, options()$width - 30), ...) {
  # One-line summary of the step; once trained, the number of excluded
  # predictors is appended. `width` is accepted but not used here.
  cat("Association test feature selection")

  suffix <- ""
  if (recipes::is_trained(x)) {
    suffix <- paste0(" (", length(x$exclude), " excluded)")
  }
  cat(suffix, "\n", sep = "")

  # Return the step invisibly so the method can be used in a pipeline.
  invisible(x)
}
190 | 
191 | #' @rdname step_select_xtab
192 | #' @param x A `step_select_xtab` object.
193 | #' @export
tidy.step_select_xtab <- function(x, ...) {
  # Trained steps report the predictors stored in `x$exclude`; untrained steps
  # report a single NA placeholder. The previously computed-but-unused
  # `term_names` local has been dropped.
  excluded <- if (recipes::is_trained(x)) x$exclude else rlang::na_chr

  res <- tibble(terms = excluded)
  res$id <- x$id
  res
}
204 | 
205 | #' @export
tunable.step_select_xtab <- function(x, ...) {
  # Describe the two tunable arguments of the step and where their parameter
  # objects are defined.
  top_p_info <- list(pkg = "recipeselectors", fun = "top_p")
  # NOTE(review): this range is negative, whereas prep() validates the
  # threshold with check_zero_one() -- presumably a log10 scale was intended;
  # confirm before relying on tuned values.
  threshold_info <- list(pkg = "dials", fun = "threshold", range = c(-10, -1))

  tibble::tibble(
    name = c("top_p", "threshold"),
    call_info = list(top_p_info, threshold_info),
    source = "recipe",
    component = "step_select_xtab",
    component_id = x$id
  )
}
218 | 


--------------------------------------------------------------------------------
/R/step_select_infgain.R:
--------------------------------------------------------------------------------
  1 | #' Information gain feature selection step
  2 | #'
  3 | #' `step_select_infgain` creates a *specification* of a recipe step that selects a
  4 | #' subset of predictors based on the scores of the information gain algorithm.
  5 | #' This step requires the FSelectorRcpp package to be installed. The top
  6 | #' `top_p` scoring features, or features whose scores occur in the top
  7 | #' percentile `threshold` will be retained as new predictors.
  8 | #'
  9 | #' @param recipe 	A recipe object. The step will be added to the sequence of
 10 | #'   operations for this recipe.
 11 | #' @param ... One or more selector functions to choose which variables are
 12 | #'   affected by the step. See selections() for more details. For the tidy
 13 | #'   method, these are not currently used.
 14 | #' @param role Not used by this step since no new variables are created.
 15 | #' @param trained A logical to indicate if the quantities for preprocessing have
 16 | #'   been estimated.
 17 | #' @param type A character string specifying the information gain method to use.
 18 | #'   One of "infogain", "gainratio", "symuncert". The default is 'infogain'.
 19 | #' @param outcome A character string with the name of the response variable to
 20 | #'   use to evaluate information gain value against the predictors.
 21 | #' @param top_p An integer with the number of best scoring features to
 22 | #'   select.
 23 | #' @param threshold A numeric value between 0 and 1 representing the percentile
 24 | #'   of best scoring features to select. Features with scores that are _larger_
 25 | #'   than the specified threshold will be retained, for example `threshold =
 26 | #'   0.9` will retain only predictors with scores in the top 90th percentile.
 27 | #'   Note that this overrides `top_p`.
 28 | #' @param threads An integer specifying the number of threads to use for
 29 | #'   processing. The default is 1; a value of 0 uses all available threads.
 30 | #' @param exclude A character vector of predictor names that will be removed
 31 | #'  from the data. This will be set when `prep()` is used on the recipe and
 32 | #'  should not be set by the user.
 33 | #' @param scores A tibble with 'variable' and 'scores' columns containing the
 34 | #'   names of the variables and their information gain scores. This parameter is
 35 | #'   only produced after the recipe has been trained.
 36 | #' @param skip A logical. Should the step be skipped when the recipe is baked by
 37 | #'   bake.recipe()? While all operations are baked when prep.recipe() is run,
 38 | #'   some operations may not be able to be conducted on new data (e.g.
 39 | #'   processing the outcome variable(s)). Care should be taken when using skip =
 40 | #'   TRUE as it may affect the computations for subsequent operations.
 41 | #' @param id 	A character string that is unique to this step to identify it.
 42 | #' @return A step_select_infgain object.
 43 | #' @export
 44 | #' @keywords datagen
 45 | #' @concept preprocessing
 46 | #' @concept supervised_filter
 47 | #' @export
 48 | #' @details
 49 | #'
 50 | #' The recipe will stop if both `top_p` and `threshold` are left unspecified.
 51 | #'
 52 | #' @examples
 53 | #' library(recipes)
 54 | #'
 55 | #' data(cells, package = "modeldata")
 56 | #'
 57 | #' rec <-
 58 | #'  recipe(class ~ ., data = cells[, -1]) %>%
 59 | #'  step_select_infgain(all_predictors(), outcome = "class", top_p = 10, threshold = 0.9)
 60 | #'
 61 | #' prepped <- prep(rec)
 62 | #'
 63 | #' new_data <- juice(prepped)
 64 | #' prepped
 65 | step_select_infgain <- function(
 66 |   recipe, ...,
 67 |   outcome = NULL,
 68 |   role = NA,
 69 |   trained = FALSE,
 70 |   top_p = NA,
 71 |   threshold = NA,
 72 |   type = "infogain",
 73 |   threads = 1,
 74 |   exclude = NULL,
 75 |   scores = NULL,
 76 |   skip = FALSE,
 77 |   id = recipes::rand_id("select_infgain")) {
 78 | 
 79 |   recipes::recipes_pkg_check("FSelectorRcpp")
 80 | 
 81 |   terms <- recipes::ellipse_check(...)
 82 | 
 83 |   recipes::add_step(
 84 |     recipe,
 85 |     step_select_infgain_new(
 86 |       terms = terms,
 87 |       trained = trained,
 88 |       outcome = outcome,
 89 |       role = role,
 90 |       top_p = top_p,
 91 |       threshold = threshold,
 92 |       type = type,
 93 |       threads = threads,
 94 |       exclude = exclude,
 95 |       scores = scores,
 96 |       skip = skip,
 97 |       id = id
 98 |     )
 99 |   )
100 | }
101 | 
102 | 
# Internal constructor: forwards every field to recipes::step() with the
# "select_infgain" subclass.
step_select_infgain_new <- function(terms,
                                    role,
                                    trained,
                                    outcome,
                                    top_p,
                                    threshold,
                                    type,
                                    threads,
                                    exclude,
                                    scores,
                                    skip,
                                    id) {
  recipes::step(
    subclass = "select_infgain",
    terms = terms,
    role = role,
    trained = trained,
    outcome = outcome,
    top_p = top_p,
    threshold = threshold,
    type = type,
    threads = threads,
    exclude = exclude,
    scores = scores,
    skip = skip,
    id = id
  )
}
123 | 
124 | 
125 | #' @export
prep.step_select_infgain <- function(x, training, info = NULL, ...) {
  # extract response and predictor names
  x_names <- recipes::terms_select(terms = x$terms, info = info)
  y_name <- recipes::terms_select(x$outcome, info = info)
  y_name <- y_name[1]

  # check criteria
  check_criteria(x$top_p, x$threshold, match.call())
  check_zero_one(x$threshold)
  x$top_p <- check_top_p(x$top_p, length(x_names))

  # Defaults used when there are no predictors to score. `res` must exist in
  # that case too; otherwise building the trained step below fails with
  # "object 'res' not found".
  exclude <- character()
  res <- NULL

  # information gain
  if (length(x_names) > 0) {

    f <- stats::as.formula(paste(y_name, "~", paste0(x_names, collapse = " + ")))

    # The training data is passed as a quosure rather than inlined into the
    # call.
    ig_call <- rlang::call2(
      .fn = "information_gain",
      .ns = "FSelectorRcpp",
      formula = f,
      data = rlang::quo(training),
      type = x$type,
      threads = x$threads,
      discIntegers = TRUE,
      equal = FALSE
    )

    res <- rlang::eval_tidy(ig_call)
    res <- as_tibble(res)
    res <- rlang::set_names(res, c("variable", "score"))
    # Name each score by its variable before passing it to the filter.
    res$score <- rlang::set_names(res$score, res$variable)

    exclude <-
      select_percentile(res$score, x$top_p, x$threshold, maximize = TRUE)
  }

  step_select_infgain_new(
    terms = x$terms,
    trained = TRUE,
    role = x$role,
    outcome = y_name,
    top_p = x$top_p,
    threshold = x$threshold,
    type = x$type,
    threads = x$threads,
    exclude = exclude,
    scores = res,
    skip = x$skip,
    id = x$id
  )
}
180 | 
181 | #' @export
bake.step_select_infgain <- function(object, new_data, ...) {
  # Remove the columns recorded in `object$exclude` (if any) and return the
  # result as a tibble.
  #
  # The length test is applied to the vector itself; the original
  # `length(object$exclude > 0)` only worked by accident. `drop = FALSE`
  # keeps a plain data.frame from collapsing to a vector when one column
  # remains.
  if (length(object$exclude) > 0) {
    keep <- !(colnames(new_data) %in% object$exclude)
    new_data <- new_data[, keep, drop = FALSE]
  }
  as_tibble(new_data)
}
188 | 
189 | #' @export
print.step_select_infgain <- function(x, width = max(20, options()$width - 30), ...) {
  # One-line summary of the step; once trained, the number of excluded
  # predictors is appended. `width` is accepted but not used here.
  cat("Information Gain feature selection")

  suffix <- ""
  if (recipes::is_trained(x)) {
    suffix <- paste0(" (", length(x$exclude), " excluded)")
  }
  cat(suffix, "\n", sep = "")

  # Return the step invisibly so the method can be used in a pipeline.
  invisible(x)
}
201 | 
#' @rdname step_select_infgain
#' @param x A `step_select_infgain` object.
#' @export
tidy.step_select_infgain <- function(x, ...) {
  if (recipes::is_trained(x)) {
    # trained step: list the predictors that were marked for exclusion
    res <- tibble(terms = x$exclude)
  } else {
    # untrained step: list the selectors as text, as rendered by
    # recipes::sel2char() (same convention as tidy.step_select_tree)
    term_names <- recipes::sel2char(x$terms)
    res <- tibble(terms = term_names)
  }
  res$id <- x$id
  res
}
215 | 
# Describe the tunable arguments of the step (`top_p` and `threshold`) and the
# parameter functions that generate them.
#' @export
tunable.step_select_infgain <- function(x, ...) {
  top_p_info <- list(pkg = "recipeselectors", fun = "top_p")
  threshold_info <- list(pkg = "dials", fun = "threshold", range = c(0, 1))

  tibble::tibble(
    name = c("top_p", "threshold"),
    call_info = list(top_p_info, threshold_info),
    source = "recipe",
    component = "step_select_infgain",
    component_id = x$id
  )
}
229 | 


--------------------------------------------------------------------------------
/docs/reference/index.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <!-- Generated by pkgdown: do not edit by hand --><html lang="en"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"><meta charset="utf-8"><meta http-equiv="X-UA-Compatible" content="IE=edge"><meta name="viewport" content="width=device-width, initial-scale=1.0"><title>Function reference • recipeselectors</title><!-- jquery --><script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.4.1/jquery.min.js" integrity="sha256-CSXorXvZcTkaix6Yvo6HppcZGetbYMGWSFlBw8HfCJo=" crossorigin="anonymous"></script><!-- Bootstrap --><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/3.4.1/css/bootstrap.min.css" integrity="sha256-bZLfwXAP04zRMK2BjiO8iu9pf4FbLqX6zitd+tIvLhE=" crossorigin="anonymous"><script src="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/3.4.1/js/bootstrap.min.js" integrity="sha256-nuL8/2cJ5NDSSwnKD8VqreErSWHtnEP9E7AySL+1ev4=" crossorigin="anonymous"></script><!-- bootstrap-toc --><link rel="stylesheet" href="../bootstrap-toc.css"><script src="../bootstrap-toc.js"></script><!-- Font Awesome icons --><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/all.min.css" integrity="sha256-mmgLkCYLUQbXn0B1SRqzHar6dCnv9oZFPEC1g1cwlkk=" crossorigin="anonymous"><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/v4-shims.min.css" integrity="sha256-wZjR52fzng1pJHwx4aV2AO3yyTOXrcDW7jBpJtTwVxw=" crossorigin="anonymous"><!-- clipboard.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.6/clipboard.min.js" integrity="sha256-inc5kl9MA1hkeYUt+EC3BhlIgyp/2jDIyBLS6k3UxPI=" crossorigin="anonymous"></script><!-- headroom.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/headroom.min.js" integrity="sha256-AsUX4SJE1+yuDu5+mAVzJbuYNPHj/WroHuZ8Ir/CkE0=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/jQuery.headroom.min.js" 
integrity="sha256-ZX/yNShbjqsohH1k95liqY9Gd8uOiE1S4vZc+9KQ1K4=" crossorigin="anonymous"></script><!-- pkgdown --><link href="../pkgdown.css" rel="stylesheet"><script src="../pkgdown.js"></script><meta property="og:title" content="Function reference"><!-- mathjax --><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js" integrity="sha256-nvJJv9wWKEm88qvoQl9ekL2J+k/RWIsaSScxxlsrv8k=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/config/TeX-AMS-MML_HTMLorMML.js" integrity="sha256-84DKXVJXs0/F8OTMzX4UR909+jtl4G7SPypPavF+GfA=" crossorigin="anonymous"></script><!--[if lt IE 9]>
  3 | <script src="https://oss.maxcdn.com/html5shiv/3.7.3/html5shiv.min.js"></script>
  4 | <script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
  5 | <![endif]--></head><body data-spy="scroll" data-target="#toc">
  6 |     
  7 | 
  8 |     <div class="container template-reference-index">
  9 |       <header><div class="navbar navbar-default navbar-fixed-top" role="navigation">
 10 |   <div class="container">
 11 |     <div class="navbar-header">
 12 |       <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar" aria-expanded="false">
 13 |         <span class="sr-only">Toggle navigation</span>
 14 |         <span class="icon-bar"></span>
 15 |         <span class="icon-bar"></span>
 16 |         <span class="icon-bar"></span>
 17 |       </button>
 18 |       <span class="navbar-brand">
 19 |         <a class="navbar-link" href="../index.html">recipeselectors</a>
 20 |         <span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="">0.0.1</span>
 21 |       </span>
 22 |     </div>
 23 | 
 24 |     <div id="navbar" class="navbar-collapse collapse">
 25 |       <ul class="nav navbar-nav"><li>
 26 |   <a href="../reference/index.html">Reference</a>
 27 | </li>
 28 |       </ul><ul class="nav navbar-nav navbar-right"><li>
 29 |   <a href="https://github.com/stevenpawley/recipeselectors/" class="external-link">
 30 |     <span class="fab fa-github fa-lg"></span>
 31 |      
 32 |   </a>
 33 | </li>
 34 |       </ul></div><!--/.nav-collapse -->
 35 |   </div><!--/.container -->
 36 | </div><!--/.navbar -->
 37 | 
 38 |       
 39 | 
 40 |       </header><div class="row">
 41 |   <div class="contents col-md-9">
 42 |     <div class="page-header">
 43 |       <h1>Reference</h1>
 44 |     </div>
 45 | 
 46 |     <table class="ref-index"><colgroup><col class="alias"><col class="title"></colgroup><tbody><tr><th colspan="2">
 47 |           <h2 id="all-functions">All functions <a href="#all-functions" class="anchor" aria-hidden="true"></a></h2>
 48 |           <p class="section-desc"></p>
 49 |         </th>
 50 |       </tr></tbody><tbody><tr><td>
 51 |           <p><code><a href="pull_importances.html">pull_importances()</a></code> </p>
 52 |         </td>
 53 |         <td><p>Pull feature importances from a parsnip fitted model</p></td>
 54 |       </tr><tr><td>
 55 |           <p><code><a href="recipeselectors.html">recipeselectors</a></code> </p>
 56 |         </td>
 57 |         <td><p>recipeselectors: A collection of steps for feature selection to use with the
 58 | 'recipes' package</p></td>
 59 |       </tr><tr><td>
 60 |           <p><code><a href="step_select_boruta.html">step_select_boruta()</a></code> <code><a href="step_select_boruta.html">tidy(<i>&lt;step_select_boruta&gt;</i>)</a></code> </p>
 61 |         </td>
 62 |         <td><p>Feature selection step using Boruta</p></td>
 63 |       </tr><tr><td>
 64 |           <p><code><a href="step_select_carscore.html">step_select_carscore()</a></code> <code><a href="step_select_carscore.html">tidy(<i>&lt;step_select_carscore&gt;</i>)</a></code> </p>
 65 |         </td>
 66 |         <td><p>Information gain feature selection step</p></td>
 67 |       </tr><tr><td>
 68 |           <p><code><a href="step_select_forests.html">step_select_forests()</a></code> <code><a href="step_select_forests.html">tidy(<i>&lt;step_select_forests&gt;</i>)</a></code> </p>
 69 |         </td>
 70 |         <td><p>Feature selection step using a random forest feature importance scores</p></td>
 71 |       </tr><tr><td>
 72 |           <p><code><a href="step_select_infgain.html">step_select_infgain()</a></code> <code><a href="step_select_infgain.html">tidy(<i>&lt;step_select_infgain&gt;</i>)</a></code> </p>
 73 |         </td>
 74 |         <td><p>Information gain feature selection step</p></td>
 75 |       </tr><tr><td>
 76 |           <p><code><a href="step_select_linear.html">step_select_linear()</a></code> <code><a href="step_select_linear.html">tidy(<i>&lt;step_select_linear&gt;</i>)</a></code> </p>
 77 |         </td>
 78 |         <td><p>Feature selection step using the magnitude of a linear models' coefficients</p></td>
 79 |       </tr><tr><td>
 80 |           <p><code><a href="step_select_mrmr.html">step_select_mrmr()</a></code> <code><a href="step_select_mrmr.html">tidy(<i>&lt;step_select_mrmr&gt;</i>)</a></code> </p>
 81 |         </td>
 82 |         <td><p>Apply minimum Redundancy Maximum Relevance Feature Selection (mRMR)</p></td>
 83 |       </tr><tr><td>
 84 |           <p><code><a href="step_select_roc.html">step_select_roc()</a></code> <code><a href="step_select_roc.html">tidy(<i>&lt;step_select_roc&gt;</i>)</a></code> </p>
 85 |         </td>
 86 |         <td><p>Filter Numeric Predictors using ROC Curve</p></td>
 87 |       </tr><tr><td>
 88 |           <p><code><a href="step_select_tree.html">step_select_tree()</a></code> <code><a href="step_select_tree.html">tidy(<i>&lt;step_select_tree&gt;</i>)</a></code> </p>
 89 |         </td>
 90 |         <td><p>Feature selection step using a decision tree importance scores</p></td>
 91 |       </tr><tr><td>
 92 |           <p><code><a href="step_select_vip.html">step_select_vip()</a></code> <code><a href="step_select_vip.html">tidy(<i>&lt;step_select_vip&gt;</i>)</a></code> </p>
 93 |         </td>
 94 |         <td><p>Feature selection step using a model's feature importance scores or
 95 | coefficients</p></td>
 96 |       </tr><tr><td>
 97 |           <p><code><a href="step_select_xtab.html">step_select_xtab()</a></code> <code><a href="step_select_xtab.html">tidy(<i>&lt;step_select_xtab&gt;</i>)</a></code> </p>
 98 |         </td>
 99 |         <td><p>Filter Categorical Predictors using Contingency Tables</p></td>
100 |       </tr><tr><td>
101 |           <p><code><a href="top_p.html">top_p()</a></code> </p>
102 |         </td>
103 |         <td><p>Parameter functions for feature selection recipes</p></td>
104 |       </tr></tbody></table></div>
105 | 
106 |   <div class="col-md-3 hidden-xs hidden-sm" id="pkgdown-sidebar">
107 |     <nav id="toc" data-toggle="toc" class="sticky-top"><h2 data-toc-skip>Contents</h2>
108 |     </nav></div>
109 | </div>
110 | 
111 | 
112 |       <footer><div class="copyright">
113 |   <p></p><p>Developed by Steven Pawley.</p>
114 | </div>
115 | 
116 | <div class="pkgdown">
117 |   <p></p><p>Site built with <a href="https://pkgdown.r-lib.org/" class="external-link">pkgdown</a>
118 | 2.0.2.</p>
119 | </div>
120 | 
121 |       </footer></div>
122 | 
123 |   
124 | 
125 | 
126 |   
127 | 
128 |   </body></html>
129 | 
130 | 


--------------------------------------------------------------------------------
/R/step_select_tree.R:
--------------------------------------------------------------------------------
  1 | #' Feature selection step using a decision tree importance scores
  2 | #'
  3 | #' `step_select_tree` creates a *specification* of a recipe step that selects a
  4 | #' subset of predictors based on the ranking of variable importance provided by
  5 | #' a `parsnip::decision_tree` supported model.
  6 | #'
  7 | #' @param recipe A recipe object. The step will be added to the sequence of
  8 | #'   operations for this recipe.
  9 | #' @param ... One or more selector functions to choose which variables are
 10 | #'   affected by the step. See selections() for more details. For the tidy
 11 | #'   method, these are not currently used.
 12 | #' @param outcome A character string with the name of the response variable to
 13 | #'   use to calculate the feature importance scores.
 14 | #' @param role Not used by this step since no new variables are created.
 15 | #' @param trained A logical to indicate if the quantities for preprocessing have
 16 | #'   been estimated.
 17 | #' @param engine A supported rand_forest engine that is supported by parsnip.
 18 | #'   The default is "rpart".
 19 | #' @param top_p An integer with the number of best scoring features to
 20 | #'   select.
 21 | #' @param cost_complexity A positive number for the the cost/complexity
 22 | #'   parameter (a.k.a. Cp) used by CART models (specific engines only).
 23 | #' @param tree_depth An integer for maximum depth of the tree.
 24 | #' @param min_n An integer for the minimum number of data points in a node that
 25 | #'   are required for the node to be split further.
 26 | #' @param threshold A numeric value between 0 and 1 representing the percentile
 27 | #'   of best scoring features to select. Features with scores that are _larger_
 28 | #'   than the specified threshold will be retained, for example `threshold =
 29 | #'   0.9` will retain only predictors with scores in the top 90th percentile.
 30 | #'   Note that this overrides `top_p`.
 31 | #' @param exclude A character vector of predictor names that will be removed
 32 | #'  from the data. This will be set when `prep()` is used on the recipe and
 33 | #'  should not be set by the user.
 34 | #' @param scores A tibble with 'variable' and 'scores' columns containing the
 35 | #'   names of the variables and their feature importance scores. This parameter
 36 | #'   is only produced after the recipe has been trained.
 37 | #' @param skip A logical. Should the step be skipped when the recipe is baked by
 38 | #'   bake.recipe()? While all operations are baked when prep.recipe() is run,
 39 | #'   some operations may not be able to be conducted on new data (e.g.
 40 | #'   processing the outcome variable(s)). Care should be taken when using skip =
 41 | #'   TRUE as it may affect the computations for subsequent operations.
 42 | #' @param id A character string that is unique to this step to identify it.
 43 | #'
 44 | #' @return a `step_select_tree` object.
 45 | #' @export
 46 | #' @examples
 47 | #' library(recipes)
 48 | #' library(parsnip)
 49 | #'
 50 | #' # load the example iris dataset
 51 | #' data(cells, package = "modeldata")
 52 | #'
 53 | #' # create a preprocessing recipe
 54 | #' rec <-
 55 | #'  recipe(class ~ ., data = cells[, -1]) %>%
 56 | #'  step_select_tree(all_predictors(), outcome = "class", top_p = 10,
 57 | #'                      threshold = 0.9)
 58 | #'
 59 | #' prepped <- prep(rec)
 60 | #'
 61 | #' preproc_data <- juice(prepped)
 62 | #' prepped
 63 | step_select_tree <- function(
 64 |     recipe,
 65 |     ...,
 66 |     outcome = NULL,
 67 |     role = "predictor",
 68 |     trained = FALSE,
 69 |     engine = "rpart",
 70 |     cost_complexity = NULL,
 71 |     tree_depth = NULL,
 72 |     min_n = NULL,
 73 |     top_p = NA,
 74 |     threshold = NA,
 75 |     exclude = NULL,
 76 |     scores = NULL,
 77 |     skip = FALSE,
 78 |     id = recipes::rand_id("select_tree")) {
 79 | 
 80 |   engines <- parsnip::show_engines("decision_tree")$engine
 81 | 
 82 |   if (!engine %in% engines) {
 83 |     rlang::abort(
 84 |       paste("Engine argument should be one of", paste(engines, collapse = ", "))
 85 |     )
 86 |   }
 87 | 
 88 |   recipes::add_step(
 89 |     recipe,
 90 |     step_select_tree_new(
 91 |       terms = recipes::ellipse_check(...),
 92 |       trained = trained,
 93 |       outcome = outcome,
 94 |       role = role,
 95 |       engine = engine,
 96 |       cost_complexity = cost_complexity,
 97 |       tree_depth = tree_depth,
 98 |       min_n = min_n,
 99 |       top_p = top_p,
100 |       threshold = threshold,
101 |       exclude = exclude,
102 |       scores = scores,
103 |       skip = skip,
104 |       id = id
105 |     )
106 |   )
107 | }
108 | 
# Low-level constructor: hands every field to recipes::step() unchanged, with
# `subclass = "select_tree"` so the new object is classed as this step type.
#' @importFrom recipes step
step_select_tree_new <- function(
    terms,
    role,
    trained,
    outcome,
    engine,
    top_p,
    cost_complexity,
    tree_depth,
    min_n,
    threshold,
    exclude,
    scores,
    skip,
    id) {

  recipes::step(
    subclass = "select_tree",
    terms = terms, role = role, trained = trained,
    outcome = outcome, engine = engine,
    cost_complexity = cost_complexity, tree_depth = tree_depth, min_n = min_n,
    top_p = top_p, threshold = threshold,
    exclude = exclude, scores = scores,
    skip = skip, id = id
  )
}
132 | 
# Train the step: fit a parsnip decision tree on the selected predictors,
# collect the importance scores and record which predictors to exclude.
# Returns a new step object with `trained = TRUE`, `exclude` and `scores` set.
#' @export
prep.step_select_tree <- function(x, training, info = NULL, ...) {

  # translate the terms arguments
  x_names <- recipes::terms_select(terms = x$terms, info = info)
  y_name <- recipes::terms_select(x$outcome, info = info)
  y_name <- y_name[1]

  # check criteria
  check_criteria(x$top_p, x$threshold, match.call())
  check_zero_one(x$threshold)
  x$top_p <- check_top_p(x$top_p, length(x_names))

  if (length(x_names) > 0) {
    # fit initial model
    X <- training[, x_names]
    y <- training[[y_name]]

    # is.numeric() is TRUE for both double and integer outcomes, so integer
    # responses are modelled as regression rather than classification
    model_mode <- if (is.numeric(y)) "regression" else "classification"

    model_args <- list(
      cost_complexity = x$cost_complexity,
      tree_depth = x$tree_depth,
      min_n = x$min_n
    )

    model_spec <-
      parsnip::make_call("decision_tree", args = model_args, ns = "parsnip")

    model_spec <-
      rlang::eval_tidy(model_spec) %>%
      parsnip::set_mode(model_mode) %>%
      parsnip::set_engine(x$engine)

    initial_model <- parsnip::fit_xy(model_spec, X, y)
    res <- pull_importances(initial_model)
    names(res) <- c("variable", "score")
    res$score <- rlang::set_names(res$score, res$variable)

    exclude <-
      select_percentile(res$score, x$top_p, x$threshold, maximize = TRUE)

  } else {
    # nothing was selected: no model is fitted, so there are no scores and
    # nothing to exclude. `res` must still exist because it is passed on below.
    res <- tibble(variable = character(), score = numeric())
    exclude <- character()
  }

  step_select_tree_new(
    terms = x$terms,
    trained = TRUE,
    role = x$role,
    outcome = y_name,
    engine = x$engine,
    cost_complexity = x$cost_complexity,
    tree_depth = x$tree_depth,
    min_n = x$min_n,
    top_p = x$top_p,
    threshold = x$threshold,
    exclude = exclude,
    scores = res,
    skip = x$skip,
    id = x$id
  )
}
196 | 
# Apply a trained tree-importance selection step to `new_data` by removing the
# columns recorded in `object$exclude`; the result is returned as a tibble.
#' @export
bake.step_select_tree <- function(object, new_data, ...) {
  if (length(object$exclude) > 0) {
    keep <- !(colnames(new_data) %in% object$exclude)
    # drop = FALSE keeps a data-frame result even when a single column remains
    # (a base data.frame would otherwise collapse to a bare vector)
    new_data <- new_data[, keep, drop = FALSE]
  }

  as_tibble(new_data)
}
205 | 
# Print a one-line summary of the step; once the step is trained the number of
# excluded predictors is appended. Returns `x` invisibly.
#' @export
print.step_select_tree <- function(x,
                                   width = max(20, options()$width - 30),
                                   ...) {
  cat("Variable importance feature selection")

  suffix <- ""
  if (recipes::is_trained(x)) {
    suffix <- paste0(" (", length(x$exclude), " excluded)")
  }
  cat(suffix)
  cat("\n")

  invisible(x)
}
219 | 
#' @rdname step_select_tree
#' @param x A `step_select_tree` object.
#' @export
tidy.step_select_tree <- function(x, ...) {
  # trained: the excluded predictors; untrained: the selectors as text
  terms_col <- if (recipes::is_trained(x)) {
    x$exclude
  } else {
    recipes::sel2char(x$terms)
  }

  out <- tibble(terms = terms_col)
  out$id <- x$id
  out
}
234 | 
# Describe the tunable arguments of the step and the parameter functions
# (with their ranges) that generate them.
#' @export
tunable.step_select_tree <- function(x, ...) {
  param_names <-
    c("top_p", "threshold", "cost_complexity", "tree_depth", "min_n")

  # one entry per name above, in the same order
  param_calls <- list(
    list(pkg = "recipeselectors", fun = "top_p"),
    list(pkg = "dials", fun = "threshold", range = c(0, 1)),
    list(
      pkg = "dials",
      fun = "cost_complexity",
      range = c(-10, -1),
      trans = scales::log10_trans()
    ),
    list(pkg = "dials", fun = "tree_depth", range = c(1L, 15L)),
    list(pkg = "dials", fun = "min_n", range = c(2L, 40L))
  )

  tibble(
    name = param_names,
    call_info = param_calls,
    source = "recipe",
    component = "step_select_tree",
    component_id = x$id
  )
}
252 | 


--------------------------------------------------------------------------------
/docs/pkgdown.css:
--------------------------------------------------------------------------------
  1 | /* Sticky footer */
  2 | 
  3 | /**
  4 |  * Basic idea: https://philipwalton.github.io/solved-by-flexbox/demos/sticky-footer/
  5 |  * Details: https://github.com/philipwalton/solved-by-flexbox/blob/master/assets/css/components/site.css
  6 |  *
  7 |  * .Site -> body > .container
  8 |  * .Site-content -> body > .container .row
  9 |  * .footer -> footer
 10 |  *
 11 |  * Key idea seems to be to ensure that .container and __all its parents__
 12 |  * have height set to 100%
 13 |  *
 14 |  */
 15 | 
 16 | html, body {
 17 |   height: 100%;
 18 | }
 19 | 
 20 | body {
 21 |   position: relative;
 22 | }
 23 | 
 24 | body > .container {
 25 |   display: flex;
 26 |   height: 100%;
 27 |   flex-direction: column;
 28 | }
 29 | 
 30 | body > .container .row {
 31 |   flex: 1 0 auto;
 32 | }
 33 | 
 34 | footer {
 35 |   margin-top: 45px;
 36 |   padding: 35px 0 36px;
 37 |   border-top: 1px solid #e5e5e5;
 38 |   color: #666;
 39 |   display: flex;
 40 |   flex-shrink: 0;
 41 | }
 42 | footer p {
 43 |   margin-bottom: 0;
 44 | }
 45 | footer div {
 46 |   flex: 1;
 47 | }
 48 | footer .pkgdown {
 49 |   text-align: right;
 50 | }
 51 | footer p {
 52 |   margin-bottom: 0;
 53 | }
 54 | 
 55 | img.icon {
 56 |   float: right;
 57 | }
 58 | 
 59 | /* Ensure in-page images don't run outside their container */
 60 | .contents img {
 61 |   max-width: 100%;
 62 |   height: auto;
 63 | }
 64 | 
 65 | /* Fix bug in bootstrap (only seen in firefox) */
 66 | summary {
 67 |   display: list-item;
 68 | }
 69 | 
 70 | /* Typographic tweaking ---------------------------------*/
 71 | 
 72 | .contents .page-header {
 73 |   margin-top: calc(-60px + 1em);
 74 | }
 75 | 
 76 | dd {
 77 |   margin-left: 3em;
 78 | }
 79 | 
 80 | /* Section anchors ---------------------------------*/
 81 | 
 82 | a.anchor {
 83 |   display: none;
 84 |   margin-left: 5px;
 85 |   width: 20px;
 86 |   height: 20px;
 87 | 
 88 |   background-image: url(./link.svg);
 89 |   background-repeat: no-repeat;
 90 |   background-size: 20px 20px;
 91 |   background-position: center center;
 92 | }
 93 | 
 94 | h1:hover .anchor,
 95 | h2:hover .anchor,
 96 | h3:hover .anchor,
 97 | h4:hover .anchor,
 98 | h5:hover .anchor,
 99 | h6:hover .anchor {
100 |   display: inline-block;
101 | }
102 | 
103 | /* Fixes for fixed navbar --------------------------*/
104 | 
105 | .contents h1, .contents h2, .contents h3, .contents h4 {
106 |   padding-top: 60px;
107 |   margin-top: -40px;
108 | }
109 | 
110 | /* Navbar submenu --------------------------*/
111 | 
112 | .dropdown-submenu {
113 |   position: relative;
114 | }
115 | 
116 | .dropdown-submenu>.dropdown-menu {
117 |   top: 0;
118 |   left: 100%;
119 |   margin-top: -6px;
120 |   margin-left: -1px;
121 |   border-radius: 0 6px 6px 6px;
122 | }
123 | 
124 | .dropdown-submenu:hover>.dropdown-menu {
125 |   display: block;
126 | }
127 | 
128 | .dropdown-submenu>a:after {
129 |   display: block;
130 |   content: " ";
131 |   float: right;
132 |   width: 0;
133 |   height: 0;
134 |   border-color: transparent;
135 |   border-style: solid;
136 |   border-width: 5px 0 5px 5px;
137 |   border-left-color: #cccccc;
138 |   margin-top: 5px;
139 |   margin-right: -10px;
140 | }
141 | 
142 | .dropdown-submenu:hover>a:after {
143 |   border-left-color: #ffffff;
144 | }
145 | 
146 | .dropdown-submenu.pull-left {
147 |   float: none;
148 | }
149 | 
150 | .dropdown-submenu.pull-left>.dropdown-menu {
151 |   left: -100%;
152 |   margin-left: 10px;
153 |   border-radius: 6px 0 6px 6px;
154 | }
155 | 
156 | /* Sidebar --------------------------*/
157 | 
158 | #pkgdown-sidebar {
159 |   margin-top: 30px;
160 |   position: -webkit-sticky;
161 |   position: sticky;
162 |   top: 70px;
163 | }
164 | 
165 | #pkgdown-sidebar h2 {
166 |   font-size: 1.5em;
167 |   margin-top: 1em;
168 | }
169 | 
170 | #pkgdown-sidebar h2:first-child {
171 |   margin-top: 0;
172 | }
173 | 
174 | #pkgdown-sidebar .list-unstyled li {
175 |   margin-bottom: 0.5em;
176 | }
177 | 
178 | /* bootstrap-toc tweaks ------------------------------------------------------*/
179 | 
180 | /* All levels of nav */
181 | 
182 | nav[data-toggle='toc'] .nav > li > a {
183 |   padding: 4px 20px 4px 6px;
184 |   font-size: 1.5rem;
185 |   font-weight: 400;
186 |   color: inherit;
187 | }
188 | 
189 | nav[data-toggle='toc'] .nav > li > a:hover,
190 | nav[data-toggle='toc'] .nav > li > a:focus {
191 |   padding-left: 5px;
192 |   color: inherit;
193 |   border-left: 1px solid #878787;
194 | }
195 | 
196 | nav[data-toggle='toc'] .nav > .active > a,
197 | nav[data-toggle='toc'] .nav > .active:hover > a,
198 | nav[data-toggle='toc'] .nav > .active:focus > a {
199 |   padding-left: 5px;
200 |   font-size: 1.5rem;
201 |   font-weight: 400;
202 |   color: inherit;
203 |   border-left: 2px solid #878787;
204 | }
205 | 
206 | /* Nav: second level (shown on .active) */
207 | 
208 | nav[data-toggle='toc'] .nav .nav {
209 |   display: none; /* Hide by default, but at >768px, show it */
210 |   padding-bottom: 10px;
211 | }
212 | 
213 | nav[data-toggle='toc'] .nav .nav > li > a {
214 |   padding-left: 16px;
215 |   font-size: 1.35rem;
216 | }
217 | 
218 | nav[data-toggle='toc'] .nav .nav > li > a:hover,
219 | nav[data-toggle='toc'] .nav .nav > li > a:focus {
220 |   padding-left: 15px;
221 | }
222 | 
223 | nav[data-toggle='toc'] .nav .nav > .active > a,
224 | nav[data-toggle='toc'] .nav .nav > .active:hover > a,
225 | nav[data-toggle='toc'] .nav .nav > .active:focus > a {
226 |   padding-left: 15px;
227 |   font-weight: 500;
228 |   font-size: 1.35rem;
229 | }
230 | 
231 | /* orcid ------------------------------------------------------------------- */
232 | 
233 | .orcid {
234 |   font-size: 16px;
235 |   color: #A6CE39;
236 |   /* margins are required by official ORCID trademark and display guidelines */
237 |   margin-left:4px;
238 |   margin-right:4px;
239 |   vertical-align: middle;
240 | }
241 | 
242 | /* Reference index & topics ----------------------------------------------- */
243 | 
244 | .ref-index th {font-weight: normal;}
245 | 
246 | .ref-index td {vertical-align: top; min-width: 100px}
247 | .ref-index .icon {width: 40px;}
248 | .ref-index .alias {width: 40%;}
249 | .ref-index-icons .alias {width: calc(40% - 40px);}
250 | .ref-index .title {width: 60%;}
251 | 
252 | .ref-arguments th {text-align: right; padding-right: 10px;}
253 | .ref-arguments th, .ref-arguments td {vertical-align: top; min-width: 100px}
254 | .ref-arguments .name {width: 20%;}
255 | .ref-arguments .desc {width: 80%;}
256 | 
257 | /* Nice scrolling for wide elements --------------------------------------- */
258 | 
259 | table {
260 |   display: block;
261 |   overflow: auto;
262 | }
263 | 
264 | /* Syntax highlighting ---------------------------------------------------- */
265 | 
266 | pre, code, pre code {
267 |   background-color: #f8f8f8;
268 |   color: #333;
269 | }
270 | pre, pre code {
271 |   white-space: pre-wrap;
272 |   word-break: break-all;
273 |   overflow-wrap: break-word;
274 | }
275 | 
276 | pre {
277 |   border: 1px solid #eee;
278 | }
279 | 
280 | pre .img, pre .r-plt {
281 |   margin: 5px 0;
282 | }
283 | 
284 | pre .img img, pre .r-plt img {
285 |   background-color: #fff;
286 | }
287 | 
288 | code a, pre a {
289 |   color: #375f84;
290 | }
291 | 
292 | a.sourceLine:hover {
293 |   text-decoration: none;
294 | }
295 | 
296 | .fl      {color: #1514b5;}
297 | .fu      {color: #000000;} /* function */
298 | .ch,.st  {color: #036a07;} /* string */
299 | .kw      {color: #264D66;} /* keyword */
300 | .co      {color: #888888;} /* comment */
301 | 
302 | .error   {font-weight: bolder;}
303 | .warning {font-weight: bolder;}
304 | 
305 | /* Clipboard --------------------------*/
306 | 
307 | .hasCopyButton {
308 |   position: relative;
309 | }
310 | 
311 | .btn-copy-ex {
312 |   position: absolute;
313 |   right: 0;
314 |   top: 0;
315 |   visibility: hidden;
316 | }
317 | 
318 | .hasCopyButton:hover button.btn-copy-ex {
319 |   visibility: visible;
320 | }
321 | 
322 | /* headroom.js ------------------------ */
323 | 
324 | .headroom {
325 |   will-change: transform;
326 |   transition: transform 200ms linear;
327 | }
328 | .headroom--pinned {
329 |   transform: translateY(0%);
330 | }
331 | .headroom--unpinned {
332 |   transform: translateY(-100%);
333 | }
334 | 
335 | /* mark.js ----------------------------*/
336 | 
337 | mark {
338 |   background-color: rgba(255, 255, 51, 0.5);
339 |   border-bottom: 2px solid rgba(255, 153, 51, 0.3);
340 |   padding: 1px;
341 | }
342 | 
343 | /* vertical spacing after htmlwidgets */
344 | .html-widget {
345 |   margin-bottom: 10px;
346 | }
347 | 
348 | /* fontawesome ------------------------ */
349 | 
350 | .fab {
351 |     font-family: "Font Awesome 5 Brands" !important;
352 | }
353 | 
354 | /* don't display links in code chunks when printing */
355 | /* source: https://stackoverflow.com/a/10781533 */
356 | @media print {
357 |   code a:link:after, code a:visited:after {
358 |     content: "";
359 |   }
360 | }
361 | 
362 | /* Section anchors ---------------------------------
363 |    Added in pandoc 2.11: https://github.com/jgm/pandoc-templates/commit/9904bf71
364 | */
365 | 
366 | div.csl-bib-body { }
367 | div.csl-entry {
368 |   clear: both;
369 | }
370 | .hanging-indent div.csl-entry {
371 |   margin-left:2em;
372 |   text-indent:-2em;
373 | }
374 | div.csl-left-margin {
375 |   min-width:2em;
376 |   float:left;
377 | }
378 | div.csl-right-inline {
379 |   margin-left:2em;
380 |   padding-left:1em;
381 | }
382 | div.csl-indent {
383 |   margin-left: 2em;
384 | }
385 | 


--------------------------------------------------------------------------------
/R/step_select_forests.R:
--------------------------------------------------------------------------------
  1 | #' Feature selection step using a random forest feature importance scores
  2 | #'
  3 | #' `step_select_forests` creates a *specification* of a recipe step that selects
  4 | #' a subset of predictors based on the ranking of variable importance using
  5 | #' a `parsnip::rand_forest` supported model.
  6 | #'
  7 | #' @param recipe A recipe object. The step will be added to the sequence of
  8 | #'   operations for this recipe.
  9 | #' @param ... One or more selector functions to choose which variables are
 10 | #'   affected by the step. See selections() for more details. For the tidy
 11 | #'   method, these are not currently used.
 12 | #' @param outcome A character string with the name of the response variable to
 13 | #'   use to calculate the feature importance scores.
 14 | #' @param role Not used by this step since no new variables are created.
 15 | #' @param trained A logical to indicate if the quantities for preprocessing have
 16 | #'   been estimated.
 17 | #' @param engine A supported rand_forest engine that is supported by parsnip.
 18 | #'   The default is "ranger".
 19 | #' @param options A named list of options to pass to the rand_forest engine. For
 20 | #'   example, if `engine = 'ranger'` (the default) then options could be
 21 | #'   `list(permutation = 'importance`) because a feature importance method needs
 22 | #'   to be specified for this engine. This is the default.
 23 | #' @param top_p An integer with the number of best scoring features to
 24 | #'   select.
 25 | #' @param mtry An integer for the number of predictors that will be randomly
 26 | #'   sampled at each split when creating the tree models.
 27 | #' @param trees An integer for the number of trees contained in the ensemble.
 28 | #' @param min_n An integer for the minimum number of data points in a node that
 29 | #'   are required for the node to be split further.
 30 | #' @param threshold A numeric value between 0 and 1 representing the percentile
 31 | #'   of best scoring features to select. Features with scores that are _larger_
 32 | #'   than the specified threshold will be retained, for example `threshold =
 33 | #'   0.9` will retain only predictors with scores in the top 90th percentile.
 34 | #'   Note that this overrides `top_p`.
 35 | #' @param exclude A character vector of predictor names that will be removed
 36 | #'  from the data. This will be set when `prep()` is used on the recipe and
 37 | #'  should not be set by the user.
#' @param scores A tibble with 'variable' and 'score' columns containing the
#'   names of the variables and their feature importance scores. This parameter
#'   is only produced after the recipe has been trained.
 41 | #' @param skip A logical. Should the step be skipped when the recipe is baked by
 42 | #'   bake.recipe()? While all operations are baked when prep.recipe() is run,
 43 | #'   some operations may not be able to be conducted on new data (e.g.
 44 | #'   processing the outcome variable(s)). Care should be taken when using skip =
 45 | #'   TRUE as it may affect the computations for subsequent operations.
 46 | #' @param id A character string that is unique to this step to identify it.
 47 | #'
 48 | #' @return a `step_select_forests` object.
 49 | #' @export
 50 | #' @examples
 51 | #' library(recipes)
 52 | #' library(parsnip)
 53 | #'
#' # load the example cells dataset
 55 | #' data(cells, package = "modeldata")
 56 | #'
 57 | #' # create a preprocessing recipe
 58 | #' rec <-
 59 | #'  recipe(class ~ ., data = cells[, -1]) %>%
 60 | #'  step_select_forests(all_predictors(), outcome = "class", top_p = 10,
 61 | #'                      threshold = 0.9)
 62 | #'
 63 | #' prepped <- prep(rec)
 64 | #'
 65 | #' preproc_data <- juice(prepped)
 66 | #' prepped
 67 | step_select_forests <- function(
 68 |     recipe,
 69 |     ...,
 70 |     outcome = NULL,
 71 |     role = "predictor",
 72 |     trained = FALSE,
 73 |     engine = "ranger",
 74 |     options = list(importance = "permutation"),
 75 |     mtry = NULL,
 76 |     trees = NULL,
 77 |     min_n = NULL,
 78 |     top_p = NA,
 79 |     threshold = NA,
 80 |     exclude = NULL,
 81 |     scores = NULL,
 82 |     skip = FALSE,
 83 |     id = recipes::rand_id("select_forests")) {
 84 | 
 85 |   engines <- parsnip::show_engines("rand_forest")$engine
 86 | 
 87 |   if (!engine %in% parsnip::show_engines("rand_forest")$engine)
 88 |     rlang::abort(
 89 |       paste("Engine argument should be one of", paste(engines, collapse = ", "))
 90 |     )
 91 | 
 92 |   recipes::add_step(
 93 |     recipe,
 94 |     step_select_forests_new(
 95 |       terms = recipes::ellipse_check(...),
 96 |       trained = trained,
 97 |       outcome = outcome,
 98 |       role = role,
 99 |       engine = engine,
100 |       options = options,
101 |       mtry = mtry,
102 |       trees = trees,
103 |       min_n = min_n,
104 |       top_p = top_p,
105 |       threshold = threshold,
106 |       exclude = exclude,
107 |       scores = scores,
108 |       skip = skip,
109 |       id = id
110 |     )
111 |   )
112 | }
113 | 
# Internal constructor for the step: forwards every argument, unchanged and in
# a fixed order, to `recipes::step()` with the subclass "select_forests". It is
# called both when the step is first added to a recipe and again from `prep()`
# to build the trained version of the step.
#' @importFrom recipes step
step_select_forests_new <- function(terms, role, trained, outcome, engine,
                                    options, top_p, mtry, trees, min_n,
                                    threshold, exclude, scores, skip, id) {
  recipes::step(
    subclass = "select_forests",
    terms = terms,
    role = role,
    trained = trained,
    outcome = outcome,
    engine = engine,
    options = options,
    mtry = mtry,
    trees = trees,
    min_n = min_n,
    top_p = top_p,
    threshold = threshold,
    exclude = exclude,
    scores = scores,
    skip = skip,
    id = id
  )
}
138 | 
#' @export
prep.step_select_forests <- function(x, training, info = NULL, ...) {
  # Resolve the selectors into predictor names and the outcome column name;
  # only the first resolved outcome is used.
  x_names <- recipes::terms_select(terms = x$terms, info = info)
  y_name <- recipes::terms_select(x$outcome, info = info)
  y_name <- y_name[1]

  # Validate the selection criteria before fitting anything.
  check_criteria(x$top_p, x$threshold, match.call())
  check_zero_one(x$threshold)
  x$top_p <- check_top_p(x$top_p, length(x_names))

  if (length(x_names) > 0) {
    # Fit the initial model on the selected predictors.
    X <- training[, x_names]
    y <- training[[y_name]]

    # `is.numeric()` is TRUE for both double and integer outcomes, so integer
    # responses are modelled as regression rather than classification.
    model_mode <- if (is.numeric(y)) "regression" else "classification"

    model_args <- list(
      trees = x$trees,
      mtry = x$mtry,
      min_n = x$min_n
    )

    model_spec <-
      parsnip::make_call("rand_forest", args = model_args, ns = "parsnip")

    model_spec <-
      rlang::eval_tidy(model_spec) %>%
      parsnip::set_mode(model_mode) %>%
      parsnip::set_engine(x$engine, !!!x$options)

    initial_model <- parsnip::fit_xy(model_spec, X, y)

    # Importance scores as a two-column table; the score column also carries
    # the variable names so the selection helper can return names.
    res <- pull_importances(initial_model)
    names(res) <- c("variable", "score")
    res$score <- rlang::set_names(res$score, res$variable)

    exclude <-
      select_percentile(res$score, x$top_p, x$threshold, maximize = TRUE)

  } else {
    # Nothing was selected: there is nothing to score or exclude. `res` must
    # still be defined because it is stored as the `scores` field below.
    res <- tibble(variable = character(), score = numeric())
    exclude <- character()
  }

  step_select_forests_new(
    terms = x$terms,
    trained = TRUE,
    role = x$role,
    outcome = y_name,
    engine = x$engine,
    options = x$options,
    mtry = x$mtry,
    trees = x$trees,
    min_n = x$min_n,
    top_p = x$top_p,
    threshold = x$threshold,
    exclude = exclude,
    scores = res,
    skip = x$skip,
    id = x$id
  )
}
202 | 
#' @export
bake.step_select_forests <- function(object, new_data, ...) {
  # Drop the predictors that were marked for exclusion during `prep()`.
  if (length(object$exclude) > 0) {
    keep <- !colnames(new_data) %in% object$exclude
    # `drop = FALSE` keeps a plain data.frame rectangular even when only one
    # column survives (it would otherwise collapse to a bare vector).
    new_data <- new_data[, keep, drop = FALSE]
  }

  as_tibble(new_data)
}
211 | 
#' @export
print.step_select_forests <- function(x, width = max(20, options()$width - 30),
                                      ...) {
  # Assemble the one-line summary first, then emit it in a single call.
  summary_line <- "Variable importance feature selection"

  if (recipes::is_trained(x)) {
    # Once trained, report how many predictors the step removes.
    summary_line <- paste0(summary_line, " (", length(x$exclude), " excluded)")
  }

  cat(summary_line, "\n", sep = "")

  invisible(x)
}
225 | 
#' @rdname tidy.recipe
#' @param x A `step_select_forests` object.
#' @export
tidy.step_select_forests <- function(x, ...) {
  # A trained step lists the excluded predictors; an untrained step lists the
  # selector expressions converted to text.
  term_values <- if (recipes::is_trained(x)) {
    x$exclude
  } else {
    recipes::sel2char(x$terms)
  }

  out <- tibble(terms = term_values)
  out$id <- x$id
  out
}
240 | 
#' @export
tunable.step_select_forests <- function(x, ...) {
  # Names of the tunable arguments, in the same order as their parameter
  # definitions below.
  param_names <- c("top_p", "threshold", "mtry", "trees", "min_n")

  # Where each parameter object is defined, plus a range where one is given.
  param_calls <- list(
    list(pkg = "recipeselectors", fun = "top_p"),
    list(pkg = "dials", fun = "threshold", range = c(0, 1)),
    list(pkg = "dials", fun = "mtry", range = c(1L, dials::unknown())),
    list(pkg = "dials", fun = "trees", range = c(1L, 2000L)),
    list(pkg = "dials", fun = "min_n", range = c(2L, 40L))
  )

  tibble(
    name = param_names,
    call_info = param_calls,
    source = "recipe",
    component = "step_select_forests",
    component_id = x$id
  )
}
257 | 


--------------------------------------------------------------------------------
/R/pull_importances.R:
--------------------------------------------------------------------------------
  1 | #' Pull feature importances from a parsnip fitted model
  2 | #'
  3 | #' `pull_importances` is a generic function to extract feature importance scores
  4 | #' or coefficients from a parsnip `model_fit` object and return them as a tibble
  5 | #' with a 'feature' and 'importance' column. This is designed to support the
  6 | #' `step_importance` recipe step.
  7 | #'
#' Most of the basic models within the parsnip package that support feature
#' importances are implemented (call `methods(pull_importances)` to list models
#' that are currently implemented). If you need to pull the feature importance
#' scores from a model that is not currently supported in this package, then you
#' can add a method to the `pull_importances` generic function which returns a
#' two-column tibble with 'feature' and 'importance' columns.
 14 | #'
 15 | #' @param object A `model_fit` object.
#' @param scaled A logical indicating whether to rescale the importances between
#'   0 and 100. Default is TRUE.
 18 | #' @param ... A list of other parameters passed to the feature importance
 19 | #'   method.
 20 | #'
 21 | #' @return tibble
 22 | #' @export
 23 | #'
 24 | #' @examples
 25 | #' library(parsnip)
 26 | #'
 27 | #' # pull feature importances from a model_fit object
 28 | #' model <- boost_tree(mode = "classification") %>%
 29 | #'     set_engine("xgboost")
 30 | #' model_fit <- model %>% fit(Species ~., iris)
 31 | #' pull_importances(model_fit)
 32 | #'
 33 | #' # create a new pull_importances method
 34 | #' pull_importances._ranger <- function(object, scaled = FALSE, ...) {
 35 | #'     # create a call to the ranger::importance function avoiding having to use
 36 | #'     # ranger as a dependency
 37 | #'     call <- rlang::call2(.fn = "importance", .ns = "ranger", x = object$fit)
 38 | #'     scores <- rlang::eval_tidy(call)
 39 | #'
 40 | #'     # create a tibble with 'feature' and 'importance' columns
 41 | #'     scores <- tibble::tibble(
 42 | #'       feature = names(scores),
 43 | #'       importance = as.numeric(scores)
 44 | #'     )
#'
 46 | #'     # optionally rescale the importance scores
 47 | #'     if (isTRUE(scaled))
 48 | #'       scores$importance <- rescale(scores$importance)
 49 | #'
 50 | #'     scores
 51 | #' }
 52 | pull_importances <- function(object, scaled = TRUE, ...) {
 53 |   UseMethod("pull_importances", object)
 54 | }
 55 | 
 56 | 
 57 | rescale <- function(x)
 58 |   (x - min(x)) / (max(x) - min(x)) * 100
 59 | 
 60 | 
 61 | #' @export
 62 | pull_importances.default <- function(object, scaled = TRUE, ...) {
 63 |   message(paste(
 64 |     "No method for pulling feature importances is defined for",
 65 |     class(object)[1]
 66 |   ))
 67 | }
 68 | 
 69 | 
 70 | #' @export
 71 | pull_importances._xgb.Booster <-
 72 |   function(object,
 73 |            scaled = TRUE,
 74 |            type = "Gain",
 75 |            ...) {
 76 | 
 77 |     call <- rlang::call2(
 78 |       .fn = "xgb.importance",
 79 |       .ns = "xgboost",
 80 |       model = object$fit
 81 |     )
 82 |     scores <- rlang::eval_tidy(call)
 83 |     scores <- tibble(feature = scores$Feature, importance = scores[[type]])
 84 | 
 85 |     if (scaled)
 86 |       scores$importance <- rescale(scores$importance)
 87 | 
 88 |     scores
 89 |   }
 90 | 
 91 | #' @export
 92 | pull_importances._C5.0 <- function(object, scaled = TRUE, ...) {
 93 |   others <- list(...)
 94 | 
 95 |   if (!length(others))
 96 |     others$metric = "usage"
 97 | 
 98 |   call <- rlang::call2(.fn = "C5imp", .ns = "C50", object = object$fit,!!!others)
 99 |   scores <- rlang::eval_tidy(call)
100 | 
101 |   scores <- tibble(feature = rownames(scores), importance = scores$Overall)
102 | 
103 |   if (scaled)
104 |     scores$importance <- rescale(scores$importance)
105 | 
106 |   scores
107 | }
108 | 
#' @export
pull_importances._H2OMultinomialModel <-
  function(object, scaled = TRUE, ...) {
    # Call `h2o::h2o.varimp()` indirectly so h2o is only needed at run time.
    varimp_call <-
      rlang::call2(.fn = "h2o.varimp", .ns = "h2o", object = object$fit)
    varimp <- rlang::eval_tidy(varimp_call)

    result <- tibble(
      feature = varimp$variable,
      importance = varimp$relative_importance
    )

    if (scaled) {
      result$importance <- rescale(result$importance)
    }

    result
  }
123 | 
#' @export
pull_importances._H2ORegressionModel <-
  function(object, scaled = TRUE, ...) {
    # Call `h2o::h2o.varimp()` indirectly so h2o is only needed at run time.
    varimp_call <-
      rlang::call2(.fn = "h2o.varimp", .ns = "h2o", object = object$fit)
    varimp <- rlang::eval_tidy(varimp_call)

    result <- tibble(
      feature = varimp$variable,
      importance = varimp$relative_importance
    )

    if (scaled) {
      result$importance <- rescale(result$importance)
    }

    result
  }
139 | 
#' @export
pull_importances._ranger <- function(object, scaled = TRUE, ...) {
  # Call `ranger::importance()` indirectly so ranger is only needed at run
  # time; the result is used as a named vector of scores.
  importance_call <-
    rlang::call2(.fn = "importance", .ns = "ranger", x = object$fit)
  raw <- rlang::eval_tidy(importance_call)

  result <- tibble(feature = names(raw), importance = as.numeric(raw))

  if (scaled) {
    result$importance <- rescale(result$importance)
  }

  result
}
152 | 
#' @export
pull_importances._cubist <- function(object, scaled = TRUE, ...) {
  # The fitted object carries a usage table; its `Model` column is reported
  # as the importance of each `Variable`.
  usage <- object$fit$usage

  result <- tibble(feature = usage$Variable, importance = usage$Model)

  if (scaled) {
    result$importance <- rescale(result$importance)
  }

  result
}
164 | 
#' @export
pull_importances._earth <- function(object, scaled = TRUE, ...) {
  # Call `earth::evimp()` indirectly so earth is only needed at run time.
  evimp_call <- rlang::call2(.fn = "evimp", .ns = "earth", object = object$fit)
  evimp <- rlang::eval_tidy(evimp_call)

  # The "rss" column of the result is reported as the importance score.
  result <- tibble(feature = rownames(evimp), importance = evimp[, "rss"])

  if (scaled) {
    result$importance <- rescale(result$importance)
  }

  result
}
177 | 
#' @export
pull_importances._lm <-
  function(object,
           scaled = FALSE,
           intercept = FALSE,
           ...) {

    # The model coefficients are used as the importance scores.
    coefs <- stats::coefficients(object$fit)
    result <- tibble(feature = names(coefs), importance = coefs)

    # Drop the intercept term unless it was explicitly requested.
    if (!intercept) {
      result <- result[result$feature != "(Intercept)", ]
    }

    # Scaling works on absolute values, so the sign of a coefficient is lost.
    if (scaled) {
      result$importance <- rescale(abs(result$importance))
    }

    result
  }
198 | 
#' @export
pull_importances._glm <-
  function(object,
           scaled = FALSE,
           intercept = FALSE,
           ...) {

    # The model coefficients are used as the importance scores.
    coefs <- stats::coefficients(object$fit)
    result <- tibble(feature = names(coefs), importance = coefs)

    # Drop the intercept term unless it was explicitly requested.
    if (!intercept) {
      result <- result[result$feature != "(Intercept)", ]
    }

    # Scaling works on absolute values, so the sign of a coefficient is lost.
    if (scaled) {
      result$importance <- rescale(abs(result$importance))
    }

    result
  }
217 | 
#' @export
pull_importances._elnet <-
  function(object,
           scaled = FALSE,
           intercept = FALSE,
           penalty = NULL,
           ...) {
    # Fall back to the penalty stored in the model specification. parsnip
    # keeps specification arguments as quosures, so the stored value is
    # evaluated first; `eval_tidy()` returns a plain value (or NULL) unchanged.
    if (is.null(penalty)) {
      penalty <- rlang::eval_tidy(object$spec$args$penalty)
    }

    if (is.null(penalty)) {
      rlang::abort(
        "model specification was not fitted using a `penalty` value. `penalty` should be supplied to the `pull_importances` method"
      )
    }

    # Coefficients at the requested penalty, computed once; the first column
    # is used as the importance scores.
    coefs <- stats::coef(object$fit, s = penalty)
    scores <- tibble(feature = rownames(coefs), importance = coefs[, 1])

    # Drop the intercept term unless it was explicitly requested.
    if (!intercept) {
      scores <- scores[scores$feature != "(Intercept)", ]
    }

    # Scaling works on absolute values, so the sign of a coefficient is lost.
    if (scaled) {
      scores$importance <- rescale(abs(scores$importance))
    }

    scores
  }
244 | 
#' @export
pull_importances._lognet <-
  function(object,
           scaled = FALSE,
           intercept = FALSE,
           penalty = NULL,
           ...) {
    # Use the supplied penalty, else the one stored in the model
    # specification. parsnip keeps specification arguments as quosures, so the
    # stored value is evaluated first; `eval_tidy()` returns a plain value (or
    # NULL) unchanged.
    if (!is.null(penalty)) {
      s <- penalty
    } else {
      s <- rlang::eval_tidy(object$spec$args$penalty)
    }

    if (is.null(s)) {
      rlang::abort(
        "model specification was not fitted using a `penalty` value. `penalty` should be supplied to the `pull_importances` method"
      )
    }

    # Coefficients at the requested penalty, computed once; the first column
    # is used as the importance scores.
    coefs <- stats::coef(object$fit, s = s)
    scores <- tibble(feature = rownames(coefs), importance = coefs[, 1])

    # Drop the intercept term unless it was explicitly requested.
    if (!intercept) {
      scores <- scores[scores$feature != "(Intercept)", ]
    }

    # Scaling works on absolute values, so the sign of a coefficient is lost.
    if (scaled) {
      scores$importance <- rescale(abs(scores$importance))
    }

    scores
  }
276 | 
#' @export
pull_importances._randomForest <-
  function(object, scaled = TRUE, ...) {
    imp <- object$fit$importance

    # The importance component is normally a matrix with one row per
    # predictor. Storing the whole matrix would create a matrix-valued tibble
    # column, so a single numeric column is extracted instead: the last one.
    # NOTE(review): per the randomForest documentation the last column is the
    # node-impurity measure, which is present whether or not the forest was
    # fitted with `importance = TRUE` - confirm this is the measure wanted.
    if (is.matrix(imp)) {
      features <- rownames(imp)
      values <- imp[, ncol(imp)]
    } else {
      features <- names(imp)
      values <- imp
    }

    scores <- tibble(feature = features, importance = as.numeric(values))

    if (scaled) {
      scores$importance <- rescale(scores$importance)
    }

    scores
  }
290 | 
#' @export
pull_importances._rpart <- function(object, scaled = TRUE, ...) {
  imp <- object$fit$variable.importance

  # `variable.importance` is an optional component of an rpart fit and is
  # absent when the tree has no splits. Return an empty, correctly shaped
  # table instead of building a malformed tibble from NULL.
  if (is.null(imp)) {
    return(tibble(feature = character(), importance = numeric()))
  }

  scores <- tibble(feature = names(imp), importance = imp)

  if (scaled) {
    scores$importance <- rescale(scores$importance)
  }

  scores
}
303 | 
304 | # stan?
305 | # surv?
306 | 


--------------------------------------------------------------------------------